From 61dad979a56eaefa315dbe8b01c52f0cb2723105 Mon Sep 17 00:00:00 2001
From: Karl Williamson <public@khwilliamson.com>
Date: Sun, 26 Aug 2012 20:26:37 -0600
Subject: [PATCH] Add utility and .h for character's UTF-8

This adds regen/utf8_strings.pl, which takes Unicode characters and
generates utf8_strings.h, a header containing #defines for macros that
translate from the character name to its UTF-8 encoding.  This is needed
in a few places, where previously things were manually figured out and
hard-coded in.  Doing this instead makes this easier, and removes EBCDIC
dependencies/bugs, as the file would simply be regen'd on an EBCDIC
platform.
---
 MANIFEST              |   2 +
 regcomp.c             |  31 +++++++--------
 regen/utf8_strings.pl | 108 ++++++++++++++++++++++++++++++++++++++++++++++++++
 regexec.c             |  35 ++++++----------
 utf8_strings.h        |  30 ++++++++++++++
 5 files changed, 167 insertions(+), 39 deletions(-)
 create mode 100644 regen/utf8_strings.pl
 create mode 100644 utf8_strings.h

diff --git a/MANIFEST b/MANIFEST
index 4a8e104..41a4aa5 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -4914,6 +4914,7 @@ regen/regcharclass.pl		Generate regcharclass.h from inline data
 regen/regcomp.pl		Builder of regnodes.h
 regen/regen_lib.pl		Common file routines for generator scripts
 regen/uconfig_h.pl		generate uconfig.h (requires /bin/sh)
+regen/utf8_strings.pl		generate utf8_strings.h
 regen/warnings.pl		Program to write warnings.h and lib/warnings.pm
 regexec.c			Regular expression evaluator
 regexp.h			Public declarations for the above
@@ -5551,6 +5552,7 @@ universal.c			The default UNIVERSAL package methods
 unixish.h			Defines that are assumed on Unix
 utf8.c				Unicode routines
 utf8.h				Unicode header
+utf8_strings.h			compile-time macros for characters in UTF-8
 utfebcdic.h			Unicode on EBCDIC (UTF-EBCDIC, tr16) header
 util.c				Utility routines
 util.h				Dummy header
diff --git a/regcomp.c b/regcomp.c
index 02382c4..921c0e9 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -89,6 +89,7 @@ extern const struct regexp_engine my_reg_engine;
 #include "dquote_static.c"
 #include "charclass_invlists.h"
 #include "inline_invlist.c"
+#include "utf8_strings.h"
 
 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
@@ -2825,18 +2826,15 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b
 	     * LETTER SHARP S.  We decrease the min length by 1 for each
 	     * occurrence of 'ss' found */
 
-#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
-#	    define U390_first_byte 0xb4
-	    const U8 U390_tail[] = "\x68\xaf\x49\xaf\x42";
-#	    define U3B0_first_byte 0xb5
-	    const U8 U3B0_tail[] = "\x46\xaf\x49\xaf\x42";
-#else
-#	    define U390_first_byte 0xce
-	    const U8 U390_tail[] = "\xb9\xcc\x88\xcc\x81";
-#	    define U3B0_first_byte 0xcf
-	    const U8 U3B0_tail[] = "\x85\xcc\x88\xcc\x81";
-#endif
-	    const U8 len = sizeof(U390_tail); /* (-1 for NUL; +1 for 1st byte;
+#define U390_FIRST_BYTE GREEK_SMALL_LETTER_IOTA_UTF8_FIRST_BYTE
+#define U3B0_FIRST_BYTE GREEK_SMALL_LETTER_UPSILON_UTF8_FIRST_BYTE
+	    const U8 U390_tail[] = GREEK_SMALL_LETTER_IOTA_UTF8_TAIL
+                                   COMBINING_DIAERESIS_UTF8
+                                   COMBINING_ACUTE_ACCENT_UTF8;
+	    const U8 U3B0_tail[] = GREEK_SMALL_LETTER_UPSILON_UTF8_TAIL
+                                   COMBINING_DIAERESIS_UTF8
+                                   COMBINING_ACUTE_ACCENT_UTF8;
+            const U8 len = sizeof(U390_tail); /* (-1 for NUL; +1 for 1st byte;
 						 yields a net of 0 */
 	    /* Examine the string for one of the problematic sequences */
 	    for (s = s0;
@@ -2866,7 +2864,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b
 			}
 			break;
 
-		    case U390_first_byte:
+		    case U390_FIRST_BYTE:
 			if (s_end - s >= len
 
 			    /* The 1's are because are skipping comparing the
@@ -2877,7 +2875,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b
 			}
 			break;
 
-		    case U3B0_first_byte:
+		    case U3B0_FIRST_BYTE:
 			if (! (s_end - s >= len
 			       && memEQ(s + 1, U3B0_tail, len - 1)))
 			{
@@ -12320,9 +12318,8 @@ parseit:
                         U8 dummy[UTF8_MAXBYTES+1];
                         STRLEN dummy_len;
 
-                        /* This particular string is above \xff in both UTF-8
-                         * and UTFEBCDIC */
-                        to_utf8_fold((U8*) "\xC8\x80", dummy, &dummy_len);
+                        /* This string is just a short named one above \xff */
+                        to_utf8_fold((U8*) HYPHEN_UTF8, dummy, &dummy_len);
                         assert(PL_utf8_tofold); /* Verify that worked */
                     }
                     PL_utf8_foldclosures =
diff --git a/regen/utf8_strings.pl b/regen/utf8_strings.pl
new file mode 100644
index 0000000..d6d4c76
--- /dev/null
+++ b/regen/utf8_strings.pl
@@ -0,0 +1,108 @@
+use v5.16.0;
+use strict;
+use warnings;
+require 'regen/regen_lib.pl';
+use charnames qw(:loose);
+
+my $out_fh = open_new('utf8_strings.h', '>',
+		      {style => '*', by => $0,
+                      from => "Unicode data"});
+
+print $out_fh <<END;
+/* This file contains #defines for various Unicode code points.  The values
+ * for the macros are all or portions of the UTF-8 encoding for the code
+ * point.  Note that the names all have the suffix "_UTF8".
+ *
+ * The suffix "_FIRST_BYTE" may be appended to the name if the value is just
+ * the first byte of the UTF-8 representation; the value will be a numeric
+ * constant.
+ *
+ * The suffix "_TAIL" is appened if instead it represents all but the first
+ * byte.  This, and with no suffix are both string constants */
+
+END
+
+# The data are at the end of this file.  Each line represents one #define.
+# Each line begins with either a Unicode character name with the blanks in it
+# squeezed out or replaced by underscores; or it may be a hexadecimal code
+# point.  In the latter case, the name will be looked-up to use as the name
+# of the macro.  In either case, the macro name will have suffixes as
+# listed above, and all blanks will be replaced by underscores.
+#
+# Each line may optionally have one of the following flags on it, separated by
+# white space from the initial token.
+#   first   indicates that the output is to be of the FIRST_BYTE form
+#           described in the comments above that are placed in the file.
+#   tail    indicates that the output is of the _TAIL form.
+#
+# This program is used to make it convenient to create compile time constants
+# of UTF-8, and to generate proper EBCDIC as well as ASCII without manually
+# having to figure things out.
+
+while ( <DATA> ) {
+    chomp;
+    unless ($_ =~ m/ ^ ( [^\ ]* )           # Name or code point token
+                       (?: [\ ]+ ( .* ) )?  # optional flag
+                   /x)
+    {
+        die "Unexpected syntax at line $.: $_\n";
+    }
+
+    my $name_or_cp = $1;
+    my $flag = $2;
+
+    my ($name, $cp);    # Macro base name; code point as a hex string
+
+    if ($name_or_cp =~ /[^[:xdigit:]]/) {
+        # Anything that isn't a hex value must be a name.  vianame() yields
+        # the ordinal (or undef), so put it in the hex form used below.
+        $name = $name_or_cp;
+        my $ord = charnames::vianame($name =~ s/_/ /gr);
+        die "Unknown name '$name' at line $.: $_\n" unless defined $ord;
+        $cp = sprintf "%04X", $ord;
+    }
+    else {
+        $cp = $name_or_cp;
+        $name = charnames::viacode("0$cp"); # viacode requires a leading zero
+                                            # to be sure that the argument is hex
+        die "Unknown code point '$cp' at line $.: $_\n" unless defined $name;
+    }
+
+    $name =~ s/ /_/g;   # The macro name can have no blanks in it
+
+    my $str = join "", map { sprintf "\\x%02X", $_ }
+                       unpack("U0C*", pack("U", hex $cp));
+
+    my $suffix = '_UTF8';
+    if (! defined $flag) {
+        $str = "\"$str\"";  # Will be a string constant
+    } elsif ($flag eq 'tail') {
+        $str =~ s/\\x..//;  # Remove the first byte
+        $suffix .= '_TAIL';
+        $str = "\"$str\"";  # Will be a string constant
+    }
+    elsif ($flag eq 'first') {
+        $str =~ s/ \\x ( .. ) .* /$1/x; # Get the two nibbles of the 1st byte
+        $suffix .= '_FIRST_BYTE';
+        $str = "0x$str";        # Is a numeric constant
+    }
+    else {
+        die "Unknown flag at line $.: $_\n";
+    }
+    print $out_fh "#define ${name}$suffix $str    /* U+$cp */\n";
+}
+
+read_only_bottom_close_and_rename($out_fh);
+
+__DATA__
+0300
+0301
+0308
+03B9 tail
+03C5 tail
+03B9 first
+03C5 first
+1100
+1160
+11A8
+2010
diff --git a/regexec.c b/regexec.c
index e87e365..57f47ce 100644
--- a/regexec.c
+++ b/regexec.c
@@ -81,6 +81,7 @@
 #endif
 
 #include "inline_invlist.c"
+#include "utf8_strings.h"
 
 #define RF_tainted	1	/* tainted information used? e.g. locale */
 #define RF_warned	2		/* warned about big count? */
@@ -121,20 +122,13 @@
 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 
 /* these are unrolled below in the CCC_TRY_XXX defined */
-#ifdef EBCDIC
-    /* Often 'str' is a hard-coded utf8 string instead of utfebcdic. so just
-     * skip the check on EBCDIC platforms */
-#   define LOAD_UTF8_CHARCLASS(class,str) LOAD_UTF8_CHARCLASS_NO_CHECK(class)
-#else
-#   define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
+#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
     if (!CAT2(PL_utf8_,class)) { \
 	bool ok; \
 	ENTER; save_re_context(); \
 	ok=CAT2(is_utf8_,class)((const U8*)str); \
         PERL_UNUSED_VAR(ok); \
 	assert(ok); assert(CAT2(PL_utf8_,class)); LEAVE; } } STMT_END
-#endif
-
 /* Doesn't do an assert to verify that is correct */
 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
     if (!CAT2(PL_utf8_,class)) { \
@@ -148,20 +142,17 @@
 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
 
 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */        \
-	LOAD_UTF8_CHARCLASS(X_begin, " ");                                  \
-	LOAD_UTF8_CHARCLASS_NO_CHECK(X_special_begin);                             \
-	/* These are utf8 constants, and not utf-ebcdic constants, so the   \
-	    * assert should likely and hopefully fail on an EBCDIC machine */ \
-	LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */             \
-									    \
-	/* No asserts are done for these, in case called on an early        \
-	    * Unicode version in which they map to nothing */               \
-	LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \
-	LOAD_UTF8_CHARCLASS_NO_CHECK(X_L);	    /* U+1100 "\xe1\x84\x80" */ \
-	LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\
-	LOAD_UTF8_CHARCLASS_NO_CHECK(X_T);      /* U+11A8 "\xe1\x86\xa8" */ \
-	LOAD_UTF8_CHARCLASS_NO_CHECK(X_RI);	                            \
-	LOAD_UTF8_CHARCLASS_NO_CHECK(X_V)       /* U+1160 "\xe1\x85\xa0" */  
+        /* No asserts are done for some of these, in case called on a   */  \
+        /* Unicode version in which they map to nothing */                  \
+	LOAD_UTF8_CHARCLASS(X_begin, HYPHEN_UTF8);                          \
+	LOAD_UTF8_CHARCLASS_NO_CHECK(X_special_begin);                      \
+	LOAD_UTF8_CHARCLASS(X_extend, COMBINING_GRAVE_ACCENT_UTF8);         \
+	LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* empty in most releases*/ \
+	LOAD_UTF8_CHARCLASS(X_L, HANGUL_CHOSEONG_KIYEOK_UTF8);	            \
+	LOAD_UTF8_CHARCLASS(X_LV_LVT_V, HANGUL_JUNGSEONG_FILLER_UTF8);      \
+	LOAD_UTF8_CHARCLASS_NO_CHECK(X_RI);    /* empty in many releases */ \
+	LOAD_UTF8_CHARCLASS(X_T, HANGUL_JONGSEONG_KIYEOK_UTF8);             \
+	LOAD_UTF8_CHARCLASS(X_V, HANGUL_JUNGSEONG_FILLER_UTF8)
 
 #define PLACEHOLDER	/* Something for the preprocessor to grab onto */
 
diff --git a/utf8_strings.h b/utf8_strings.h
new file mode 100644
index 0000000..a83d423
--- /dev/null
+++ b/utf8_strings.h
@@ -0,0 +1,30 @@
+/* -*- buffer-read-only: t -*-
+ * !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!!
+ * This file is built by regen/utf8_strings.pl from Unicode data.
+ * Any changes made here will be lost!
+ */
+
+/* This file contains #defines for various Unicode code points.  The values
+ * for the macros are all or portions of the UTF-8 encoding for the code
+ * point.  Note that the names all have the suffix "_UTF8".
+ *
+ * The suffix "_FIRST_BYTE" may be appended to the name if the value is just
+ * the first byte of the UTF-8 representation; the value will be a numeric
+ * constant.
+ *
+ * The suffix "_TAIL" is appened if instead it represents all but the first
+ * byte.  This, and with no suffix are both string constants */
+
+#define COMBINING_GRAVE_ACCENT_UTF8 "\xCC\x80"    /* U+0300 */
+#define COMBINING_ACUTE_ACCENT_UTF8 "\xCC\x81"    /* U+0301 */
+#define COMBINING_DIAERESIS_UTF8 "\xCC\x88"    /* U+0308 */
+#define GREEK_SMALL_LETTER_IOTA_UTF8_TAIL "\xB9"    /* U+03B9 */
+#define GREEK_SMALL_LETTER_UPSILON_UTF8_TAIL "\x85"    /* U+03C5 */
+#define GREEK_SMALL_LETTER_IOTA_UTF8_FIRST_BYTE 0xCE    /* U+03B9 */
+#define GREEK_SMALL_LETTER_UPSILON_UTF8_FIRST_BYTE 0xCF    /* U+03C5 */
+#define HANGUL_CHOSEONG_KIYEOK_UTF8 "\xE1\x84\x80"    /* U+1100 */
+#define HANGUL_JUNGSEONG_FILLER_UTF8 "\xE1\x85\xA0"    /* U+1160 */
+#define HANGUL_JONGSEONG_KIYEOK_UTF8 "\xE1\x86\xA8"    /* U+11A8 */
+#define HYPHEN_UTF8 "\xE2\x80\x90"    /* U+2010 */
+
+/* ex: set ro: */
-- 
1.8.3.1