From 61dad979a56eaefa315dbe8b01c52f0cb2723105 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Sun, 26 Aug 2012 20:26:37 -0600 Subject: [PATCH] Add utility and .h for character's UTF-8 This adds regen/utf8_strings.pl, which takes Unicode characters and generates utf8_strings.h to contain #defines for macros that translate from the name to the UTF-8. This is needed in a few places, where previously things were manually figured out and hard-coded in. Doing this instead makes this easier, and removes EBCDIC dependencies/bugs, as the file would simply be regen'd on an EBCDIC platform. --- MANIFEST | 2 + regcomp.c | 31 +++++++-------- regen/utf8_strings.pl | 108 ++++++++++++++++++++++++++++++++++++++++++++++++++ regexec.c | 35 ++++++---------- utf8_strings.h | 30 ++++++++++++++ 5 files changed, 167 insertions(+), 39 deletions(-) create mode 100644 regen/utf8_strings.pl create mode 100644 utf8_strings.h diff --git a/MANIFEST b/MANIFEST index 4a8e104..41a4aa5 100644 --- a/MANIFEST +++ b/MANIFEST @@ -4914,6 +4914,7 @@ regen/regcharclass.pl Generate regcharclass.h from inline data regen/regcomp.pl Builder of regnodes.h regen/regen_lib.pl Common file routines for generator scripts regen/uconfig_h.pl generate uconfig.h (requires /bin/sh) +regen/utf8_strings.pl generate utf8_strings.h regen/warnings.pl Program to write warnings.h and lib/warnings.pm regexec.c Regular expression evaluator regexp.h Public declarations for the above @@ -5551,6 +5552,7 @@ universal.c The default UNIVERSAL package methods unixish.h Defines that are assumed on Unix utf8.c Unicode routines utf8.h Unicode header +utf8_strings.h compile-time macros for characters in UTF-8 utfebcdic.h Unicode on EBCDIC (UTF-EBCDIC, tr16) header util.c Utility routines util.h Dummy header diff --git a/regcomp.c b/regcomp.c index 02382c4..921c0e9 100644 --- a/regcomp.c +++ b/regcomp.c @@ -89,6 +89,7 @@ extern const struct regexp_engine my_reg_engine; #include "dquote_static.c" #include "charclass_invlists.h" #include
"inline_invlist.c" +#include "utf8_strings.h" #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i) #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) @@ -2825,18 +2826,15 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b * LETTER SHARP S. We decrease the min length by 1 for each * occurrence of 'ss' found */ -#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */ -# define U390_first_byte 0xb4 - const U8 U390_tail[] = "\x68\xaf\x49\xaf\x42"; -# define U3B0_first_byte 0xb5 - const U8 U3B0_tail[] = "\x46\xaf\x49\xaf\x42"; -#else -# define U390_first_byte 0xce - const U8 U390_tail[] = "\xb9\xcc\x88\xcc\x81"; -# define U3B0_first_byte 0xcf - const U8 U3B0_tail[] = "\x85\xcc\x88\xcc\x81"; -#endif - const U8 len = sizeof(U390_tail); /* (-1 for NUL; +1 for 1st byte; +#define U390_FIRST_BYTE GREEK_SMALL_LETTER_IOTA_UTF8_FIRST_BYTE +#define U3B0_FIRST_BYTE GREEK_SMALL_LETTER_UPSILON_UTF8_FIRST_BYTE + const U8 U390_tail[] = GREEK_SMALL_LETTER_IOTA_UTF8_TAIL + COMBINING_DIAERESIS_UTF8 + COMBINING_ACUTE_ACCENT_UTF8; + const U8 U3B0_tail[] = GREEK_SMALL_LETTER_UPSILON_UTF8_TAIL + COMBINING_DIAERESIS_UTF8 + COMBINING_ACUTE_ACCENT_UTF8; + const U8 len = sizeof(U390_tail); /* (-1 for NUL; +1 for 1st byte; yields a net of 0 */ /* Examine the string for one of the problematic sequences */ for (s = s0; @@ -2866,7 +2864,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b } break; - case U390_first_byte: + case U390_FIRST_BYTE: if (s_end - s >= len /* The 1's are because are skipping comparing the @@ -2877,7 +2875,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b } break; - case U3B0_first_byte: + case U3B0_FIRST_BYTE: if (! 
(s_end - s >= len && memEQ(s + 1, U3B0_tail, len - 1))) { @@ -12320,9 +12318,8 @@ parseit: U8 dummy[UTF8_MAXBYTES+1]; STRLEN dummy_len; - /* This particular string is above \xff in both UTF-8 - * and UTFEBCDIC */ - to_utf8_fold((U8*) "\xC8\x80", dummy, &dummy_len); + /* This string is just a short named one above \xff */ + to_utf8_fold((U8*) HYPHEN_UTF8, dummy, &dummy_len); assert(PL_utf8_tofold); /* Verify that worked */ } PL_utf8_foldclosures = diff --git a/regen/utf8_strings.pl b/regen/utf8_strings.pl new file mode 100644 index 0000000..d6d4c76 --- /dev/null +++ b/regen/utf8_strings.pl @@ -0,0 +1,108 @@ +use v5.16.0; +use strict; +use warnings; +require 'regen/regen_lib.pl'; +use charnames qw(:loose); + +my $out_fh = open_new('utf8_strings.h', '>', + {style => '*', by => $0, + from => "Unicode data"}); + +print $out_fh < ) { + chomp; + unless ($_ =~ m/ ^ ( [^\ ]* ) # Name or code point token + (?: [\ ]+ ( .* ) )? # optional flag + /x) + { + die "Unexpected syntax at line $.: $_\n"; + } + + my $name_or_cp = $1; + my $flag = $2; + + my $name; + my $cp; + + if ($name_or_cp =~ /[^[:xdigit:]]/) { + # Anything that isn't a hex value must be a name. + $name = $name_or_cp; + $cp = charnames::vianame($name =~ s/_/ /gr); + die "Unknown name '$name' at line $.: $_\n" unless defined $cp; + $cp = sprintf "%04X", $cp; # vianame returns a number; the code below expects hex digits + } + else { + $cp = $name_or_cp; + $name = charnames::viacode("0$cp"); # viacode requires a leading zero + # to be sure that the argument is hex + die "Unknown code point '$cp' at line $.: $_\n" unless defined $name; + } + + $name =~ s/ /_/g; # The macro name can have no blanks in it + + my $str = join "", map { sprintf "\\x%02X", $_ } + unpack("U0C*", pack("U", hex $cp)); + + my $suffix = '_UTF8'; + if (! defined $flag) { + $str = "\"$str\""; # Will be a string constant + } elsif ($flag eq 'tail') { + $str =~ s/\\x..//; # Remove the first byte + $suffix .= '_TAIL'; + $str = "\"$str\""; # Will be a string constant + } + elsif ($flag eq 'first') { + $str =~ s/ \\x ( ..
) .* /$1/x; # Get the two nibbles of the 1st byte + $suffix .= '_FIRST_BYTE'; + $str = "0x$str"; # Is a numeric constant + } + else { + die "Unknown flag at line $.: $_\n"; + } + print $out_fh "#define ${name}$suffix $str /* U+$cp */\n"; +} + +read_only_bottom_close_and_rename($out_fh); + +__DATA__ +0300 +0301 +0308 +03B9 tail +03C5 tail +03B9 first +03C5 first +1100 +1160 +11A8 +2010 diff --git a/regexec.c b/regexec.c index e87e365..57f47ce 100644 --- a/regexec.c +++ b/regexec.c @@ -81,6 +81,7 @@ #endif #include "inline_invlist.c" +#include "utf8_strings.h" #define RF_tainted 1 /* tainted information used? e.g. locale */ #define RF_warned 2 /* warned about big count? */ @@ -121,20 +122,13 @@ #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim)) /* these are unrolled below in the CCC_TRY_XXX defined */ -#ifdef EBCDIC - /* Often 'str' is a hard-coded utf8 string instead of utfebcdic. so just - * skip the check on EBCDIC platforms */ -# define LOAD_UTF8_CHARCLASS(class,str) LOAD_UTF8_CHARCLASS_NO_CHECK(class) -#else -# define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \ +#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \ if (!CAT2(PL_utf8_,class)) { \ bool ok; \ ENTER; save_re_context(); \ ok=CAT2(is_utf8_,class)((const U8*)str); \ PERL_UNUSED_VAR(ok); \ assert(ok); assert(CAT2(PL_utf8_,class)); LEAVE; } } STMT_END -#endif - /* Doesn't do an assert to verify that is correct */ #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \ if (!CAT2(PL_utf8_,class)) { \ @@ -148,20 +142,17 @@ #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ") #define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \ - LOAD_UTF8_CHARCLASS(X_begin, " "); \ - LOAD_UTF8_CHARCLASS_NO_CHECK(X_special_begin); \ - /* These are utf8 constants, and not utf-ebcdic constants, so the \ - * assert should likely and hopefully fail on an EBCDIC machine */ \ - LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */ \ - \ - /* No asserts are done for these, in case called on
an early \ - * Unicode version in which they map to nothing */ \ - LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \ - LOAD_UTF8_CHARCLASS_NO_CHECK(X_L); /* U+1100 "\xe1\x84\x80" */ \ - LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\ - LOAD_UTF8_CHARCLASS_NO_CHECK(X_T); /* U+11A8 "\xe1\x86\xa8" */ \ - LOAD_UTF8_CHARCLASS_NO_CHECK(X_RI); \ - LOAD_UTF8_CHARCLASS_NO_CHECK(X_V) /* U+1160 "\xe1\x85\xa0" */ + /* No asserts are done for some of these, in case called on a */ \ + /* Unicode version in which they map to nothing */ \ + LOAD_UTF8_CHARCLASS(X_begin, HYPHEN_UTF8); \ + LOAD_UTF8_CHARCLASS_NO_CHECK(X_special_begin); \ + LOAD_UTF8_CHARCLASS(X_extend, COMBINING_GRAVE_ACCENT_UTF8); \ + LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* empty in most releases*/ \ + LOAD_UTF8_CHARCLASS(X_L, HANGUL_CHOSEONG_KIYEOK_UTF8); \ + LOAD_UTF8_CHARCLASS(X_LV_LVT_V, HANGUL_JUNGSEONG_FILLER_UTF8); \ + LOAD_UTF8_CHARCLASS_NO_CHECK(X_RI); /* empty in many releases */ \ + LOAD_UTF8_CHARCLASS(X_T, HANGUL_JONGSEONG_KIYEOK_UTF8); \ + LOAD_UTF8_CHARCLASS(X_V, HANGUL_JUNGSEONG_FILLER_UTF8) #define PLACEHOLDER /* Something for the preprocessor to grab onto */ diff --git a/utf8_strings.h b/utf8_strings.h new file mode 100644 index 0000000..a83d423 --- /dev/null +++ b/utf8_strings.h @@ -0,0 +1,30 @@ +/* -*- buffer-read-only: t -*- + * !!!!!!! DO NOT EDIT THIS FILE !!!!!!! + * This file is built by regen/utf8_strings.pl from Unicode data. + * Any changes made here will be lost! + */ + +/* This file contains #defines for various Unicode code points. The values + * for the macros are all or portions of the UTF-8 encoding for the code + * point. Note that the names all have the suffix "_UTF8". + * + * The suffix "_FIRST_BYTE" may be appended to the name if the value is just + * the first byte of the UTF-8 representation; the value will be a numeric + * constant. + * + * The suffix "_TAIL" is appened if instead it represents all but the first + * byte. 
This, and with no suffix are both string constants */ + +#define COMBINING_GRAVE_ACCENT_UTF8 "\xCC\x80" /* U+0300 */ +#define COMBINING_ACUTE_ACCENT_UTF8 "\xCC\x81" /* U+0301 */ +#define COMBINING_DIAERESIS_UTF8 "\xCC\x88" /* U+0308 */ +#define GREEK_SMALL_LETTER_IOTA_UTF8_TAIL "\xB9" /* U+03B9 */ +#define GREEK_SMALL_LETTER_UPSILON_UTF8_TAIL "\x85" /* U+03C5 */ +#define GREEK_SMALL_LETTER_IOTA_UTF8_FIRST_BYTE 0xCE /* U+03B9 */ +#define GREEK_SMALL_LETTER_UPSILON_UTF8_FIRST_BYTE 0xCF /* U+03C5 */ +#define HANGUL_CHOSEONG_KIYEOK_UTF8 "\xE1\x84\x80" /* U+1100 */ +#define HANGUL_JUNGSEONG_FILLER_UTF8 "\xE1\x85\xA0" /* U+1160 */ +#define HANGUL_JONGSEONG_KIYEOK_UTF8 "\xE1\x86\xA8" /* U+11A8 */ +#define HYPHEN_UTF8 "\xE2\x80\x90" /* U+2010 */ + +/* ex: set ro: */ -- 1.8.3.1