This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Add utility and .h for character's UTF-8
[perl5.git] / regcomp.c
index 5a77176..921c0e9 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -89,6 +89,7 @@ extern const struct regexp_engine my_reg_engine;
 #include "dquote_static.c"
 #include "charclass_invlists.h"
 #include "inline_invlist.c"
+#include "utf8_strings.h"
 
 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
@@ -2825,18 +2826,15 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b
             * LETTER SHARP S.  We decrease the min length by 1 for each
             * occurrence of 'ss' found */
 
-#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
-#          define U390_first_byte 0xb4
-           const U8 U390_tail[] = "\x68\xaf\x49\xaf\x42";
-#          define U3B0_first_byte 0xb5
-           const U8 U3B0_tail[] = "\x46\xaf\x49\xaf\x42";
-#else
-#          define U390_first_byte 0xce
-           const U8 U390_tail[] = "\xb9\xcc\x88\xcc\x81";
-#          define U3B0_first_byte 0xcf
-           const U8 U3B0_tail[] = "\x85\xcc\x88\xcc\x81";
-#endif
-           const U8 len = sizeof(U390_tail); /* (-1 for NUL; +1 for 1st byte;
+#define U390_FIRST_BYTE GREEK_SMALL_LETTER_IOTA_UTF8_FIRST_BYTE
+#define U3B0_FIRST_BYTE GREEK_SMALL_LETTER_UPSILON_UTF8_FIRST_BYTE
+           const U8 U390_tail[] = GREEK_SMALL_LETTER_IOTA_UTF8_TAIL
+                                   COMBINING_DIAERESIS_UTF8
+                                   COMBINING_ACUTE_ACCENT_UTF8;
+           const U8 U3B0_tail[] = GREEK_SMALL_LETTER_UPSILON_UTF8_TAIL
+                                   COMBINING_DIAERESIS_UTF8
+                                   COMBINING_ACUTE_ACCENT_UTF8;
+            const U8 len = sizeof(U390_tail); /* (-1 for NUL; +1 for 1st byte;
                                                 yields a net of 0 */
            /* Examine the string for one of the problematic sequences */
            for (s = s0;
@@ -2866,7 +2864,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b
                        }
                        break;
 
-                   case U390_first_byte:
+                   case U390_FIRST_BYTE:
                        if (s_end - s >= len
 
                            /* The 1's are because are skipping comparing the
@@ -2877,7 +2875,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b
                        }
                        break;
 
-                   case U3B0_first_byte:
+                   case U3B0_FIRST_BYTE:
                        if (! (s_end - s >= len
                               && memEQ(s + 1, U3B0_tail, len - 1)))
                        {
@@ -11541,7 +11539,9 @@ parseit:
            case 'P':
                {
                char *e;
-                U8 swash_init_flags = 0;
+
+                /* This routine will handle any undefined properties */
+                U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF;
 
                if (RExC_parse >= RExC_end)
                    vFAIL2("Empty \\%c{}", (U8)value);
@@ -11597,8 +11597,6 @@ parseit:
                     swash = _core_swash_init("utf8", name, &PL_sv_undef,
                                              1, /* binary */
                                              0, /* not tr/// */
-                                             TRUE, /* this routine will handle
-                                                      undefined properties */
                                              NULL, /* No inversion list */
                                              &swash_init_flags
                                             );
@@ -12320,9 +12318,8 @@ parseit:
                         U8 dummy[UTF8_MAXBYTES+1];
                         STRLEN dummy_len;
 
-                        /* This particular string is above \xff in both UTF-8
-                         * and UTFEBCDIC */
-                        to_utf8_fold((U8*) "\xC8\x80", dummy, &dummy_len);
+                        /* This string is just a short named one above \xff */
+                        to_utf8_fold((U8*) HYPHEN_UTF8, dummy, &dummy_len);
                         assert(PL_utf8_tofold); /* Verify that worked */
                     }
                     PL_utf8_foldclosures =