Add utility and .h for character's UTF-8

[perl5.git] / regexec.c
diff --git a/regexec.c b/regexec.c

index 27ad2d8..57f47ce 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -81,6 +81,7 @@
  #endif
  
  #include "inline_invlist.c"
+#include "utf8_strings.h"
  
  #define RF_tainted     1       /* tainted information used? e.g. locale */
  #define RF_warned      2               /* warned about big count? */
@@ -121,20 +122,13 @@
  #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
  
  /* these are unrolled below in the CCC_TRY_XXX defined */
-#ifdef EBCDIC
-    /* Often 'str' is a hard-coded utf8 string instead of utfebcdic. so just
-     * skip the check on EBCDIC platforms */
-#   define LOAD_UTF8_CHARCLASS(class,str) LOAD_UTF8_CHARCLASS_NO_CHECK(class)
-#else
-#   define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
+#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
      if (!CAT2(PL_utf8_,class)) { \
         bool ok; \
         ENTER; save_re_context(); \
         ok=CAT2(is_utf8_,class)((const U8*)str); \
          PERL_UNUSED_VAR(ok); \
         assert(ok); assert(CAT2(PL_utf8_,class)); LEAVE; } } STMT_END
-#endif
-
  /* Doesn't do an assert to verify that is correct */
  #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
      if (!CAT2(PL_utf8_,class)) { \
@@ -148,19 +142,17 @@
  #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
  
  #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */        \
-       LOAD_UTF8_CHARCLASS(X_begin, " ");                                  \
-       LOAD_UTF8_CHARCLASS(X_non_hangul, "A");                             \
-       /* These are utf8 constants, and not utf-ebcdic constants, so the   \
-           * assert should likely and hopefully fail on an EBCDIC machine */ \
-       LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */             \
-                                                                           \
-       /* No asserts are done for these, in case called on an early        \
-           * Unicode version in which they map to nothing */               \
-       LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \
-       LOAD_UTF8_CHARCLASS_NO_CHECK(X_L);          /* U+1100 "\xe1\x84\x80" */ \
-       LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\
-       LOAD_UTF8_CHARCLASS_NO_CHECK(X_T);      /* U+11A8 "\xe1\x86\xa8" */ \
-       LOAD_UTF8_CHARCLASS_NO_CHECK(X_V)       /* U+1160 "\xe1\x85\xa0" */  
+        /* No asserts are done for some of these, in case called on a   */  \
+        /* Unicode version in which they map to nothing */                  \
+       LOAD_UTF8_CHARCLASS(X_begin, HYPHEN_UTF8);                          \
+       LOAD_UTF8_CHARCLASS_NO_CHECK(X_special_begin);                      \
+       LOAD_UTF8_CHARCLASS(X_extend, COMBINING_GRAVE_ACCENT_UTF8);         \
+       LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* empty in most releases*/ \
+       LOAD_UTF8_CHARCLASS(X_L, HANGUL_CHOSEONG_KIYEOK_UTF8);              \
+       LOAD_UTF8_CHARCLASS(X_LV_LVT_V, HANGUL_JUNGSEONG_FILLER_UTF8);      \
+       LOAD_UTF8_CHARCLASS_NO_CHECK(X_RI);    /* empty in many releases */ \
+       LOAD_UTF8_CHARCLASS(X_T, HANGUL_JONGSEONG_KIYEOK_UTF8);             \
+       LOAD_UTF8_CHARCLASS(X_V, HANGUL_JUNGSEONG_FILLER_UTF8)
  
  #define PLACEHOLDER    /* Something for the preprocessor to grab onto */
  
@@ -3924,9 +3916,11 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
                | Prepend* Begin Extend*
                | .
  
-              Begin is (Hangul-syllable | ! Control)
-              Extend is (Grapheme_Extend | Spacing_Mark)
-              Control is [ GCB_Control CR LF ]
+               Begin is:           ( Special_Begin | ! Control )
+               Special_Begin is:   ( Regional-Indicator+ | Hangul-syllable )
+               Extend is:          ( Grapheme_Extend | Spacing_Mark )
+               Control is:         [ GCB_Control  CR  LF ]
+               Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
  
                The discussion below shows how the code for CLUMP is derived
                from this regex.  Note that most of these concepts are from
@@ -3999,13 +3993,13 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
                      /* Match (prepend)*, but don't bother trying if empty (as
                       * being set to _undef indicates) */
                      if (PL_utf8_X_prepend != &PL_sv_undef) {
-                   while (locinput < PL_regeol
-                          && swash_fetch(PL_utf8_X_prepend,
-                                         (U8*)locinput, utf8_target))
-                    {
-                       previous_prepend = locinput;
-                       locinput += UTF8SKIP(locinput);
-                   }
+                        while (locinput < PL_regeol
+                               && swash_fetch(PL_utf8_X_prepend,
+                                              (U8*)locinput, utf8_target))
+                        {
+                            previous_prepend = locinput;
+                            locinput += UTF8SKIP(locinput);
+                        }
                      }
  
                     /* As noted above, if we matched a prepend character, but
@@ -4033,21 +4027,32 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
                     } else {
  
                         /* Here is the beginning of a character that can have
-                        * an extender.  It is either a hangul syllable, or a
-                        * non-control */
-                       if (swash_fetch(PL_utf8_X_non_hangul,
+                         * an extender.  It is either a special begin character
+                         * that requires complicated handling, or a non-control
+                         */
+                       if (! swash_fetch(PL_utf8_X_special_begin,
                                         (U8*)locinput, utf8_target))
                         {
  
-                           /* Here not a Hangul syllable, must be a
+                           /* Here not a special begin, must be a
                              * ( ! Control ) */
                             locinput += UTF8SKIP(locinput);
                         } else {
  
-                           /* Here is a Hangul syllable.  It can be composed
-                            * of several individual characters.  One
-                            * possibility is T+ */
-                           if (swash_fetch(PL_utf8_X_T,
+                           /* Here is a special begin.  It can be composed
+                             * of several individual characters.  One
+                             * possibility is RI+ */
+                           if (swash_fetch(PL_utf8_X_RI,
+                                           (U8*)locinput, utf8_target))
+                           {
+                               while (locinput < PL_regeol
+                                       && swash_fetch(PL_utf8_X_RI,
+                                                       (U8*)locinput, utf8_target))
+                               {
+                                   locinput += UTF8SKIP(locinput);
+                               }
+                           } else /* Another possibility is T+ */
+                                   if (swash_fetch(PL_utf8_X_T,
                                             (U8*)locinput, utf8_target))
                             {
                                 while (locinput < PL_regeol
@@ -4058,9 +4063,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
                                 }
                             } else {
  
-                               /* Here, not T+, but is a Hangul.  That means
-                                * it is one of the others: L, LV, LVT or V,
-                                * and matches:
+                                /* Here, neither RI+ nor T+; must be some other
+                                 * Hangul.  That means it is one of the others:
+                                 * L, LV, LVT or V, and matches:
                                  * L* (L | LVT T* | V  V* T* | LV  V* T*) */
  
                                 /* Match L*           */