regexec.c: Remove no longer needed comments

[perl5.git] / regexec.c
diff --git a/regexec.c b/regexec.c

index 469a7fc..39a504f 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -81,6 +81,7 @@
  #endif
  
  #include "inline_invlist.c"
+#include "utf8_strings.h"
  
  #define RF_tainted     1       /* tainted information used? e.g. locale */
  #define RF_warned      2               /* warned about big count? */
@@ -121,20 +122,13 @@
  #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
  
  /* these are unrolled below in the CCC_TRY_XXX defined */
-#ifdef EBCDIC
-    /* Often 'str' is a hard-coded utf8 string instead of utfebcdic. so just
-     * skip the check on EBCDIC platforms */
-#   define LOAD_UTF8_CHARCLASS(class,str) LOAD_UTF8_CHARCLASS_NO_CHECK(class)
-#else
-#   define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
+#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
      if (!CAT2(PL_utf8_,class)) { \
         bool ok; \
         ENTER; save_re_context(); \
         ok=CAT2(is_utf8_,class)((const U8*)str); \
          PERL_UNUSED_VAR(ok); \
         assert(ok); assert(CAT2(PL_utf8_,class)); LEAVE; } } STMT_END
-#endif
-
  /* Doesn't do an assert to verify that is correct */
  #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
      if (!CAT2(PL_utf8_,class)) { \
@@ -148,19 +142,17 @@
  #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
  
  #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */        \
-       LOAD_UTF8_CHARCLASS(X_begin, " ");                                  \
-       LOAD_UTF8_CHARCLASS(X_non_hangul, "A");                             \
-       /* These are utf8 constants, and not utf-ebcdic constants, so the   \
-           * assert should likely and hopefully fail on an EBCDIC machine */ \
-       LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */             \
-                                                                           \
-       /* No asserts are done for these, in case called on an early        \
-           * Unicode version in which they map to nothing */               \
-       LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \
-       LOAD_UTF8_CHARCLASS_NO_CHECK(X_L);          /* U+1100 "\xe1\x84\x80" */ \
-       LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\
-       LOAD_UTF8_CHARCLASS_NO_CHECK(X_T);      /* U+11A8 "\xe1\x86\xa8" */ \
-       LOAD_UTF8_CHARCLASS_NO_CHECK(X_V)       /* U+1160 "\xe1\x85\xa0" */  
+        /* No asserts are done for some of these, in case called on a   */  \
+        /* Unicode version in which they map to nothing */                  \
+       LOAD_UTF8_CHARCLASS(X_begin, HYPHEN_UTF8);                          \
+       LOAD_UTF8_CHARCLASS_NO_CHECK(X_special_begin);                      \
+       LOAD_UTF8_CHARCLASS(X_extend, COMBINING_GRAVE_ACCENT_UTF8);         \
+       LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* empty in most releases*/ \
+       LOAD_UTF8_CHARCLASS(X_L, HANGUL_CHOSEONG_KIYEOK_UTF8);              \
+       LOAD_UTF8_CHARCLASS(X_LV_LVT_V, HANGUL_JUNGSEONG_FILLER_UTF8);      \
+       LOAD_UTF8_CHARCLASS_NO_CHECK(X_RI);    /* empty in many releases */ \
+       LOAD_UTF8_CHARCLASS(X_T, HANGUL_JONGSEONG_KIYEOK_UTF8);             \
+       LOAD_UTF8_CHARCLASS(X_V, HANGUL_JUNGSEONG_FILLER_UTF8)
  
  #define PLACEHOLDER    /* Something for the preprocessor to grab onto */
  
@@ -3924,39 +3916,11 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
                | Prepend* Begin Extend*
                | .
  
-              Begin is (Hangul-syllable | ! Control)
-              Extend is (Grapheme_Extend | Spacing_Mark)
-              Control is [ GCB_Control CR LF ]
-
-              The discussion below shows how the code for CLUMP is derived
-              from this regex.  Note that most of these concepts are from
-              property values of the Grapheme Cluster Boundary (GCB) property.
-              No code point can have multiple property values for a given
-              property.  Thus a code point in Prepend can't be in Control, but
-              it must be in !Control.  This is why Control above includes
-              GCB_Control plus CR plus LF.  The latter two are used in the GCB
-              property separately, and so can't be in GCB_Control, even though
-              they logically are controls.  Control is not the same as gc=cc,
-              but includes format and other characters as well.
-
-              The Unicode definition of Hangul-syllable is:
-                  L+
-                  | (L* ( ( V | LV ) V* | LVT ) T*)
-                  | T+ 
-                 )
-              Each of these is a value for the GCB property, and hence must be
-              disjoint, so the order they are tested is immaterial, so the
-              above can safely be changed to
-                  T+
-                  | L+
-                  | (L* ( LVT | ( V | LV ) V*) T*)
-
-              The last two terms can be combined like this:
-                  L* ( L | (( LVT | ( V | LV ) V*) T*))
-
-              That means that if we have seen any L's at all we can quit
-              there, but if the next character is an LVT, a V, or an LV we
-              should keep going.
+               Begin is:           ( Special_Begin | ! Control )
+               Special_Begin is:   ( Regional-Indicator+ | Hangul-syllable )
+               Extend is:          ( Grapheme_Extend | Spacing_Mark )
+               Control is:         [ GCB_Control  CR  LF ]
+               Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
  
                There is a subtlety with Prepend* which showed up in testing.
                Note that the Begin, and only the Begin is required in:
@@ -3996,14 +3960,17 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
  
                     LOAD_UTF8_CHARCLASS_GCB();
  
-                   /* Match (prepend)* */
-                   while (locinput < PL_regeol
-                          && swash_fetch(PL_utf8_X_prepend,
-                                         (U8*)locinput, utf8_target))
-                   {
-                       previous_prepend = locinput;
-                       locinput += UTF8SKIP(locinput);
-                   }
+                    /* Match (prepend)*, but don't bother trying if empty (as
+                     * being set to &PL_sv_undef indicates) */
+                    if (PL_utf8_X_prepend != &PL_sv_undef) {
+                        while (locinput < PL_regeol
+                               && swash_fetch(PL_utf8_X_prepend,
+                                              (U8*)locinput, utf8_target))
+                        {
+                            previous_prepend = locinput;
+                            locinput += UTF8SKIP(locinput);
+                        }
+                    }
  
                     /* As noted above, if we matched a prepend character, but
                      * the next thing won't match, back off the last prepend we
@@ -4030,21 +3997,32 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
                     } else {
  
                         /* Here is the beginning of a character that can have
-                        * an extender.  It is either a hangul syllable, or a
-                        * non-control */
-                       if (swash_fetch(PL_utf8_X_non_hangul,
+                         * an extender.  It is either a special begin character
+                         * that requires complicated handling, or a
+                         * non-control */
+                       if (! swash_fetch(PL_utf8_X_special_begin,
                                         (U8*)locinput, utf8_target))
                         {
  
-                           /* Here not a Hangul syllable, must be a
+                           /* Here not a special begin, must be a
                              * ('!  * Control') */
                             locinput += UTF8SKIP(locinput);
                         } else {
  
-                           /* Here is a Hangul syllable.  It can be composed
-                            * of several individual characters.  One
-                            * possibility is T+ */
-                           if (swash_fetch(PL_utf8_X_T,
+                           /* Here is a special begin.  It can be composed
+                             * of several individual characters.  One
+                             * possibility is RI+ */
+                           if (swash_fetch(PL_utf8_X_RI,
+                                           (U8*)locinput, utf8_target))
+                           {
+                               while (locinput < PL_regeol
+                                       && swash_fetch(PL_utf8_X_RI,
+                                                       (U8*)locinput, utf8_target))
+                               {
+                                   locinput += UTF8SKIP(locinput);
+                               }
+                           } else /* Another possibility is T+ */
+                                   if (swash_fetch(PL_utf8_X_T,
                                             (U8*)locinput, utf8_target))
                             {
                                 while (locinput < PL_regeol
@@ -4055,9 +4033,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
                                 }
                             } else {
  
-                               /* Here, not T+, but is a Hangul.  That means
-                                * it is one of the others: L, LV, LVT or V,
-                                * and matches:
+                                /* Here, neither RI+ nor T+; must be some other
+                                 * Hangul.  That means it is one of the others:
+                                 * L, LV, LVT or V, and matches:
                                  * L* (L | LVT T* | V  V* T* | LV  V* T*) */
  
                                 /* Match L*           */
@@ -6689,7 +6667,7 @@ S_core_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bo
             SV * const rv = MUTABLE_SV(data->data[n]);
             AV * const av = MUTABLE_AV(SvRV(rv));
             SV **const ary = AvARRAY(av);
-           U8 swash_init_flags = 0;
+           U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
         
             si = *ary;  /* ary[0] = the string to initialize the swash with */
  
@@ -6718,8 +6696,6 @@ S_core_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bo
                                       si,
                                       1, /* binary */
                                       0, /* not from tr/// */
-                                     FALSE, /* is error if can't find
-                                               property */
                                       invlist,
                                       &swash_init_flags);
                 (void)av_store(av, 1, sw);