regexec.c: clarify comments

[perl5.git] / regexec.c
diff --git a/regexec.c b/regexec.c

index f541d9b..7dadc02 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -1359,7 +1359,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
         switch (OP(c)) {
         case ANYOF:
             if (utf8_target) {
-                REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
+                REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_NONBITMAP) ||
                           !UTF8_IS_INVARIANT((U8)s[0]) ?
                           reginclass(prog, c, (U8*)s, 0, utf8_target) :
                           REGINCLASS(prog, c, (U8*)s));
@@ -5761,33 +5761,87 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
      case CANY:
         scan = loceol;
         break;
+    case EXACT:
+       /* To get here, EXACTish nodes must have *byte* length == 1.  That
+        * means they match only characters in the string that can be expressed
+        * as a single byte.  For non-utf8 strings, that means a simple match.
+        * For utf8 strings, the character matched must be an invariant, or
+        * downgradable to a single byte.  The pattern's utf8ness is
+        * irrelevant: since it's a single byte, it either isn't utf8, or if
+        * it is, it's an invariant */
+
+       c = (U8)*STRING(p);
+       assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
+
+       if (! utf8_target || UNI_IS_INVARIANT(c)) {
+           while (scan < loceol && UCHARAT(scan) == c) {
+               scan++;
+           }
+       }
+       else {
+
+           /* Here, the string is utf8, and the pattern char is different
+            * in utf8 than not, so can't compare them directly.  Outside the
+            * loop, find the two utf8 bytes that represent c, and then
+            * look for those in sequence in the utf8 string */
+           U8 high = UTF8_TWO_BYTE_HI(c);
+           U8 low = UTF8_TWO_BYTE_LO(c);
+           loceol = PL_regeol;
+
+           while (hardcount < max
+                   && scan + 1 < loceol
+                   && UCHARAT(scan) == high
+                   && UCHARAT(scan + 1) == low)
+           {
+               scan += 2;
+               hardcount++;
+           }
+       }
+       break;
      case EXACTFL:
         PL_reg_flags |= RF_tainted;
         /* FALL THROUGH */
-    case EXACT:
      case EXACTF:
-       /* To get here, EXACTish nodes must have *byte* length == 1.  That means
-        * they match only characters in the string that can be expressed as a
-        * single byte.  For non-utf8 strings, that means a simple match.  For
-        * utf8 strings, the character matched must be an invariant, or
-        * downgradable to a single byte.  The pattern's utf8ness is
-        * irrelevant, as it must be a single byte, so either it isn't utf8, or
-        * if it is it's an invariant */
+
+       /* The comments for the EXACT case above apply as well to these fold
+        * ones */
  
         c = (U8)*STRING(p);
         assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
  
-       if ((! utf8_target) || UNI_IS_INVARIANT(c)) {
+       if (utf8_target) { /* Use full Unicode fold matching */
+
+           /* For the EXACTFL case, it doesn't really make sense to compare
+            * locale and utf8, but it is the best we can do.  The documents
+            * warn against mixing them */
+
+           char *tmpeol = loceol;
+           while (hardcount < max
+                   && foldEQ_utf8(scan, &tmpeol, 0, utf8_target,
+                                   STRING(p), NULL, 1, UTF_PATTERN))
+           {
+               scan = tmpeol;
+               tmpeol = loceol;
+               hardcount++;
+           }
  
-           /* Here, the string isn't utf8, or the character in the EXACT
-            * node is the same in utf8 as not, so can just do equality.
-            * Each matching char must be 1 byte long */
+           /* XXX Note that the above handles properly the German sharp s in
+            * the pattern matching ss in the string.  But it doesn't handle
+            * properly cases where the string contains say 'LIGATURE ff' and
+            * the pattern is 'f+'.  This would require, say, a new function or
+            * revised interface to foldEQ_utf8(), in which the maximum number
+            * of characters to match could be passed and it would return how
+            * many actually did.  This is just one of many cases where
+            * multi-char folds don't work properly, and so the fix is being
+            * deferred */
+       }
+       else {
+
+           /* Here, the string isn't utf8 and c is a single byte; and either
+            * the pattern isn't utf8 or c is an invariant, so its utf8ness
+            * doesn't affect c.  Can just do simple comparisons for exact or
+            * fold matching. */
             switch (OP(p)) {
-           case EXACT:
-               while (scan < loceol && UCHARAT(scan) == c) {
-                   scan++;
-               }
-               break;
             case EXACTF:
                 while (scan < loceol &&
                     (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
@@ -5806,61 +5860,6 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
                 Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
             }
         }
-       else {
-
-           /* Here, the string is utf8, and the pattern char is different
-            * in utf8 than not.  */
-
-           switch (OP(p)) {
-           case EXACT:
-               {
-                   /* Fastest to find the two utf8 bytes that represent c, and
-                    * then look for those in sequence in the utf8 string */
-                   U8 high = UTF8_TWO_BYTE_HI(c);
-                   U8 low = UTF8_TWO_BYTE_LO(c);
-                   loceol = PL_regeol;
-
-                   while (hardcount < max
-                          && scan + 1 < loceol
-                          && UCHARAT(scan) == high
-                          && UCHARAT(scan + 1) == low)
-                   {
-                       scan += 2;
-                       hardcount++;
-                   }
-               }
-               break;
-           case EXACTFL:   /* Doesn't really make sense, but is best we can
-                              do.  The documents warn against mixing locale
-                              and utf8 */
-           case EXACTF:
-               {   /* utf8 string, so use utf8 foldEQ */
-                   char *tmpeol = loceol;
-                   while (hardcount < max
-                          && foldEQ_utf8(scan, &tmpeol, 0, utf8_target,
-                                         STRING(p), NULL, 1, UTF_PATTERN))
-                   {
-                       scan = tmpeol;
-                       tmpeol = loceol;
-                       hardcount++;
-                   }
-
-                   /* XXX Note that the above handles properly the German
-                    * sharp ss in the pattern matching ss in the string.  But
-                    * it doesn't handle properly cases where the string
-                    * contains say 'LIGATURE ff' and the pattern is 'f+'.
-                    * This would require, say, a new function or revised
-                    * interface to foldEQ_utf8(), in which the maximum number
-                    * of characters to match could be passed and it would
-                    * return how many actually did.  This is just one of many
-                    * cases where multi-char folds don't work properly, and so
-                    * the fix is being deferred */
-               }
-               break;
-           default:
-               Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
-           }
-       }
         break;
      case ANYOF:
         if (utf8_target) {
@@ -6302,11 +6301,19 @@ S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n,
  
      /* If the bitmap didn't (or couldn't) match, and something outside the
       * bitmap could match, try that */
-    if (!match && (utf8_target || (flags & ANYOF_UNICODE))) {
-       if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
-           match = TRUE;
+    if (!match) {
+       if (utf8_target && (flags & ANYOF_UNICODE_ALL)) {
+           if (c >= 256
+               || ((flags & ANYOF_FOLD) /* Latin1 char that has a non-Latin1 fold
+                                           should match */
+                   && _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(c)))
+           {
+               match = TRUE;
+           }
         }
-       else {
+       if (!match && ((flags & ANYOF_NONBITMAP_NON_UTF8)
+                      || (utf8_target && flags & ANYOF_UTF8)))
+       {
             AV *av;
             SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
  
@@ -6344,50 +6351,94 @@ S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n,
                             match = TRUE;
                         }
                         else {
-                           SV** listp;
-
-                            /* Consider "k" =~ /[K]/i.  The line above would
-                             * have just folded the 'k' to itself, and that
-                             * isn't going to match 'K'.  So we look through
-                             * the closure of everything that folds to 'k'.
-                             * That will find the 'K'.  Initialize the list, if
-                             * necessary */
-                           if (! PL_utf8_foldclosures) {
-
-                               /* If the folds haven't been read in, call a fold
-                            * function to force that */
-                               if (! PL_utf8_tofold) {
-                                   U8 dummy[UTF8_MAXBYTES+1];
-                                   STRLEN dummy_len;
-                                   to_utf8_fold((U8*) "A", dummy, &dummy_len);
-                               }
-                               PL_utf8_foldclosures =
-                                       _swash_inversion_hash(PL_utf8_tofold);
+                           /* In a few cases, the fold of an above-Latin1 char
+                            * is in the Latin1 range, and hence may be in the
+                            * bitmap */
+                           if (UTF8_IS_INVARIANT(*folded)
+                               && ANYOF_BITMAP_TEST(n, UNI_TO_NATIVE(*folded)))
+                           {
+                               match = TRUE;
                             }
+                           else if (UTF8_IS_DOWNGRADEABLE_START(*folded)
+                                    && ANYOF_BITMAP_TEST(n,
+                                         UNI_TO_NATIVE(
+                                            TWO_BYTE_UTF8_TO_UNI(folded[0],
+                                                                  folded[1]))))
+                           { /* Since the fold comes from internally
+                              * generated data, we can safely assume it is
+                              * valid utf8 in the test above */
  
-                            /* The data structure is a hash with the keys every
-                             * character that is folded to, like 'k', and the
-                             * values each an array of everything that folds to
-                             * its key.  e.g. [ 'k', 'K', KELVIN_SIGN ] */
-                           if ((listp = hv_fetch(PL_utf8_foldclosures,
-                                           (char *) folded, foldlen, FALSE)))
-                           {
-                               AV* list = (AV*) *listp;
-                               IV i;
-                               for (i = 0; i <= av_len(list); i++) {
-                                   SV** try_p = av_fetch(list, i, FALSE);
-                                   if (try_p == NULL) {
-                                       Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
+                               match = TRUE;
+                           }
+                            if (! match) {
+                               SV** listp;
+
+                               /* Consider "k" =~ /[K]/i.  The line above
+                                * would have just folded the 'k' to itself,
+                                * and that isn't going to match 'K'.  So we
+                                * look through the closure of everything that
+                                * folds to 'k'.  That will find the 'K'.
+                                * Initialize the list, if necessary */
+                               if (! PL_utf8_foldclosures) {
+
+                                   /* If the folds haven't been read in, call a
+                                   * fold function to force that */
+                                   if (! PL_utf8_tofold) {
+                                       U8 dummy[UTF8_MAXBYTES+1];
+                                       STRLEN dummy_len;
+                                       to_utf8_fold((U8*) "A",
+                                                           dummy, &dummy_len);
                                     }
-                                   /* Don't have to worry about embeded nulls
-                                    * since NULL isn't folded or foldable */
-                                   if (swash_fetch(sw, (U8*) SvPVX(*try_p),1)) {
-                                       match = TRUE;
-                                       break;
+                                   PL_utf8_foldclosures =
+                                         _swash_inversion_hash(PL_utf8_tofold);
+                               }
+
+                               /* The data structure is a hash with the keys
+                                * every character that is folded to, like 'k',
+                                * and the values each an array of everything
+                                * that folds to its key.  e.g. [ 'k', 'K',
+                                * KELVIN_SIGN ] */
+                               if ((listp = hv_fetch(PL_utf8_foldclosures,
+                                             (char *) folded, foldlen, FALSE)))
+                               {
+                                   AV* list = (AV*) *listp;
+                                   IV i;
+                                   for (i = 0; i <= av_len(list); i++) {
+                                       SV** try_p = av_fetch(list, i, FALSE);
+                                       char* try_c;
+                                       if (try_p == NULL) {
+                                           Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
+                                       }
+                                       /* Don't have to worry about embedded
+                                        * nulls since NULL isn't folded or
+                                        * foldable */
+                                       try_c = SvPVX(*try_p);
+                                       if (UTF8_IS_INVARIANT(*try_c)
+                                           && ANYOF_BITMAP_TEST(n,
+                                                           UNI_TO_NATIVE(*try_c)))
+                                       {
+                                           match = TRUE;
+                                           break;
+                                       }
+                                       else if
+                                           (UTF8_IS_DOWNGRADEABLE_START(*try_c)
+                                            && ANYOF_BITMAP_TEST(n,
+                                            UNI_TO_NATIVE(
+                                               TWO_BYTE_UTF8_TO_UNI(try_c[0],
+                                                                    try_c[1]))))
+                                       {
+                                           match = TRUE;
+                                           break;
+                                       } else if (swash_fetch(sw,
+                                                               (U8*) try_c, 1))
+                                       {
+                                           match = TRUE;
+                                           break;
+                                       }
                                     }
                                 }
                             }
-                       }
+                        }
                     }
                 }