regcomp.c: Have a subroutine do the work

[perl5.git] / regcomp.c
diff --git a/regcomp.c b/regcomp.c

index b8fcb3b..197751d 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -11029,171 +11029,6 @@ S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
         }                                                                  \
      }
  
-STATIC U8
-S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, SV** invlist_ptr, AV** alternate_ptr)
-{
-
-    /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
-     * Locale folding is done at run-time, so this function should not be
-     * called for nodes that are for locales.
-     *
-     * This function sets the bit corresponding to the fold of the input
-     * 'value', if not already set.  The fold of 'f' is 'F', and the fold of
-     * 'F' is 'f'.
-     *
-     * It also knows about the characters that are in the bitmap that have
-     * folds that are matchable only outside it, and sets the appropriate lists
-     * and flags.
-     *
-     * It returns the number of bits that actually changed from 0 to 1 */
-
-    U8 stored = 0;
-    U8 fold;
-
-    PERL_ARGS_ASSERT_SET_REGCLASS_BIT_FOLD;
-
-    fold = (AT_LEAST_UNI_SEMANTICS) ? PL_fold_latin1[value]
-                                    : PL_fold[value];
-
-    /* It assumes the bit for 'value' has already been set */
-    if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
-        ANYOF_BITMAP_SET(node, fold);
-        stored++;
-    }
-    if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value) && (! isASCII(value) || ! MORE_ASCII_RESTRICTED)) {
-       /* Certain Latin1 characters have matches outside the bitmap.  To get
-        * here, 'value' is one of those characters.   None of these matches is
-        * valid for ASCII characters under /aa, which have been excluded by
-        * the 'if' above.  The matches fall into three categories:
-        * 1) They are singly folded-to or -from an above 255 character, as
-        *    LATIN SMALL LETTER Y WITH DIAERESIS and LATIN CAPITAL LETTER Y
-        *    WITH DIAERESIS;
-        * 2) They are part of a multi-char fold with another character in the
-        *    bitmap, only LATIN SMALL LETTER SHARP S => "ss" fits that bill;
-        * 3) They are part of a multi-char fold with a character not in the
-        *    bitmap, such as various ligatures.
-        * We aren't dealing fully with multi-char folds, except we do deal
-        * with the pattern containing a character that has a multi-char fold
-        * (not so much the inverse).
-        * For types 1) and 3), the matches only happen when the target string
-        * is utf8; that's not true for 2), and we set a flag for it.
-        *
-        * The code below adds to the passed in inversion list the single fold
-        * closures for 'value'.  The values are hard-coded here so that an
-        * innocent-looking character class, like /[ks]/i won't have to go out
-        * to disk to find the possible matches.  XXX It would be better to
-        * generate these via regen, in case a new version of the Unicode
-        * standard adds new mappings, though that is not really likely. */
-       switch (value) {
-           case 'k':
-           case 'K':
-               /* KELVIN SIGN */
-               *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212A);
-               break;
-           case 's':
-           case 'S':
-               /* LATIN SMALL LETTER LONG S */
-               *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x017F);
-               break;
-           case MICRO_SIGN:
-               *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
-                                                GREEK_SMALL_LETTER_MU);
-               *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
-                                                GREEK_CAPITAL_LETTER_MU);
-               break;
-           case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
-           case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
-               /* ANGSTROM SIGN */
-               *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212B);
-               if (DEPENDS_SEMANTICS) {    /* See DEPENDS comment below */
-                   *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
-                                                    PL_fold_latin1[value]);
-               }
-               break;
-           case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
-               *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
-                                       LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
-               break;
-           case LATIN_SMALL_LETTER_SHARP_S:
-               *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
-                                       LATIN_CAPITAL_LETTER_SHARP_S);
-
-               /* Under /a, /d, and /u, this can match the two chars "ss" */
-               if (! MORE_ASCII_RESTRICTED) {
-                   add_alternate(alternate_ptr, (U8 *) "ss", 2);
-
-                   /* And under /u or /a, it can match even if the target is
-                    * not utf8 */
-                   if (AT_LEAST_UNI_SEMANTICS) {
-                       ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
-                   }
-               }
-               break;
-           case 'F': case 'f':
-           case 'I': case 'i':
-           case 'L': case 'l':
-           case 'T': case 't':
-           case 'A': case 'a':
-           case 'H': case 'h':
-           case 'J': case 'j':
-           case 'N': case 'n':
-           case 'W': case 'w':
-           case 'Y': case 'y':
-                /* These all are targets of multi-character folds from code
-                 * points that require UTF8 to express, so they can't match
-                 * unless the target string is in UTF-8, so no action here is
-                 * necessary, as regexec.c properly handles the general case
-                 * for UTF-8 matching */
-               break;
-           default:
-               /* Use deprecated warning to increase the chances of this
-                * being output */
-               ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%x; please use the perlbug utility to report;", value);
-               break;
-       }
-    }
-    else if (DEPENDS_SEMANTICS
-           && ! isASCII(value)
-           && PL_fold_latin1[value] != value)
-    {
-          /* Under DEPENDS rules, non-ASCII Latin1 characters match their
-           * folds only when the target string is in UTF-8.  We add the fold
-           * here to the list of things to match outside the bitmap, which
-           * won't be looked at unless it is UTF8 (or else if something else
-           * says to look even if not utf8, but those things better not happen
-           * under DEPENDS semantics. */
-       *invlist_ptr = add_cp_to_invlist(*invlist_ptr, PL_fold_latin1[value]);
-    }
-
-    return stored;
-}
-
-
-PERL_STATIC_INLINE U8
-S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, SV** invlist_ptr, AV** alternate_ptr)
-{
-    /* This inline function sets a bit in the bitmap if not already set, and if
-     * appropriate, its fold, returning the number of bits that actually
-     * changed from 0 to 1 */
-
-    U8 stored;
-
-    PERL_ARGS_ASSERT_SET_REGCLASS_BIT;
-
-    if (ANYOF_BITMAP_TEST(node, value)) {   /* Already set */
-       return 0;
-    }
-
-    ANYOF_BITMAP_SET(node, value);
-    stored = 1;
-
-    if (FOLD && ! LOC) {       /* Locale folds aren't known until runtime */
-       stored += set_regclass_bit_fold(pRExC_state, node, value, invlist_ptr, alternate_ptr);
-    }
-
-    return stored;
-}
-
  STATIC void
  S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len)
  {
@@ -11260,22 +11095,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
       * string is in UTF-8.  (Because is under /d) */
      SV* depends_list = NULL;
  
-    /* The items that are to match that aren't stored in the bitmap, but are a
-     * result of things that are stored there.  This is the fold closure of
-     * such a character, either because it has DEPENDS semantics and shouldn't
-     * be matched unless the target string is utf8, or is a code point that is
-     * too large for the bit map, as for example, the fold of the MICRO SIGN is
-     * above 255.  This all is solely for performance reasons.  By having this
-     * code know the outside-the-bitmap folds that the bitmapped characters are
-     * involved with, we don't have to go out to disk to find the list of
-     * matches, unless the character class includes code points that aren't
-     * storable in the bit map.  That means that a character class with an 's'
-     * in it, for example, doesn't need to go out to disk to find everything
-     * that matches.  A 2nd list is used so that the 'nonbitmap' list is kept
-     * empty unless there is something whose fold we don't know about, and will
-     * have to go out to the disk to find. */
-    SV* l1_fold_invlist = NULL;
-
      /* List of multi-character folds that are matched by this node */
      AV* unicode_alternate  = NULL;
  #ifdef EBCDIC
@@ -12159,7 +11978,12 @@ parseit:
                  /* Here is an above Latin1 character.  We don't have the rules
                   * hard-coded for it.  First, get its fold */
                 f = _to_uni_fold_flags(j, foldbuf, &foldlen,
-                                    ((allow_full_fold) ? FOLD_FLAGS_FULL : 0);
+                                    ((allow_full_fold) ? FOLD_FLAGS_FULL : 0)
+                                    | ((LOC)
+                                        ? FOLD_FLAGS_LOCALE
+                                        : (MORE_ASCII_RESTRICTED)
+                                            ? FOLD_FLAGS_NOMIX_ASCII
+                                            : 0));
  
                 if (foldlen > (STRLEN)UNISKIP(f)) {
  
@@ -12177,21 +12001,9 @@ parseit:
                          * Latin1 range, tell the regex engine that this can
                          * match a non-utf8 target string.  */
                          while (loc < e) {
-
-                            /* Can't mix ascii with non- under /aa */
-                            if (MORE_ASCII_RESTRICTED
-                                && (isASCII(*loc) != isASCII(j)))
-                            {
-                                goto end_multi_fold;
-                            }
                              if (UTF8_IS_INVARIANT(*loc)
                                  || UTF8_IS_DOWNGRADEABLE_START(*loc))
                              {
-                                /* Can't mix above and below 256 under LOC
-                                 */
-                                if (LOC) {
-                                    goto end_multi_fold;
-                                }
                                  ANYOF_FLAGS(ret)
                                          |= ANYOF_NONBITMAP_NON_UTF8;
                                  break;
@@ -12200,7 +12012,6 @@ parseit:
                          }
  
                         add_alternate(&unicode_alternate, foldbuf, foldlen);
-                   end_multi_fold: ;
                     }
                 }
                  else {
@@ -12252,17 +12063,6 @@ parseit:
         SvREFCNT_dec(fold_intersection);
      }
  
-    /* Combine the two lists into one. */
-    if (l1_fold_invlist) {
-       if (nonbitmap) {
-           _invlist_union(nonbitmap, l1_fold_invlist, &nonbitmap);
-           SvREFCNT_dec(l1_fold_invlist);
-       }
-       else {
-           nonbitmap = l1_fold_invlist;
-       }
-    }
-
      /* And combine the result (if any) with any inversion list from properties.
       * The lists are kept separate up to now because we don't want to fold the
       * properties */