regen/regcharclass.pl: Generate macros for multi-char fold sequences

[perl5.git] / regcomp.c
diff --git a/regcomp.c b/regcomp.c

index 61b52c9..8cef832 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -91,6 +91,12 @@ extern const struct regexp_engine my_reg_engine;
  #include "inline_invlist.c"
  #include "unicode_constants.h"
  
+#ifdef HAS_ISBLANK
+#   define hasISBLANK 1
+#else
+#   define hasISBLANK 0
+#endif
+
  #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  
@@ -291,8 +297,8 @@ typedef struct RExC_state_t {
      string can occur infinitely far to the right.
    
    - minlenp
-    A pointer to the minimum length of the pattern that the string 
-    was found inside. This is important as in the case of positive 
+    A pointer to the minimum number of characters of the pattern that the
+    string was found inside. This is important as in the case of positive
      lookahead or positive lookbehind we can have multiple patterns 
      involved. Consider
      
@@ -2593,9 +2599,9 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode
   * these get optimized out
   *
   * If there are problematic code sequences, *min_subtract is set to the delta
- * that the minimum size of the node can be less than its actual size.  And,
- * the node type of the result is changed to reflect that it contains these
- * sequences.
+ * number of characters by which the minimum size of the node can be less than
+ * its actual size.  And, the node type of the result is changed to reflect
+ * that it contains these sequences.
   *
   * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
   * and contains LATIN SMALL LETTER SHARP S
@@ -2812,15 +2818,12 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b
              * U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
               *
              * This means that in case-insensitive matching (or "loose
-            * matching", as Unicode calls it), an EXACTF of length six (the
-            * UTF-8 encoded byte length of the above casefolded versions) can
-            * match a target string of length two (the byte length of UTF-8
-            * encoded U+0390 or U+03B0).  This would rather mess up the
-            * minimum length computation.  (there are other code points that
-            * also fold to these two sequences, but the delta is smaller)
+            * matching", as Unicode calls it), an EXACTF of length 3 chars can
+             * match a target string of length 1 char.  This would rather mess
+             * up the minimum length computation.
              *
              * If these sequences are found, the minimum length is decreased by
-            * four (six minus two).
+            * two.
              *
              * Similarly, 'ss' may match the single char and byte LATIN SMALL
              * LETTER SHARP S.  We decrease the min length by 1 for each
@@ -2882,7 +2885,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b
                             break;
                         }
                       greek_sequence:
-                       *min_subtract += 4;
+                       *min_subtract += 2;
  
                         /* This requires special handling by trie's, so change
                          * the node type to indicate this.  If EXACTFA and
@@ -3025,7 +3028,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
  {
      dVAR;
-    I32 min = 0, pars = 0, code;
+    I32 min = 0;    /* There must be at least this number of characters to match */
+    I32 pars = 0, code;
      regnode *scan = *scanp, *next;
      I32 delta = 0;
      int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
@@ -3052,9 +3056,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
  
    fake_study_recurse:
      while ( scan && OP(scan) != END && scan < last ){
-        UV min_subtract = 0;    /* How much to subtract from the minimum node
-                                   length to get a real minimum (because the
-                                   folded version may be shorter) */
+        UV min_subtract = 0;    /* How many chars to subtract from the minimum
+                                   node length to get a real minimum (because
+                                   the folded version may be shorter) */
         bool has_exactf_sharp_s = FALSE;
         /* Peephole optimizer: */
         DEBUG_STUDYDATA("Peep:", data,depth);
@@ -3419,7 +3423,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                                       * trietype so we can turn them into a trie. If/when we
                                       * allow NOTHING to start a trie sequence this condition will be
                                       * required, and it isn't expensive so we leave it in for now. */
-                                    if ( trietype != NOTHING )
+                                    if ( trietype && trietype != NOTHING )
                                          make_trie( pRExC_state,
                                                  startbranch, first, cur, tail, count,
                                                  trietype, depth+1 );
@@ -3450,7 +3454,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                                "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
  
                          });
-                        if ( last ) {
+                        if ( last && trietype ) {
                              if ( trietype != NOTHING ) {
                                  /* the last branch of the sequence was part of a trie,
                                   * so we have to construct it here outside of the loop
@@ -3666,9 +3670,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
             }
             min += l - min_subtract;
-            if (min < 0) {
-                min = 0;
-            }
+            assert (min >= 0);
              delta += min_subtract;
             if (flags & SCF_DO_SUBSTR) {
                 data->pos_min += l - min_subtract;
@@ -4209,7 +4211,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                 case ALNUM:
                     if (flags & SCF_DO_STCLASS_AND) {
                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
-                           ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
+                           ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NWORDCHAR);
                              if (OP(scan) == ALNUMU) {
                                  for (value = 0; value < 256; value++) {
                                      if (!isWORDCHAR_L1(value)) {
@@ -4227,7 +4229,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                     }
                     else {
                         if (data->start_class->flags & ANYOF_LOCALE)
-                           ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
+                           ANYOF_CLASS_SET(data->start_class,ANYOF_WORDCHAR);
  
                         /* Even if under locale, set the bits for non-locale
                          * in case it isn't a true locale-node.  This will
@@ -4250,7 +4252,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                 case NALNUM:
                     if (flags & SCF_DO_STCLASS_AND) {
                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
-                           ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
+                           ANYOF_CLASS_CLEAR(data->start_class,ANYOF_WORDCHAR);
                              if (OP(scan) == NALNUMU) {
                                  for (value = 0; value < 256; value++) {
                                      if (isWORDCHAR_L1(value)) {
@@ -4268,7 +4270,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                     }
                     else {
                         if (data->start_class->flags & ANYOF_LOCALE)
-                           ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
+                           ANYOF_CLASS_SET(data->start_class,ANYOF_NWORDCHAR);
  
                         /* Even if under locale, set the bits for non-locale in
                          * case it isn't a true locale-node.  This will create
@@ -11118,7 +11120,7 @@ S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
                     switch (skip) {
                     case 4:
                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
-                           namedclass = ANYOF_ALNUM;
+                           namedclass = ANYOF_WORDCHAR;
                         break;
                     case 5:
                         /* Names all of length 5.  */
@@ -11559,8 +11561,8 @@ parseit:
              * A similar issue a little bit later when switching on
              * namedclass. --jhi */
             switch ((I32)value) {
-           case 'w':   namedclass = ANYOF_ALNUM;       break;
-           case 'W':   namedclass = ANYOF_NALNUM;      break;
+           case 'w':   namedclass = ANYOF_WORDCHAR;    break;
+           case 'W':   namedclass = ANYOF_NWORDCHAR;   break;
             case 's':   namedclass = ANYOF_SPACE;       break;
             case 'S':   namedclass = ANYOF_NSPACE;      break;
             case 'd':   namedclass = ANYOF_DIGIT;       break;
@@ -11854,32 +11856,83 @@ parseit:
                          runtime_posix_matches_above_Unicode);
                     break;
                 case ANYOF_ASCII:
+#ifdef HAS_ISASCII
                     if (LOC) {
                         ANYOF_CLASS_SET(ret, namedclass);
                     }
-                    else {
+                    else
+#endif  /* Not isascii(); just use the hard-coded definition for it */
                          _invlist_union(posixes, PL_ASCII, &posixes);
-                    }
                     break;
                 case ANYOF_NASCII:
+#ifdef HAS_ISASCII
                     if (LOC) {
                         ANYOF_CLASS_SET(ret, namedclass);
                     }
                      else {
+#endif
                          _invlist_union_complement_2nd(posixes,
                                                      PL_ASCII, &posixes);
                          if (DEPENDS_SEMANTICS) {
                              ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
                          }
+#ifdef HAS_ISASCII
                      }
+#endif
                     break;
                 case ANYOF_BLANK:
-                    DO_POSIX(ret, namedclass, posixes,
+                    if (hasISBLANK || ! LOC) {
+                        DO_POSIX(ret, namedclass, posixes,
                                              PL_PosixBlank, PL_XPosixBlank);
+                    }
+                    else { /* There is no isblank() and we are in locale:  We
+                              use the ASCII range and the above-Latin1 range
+                              code points */
+                        SV* scratch_list = NULL;
+
+                        /* Include all above-Latin1 blanks */
+                        _invlist_intersection(PL_AboveLatin1,
+                                              PL_XPosixBlank,
+                                              &scratch_list);
+                        /* Add it to the running total of posix classes */
+                        if (! posixes) {
+                            posixes = scratch_list;
+                        }
+                        else {
+                            _invlist_union(posixes, scratch_list, &posixes);
+                            SvREFCNT_dec(scratch_list);
+                        }
+                        /* Add the ASCII-range blanks to the running total. */
+                        _invlist_union(posixes, PL_PosixBlank, &posixes);
+                    }
                     break;
                 case ANYOF_NBLANK:
-                    DO_N_POSIX(ret, namedclass, posixes,
-                                            PL_PosixBlank, PL_XPosixBlank);
+                    if (hasISBLANK || ! LOC) {
+                        DO_N_POSIX(ret, namedclass, posixes,
+                                                PL_PosixBlank, PL_XPosixBlank);
+                    }
+                    else { /* There is no isblank() and we are in locale */
+                        SV* scratch_list = NULL;
+
+                        /* Include all above-Latin1 non-blanks */
+                        _invlist_subtract(PL_AboveLatin1, PL_XPosixBlank, &scratch_list);
+
+                        /* Add them to the running total of posix classes */
+                        _invlist_subtract(PL_AboveLatin1, PL_XPosixBlank, &scratch_list);
+                        if (! posixes) {
+                            posixes = scratch_list;
+                        }
+                        else {
+                            _invlist_union(posixes, scratch_list, &posixes);
+                            SvREFCNT_dec(scratch_list);
+                        }
+
+                        /* Get the list of all non-ASCII-blanks in Latin 1, and
+                         * add them to the running total */
+                        _invlist_subtract(PL_Latin1, PL_PosixBlank, &scratch_list);
+                        _invlist_union(posixes, scratch_list, &posixes);
+                        SvREFCNT_dec(scratch_list);
+                    }
                     break;
                 case ANYOF_CNTRL:
                      DO_POSIX(ret, namedclass, posixes,
@@ -12015,11 +12068,11 @@ parseit:
                     }
                     break;
                 }
-               case ANYOF_ALNUM:   /* Really is 'Word' */
+               case ANYOF_WORDCHAR:
                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
                              PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
                     break;
-               case ANYOF_NALNUM:
+               case ANYOF_NWORDCHAR:
                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
                              PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv,
                              runtime_posix_matches_above_Unicode);
@@ -12150,10 +12203,10 @@ parseit:
                   * modifier to the regex.  We first calculate the base node
                   * type, and if it should be inverted */
  
-                case ANYOF_NALNUM:
+                case ANYOF_NWORDCHAR:
                      invert = ! invert;
                      /* FALLTHROUGH */
-                case ANYOF_ALNUM:
+                case ANYOF_WORDCHAR:
                      op = ALNUM;
                      goto join_charset_classes;