This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regen/regcharclass.pl: Generate macros for multi-char fold sequences
[perl5.git] / regcomp.c
index 61b52c9..8cef832 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -91,6 +91,12 @@ extern const struct regexp_engine my_reg_engine;
 #include "inline_invlist.c"
 #include "unicode_constants.h"
 
+#ifdef HAS_ISBLANK
+#   define hasISBLANK 1
+#else
+#   define hasISBLANK 0
+#endif
+
 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
 
@@ -291,8 +297,8 @@ typedef struct RExC_state_t {
     string can occur infinitely far to the right.
   
   - minlenp
-    A pointer to the minimum length of the pattern that the string 
-    was found inside. This is important as in the case of positive 
+    A pointer to the minimum number of characters of the pattern that the
+    string was found inside. This is important as in the case of positive
     lookahead or positive lookbehind we can have multiple patterns 
     involved. Consider
     
@@ -2593,9 +2599,9 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode
  * these get optimized out
  *
  * If there are problematic code sequences, *min_subtract is set to the delta
- * that the minimum size of the node can be less than its actual size.  And,
- * the node type of the result is changed to reflect that it contains these
- * sequences.
+ * number of characters that the minimum size of the node can be less than its
+ * actual size.  And, the node type of the result is changed to reflect that it
+ * contains these sequences.
  *
  * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
  * and contains LATIN SMALL LETTER SHARP S
@@ -2812,15 +2818,12 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b
             * U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
              *
             * This means that in case-insensitive matching (or "loose
-            * matching", as Unicode calls it), an EXACTF of length six (the
-            * UTF-8 encoded byte length of the above casefolded versions) can
-            * match a target string of length two (the byte length of UTF-8
-            * encoded U+0390 or U+03B0).  This would rather mess up the
-            * minimum length computation.  (there are other code points that
-            * also fold to these two sequences, but the delta is smaller)
+            * matching", as Unicode calls it), an EXACTF of length 3 chars can
+             * match a target string of length 1 char.  This would rather mess
+             * up the minimum length computation.
             *
             * If these sequences are found, the minimum length is decreased by
-            * four (six minus two).
+            * two.
             *
             * Similarly, 'ss' may match the single char and byte LATIN SMALL
             * LETTER SHARP S.  We decrease the min length by 1 for each
@@ -2882,7 +2885,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b
                            break;
                        }
                      greek_sequence:
-                       *min_subtract += 4;
+                       *min_subtract += 2;
 
                        /* This requires special handling by trie's, so change
                         * the node type to indicate this.  If EXACTFA and
@@ -3025,7 +3028,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                        /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
 {
     dVAR;
-    I32 min = 0, pars = 0, code;
+    I32 min = 0;    /* There must be at least this number of characters to match */
+    I32 pars = 0, code;
     regnode *scan = *scanp, *next;
     I32 delta = 0;
     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
@@ -3052,9 +3056,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
 
   fake_study_recurse:
     while ( scan && OP(scan) != END && scan < last ){
-        UV min_subtract = 0;    /* How much to subtract from the minimum node
-                                   length to get a real minimum (because the
-                                   folded version may be shorter) */
+        UV min_subtract = 0;    /* How mmany chars to subtract from the minimum
+                                   node length to get a real minimum (because
+                                   the folded version may be shorter) */
        bool has_exactf_sharp_s = FALSE;
        /* Peephole optimizer: */
        DEBUG_STUDYDATA("Peep:", data,depth);
@@ -3419,7 +3423,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                                      * trietype so we can turn them into a trie. If/when we
                                      * allow NOTHING to start a trie sequence this condition will be
                                      * required, and it isn't expensive so we leave it in for now. */
-                                    if ( trietype != NOTHING )
+                                    if ( trietype && trietype != NOTHING )
                                         make_trie( pRExC_state,
                                                 startbranch, first, cur, tail, count,
                                                 trietype, depth+1 );
@@ -3450,7 +3454,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
 
                         });
-                        if ( last ) {
+                        if ( last && trietype ) {
                             if ( trietype != NOTHING ) {
                                 /* the last branch of the sequence was part of a trie,
                                  * so we have to construct it here outside of the loop
@@ -3666,9 +3670,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
            }
            min += l - min_subtract;
-            if (min < 0) {
-                min = 0;
-            }
+            assert (min >= 0);
             delta += min_subtract;
            if (flags & SCF_DO_SUBSTR) {
                data->pos_min += l - min_subtract;
@@ -4209,7 +4211,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                case ALNUM:
                    if (flags & SCF_DO_STCLASS_AND) {
                        if (!(data->start_class->flags & ANYOF_LOCALE)) {
-                           ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
+                           ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NWORDCHAR);
                             if (OP(scan) == ALNUMU) {
                                 for (value = 0; value < 256; value++) {
                                     if (!isWORDCHAR_L1(value)) {
@@ -4227,7 +4229,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                    }
                    else {
                        if (data->start_class->flags & ANYOF_LOCALE)
-                           ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
+                           ANYOF_CLASS_SET(data->start_class,ANYOF_WORDCHAR);
 
                        /* Even if under locale, set the bits for non-locale
                         * in case it isn't a true locale-node.  This will
@@ -4250,7 +4252,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                case NALNUM:
                    if (flags & SCF_DO_STCLASS_AND) {
                        if (!(data->start_class->flags & ANYOF_LOCALE)) {
-                           ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
+                           ANYOF_CLASS_CLEAR(data->start_class,ANYOF_WORDCHAR);
                             if (OP(scan) == NALNUMU) {
                                 for (value = 0; value < 256; value++) {
                                     if (isWORDCHAR_L1(value)) {
@@ -4268,7 +4270,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                    }
                    else {
                        if (data->start_class->flags & ANYOF_LOCALE)
-                           ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
+                           ANYOF_CLASS_SET(data->start_class,ANYOF_NWORDCHAR);
 
                        /* Even if under locale, set the bits for non-locale in
                         * case it isn't a true locale-node.  This will create
@@ -11118,7 +11120,7 @@ S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
                    switch (skip) {
                    case 4:
                        if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
-                           namedclass = ANYOF_ALNUM;
+                           namedclass = ANYOF_WORDCHAR;
                        break;
                    case 5:
                        /* Names all of length 5.  */
@@ -11559,8 +11561,8 @@ parseit:
             * A similar issue a little bit later when switching on
             * namedclass. --jhi */
            switch ((I32)value) {
-           case 'w':   namedclass = ANYOF_ALNUM;       break;
-           case 'W':   namedclass = ANYOF_NALNUM;      break;
+           case 'w':   namedclass = ANYOF_WORDCHAR;    break;
+           case 'W':   namedclass = ANYOF_NWORDCHAR;   break;
            case 's':   namedclass = ANYOF_SPACE;       break;
            case 'S':   namedclass = ANYOF_NSPACE;      break;
            case 'd':   namedclass = ANYOF_DIGIT;       break;
@@ -11854,32 +11856,83 @@ parseit:
                         runtime_posix_matches_above_Unicode);
                    break;
                case ANYOF_ASCII:
+#ifdef HAS_ISASCII
                    if (LOC) {
                        ANYOF_CLASS_SET(ret, namedclass);
                    }
-                    else {
+                    else
+#endif  /* Not isascii(); just use the hard-coded definition for it */
                         _invlist_union(posixes, PL_ASCII, &posixes);
-                    }
                    break;
                case ANYOF_NASCII:
+#ifdef HAS_ISASCII
                    if (LOC) {
                        ANYOF_CLASS_SET(ret, namedclass);
                    }
                     else {
+#endif
                         _invlist_union_complement_2nd(posixes,
                                                     PL_ASCII, &posixes);
                         if (DEPENDS_SEMANTICS) {
                             ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
                         }
+#ifdef HAS_ISASCII
                     }
+#endif
                    break;
                case ANYOF_BLANK:
-                    DO_POSIX(ret, namedclass, posixes,
+                    if (hasISBLANK || ! LOC) {
+                        DO_POSIX(ret, namedclass, posixes,
                                             PL_PosixBlank, PL_XPosixBlank);
+                    }
+                    else { /* There is no isblank() and we are in locale:  We
+                              use the ASCII range and the above-Latin1 range
+                              code points */
+                        SV* scratch_list = NULL;
+
+                        /* Include all above-Latin1 blanks */
+                        _invlist_intersection(PL_AboveLatin1,
+                                              PL_XPosixBlank,
+                                              &scratch_list);
+                        /* Add it to the running total of posix classes */
+                        if (! posixes) {
+                            posixes = scratch_list;
+                        }
+                        else {
+                            _invlist_union(posixes, scratch_list, &posixes);
+                            SvREFCNT_dec(scratch_list);
+                        }
+                        /* Add the ASCII-range blanks to the running total. */
+                        _invlist_union(posixes, PL_PosixBlank, &posixes);
+                    }
                    break;
                case ANYOF_NBLANK:
-                    DO_N_POSIX(ret, namedclass, posixes,
-                                            PL_PosixBlank, PL_XPosixBlank);
+                    if (hasISBLANK || ! LOC) {
+                        DO_N_POSIX(ret, namedclass, posixes,
+                                                PL_PosixBlank, PL_XPosixBlank);
+                    }
+                    else { /* There is no isblank() and we are in locale */
+                        SV* scratch_list = NULL;
+
+                        /* Include all above-Latin1 non-blanks */
+                        _invlist_subtract(PL_AboveLatin1, PL_XPosixBlank, &scratch_list);
+
+                        /* Add them to the running total of posix classes */
+                        _invlist_subtract(PL_AboveLatin1, PL_XPosixBlank, &scratch_list);
+                        if (! posixes) {
+                            posixes = scratch_list;
+                        }
+                        else {
+                            _invlist_union(posixes, scratch_list, &posixes);
+                            SvREFCNT_dec(scratch_list);
+                        }
+
+                        /* Get the list of all non-ASCII-blanks in Latin 1, and
+                         * add them to the running total */
+                        _invlist_subtract(PL_Latin1, PL_PosixBlank, &scratch_list);
+                        _invlist_union(posixes, scratch_list, &posixes);
+                        SvREFCNT_dec(scratch_list);
+                    }
                    break;
                case ANYOF_CNTRL:
                     DO_POSIX(ret, namedclass, posixes,
@@ -12015,11 +12068,11 @@ parseit:
                    }
                    break;
                }
-               case ANYOF_ALNUM:   /* Really is 'Word' */
+               case ANYOF_WORDCHAR:
                    DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
                             PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
                    break;
-               case ANYOF_NALNUM:
+               case ANYOF_NWORDCHAR:
                    DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
                             PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv,
                             runtime_posix_matches_above_Unicode);
@@ -12150,10 +12203,10 @@ parseit:
                  * modifier to the regex.  We first calculate the base node
                  * type, and if it should be inverted */
 
-                case ANYOF_NALNUM:
+                case ANYOF_NWORDCHAR:
                     invert = ! invert;
                     /* FALLTHROUGH */
-                case ANYOF_ALNUM:
+                case ANYOF_WORDCHAR:
                     op = ALNUM;
                     goto join_charset_classes;