This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Extract _cmd_l_range .
[perl5.git] / regexec.c
index 8576e3b..1860909 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -436,19 +436,21 @@ S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
     switch ((_char_class_number) classnum) {
         case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
         case _CC_ENUM_ALPHA:     return isALPHA_LC(character);
+        case _CC_ENUM_ASCII:     return isASCII_LC(character);
+        case _CC_ENUM_BLANK:     return isBLANK_LC(character);
+        case _CC_ENUM_CASED:     return isLOWER_LC(character)
+                                        || isUPPER_LC(character);
+        case _CC_ENUM_CNTRL:     return isCNTRL_LC(character);
         case _CC_ENUM_DIGIT:     return isDIGIT_LC(character);
         case _CC_ENUM_GRAPH:     return isGRAPH_LC(character);
         case _CC_ENUM_LOWER:     return isLOWER_LC(character);
         case _CC_ENUM_PRINT:     return isPRINT_LC(character);
+        case _CC_ENUM_PSXSPC:    return isPSXSPC_LC(character);
         case _CC_ENUM_PUNCT:     return isPUNCT_LC(character);
+        case _CC_ENUM_SPACE:     return isSPACE_LC(character);
         case _CC_ENUM_UPPER:     return isUPPER_LC(character);
         case _CC_ENUM_WORDCHAR:  return isWORDCHAR_LC(character);
-        case _CC_ENUM_SPACE:     return isSPACE_LC(character);
-        case _CC_ENUM_BLANK:     return isBLANK_LC(character);
         case _CC_ENUM_XDIGIT:    return isXDIGIT_LC(character);
-        case _CC_ENUM_CNTRL:     return isCNTRL_LC(character);
-        case _CC_ENUM_PSXSPC:    return isPSXSPC_LC(character);
-        case _CC_ENUM_ASCII:     return isASCII_LC(character);
         default:    /* VERTSPACE should never occur in locales */
             Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
     }
@@ -1617,19 +1619,19 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
     }
     case BOUNDL:
         RXp_MATCH_TAINTED_on(prog);
-        FBC_BOUND(isALNUM_LC,
-                  isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
-                  isALNUM_LC_utf8((U8*)s));
+        FBC_BOUND(isWORDCHAR_LC,
+                  isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(tmp)),
+                  isWORDCHAR_LC_utf8((U8*)s));
         break;
     case NBOUNDL:
         RXp_MATCH_TAINTED_on(prog);
-        FBC_NBOUND(isALNUM_LC,
-                   isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
-                   isALNUM_LC_utf8((U8*)s));
+        FBC_NBOUND(isWORDCHAR_LC,
+                   isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(tmp)),
+                   isWORDCHAR_LC_utf8((U8*)s));
         break;
     case BOUND:
         FBC_BOUND(isWORDCHAR,
-                  isALNUM_uni(tmp),
+                  isWORDCHAR_uni(tmp),
                   cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
         break;
     case BOUNDA:
@@ -1639,7 +1641,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
         break;
     case NBOUND:
         FBC_NBOUND(isWORDCHAR,
-                   isALNUM_uni(tmp),
+                   isWORDCHAR_uni(tmp),
                    cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
         break;
     case NBOUNDA:
@@ -1649,12 +1651,12 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
         break;
     case BOUNDU:
         FBC_BOUND(isWORDCHAR_L1,
-                  isALNUM_uni(tmp),
+                  isWORDCHAR_uni(tmp),
                   cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
         break;
     case NBOUNDU:
         FBC_NBOUND(isWORDCHAR_L1,
-                   isALNUM_uni(tmp),
+                   isWORDCHAR_uni(tmp),
                    cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
         break;
     case LNBREAK:
@@ -4209,7 +4211,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                    ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
                }
                if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
-                   ln = isALNUM_uni(ln);
+                   ln = isWORDCHAR_uni(ln);
                     if (NEXTCHR_IS_EOS)
                         n = 0;
                     else {
@@ -4219,8 +4221,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                     }
                }
                else {
-                   ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
-                   n = NEXTCHR_IS_EOS ? 0 : isALNUM_LC_utf8((U8*)locinput);
+                   ln = isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(ln));
+                   n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC_utf8((U8*)locinput);
                }
            }
            else {
@@ -4244,12 +4246,12 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                        n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_L1(nextchr);
                        break;
                    case REGEX_LOCALE_CHARSET:
-                       ln = isALNUM_LC(ln);
-                       n = NEXTCHR_IS_EOS ? 0 : isALNUM_LC(nextchr);
+                       ln = isWORDCHAR_LC(ln);
+                       n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC(nextchr);
                        break;
                    case REGEX_DEPENDS_CHARSET:
-                       ln = isALNUM(ln);
-                       n = NEXTCHR_IS_EOS ? 0 : isALNUM(nextchr);
+                       ln = isWORDCHAR(ln);
+                       n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR(nextchr);
                        break;
                    case REGEX_ASCII_RESTRICTED_CHARSET:
                    case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
@@ -7330,7 +7332,17 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const
                  * will be 1, so the exclusive or will reverse things, so we
                  * are testing for \W.  On the third iteration, 'to_complement'
                  * will be 0, and we would be testing for \s; the fourth
-                 * iteration would test for \S, etc. */
+                 * iteration would test for \S, etc.
+                 *
+                 * Note that this code assumes that all the classes are closed
+                 * under folding.  For example, if a character matches \w, then
+                 * its fold does too; and vice versa.  This should be true for
+                 * any well-behaved locale for all the currently defined Posix
+                 * classes, except for :lower: and :upper:, which are handled
+                 * by the pseudo-class :cased: which matches if either of the
+                 * other two does.  To get rid of this assumption, an outer
+                 * loop could be used below to iterate over both the source
+                 * character, and its fold (if different) */
 
                 int count = 0;
                 int to_complement = 0;