X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/6d351bf2e060da9a1c13a1f7c2deb014f74fe6b8..1f933721fdfb02e4d62d10a1a69032d38d4db05d:/regexec.c

diff --git a/regexec.c b/regexec.c
index d95b27a..375d4fd 100644
--- a/regexec.c
+++ b/regexec.c
@@ -1359,7 +1359,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
 	switch (OP(c)) {
 	case ANYOF:
 	    if (utf8_target) {
-		 REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
+		 REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_NONBITMAP) ||
 			  !UTF8_IS_INVARIANT((U8)s[0]) ?
 			  reginclass(prog, c, (U8*)s, 0, utf8_target) :
 			  REGINCLASS(prog, c, (U8*)s));
@@ -5761,33 +5761,85 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
     case CANY:
 	scan = loceol;
 	break;
+    case EXACT:
+	/* To get here, EXACTish nodes must have *byte* length == 1.  That
+	 * means they match only characters in the string that can be expressed
+	 * as a single byte.  For non-utf8 strings, that means a simple match.
+	 * For utf8 strings, the character matched must be an invariant, or
+	 * downgradable to a single byte.  The pattern's utf8ness is
+	 * irrelevant because, being a single byte, it either isn't utf8, or if
+	 * it is, it's an invariant */
+
+	c = (U8)*STRING(p);
+	assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
+
+	if (! utf8_target || UNI_IS_INVARIANT(c)) {
+	    while (scan < loceol && UCHARAT(scan) == c) {
+		scan++;
+	    }
+	}
+	else {
+
+	    /* Here, the string is utf8, and the pattern char is different
+	     * in utf8 than not, so can't compare them directly.  Outside the
+	     * loop, find the two utf8 bytes that represent c, and then
+	     * look for those in sequence in the utf8 string */
+	    U8 high = UTF8_TWO_BYTE_HI(c);
+	    U8 low = UTF8_TWO_BYTE_LO(c);
+	    loceol = PL_regeol;
+
+	    while (hardcount < max
+		    && scan + 1 < loceol
+		    && UCHARAT(scan) == high
+		    && UCHARAT(scan + 1) == low)
+	    {
+		scan += 2;
+		hardcount++;
+	    }
+	}
+	break;
     case EXACTFL:
 	PL_reg_flags |= RF_tainted;
 	/* FALL THROUGH */
-    case EXACT:
     case EXACTF:
-	/* To get here, EXACTish nodes must have *byte* length == 1.  That means
-	 * they match only characters in the string that can be expressed as a
-	 * single byte.  For non-utf8 strings, that means a simple match.  For
-	 * utf8 strings, the character matched must be an invariant, or
-	 * downgradable to a single byte.  The pattern's utf8ness is
-	 * irrelevant, as it must be a single byte, so either it isn't utf8, or
-	 * if it is it's an invariant */
+
+	/* The comments for the EXACT case apply as well to these fold ones */
 
 	c = (U8)*STRING(p);
 	assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
 
-	if ((! utf8_target) || UNI_IS_INVARIANT(c)) {
+	if (utf8_target) { /* Use full Unicode fold matching */
+
+	    /* For the EXACTFL case, it doesn't really make sense to compare
+	     * locale and utf8, but it is the best we can do.  The documents
+	     * warn against mixing them */
+
+	    char *tmpeol = loceol;
+	    while (hardcount < max
+		    && foldEQ_utf8(scan, &tmpeol, 0, utf8_target,
+				    STRING(p), NULL, 1, UTF_PATTERN))
+	    {
+		scan = tmpeol;
+		tmpeol = loceol;
+		hardcount++;
+	    }
+
+	    /* XXX Note that the above handles properly the German sharp s in
+	     * the pattern matching ss in the string.  But it doesn't handle
+	     * properly cases where the string contains say 'LIGATURE ff' and
+	     * the pattern is 'f+'.  This would require, say, a new function or
+	     * revised interface to foldEQ_utf8(), in which the maximum number
+	     * of characters to match could be passed and it would return how
+	     * many actually did.  This is just one of many cases where
+	     * multi-char folds don't work properly, and so the fix is being
+	     * deferred */
+	}
+	else {
 
-	    /* Here, the string isn't utf8, or the character in the EXACT
-	     * node is the same in utf8 as not, so can just do equality.
-	     * Each matching char must be 1 byte long */
+	    /* Here, the string isn't utf8; and either the pattern isn't utf8
+	     * or c is an invariant, so its utf8ness doesn't affect c.  Can
+	     * just do simple comparisons for exact or fold matching. */
 	    switch (OP(p)) {
-	    case EXACT:
-		while (scan < loceol && UCHARAT(scan) == c) {
-		    scan++;
-		}
-		break;
 	    case EXACTF:
 		while (scan < loceol &&
 		    (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
@@ -5806,61 +5858,6 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
 		Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
 	    }
 	}
-	else {
-
-	    /* Here, the string is utf8, and the pattern char is different
-	     * in utf8 than not.  */
-
-	    switch (OP(p)) {
-	    case EXACT:
-		{
-		    /* Fastest to find the two utf8 bytes that represent c, and
-		     * then look for those in sequence in the utf8 string */
-		    U8 high = UTF8_TWO_BYTE_HI(c);
-		    U8 low = UTF8_TWO_BYTE_LO(c);
-		    loceol = PL_regeol;
-
-		    while (hardcount < max
-			   && scan + 1 < loceol
-			   && UCHARAT(scan) == high
-			   && UCHARAT(scan + 1) == low)
-		    {
-			scan += 2;
-			hardcount++;
-		    }
-		}
-		break;
-	    case EXACTFL:   /* Doesn't really make sense, but is best we can
-			       do.  The documents warn against mixing locale
-			       and utf8 */
-	    case EXACTF:
-		{   /* utf8 string, so use utf8 foldEQ */
-		    char *tmpeol = loceol;
-		    while (hardcount < max
-			   && foldEQ_utf8(scan, &tmpeol, 0, utf8_target,
-				          STRING(p), NULL, 1, UTF_PATTERN))
-		    {
-			scan = tmpeol;
-			tmpeol = loceol;
-			hardcount++;
-		    }
-
-		    /* XXX Note that the above handles properly the German
-		     * sharp ss in the pattern matching ss in the string.  But
-		     * it doesn't handle properly cases where the string
-		     * contains say 'LIGATURE ff' and the pattern is 'f+'.
-		     * This would require, say, a new function or revised
-		     * interface to foldEQ_utf8(), in which the maximum number
-		     * of characters to match could be passed and it would
-		     * return how many actually did.  This is just one of many
-		     * cases where multi-char folds don't work properly, and so
-		     * the fix is being deferred */
-		}
-		break;
-	    default:
-		Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
-	    }
-	}
 	break;
     case ANYOF:
 	if (utf8_target) {
@@ -6260,7 +6257,7 @@ S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n,
 		match = TRUE;
 	}
 	
-	if (!match && (flags & ANYOF_CLASS)) {
+	if (!match && (flags & ANYOF_CLASS) && ANYOF_CLASS_TEST_ANY_SET(n)) {
 	    PL_reg_flags |= RF_tainted;
 	    if (
 		(ANYOF_CLASS_TEST(n, ANYOF_ALNUM)   &&  isALNUM_LC(c))  ||
@@ -6302,11 +6299,13 @@ S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n,
 
     /* If the bitmap didn't (or couldn't) match, and something outside the
      * bitmap could match, try that */
-    if (!match && (utf8_target || (flags & ANYOF_UNICODE))) {
+    if (!match) {
 	if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
 	    match = TRUE;
 	}
-	else {
+	else if ((flags & ANYOF_NONBITMAP_NON_UTF8)
+		 || (utf8_target && flags & ANYOF_UTF8))
+	{
 	    AV *av;
 	    SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
 
@@ -6335,12 +6334,103 @@ S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n,
 			}
 		    }
 		    if (!match) {
-		        U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
+		        U8 folded[UTF8_MAXBYTES_CASE+1];
 
-			STRLEN tmplen;
-			to_utf8_fold(utf8_p, tmpbuf, &tmplen);
-			if (swash_fetch(sw, tmpbuf, 1))
+			/* See if the folded version matches */
+			STRLEN foldlen;
+			to_utf8_fold(utf8_p, folded, &foldlen);
+			if (swash_fetch(sw, folded, 1)) {   /* 1 => is utf8 */
 			    match = TRUE;
+			}
+			else {
+			    /* In a few cases, the fold of an above-Latin1 char
+			     * is in the Latin1 range, and hence may be in the
+			     * bitmap */
+			    if (UTF8_IS_INVARIANT(*folded)
+				&& ANYOF_BITMAP_TEST(n, UNI_TO_NATIVE(*folded)))
+			    {
+				match = TRUE;
+			    }
+			    else if (UTF8_IS_DOWNGRADEABLE_START(*folded)
+				     && ANYOF_BITMAP_TEST(n,
+					  UNI_TO_NATIVE(
+					     TWO_BYTE_UTF8_TO_UNI(folded[0],
+							           folded[1]))))
+			    { /* Since the fold comes from internally
+			       * generated data, we can safely assume it is
+			       * valid utf8 in the test above */
+
+				match = TRUE;
+			    }
+                            if (! match) {
+				SV** listp;
+
+				/* Consider "k" =~ /[K]/i.  The line above
+				 * would have just folded the 'k' to itself,
+				 * and that isn't going to match 'K'.  So we
+				 * look through the closure of everything that
+				 * folds to 'k'.  That will find the 'K'.
+				 * Initialize the list, if necessary */
+				if (! PL_utf8_foldclosures) {
+
+				    /* If the folds haven't been read in, call a
+				    * fold function to force that */
+				    if (! PL_utf8_tofold) {
+					U8 dummy[UTF8_MAXBYTES+1];
+					STRLEN dummy_len;
+					to_utf8_fold((U8*) "A",
+							    dummy, &dummy_len);
+				    }
+				    PL_utf8_foldclosures =
+					  _swash_inversion_hash(PL_utf8_tofold);
+				}
+
+				/* The data structure is a hash with the keys
+				 * every character that is folded to, like 'k',
+				 * and the values each an array of everything
+				 * that folds to its key.  e.g. [ 'k', 'K',
+				 * KELVIN_SIGN ] */
+				if ((listp = hv_fetch(PL_utf8_foldclosures,
+					      (char *) folded, foldlen, FALSE)))
+				{
+				    AV* list = (AV*) *listp;
+				    IV i;
+				    for (i = 0; i <= av_len(list); i++) {
+					SV** try_p = av_fetch(list, i, FALSE);
+					char* try_c;
+					if (try_p == NULL) {
+					    Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
+					}
+					/* Don't have to worry about embedded
+					 * NULs since NUL isn't folded or
+					 * foldable */
+					try_c = SvPVX(*try_p);
+					if (UTF8_IS_INVARIANT(*try_c)
+					    && ANYOF_BITMAP_TEST(n,
+							    UNI_TO_NATIVE(*try_c)))
+					{
+					    match = TRUE;
+					    break;
+					}
+					else if
+					    (UTF8_IS_DOWNGRADEABLE_START(*try_c)
+					     && ANYOF_BITMAP_TEST(n,
+					     UNI_TO_NATIVE(
+						TWO_BYTE_UTF8_TO_UNI(try_c[0],
+								     try_c[1]))))
+					{
+					    match = TRUE;
+					    break;
+					} else if (swash_fetch(sw,
+								(U8*) try_c, 1))
+					{
+					    match = TRUE;
+					    break;
+					}
+				    }
+				}
+			    }
+                        }
 		    }
 		}