X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/287722f3d3da256273a55d7ef88b415ec6acbc00..f758bddff3d477c79b77132210e9d4249f1d83c9:/regcomp.c

diff --git a/regcomp.c b/regcomp.c
index 7f51d87..3d0121c 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -2522,6 +2522,9 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode
  * node type of the result is changed to reflect that it contains these
  * sequences.
  *
+ * And *has_exactf_sharp_s is set to indicate if the node is EXACTF and
+ * contains LATIN SMALL LETTER SHARP S
+ *
  * This is as good a place as any to discuss the design of handling these
  * problematic sequences.  It's been wrong in Perl for a very long time.  There
  * are three code points in Unicode whose folded lengths differ so much from
@@ -2592,8 +2595,9 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode
  *      cases are either 1-1 folds when no UTF-8 is involved; or is true by
  *      virtue of having this file pre-fold UTF-8 patterns.   I'm
  *      reluctant to try to change this assumption, so instead the code punts.
- *      Elsewhere in this file, each EXACTF node is examined for the sharp s.
- *      If found, a flag is set that later causes the optimizer in this file to
+ *      This routine examines EXACTF nodes for the sharp s, and returns whether
+ *      the node is an EXACTF node that contains one or not.  When it is true,
+ *      the caller sets a flag that later causes the optimizer in this file to
  *      not set values for the floating and fixed string lengths, and thus
  *      avoid the optimizer code in regexec.c that makes this invalid
  *      assumption.  Thus, there is no optimization based on string lengths for
@@ -2601,12 +2605,12 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode
  *      (which means the pattern isn't in UTF-8).
  */
 
-#define JOIN_EXACT(scan,min_change,flags) \
+#define JOIN_EXACT(scan,min_change,has_exactf_sharp_s, flags) \
     if (PL_regkind[OP(scan)] == EXACT) \
-        join_exact(pRExC_state,(scan),(min_change),(flags),NULL,depth+1)
+        join_exact(pRExC_state,(scan),(min_change),has_exactf_sharp_s, (flags),NULL,depth+1)
 
 STATIC U32
-S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, IV *min_change, U32 flags,regnode *val, U32 depth) {
+S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, IV *min_change, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
     /* Merge several consecutive EXACTish nodes into one. */
     regnode *n = regnext(scan);
     U32 stringok = 1;
@@ -2689,22 +2693,35 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, IV *min_change, U32
 #define UPSILON_D_T	GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS
 
     *min_change = 0;
+    *has_exactf_sharp_s = FALSE;
 
     /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
      * can now analyze for sequences of problematic code points.  (Prior to
      * this final joining, sequences could have been split over boundaries, and
      * hence missed).  The sequences only happen in folding */
     if (OP(scan) != EXACT) {
-        char *s, *t;
-        char * s0 = STRING(scan);
-        char * const s_end = s0 + STR_LEN(scan);
-
-	/* First we look at the sequences that can occur only in UTF-8 strings.
-	 * The sequences are of length 6 */
-	if (UTF && STR_LEN(scan) >= 6) {
+        U8 *s;
+        U8 * s0 = (U8*) STRING(scan);
+        U8 * const s_end = s0 + STR_LEN(scan);
+
+	/* The below is perhaps overboard, but this allows us to save a test
+	 * each time through the loop at the expense of a mask.  This is
+	 * because on both EBCDIC and ASCII machines, 'S' and 's' differ by a
+	 * single bit.  On ASCII they are 32 apart; on EBCDIC, they are 64.
+	 * This uses an exclusive 'or' to find that bit and then inverts it to
+	 * form a mask, with just a single 0, in the bit position where 'S' and
+	 * 's' differ. */
+	const U8 S_or_s_mask = ~ ('S' ^ 's');
+	const U8 s_masked = 's' & S_or_s_mask;
+
+	/* One pass is made over the node's string looking for all the
+	 * possibilities.  to avoid some tests in the loop, there are two main
+	 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
+	 * non-UTF-8 */
+	if (UTF) {
 
-	    /* Two problematic code points in Unicode casefolding of EXACT
-	     * nodes:
+	    /* There are two problematic Greek code points in Unicode
+	     * casefolding
 	     *
 	     * U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
 	     * U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
@@ -2724,86 +2741,122 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, IV *min_change, U32
 	     * minimum length computation.  (there are other code points that
 	     * also fold to these two sequences, but the delta is smaller)
 	     *
-	     * What we'll do is to look for the tail four bytes, and then peek
-	     * at the preceding two bytes to see whether we need to decrease
-	     * the minimum length by four (six minus two).
+	     * If these sequences are found, the minimum length is decreased by
+	     * four (six minus two).
 	     *
-	     * Thanks to the design of UTF-8, there cannot be false matches:
-	     * A sequence of valid UTF-8 bytes cannot be a subsequence of
-	     * another valid sequence of UTF-8 bytes. */
+	     * Similarly, 'ss' may match the single char and byte LATIN SMALL
+	     * LETTER SHARP S.  We decrease the min length by 1 for each
+	     * occurrence of 'ss' found */
 
 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
-	    const char U390_first_byte = '\xb4';
-	    const char U390_2nd_byte = '\x68';
-	    const char U3B0_first_byte = '\xb5';
-	    const char U3B0_2nd_byte = '\x46';
-	    const char tail[] = "\xaf\x49\xaf\x42";
+#	    define U390_first_byte 0xb4
+	    const U8 U390_tail[] = "\x68\xaf\x49\xaf\x42";
+#	    define U3B0_first_byte 0xb5
+	    const U8 U3B0_tail[] = "\x46\xaf\x49\xaf\x42";
 #else
-	    const char U390_first_byte = '\xce';
-	    const char U390_2nd_byte = '\xb9';
-	    const char U3B0_first_byte = '\xcf';
-	    const char U3B0_2nd_byte = '\x85';
-	    const char tail[] = "\xcc\x88\xcc\x81";
+#	    define U390_first_byte 0xce
+	    const U8 U390_tail[] = "\xb9\xcc\x88\xcc\x81";
+#	    define U3B0_first_byte 0xcf
+	    const U8 U3B0_tail[] = "\x85\xcc\x88\xcc\x81";
 #endif
-	    const STRLEN tail_len = sizeof(tail) - 1;
-	    for (s = s0 + 2;    /* +2 is to skip the non-tail */
-		 s <= s_end - tail_len
-		   && (t = ninstr(s, s_end, tail, tail + tail_len));
-		 s = t + tail_len)
+	    const U8 len = sizeof(U390_tail); /* (-1 for NUL; +1 for 1st byte;
+						 yields a net of 0 */
+	    /* Examine the string for one of the problematic sequences */
+	    for (s = s0;
+		 s < s_end - 1; /* Can stop 1 before the end, as minimum length
+				 * sequence we are looking for is 2 */
+		 s += UTF8SKIP(s))
 	    {
-		if ((t[-1] == U390_2nd_byte && t[-2] == U390_first_byte)
-		    || (t[-1] == U3B0_2nd_byte && t[-2] == U3B0_first_byte))
-		{
-		    *min_change -= 4;
 
-		    /* This can't currently be handled by tries, so change the
-		     * node type to indicate this. */
-		    if (OP(scan) == EXACTFU) {
-			OP(scan) = EXACTFU_NO_TRIE;
-		    }
+		/* Look for the first byte in each problematic sequence */
+		switch (*s) {
+		    /* We don't have to worry about other things that fold to
+		     * 's' (such as the long s, U+017F), as all above-latin1
+		     * code points have been pre-folded */
+		    case 's':
+		    case 'S':
+
+			if (((*(s+1) & S_or_s_mask) == s_masked)
+			    /* These two node types don't have special handling
+			     * for 'ss' */
+			    && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
+			{
+			    *min_change -= 1;
+			    OP(scan) = EXACTFU_SS;
+			    s++;    /* No need to look at this character again */
+			}
+			break;
+
+		    case U390_first_byte:
+			if (s_end - s >= len
+
+			    /* The 1's are because are skipping comparing the
+			     * first byte */
+			    && memEQ(s + 1, U390_tail, len - 1))
+			{
+			    goto greek_sequence;
+			}
+			break;
+
+		    case U3B0_first_byte:
+			if (! (s_end - s >= len
+			       && memEQ(s + 1, U3B0_tail, len - 1)))
+			{
+			    break;
+			}
+		      greek_sequence:
+			*min_change -= 4;
+
+			/* This can't currently be handled by trie's, so change
+			 * the node type to indicate this.  If EXACTFA and
+			 * EXACTFL were ever to be handled by trie's, this
+			 * would have to be changed.  If this node has already
+			 * been changed to EXACTFU_SS in this loop, leave it as
+			 * is.  (I (khw) think it doesn't matter in regexec.c
+			 * for UTF patterns, but no need to change it */
+			if (OP(scan) == EXACTFU) {
+			    OP(scan) = EXACTFU_NO_TRIE;
+			}
+			s += 6;	/* We already know what this sequence is.  Skip
+				   the rest of it */
+			break;
 		}
 	    }
 	}
+	else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
 
-	/* The third problematic sequence is 'ss', which can match just the
-	 * single byte LATIN SMALL LETTER SHARP S, and it can do it in both
-	 * non- and UTF-8.  Code elsewhere in this file makes sure, however,
-	 * that the sharp s gets folded to 'ss' under Unicode rules even if not
-	 * UTF-8. */
-	if (STR_LEN(scan) >= 2
-	    && (OP(scan) == EXACTFU
-		|| OP(scan) == EXACTFU_NO_TRIE	/* The code above could have
-						   set to this node type */
-	        || OP(scan) == EXACTF))
-	{
-	    /* The string will be folded to 'ss' if it's in UTF-8, but it could
-             * include capital 'S' instead of lower case when not UTF-8.  We
-             * could have different code to handle the two cases, but this is
-             * not necessary since both S and s are invariants under UTF-8; and
-             * not worth it, especially because we can use just one test for
-             * either 'S' or 's' each * time through the loop (plus a mask).
-             * Ths is because on both EBCDIC and ASCII machines, 'S' and 's'
-             * differ by a single bit.  On ASCII they are 32 apart; on EBCDIC,
-             * they are 64.  This uses an exclusive 'or' to find that bit and
-             * then inverts it to form a mask, with just a single 0, in the bit
-             * position where 'S' and 's' differ. */
-	    const char S_or_s_mask = ~ ('S' ^ 's');
-	    const char s_masked = 's' & S_or_s_mask;
-
-	    for (s = s0; s < s_end - 1; s++) {
-		if (((*s & S_or_s_mask) == s_masked)
-		    && ((*(s+1) & S_or_s_mask) == s_masked))
-		{
-		    s++;
-		    *min_change -= 1;
-
-		    /* EXACTFU_SS also isn't trie'able, so don't have to
-		     * preserve EXACTFU_NO_TRIE.  EXACTF is also not trie'able,
-		     * and because we essentially punt the optimizations in its
-		     * case, we don't need to indicate that it has an ss */
-		    if (OP(scan) == EXACTFU || OP(scan) == EXACTFU_NO_TRIE) {
-			OP(scan) = EXACTFU_SS;
-		    }
+	    /* Here, the pattern is not UTF-8.  We need to look only for the
+	     * 'ss' sequence, and in the EXACTF case, the sharp s, which can be
+	     * in the final position.  Otherwise we can stop looking 1 byte
+	     * earlier because have to find both the first and second 's' */
+	    const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
+
+	    for (s = s0; s < upper; s++) {
+		switch (*s) {
+		    case 'S':
+		    case 's':
+			if (s_end - s > 1
+			    && ((*(s+1) & S_or_s_mask) == s_masked))
+			{
+			    *min_change -= 1;
+
+			    /* EXACTF nodes need to know that the minimum
+			     * length changed so that a sharp s in the string
+			     * can match this ss in the pattern, but they
+			     * remain EXACTF nodes, as they are not trie'able,
+			     * so don't have to invent a new node type to
+			     * exclude them from the trie code */
+			    if (OP(scan) != EXACTF) {
+				OP(scan) = EXACTFU_SS;
+			    }
+			    s++;
+			}
+			break;
+		    case LATIN_SMALL_LETTER_SHARP_S:
+			if (OP(scan) == EXACTF) {
+			    *has_exactf_sharp_s = TRUE;
+			}
+			break;
 		}
 	    }
 	}
@@ -2923,10 +2976,11 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
   fake_study_recurse:
     while ( scan && OP(scan) != END && scan < last ){
         IV min_change = 0;
+	bool has_exactf_sharp_s = FALSE;
 	/* Peephole optimizer: */
 	DEBUG_STUDYDATA("Peep:", data,depth);
 	DEBUG_PEEP("Peep",scan,depth);
-        JOIN_EXACT(scan,&min_change,0);
+        JOIN_EXACT(scan,&min_change, &has_exactf_sharp_s, 0);
 
 	/* Follow the next-chain of the current node and optimize
 	   away all the NOTHINGs from it.  */
@@ -3440,10 +3494,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
 		l = utf8_length(s, s + l);
 		uc = utf8_to_uvchr(s, NULL);
 	    }
-	    else if (OP(scan) == EXACTF) {
-		if (memchr(STRING(scan), LATIN_SMALL_LETTER_SHARP_S, l)) {
-		    RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
-		}
+	    else if (has_exactf_sharp_s) {
+		RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
 	    }
 	    min += l + min_change;
             if (min < 0) {
@@ -11628,9 +11680,11 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,
     for (;;) {
         regnode * const temp = regnext(scan);
 #ifdef EXPERIMENTAL_INPLACESCAN
-        if (PL_regkind[OP(scan)] == EXACT)
-            if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
+        if (PL_regkind[OP(scan)] == EXACT) {
+	    bool has_exactf_sharp_s;	/* Unexamined in this routine */
+            if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
                 return EXACT;
+	}
 #endif
         if ( exact ) {
             switch (OP(scan)) {