Revert "regcomp.c: Use a weird value in a place where ignored"

[perl5.git] / regcomp.c
diff --git a/regcomp.c b/regcomp.c

index 7ffba08..9dac06c 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -307,7 +307,8 @@ struct RExC_state_t {
   * Flags to be passed up and down.
   */
  #define        WORST           0       /* Worst case. */
-#define        HASWIDTH        0x01    /* Known to match non-null strings. */
+#define        HASWIDTH        0x01    /* Known to not match null strings, could match
+                                   non-null ones. */
  
  /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
   * character.  (There needs to be a case: in the switch statement in regexec.c
@@ -2648,7 +2649,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
  #endif
  
      switch (flags) {
-        case EXACT: case EXACTL: break;
+        case EXACT: case EXACT_ONLY8: case EXACTL: break;
         case EXACTFAA:
          case EXACTFU_SS:
         case EXACTFU:
@@ -2663,7 +2664,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
      trie->wordcount = word_count;
      RExC_rxi->data->data[ data_slot ] = (void*)trie;
      trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
-    if (flags == EXACT || flags == EXACTL)
+    if (flags == EXACT || flags == EXACT_ONLY8 || flags == EXACTL)
         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
      trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
                         trie->wordcount+1, sizeof(reg_trie_wordinfo));
@@ -2737,15 +2738,12 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
                  noper= noper_next;
          }
  
-        if ( noper < tail &&
-                (
-                    OP(noper) == flags ||
-                    (
-                        flags == EXACTFU &&
-                        OP(noper) == EXACTFU_SS
-                    )
-                )
-        ) {
+        if (    noper < tail
+            && (    OP(noper) == flags
+                || (flags == EXACT && OP(noper) == EXACT_ONLY8)
+                || (flags == EXACTFU && (   OP(noper) == EXACTFU_ONLY8
+                                         || OP(noper) == EXACTFU_SS))) )
+        {
              uc= (U8*)STRING(noper);
              e= uc + STR_LEN(noper);
          } else {
@@ -2958,7 +2956,12 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
                      noper= noper_next;
              }
  
-            if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) {
+            if (    noper < tail
+                && (    OP(noper) == flags
+                    || (flags == EXACT && OP(noper) == EXACT_ONLY8)
+                    || (flags == EXACTFU && (   OP(noper) == EXACTFU_ONLY8
+                                             || OP(noper) == EXACTFU_SS))) )
+            {
                  const U8 *uc= (U8*)STRING(noper);
                  const U8 *e= uc + STR_LEN(noper);
  
@@ -3178,7 +3181,12 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
                      noper= noper_next;
              }
  
-            if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) {
+            if (    noper < tail
+                && (    OP(noper) == flags
+                    || (flags == EXACT && OP(noper) == EXACT_ONLY8)
+                    || (flags == EXACTFU && (   OP(noper) == EXACTFU_ONLY8
+                                             || OP(noper) == EXACTFU_SS))) )
+            {
                  const U8 *uc= (U8*)STRING(noper);
                  const U8 *e= uc + STR_LEN(noper);
  
@@ -4011,7 +4019,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
       * this final joining, sequences could have been split over boundaries, and
       * hence missed).  The sequences only happen in folding, hence for any
       * non-EXACT EXACTish node */
-    if (OP(scan) != EXACT && OP(scan) != EXACTL) {
+    if (OP(scan) != EXACT && OP(scan) != EXACT_ONLY8 && OP(scan) != EXACTL) {
          U8* s0 = (U8*) STRING(scan);
          U8* s = s0;
          U8* s_end = s0 + STR_LEN(scan);
@@ -4664,9 +4672,11 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                                  ----------------+-----------
                                  NOTHING         | NOTHING
                                  EXACT           | EXACT
+                                EXACT_ONLY8     | EXACT
                                  EXACTFU         | EXACTFU
+                                EXACTFU_ONLY8   | EXACTFU
                                  EXACTFU_SS      | EXACTFU
-                                EXACTFAA         | EXACTFAA
+                                EXACTFAA        | EXACTFAA
                                  EXACTL          | EXACTL
                                  EXACTFLU8       | EXACTFLU8
  
@@ -4674,16 +4684,18 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                          */
  #define TRIE_TYPE(X) ( ( NOTHING == (X) )                                   \
                         ? NOTHING                                            \
-                       : ( EXACT == (X) )                                   \
+                       : ( EXACT == (X) || EXACT_ONLY8 == (X) )             \
                           ? EXACT                                            \
-                         : ( EXACTFU == (X) || EXACTFU_SS == (X) )          \
+                         : (     EXACTFU == (X)                             \
+                              || EXACTFU_ONLY8 == (X)                       \
+                              || EXACTFU_SS == (X) )                        \
                             ? EXACTFU                                        \
-                           : ( EXACTFAA == (X) )                             \
-                             ? EXACTFAA                                      \
+                           : ( EXACTFAA == (X) )                            \
+                             ? EXACTFAA                                     \
                               : ( EXACTL == (X) )                            \
                                 ? EXACTL                                     \
-                               : ( EXACTFLU8 == (X) )                        \
-                                 ? EXACTFLU8                                 \
+                               : ( EXACTFLU8 == (X) )                       \
+                                 ? EXACTFLU8                                \
                                   : 0 )
  
                          /* dont use tail as the end marker for this traverse */
@@ -4998,7 +5010,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                 continue;
             }
         }
-       else if (OP(scan) == EXACT || OP(scan) == EXACTL) {
+       else if (   OP(scan) == EXACT
+                 || OP(scan) == EXACT_ONLY8
+                 || OP(scan) == EXACTL)
+        {
             SSize_t l = STR_LEN(scan);
             UV uc;
              assert(l);
@@ -5117,7 +5132,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
             case PLUS:
                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
                     next = NEXTOPER(scan);
-                   if (OP(next) == EXACT
+                   if (   OP(next) == EXACT
+                        || OP(next) == EXACT_ONLY8
                          || OP(next) == EXACTL
                          || (flags & SCF_DO_STCLASS))
                      {
@@ -6755,13 +6771,27 @@ S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
                 && n < pRExC_state->code_blocks->count
                 && s == pRExC_state->code_blocks->cb[n].start)
             {
-               /* blank out literal code block */
-               assert(pat[s] == '(');
-               while (s <= pRExC_state->code_blocks->cb[n].end) {
-                   *p++ = '_';
+               /* blank out literal code blocks so that they aren't
+                 * recompiled: eg change from/to:
+                 *     /(?{xyz})/
+                 *     /(?=====)/
+                 * and
+                 *     /(??{xyz})/
+                 *     /(?======)/
+                 * and
+                 *     /(?(?{xyz}))/
+                 *     /(?(?=====))/
+                */
+               assert(pat[s]   == '(');
+               assert(pat[s+1] == '?');
+                *p++ = '(';
+                *p++ = '?';
+                s += 2;
+               while (s < pRExC_state->code_blocks->cb[n].end) {
+                   *p++ = '=';
                     s++;
                 }
-               s--;
+                *p++ = ')';
                 n++;
                 continue;
             }
@@ -7534,9 +7564,6 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
          goto redo_parse;
      }
  
-    /* In a stable state, as here, this must be true */
-    assert(RExC_size = RExC_emit + 1);
-
      /* Here, we have successfully parsed and generated the pattern's program
       * for the regex engine.  We are ready to finish things up and look for
       * optimizations. */
@@ -7701,8 +7728,12 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
          DEBUG_PEEP("first:", first, 0, 0);
          /* Ignore EXACT as we deal with it later. */
         if (PL_regkind[OP(first)] == EXACT) {
-           if (OP(first) == EXACT || OP(first) == EXACTL)
+           if (   OP(first) == EXACT
+                || OP(first) == EXACT_ONLY8
+                || OP(first) == EXACTL)
+            {
                 NOOP;   /* Empty, get anchored substr later. */
+            }
             else
                 RExC_rxi->regstclass = first;
         }
@@ -8044,7 +8075,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
                   && nop == END)
              RExC_rx->extflags |= RXf_WHITE;
          else if ( RExC_rx->extflags & RXf_SPLIT
-                  && (fop == EXACT || fop == EXACTL)
+                  && (fop == EXACT || fop == EXACT_ONLY8 || fop == EXACTL)
                    && STR_LEN(first) == 1
                    && *(STRING(first)) == ' '
                    && nop == END )
@@ -13728,7 +13759,13 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
               * contain only above-Latin1 characters (hence must be in UTF8),
               * which don't participate in folds with Latin1-range characters,
               * as the latter's folds aren't known until runtime. */
-            bool maybe_exactfu = TRUE;
+            bool maybe_exactfu = FOLD;
+
+            /* Does this node contain something that can't match unless the
+             * target string is (also) in UTF-8? */
+            bool requires_utf8_target = FALSE;
+
+            bool has_micro_sign = FALSE;
  
              /* Allocate an EXACT node.  The node_type may change below to
               * another EXACTish node, but since the size of the node doesn't
@@ -13852,9 +13889,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                          }
                          p = RExC_parse;
                          RExC_parse = parse_start;
-                        if (ender > 0xff) {
-                            REQUIRE_UTF8(flagp);
-                        }
  
                          /* The \N{} means the pattern, if previously /d,
                           * becomes /u.  That means it can't be an EXACTF node,
@@ -13868,7 +13902,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                              if (! maybe_exactfu) {
                                  len = 0;
                                  s = s0;
-                                maybe_exactfu = TRUE;   /* Prob. unnecessary */
+                                maybe_exactfu = FOLD;   /* Prob. unnecessary */
                                  goto reparse;
                              }
                          }
@@ -13916,9 +13950,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                             }
                              UPDATE_WARNINGS_LOC(p - 1);
                              ender = result;
-                           if (ender > 0xff) {
-                               REQUIRE_UTF8(flagp);
-                           }
                             break;
                         }
                     case 'x':
@@ -13952,9 +13983,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                                  }
  #endif
                             }
-                            else {
-                               REQUIRE_UTF8(flagp);
-                           }
                             break;
                         }
                     case 'c':
@@ -13999,9 +14027,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
                             STRLEN numlen = 3;
                             ender = grok_oct(p, &numlen, &flags, NULL);
-                           if (ender > 0xff) {
-                               REQUIRE_UTF8(flagp);
-                           }
                             p += numlen;
                              if (   isDIGIT(*p)  /* like \08, \178 */
                                  && ckWARN(WARN_REGEXP)
@@ -14082,7 +14107,13 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
  
                 /* Here, have looked at the literal character, and <ender>
                   * contains its ordinal; <p> points to the character after it.
-                 * We need to check if the next non-ignored thing is a
+                 * */
+
+                if (ender > 255) {
+                    REQUIRE_UTF8(flagp);
+                }
+
+                /* We need to check if the next non-ignored thing is a
                   * quantifier.  Move <p> to after anything that should be
                   * ignored, which, as a side effect, positions <p> for the next
                   * loop iteration */
@@ -14117,6 +14148,10 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                              U8 * new_s = uvchr_to_utf8((U8*)s, ender);
                              added_len = (char *) new_s - s;
                              s = (char *) new_s;
+
+                            if (ender > 255)  {
+                                requires_utf8_target = TRUE;
+                            }
                          }
                  }
                  else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) {
@@ -14163,14 +14198,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
  
                      /* Here, continuing a node with non-folded characters.  Add
                       * this one */
-
-                    if (UVCHR_IS_INVARIANT(ender) || ! UTF) {
-                        *(s++) = (char) ender;
-                    }
-                    else {
-                        s = (char *) uvchr_to_utf8((U8 *) s, ender);
-                        added_len = UVCHR_SKIP(ender);
-                    }
+                    goto not_fold_common;
                  }
                  else {  /* Here, does participate in some fold */
  
@@ -14199,6 +14227,13 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                                                      ? FOLD_FLAGS_NOMIX_ASCII
                                                      : 0));
                              s += added_len;
+
+                            if (ender > 255)  {
+                                requires_utf8_target = TRUE;
+                                if (UNLIKELY(ender == GREEK_SMALL_LETTER_MU)) {
+                                    has_micro_sign = TRUE;
+                                }
+                            }
                          }
                      }
                      else {
@@ -14240,6 +14275,10 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                          }
  #endif
  
+                        else if (UNLIKELY(ender == MICRO_SIGN)) {
+                            has_micro_sign = TRUE;
+                        }
+
                          /* Even when folding, we store just the input
                           * character, as we have an array that finds its fold
                           * quickly */
@@ -14432,11 +14471,14 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                  OP(REGNODE_p(ret)) = node_type;
  
                  /* If the node type is EXACT here, check to see if it
-                 * should be EXACTL. */
+                 * should be EXACTL, or EXACT_ONLY8. */
                  if (node_type == EXACT) {
                      if (LOC) {
                          OP(REGNODE_p(ret)) = EXACTL;
                      }
+                    else if (requires_utf8_target) {
+                        OP(REGNODE_p(ret)) = EXACT_ONLY8;
+                    }
                  }
  
                  if (FOLD) {
@@ -14454,6 +14496,16 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                      else if (node_type == EXACTF) {
                          RExC_seen_d_op = TRUE;
                      }
+
+                    /* The micro sign is the only character below 256 that
+                     * folds to one above 255 */
+                    if (   OP(REGNODE_p(ret)) == EXACTFU
+                        && requires_utf8_target
+                        && LIKELY(! has_micro_sign))
+                    {
+                        OP(REGNODE_p(ret)) = EXACTFU_ONLY8;
+                    }
+
                  }
  
                  alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender,
@@ -15637,7 +15689,6 @@ redo_curchar:
                      if (UCHARAT(RExC_parse) != ')')
                          vFAIL("Expecting close paren for wrapper for nested extended charclass");
  
-                    RExC_parse++;
                      RExC_flags = save_flags;
                      goto handle_operand;
                  }
@@ -16478,8 +16529,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
  
      bool warn_super = ALWAYS_WARN_SUPER;
  
-    const regnode_offset orig_emit = RExC_emit; /* Save the original RExC_emit in
-        case we need to change the emitted regop to an EXACT. */
      const char * orig_parse = RExC_parse;
      bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
  
@@ -18103,7 +18152,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
  
      if (optimizable) {
          int posix_class = -1;   /* Illegal value */
-        const char * cur_parse= RExC_parse;
          U8 ANYOFM_mask = 0xFF;
          U32 anode_arg = 0;
          UV start, end;
@@ -18369,16 +18417,13 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
          }
  
          if (op != END) {
-            RExC_parse = (char *)orig_parse;
-            RExC_emit = orig_emit;
-
              if (regarglen[op]) {
                  ret = reganode(pRExC_state, op, anode_arg);
              } else {
                  ret = reg_node(pRExC_state, op);
              }
-
-            RExC_parse = (char *)cur_parse;
+            Set_Node_Offset_Length(REGNODE_p(ret), orig_parse - RExC_start,
+                                                   RExC_parse - orig_parse);;
  
              if (PL_regkind[op] == EXACT) {
                  alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
@@ -19224,11 +19269,13 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p,
          if ( exact ) {
              switch (OP(REGNODE_p(scan))) {
                  case EXACT:
+                case EXACT_ONLY8:
                  case EXACTL:
                  case EXACTF:
                  case EXACTFAA_NO_TRIE:
                  case EXACTFAA:
                  case EXACTFU:
+                case EXACTFU_ONLY8:
                  case EXACTFLU8:
                  case EXACTFU_SS:
                  case EXACTFL: