regcomp.c: Replace a loop with strcspn()

[perl5.git] / regcomp.c
diff --git a/regcomp.c b/regcomp.c

index 4116dd3..3202323 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -4783,7 +4783,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                      if ( SvIV(re_trie_maxbuff)>=0  ) {
                          regnode *cur;
                          regnode *first = (regnode *)NULL;
-                        regnode *last = (regnode *)NULL;
+                        regnode *prev = (regnode *)NULL;
                          regnode *tail = scan;
                          U8 trietype = 0;
                          U32 count=0;
@@ -4914,7 +4914,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                                      REG_NODE_NUM(noper_next), SvPV_nolen_const(RExC_mysv));
                                  }
                                  Perl_re_printf( aTHX_  "(First==%d,Last==%d,Cur==%d,tt==%s,ntt==%s,nntt==%s)\n",
-                                   REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
+                                   REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur),
                                    PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
                                 );
                              });
@@ -4960,7 +4960,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                                  } else {
                                      if ( trietype == NOTHING )
                                          trietype = noper_trietype;
-                                    last = cur;
+                                    prev = cur;
                                  }
                                 if (first)
                                     count++;
@@ -4970,7 +4970,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                                   * noper may either be a triable node which can
                                   * not be tried together with the current trie,
                                   * or a non triable node */
-                                if ( last ) {
+                                if ( prev ) {
                                      /* If last is set and trietype is not
                                       * NOTHING then we have found at least two
                                       * triable branch sequences in a row of a
@@ -4983,7 +4983,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                                          make_trie( pRExC_state,
                                                  startbranch, first, cur, tail,
                                                  count, trietype, depth+1 );
-                                    last = NULL; /* note: we clear/update
+                                    prev = NULL; /* note: we clear/update
                                                      first, trietype etc below,
                                                      so we dont do it here */
                                  }
@@ -5012,12 +5012,12 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                              Perl_re_indentf( aTHX_  "- %s (%d) <SCAN FINISHED> ",
                                depth+1, SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
                              Perl_re_printf( aTHX_  "(First==%d, Last==%d, Cur==%d, tt==%s)\n",
-                               REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
+                               REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur),
                                 PL_reg_name[trietype]
                              );
  
                          });
-                        if ( last && trietype ) {
+                        if ( prev && trietype ) {
                              if ( trietype != NOTHING ) {
                                  /* the last branch of the sequence was part of
                                   * a trie, so we have to construct it here
@@ -5062,9 +5062,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                                          OP(opt)= OPTIMIZED;
                                  }
                              }
-                        } /* end if ( last) */
+                        } /* end if ( prev) */
                      } /* TRIE_MAXBUF is non zero */
-
                  } /* do trie */
  
             }
@@ -7420,28 +7419,6 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
  
      DEBUG_r(if (!PL_colorset) reginitcolors());
  
-    /* Initialize these here instead of as-needed, as is quick and avoids
-     * having to test them each time otherwise */
-    if (! PL_InBitmap) {
-#ifdef DEBUGGING
-        char * dump_len_string;
-#endif
-
-        /* This is calculated here, because the Perl program that generates the
-         * static global ones doesn't currently have access to
-         * NUM_ANYOF_CODE_POINTS */
-       PL_InBitmap = _new_invlist(2);
-       PL_InBitmap = _add_range_to_invlist(PL_InBitmap, 0,
-                                                    NUM_ANYOF_CODE_POINTS - 1);
-#ifdef DEBUGGING
-        dump_len_string = PerlEnv_getenv("PERL_DUMP_RE_MAX_LEN");
-        if (   ! dump_len_string
-            || ! grok_atoUV(dump_len_string, (UV *)&PL_dump_re_max_len, NULL))
-        {
-            PL_dump_re_max_len = 60;    /* A reasonable default */
-        }
-#endif
-    }
  
      pRExC_state->warn_text = NULL;
      pRExC_state->unlexed_names = NULL;
@@ -8699,8 +8676,8 @@ Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
         i = rx->sublen + rx->suboffset - rx->offs[0].end;
      }
      else
-    if ( 0 <= n && n <= (I32)rx->nparens &&
-        (s1 = rx->offs[n].start) != -1 &&
+    if (inRANGE(n, 0, (I32)rx->nparens) &&
+        (s1 = rx->offs[n].start) != -1  &&
          (t1 = rx->offs[n].end) != -1)
      {
          /* $&, ${^MATCH},  $1 ... */
@@ -12874,9 +12851,9 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
          value = (U8 *) SvPV(value_sv, value_len);
  
          /* See if the result is one code point vs 0 or multiple */
-        if (value_len > 0 && value_len <= (UV) ((SvUTF8(value_sv))
-                                               ? UTF8SKIP(value)
-                                               : 1))
+        if (inRANGE(value_len, 1, ((UV) SvUTF8(value_sv)
+                                  ? UTF8SKIP(value)
+                                  : 1)))
          {
              /* Here, exactly one code point.  If that isn't what is wanted,
               * fail */
@@ -14570,13 +14547,11 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
  
                  goto continue_parse;
              }
-            else if (! LOC) {  /* XXX shouldn't /l assume could be a UTF-8
-                                locale, and prepare for that? */
+            else if (FOLD) {
                  bool splittable = FALSE;
                  bool backed_up = FALSE;
-                char * e = s;
-
-                assert(FOLD);
+                char * e;
+                char * s_start;
  
                  /* Here is /i.  Running out of room creates a problem if we are
                   * folding, and the split happens in the middle of a
@@ -14613,6 +14588,132 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                   *  oldp      points to the beginning byte in the input of
                   *            'ender'.
                   *
+                 * In the case of /il, we haven't folded anything that could be
+                 * affected by the locale.  That means only above-Latin1
+                 * characters that fold to other above-Latin1 characters get
+                 * folded at compile time.  To check where a good place to
+                 * split the node is, everything in it will have to be folded.
+                 * The boolean 'maybe_exactfu' keeps track in /il if there are
+                 * any unfolded characters in the node. */
+                bool need_to_fold_loc = LOC && ! maybe_exactfu;
+
+                /* If we do need to fold the node, we need a place to store the
+                 * folded copy, and a way to map back to the unfolded original
+                 * */
+                char * locfold_buf = NULL;
+                Size_t * loc_correspondence = NULL;
+
+                if (! need_to_fold_loc) {   /* The normal case.  Just
+                                               initialize to the actual node */
+                    e = s;
+                    s_start = s0;
+                    s = old_old_s;  /* Point to the beginning of the final char
+                                       that fits in the node */
+                }
+                else {
+
+                    /* Here, we have filled a /il node, and there are unfolded
+                     * characters in it.  If the runtime locale turns out to be
+                     * UTF-8, there are possible multi-character folds, just
+                     * like when not under /l.  The node hence can't terminate
+                     * in the middle of such a fold.  To determine this, we
+                     * have to create a folded copy of this node.  That means
+                     * reparsing the node, folding everything assuming a UTF-8
+                     * locale.  (If at runtime it isn't such a locale, the
+                     * actions here wouldn't have been necessary, but we have
+                     * to assume the worst case.)  If we find we need to back
+                     * off the folded string, we do so, and then map that
+                     * position back to the original unfolded node, which then
+                     * gets output, truncated at that spot */
+
+                    char * redo_p = RExC_parse;
+                    char * redo_e;
+                    char * old_redo_e;
+
+                    /* Allow enough space assuming a single byte input folds to
+                     * a single byte output, plus assume that the two unparsed
+                     * characters (that we may need) fold to the largest number
+                     * of bytes possible, plus extra for one more worst case
+                     * scenario.  In the loop below, if we start eating into
+                     * that final spare space, we enlarge this initial space */
+                    Size_t size = max_string_len + (3 * UTF8_MAXBYTES_CASE) + 1;
+
+                    Newxz(locfold_buf, size, char);
+                    Newxz(loc_correspondence, size, Size_t);
+
+                    /* Redo this node's parse, folding into 'locfold_buf' */
+                    redo_p = RExC_parse;
+                    old_redo_e = redo_e = locfold_buf;
+                    while (redo_p <= oldp) {
+
+                        old_redo_e = redo_e;
+                        loc_correspondence[redo_e - locfold_buf]
+                                                        = redo_p - RExC_parse;
+
+                        if (UTF) {
+                            Size_t added_len;
+
+                            (void) _to_utf8_fold_flags((U8 *) redo_p,
+                                                       (U8 *) RExC_end,
+                                                       (U8 *) redo_e,
+                                                       &added_len,
+                                                       FOLD_FLAGS_FULL);
+                            redo_e += added_len;
+                            redo_p += UTF8SKIP(redo_p);
+                        }
+                        else {
+
+                            /* Note that if this code is run on some ancient
+                             * Unicode versions, SHARP S doesn't fold to 'ss',
+                             * but rather than clutter the code with #ifdef's,
+                             * as is done above, we ignore that possibility.
+                             * This is ok because this code doesn't affect what
+                             * gets matched, but merely where the node gets
+                             * split */
+                            if (UCHARAT(redo_p) != LATIN_SMALL_LETTER_SHARP_S) {
+                                *redo_e++ = toLOWER_L1(UCHARAT(redo_p));
+                            }
+                            else {
+                                *redo_e++ = 's';
+                                *redo_e++ = 's';
+                            }
+                            redo_p++;
+                        }
+
+
+                        /* If we're getting so close to the end that a
+                         * worst-case fold in the next character would cause us
+                         * to overflow, enlarge both buffers, assuming one
+                         * output byte per remaining input byte, plus room for
+                         * another worst-case fold */
+                        if (   redo_p <= oldp
+                            && redo_e > locfold_buf + size
+                                                    - (UTF8_MAXBYTES_CASE + 1))
+                        {
+                            Size_t new_size = size
+                                            + (oldp - redo_p)
+                                            + UTF8_MAXBYTES_CASE + 1;
+                            Ptrdiff_t e_offset = redo_e - locfold_buf;
+
+                            Renew(locfold_buf, new_size, char);
+                            Renew(loc_correspondence, new_size, Size_t);
+                            size = new_size;
+
+                            redo_e = locfold_buf + e_offset;
+                        }
+                    }
+
+                    /* Set so that things are in terms of the folded, temporary
+                     * string */
+                    s = old_redo_e;
+                    s_start = locfold_buf;
+                    e = redo_e;
+
+                }
+
+                /* Here, we have 's', 's_start' and 'e' set up to point to the
+                 * input that goes into the node, folded.
+                 *
                   * If the final character of the node and the fold of ender
                   * form the first two characters of a three character fold, we
                   * need to peek ahead at the next (unparsed) character in the
@@ -14652,11 +14753,8 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                   * and try again.
                   *
                   * Otherwise, the node can be split at the current position.
-                 */
-                s = old_old_s;  /* Point to the beginning of the final char
-                                   that fits in the node */
-
-                /* The same logic is used for UTF-8 patterns and not */
+                 *
+                 * The same logic is used for UTF-8 patterns and not */
                  if (UTF) {
                      Size_t added_len;
  
@@ -14695,7 +14793,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                           * drop down to try at that position */
                          if (isPUNCT(*p)) {
                              s = (char *) utf8_hop_back((U8 *) s, -1,
-                                       (U8 *) s0);
+                                       (U8 *) s_start);
                              backed_up = TRUE;
                          }
                          else {
@@ -14727,7 +14825,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                       * either case would break apart a fold */
                      do {
                          char *prev_s = (char *) utf8_hop_back((U8 *) s, -1,
-                                                                    (U8 *) s0);
+                                                            (U8 *) s_start);
  
                          /* If is a multi-char fold, can't split here.  Backup
                           * one char and try again */
@@ -14741,11 +14839,11 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                           * three character fold starting at the character
                           * before s, we can't split either before or after s.
                           * Backup two chars and try again */
-                        if (   LIKELY(s > s0)
+                        if (   LIKELY(s > s_start)
                              && UNLIKELY(is_THREE_CHAR_FOLD_utf8_safe(prev_s, e)))
                          {
                              s = prev_s;
-                            s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s0);
+                            s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s_start);
                              backed_up = TRUE;
                              continue;
                          }
@@ -14755,7 +14853,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                          splittable = TRUE;
                          break;
  
-                    } while (s > s0); /* End of loops backing up through the node */
+                    } while (s > s_start); /* End of loops backing up through the node */
  
                      /* Here we either couldn't find a place to split the node,
                       * or else we broke out of the loop setting 'splittable' to
@@ -14804,7 +14902,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                              continue;
                          }
  
-                        if (   LIKELY(s > s0)
+                        if (   LIKELY(s > s_start)
                              && UNLIKELY(is_THREE_CHAR_FOLD_latin1_safe(s - 1, e)))
                          {
                              s -= 2;
@@ -14815,7 +14913,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                          splittable = TRUE;
                          break;
  
-                    } while (s > s0);
+                    } while (s > s_start);
  
                      if (splittable) {
                          s++;
@@ -14829,9 +14927,28 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                     /* If we did find a place to split, reparse the entire node
                      * stopping where we have calculated. */
                      if (splittable) {
-                        upper_fill = s - s0;
+
+                       /* If we created a temporary folded string under /l, we
+                        * have to map that back to the original */
+                        if (need_to_fold_loc) {
+                            upper_fill = loc_correspondence[s - s_start];
+                            Safefree(locfold_buf);
+                            Safefree(loc_correspondence);
+
+                            if (upper_fill == 0) {
+                                FAIL2("panic: loc_correspondence[%d] is 0",
+                                      (int) (s - s_start));
+                            }
+                        }
+                        else {
+                            upper_fill = s - s0;
+                        }
                          goto reparse;
                      }
+                    else if (need_to_fold_loc) {
+                        Safefree(locfold_buf);
+                        Safefree(loc_correspondence);
+                    }
  
                      /* Here the node consists entirely of non-final multi-char
                       * folds.  (Likely it is all 'f's or all 's's.)  There's no
@@ -16798,7 +16915,7 @@ S_output_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_warnings)
      UPDATE_WARNINGS_LOC(RExC_parse);
  }
  
-Size_t PERL_STATIC_INLINE
+PERL_STATIC_INLINE Size_t
  S_find_first_differing_byte_pos(const U8 * s1, const U8 * s2, const Size_t max)
  {
      const U8 * const start = s1;
@@ -18722,7 +18839,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
  
                  /* Only try if there are no more code points in the class than
                   * in the max possible fold */
-            &&   partial_cp_count > 0 && partial_cp_count <= MAX_FOLD_FROMS + 1)
+            &&   inRANGE(partial_cp_count, 1, MAX_FOLD_FROMS + 1))
          {
              if (partial_cp_count == 1 && ! upper_latin1_only_utf8_matches)
              {
@@ -19376,15 +19493,12 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                                             regarglen[op] + STR_SZ(len),
                                             "anyofhs");
                          FILL_NODE(ret, op);
-                        RExC_emit += 1 + regarglen[op]
-                                   - 1 + STR_SZ(len); /* Replace the [1]
-                                                         element of the struct
-                                                         by the real value */
-                        REGNODE_p(ret)->flags = len;
+                        ((struct regnode_anyofhs *) REGNODE_p(ret))->str_len
+                                                                        = len;
                          Copy(low_utf8,  /* Add the common bytes */
                             ((struct regnode_anyofhs *) REGNODE_p(ret))->string,
                             len, U8);
-                        NEXT_OFF(REGNODE_p(ret)) = regarglen[op] + STR_SZ(len);
+                        RExC_emit += NODE_SZ_STR(REGNODE_p(ret));
                          set_ANYOF_arg(pRExC_state, REGNODE_p(ret), cp_list,
                                                    NULL, only_utf8_locale_list);
                          goto not_anyof;
@@ -19649,10 +19763,10 @@ Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
                      STATIC_ASSERT_STMT(ONLY_LOCALE_MATCHES_INDEX == 1 + INVLIST_INDEX);
                      STATIC_ASSERT_STMT(DEFERRED_USER_DEFINED_INDEX == 1 + ONLY_LOCALE_MATCHES_INDEX);
  
-                    av_store(av, INVLIST_INDEX, invlist);
+                    ary[INVLIST_INDEX] = invlist;
                      av_fill(av, (ary[ONLY_LOCALE_MATCHES_INDEX])
-                                 ? ONLY_LOCALE_MATCHES_INDEX:
-                                 INVLIST_INDEX);
+                                 ? ONLY_LOCALE_MATCHES_INDEX
+                                 : INVLIST_INDEX);
                      si = NULL;
                  }
             }
@@ -19685,15 +19799,11 @@ Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
                  UV prev_cp = 0;
                  U8 count = 0;
  
-                /* Ignore everything before the first new-line */
-                while (*si_string != '\n' && remaining > 0) {
-                    si_string++;
-                    remaining--;
-                }
-                assert(remaining > 0);
-
+                /* Ignore everything before and including the first new-line */
+                si_string = (const char *) memchr(si_string, '\n', SvCUR(si));
+                assert (si_string != NULL);
                  si_string++;
-                remaining--;
+                remaining = SvPVX(si) + SvCUR(si) - si_string;
  
                  while (remaining > 0) {
  
@@ -19739,22 +19849,21 @@ Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
                       * here to the next \n */
  
                      remaining -= len;
-                    while (*(si_string + len) != '\n' && remaining > 0) {
-                        remaining--;
-                        len++;
-                    }
-                    if (*(si_string + len) == '\n') {
-                        len++;
-                        remaining--;
-                    }
+                    len = strcspn(si_string, "\n");
+                    remaining -= len;
                      if (matches_string) {
-                        sv_catpvn(matches_string, si_string, len - 1);
+                        sv_catpvn(matches_string, si_string, len);
                      }
                      else {
-                        matches_string = newSVpvn(si_string, len - 1);
+                        matches_string = newSVpvn(si_string, len);
                      }
-                    si_string += len;
                      sv_catpvs(matches_string, " ");
+
+                    si_string += len;
+                    if (remaining && UCHARAT(si_string) == '\n') {
+                        si_string++;
+                        remaining--;
+                    }
                  } /* end of loop through the text */
  
                  assert(matches_string);
@@ -20189,6 +20298,7 @@ S_regtail(pTHX_ RExC_state_t * pRExC_state,
          scan = REGNODE_OFFSET(temp);
      }
  
+    assert(val >= scan);
      if (reg_off_by_arg[OP(REGNODE_p(scan))]) {
          assert((UV) (val - scan) <= U32_MAX);
          ARG_SET(REGNODE_p(scan), val - scan);
@@ -22431,7 +22541,7 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
         else if ( op == PLUS || op == STAR) {
             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
         }
-       else if (PL_regkind[(U8)op] == EXACT) {
+       else if (PL_regkind[(U8)op] == EXACT || op == ANYOFHs) {
              /* Literal string, where present. */
             node += NODE_SZ_STR(node) - 1;
             node = NEXTOPER(node);
@@ -22461,6 +22571,17 @@ Perl_init_uniprops(pTHX)
  {
      dVAR;
  
+#ifdef DEBUGGING
+    char * dump_len_string;
+
+    dump_len_string = PerlEnv_getenv("PERL_DUMP_RE_MAX_LEN");
+    if (   ! dump_len_string
+        || ! grok_atoUV(dump_len_string, (UV *)&PL_dump_re_max_len, NULL))
+    {
+        PL_dump_re_max_len = 60;    /* A reasonable default */
+    }
+#endif
+
      PL_user_def_props = newHV();
  
  #ifdef USE_ITHREADS
@@ -22470,7 +22591,7 @@ Perl_init_uniprops(pTHX)
  
  #endif
  
-    /* Set up the inversion list global variables */
+    /* Set up the inversion list interpreter-level variables */
  
      PL_XPosix_ptrs[_CC_ASCII] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
      PL_XPosix_ptrs[_CC_ALPHANUMERIC] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALNUM]);
@@ -22512,6 +22633,7 @@ Perl_init_uniprops(pTHX)
      PL_LB_invlist = _new_invlist_C_array(_Perl_LB_invlist);
      PL_SCX_invlist = _new_invlist_C_array(_Perl_SCX_invlist);
  
+    PL_InBitmap = _new_invlist_C_array(_Perl_InBitmap_invlist);
      PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
      PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
      PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);