op.c: Add, clarify comments

[perl5.git] / op.c
diff --git a/op.c b/op.c

index 95670a3..e810d46 100644 (file)
--- a/op.c
+++ b/op.c
@@ -5499,6 +5499,124 @@ Perl_invert(pTHX_ OP *o)
      return newUNOP(OP_NOT, OPf_SPECIAL, scalar(o));
  }
  
+OP *
+Perl_cmpchain_start(pTHX_ I32 type, OP *left, OP *right)
+{
+    dVAR;
+    BINOP *bop;
+    OP *op;
+
+    if (!left)
+       left = newOP(OP_NULL, 0);
+    if (!right)
+       right = newOP(OP_NULL, 0);
+    scalar(left);
+    scalar(right);
+    NewOp(0, bop, 1, BINOP);
+    op = (OP*)bop;
+    ASSUME((PL_opargs[type] & OA_CLASS_MASK) == OA_BINOP);
+    OpTYPE_set(op, type);
+    cBINOPx(op)->op_flags = OPf_KIDS;
+    cBINOPx(op)->op_private = 2;
+    cBINOPx(op)->op_first = left;
+    cBINOPx(op)->op_last = right;
+    OpMORESIB_set(left, right);
+    OpLASTSIB_set(right, op);
+    return op;
+}
+
+OP *
+Perl_cmpchain_extend(pTHX_ I32 type, OP *ch, OP *right)
+{
+    dVAR;
+    BINOP *bop;
+    OP *op;
+
+    PERL_ARGS_ASSERT_CMPCHAIN_EXTEND;
+    if (!right)
+       right = newOP(OP_NULL, 0);
+    scalar(right);
+    NewOp(0, bop, 1, BINOP);
+    op = (OP*)bop;
+    ASSUME((PL_opargs[type] & OA_CLASS_MASK) == OA_BINOP);
+    OpTYPE_set(op, type);
+    if (ch->op_type != OP_NULL) {
+       UNOP *lch;
+       OP *nch, *cleft, *cright;
+       NewOp(0, lch, 1, UNOP);
+       nch = (OP*)lch;
+       OpTYPE_set(nch, OP_NULL);
+       nch->op_flags = OPf_KIDS;
+       cleft = cBINOPx(ch)->op_first;
+       cright = cBINOPx(ch)->op_last;
+       cBINOPx(ch)->op_first = NULL;
+       cBINOPx(ch)->op_last = NULL;
+       cBINOPx(ch)->op_private = 0;
+       cBINOPx(ch)->op_flags = 0;
+       cUNOPx(nch)->op_first = cright;
+       OpMORESIB_set(cright, ch);
+       OpMORESIB_set(ch, cleft);
+       OpLASTSIB_set(cleft, nch);
+       ch = nch;
+    }
+    OpMORESIB_set(right, op);
+    OpMORESIB_set(op, cUNOPx(ch)->op_first);
+    cUNOPx(ch)->op_first = right;
+    return ch;
+}
+
+OP *
+Perl_cmpchain_finish(pTHX_ OP *ch)
+{
+    dVAR;
+
+    PERL_ARGS_ASSERT_CMPCHAIN_FINISH;
+    if (ch->op_type != OP_NULL) {
+       OPCODE cmpoptype = ch->op_type;
+       ch = CHECKOP(cmpoptype, ch);
+       if(!ch->op_next && ch->op_type == cmpoptype)
+           ch = fold_constants(op_integerize(op_std_init(ch)));
+       return ch;
+    } else {
+       OP *condop = NULL;
+       OP *rightarg = cUNOPx(ch)->op_first;
+       cUNOPx(ch)->op_first = OpSIBLING(rightarg);
+       OpLASTSIB_set(rightarg, NULL);
+       while (1) {
+           OP *cmpop = cUNOPx(ch)->op_first;
+           OP *leftarg = OpSIBLING(cmpop);
+           OPCODE cmpoptype = cmpop->op_type;
+           OP *nextrightarg;
+           bool is_last;
+           is_last = !(cUNOPx(ch)->op_first = OpSIBLING(leftarg));
+           OpLASTSIB_set(cmpop, NULL);
+           OpLASTSIB_set(leftarg, NULL);
+           if (is_last) {
+               ch->op_flags = 0;
+               op_free(ch);
+               nextrightarg = NULL;
+           } else {
+               nextrightarg = newUNOP(OP_CMPCHAIN_DUP, 0, leftarg);
+               leftarg = newOP(OP_NULL, 0);
+           }
+           cBINOPx(cmpop)->op_first = leftarg;
+           cBINOPx(cmpop)->op_last = rightarg;
+           OpMORESIB_set(leftarg, rightarg);
+           OpLASTSIB_set(rightarg, cmpop);
+           cmpop->op_flags = OPf_KIDS;
+           cmpop->op_private = 2;
+           cmpop = CHECKOP(cmpoptype, cmpop);
+           if(!cmpop->op_next && cmpop->op_type == cmpoptype)
+               cmpop = fold_constants(op_integerize(op_std_init(cmpop)));
+           condop = condop ? newLOGOP(OP_CMPCHAIN_AND, 0, cmpop, condop) :
+                       cmpop;
+           if (!nextrightarg)
+               return condop;
+           rightarg = nextrightarg;
+       }
+    }
+}
+
  /*
  =for apidoc op_scope
  
@@ -5906,9 +6024,10 @@ S_op_integerize(pTHX_ OP *o)
  }
  
  /* This function exists solely to provide a scope to limit
-   setjmp/longjmp() messing with auto variables.
+   setjmp/longjmp() messing with auto variables.  It cannot be inlined because
+   it uses setjmp
   */
-PERL_STATIC_INLINE int
+STATIC int
  S_fold_constants_eval(pTHX) {
      int ret = 0;
      dJMPENV;
@@ -6853,16 +6972,17 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
       * allocated, then copied.  If the replacement for every character in every
       * possible string takes up no more bytes than the the character it
       * replaces, then it can be edited in place.  Otherwise the replacement
-     * could "grow", depending on the strings being processed.  Some inputs
-     * won't grow, and might even shrink under /d, but some inputs could grow,
-     * so we have to assume any given one might grow.  On very long inputs, the
-     * temporary could eat up a lot of memory, so we want to avoid it if
-     * possible.  For non-UTF-8 inputs, everything is single-byte, so can be
-     * edited in place, unless there is something in the pattern that could
-     * force it into UTF-8.  The inversion map makes it feasible to determine
-     * this.  Previous versions of this code pretty much punted on determining
-     * if UTF-8 could be edited in place.  Now, this code is rigorous in making
-     * that determination.
+     * could overwrite a byte we are about to read, depending on the strings
+     * being processed.  The comments and variable names here refer to this as
+     * "growing".  Some inputs won't grow, and might even shrink under /d, but
+     * some inputs could grow, so we have to assume any given one might grow.
+     * On very long inputs, the temporary could eat up a lot of memory, so we
+     * want to avoid it if possible.  For non-UTF-8 inputs, everything is
+     * single-byte, so can be edited in place, unless there is something in the
+     * pattern that could force it into UTF-8.  The inversion map makes it
+     * feasible to determine this.  Previous versions of this code pretty much
+     * punted on determining if UTF-8 could be edited in place.  Now, this code
+     * is rigorous in making that determination.
       *
       * Another characteristic we need to know is whether the lhs and rhs are
       * identical.  If so, and no other flags are present, the only effect of
@@ -6899,9 +7019,9 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
      const bool squash     = cBOOL(o->op_private & OPpTRANS_SQUASH);
      const bool del        = cBOOL(o->op_private & OPpTRANS_DELETE);
  
-    /* Set to true if there is some character < 256 in the lhs that maps to >
-     * 255.  If so, a non-UTF-8 match string can be forced into requiring to be
-     * in UTF-8 by a tr/// operation. */
+    /* Set to true if there is some character < 256 in the lhs that maps to
+     * above 255.  If so, a non-UTF-8 match string can be forced into being in
+     * UTF-8 by a tr/// operation. */
      bool can_force_utf8 = FALSE;
  
      /* What is the maximum expansion factor in UTF-8 transliterations.  If a
@@ -6909,7 +7029,7 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
       * expansion factor is 1.5.  This number is used at runtime to calculate
       * how much space to allocate for non-inplace transliterations.  Without
       * this number, the worst case is 14, which is extremely unlikely to happen
-     * in real life, and would require significant memory overhead. */
+     * in real life, and could require significant memory overhead. */
      NV max_expansion = 1.;
  
      UV t_range_count, r_range_count, min_range_count;
@@ -6951,16 +7071,16 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
  #ifndef EBCDIC
      UV PL_partition_by_byte_length[] = {
          0,
-        0x80,
-        (32 * (1UL << (    UTF_ACCUMULATION_SHIFT))),
-        (16 * (1UL << (2 * UTF_ACCUMULATION_SHIFT))),
-        ( 8 * (1UL << (3 * UTF_ACCUMULATION_SHIFT))),
-        ( 4 * (1UL << (4 * UTF_ACCUMULATION_SHIFT))),
-        ( 2 * (1UL << (5 * UTF_ACCUMULATION_SHIFT)))
+        0x80,   /* Below this is 1 byte representations */
+        (32 * (1UL << (    UTF_ACCUMULATION_SHIFT))),   /* 2 bytes below this */
+        (16 * (1UL << (2 * UTF_ACCUMULATION_SHIFT))),   /* 3 bytes below this */
+        ( 8 * (1UL << (3 * UTF_ACCUMULATION_SHIFT))),   /* 4 bytes below this */
+        ( 4 * (1UL << (4 * UTF_ACCUMULATION_SHIFT))),   /* 5 bytes below this */
+        ( 2 * (1UL << (5 * UTF_ACCUMULATION_SHIFT)))    /* 6 bytes below this */
  
  #  ifdef UV_IS_QUAD
                                                      ,
-        ( ((UV) 1U << (6 * UTF_ACCUMULATION_SHIFT)))
+        ( ((UV) 1U << (6 * UTF_ACCUMULATION_SHIFT)))    /* 7 bytes below this */
  #  endif
  
      };
@@ -7356,7 +7476,12 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
              t_cp_end = MIN(IV_MAX, t_cp + span - 1);
  
              if (r_cp == TR_SPECIAL_HANDLING) {
-                r_cp_end = TR_SPECIAL_HANDLING;
+
+                /* If unmatched lhs code points map to the final map, use that
+                 * value.  This being set to TR_SPECIAL_HANDLING indicates that
+                 * we don't have a final map: unmatched lhs code points are
+                 * simply deleted */
+                r_cp_end = (del) ? TR_SPECIAL_HANDLING : final_map;
              }
              else {
                  r_cp_end = MIN(IV_MAX, r_cp + span - 1);
@@ -7386,6 +7511,13 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
                   * we use the above sample data.  The t_cp chunk must be any
                   * contiguous subset of M, N, O, P, and/or Q.
                   *
+                 * In the first pass, calculate if there is any possible input
+                 * string that has a character whose transliteration will be
+                 * longer than it.  If none, the transliteration may be done
+                 * in-place, as it can't write over a so-far unread byte.
+                 * Otherwise, a copy must first be made.  This could be
+                 * expensive for long inputs.
+                 *
                   * In the first pass, the t_invlist has been partitioned so
                   * that all elements in any single range have the same number
                   * of bytes in their UTF-8 representations.  And the r space is
@@ -7409,15 +7541,25 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
                      && r_cp_end != TR_SPECIAL_HANDLING
                      && UVCHR_SKIP(t_cp_end) < UVCHR_SKIP(r_cp_end))
                  {
-                    /* Consider tr/\xCB/\X{E000}/.  The maximum expansion
-                     * factor is 1 byte going to 3 if the lhs is not UTF-8, but
-                     * 2 bytes going to 3 if it is in UTF-8.  We could pass two
-                     * different values so doop could choose based on the
-                     * UTF-8ness of the target.  But khw thinks (perhaps
-                     * wrongly) that is overkill.  It is used only to make sure
-                     * we malloc enough space.  If no target string can force
-                     * the result to be UTF-8, then we don't have to worry
-                     * about this */
+                    /* Here, we will need to make a copy of the input string
+                     * before doing the transliteration.  The worst possible
+                     * case is an expansion ratio of 14:1. This is rare, and
+                     * we'd rather allocate only the necessary amount of extra
+                     * memory for that copy.  We can calculate the worst case
+                     * for this particular transliteration is by keeping track
+                     * of the expansion factor for each range.
+                     *
+                     * Consider tr/\xCB/\X{E000}/.  The maximum expansion
+                     * factor is 1 byte going to 3 if the target string is not
+                     * UTF-8, but 2 bytes going to 3 if it is in UTF-8.  We
+                     * could pass two different values so doop could choose
+                     * based on the UTF-8ness of the target.  But khw thinks
+                     * (perhaps wrongly) that is overkill.  It is used only to
+                     * make sure we malloc enough space.
+                     *
+                     * If no target string can force the result to be UTF-8,
+                     * then we don't have to worry about the case of the target
+                     * string not being UTF-8 */
                      NV t_size = (can_force_utf8 && t_cp < 256)
                                  ? 1
                                  : UVCHR_SKIP(t_cp_end);
@@ -11597,10 +11739,9 @@ S_process_special_blocks(pTHX_ I32 floor, const char *const fullname,
                   * to PL_compiling, IN_PERL_COMPILETIME/IN_PERL_RUNTIME
                   * will give the wrong answer.
                   */
-                Newx(PL_curcop, 1, COP);
-                StructCopy(&PL_compiling, PL_curcop, COP);
-                PL_curcop->op_slabbed = 0;
-                SAVEFREEPV(PL_curcop);
+                PL_curcop = (COP*)newSTATEOP(PL_compiling.op_flags, NULL, NULL);
+                CopLINE_set(PL_curcop, CopLINE(&PL_compiling));
+                SAVEFREEOP(PL_curcop);
              }
  
              PUSHSTACKi(PERLSI_REQUIRE);
@@ -17431,6 +17572,7 @@ Perl_rpeep(pTHX_ OP *o)
          case OP_AND:
         case OP_OR:
         case OP_DOR:
+       case OP_CMPCHAIN_AND:
             while (cLOGOP->op_other->op_type == OP_NULL)
                 cLOGOP->op_other = cLOGOP->op_other->op_next;
             while (o->op_next && (   o->op_type == o->op_next->op_type