regcomp_internal.h

   1 #ifndef REGCOMP_INTERNAL_H
   2 #define REGCOMP_INTERNAL_H
   3 #ifndef STATIC              /* an includer may pre-define STATIC to override the default */
   4 #define STATIC  static      /* default: STATIC expands to the 'static' keyword */
   5 #endif
   6
   7 /* this is a chain of data about sub patterns we are processing that
   8    need to be handled separately/specially in study_chunk. It's so
   9    we can simulate recursion without losing state.  */
  10 struct scan_frame;
  11 typedef struct scan_frame {
  12     regnode *last_regnode;      /* last node to process in this frame */
  13     regnode *next_regnode;      /* next node to process when last is reached */
  14     U32 prev_recursed_depth;    /* NOTE(review): presumably the recursion depth in effect before this frame - confirm in study_chunk */
  15     I32 stopparen;              /* what stopparen do we use */
  16     bool in_gosub;              /* this or an outer frame is for GOSUB */
  17
  18     struct scan_frame *this_prev_frame; /* this previous frame */
  19     struct scan_frame *prev_frame;      /* previous frame */
  20     struct scan_frame *next_frame;      /* next frame */
  21 } scan_frame;
  22
  23 /* Certain characters are output as a sequence with the first being a
  24  * backslash.  The set tested here is '-', '[', ']', backslash and '^'. */
  25 #define isBACKSLASHED_PUNCT(c)  memCHRs("-[]\\^", c)
  26
  27
  28 struct RExC_state_t {
  29     U32         flags;                  /* RXf_* are we folding, multilining? */
  30     U32         pm_flags;               /* PMf_* stuff from the calling PMOP */
  31     char        *precomp;               /* uncompiled string. */
  32     char        *precomp_end;           /* pointer to end of uncompiled string. */
  33     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
  34     regexp      *rx;                    /* perl core regexp structure */
  35     regexp_internal     *rxi;           /* internal data for regexp object
  36                                            pprivate field */
  37     char        *start;                 /* Start of input for compile */
  38     char        *end;                   /* End of input for compile */
  39     char        *parse;                 /* Input-scan pointer. */
  40     char        *copy_start;            /* start of copy of input within
  41                                            constructed parse string */
  42     char        *save_copy_start;       /* Provides one level of saving
  43                                            and restoring 'copy_start' */
  44     char        *copy_start_in_input;   /* Position in input string
  45                                            corresponding to copy_start */
  46     SSize_t     whilem_seen;            /* number of WHILEM in this expr */
  47     regnode     *emit_start;            /* Start of emitted-code area */
  48     regnode_offset emit;                /* Code-emit pointer */
  49     I32         naughty;                /* How bad is this pattern? */
  50     I32         sawback;                /* Did we see \1, ...? */
  51     SSize_t     size;                   /* Number of regnode equivalents in
  52                                            pattern */
  53     Size_t      sets_depth;              /* Counts recursion depth of already-
  54                                            compiled regex set patterns */
  55     U32         seen;                   /* NOTE(review): undocumented; presumably a bit mask of constructs seen - confirm */
  56
  57     I32      parens_buf_size;           /* #slots malloced open/close_parens */
  58     regnode_offset *open_parens;        /* offsets to open parens */
  59     regnode_offset *close_parens;       /* offsets to close parens */
  60     HV          *paren_names;           /* Paren names */
  61
  62     /* position beyond 'precomp' of the warning message furthest away from
  63      * 'precomp'.  During the parse, no warnings are raised for any problems
  64      * earlier in the parse than this position.  This works if warnings are
  65      * raised the first time a given spot is parsed, and if only one
  66      * independent warning is raised for any given spot */
  67     Size_t      latest_warn_offset;
  68
  69     /* Branch reset /(?|...|...)/ gives us two concepts of capture buffer id.
  70      * "Logical Parno" is the user visible view with branch reset taken into
  71      * account. "Parno" (or physical parno) is the actual capture buffers in
  72      * the pattern *NOT* taking into account branch reset. We also maintain
  73      * a map of "next" pointers which allow us to skip to the next physical
  74      * capture buffer with the same logical id, with 0 representing "none".
  75      *
  76      * As we compile we keep track of the two different counts using the
  77      * 'logical_npar' and 'npar' members, and we keep track of the upper bound
  78      * of both in 'total_par' and 'logical_total_par'. We also populate
  79      * the 'logical_to_parno' map, which gives us the first physical parno
  80      * for a given logical parno, and the 'parno_to_logical' array which gives
  81      * us the logical id for each physical parno. When compilation is
  82      * completed we construct the 'parno_to_logical_next' array from the
  83      * 'parno_to_logical' array. (We do not bother constructing it during
  84      * compilation as we do not need it, and we can construct it in O(N) time
  85      * once we are done, but would need more complicated logic during the
  86      * compile, because we want the next pointers to go from smallest to
  87      * largest, eg, left to right.)
  88      *
  89      * Logical: $1      $2  $3  $4    $2  $3    $2    $5
  90      * Physical: 1       2   3   4     5   6     7     8
  91      * Next:     0       5   6   0     7   0     0     0
  92      * Pattern /(a) (?| (b) (c) (d) | (e) (f) | (g) ) (h)/
  93      *
  94      * As much as possible the internals use and store the physical id
  95      * of capture buffers. We decode the physical to the logical only when
  96      * we need to, for instance when someone uses $2.
  97      *
  98      * Note that when branch reset is not used logical and physical are the
  99      * same and the next data would be all zero. So when branch reset is not
 100      * used we do not need to populate this data into the final regexp.
 101      *
 102      */
 103     I32         *logical_to_parno;        /* logical_parno to parno */
 104     I32         *parno_to_logical;        /* parno to logical_parno */
 105     I32         *parno_to_logical_next;   /* parno to next (greater value)
 106                                              parno with the same
 107                                              logical_parno as parno.*/
 108
 109     I32         npar;                   /* Capture buffer count so far in the
 110                                            parse, (OPEN) plus one. ("par" 0 is
 111                                            the whole pattern)*/
 112     I32         logical_npar;           /* Logical version of npar */
 113     I32         total_par;              /* During initial parse, is either 0,
 114                                            or -1; the latter indicating a
 115                                            reparse is needed.  After that pass,
 116                                            it is what 'npar' became after the
 117                                            pass.  Hence, it being > 0 indicates
 118                                            we are in a reparse situation */
 119     I32         logical_total_par;      /* Logical version of total_par */
 120     I32         nestroot;               /* root parens we are in - used by
 121                                            accept */
 122     I32         seen_zerolen;
 123     regnode     *end_op;                /* END node in program */
 124     I32         utf8;           /* whether the pattern is utf8 or not */
 125     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 126                                 /* XXX use this for future optimisation of case
 127                                  * where pattern must be upgraded to utf8. */
 128     I32         uni_semantics;  /* If a d charset modifier should use unicode
 129                                    rules, even if the pattern is not in
 130                                    utf8 */
 131
 132     I32         recurse_count;          /* Number of recurse regops we have generated */
 133     regnode     **recurse;              /* Recurse regops */
 134     U8          *study_chunk_recursed;  /* bitmap of which subs we have moved
 135                                            through */
 136     U32         study_chunk_recursed_bytes;  /* bytes in bitmap */
 137     I32         in_lookaround;
 138     I32         contains_locale;
 139     I32         override_recoding;
 140     I32         recode_x_to_native;
 141     I32         in_multi_char_class;
 142     int         code_index;             /* next code_blocks[] slot */
 143     struct reg_code_blocks *code_blocks;/* positions of literal (?{})
 144                                             within pattern */
 145     SSize_t     maxlen;                        /* maximum possible number of chars in string to match. NOTE(review): this comment used to say "minimum"; the field name suggests maximum - confirm */
 146     scan_frame *frame_head;             /* NOTE(review): presumably first scan_frame of the chain - confirm */
 147     scan_frame *frame_last;             /* NOTE(review): presumably last scan_frame of the chain - confirm */
 148     U32         frame_count;
 149     AV         *warn_text;
 150     HV         *unlexed_names;
 151     SV          *runtime_code_qr;       /* qr with the runtime code blocks */
 152 #ifdef DEBUGGING
 153     const char  *lastparse;
 154     I32         lastnum;
 155     U32         study_chunk_recursed_count;
 156     AV          *paren_name_list;       /* idx -> name */
 157     SV          *mysv1;
 158     SV          *mysv2;
 159
 160 #define RExC_lastparse  (pRExC_state->lastparse)
 161 #define RExC_lastnum    (pRExC_state->lastnum)
 162 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 163 #define RExC_study_chunk_recursed_count    (pRExC_state->study_chunk_recursed_count)
 164 #define RExC_mysv       (pRExC_state->mysv1)   /* same field as RExC_mysv1 */
 165 #define RExC_mysv1      (pRExC_state->mysv1)
 166 #define RExC_mysv2      (pRExC_state->mysv2)
 167
 168 #endif
 169     bool        seen_d_op;              /* Seen something that differs under /d from /u ? */
 170     bool        strict;
 171     bool        study_started;
 172     bool        in_script_run;
 173     bool        use_BRANCHJ;            /* set to 1 by REQUIRE_BRANCHJ() */
 174     bool        sWARN_EXPERIMENTAL__VLB;          /* read via RExC_warned_WARN_EXPERIMENTAL__VLB */
 175     bool        sWARN_EXPERIMENTAL__REGEX_SETS;   /* read via RExC_warned_WARN_EXPERIMENTAL__REGEX_SETS */
 176 };
 177
 178 #define RExC_flags      (pRExC_state->flags)   /* every RExC_ accessor needs 'pRExC_state' in scope */
 179 #define RExC_pm_flags   (pRExC_state->pm_flags)
 180 #define RExC_precomp    (pRExC_state->precomp)
 181 #define RExC_copy_start_in_input (pRExC_state->copy_start_in_input)
 182 #define RExC_copy_start_in_constructed  (pRExC_state->copy_start)   /* field is named 'copy_start' */
 183 #define RExC_save_copy_start_in_constructed  (pRExC_state->save_copy_start)   /* field is named 'save_copy_start' */
 184 #define RExC_precomp_end (pRExC_state->precomp_end)
 185 #define RExC_rx_sv      (pRExC_state->rx_sv)
 186 #define RExC_rx         (pRExC_state->rx)
 187 #define RExC_rxi        (pRExC_state->rxi)
 188 #define RExC_start      (pRExC_state->start)
 189 #define RExC_end        (pRExC_state->end)
 190 #define RExC_parse      (pRExC_state->parse)
 191 #define RExC_latest_warn_offset (pRExC_state->latest_warn_offset )
 192 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 193 #define RExC_seen_d_op (pRExC_state->seen_d_op) /* Seen something that differs
 194                                                    under /d from /u ? */
 195
 196 #define RExC_emit       (pRExC_state->emit)
 197 #define RExC_emit_start (pRExC_state->emit_start)
 198 #define RExC_sawback    (pRExC_state->sawback)
 199 #define RExC_seen       (pRExC_state->seen)
 200 #define RExC_size       (pRExC_state->size)
 201 #define RExC_maxlen        (pRExC_state->maxlen)
 202 #define RExC_logical_npar           (pRExC_state->logical_npar)
 203 #define RExC_logical_total_parens   (pRExC_state->logical_total_par)   /* field is named 'logical_total_par' */
 204 #define RExC_logical_to_parno       (pRExC_state->logical_to_parno)
 205 #define RExC_parno_to_logical       (pRExC_state->parno_to_logical)
 206 #define RExC_parno_to_logical_next  (pRExC_state->parno_to_logical_next)
 207 #define RExC_npar       (pRExC_state->npar)
 208 #define RExC_total_parens       (pRExC_state->total_par)   /* field is named 'total_par' */
 209 #define RExC_parens_buf_size    (pRExC_state->parens_buf_size)
 210 #define RExC_nestroot   (pRExC_state->nestroot)
 211 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 212 #define RExC_utf8       (pRExC_state->utf8)
 213 #define RExC_uni_semantics      (pRExC_state->uni_semantics)
 214 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 215 #define RExC_open_parens        (pRExC_state->open_parens)
 216 #define RExC_close_parens       (pRExC_state->close_parens)
 217 #define RExC_end_op     (pRExC_state->end_op)
 218 #define RExC_paren_names        (pRExC_state->paren_names)
 219 #define RExC_recurse    (pRExC_state->recurse)
 220 #define RExC_recurse_count      (pRExC_state->recurse_count)
 221 #define RExC_sets_depth         (pRExC_state->sets_depth)
 222 #define RExC_study_chunk_recursed        (pRExC_state->study_chunk_recursed)
 223 #define RExC_study_chunk_recursed_bytes  \
 224                                    (pRExC_state->study_chunk_recursed_bytes)
 225 #define RExC_in_lookaround      (pRExC_state->in_lookaround)
 226 #define RExC_contains_locale    (pRExC_state->contains_locale)
 227 #define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
 228
 229 #ifdef EBCDIC   /* RExC_recode_x_to_native is only assigned on EBCDIC builds */
 230 #  define SET_recode_x_to_native(x)                                         \
 231                     STMT_START { RExC_recode_x_to_native = (x); } STMT_END
 232 #else           /* elsewhere the setter expands to NOOP and 'x' is not evaluated */
 233 #  define SET_recode_x_to_native(x) NOOP
 234 #endif
 235
 236 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
 237 #define RExC_frame_head (pRExC_state->frame_head)
 238 #define RExC_frame_last (pRExC_state->frame_last)
 239 #define RExC_frame_count (pRExC_state->frame_count)
 240 #define RExC_strict (pRExC_state->strict)
 241 #define RExC_study_started      (pRExC_state->study_started)
 242 #define RExC_warn_text (pRExC_state->warn_text)
 243 #define RExC_in_script_run      (pRExC_state->in_script_run)
 244 #define RExC_use_BRANCHJ        (pRExC_state->use_BRANCHJ)
 245 #define RExC_warned_WARN_EXPERIMENTAL__VLB (pRExC_state->sWARN_EXPERIMENTAL__VLB)   /* field name has an 's' prefix instead of 'warned_' */
 246 #define RExC_warned_WARN_EXPERIMENTAL__REGEX_SETS (pRExC_state->sWARN_EXPERIMENTAL__REGEX_SETS)   /* likewise */
 247 #define RExC_unlexed_names (pRExC_state->unlexed_names)
 248
 249
 250 /***********************************************************************/
 251 /* UTILITY MACROS FOR ADVANCING OR SETTING THE PARSE "CURSOR" RExC_parse
 252  *
 253  * All of these macros depend on the above RExC_ accessor macros, which
 254  * in turn depend on a variable pRExC_state being in scope where they
 255  * are used. This is the standard regexp parser context variable which is
 256  * passed into every non-trivial parse function in this file.
 257  *
 258  * Note that the UTF macro is itself a wrapper around RExC_utf8, so all
 259  * of the macros which do not take an argument will operate on the
 260  * pRExC_state structure *only*.
 261  *
 262  * Please do NOT modify RExC_parse without using these macros. In the
 263  * future these macros will be extended for enhanced debugging and trace
 264  * output during the parse process.
 265  */
 266
 267 /* RExC_parse_incf(flag)
 268  *
 269  * Increment RExC_parse to point at the next codepoint, while doing
 270  * the right thing depending on whether we are parsing UTF-8 strings
 271  * or not. The 'flag' argument determines if content is UTF-8 or not,
 272  * intended for cases where this is NOT governed by the UTF macro.
 273  *
 274  * Use RExC_parse_inc() if UTF-8ness is controlled by the UTF macro.
 275  *
 276  * WARNING: Does NOT take into account RExC_end; it is the caller's
 277  * responsibility to make sure there are enough octets left in
 278  * RExC_parse to ensure that when processing UTF-8 we would not read
 279  * past the end of the string.
 280  */
 281 #define RExC_parse_incf(flag) STMT_START {              \
 282     RExC_parse += (flag) ? UTF8SKIP(RExC_parse) : 1;    \
 283 } STMT_END
 284
 285 /* RExC_parse_inc_safef(flag)
 286  *
 287  * Safely increment RExC_parse to point at the next codepoint,
 288  * doing the right thing depending on whether we are parsing
 289  * UTF-8 strings or not and NOT reading past the end of the buffer.
 290  * The 'flag' argument determines if content is UTF-8 or not,
 291  * intended for cases where this is NOT governed by the UTF macro.
 292  *
 293  * Use RExC_parse_inc_safe() if UTF-8ness is controlled by the UTF macro.
 294  *
 295  * NOTE: Will NOT read past RExC_end when content is UTF-8.
 296  */
 297 #define RExC_parse_inc_safef(flag) STMT_START {                     \
 298     RExC_parse += (flag) ? UTF8_SAFE_SKIP(RExC_parse,RExC_end) : 1; \
 299 } STMT_END
 300
 301 /* RExC_parse_inc()
 302  *
 303  * Increment RExC_parse to point at the next codepoint,
 304  * doing the right thing depending on whether we are parsing
 305  * UTF-8 strings or not.
 306  *
 307  * WARNING: Does NOT take into account RExC_end; it is the caller's
 308  * responsibility to make sure there are enough octets left in
 309  * RExC_parse to ensure that when processing UTF-8 we would not read
 310  * past the end of the string.
 311  *
 312  * NOTE: whether we are parsing UTF-8 or not is determined by the
 313  * UTF macro, which is a wrapper around RExC_utf8, thus this
 314  * macro operates on the pRExC_state structure only.
 315  */
 316 #define RExC_parse_inc() RExC_parse_incf(UTF)
 317
 318 /* RExC_parse_inc_safe()
 319  *
 320  * Safely increment RExC_parse to point at the next codepoint,
 321  * doing the right thing depending on whether we are parsing
 322  * UTF-8 strings or not and NOT reading past the end of the buffer.
 323  *
 324  * NOTE: whether we are parsing UTF-8 or not is determined by the
 325  * UTF macro, which is a wrapper around RExC_utf8, thus this
 326  * macro operates on the pRExC_state structure only.
 327  */
 328 #define RExC_parse_inc_safe() RExC_parse_inc_safef(UTF)
 329
 330 /* RExC_parse_inc_utf8()
 331  *
 332  * Increment RExC_parse to point at the next UTF-8 codepoint;
 333  * assumes content is UTF-8.
 334  *
 335  * WARNING: Does NOT take into account RExC_end; it is the caller's
 336  * responsibility to make sure there are enough octets left in RExC_parse
 337  * to ensure that when processing UTF-8 we would not read past the end
 338  * of the string.
 339  */
 340 #define RExC_parse_inc_utf8() STMT_START {  \
 341     RExC_parse += UTF8SKIP(RExC_parse);     \
 342 } STMT_END
 343
 344 /* RExC_parse_inc_if_char()
 345  *
 346  * Increment RExC_parse to point at the next codepoint, if and only
 347  * if the current parse point is NOT a NULL, while doing the right thing
 348  * depending on whether we are parsing UTF-8 strings or not.
 349  *
 350  * WARNING: Does NOT take into account RExC_end; it is the caller's
 351  * responsibility to make sure there are enough octets left in RExC_parse
 352  * to ensure that when processing UTF-8 we would not read past the end
 353  * of the string.
 354  *
 355  * NOTE: whether we are parsing UTF-8 or not is determined by the
 356  * UTF macro, which is a wrapper around RExC_utf8, thus this
 357  * macro operates on the pRExC_state structure only.
 358  */
 359 #define RExC_parse_inc_if_char() STMT_START {         \
 360     RExC_parse += SKIP_IF_CHAR(RExC_parse,RExC_end);  \
 361 } STMT_END
 362
 363 /* RExC_parse_inc_by(n_octets)
 364  *
 365  * Increment the parse cursor by the number of octets specified by
 366  * the 'n_octets' argument.
 367  *
 368  * NOTE: Does NOT check ANY constraints. It is the caller's responsibility
 369  * to ensure that this will not move past the end of the string, or leave
 370  * the pointer in the middle of a UTF-8 sequence.
 371  *
 372  * Typically used to advance past previously analyzed content.
 373  */
 374 #define RExC_parse_inc_by(n_octets) STMT_START {  \
 375     RExC_parse += (n_octets);                     \
 376 } STMT_END
 377
 378 /* RExC_parse_set(to_ptr)
 379  *
 380  * Sets the RExC_parse pointer to the pointer specified by the 'to_ptr'
 381  * argument. No validation whatsoever is performed on the 'to_ptr' pointer.
 382  */
 383 #define RExC_parse_set(to_ptr) STMT_START { \
 384     RExC_parse = (to_ptr);                  \
 385 } STMT_END
 386
 387 /**********************************************************************/
 388
 389 /* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
 390  * a flag to disable back-off on the fixed/floating substrings - if it's
 391  * a high complexity pattern we assume the benefit of avoiding a full match
 392  * is worth the cost of checking for the substrings even if they rarely help.
 393  */
 394 #define RExC_naughty    (pRExC_state->naughty)
 395 #define TOO_NAUGHTY (10)
 396 #define MARK_NAUGHTY(add) \
 397     if (RExC_naughty < TOO_NAUGHTY) \
 398         RExC_naughty += (add)
 399 #define MARK_NAUGHTY_EXP(exp, add) \
 400     if (RExC_naughty < TOO_NAUGHTY) \
 401         RExC_naughty += RExC_naughty / (exp) + (add)
 402
 403 #define isNON_BRACE_QUANTIFIER(c)   ((c) == '*' || (c) == '+' || (c) == '?') /* c is '*', '+' or '?' */
 404 #define isQUANTIFIER(s,e)  (   isNON_BRACE_QUANTIFIER(*(s))                    \
 405                             || (*(s) == '{' && regcurly((s), (e), NULL))) /* also '{' when regcurly() accepts s..e; 's' is evaluated more than once */
 406
 407 /*
 408  * Flags to be passed up.  Each is a distinct bit; 0x04 is not assigned here.
 409  */
 410 #define HASWIDTH        0x01    /* Known to not match null strings, could match
 411                                    non-null ones. */
 412 #define SIMPLE          0x02    /* Exactly one character wide */
 413                                 /* (or LNBREAK as a special case) */
 414 #define POSTPONED       0x08    /* (?1),(?&name), (??{...}) or similar */
 415 #define TRYAGAIN        0x10    /* Weeded out a declaration. */
 416 #define RESTART_PARSE   0x20    /* Need to redo the parse */
 417 #define NEED_UTF8       0x40    /* In conjunction with RESTART_PARSE, need to
 418                                    calculate sizes as UTF-8 */
 419
 420 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1) /* distance of node x from RExC_emit_start, or -1 when x is null */
 421
 422 /* whether trie related optimizations are enabled: all three switches are defined only when PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION is non-zero */
 423 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 424 #define TRIE_STUDY_OPT
 425 #define FULL_TRIE_STUDY
 426 #define TRIE_STCLASS
 427 #endif
 428
 429 /* About the term "restudy" and the var "restudied" and the defines
 430  * "SCF_TRIE_RESTUDY" and "SCF_TRIE_DOING_RESTUDY": All of these relate to
 431  * doing multiple study_chunk() calls over the same set of opcodes for the
 432  * purpose of enhanced TRIE optimizations.
 433  *
 434  * Specifically, when TRIE_STUDY_OPT is defined, and it is defined in normal
 435  * builds, (see above), during compilation SCF_TRIE_RESTUDY may be enabled
 436  * which then causes the Perl_re_op_compile() to then call the optimizer
 437  * S_study_chunk() a second time to perform additional optimizations,
 438  * including the aho_corasick startclass optimization.
 439  * This additional pass will only happen once, which is managed by the
 440  * 'restudied' variable in Perl_re_op_compile().
 441  *
 442  * When this second pass is under way the flags passed into study_chunk() will
 443  * include SCF_TRIE_DOING_RESTUDY and this flag is and must be cascaded down
 444  * to any recursive calls to S_study_chunk().
 445  *
 446  * IMPORTANT: Any logic in study_chunk() that emits warnings should check that
 447  * the SCF_TRIE_DOING_RESTUDY flag is NOT set in 'flags', or the warning may
 448  * be produced twice.
 449  *
 450  * See commit 07be1b83a6b2d24b492356181ddf70e1c7917ae3 and
 451  * 688e03912e3bff2d2419c457d8b0e1bab3eb7112 for more details.
 452  */
 453
 454
 455 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]   /* the byte of u8str that holds bit number 'paren' (8 bits per byte) */
 456 #define PBITVAL(paren) (1 << ((paren) & 7))   /* mask for bit 'paren' within that byte */
 457 #define PAREN_OFFSET(depth) /* start of the bitmap row for 'depth' */ \
 458     (RExC_study_chunk_recursed + (depth) * RExC_study_chunk_recursed_bytes)
 459 #define PAREN_TEST(depth, paren) /* non-zero if the bit is set */ \
 460     (PBYTE(PAREN_OFFSET(depth), paren) & PBITVAL(paren))
 461 #define PAREN_SET(depth, paren) /* set the bit */ \
 462     (PBYTE(PAREN_OFFSET(depth), paren) |= PBITVAL(paren))
 463 #define PAREN_UNSET(depth, paren) /* clear the bit */ \
 464     (PBYTE(PAREN_OFFSET(depth), paren) &= ~PBITVAL(paren))
 465
 466 #define REQUIRE_UTF8(flagp) STMT_START {                                   \
 467                                      if (!UTF) {                           \
 468                                          *flagp = RESTART_PARSE|NEED_UTF8; \
 469                                          return 0;                         \
 470                                      }                                     \
 471                              } STMT_END
 472
 473 /* /u is to be chosen if we are supposed to use Unicode rules
 474  * (RExC_uni_semantics), or if the pattern is in UTF-8 (UTF).  This latter
 475  * condition is in case the outermost rules are locale.  See GH #17278 */
 476 #define toUSE_UNI_CHARSET_NOT_DEPENDS (RExC_uni_semantics || UTF)
 477
 478 /* Change from /d into /u rules, and restart the parse.  RExC_uni_semantics is
 479  * a flag that indicates we need to override /d with /u as a result of
 480  * something in the pattern.  It should only be used in regards to calling
 481  * set_regex_charset() or get_regex_charset() */
 482 #define REQUIRE_UNI_RULES(flagp, restart_retval)                            \
 483     STMT_START {                                                            \
 484             if (DEPENDS_SEMANTICS) {                                        \
 485                 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);      \
 486                 RExC_uni_semantics = 1;                                     \
 487                 if (RExC_seen_d_op && LIKELY(! IN_PARENS_PASS)) {           \
 488                     /* No need to restart the parse if we haven't seen      \
 489                      * anything that differs between /u and /d, and no need \
 490                      * to restart immediately if we're going to reparse     \
 491                      * anyway to count parens */                            \
 492                     *flagp |= RESTART_PARSE;                                \
 493                     return restart_retval;                                  \
 494                 }                                                           \
 495             }                                                               \
 496     } STMT_END
 497
 498 #define REQUIRE_BRANCHJ(flagp, restart_retval)                              \
 499     STMT_START {                                                            \
 500                 RExC_use_BRANCHJ = 1;                                       \
 501                 *flagp |= RESTART_PARSE;                                    \
 502                 return restart_retval;                                      \
 503     } STMT_END
 504
 505 /* Until we have completed the parse, we leave RExC_total_parens at 0 or
 506  * less.  After that, it must always be positive, because the whole re is
 507  * considered to be surrounded by virtual parens.  Setting it to negative
 508  * indicates there is some construct that needs to know the actual number of
 509  * parens to be properly handled.  And that means an extra pass will be
 510  * required after we've counted them all */
 511 #define ALL_PARENS_COUNTED (RExC_total_parens > 0)
 512 #define REQUIRE_PARENS_PASS                                                 \
 513     STMT_START {  /* No-op if have completed a pass */                      \
 514                     if (! ALL_PARENS_COUNTED) RExC_total_parens = -1;       \
 515     } STMT_END
 516 #define IN_PARENS_PASS (RExC_total_parens < 0)   /* negative count: an extra pass has been requested */
 517
 518
 519 /* This is used to return failure (zero) early from the calling function if
 520  * various flags in 'flags' are set.  Two flags always cause a return:
 521  * 'RESTART_PARSE' and 'NEED_UTF8'.  'extra' can be used to specify any
 522  * additional flags that should cause a return; 0 if none.  If the return will
 523  * be done, '*flagp' is first set to be all of the flags that caused the
 524  * return.  Note that 'flags' and 'extra' may each be evaluated twice. */
 525 #define RETURN_FAIL_ON_RESTART_OR_FLAGS(flags,flagp,extra)                  \
 526     STMT_START {                                                            \
 527             if ((flags) & (RESTART_PARSE|NEED_UTF8|(extra))) {              \
 528                 *(flagp) = (flags) & (RESTART_PARSE|NEED_UTF8|(extra));     \
 529                 return 0;                                                   \
 530             }                                                               \
 531     } STMT_END
 532
 533 #define MUST_RESTART(flags) ((flags) & (RESTART_PARSE))
 534
 535 #define RETURN_FAIL_ON_RESTART(flags,flagp)                                 \
 536                         RETURN_FAIL_ON_RESTART_OR_FLAGS( flags, flagp, 0)
 537 #define RETURN_FAIL_ON_RESTART_FLAGP(flagp)                                 \
 538                                     if (MUST_RESTART(*(flagp))) return 0
 539
 540 /* This converts the named class defined in regcomp.h to its equivalent class
 541  * number defined in handy.h (integer division by 2); the second macro maps a class number back to the even named-class value. */
 542 #define namedclass_to_classnum(class)  ((int) ((class) / 2))
 543 #define classnum_to_namedclass(classnum)  ((classnum) * 2)
 544
 545 #define _invlist_union_complement_2nd(a, b, output) /* passes TRUE as the third argument */ \
 546                         _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
 547 #define _invlist_intersection_complement_2nd(a, b, output) /* passes TRUE as the third argument */ \
 548                  _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
 549
 550 /* We add a marker if we are deferring expansion of a property that is both
 551  * 1) potentially user-defined; and
 552  * 2) could also be an official Unicode property.
 553  *
 554  * Without this marker, any deferred expansion can only be for a user-defined
 555  * one.  This marker shouldn't conflict with any that could be in a legal name,
 556  * and is appended to its name to indicate this.  There is a string form
 557  * (suffix 's') and a character form (suffix 'c') */
 558 #define DEFERRED_COULD_BE_OFFICIAL_MARKERs  "~"
 559 #define DEFERRED_COULD_BE_OFFICIAL_MARKERc  '~'
 560
 561 /* What is infinity for optimization purposes: the SSize_t_MAX constant */
 562 #define OPTIMIZE_INFTY  SSize_t_MAX
 563
 564 /* About scan_data_t.
 565
 566   During optimisation we recurse through the regexp program performing
 567   various inplace (keyhole style) optimisations. In addition study_chunk
 568   and scan_commit populate this data structure with information about
 569   what strings MUST appear in the pattern. We look for the longest
 570   string that must appear at a fixed location, and we look for the
 571   longest string that may appear at a floating location. So for instance
 572   in the pattern:
 573
 574     /FOO[xX]A.*B[xX]BAR/
 575
 576   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 577   strings (because they follow a .* construct). study_chunk will identify
 578   both FOO and BAR as being the longest fixed and floating strings respectively.
 579
 580   The strings can be composites, for instance
 581
 582      /(f)(o)(o)/
 583
 584   will result in a composite fixed substring 'foo'.
 585
 586   For each string some basic information is maintained:
 587
 588   - min_offset
 589     This is the position the string must appear at, or not before.
 590     It also implicitly (when combined with minlenp) tells us how many
 591     characters must match before the string we are searching for.
 592     Likewise when combined with minlenp and the length of the string it
 593     tells us how many characters must appear after the string we have
 594     found.
 595
 596   - max_offset
 597     Only used for floating strings. This is the rightmost point that
 598     the string can appear at. If set to OPTIMIZE_INFTY it indicates that the
 599     string can occur infinitely far to the right.
 600     For fixed strings, it is equal to min_offset.
 601
 602   - minlenp
 603     A pointer to the minimum number of characters of the pattern that the
 604     string was found inside. This is important as in the case of positive
 605     lookahead or positive lookbehind we can have multiple patterns
 606     involved. Consider
 607
 608     /(?=FOO).*F/
 609
 610     The minimum length of the pattern overall is 3, the minimum length
 611     of the lookahead part is 3, but the minimum length of the part that
 612     will actually match is 1. So 'FOO's minimum length is 3, but the
 613     minimum length for the F is 1. This is important as the minimum length
 614     is used to determine offsets in front of and behind the string being
 615     looked for.  Since strings can be composites this is the length of the
 616     pattern at the time it was committed with a scan_commit. Note that
 617     the length is calculated by study_chunk, so that the minimum lengths
 618     are not known until the full pattern has been compiled, thus the
 619     pointer to the value.
 620
 621   - lookbehind
 622
 623     In the case of lookbehind the string being searched for can be
 624     offset past the start point of the final matching string.
 625     If this value was just blithely removed from the min_offset it would
 626     invalidate some of the calculations for how many chars must match
 627     before or after (as they are derived from min_offset and minlen and
 628     the length of the string being searched for).
 629     When the final pattern is compiled and the data is moved from the
 630     scan_data_t structure into the regexp structure the information
 631     about lookbehind is factored in, with the information that would
 632     have been lost precalculated in the end_shift field for the
 633     associated string.
 634
 635   The fields pos_min and pos_delta are used to store the minimum offset
 636   and the delta to the maximum offset at the current point in the pattern.
 637
 638 */
 639
 640 struct scan_data_substrs {  /* one tracked substring; see "About scan_data_t" */
 641     SV      *str;       /* longest substring found in pattern */
 642     SSize_t min_offset; /* earliest point in string it can appear */
 643     SSize_t max_offset; /* latest point in string it can appear (OPTIMIZE_INFTY: unbounded) */
 644     SSize_t *minlenp;   /* pointer to the minlen of the pattern the string was found in */
 645     SSize_t lookbehind; /* how the pos of the string is modified by lookbehind (LB) */
 646     I32 flags;          /* per substring SF_* and SCF_* flags */
 647 };
 648
 649 /* Optimizer state populated by study_chunk and scan_commit.  This is typedef'ed in perl.h */
 650 struct scan_data_t {
 651     /*I32 len_min;      unused */
 652     /*I32 len_delta;    unused */
 653     SSize_t pos_min;        /* minimum offset at the current point in the pattern */
 654     SSize_t pos_delta;      /* delta from pos_min to the maximum offset at that point */
 655     SV *last_found;         /* candidate for the next substring (see cur_is_floating) */
 656     SSize_t last_end;       /* min value, <0 unless valid. */
 657     SSize_t last_start_min; /* NOTE(review): presumably earliest start of last_found - confirm */
 658     SSize_t last_start_max; /* NOTE(review): presumably latest start of last_found - confirm */
 659     U8      cur_is_floating; /* whether the last_* values should be set as
 660                               * the next fixed (0) or floating (1)
 661                               * substring */
 662
 663     /* [0] is longest fixed substring so far, [1] is longest float so far */
 664     struct scan_data_substrs  substrs[2];
 665
 666     I32 flags;             /* common SF_* and SCF_* flags */
 667     I32 whilem_c;          /* NOTE(review): looks like a count of WHILEM ops - confirm */
 668     SSize_t *last_closep;
 669     regnode **last_close_opp; /* pointer to pointer to last CLOSE regop
 670                                  seen. DO NOT DEREFERENCE the regnode
 671                                  pointer - the op may have been optimized
 672                                  away */
 673     regnode_ssc *start_class;
 674 };
 675
 676 /*
 677  * Forward declarations for pregcomp()'s friends.
 678  */
 679
 680 static const scan_data_t zero_scan_data = {  /* every field zero or NULL */
 681     0, 0, NULL, 0, 0, 0, 0,          /* pos_min .. cur_is_floating */
 682     {                                /* substrs[] */
 683         { NULL, 0, 0, 0, 0, 0 },     /* [0]: fixed */
 684         { NULL, 0, 0, 0, 0, 0 },     /* [1]: floating */
 685     },
 686     0, 0, NULL, NULL, NULL           /* flags .. start_class */
 687 };
 688
 689 /* study flags (SF_*), kept in the 'flags' members of the scan_data structs */
 690
 691 #define SF_BEFORE_SEOL          0x0001
 692 #define SF_BEFORE_MEOL          0x0002
 693 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL) /* mask of both */
 694
 695 #define SF_IS_INF               0x0040
 696 #define SF_HAS_PAR              0x0080
 697 #define SF_IN_PAR               0x0100
 698 #define SF_HAS_EVAL             0x0200
 699
 700
 701 /* SCF_DO_SUBSTR is the flag that tells the regexp analyzer to track the
 702  * longest substring in the pattern. When it is not set the optimiser keeps
 703  * track of position, but does not keep track of the actual strings seen.
 704  *
 705  * So for instance /foo/ will be parsed with SCF_DO_SUBSTR being true, but
 706  * /foo/i will not.
 707  *
 708  * Similarly, /foo.*(blah|erm|huh).*fnorble/ will have "foo" and "fnorble"
 709  * parsed with SCF_DO_SUBSTR on, but while processing the (...) it will be
 710  * turned off because of the alternation (BRANCH). */
 711 #define SCF_DO_SUBSTR           0x0400 /* SCF_* values continue on from the SF_* bits */
 712
 713 #define SCF_DO_STCLASS_AND      0x0800
 714 #define SCF_DO_STCLASS_OR       0x1000
 715 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR) /* mask of both */
 716 #define SCF_WHILEM_VISITED_POS  0x2000
 717
 718 #define SCF_TRIE_RESTUDY        0x4000 /* Need to do restudy in study_chunk()?
 719                                           Search for "restudy" in this file
 720                                           to find a detailed explanation.*/
 721 #define SCF_SEEN_ACCEPT         0x8000
 722 #define SCF_TRIE_DOING_RESTUDY 0x10000 /* Are we in restudy right now?
 723                                           Search for "restudy" in this file
 724                                           to find a detailed explanation. */
 725 #define SCF_IN_DEFINE          0x20000
 726
 727
 728
 729 #define UTF cBOOL(RExC_utf8)   /* boolean form of RExC_utf8 */
 730
 731 /* Charset tests on RExC_flags; the enum values are ordered so the '>=' tests work */
 732 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 733 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags)                    \
 734                                                      == REGEX_DEPENDS_CHARSET)
 735 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 736 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags)                \
 737                                                      >= REGEX_UNICODE_CHARSET)
 738 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags)                      \
 739                                             == REGEX_ASCII_RESTRICTED_CHARSET)
 740 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags)             \
 741                                             >= REGEX_ASCII_RESTRICTED_CHARSET)
 742 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags)                 \
 743                                         == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 744
 745 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)   /* is RXf_PMf_FOLD set in RExC_flags? */
 746
 747 /* ckDEAD() test of the WARN_NON_UNICODE category.  This serves programs that
 748  * want to be strictly Unicode compatible by dying if any attempt is made to
 749  * match a non-Unicode code point against a Unicode property.  */
 750 #define ALWAYS_WARN_SUPER  ckDEAD(packWARN(WARN_NON_UNICODE))
 751
 752 #define OOB_NAMEDCLASS          -1 /* NOTE(review): presumably "no named class" - confirm at use sites */
 753
 754 /* There is no code point that is out-of-bounds, so this is problematic.  But
 755  * its only current use is to initialize a variable that is always set before
 756  * it is looked at. */
 757 #define OOB_UNICODE             0xDEADBEEF
 758
 759 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv)) /* sv_len_utf8() under UTF, else SvCUR() */
 760
 761
 762 /* max length of regex to show in messages that don't mark a position within it */
 763 #define RegexLengthToShowInErrorMessages 127
 764
 765 /*
 766  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 767  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 768  * op/pragma/warn/regcomp.
 769  */
 770 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 771 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 772 /* Message tail: its two UTF8f conversions print the regex before and after MARKER2 */
 773 #define REPORT_LOCATION " in regex; marked by " MARKER1    \
 774                         " in m/%" UTF8f MARKER2 "%" UTF8f "/"
 775
 776 /* The code in this file in places uses one level of recursion with parsing
 777  * rebased to an alternate string constructed by us in memory.  This can take
 778  * the form of something that is completely different from the input, or
 779  * something that uses the input as part of the alternate.  In the first case,
 780  * there should be no possibility of an error, as we are in complete control of
 781  * the alternate string.  But in the second case we don't completely control
 782  * the input portion, so there may be errors in that.  Here's an example:
 783  *      /[abc\x{DF}def]/ui
 784  * is handled specially because \x{df} folds to a sequence of more than one
 785  * character: 'ss'.  What is done is to create and parse an alternate string,
 786  * which looks like this:
 787  *      /(?:\x{DF}|[abc\x{DF}def])/ui
 788  * where it uses the input unchanged in the middle of something it constructs,
 789  * which is a branch for the DF outside the character class, and clustering
 790  * parens around the whole thing. (It knows enough to skip the DF inside the
 791  * class while in this substitute parse.) 'abc' and 'def' may have errors that
 792  * need to be reported.  The general situation looks like this:
 793  *
 794  *                                       |<------- identical ------>|
 795  *              sI                       tI               xI       eI
 796  * Input:       ---------------------------------------------------------------
 797  * Constructed:         ---------------------------------------------------
 798  *                      sC               tC               xC       eC     EC
 799  *                                       |<------- identical ------>|
 800  *
 801  * sI..eI   is the portion of the input pattern we are concerned with here.
 802  * sC..EC   is the constructed substitute parse string.
 803  *  sC..tC  is constructed by us
 804  *  tC..eC  is an exact duplicate of the portion of the input pattern tI..eI.
 805  *          In the diagram, these are vertically aligned.
 806  *  eC..EC  is also constructed by us.
 807  * xC       is the position in the substitute parse string where we found a
 808  *          problem.
 809  * xI       is the position in the original pattern corresponding to xC.
 810  *
 811  * We want to display a message showing the real input string.  Thus we need to
 812  * translate from xC to xI.  We know that xC >= tC, since the portion of the
 813  * string sC..tC has been constructed by us, and so shouldn't have errors.  We
 814  * get:
 815  *      xI = tI + (xC - tC)
 816  *
 817  * When the substitute parse is constructed, the code needs to set:
 818  *      RExC_start (sC)
 819  *      RExC_end (eC)
 820  *      RExC_copy_start_in_input  (tI)
 821  *      RExC_copy_start_in_constructed (tC)
 822  * and restore them when done.
 823  *
 824  * During normal processing of the input pattern, both
 825  * 'RExC_copy_start_in_input' and 'RExC_copy_start_in_constructed' are set to
 826  * sI, so that xC equals xI.
 827  */
 828
 829 #define sI              RExC_precomp      /* start of the input pattern */
 830 #define eI              RExC_precomp_end  /* end of the input pattern */
 831 #define sC              RExC_start        /* start of the string being parsed */
 832 #define eC              RExC_end          /* end of the string being parsed */
 833 #define tI              RExC_copy_start_in_input       /* start of the copied part, in the input */
 834 #define tC              RExC_copy_start_in_constructed /* start of the copy, in the parsed string */
 835 #define xI(xC)          (tI + ((xC) - tC)) /* input position matching xC; arg parenthesized for safety */
 836 #define xI_offset(xC)   (xI(xC) - sI)      /* that position as an offset from sI */
 837
 838 #define REPORT_LOCATION_ARGS(xC) /* the 2 UTF8f args of REPORT_LOCATION */ \
 839     UTF8fARG(UTF,                                                           \
 840              (xI(xC) > eI) /* Don't run off end */                          \
 841               ? eI - sI   /* Length before the <--HERE */                   \
 842               : ((xI_offset(xC) >= 0)                                       \
 843                  ? xI_offset(xC)                                            \
 844                  : (Perl_croak(aTHX_ "panic: %s: %d: negative offset: %"    \
 845                                     IVdf " trying to output message for "   \
 846                                     " pattern %.*s",                        \
 847                                     __FILE__, __LINE__, (IV) xI_offset(xC), \
 848                                     ((int) (eC - sC)), sC), 0)),            \
 849              sI),         /* The input pattern printed up to the <--HERE */ \
 850     UTF8fARG(UTF,                                                           \
 851              (xI(xC) > eI) ? 0 : eI - xI(xC), /* Length after <--HERE */    \
 852              (xI(xC) > eI) ? eI : xI(xC))     /* pattern after <--HERE */
 853
 854 /* Number of bytes to skip to point after bad bytes for an error message: 0 at
 855  * a nul byte (to avoid skipping past it), else UTF8_SAFE_SKIP under UTF, else 1. */
 856 #define SKIP_IF_CHAR(s, e) (!*(s) ? 0 : UTF ? UTF8_SAFE_SKIP(s, e) : 1)
 857
 858 /* Before dying, pass each non-NULL compile-state allocation to SAVEFREESV/SAVEFREEPV */
 859 #define PREPARE_TO_DIE                                                      \
 860     STMT_START {                                                            \
 861         if (RExC_rx_sv)                                                     \
 862             SAVEFREESV(RExC_rx_sv);                                         \
 863         if (RExC_open_parens)                                               \
 864             SAVEFREEPV(RExC_open_parens);                                   \
 865         if (RExC_close_parens)                                              \
 866             SAVEFREEPV(RExC_close_parens);                                  \
 867         if (RExC_logical_to_parno)                                          \
 868             SAVEFREEPV(RExC_logical_to_parno);                              \
 869         if (RExC_parno_to_logical)                                          \
 870             SAVEFREEPV(RExC_parno_to_logical);                              \
 871     } STMT_END
 872
 873 /*
 874  * Runs PREPARE_TO_DIE, then executes 'code' (the FAIL macros pass a Perl_croak
 875  * call).  'code' may use the locals 'len' and 'ellipses' set up here to show
 876  * the regex up to a maximum length; if it's too long, it is chopped and "..." added.
 877  */
 878 #define _FAIL(code) STMT_START {                                        \
 879     const char *ellipses = "";                                          \
 880     IV len = RExC_precomp_end - RExC_precomp;                           \
 881                                                                         \
 882     PREPARE_TO_DIE;                                                     \
 883     if (len > RegexLengthToShowInErrorMessages) {                       \
 884         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 885         len = RegexLengthToShowInErrorMessages - 10;                    \
 886         ellipses = "...";                                               \
 887     }                                                                   \
 888     code;                                                               \
 889 } STMT_END
 890 /* Croak via _FAIL, showing the (possibly chopped) regex; FAIL passes msg as a "%s" argument */
 891 #define FAIL(msg) _FAIL(                            \
 892     Perl_croak(aTHX_ "%s in regex m/%" UTF8f "%s/",         \
 893             msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 894 /* FAIL2/FAIL3: msg is pasted into the format, so it must be a string literal */
 895 #define FAIL2(msg,arg) _FAIL(                       \
 896     Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/",       \
 897             arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 898
 899 #define FAIL3(msg,arg1,arg2) _FAIL(                         \
 900     Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/",       \
 901      arg1, arg2, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 902
 903 /*
 904  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 905  * (RExC_parse) and does not run PREPARE_TO_DIE.  m is a "%s" argument. */
 906 #define Simple_vFAIL(m) STMT_START {                                    \
 907     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 908             m, REPORT_LOCATION_ARGS(RExC_parse));                       \
 909 } STMT_END
 910
 911 /*
 912  * Runs PREPARE_TO_DIE, then Simple_vFAIL()
 913  */
 914 #define vFAIL(m) STMT_START {                           \
 915     PREPARE_TO_DIE;                                     \
 916     Simple_vFAIL(m);                                    \
 917 } STMT_END
 918
 919 /*
 920  * Like Simple_vFAIL(), but accepts two arguments: m, a string literal that is
 921  * pasted into the format passed to S_re_croak(), and a1 for its conversion. */
 922 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 923     S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1,                \
 924                       REPORT_LOCATION_ARGS(RExC_parse));        \
 925 } STMT_END
 926
 927 /*
 928  * Runs PREPARE_TO_DIE, then Simple_vFAIL2().
 929  */
 930 #define vFAIL2(m,a1) STMT_START {                       \
 931     PREPARE_TO_DIE;                                     \
 932     Simple_vFAIL2(m, a1);                               \
 933 } STMT_END
 934
 935
 936 /*
 937  * Like Simple_vFAIL(), but accepts three arguments: the format literal m, and
 938  * a1 and a2 for its conversions; it croaks via S_re_croak(). */
 939 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 940     S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2,            \
 941             REPORT_LOCATION_ARGS(RExC_parse));                  \
 942 } STMT_END
 943
 944 /*
 945  * Runs PREPARE_TO_DIE, then Simple_vFAIL3().
 946  */
 947 #define vFAIL3(m,a1,a2) STMT_START {                    \
 948     PREPARE_TO_DIE;                                     \
 949     Simple_vFAIL3(m, a1, a2);                           \
 950 } STMT_END
 951
 952 /*
 953  * Like Simple_vFAIL(), but accepts four arguments: the format literal m, and
 954  * a1, a2 and a3 for its conversions; it croaks via S_re_croak(). */
 955 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 956     S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2, a3,        \
 957             REPORT_LOCATION_ARGS(RExC_parse));                  \
 958 } STMT_END
 959 /* Runs PREPARE_TO_DIE, then Simple_vFAIL4(). */
 960 #define vFAIL4(m,a1,a2,a3) STMT_START {                 \
 961     PREPARE_TO_DIE;                                     \
 962     Simple_vFAIL4(m, a1, a2, a3);                       \
 963 } STMT_END
 964
 965 /* A specialized version of vFAIL2 that works with UTF8f: runs PREPARE_TO_DIE, then S_re_croak() */
 966 #define vFAIL2utf8f(m, a1) STMT_START {             \
 967     PREPARE_TO_DIE;                                 \
 968     S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1,  \
 969             REPORT_LOCATION_ARGS(RExC_parse));      \
 970 } STMT_END
 971 /* The same, with two arguments (a1, a2) for the conversions in m */
 972 #define vFAIL3utf8f(m, a1, a2) STMT_START {             \
 973     PREPARE_TO_DIE;                                     \
 974     S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2,  \
 975             REPORT_LOCATION_ARGS(RExC_parse));          \
 976 } STMT_END
 977
 978 /* Setting RExC_copy_start_in_constructed to NULL (old value saved) is a signal to not output warnings */
 979 #define TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE                               \
 980     STMT_START {                                                            \
 981       RExC_save_copy_start_in_constructed  = RExC_copy_start_in_constructed;\
 982       RExC_copy_start_in_constructed = NULL;                                \
 983     } STMT_END
 984 #define RESTORE_WARNINGS   /* puts the saved value back */                  \
 985     RExC_copy_start_in_constructed = RExC_save_copy_start_in_constructed
 986
 987 /* Since a warning can be generated multiple times as the input is reparsed, we
 988  * output it only when 'loc' maps to an input offset beyond the latest one
 989  * already warned about.  'RExC_copy_start_in_constructed' being NULL is a flag
 990  * to not generate any warnings */
 991 #define TO_OUTPUT_WARNINGS(loc)                                         \
 992   (   RExC_copy_start_in_constructed                                    \
 993    && ((xI(loc)) - RExC_precomp) > (Ptrdiff_t) RExC_latest_warn_offset)
 994
 995 /* After we've emitted a warning, we save the position in the input (clamped
 996  * to the range sI..eI) so we don't output it again */
 997 #define UPDATE_WARNINGS_LOC(loc)                                        \
 998     STMT_START {                                                        \
 999         if (TO_OUTPUT_WARNINGS(loc)) {                                  \
1000             RExC_latest_warn_offset = MAX(sI, MIN(eI, xI(loc)))         \
1001                                                        - RExC_precomp;  \
1002         }                                                               \
1003     } STMT_END
1004 /* Panics if warnings are turned off; else, if TO_OUTPUT_WARNINGS(loc), runs 'code' and records loc. */
1005 /* 'warns' is the output of the packWARNx macro used in 'code'; if it is ckDEAD, PREPARE_TO_DIE runs first */
1006 #define _WARN_HELPER(loc, warns, code)                                  \
1007     STMT_START {                                                        \
1008         if (! RExC_copy_start_in_constructed) {                         \
1009             Perl_croak( aTHX_ "panic! %s: %d: Tried to warn when none"  \
1010                               " expected at '%s'",                      \
1011                               __FILE__, __LINE__, loc);                 \
1012         }                                                               \
1013         if (TO_OUTPUT_WARNINGS(loc)) {                                  \
1014             if (ckDEAD(warns))                                          \
1015                 PREPARE_TO_DIE;                                         \
1016             code;                                                       \
1017             UPDATE_WARNINGS_LOC(loc);                                   \
1018         }                                                               \
1019     } STMT_END
1020
1021 /* m is not necessarily a "literal string" in this macro: it is passed as a "%s" argument */
1022 #define warn_non_literal_string(loc, packed_warn, m)                    \
1023     _WARN_HELPER(loc, packed_warn,                                      \
1024                       Perl_warner(aTHX_ packed_warn,                    \
1025                                        "%s" REPORT_LOCATION,            \
1026                                   m, REPORT_LOCATION_ARGS(loc)))
1027 #define reg_warn_non_literal_string(loc, m)   /* WARN_REGEXP category */ \
1028                 warn_non_literal_string(loc, packWARN(WARN_REGEXP), m)
1029 /* Perl_ck_warner() with one argument a1; m need not be a literal: the format m + REPORT_LOCATION is built at run time */
1030 #define ckWARN2_non_literal_string(loc, packwarn, m, a1)                    \
1031     STMT_START {                                                            \
1032                 char * format;                                              \
1033                 Size_t format_size = strlen(m) + strlen(REPORT_LOCATION)+ 1;\
1034                 Newx(format, format_size, char);                            \
1035                 my_strlcpy(format, m, format_size);                         \
1036                 my_strlcat(format, REPORT_LOCATION, format_size);           \
1037                 SAVEFREEPV(format);                                         \
1038                 _WARN_HELPER(loc, packwarn,                                 \
1039                       Perl_ck_warner(aTHX_ packwarn,                        \
1040                                         format,                             \
1041                                         a1, REPORT_LOCATION_ARGS(loc)));    \
1042     } STMT_END
1043 /* Perl_ck_warner() in the WARN_REGEXP category; m is a string literal taking no arguments */
1044 #define ckWARNreg(loc,m)                                                \
1045     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
1046                       Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),       \
1047                                           m REPORT_LOCATION,            \
1048                                           REPORT_LOCATION_ARGS(loc)))
1049 /* Perl_warner() in the WARN_REGEXP category; m is a string literal taking no arguments */
1050 #define vWARN(loc, m)                                                   \
1051     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
1052                       Perl_warner(aTHX_ packWARN(WARN_REGEXP),          \
1053                                        m REPORT_LOCATION,               \
1054                                        REPORT_LOCATION_ARGS(loc)))
1055 /* Perl_warner() in the WARN_DEPRECATED category; m is a string literal taking no arguments */
1056 #define vWARN_dep(loc, m)                                               \
1057     _WARN_HELPER(loc, packWARN(WARN_DEPRECATED),                        \
1058                       Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),      \
1059                                        m REPORT_LOCATION,               \
1060                                        REPORT_LOCATION_ARGS(loc)))
1061 /* Perl_ck_warner_d() in the WARN_DEPRECATED category; m is a string literal taking no arguments */
1062 #define ckWARNdep(loc,m)                                                \
1063     _WARN_HELPER(loc, packWARN(WARN_DEPRECATED),                        \
1064                       Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
1065                                             m REPORT_LOCATION,          \
1066                                             REPORT_LOCATION_ARGS(loc)))
1067 /* Perl_ck_warner_d() in the combined WARN_DEPRECATED + WARN_REGEXP categories; m takes no arguments */
1068 #define ckWARNregdep(loc,m)                                                 \
1069     _WARN_HELPER(loc, packWARN2(WARN_DEPRECATED, WARN_REGEXP),              \
1070                       Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED,     \
1071                                                       WARN_REGEXP),         \
1072                                              m REPORT_LOCATION,             \
1073                                              REPORT_LOCATION_ARGS(loc)))
1074 /* Perl_ck_warner_d() in the WARN_REGEXP category; the literal format m takes one argument, a1 */
1075 #define ckWARN2reg_d(loc,m, a1)                                             \
1076     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
1077                       Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP),         \
1078                                             m REPORT_LOCATION,              \
1079                                             a1, REPORT_LOCATION_ARGS(loc)))
1080 /* Perl_ck_warner() in the WARN_REGEXP category; the literal format m takes one argument, a1 */
1081 #define ckWARN2reg(loc, m, a1)                                              \
1082     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
1083                       Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),           \
1084                                           m REPORT_LOCATION,                \
1085                                           a1, REPORT_LOCATION_ARGS(loc)))
1086 /* Perl_warner() in the WARN_REGEXP category; the literal format m takes two arguments, a1 and a2 */
1087 #define vWARN3(loc, m, a1, a2)                                              \
1088     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
1089                       Perl_warner(aTHX_ packWARN(WARN_REGEXP),              \
1090                                        m REPORT_LOCATION,                   \
1091                                        a1, a2, REPORT_LOCATION_ARGS(loc)))
1092 /* Perl_ck_warner() in the WARN_REGEXP category; the literal format m takes two arguments, a1 and a2 */
1093 #define ckWARN3reg(loc, m, a1, a2)                                          \
1094     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
1095                       Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),           \
1096                                           m REPORT_LOCATION,                \
1097                                           a1, a2,                           \
1098                                           REPORT_LOCATION_ARGS(loc)))
1099 /* Perl_warner() in the WARN_REGEXP category; the literal format m takes three arguments, a1..a3 */
1100 #define vWARN4(loc, m, a1, a2, a3)                                      \
1101     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
1102                       Perl_warner(aTHX_ packWARN(WARN_REGEXP),          \
1103                                        m REPORT_LOCATION,               \
1104                                        a1, a2, a3,                      \
1105                                        REPORT_LOCATION_ARGS(loc)))
1106 /* Perl_ck_warner() in the WARN_REGEXP category; the literal format m takes three arguments, a1..a3 */
1107 #define ckWARN4reg(loc, m, a1, a2, a3)                                  \
1108     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
1109                       Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),       \
1110                                           m REPORT_LOCATION,            \
1111                                           a1, a2, a3,                   \
1112                                           REPORT_LOCATION_ARGS(loc)))
1113 /* Perl_warner() in the WARN_REGEXP category; the literal format m takes four arguments, a1..a4 */
1114 #define vWARN5(loc, m, a1, a2, a3, a4)                                  \
1115     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
1116                       Perl_warner(aTHX_ packWARN(WARN_REGEXP),          \
1117                                        m REPORT_LOCATION,               \
1118                                        a1, a2, a3, a4,                  \
1119                                        REPORT_LOCATION_ARGS(loc)))
1120 /* Perl_ck_warner_d() in category 'class', tried only once per compilation (tracked by RExC_warned_<class>) */
1121 #define ckWARNexperimental(loc, class, m)                               \
1122     STMT_START {                                                        \
1123         if (! RExC_warned_ ## class) { /* warn once per compilation */  \
1124             RExC_warned_ ## class = 1;                                  \
1125             _WARN_HELPER(loc, packWARN(class),                          \
1126                       Perl_ck_warner_d(aTHX_ packWARN(class),           \
1127                                             m REPORT_LOCATION,          \
1128                                             REPORT_LOCATION_ARGS(loc)));\
1129         }                                                               \
1130     } STMT_END
1131 /* Like ckWARNexperimental, but the literal format m takes one argument, arg */
1132 #define ckWARNexperimental_with_arg(loc, class, m, arg)                 \
1133     STMT_START {                                                        \
1134         if (! RExC_warned_ ## class) { /* warn once per compilation */  \
1135             RExC_warned_ ## class = 1;                                  \
1136             _WARN_HELPER(loc, packWARN(class),                          \
1137                       Perl_ck_warner_d(aTHX_ packWARN(class),           \
1138                                        m REPORT_LOCATION,               \
1139                                        arg, REPORT_LOCATION_ARGS(loc)));\
1140         }                                                               \
1141     } STMT_END
1142
1143 /* Convert between a pointer to a node and its offset from RExC_emit_start, the
1144  * beginning of the program.  REGNODE_OFFSET asserts the node is not before it */
1145 #define REGNODE_p(offset)    (RExC_emit_start + (offset))
1146 #define REGNODE_OFFSET(node) (__ASSERT_((node) >= RExC_emit_start)      \
1147                               (SSize_t) ((node) - RExC_emit_start))
1148 /* Get and set the 'proglen' member of ri.  Note the expansions are not parenthesized */
1149 #define ProgLen(ri) ri->proglen
1150 #define SetProgLen(ri,x) ri->proglen = x
1151 /* EXPERIMENTAL_INPLACESCAN is defined only when the experimental optimisations are enabled */
1152 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
1153 #define EXPERIMENTAL_INPLACESCAN
1154 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
1155 /* Under DEBUG_OPTIMISE_MORE_r, print the name of each REG_*_SEEN bit set in RExC_seen.  NOTE(review): the expansion already ends in ';' */
1156 #define DEBUG_RExC_seen()                                                   \
1157         DEBUG_OPTIMISE_MORE_r({                                             \
1158             Perl_re_printf( aTHX_ "RExC_seen: ");                           \
1159                                                                             \
1160             if (RExC_seen & REG_ZERO_LEN_SEEN)                              \
1161                 Perl_re_printf( aTHX_ "REG_ZERO_LEN_SEEN ");                \
1162                                                                             \
1163             if (RExC_seen & REG_LOOKBEHIND_SEEN)                            \
1164                 Perl_re_printf( aTHX_ "REG_LOOKBEHIND_SEEN ");              \
1165                                                                             \
1166             if (RExC_seen & REG_GPOS_SEEN)                                  \
1167                 Perl_re_printf( aTHX_ "REG_GPOS_SEEN ");                    \
1168                                                                             \
1169             if (RExC_seen & REG_RECURSE_SEEN)                               \
1170                 Perl_re_printf( aTHX_ "REG_RECURSE_SEEN ");                 \
1171                                                                             \
1172             if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)                    \
1173                 Perl_re_printf( aTHX_ "REG_TOP_LEVEL_BRANCHES_SEEN ");      \
1174                                                                             \
1175             if (RExC_seen & REG_VERBARG_SEEN)                               \
1176                 Perl_re_printf( aTHX_ "REG_VERBARG_SEEN ");                 \
1177                                                                             \
1178             if (RExC_seen & REG_CUTGROUP_SEEN)                              \
1179                 Perl_re_printf( aTHX_ "REG_CUTGROUP_SEEN ");                \
1180                                                                             \
1181             if (RExC_seen & REG_RUN_ON_COMMENT_SEEN)                        \
1182                 Perl_re_printf( aTHX_ "REG_RUN_ON_COMMENT_SEEN ");          \
1183                                                                             \
1184             if (RExC_seen & REG_UNFOLDED_MULTI_SEEN)                        \
1185                 Perl_re_printf( aTHX_ "REG_UNFOLDED_MULTI_SEEN ");          \
1186                                                                             \
1187             if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)                  \
1188                 Perl_re_printf( aTHX_ "REG_UNBOUNDED_QUANTIFIER_SEEN ");    \
1189                                                                             \
1190             Perl_re_printf( aTHX_ "\n");                                    \
1191         });
1192
/* If any bit of 'flag' is set in 'flags', print the spelling of 'flag' (as
 * written at the call site) followed by a space, via Perl_re_printf.
 *
 * The test is written as "if (!cond) { } else ..." so that the macro already
 * owns its 'else': a caller's "if (x) DEBUG_SHOW_STUDY_FLAG(...); else ..."
 * keeps its 'else' attached to the caller's 'if'.  'flag' is parenthesized
 * so that an argument such as "A | B" is masked as a whole.  As before, the
 * invocation must be followed by ';'. */
#define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
  if (!((flags) & (flag))) { } else Perl_re_printf( aTHX_  "%s ", #flag)
1195
1196
/* Tracing hooks.  Without DEBUGGING both expand to NOOP; with DEBUGGING they
 * forward their arguments to debug_studydata() and debug_peep().
 * DEBUG_PEEP additionally passes pRExC_state, which must therefore be
 * visible wherever the macro is expanded. */
#ifndef DEBUGGING
#  define DEBUG_STUDYDATA(where, data, depth, is_inf, min, stopmin, delta) NOOP
#  define DEBUG_PEEP(str, scan, depth, flags)         NOOP
#else
#  define DEBUG_STUDYDATA(where, data, depth, is_inf, min, stopmin, delta) \
                    debug_studydata(where, data, depth, is_inf, min, stopmin, delta)

#  define DEBUG_PEEP(str, scan, depth, flags)   \
                    debug_peep(str, pRExC_state, scan, depth, flags)
#endif
1207
/* Call regtail() with a fourth argument of depth+1, where 'depth' is taken
 * from the scope in which the macro is expanded.  REGTAIL_STUDY does the
 * same, except that under DEBUGGING it calls regtail_study() instead. */
#define REGTAIL(x,y,z) regtail((x), (y), (z), depth + 1)
#ifndef DEBUGGING
#  define REGTAIL_STUDY(x,y,z) regtail((x), (y), (z), depth + 1)
#else
#  define REGTAIL_STUDY(x,y,z) regtail_study((x), (y), (z), depth + 1)
#endif
1214
/* Three distinct single-bit values, so they can be OR-ed together and tested
 * individually. */
#define MADE_TRIE       0x1
#define MADE_JUMP_TRIE  0x2
#define MADE_EXACT_TRIE 0x4
1218
/* Consecutive zero-based slot numbers. */
#define INVLIST_INDEX                 0
#define ONLY_LOCALE_MATCHES_INDEX     1
#define DEFERRED_USER_DEFINED_INDEX   2
1222
/* ssc_init_zero is currently just another name for ssc_init. */
#define ssc_init_zero           ssc_init

/* Add the single code point 'cp': a range whose start and end are both
 * 'cp'.  'cp' is evaluated twice. */
#define ssc_add_cp(ssc, cp)   ssc_add_range((ssc), (cp), (cp))

/* Add the full range 0 .. UV_MAX. */
#define ssc_match_all_cp(ssc) ssc_add_range((ssc), 0, UV_MAX)
1228
/* Dispatch to regnode_guts_debug() under DEBUGGING and to regnode_guts()
 * otherwise.  The non-debugging variant does not receive 'op'; that argument
 * is dropped from the expansion and never evaluated. */
#ifndef DEBUGGING
#  define REGNODE_GUTS(state,op,extra_size) \
    regnode_guts(state,extra_size)
#else
#  define REGNODE_GUTS(state,op,extra_size) \
    regnode_guts_debug(state,op,extra_size)
#endif
1236
/* If 'optstart' is non-NULL: hand DEBUG_OPTIMISE_r a Perl_re_printf call that
 * reports (node - optstart) as a node count, then reset 'optstart' to NULL.
 * Both 'optstart' and 'node' are taken from the scope where the macro is
 * expanded.
 *
 * The 'if' lives inside the STMT_START/STMT_END wrapper so that the whole
 * expansion is a single statement; with the 'if' on the outside, a caller's
 * "if (x) CLEAR_OPTSTART; else ..." would have its 'else' captured by the
 * macro's own 'if'. */
#define CLEAR_OPTSTART                                                          \
    STMT_START {                                                                \
        if (optstart) {                                                         \
            DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_                              \
                              " (%" IVdf " nodes)\n", (IV)(node - optstart)));  \
            optstart=NULL;                                                      \
        }                                                                       \
    } STMT_END
1243
/* Run CLEAR_OPTSTART, then call dumpuntil() on the node range (b), (e) with
 * indent+1 and depth+1, storing the result in 'node'.  The names r, start,
 * last, sv, indent, depth and node are all taken from the scope where the
 * macro is expanded.
 *
 * The two steps are wrapped in STMT_START/STMT_END so the expansion is one
 * statement: previously "if (x) DUMPUNTIL(b, e);" guarded only
 * CLEAR_OPTSTART and always executed the dumpuntil() assignment.  The
 * invocation must be followed by ';'. */
#define DUMPUNTIL(b,e)                                              \
    STMT_START {                                                    \
        CLEAR_OPTSTART;                                             \
        node = dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1); \
    } STMT_END
1247
1248
1249 #endif /* REGCOMP_INTERNAL_H */