Document when and why S_reg{,branch,piece,atom,class}() return NULL.
author Nicholas Clark <nick@ccl4.org>
Thu, 17 Jan 2013 10:47:13 +0000 (11:47 +0100)
committer Nicholas Clark <nick@ccl4.org>
Tue, 19 Mar 2013 10:53:19 +0000 (11:53 +0100)
As documented in pod/perlreguts.pod, the call graph for regex parsing
involves several levels of functions in regcomp.c, sometimes recursing more
than once.

The top level compiling function, S_reg(), calls S_regbranch() to parse each
single branch of an alternation. In turn, that calls S_regpiece() to parse
a simple pattern followed by quantifier, which calls S_regatom() to parse
that simple pattern. S_regatom() can call S_regclass() to handle classes,
but can also recurse into S_reg() to handle subpatterns and some other
constructions. Some other routines call S_reg(), sometimes using an
alternative pattern that they generate dynamically to represent their input.

These routines all return a pointer to a regnode structure, and take a
pointer to an integer that holds flags, which is also used to return
information.

Historically, it has not been clear when and why they return NULL, and
whether the return value can be ignored. In particular, "Jumbo regexp patch"
(commit c277df42229d99fe, from Nov 1997), added code with two calls from
S_reg() to S_regbranch(), one of which checks the return value and generates
a LONGJMP node if it returns NULL, the other of which is called in void
context, and so ignores both the return value and the possibility that it is
NULL.

After some analysis I have untangled the possible return values from these
5 functions (and related functions which call S_reg()).

Starting from the top:
S_reg() will return NULL and set the flags to TRYAGAIN at the end of pragma-
like constructions that it handles. Otherwise, historically it would return
NULL if S_regbranch() returned NULL. In turn, S_regbranch() would return
NULL if S_regpiece() returned NULL without setting TRYAGAIN. If S_regpiece()
returns TRYAGAIN, S_regbranch() loops, and ultimately will not return NULL.

S_regpiece() returns NULL with TRYAGAIN if S_regatom() returns NULL with
TRYAGAIN, but (historically) if S_regatom() returns NULL without setting
the flags to TRYAGAIN, S_regpiece() would too. Where S_regatom() calls
S_reg() it has similar behaviour when passing back return values, although
often it is able to loop instead on getting a TRYAGAIN.

Which gets us back to S_reg(), which can only *generate* NULL in conjunction
with TRYAGAIN. NULL without TRYAGAIN could only be returned if a routine it
called generated it. All other functions that these call that return regnode
structures cannot return NULL. Hence

1) in the loop of functions called, there is no source for a return value of
   NULL without the TRYAGAIN flag being set
2) a return value of NULL with TRYAGAIN set from an inner function does not
   propagate out past S_regbranch()

Hence the only return values that most functions can generate are non-NULL,
or NULL with TRYAGAIN set, and as S_regbranch() catches these, it cannot
return NULL. The longest sequence of functions that can return NULL (with
TRYAGAIN set) is S_reg() -> S_regatom() -> S_regpiece() -> S_regbranch().
Rapidly returning right round the loop back to S_reg() is not possible.

Hence the code added by commit c277df42229d99fe to handle a NULL return from
S_regbranch(), along with some other code, is dead.

I have replaced all unreachable code with FAIL()s that panic.

regcomp.c

index 160a337..0f0eff0 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -5769,8 +5769,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
        SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
     }
     if (reg(pRExC_state, 0, &flags,1) == NULL) {
-       RExC_precomp = NULL;
-       return(NULL);
+        Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#X", flags);
     }
     if (code_blocksv)
        SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
@@ -5945,7 +5944,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
     if (reg(pRExC_state, 0, &flags,1) == NULL) {
        ReREFCNT_dec(rx);   
-       return(NULL);
+        Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#X", flags);
     }
     /* XXXX To minimize changes to RE engine we always allocate
        3-units-long substrs field. */
@@ -8555,6 +8554,9 @@ S_parse_lparen_question_flags(pTHX_ struct RExC_state_t *pRExC_state)
 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
 #endif
 
+/* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
+   flags. Otherwise would only return NULL if regbranch() returns NULL, which
+   cannot happen.  */
 STATIC regnode *
 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
@@ -9091,9 +9093,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                  insert_if:
                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
                     br = regbranch(pRExC_state, &flags, 1,depth+1);
-                   if (br == NULL)
-                       br = reganode(pRExC_state, LONGJMP, 0);
-                   else
+                   if (br == NULL) {
+                        FAIL2("panic: regbranch returned NULL, flags=%#X",
+                              flags);
+                    } else
                         REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
                    c = *nextchar(pRExC_state);
                    if (flags&HASWIDTH)
@@ -9102,7 +9105,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                        if (is_define) 
                            vFAIL("(?(DEFINE)....) does not allow branches");
                        lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
-                        regbranch(pRExC_state, &flags, 1,depth+1);
+                        if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
+                            FAIL2("panic: regbranch returned NULL, flags=%#X",
+                                  flags);
+                        }
                         REGTAIL(pRExC_state, ret, lastbr);
                        if (flags&HASWIDTH)
                            *flagp |= HASWIDTH;
@@ -9184,8 +9190,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
 
     /*     branch_len = (paren != 0); */
 
-    if (br == NULL)
-       return(NULL);
+    if (br == NULL) {
+        FAIL2("panic: regbranch returned NULL, flags=%#X", flags);
+    }
     if (*RExC_parse == '|') {
        if (!SIZE_ONLY && RExC_extralen) {
            reginsert(pRExC_state, BRANCHJ, br, depth+1);
@@ -9224,8 +9231,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
         }
         br = regbranch(pRExC_state, &flags, 0, depth+1);
 
-       if (br == NULL)
-           return(NULL);
+       if (br == NULL) {
+            FAIL2("panic: regbranch returned NULL, flags=%#X", flags);
+        }
         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
        lastbr = br;
        *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
@@ -9382,6 +9390,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
  - regbranch - one alternative of an | operator
  *
  * Implements the concatenation operator.
+ *
+ * would only return NULL if regpiece() returns NULL, which cannot happen.
  */
 STATIC regnode *
 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
@@ -9421,7 +9431,7 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
        if (latest == NULL) {
            if (flags & TRYAGAIN)
                continue;
-           return(NULL);
+            FAIL2("panic: regpiece returned NULL, flags=%#X", flags);
        }
        else if (ret == NULL)
            ret = latest;
@@ -9455,6 +9465,9 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
  * both the endmarker for their branch list and the body of the last branch.
  * It might seem that this node could be dispensed with entirely, but the
  * endmarker role is not redundant.
+ *
+ * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
+ * TRYAGAIN.
  */
 STATIC regnode *
 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
@@ -9485,6 +9498,8 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
     if (ret == NULL) {
        if (flags & TRYAGAIN)
            *flagp |= TRYAGAIN;
+        else
+            FAIL2("panic: regatom returned NULL, flags=%#X", flags);
        return(NULL);
     }
 
@@ -9918,7 +9933,10 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I
        /* The values are Unicode, and therefore not subject to recoding */
        RExC_override_recoding = 1;
 
-       *node_p = reg(pRExC_state, 1, &flags, depth+1);
+       if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
+            FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#X",
+                  flags);
+        } 
        *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
 
        RExC_parse = endbrace;
@@ -10118,6 +10136,9 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32
    escape sequences, with the one for handling literal escapes requiring
    a dummy entry for all of the special escapes that are actually handled
    by the other.
+
+   Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
+   TRYAGAIN.  Otherwise does not return NULL.
 */
 
 STATIC regnode *
@@ -10185,6 +10206,10 @@ tryagain:
            RExC_parse = oregcomp_parse;
            vFAIL("Unmatched [");
        }
+        if (ret == NULL) {
+            FAIL2("panic: regclass returned NULL to regatom, flags=%#X",
+                  *flagp);
+        }
        nextchar(pRExC_state);
         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
        break;
@@ -10201,7 +10226,7 @@ tryagain:
                    }
                    goto tryagain;
                }
-               return(NULL);
+                FAIL2("panic: reg returned NULL to regatom, flags=%#X", flags);
        }
        *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
        break;
@@ -10393,6 +10418,9 @@ tryagain:
                                          It would be a bug if these returned
                                          non-portables */
                                NULL);
+                if (!ret)
+                    FAIL2("panic: regclass returned NULL to regatom, flags=%#X",
+                          *flagp);
 
                RExC_parse--;
 
@@ -11504,13 +11532,18 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f
                         RExC_parse++;
                     }
 
-                    (void) regclass(pRExC_state, flagp,depth+1,
-                                    is_posix_class, /* parse the whole char
-                                                       class only if not a
-                                                       posix class */
-                                    FALSE, /* don't allow multi-char folds */
-                                    TRUE, /* silence non-portable warnings. */
-                                    &current);
+                    /* regclass() can only return RESTART_UTF8 if multi-char
+                       folds are allowed.  */
+                    if (!regclass(pRExC_state, flagp,depth+1,
+                                  is_posix_class, /* parse the whole char
+                                                     class only if not a
+                                                     posix class */
+                                  FALSE, /* don't allow multi-char folds */
+                                  TRUE, /* silence non-portable warnings. */
+                                  &current))
+                        FAIL2("panic: regclass returned NULL to handle_sets, flags=%#X",
+                              *flagp);
+
                     /* function call leaves parse pointing to the ']', except
                      * if we faked it */
                     if (is_posix_class) {
@@ -11667,12 +11700,15 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f
                 vFAIL("Unexpected character");
 
             case '\\':
-                (void) regclass(pRExC_state, flagp,depth+1,
-                                TRUE, /* means parse just the next thing */
-                                FALSE, /* don't allow multi-char folds */
-                                FALSE, /* don't silence non-portable warnings.
-                                        */
-                                &current);
+                /* regclass() can only return RESTART_UTF8 if multi-char
+                   folds are allowed.  */
+                if (!regclass(pRExC_state, flagp,depth+1,
+                              TRUE, /* means parse just the next thing */
+                              FALSE, /* don't allow multi-char folds */
+                              FALSE, /* don't silence non-portable warnings.  */
+                              &current))
+                    FAIL2("panic: regclass returned NULL to handle_sets, flags=%#X",
+                          *flagp);
                 /* regclass() will return with parsing just the \ sequence,
                  * leaving the parse pointer at the next thing to parse */
                 RExC_parse--;
@@ -11686,13 +11722,16 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f
                     RExC_parse++;
                 }
 
-                (void) regclass(pRExC_state, flagp,depth+1,
-                                is_posix_class, /* parse the whole char class
-                                                   only if not a posix class */
-                                FALSE, /* don't allow multi-char folds */
-                                FALSE, /* don't silence non-portable warnings.
-                                        */
-                                &current);
+                /* regclass() can only return RESTART_UTF8 if multi-char
+                   folds are allowed.  */
+                if(!regclass(pRExC_state, flagp,depth+1,
+                             is_posix_class, /* parse the whole char class
+                                                only if not a posix class */
+                             FALSE, /* don't allow multi-char folds */
+                             FALSE, /* don't silence non-portable warnings.  */
+                             &current))
+                    FAIL2("panic: regclass returned NULL to handle_sets, flags=%#X",
+                          *flagp);
                 /* function call leaves parse pointing to the ']', except if we
                  * faked it */
                 if (is_posix_class) {
@@ -11879,6 +11918,8 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f
                              well have generated non-portable code points, but
                              they're valid on this machine */
                     NULL);
+    if (!node)
+        FAIL2("panic: regclass returned NULL to handle_sets, flags=%#X", flagp);
     if (save_fold) {
         RExC_flags |= RXf_PMf_FOLD;
     }
@@ -11928,7 +11969,10 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
      * corresponding bit set if that character is in the list.  For characters
      * above 255, a range list or swash is used.  There are extra bits for \w,
      * etc. in locale ANYOFs, as what these match is not determinable at
-     * compile time */
+     * compile time
+     *
+     * Never returns NULL.
+     */
 
     dVAR;
     UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;