maint policy: Separate build/installation issues from test failures

[perl5.git] / regcomp.c
diff --git a/regcomp.c b/regcomp.c

index 51c778d..d1fa74d 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -87,7 +87,6 @@ EXTERN_C const struct regexp_engine my_reg_engine;
  #endif
  
  #include "dquote_static.c"
-#include "charclass_invlists.h"
  #include "inline_invlist.c"
  #include "unicode_constants.h"
  
@@ -5638,7 +5637,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n",
          }
          return final_minlen;
      }
-    NOT_REACHED;
+    NOT_REACHED; /* NOTREACHED */
  }
  
  STATIC U32
@@ -6413,7 +6412,6 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
  
      DEBUG_r(if (!PL_colorset) reginitcolors());
  
-#ifndef PERL_IN_XSUB_RE
      /* Initialize these here instead of as-needed, as is quick and avoids
       * having to test them each time otherwise */
      if (! PL_AboveLatin1) {
@@ -6431,7 +6429,6 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
         PL_InBitmap = _add_range_to_invlist(PL_InBitmap, 0,
                                                      NUM_ANYOF_CODE_POINTS - 1);
      }
-#endif
  
      pRExC_state->code_blocks = NULL;
      pRExC_state->num_code_blocks = 0;
@@ -7894,7 +7891,7 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
              Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
                        (unsigned long) flags);
          }
-        NOT_REACHED; /* NOT REACHED */
+        NOT_REACHED; /* NOTREACHED */
      }
      return NULL;
  }
@@ -10098,14 +10095,14 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                  if (RExC_parse == RExC_end || *RExC_parse != ')')
                      vFAIL("Sequence (?&... not terminated");
                  goto gen_recurse_regop;
-                /* NOT REACHED */
+                /* NOTREACHED */
              case '+':
                  if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
                      RExC_parse++;
                      vFAIL("Illegal pattern");
                  }
                  goto parse_recursion;
-                /* NOT REACHED*/
+                /* NOTREACHED*/
              case '-': /* (?-1) */
                  if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
                      RExC_parse--; /* rewind to let it be handled later */
@@ -10178,7 +10175,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                  nextchar(pRExC_state);
                  return ret;
  
-            /* NOT REACHED */
+            /* NOTREACHED */
  
             case '?':           /* (??...) */
                 is_logical = 1;
@@ -10450,6 +10447,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
              Set_Node_Offset(ret, RExC_parse); /* MJD */
             is_open = 1;
         } else {
+            /* with RXf_PMf_NOCAPTURE treat (...) as (?:...) */
+            paren = ':';
             ret = NULL;
         }
      }
@@ -11772,27 +11771,102 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
              invert = 1;
              /* FALLTHROUGH */
         case 'b':
+          {
+           regex_charset charset = get_regex_charset(RExC_flags);
+
             RExC_seen_zerolen++;
              RExC_seen |= REG_LOOKBEHIND_SEEN;
-           op = BOUND + get_regex_charset(RExC_flags);
-            if (op > BOUNDA) {  /* /aa is same as /a */
-                op = BOUNDA;
-            }
-            else if (op == BOUNDL) {
-                RExC_contains_locale = 1;
-            }
+           op = BOUND + charset;
  
-            if (invert) {
-                op += NBOUND - BOUND;
+            if (op == BOUNDL) {
+                RExC_contains_locale = 1;
              }
  
             ret = reg_node(pRExC_state, op);
             *flagp |= SIMPLE;
-           if ((U8) *(RExC_parse + 1) == '{') {
-                /* diag_listed_as: Use "%s" instead of "%s" */
-               vFAIL3("Use \"\\%c\\{\" instead of \"\\%c{\"", *RExC_parse, *RExC_parse);
+           if (*(RExC_parse + 1) != '{') {
+                FLAGS(ret) = TRADITIONAL_BOUND;
+                if (PASS2 && op > BOUNDA) {  /* /aa is same as /a */
+                    OP(ret) = BOUNDA;
+                }
+            }
+            else {
+                STRLEN length;
+                char name = *RExC_parse;
+                char * endbrace;
+                RExC_parse += 2;
+                endbrace = strchr(RExC_parse, '}');
+
+                if (! endbrace) {
+                    vFAIL2("Missing right brace on \\%c{}", name);
+                }
+                /* XXX Need to decide whether to take spaces or not.  Should be
+                 * consistent with \p{}, but that currently is SPACE, which
+                 * means vertical too, which seems wrong
+                 * while (isBLANK(*RExC_parse)) {
+                    RExC_parse++;
+                }*/
+                if (endbrace == RExC_parse) {
+                    RExC_parse++;  /* After the '}' */
+                    vFAIL2("Empty \\%c{}", name);
+                }
+                length = endbrace - RExC_parse;
+                /*while (isBLANK(*(RExC_parse + length - 1))) {
+                    length--;
+                }*/
+                switch (*RExC_parse) {
+                    case 'g':
+                        if (length != 1
+                            && (length != 3 || strnNE(RExC_parse + 1, "cb", 2)))
+                        {
+                            goto bad_bound_type;
+                        }
+                        FLAGS(ret) = GCB_BOUND;
+                        break;
+                    case 's':
+                        if (length != 2 || *(RExC_parse + 1) != 'b') {
+                            goto bad_bound_type;
+                        }
+                        FLAGS(ret) = SB_BOUND;
+                        break;
+                    case 'w':
+                        if (length != 2 || *(RExC_parse + 1) != 'b') {
+                            goto bad_bound_type;
+                        }
+                        FLAGS(ret) = WB_BOUND;
+                        break;
+                    default:
+                      bad_bound_type:
+                        RExC_parse = endbrace;
+                       vFAIL2utf8f(
+                            "'%"UTF8f"' is an unknown bound type",
+                           UTF8fARG(UTF, length, endbrace - length));
+                        NOT_REACHED; /*NOTREACHED*/
+                }
+                RExC_parse = endbrace;
+                RExC_uni_semantics = 1;
+
+                if (PASS2 && op >= BOUNDA) {  /* /a and /aa both get /u rules */
+                    OP(ret) = BOUNDU;
+                    length += 4;
+
+                    /* Don't have to worry about UTF-8 in this message, because
+                     * to get here the contents of the \b{} must be ASCII */
+                    ckWARN4reg(RExC_parse + 1,  /* Include the '}' in msg */
+                              "Using /u for '%.*s' instead of /%s",
+                              (unsigned) length,
+                              endbrace - length + 1,
+                              (charset == REGEX_ASCII_RESTRICTED_CHARSET)
+                              ? ASCII_RESTRICT_PAT_MODS
+                              : ASCII_MORE_RESTRICT_PAT_MODS);
+                }
             }
+
+            if (PASS2 && invert) {
+                OP(ret) += NBOUND - BOUND;
+            }
             goto finish_meta_pat;
+          }
  
         case 'D':
              invert = 1;
@@ -12977,7 +13051,7 @@ S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
                             break;
                         case 'e':
                             if (memEQ(posixcc, "spac", 4)) /* space */
-                               namedclass = ANYOF_PSXSPC;
+                               namedclass = ANYOF_SPACE;
                             break;
                         case 'h':
                             if (memEQ(posixcc, "grap", 4)) /* graph */
@@ -14540,7 +14614,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                  vFAIL2utf8f(
                      "Invalid [] range \"%"UTF8f"\"",
                      UTF8fARG(UTF, w, rangebegin));
-                NOT_REACHED; /* NOT REACHED */
+                NOT_REACHED; /* NOTREACHED */
             }
         }
         else {
@@ -15398,6 +15472,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
      }
  
      if (ret_invlist) {
+        assert(cp_list);
+
          *ret_invlist = cp_list;
          SvREFCNT_dec(swash);
  
@@ -16405,8 +16481,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
      || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6                   \
      || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9            \
      || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12               \
-    || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15               \
-    || _CC_VERTSPACE != 16
+    || _CC_CNTRL != 13 || _CC_ASCII != 14 || _CC_VERTSPACE != 15
    #error Need to adjust order of anyofs[]
  #endif
          "\\w",
@@ -16435,8 +16510,6 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
          "[:^blank:]",
          "[:xdigit:]",
          "[:^xdigit:]",
-        "[:space:]",
-        "[:^space:]",
          "[:cntrl:]",
          "[:^cntrl:]",
          "[:ascii:]",
@@ -16735,6 +16808,16 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
              Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
          }
      }
+    else if (k == BOUND || k == NBOUND) {
+        /* Must be synced with order of 'bound_type' in regcomp.h */
+        const char * const bounds[] = {
+            "",      /* Traditional */
+            "{gcb}",
+            "{sb}",
+            "{wb}"
+        };
+        sv_catpv(sv, bounds[FLAGS(o)]);
+    }
      else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
      else if (OP(o) == SBOL)