This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
maint policy: Separate build/installation issues from test failures
[perl5.git] / regcomp.c
index 51c778d..d1fa74d 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -87,7 +87,6 @@ EXTERN_C const struct regexp_engine my_reg_engine;
 #endif
 
 #include "dquote_static.c"
-#include "charclass_invlists.h"
 #include "inline_invlist.c"
 #include "unicode_constants.h"
 
@@ -5638,7 +5637,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n",
         }
         return final_minlen;
     }
-    NOT_REACHED;
+    NOT_REACHED; /* NOTREACHED */
 }
 
 STATIC U32
@@ -6413,7 +6412,6 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
 
     DEBUG_r(if (!PL_colorset) reginitcolors());
 
-#ifndef PERL_IN_XSUB_RE
     /* Initialize these here instead of as-needed, as is quick and avoids
      * having to test them each time otherwise */
     if (! PL_AboveLatin1) {
@@ -6431,7 +6429,6 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
        PL_InBitmap = _add_range_to_invlist(PL_InBitmap, 0,
                                                     NUM_ANYOF_CODE_POINTS - 1);
     }
-#endif
 
     pRExC_state->code_blocks = NULL;
     pRExC_state->num_code_blocks = 0;
@@ -7894,7 +7891,7 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
             Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
                       (unsigned long) flags);
         }
-        NOT_REACHED; /* NOT REACHED */
+        NOT_REACHED; /* NOTREACHED */
     }
     return NULL;
 }
@@ -10098,14 +10095,14 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                 if (RExC_parse == RExC_end || *RExC_parse != ')')
                     vFAIL("Sequence (?&... not terminated");
                 goto gen_recurse_regop;
-                /* NOT REACHED */
+                /* NOTREACHED */
             case '+':
                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
                     RExC_parse++;
                     vFAIL("Illegal pattern");
                 }
                 goto parse_recursion;
-                /* NOT REACHED*/
+                /* NOTREACHED*/
             case '-': /* (?-1) */
                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
                     RExC_parse--; /* rewind to let it be handled later */
@@ -10178,7 +10175,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                 nextchar(pRExC_state);
                 return ret;
 
-            /* NOT REACHED */
+            /* NOTREACHED */
 
            case '?':           /* (??...) */
                is_logical = 1;
@@ -10450,6 +10447,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
             Set_Node_Offset(ret, RExC_parse); /* MJD */
            is_open = 1;
        } else {
+            /* with RXf_PMf_NOCAPTURE treat (...) as (?:...) */
+            paren = ':';
            ret = NULL;
        }
     }
@@ -11772,27 +11771,102 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
             invert = 1;
             /* FALLTHROUGH */
        case 'b':
+          {
+           regex_charset charset = get_regex_charset(RExC_flags);
+
            RExC_seen_zerolen++;
             RExC_seen |= REG_LOOKBEHIND_SEEN;
-           op = BOUND + get_regex_charset(RExC_flags);
-            if (op > BOUNDA) {  /* /aa is same as /a */
-                op = BOUNDA;
-            }
-            else if (op == BOUNDL) {
-                RExC_contains_locale = 1;
-            }
+           op = BOUND + charset;
 
-            if (invert) {
-                op += NBOUND - BOUND;
+            if (op == BOUNDL) {
+                RExC_contains_locale = 1;
             }
 
            ret = reg_node(pRExC_state, op);
            *flagp |= SIMPLE;
-           if ((U8) *(RExC_parse + 1) == '{') {
-                /* diag_listed_as: Use "%s" instead of "%s" */
-               vFAIL3("Use \"\\%c\\{\" instead of \"\\%c{\"", *RExC_parse, *RExC_parse);
+           if (*(RExC_parse + 1) != '{') {
+                FLAGS(ret) = TRADITIONAL_BOUND;
+                if (PASS2 && op > BOUNDA) {  /* /aa is same as /a */
+                    OP(ret) = BOUNDA;
+                }
+            }
+            else {
+                STRLEN length;
+                char name = *RExC_parse;
+                char * endbrace;
+                RExC_parse += 2;
+                endbrace = strchr(RExC_parse, '}');
+
+                if (! endbrace) {
+                    vFAIL2("Missing right brace on \\%c{}", name);
+                }
+                /* XXX Need to decide whether to take spaces or not.  Should be
+                 * consistent with \p{}, but that currently is SPACE, which
+                 * means vertical too, which seems wrong
+                 * while (isBLANK(*RExC_parse)) {
+                    RExC_parse++;
+                }*/
+                if (endbrace == RExC_parse) {
+                    RExC_parse++;  /* After the '}' */
+                    vFAIL2("Empty \\%c{}", name);
+                }
+                length = endbrace - RExC_parse;
+                /*while (isBLANK(*(RExC_parse + length - 1))) {
+                    length--;
+                }*/
+                switch (*RExC_parse) {
+                    case 'g':
+                        if (length != 1
+                            && (length != 3 || strnNE(RExC_parse + 1, "cb", 2)))
+                        {
+                            goto bad_bound_type;
+                        }
+                        FLAGS(ret) = GCB_BOUND;
+                        break;
+                    case 's':
+                        if (length != 2 || *(RExC_parse + 1) != 'b') {
+                            goto bad_bound_type;
+                        }
+                        FLAGS(ret) = SB_BOUND;
+                        break;
+                    case 'w':
+                        if (length != 2 || *(RExC_parse + 1) != 'b') {
+                            goto bad_bound_type;
+                        }
+                        FLAGS(ret) = WB_BOUND;
+                        break;
+                    default:
+                      bad_bound_type:
+                        RExC_parse = endbrace;
+                       vFAIL2utf8f(
+                            "'%"UTF8f"' is an unknown bound type",
+                           UTF8fARG(UTF, length, endbrace - length));
+                        NOT_REACHED; /*NOTREACHED*/
+                }
+                RExC_parse = endbrace;
+                RExC_uni_semantics = 1;
+
+                if (PASS2 && op >= BOUNDA) {  /* /aa is same as /a */
+                    OP(ret) = BOUNDU;
+                    length += 4;
+
+                    /* Don't have to worry about UTF-8, in this message because
+                     * to get here the contents of the \b must be ASCII */
+                    ckWARN4reg(RExC_parse + 1,  /* Include the '}' in msg */
+                              "Using /u for '%.*s' instead of /%s",
+                              (unsigned) length,
+                              endbrace - length + 1,
+                              (charset == REGEX_ASCII_RESTRICTED_CHARSET)
+                              ? ASCII_RESTRICT_PAT_MODS
+                              : ASCII_MORE_RESTRICT_PAT_MODS);
+                }
            }
+
+            if (PASS2 && invert) {
+                OP(ret) += NBOUND - BOUND;
+            }
            goto finish_meta_pat;
+          }
 
        case 'D':
             invert = 1;
@@ -12977,7 +13051,7 @@ S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
                            break;
                        case 'e':
                            if (memEQ(posixcc, "spac", 4)) /* space */
-                               namedclass = ANYOF_PSXSPC;
+                               namedclass = ANYOF_SPACE;
                            break;
                        case 'h':
                            if (memEQ(posixcc, "grap", 4)) /* graph */
@@ -14540,7 +14614,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                 vFAIL2utf8f(
                     "Invalid [] range \"%"UTF8f"\"",
                     UTF8fARG(UTF, w, rangebegin));
-                NOT_REACHED; /* NOT REACHED */
+                NOT_REACHED; /* NOTREACHED */
            }
        }
        else {
@@ -15398,6 +15472,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
     }
 
     if (ret_invlist) {
+        assert(cp_list);
+
         *ret_invlist = cp_list;
         SvREFCNT_dec(swash);
 
@@ -16405,8 +16481,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
     || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6                   \
     || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9            \
     || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12               \
-    || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15               \
-    || _CC_VERTSPACE != 16
+    || _CC_CNTRL != 13 || _CC_ASCII != 14 || _CC_VERTSPACE != 15
   #error Need to adjust order of anyofs[]
 #endif
         "\\w",
@@ -16435,8 +16510,6 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
         "[:^blank:]",
         "[:xdigit:]",
         "[:^xdigit:]",
-        "[:space:]",
-        "[:^space:]",
         "[:cntrl:]",
         "[:^cntrl:]",
         "[:ascii:]",
@@ -16735,6 +16808,16 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
             Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
         }
     }
+    else if (k == BOUND || k == NBOUND) {
+        /* Must be synced with order of 'bound_type' in regcomp.h */
+        const char * const bounds[] = {
+            "",      /* Traditional */
+            "{gcb}",
+            "{sb}",
+            "{wb}"
+        };
+        sv_catpv(sv, bounds[FLAGS(o)]);
+    }
     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
        Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
     else if (OP(o) == SBOL)