Deprecate spaces/comments in some regex tokens

author Karl Williamson <public@khwilliamson.com>

Tue, 23 Apr 2013 19:39:35 +0000 (13:39 -0600)

committer Karl Williamson <public@khwilliamson.com>

Thu, 2 May 2013 19:39:00 +0000 (13:39 -0600)
author Karl Williamson <public@khwilliamson.com>
Tue, 23 Apr 2013 19:39:35 +0000 (13:39 -0600)
committer Karl Williamson <public@khwilliamson.com>
Thu, 2 May 2013 19:39:00 +0000 (13:39 -0600)
diff --git a/pod/perldiag.pod b/pod/perldiag.pod

index 36646f8..a19c959 100644 (file)
--- a/pod/perldiag.pod
+++ b/pod/perldiag.pod
@@ -2308,6 +2308,26 @@ would otherwise result in the same message being repeated.
  Failure of user callbacks dispatched using the C<G_KEEPERR> flag could
  also result in this warning.  See L<perlcall/G_KEEPERR>.
  
+=item In '(*VERB...)', splitting the initial '(*' is deprecated in regex; marked by <-- HERE in m/%s/
+
+(D regexp, deprecated)
+The two-character sequence C<"(*"> in this context in a regular
+expression pattern should be an indivisible token, with nothing
+intervening between the C<"("> and the C<"*">, but you separated them.
+Due to an accident of implementation, this prohibition was not enforced,
+but we do plan to forbid it in a future Perl version.  This message
+serves to give you fair warning of this pending change.
+
+=item In '(?...)', splitting the initial '(?' is deprecated in regex; marked by <-- HERE in m/%s/
+
+(D regexp, deprecated)
+The two-character sequence C<"(?"> in this context in a regular
+expression pattern should be an indivisible token, with nothing
+intervening between the C<"("> and the C<"?">, but you separated them.
+Due to an accident of implementation, this prohibition was not enforced,
+but we do plan to forbid it in a future Perl version.  This message
+serves to give you fair warning of this pending change.
+
  =item Incomplete expression within '(?[ ])' in regex; marked by <-- HERE in m/%s/
  
  (F)
diff --git a/pod/perlre.pod b/pod/perlre.pod

index e4a0b11..ceba169 100644 (file)
--- a/pod/perlre.pod
+++ b/pod/perlre.pod
@@ -138,8 +138,8 @@ a C<\Q...\E> stays unaffected by C</x>.  And note that C</x> doesn't affect
  space interpretation within a single multi-character construct.  For
  example in C<\x{...}>, regardless of the C</x> modifier, there can be no
  spaces.  Same for a L<quantifier|/Quantifiers> such as C<{3}> or
-C<{5,}>.  Similarly, C<(?:...)> can't have a space between the C<?> and C<:>,
-but can between the C<(> and C<?>.  Within any delimiters for such a
+C<{5,}>.  Similarly, C<(?:...)> can't have a space between any of the
+C<(>, C<?>, and C<:>.  Within any delimiters for such a
  construct, allowed spaces are not affected by C</x>, and depend on the
  construct.  For example, C<\x{...}> can't have spaces because hexadecimal
  numbers don't have spaces in them.  But, Unicode properties can have spaces, so
diff --git a/regcomp.c b/regcomp.c

index 95f8958..d1bdf44 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -8601,7 +8601,10 @@ S_parse_lparen_question_flags(pTHX_ struct RExC_state_t *pRExC_state)
     cannot happen.  */
  STATIC regnode *
  S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
-    /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
+    /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
+     * 2 is like 1, but indicates that nextchar() has been called to advance
+     * RExC_parse beyond the '('.  Things like '(?' are indivisible tokens, and
+     * this flag alerts us to the need to check for that */
  {
      dVAR;
      regnode *ret;              /* Will be the head of the group. */
@@ -8629,6 +8632,13 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
  
      /* Make an OPEN node, if parenthesized. */
      if (paren) {
+
+        /* Under /x, space and comments can be gobbled up between the '(' and
+         * here (if paren == 2).  The forms '(*VERB' and '(?...' disallow such
+         * intervening space, as the sequence is a token, and a token should be
+         * indivisible */
+        bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
+
          if ( *RExC_parse == '*') { /* (*VERB:ARG) */
             char *start_verb = RExC_parse;
             STRLEN verb_len = 0;
@@ -8636,6 +8646,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
             unsigned char op = 0;
             int argok = 1;
             int internal_argval = 0; /* internal_argval is only useful if !argok */
+
+            if (has_intervening_patws && SIZE_ONLY) {
+                ckWARNregdep(RExC_parse + 1, "In '(*VERB...)', splitting the initial '(*' is deprecated");
+            }
             while ( *RExC_parse && *RExC_parse != ')' ) {
                 if ( *RExC_parse == ':' ) {
                     start_arg = RExC_parse + 1;
@@ -8737,6 +8751,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
         if (*RExC_parse == '?') { /* (?...) */
             bool is_logical = 0;
             const char * const seqstart = RExC_parse;
+            if (has_intervening_patws && SIZE_ONLY) {
+                ckWARNregdep(RExC_parse + 1, "In '(?...)', splitting the initial '(?' is deprecated");
+            }
  
             RExC_parse++;
             paren = *RExC_parse++;
@@ -9322,7 +9339,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
         case ':':
             ender = reg_node(pRExC_state, TAIL);
             break;
-       case 1:
+       case 1: case 2:
             ender = reganode(pRExC_state, CLOSE, parno);
             if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
@@ -10312,7 +10329,7 @@ tryagain:
      }
      case '(':
         nextchar(pRExC_state);
-        ret = reg(pRExC_state, 1, &flags,depth+1);
+        ret = reg(pRExC_state, 2, &flags,depth+1);
         if (ret == NULL) {
                 if (flags & TRYAGAIN) {
                     if (RExC_parse == RExC_end) {
diff --git a/t/re/reg_mesg.t b/t/re/reg_mesg.t

index c0ede1c..2e936b7 100644 (file)
--- a/t/re/reg_mesg.t
+++ b/t/re/reg_mesg.t
@@ -259,6 +259,8 @@ my @deprecated = (
      '/a\b{cde/' => '"\b{" is deprecated; use "\b\{" or "\b[{]" instead {#} m/a\{#}b{cde/',
      '/a\B{cde/' => '"\B{" is deprecated; use "\B\{" or "\B[{]" instead {#} m/a\{#}B{cde/',
      'use utf8; /(?x)\\85\85\\85/' => 'Escape literal pattern white space under /x {#} m/(?x)\\85\85{#}\\85/',
+    '/((?# This is a comment in the middle of a token)?:foo)/' => 'In \'(?...)\', splitting the initial \'(?\' is deprecated {#} m/((?# This is a comment in the middle of a token)?{#}:foo)/',
+    '/((?# This is a comment in the middle of a token)*FAIL)/' => 'In \'(*VERB...)\', splitting the initial \'(*\' is deprecated {#} m/((?# This is a comment in the middle of a token)*{#}FAIL)/',
  );
  
  while (my ($regex, $expect) = splice @death, 0, 2) {
author	Karl Williamson <public@khwilliamson.com>
	Tue, 23 Apr 2013 19:39:35 +0000 (13:39 -0600)
committer	Karl Williamson <public@khwilliamson.com>
	Thu, 2 May 2013 19:39:00 +0000 (13:39 -0600)
pod/perldiag.pod		patch \| blob \| blame \| history
pod/perlre.pod		patch \| blob \| blame \| history
regcomp.c		patch \| blob \| blame \| history
t/re/reg_mesg.t		patch \| blob \| blame \| history