Regenerate headers and fix compilation with threads after last commit

[perl5.git] / regcomp.c
diff --git a/regcomp.c b/regcomp.c

index 7d1c474..bc7086f 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -2,7 +2,9 @@
   */
  
  /*
- * "A fair jaw-cracker dwarf-language must be."  --Samwise Gamgee
+ * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
+ *
+ *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   */
  
  /* This file contains functions for compiling a regular expression.  See
@@ -6551,7 +6553,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
  /* reg_namedseq(pRExC_state,UVp)
     
     This is expected to be called by a parser routine that has 
-   recognized'\N' and needs to handle the rest. RExC_parse is 
+   recognized '\N' and needs to handle the rest. RExC_parse is
     expected to point at the first char following the N at the time
     of the call.
     
@@ -6565,11 +6567,11 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
     be returned to indicate failure. (This will NOT be a valid pointer 
     to a regnode.)
     
-   If value is null then it is assumed that we are parsing normal text
+   If valuep is null then it is assumed that we are parsing normal text
     and inserts a new EXACT node into the program containing the resolved
     string and returns a pointer to the new node. If the string is 
     zerolength a NOTHING node is emitted.
-   
+
     On success RExC_parse is set to the char following the endbrace.
     Parsing failures will generate a fatal errorvia vFAIL(...)
     
@@ -6583,7 +6585,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
     
   */
  STATIC regnode *
-S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep) 
+S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
  {
      char * name;        /* start of the content of the name */
      char * endbrace;    /* endbrace following the name */
@@ -6595,8 +6597,22 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep)
   
      PERL_ARGS_ASSERT_REG_NAMEDSEQ;
     
-    if (*RExC_parse != '{') {
-        vFAIL("Missing braces on \\N{}");
+    if (*RExC_parse != '{' ||
+           (*RExC_parse == '{' && RExC_parse[1]
+            && strchr("0123456789", RExC_parse[1])))
+    {
+       GET_RE_DEBUG_FLAGS_DECL;
+       if (valuep)
+           /* no bare \N in a charclass */
+           vFAIL("Missing braces on \\N{}");
+       GET_RE_DEBUG_FLAGS;
+       nextchar(pRExC_state);
+       ret = reg_node(pRExC_state, REG_ANY);
+       *flagp |= HASWIDTH|SIMPLE;
+       RExC_naughty++;
+       RExC_parse--;
+        Set_Node_Length(ret, 1); /* MJD */
+       return ret;
      }
      name = RExC_parse+1;
      endbrace = strchr(RExC_parse, '}');
@@ -6615,20 +6631,30 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep)
              | PERL_SCAN_DISALLOW_PREFIX
              | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
          UV cp;
-       char string;
          len = (STRLEN)(endbrace - name - 2);
          cp = grok_hex(name + 2, &len, &fl, NULL);
          if ( len != (STRLEN)(endbrace - name - 2) ) {
              cp = 0xFFFD;
          }    
-        if (cp > 0xff)
-            RExC_utf8 = 1;
          if ( valuep ) {
+           if (cp > 0xff) RExC_utf8 = 1;
              *valuep = cp;
              return NULL;
          }
-       string = (char)cp;
-        sv_str= newSVpvn(&string, 1);
+
+       /* Need to convert to utf8 if either: won't fit into a byte, or the re
+        * is going to be in utf8 and the representation changes under utf8. */
+       if (cp > 0xff || (RExC_utf8 && ! UNI_IS_INVARIANT(cp))) {
+           U8 string[UTF8_MAXBYTES+1];
+           U8 *tmps;
+           RExC_utf8 = 1;
+           tmps = uvuni_to_utf8(string, cp);
+           sv_str = newSVpvn_utf8((char*)string, tmps - string, TRUE);
+       } else {    /* Otherwise, no need for utf8, can skip that step */
+           char string;
+           string = (char)cp;
+           sv_str= newSVpvn(&string, 1);
+       }
      } else {
          /* fetch the charnames handler for this scope */
          HV * const table = GvHV(PL_hintgv);
@@ -6807,7 +6833,7 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep)
          Set_Node_Cur_Length(ret); /* MJD */
          RExC_parse--; 
          nextchar(pRExC_state);
-    } else {
+    } else {   /* zero length */
          ret = reg_node(pRExC_state,NOTHING);
      }
      if (!cached) {
@@ -7147,12 +7173,12 @@ tryagain:
             }
             break;
          case 'N': 
-            /* Handle \N{NAME} here and not below because it can be 
+            /* Handle \N and \N{NAME} here and not below because it can be
              multicharacter. join_exact() will join them up later on. 
              Also this makes sure that things like /\N{BLAH}+/ and 
              \N{BLAH} being multi char Just Happen. dmq*/
              ++RExC_parse;
-            ret= reg_namedseq(pRExC_state, NULL); 
+            ret= reg_namedseq(pRExC_state, NULL, flagp); 
              break;
         case 'k':    /* Handle \k<NAME> and \k'NAME' */
         parse_named_seq:
@@ -7418,6 +7444,19 @@ tryagain:
                              I32 flags = 0;
                             STRLEN numlen = 3;
                             ender = grok_oct(p, &numlen, &flags, NULL);
+
+                           /* An octal above 0xff is interpreted differently
+                            * depending on if the re is in utf8 or not.  If it
+                            * is in utf8, the value will be itself, otherwise
+                            * it is interpreted as modulo 0x100.  It has been
+                            * decided to discourage the use of octal above the
+                            * single-byte range.  For now, warn only when
+                            * it ends up modulo */
+                           if (SIZE_ONLY && ender >= 0x100
+                                   && ! UTF && ! PL_encoding
+                                   && ckWARN2(WARN_DEPRECATED, WARN_REGEXP)) {
+                               vWARNdep(p, "Use of octal value above 377 is deprecated");
+                           }
                             p += numlen;
                         }
                         else {
@@ -7792,6 +7831,22 @@ case ANYOF_N##NAME:                                     \
      what = WORD;                                        \
      break
  
+/* 
+   We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
+   so that it is possible to override the option here without having to 
+   rebuild the entire core. as we are required to do if we change regcomp.h
+   which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
+*/
+#if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
+#define BROKEN_UNICODE_CHARCLASS_MAPPINGS
+#endif
+
+#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
+#define POSIX_CC_UNI_NAME(CCNAME) CCNAME
+#else
+#define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
+#endif
+
  /*
     parse a class specification and produce either an ANYOF node that
     matches the pattern or if the pattern matches a single char only and
@@ -7923,7 +7978,7 @@ parseit:
                      from earlier versions, OTOH that behaviour was broken
                      as well. */
                      UV v; /* value is register so we cant & it /grrr */
-                    if (reg_namedseq(pRExC_state, &v)) {
+                    if (reg_namedseq(pRExC_state, &v, NULL)) {
                          goto parseit;
                      }
                      value= v; 
@@ -8080,18 +8135,24 @@ parseit:
                  * A similar issue a little earlier when switching on value.
                  * --jhi */
                 switch ((I32)namedclass) {
+               
+               case _C_C_T_(ALNUMC, isALNUMC(value), POSIX_CC_UNI_NAME("Alnum"));
+               case _C_C_T_(ALPHA, isALPHA(value), POSIX_CC_UNI_NAME("Alpha"));
+               case _C_C_T_(BLANK, isBLANK(value), POSIX_CC_UNI_NAME("Blank"));
+               case _C_C_T_(CNTRL, isCNTRL(value), POSIX_CC_UNI_NAME("Cntrl"));
+               case _C_C_T_(GRAPH, isGRAPH(value), POSIX_CC_UNI_NAME("Graph"));
+               case _C_C_T_(LOWER, isLOWER(value), POSIX_CC_UNI_NAME("Lower"));
+               case _C_C_T_(PRINT, isPRINT(value), POSIX_CC_UNI_NAME("Print"));
+               case _C_C_T_(PSXSPC, isPSXSPC(value), POSIX_CC_UNI_NAME("Space"));
+               case _C_C_T_(PUNCT, isPUNCT(value), POSIX_CC_UNI_NAME("Punct"));
+               case _C_C_T_(UPPER, isUPPER(value), POSIX_CC_UNI_NAME("Upper"));
+#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
                 case _C_C_T_(ALNUM, isALNUM(value), "Word");
-               case _C_C_T_(ALNUMC, isALNUMC(value), "Alnum");
-               case _C_C_T_(ALPHA, isALPHA(value), "Alpha");
-               case _C_C_T_(BLANK, isBLANK(value), "Blank");
-               case _C_C_T_(CNTRL, isCNTRL(value), "Cntrl");
-               case _C_C_T_(GRAPH, isGRAPH(value), "Graph");
-               case _C_C_T_(LOWER, isLOWER(value), "Lower");
-               case _C_C_T_(PRINT, isPRINT(value), "Print");
-               case _C_C_T_(PSXSPC, isPSXSPC(value), "Space");
-               case _C_C_T_(PUNCT, isPUNCT(value), "Punct");
                 case _C_C_T_(SPACE, isSPACE(value), "SpacePerl");
-               case _C_C_T_(UPPER, isUPPER(value), "Upper");
+#else
+               case _C_C_T_(SPACE, isSPACE(value), "PerlSpace");
+               case _C_C_T_(ALNUM, isALNUM(value), "PerlWord");
+#endif         
                 case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit");
                 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
                 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
@@ -8138,7 +8199,7 @@ parseit:
                             ANYOF_BITMAP_SET(ret, value);
                     }
                     yesno = '+';
-                   what = "Digit";
+                   what = POSIX_CC_UNI_NAME("Digit");
                     break;
                 case ANYOF_NDIGIT:
                     if (LOC)
@@ -8151,7 +8212,7 @@ parseit:
                             ANYOF_BITMAP_SET(ret, value);
                     }
                     yesno = '!';
-                   what = "Digit";
+                   what = POSIX_CC_UNI_NAME("Digit");
                     break;              
                 case ANYOF_MAX:
                     /* this is to handle \p and \P */
@@ -8962,6 +9023,17 @@ Perl_regdump(pTHX_ const regexp *r)
  /*
  - regprop - printable representation of opcode
  */
+#define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
+STMT_START { \
+        if (do_sep) {                           \
+            Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
+            if (flags & ANYOF_INVERT)           \
+                /*make sure the invert info is in each */ \
+                sv_catpvs(sv, "^");             \
+            do_sep = 0;                         \
+        }                                       \
+} STMT_END
+
  void
  Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
  {
@@ -9087,6 +9159,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
      else if (k == ANYOF) {
         int i, rangestart = -1;
         const U8 flags = ANYOF_FLAGS(o);
+       int do_sep = 0;
  
         /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
         static const char * const anyofs[] = {
@@ -9102,8 +9175,8 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
             "[:^alpha:]",
             "[:ascii:]",
             "[:^ascii:]",
-           "[:ctrl:]",
-           "[:^ctrl:]",
+           "[:cntrl:]",
+           "[:^cntrl:]",
             "[:graph:]",
             "[:^graph:]",
             "[:lower:]",
@@ -9129,6 +9202,8 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
         if (flags & ANYOF_INVERT)
             sv_catpvs(sv, "^");
+       
+       /* output what the standard cp 0-255 bitmap matches */
         for (i = 0; i <= 256; i++) {
             if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
                 if (rangestart == -1)
@@ -9142,15 +9217,23 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
                     sv_catpvs(sv, "-");
                     put_byte(sv, i - 1);
                 }
+               do_sep = 1;
                 rangestart = -1;
             }
         }
-
+        
+        EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
+        /* output any special charclass tests (used mostly under use locale) */
         if (o->flags & ANYOF_CLASS)
             for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
-               if (ANYOF_CLASS_TEST(o,i))
+               if (ANYOF_CLASS_TEST(o,i)) {
                     sv_catpv(sv, anyofs[i]);
-
+                   do_sep = 1;
+               }
+        
+        EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
+        
+        /* output information about the unicode matching */
         if (flags & ANYOF_UNICODE)
             sv_catpvs(sv, "{unicode}");
         else if (flags & ANYOF_UNICODE_ALL)
@@ -9163,7 +9246,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
             if (lv) {
                 if (sw) {
                     U8 s[UTF8_MAXBYTES_CASE+1];
-               
+
                     for (i = 0; i <= 256; i++) { /* just the first 256 */
                         uvchr_to_utf8(s, i);