Add Rodolfo Carvalho to AUTHORS

[perl5.git] / regcomp.c
diff --git a/regcomp.c b/regcomp.c

index 01334f9..70e9e2f 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -1387,8 +1387,8 @@ is the recommended Unicode-aware way of saying
                scan += len;                                                   \
                len = 0;                                                       \
             } else {                                                          \
-               uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
-               uvc = to_uni_fold( uvc, foldbuf, &foldlen );                  \
+               len = UTF8SKIP(uc);\
+               uvc = to_utf8_fold( uc, foldbuf, &foldlen);                   \
                 foldlen -= UNISKIP( uvc );                                    \
                 scan = foldbuf + UNISKIP( uvc );                              \
             }                                                                 \
@@ -4523,7 +4523,7 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
      struct regexp *r;
      register regexp_internal *ri;
      STRLEN plen;
-    char  *exp;
+    char* VOL exp;
      char* xend;
      regnode *scan;
      I32 flags;
@@ -4553,7 +4553,14 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
  
      DEBUG_r(if (!PL_colorset) reginitcolors());
  
-    RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
+    exp = SvPV(pattern, plen);
+
+    if (plen == 0) { /* ignore the utf8ness if the pattern is 0 length */
+       RExC_utf8 = RExC_orig_utf8 = 0;
+    }
+    else {
+       RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
+    }
      RExC_uni_semantics = 0;
      RExC_contains_locale = 0;
  
@@ -4565,12 +4572,7 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
      }
  
      if (jump_ret == 0) {    /* First time through */
-       exp = SvPV(pattern, plen);
         xend = exp + plen;
-       /* ignore the utf8ness if the pattern is 0 length */
-       if (plen == 0) {
-           RExC_utf8 = RExC_orig_utf8 = 0;
-       }
  
          DEBUG_COMPILE_r({
              SV *dsv= sv_newmortal();
@@ -4602,7 +4604,9 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
          -- dmq */
          DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
             "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
-        exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)SvPV(pattern, plen), &len);
+        exp = (char*)Perl_bytes_to_utf8(aTHX_
+                                       (U8*)SvPV_nomg(pattern, plen),
+                                       &len);
          xend = exp + len;
          RExC_orig_utf8 = RExC_utf8 = 1;
          SAVEFREEPV(exp);
@@ -9600,7 +9604,7 @@ S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
      }
  }
  
-/* No locale test, and always Unicode semantics */
+/* No locale test, and always Unicode semantics, no ignore-case differences */
  #define _C_C_T_NOLOC_(NAME,TEST,WORD)                                          \
  ANYOF_##NAME:                                                                  \
         for (value = 0; value < 256; value++)                                  \
@@ -9620,8 +9624,11 @@ case ANYOF_N##NAME:                                                            \
  /* Like the above, but there are differences if we are in uni-8-bit or not, so
   * there are two tests passed in, to use depending on that. There aren't any
   * cases where the label is different from the name, so no need for that
- * parameter */
-#define _C_C_T_(NAME, TEST_8, TEST_7, WORD)                                    \
+ * parameter.
+ * Sets 'what' to WORD which is the property name for non-bitmap code points;
+ * But, uses FOLD_WORD instead if /i has been selected, to allow a different
+ * property name */
+#define _C_C_T_(NAME, TEST_8, TEST_7, WORD, FOLD_WORD)                         \
  ANYOF_##NAME:                                                                  \
      if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME);                               \
      else if (UNI_SEMANTICS) {                                                  \
@@ -9638,7 +9645,12 @@ ANYOF_##NAME:                                                                  \
          }                                                                      \
      }                                                                          \
      yesno = '+';                                                               \
-    what = WORD;                                                               \
+    if (FOLD) {                                                                \
+        what = FOLD_WORD;                                                      \
+    }                                                                          \
+    else {                                                                     \
+        what = WORD;                                                           \
+    }                                                                          \
      break;                                                                     \
  case ANYOF_N##NAME:                                                            \
      if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME);                              \
@@ -9670,7 +9682,12 @@ case ANYOF_N##NAME:                                                            \
         }                                                                      \
      }                                                                          \
      yesno = '!';                                                               \
-    what = WORD;                                                               \
+    if (FOLD) {                                                                \
+        what = FOLD_WORD;                                                      \
+    }                                                                          \
+    else {                                                                     \
+        what = WORD;                                                           \
+    }                                                                          \
      break
  
  STATIC U8
@@ -10228,20 +10245,20 @@ parseit:
                  * --jhi */
                 switch ((I32)namedclass) {
                 
-               case _C_C_T_(ALNUMC, isALNUMC_L1, isALNUMC, "XPosixAlnum");
-               case _C_C_T_(ALPHA, isALPHA_L1, isALPHA, "XPosixAlpha");
-               case _C_C_T_(BLANK, isBLANK_L1, isBLANK, "XPosixBlank");
-               case _C_C_T_(CNTRL, isCNTRL_L1, isCNTRL, "XPosixCntrl");
-               case _C_C_T_(GRAPH, isGRAPH_L1, isGRAPH, "XPosixGraph");
-               case _C_C_T_(LOWER, isLOWER_L1, isLOWER, "XPosixLower");
-               case _C_C_T_(PRINT, isPRINT_L1, isPRINT, "XPosixPrint");
-               case _C_C_T_(PSXSPC, isPSXSPC_L1, isPSXSPC, "XPosixSpace");
-               case _C_C_T_(PUNCT, isPUNCT_L1, isPUNCT, "XPosixPunct");
-               case _C_C_T_(UPPER, isUPPER_L1, isUPPER, "XPosixUpper");
+               case _C_C_T_(ALNUMC, isALNUMC_L1, isALNUMC, "XPosixAlnum", "XPosixAlnum");
+               case _C_C_T_(ALPHA, isALPHA_L1, isALPHA, "XPosixAlpha", "XPosixAlpha");
+               case _C_C_T_(BLANK, isBLANK_L1, isBLANK, "XPosixBlank", "XPosixBlank");
+               case _C_C_T_(CNTRL, isCNTRL_L1, isCNTRL, "XPosixCntrl", "XPosixCntrl");
+               case _C_C_T_(GRAPH, isGRAPH_L1, isGRAPH, "XPosixGraph", "XPosixGraph");
+               case _C_C_T_(LOWER, isLOWER_L1, isLOWER, "XPosixLower", "__XPosixLower_i");
+               case _C_C_T_(PRINT, isPRINT_L1, isPRINT, "XPosixPrint", "XPosixPrint");
+               case _C_C_T_(PSXSPC, isPSXSPC_L1, isPSXSPC, "XPosixSpace", "XPosixSpace");
+               case _C_C_T_(PUNCT, isPUNCT_L1, isPUNCT, "XPosixPunct", "XPosixPunct");
+               case _C_C_T_(UPPER, isUPPER_L1, isUPPER, "XPosixUpper", "__XPosixUpper_i");
                  /* \s, \w match all unicode if utf8. */
-                case _C_C_T_(SPACE, isSPACE_L1, isSPACE, "SpacePerl");
-                case _C_C_T_(ALNUM, isWORDCHAR_L1, isALNUM, "Word");
-               case _C_C_T_(XDIGIT, isXDIGIT_L1, isXDIGIT, "XPosixXDigit");
+                case _C_C_T_(SPACE, isSPACE_L1, isSPACE, "SpacePerl", "SpacePerl");
+                case _C_C_T_(ALNUM, isWORDCHAR_L1, isALNUM, "Word", "Word");
+               case _C_C_T_(XDIGIT, isXDIGIT_L1, isXDIGIT, "XPosixXDigit", "XPosixXDigit");
                 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
                 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
                 case ANYOF_ASCII:
@@ -10307,7 +10324,7 @@ parseit:
                 }
                 if (what && ! (AT_LEAST_ASCII_RESTRICTED)) {
                     /* Strings such as "+utf8::isWord\n" */
-                   Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
+                   Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n", yesno, what);
                 }
  
                 continue;
@@ -10437,7 +10454,11 @@ parseit:
                 if (! PL_utf8_tofold) {
                     U8 dummy[UTF8_MAXBYTES+1];
                     STRLEN dummy_len;
-                   to_utf8_fold((U8*) "A", dummy, &dummy_len);
+
+                   /* This particular string is above \xff in both UTF-8 and
+                    * UTFEBCDIC */
+                   to_utf8_fold((U8*) "\xC8\x80", dummy, &dummy_len);
+                   assert(PL_utf8_tofold); /* Verify that worked */
                 }
                 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
             }