This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regexec.c: Fix some EBCDIC problems
author Karl Williamson <public@khwilliamson.com>
Tue, 25 Jun 2013 03:14:37 +0000 (21:14 -0600)
committer Karl Williamson <khw@cpan.org>
Fri, 30 May 2014 15:32:08 +0000 (09:32 -0600)
We were testing whether a byte is a UTF-8 invariant when we should have
been testing whether it is ASCII.  This is a problem only on EBCDIC
platforms, where the UTF-8 invariants and the ASCII characters are two
different sets of code points.

regexec.c
t/re/re_tests

index 2f4ef60..d4a7fdd 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -1916,9 +1916,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
     case NPOSIXA:
         if (utf8_target) {
             /* The complement of something that matches only ASCII matches all
-             * UTF-8 variant code points, plus everything in ASCII that isn't
-             * in the class */
-            REXEC_FBC_UTF8_CLASS_SCAN(! UTF8_IS_INVARIANT(*s)
+             * non-ASCII, plus everything in ASCII that isn't in the class. */
+            REXEC_FBC_UTF8_CLASS_SCAN(! isASCII_utf8(s)
                                       || ! _generic_isCC_A(*s, FLAGS(c)));
             break;
         }
@@ -7339,10 +7338,9 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
         else {
 
             /* The complement of something that matches only ASCII matches all
-             * UTF-8 variant code points, plus everything in ASCII that isn't
-             * in the class. */
+             * non-ASCII, plus everything in ASCII that isn't in the class. */
            while (hardcount < max && scan < loceol
-                   && (! UTF8_IS_INVARIANT(*scan)
+                   && (! isASCII_utf8(scan)
                        || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
             {
                 scan += UTF8SKIP(scan);
index d6a8b12..78bacc9 100644 (file)
--- a/t/re/re_tests
+++ b/t/re/re_tests
@@ -652,6 +652,7 @@ $(?<=^(a))  a       y       $1      a
 ([[:^xdigit:]]+)       ABcd01Xy__--  ${nulnul}${ffff}  y       $1      Xy__--  ${nulnul}${ffff}
 [[:foo:]]      -       c       -       POSIX class [:foo:] unknown
 [[:^foo:]]     -       c       -       POSIX class [:^foo:] unknown
+'[[:^cntrl:]]+'u       a\x80   y       $&      a       \x80 was matching on EBCDIC platforms
 ((?>a+)b)      aaab    y       $1      aaab
 (?>(a+))b      aaab    y       $1      aaa
 ((?>[^()]+)|\([^()]*\))+       ((abc(ade)ufh()()x      y       $&      abc(ade)ufh()()x