Change meaning of \v, \V, and add \h, \H to match Perl6, add \R to match PCRE and...
authorYves Orton <demerphq@gmail.com>
Sun, 22 Apr 2007 23:34:55 +0000 (01:34 +0200)
committerRafael Garcia-Suarez <rgarciasuarez@gmail.com>
Mon, 23 Apr 2007 09:04:31 +0000 (09:04 +0000)
Message-ID: <9b18b3110704221434g43457742p28cab00289f83639@mail.gmail.com>

p4raw-id: //depot/perl@31026

12 files changed:
MANIFEST
lib/unicore/mktables
pod/perlre.pod
regcharclass.h [new file with mode: 0644]
regcomp.c
regcomp.h
regcomp.sym
regexec.c
regnodes.h
t/op/pat.t
t/op/re_tests
t/op/regexp.t

index baf1e13..d997f37 100644 (file)
--- a/MANIFEST
+++ b/MANIFEST
@@ -3297,6 +3297,7 @@ regcomp.c                 Regular expression compiler
 regcomp.h                      Private declarations for above
 regcomp.pl                     Builder of regnodes.h
 regcomp.sym                    Data for regnodes.h
+regcharclass.h                 Match various character classes efficiently
 regen_lib.pl                   Common file routines for generator scripts
 regen_perly.pl                 generate perly.{act,h,tab} from perly.y
 regen.pl                       Run all scripts that (re)generate files
index 72f33cb..64de8b1 100644 (file)
@@ -829,7 +829,10 @@ sub UnicodeData_Txt()
        Table->New(Is => 'Word',   Desc => "[[:Word:]]",   Fuzzy => 0);
     $Cat{SpacePerl} =
        Table->New(Is => 'SpacePerl', Desc => '\s', Fuzzy => 0);
-
+    $Cat{VertSpace} =
+       Table->New(Is => 'VertSpace', Desc => '\v', Fuzzy => 0);
+    $Cat{HorizSpace} =
+       Table->New(Is => 'HorizSpace', Desc => '\h', Fuzzy => 0);
     my %To;
     $To{Upper} = Table->New();
     $To{Lower} = Table->New();
@@ -886,6 +889,15 @@ sub UnicodeData_Txt()
         $Cat{SpacePerl}->$op($code) if $isspace
                                       && $code != 0x000B; # Backward compat.
 
+        $Cat{VertSpace}->$op($code) if grep {$code == $_} 
+            ( 0x0A..0x0D,0x85,0x2028,0x2029 );
+
+        $Cat{HorizSpace}->$op($code) if grep {$code == $_} (
+            0x09,   0x20,   0xa0,   0x1680, 0x180e, 0x2000, 0x2001, 0x2002,
+            0x2003, 0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200a,
+            0x202f, 0x205f, 0x3000
+        ); 
+
         $Cat{Blank}->$op($code) if $isspace
                                 && !($code == 0x000A ||
                                     $code == 0x000B ||
index 66935d2..1865232 100644 (file)
@@ -233,7 +233,7 @@ An unescaped C<$> or C<@> interpolates the corresponding variable,
 while escaping will cause the literal string C<\$> to be matched.
 You'll need to write something like C<m/\Quser\E\@\Qhost/>.
 
-=head3 Character classes
+=head3 Character Classes and other Special Escapes
 
 In addition, Perl defines the following:
 X<\w> X<\W> X<\s> X<\S> X<\d> X<\D> X<\X> X<\p> X<\P> X<\C>
@@ -265,8 +265,11 @@ X<word> X<whitespace> X<character class> X<backreference>
     \x12     Hexadecimal escape sequence
     \x{1234} Long hexadecimal escape sequence
     \K       Keep the stuff left of the \K, don't include it in $&
-    \v       Shortcut for (*PRUNE)
-    \V       Shortcut for (*SKIP)
+    \v       Vertical whitespace
+    \V       Not vertical whitespace
+    \h       Horizontal whitespace
+    \H       Not horizontal whitespace
+    \R       Linebreak (matches like \v inside of a charclass)
 
 A C<\w> matches a single alphanumeric character (an alphabetic
 character, or a decimal digit) or C<_>, not a whole word.  Use C<\w+>
@@ -283,6 +286,15 @@ your own C<\p> and C<\P> properties, and L<perluniintro> about Unicode
 in general.
 X<\w> X<\W> X<word>
 
+C<\R> will atomically match a linebreak, including the network line-ending
+"\x0D\x0A".  Specifically, X<\R> is exactly equivelent to
+
+  (?>\x0D\x0A?|[\x0A-\x0C\x85\x{2028}\x{2029}])
+
+B<Note:> C<\R> has no special meaning inside of a character class;
+use C<\v> instead (vertical whitespace).
+X<\R>
+
 The POSIX character class syntax
 X<character class>
 
diff --git a/regcharclass.h b/regcharclass.h
new file mode 100644 (file)
index 0000000..3fc92d7
--- /dev/null
@@ -0,0 +1,250 @@
+/****** WARNING WARNING WARNING ********/
+/*                                     */
+/* Autogenerated code, do not modify!  */
+/*                                     */
+/****** WARNING WARNING WARNING ********/
+
+/* LNBREAK Line Break: \j \J 
+  Codepoints:  0x0A, 0x0B, 0x0C, 0x0D, 0x0D.0x0A, 0x85, 0x2028, 0x2029
+  */
+#define is_LNBREAK(s,is_utf8) /*** Line Break: \j \J ***/                   \
+( ( ((U8*)s)[0]==13 ) ?                                                     \
+    ( ( ((U8*)s)[1]==10 ) ? 2 : 1 ) :                                       \
+  ( (10<=((U8*)s)[0] && ((U8*)s)[0]<=12) ? 1 :                              \
+( (is_utf8) ?                                                               \
+  ( ( ((U8*)s)[0]==194 ) ?                                                  \
+    ( ( ((U8*)s)[1]==133 ) ? 2 : 0 ) :                                      \
+  (((( ((U8*)s)[0]==226 ) && ( ((U8*)s)[1]==128 )) && ( ((U8*)s)[2]==168 || ((U8*)s)[2]==169 )) ? 3 : 0) ) :\
+  ( ((U8*)s)[0]==133 ) ) ) )                                                
+
+#define is_LNBREAK_safe(s,e,is_utf8) /*** Line Break: \j \J ***/            \
+( ( (e) - (s) > 2 ) ?                                                       \
+  ( ( ((U8*)s)[0]==13 ) ?                                                   \
+    ( ( ((U8*)s)[1]==10 ) ? 2 : 1 ) :                                       \
+  ( (10<=((U8*)s)[0] && ((U8*)s)[0]<=12) ? 1 :                              \
+( (is_utf8) ?                                                               \
+  ( ( ((U8*)s)[0]==194 ) ?                                                  \
+    ( ( ((U8*)s)[1]==133 ) ? 2 : 0 ) :                                      \
+  (((( ((U8*)s)[0]==226 ) && ( ((U8*)s)[1]==128 )) && ( ((U8*)s)[2]==168 || ((U8*)s)[2]==169 )) ? 3 : 0) ) :\
+  ( ((U8*)s)[0]==133 ) ) ) ) :                                              \
+( ( (e) - (s) > 1 ) ?                                                       \
+  ( ( ((U8*)s)[0]==13 ) ?                                                   \
+    ( ( ((U8*)s)[1]==10 ) ? 2 : 1 ) :                                       \
+  ( (10<=((U8*)s)[0] && ((U8*)s)[0]<=12) ? 1 :                              \
+( (is_utf8) ?                                                               \
+  ((( ((U8*)s)[0]==194 ) && ( ((U8*)s)[1]==133 )) ? 2 : 0) :                \
+  ( ((U8*)s)[0]==133 ) ) ) ) :                                              \
+( ( (e) - (s) > 0 ) ?                                                       \
+  ( (10<=((U8*)s)[0] && ((U8*)s)[0]<=13) ? 1 :                              \
+( (!is_utf8) ?                                                              \
+  ( ((U8*)s)[0]==133 ) : 0 ) ) : 0 ) ) )                                    
+
+#define is_LNBREAK_utf8(s) /*** Line Break: \j \J ***/                      \
+( ( ((U8*)s)[0]==194 ) ?                                                    \
+    ( ( ((U8*)s)[1]==133 ) ? 2 : 0 ) :                                      \
+  ( ( ((U8*)s)[0]==226 ) ?                                                  \
+    ((( ((U8*)s)[1]==128 ) && ( ((U8*)s)[2]==168 || ((U8*)s)[2]==169 )) ? 3 : 0) :\
+  ( ( ((U8*)s)[0]==13 ) ?                                                   \
+    ( ( ((U8*)s)[1]==10 ) ? 2 : 1 ) :                                       \
+  (10<=((U8*)s)[0] && ((U8*)s)[0]<=12) ) ) )                                
+
+#define is_LNBREAK_utf8_safe(s,e) /*** Line Break: \j \J ***/               \
+( ( (e) - (s) > 2 ) ?                                                       \
+  ( ( ((U8*)s)[0]==194 ) ?                                                  \
+    ( ( ((U8*)s)[1]==133 ) ? 2 : 0 ) :                                      \
+  ( ( ((U8*)s)[0]==226 ) ?                                                  \
+    ((( ((U8*)s)[1]==128 ) && ( ((U8*)s)[2]==168 || ((U8*)s)[2]==169 )) ? 3 : 0) :\
+  ( ( ((U8*)s)[0]==13 ) ?                                                   \
+    ( ( ((U8*)s)[1]==10 ) ? 2 : 1 ) :                                       \
+  (10<=((U8*)s)[0] && ((U8*)s)[0]<=12) ) ) ) :                              \
+( ( (e) - (s) > 1 ) ?                                                       \
+  ( ( ((U8*)s)[0]==194 ) ?                                                  \
+    ( ( ((U8*)s)[1]==133 ) ? 2 : 0 ) :                                      \
+  ( ( ((U8*)s)[0]==13 ) ?                                                   \
+    ( ( ((U8*)s)[1]==10 ) ? 2 : 1 ) :                                       \
+  (10<=((U8*)s)[0] && ((U8*)s)[0]<=12) ) ) :                                \
+( ( (e) - (s) > 0 ) ?                                                       \
+  (10<=((U8*)s)[0] && ((U8*)s)[0]<=13) : 0 ) ) )                            
+
+#define is_LNBREAK_latin1(s) /*** Line Break: \j \J ***/                    \
+( ( ((U8*)s)[0]==13 ) ?                                                     \
+    ( ( ((U8*)s)[1]==10 ) ? 2 : 1 ) :                                       \
+  ( (10<=((U8*)s)[0] && ((U8*)s)[0]<=12) || ((U8*)s)[0]==133 ) )            
+
+#define is_LNBREAK_latin1_safe(s,e) /*** Line Break: \j \J ***/             \
+( ( (e) - (s) > 1 ) ?                                                       \
+  ( ( ((U8*)s)[0]==13 ) ?                                                   \
+    ( ( ((U8*)s)[1]==10 ) ? 2 : 1 ) :                                       \
+  ( (10<=((U8*)s)[0] && ((U8*)s)[0]<=12) || ((U8*)s)[0]==133 ) ) :          \
+( ( (e) - (s) > 0 ) ?                                                       \
+  ( (10<=((U8*)s)[0] && ((U8*)s)[0]<=13) || ((U8*)s)[0]==133 ) : 0 ) )      
+
+#define is_LNBREAK_cp(cp) /*** Line Break: \j \J ***/                       \
+( (10<=cp && cp<=13) || ( cp > 13 && ( cp==133 || ( cp > 133 && ( cp==8232 || ( cp > 8232 && cp==8233 ) ) ) ) ) )
+
+/****** WARNING WARNING WARNING ********/
+/*                                     */
+/* Autogenerated code, do not modify!  */
+/*                                     */
+/****** WARNING WARNING WARNING ********/
+
+/* HORIZWS Horizontal Whitespace: \h \H 
+  Codepoints:  0x09, 0x20, 0xA0, 0x1680, 0x180E, 0x2000, 0x2001, 0x2002,
+  0x2003, 0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x202F,
+  0x205F, 0x3000
+  */
+#define is_HORIZWS(s,is_utf8) /*** Horizontal Whitespace: \h \H ***/        \
+( ( ((U8*)s)[0]==9 || ((U8*)s)[0]==32 ) ? 1 :                               \
+( (is_utf8) ?                                                               \
+  ( ( ((U8*)s)[0]==194 ) ?                                                  \
+    ( ( ((U8*)s)[1]==160 ) ? 2 : 0 ) :                                      \
+  ( ( ((U8*)s)[0]==225 ) ?                                                  \
+    ( ( ((U8*)s)[1]==160 ) ?                                                \
+      ( ( ((U8*)s)[2]==142 ) ? 3 : 0 ) :                                    \
+    ((( ((U8*)s)[1]==154 ) && ( ((U8*)s)[2]==128 )) ? 3 : 0) ) :            \
+  ( ( ((U8*)s)[0]==226 ) ?                                                  \
+    ( ( ((U8*)s)[1]==129 ) ?                                                \
+      ( ( ((U8*)s)[2]==159 ) ? 3 : 0 ) :                                    \
+    ((( ((U8*)s)[1]==128 ) && ( (128<=((U8*)s)[2] && ((U8*)s)[2]<=138) || ((U8*)s)[2]==175 )) ? 3 : 0) ) :\
+  (((( ((U8*)s)[0]==227 ) && ( ((U8*)s)[1]==128 )) && ( ((U8*)s)[2]==128 )) ? 3 : 0) ) ) ) :\
+  ( ((U8*)s)[0]==160 ) ) )                                                  
+
+#define is_HORIZWS_safe(s,e,is_utf8) /*** Horizontal Whitespace: \h \H ***/ \
+( ( (e) - (s) > 2 ) ?                                                       \
+  ( ( ((U8*)s)[0]==9 || ((U8*)s)[0]==32 ) ? 1 :                             \
+( (is_utf8) ?                                                               \
+  ( ( ((U8*)s)[0]==194 ) ?                                                  \
+    ( ( ((U8*)s)[1]==160 ) ? 2 : 0 ) :                                      \
+  ( ( ((U8*)s)[0]==225 ) ?                                                  \
+    ( ( ((U8*)s)[1]==160 ) ?                                                \
+      ( ( ((U8*)s)[2]==142 ) ? 3 : 0 ) :                                    \
+    ((( ((U8*)s)[1]==154 ) && ( ((U8*)s)[2]==128 )) ? 3 : 0) ) :            \
+  ( ( ((U8*)s)[0]==226 ) ?                                                  \
+    ( ( ((U8*)s)[1]==129 ) ?                                                \
+      ( ( ((U8*)s)[2]==159 ) ? 3 : 0 ) :                                    \
+    ((( ((U8*)s)[1]==128 ) && ( (128<=((U8*)s)[2] && ((U8*)s)[2]<=138) || ((U8*)s)[2]==175 )) ? 3 : 0) ) :\
+  (((( ((U8*)s)[0]==227 ) && ( ((U8*)s)[1]==128 )) && ( ((U8*)s)[2]==128 )) ? 3 : 0) ) ) ) :\
+  ( ((U8*)s)[0]==160 ) ) ) :                                                \
+( ( (e) - (s) > 1 ) ?                                                       \
+  ( ( ((U8*)s)[0]==9 || ((U8*)s)[0]==32 ) ? 1 :                             \
+( (is_utf8) ?                                                               \
+  ((( ((U8*)s)[0]==194 ) && ( ((U8*)s)[1]==160 )) ? 2 : 0) :                \
+  ( ((U8*)s)[0]==160 ) ) ) :                                                \
+( ( (e) - (s) > 0 ) ?                                                       \
+  ( ( ((U8*)s)[0]==9 || ((U8*)s)[0]==32 ) ? 1 :                             \
+( (!is_utf8) ?                                                              \
+  ( ((U8*)s)[0]==160 ) : 0 ) ) : 0 ) ) )                                    
+
+#define is_HORIZWS_utf8(s) /*** Horizontal Whitespace: \h \H ***/           \
+( ( ((U8*)s)[0]==194 ) ?                                                    \
+    ( ( ((U8*)s)[1]==160 ) ? 2 : 0 ) :                                      \
+  ( ( ((U8*)s)[0]==225 ) ?                                                  \
+    ( ( ((U8*)s)[1]==160 ) ?                                                \
+      ( ( ((U8*)s)[2]==142 ) ? 3 : 0 ) :                                    \
+    ((( ((U8*)s)[1]==154 ) && ( ((U8*)s)[2]==128 )) ? 3 : 0) ) :            \
+  ( ( ((U8*)s)[0]==226 ) ?                                                  \
+    ( ( ((U8*)s)[1]==129 ) ?                                                \
+      ( ( ((U8*)s)[2]==159 ) ? 3 : 0 ) :                                    \
+    ((( ((U8*)s)[1]==128 ) && ( (128<=((U8*)s)[2] && ((U8*)s)[2]<=138) || ((U8*)s)[2]==175 )) ? 3 : 0) ) :\
+  ( ( ((U8*)s)[0]==227 ) ?                                                  \
+    ((( ((U8*)s)[1]==128 ) && ( ((U8*)s)[2]==128 )) ? 3 : 0) :              \
+  ( ((U8*)s)[0]==9 || ((U8*)s)[0]==32 ) ) ) ) )                             
+
+#define is_HORIZWS_utf8_safe(s,e) /*** Horizontal Whitespace: \h \H ***/    \
+( ( (e) - (s) > 2 ) ?                                                       \
+  ( ( ((U8*)s)[0]==194 ) ?                                                  \
+    ( ( ((U8*)s)[1]==160 ) ? 2 : 0 ) :                                      \
+  ( ( ((U8*)s)[0]==225 ) ?                                                  \
+    ( ( ((U8*)s)[1]==160 ) ?                                                \
+      ( ( ((U8*)s)[2]==142 ) ? 3 : 0 ) :                                    \
+    ((( ((U8*)s)[1]==154 ) && ( ((U8*)s)[2]==128 )) ? 3 : 0) ) :            \
+  ( ( ((U8*)s)[0]==226 ) ?                                                  \
+    ( ( ((U8*)s)[1]==129 ) ?                                                \
+      ( ( ((U8*)s)[2]==159 ) ? 3 : 0 ) :                                    \
+    ((( ((U8*)s)[1]==128 ) && ( (128<=((U8*)s)[2] && ((U8*)s)[2]<=138) || ((U8*)s)[2]==175 )) ? 3 : 0) ) :\
+  ( ( ((U8*)s)[0]==227 ) ?                                                  \
+    ((( ((U8*)s)[1]==128 ) && ( ((U8*)s)[2]==128 )) ? 3 : 0) :              \
+  ( ((U8*)s)[0]==9 || ((U8*)s)[0]==32 ) ) ) ) ) :                           \
+( ( (e) - (s) > 1 ) ?                                                       \
+  ( ( ((U8*)s)[0]==194 ) ?                                                  \
+    ( ( ((U8*)s)[1]==160 ) ? 2 : 0 ) :                                      \
+  ( ((U8*)s)[0]==9 || ((U8*)s)[0]==32 ) ) :                                 \
+( ( (e) - (s) > 0 ) ?                                                       \
+  ( ((U8*)s)[0]==9 || ((U8*)s)[0]==32 ) : 0 ) ) )                           
+
+#define is_HORIZWS_latin1(s) /*** Horizontal Whitespace: \h \H ***/         \
+( ((U8*)s)[0]==9 || ((U8*)s)[0]==32 || ((U8*)s)[0]==160 )                   
+
+#define is_HORIZWS_latin1_safe(s,e) /*** Horizontal Whitespace: \h \H ***/  \
+( ( (e) - (s) > 0 ) ?                                                       \
+  ( ((U8*)s)[0]==9 || ((U8*)s)[0]==32 || ((U8*)s)[0]==160 ) : 0 )           
+
+#define is_HORIZWS_cp(cp) /*** Horizontal Whitespace: \h \H ***/            \
+( cp==9 || ( cp > 9 && ( cp==32 || ( cp > 32 && ( cp==160 || ( cp > 160 && ( cp==5760 || ( cp > 5760 && ( cp==6158 || ( cp > 6158 && ( (8192<=cp && cp<=8202) || ( cp > 8202 && ( cp==8239 || ( cp > 8239 && ( cp==8287 || ( cp > 8287 && cp==12288 ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) )
+
+/****** WARNING WARNING WARNING ********/
+/*                                     */
+/* Autogenerated code, do not modify!  */
+/*                                     */
+/****** WARNING WARNING WARNING ********/
+
+/* VERTWS Vertical Whitespace: \v \V 
+  Codepoints:  0x0A, 0x0B, 0x0C, 0x0D, 0x85, 0x2028, 0x2029
+  */
+#define is_VERTWS(s,is_utf8) /*** Vertical Whitespace: \v \V ***/           \
+( (10<=((U8*)s)[0] && ((U8*)s)[0]<=13) ? 1 :                                \
+( (is_utf8) ?                                                               \
+  ( ( ((U8*)s)[0]==194 ) ?                                                  \
+    ( ( ((U8*)s)[1]==133 ) ? 2 : 0 ) :                                      \
+  (((( ((U8*)s)[0]==226 ) && ( ((U8*)s)[1]==128 )) && ( ((U8*)s)[2]==168 || ((U8*)s)[2]==169 )) ? 3 : 0) ) :\
+  ( ((U8*)s)[0]==133 ) ) )                                                  
+
+#define is_VERTWS_safe(s,e,is_utf8) /*** Vertical Whitespace: \v \V ***/    \
+( ( (e) - (s) > 2 ) ?                                                       \
+  ( (10<=((U8*)s)[0] && ((U8*)s)[0]<=13) ? 1 :                              \
+( (is_utf8) ?                                                               \
+  ( ( ((U8*)s)[0]==194 ) ?                                                  \
+    ( ( ((U8*)s)[1]==133 ) ? 2 : 0 ) :                                      \
+  (((( ((U8*)s)[0]==226 ) && ( ((U8*)s)[1]==128 )) && ( ((U8*)s)[2]==168 || ((U8*)s)[2]==169 )) ? 3 : 0) ) :\
+  ( ((U8*)s)[0]==133 ) ) ) :                                                \
+( ( (e) - (s) > 1 ) ?                                                       \
+  ( (10<=((U8*)s)[0] && ((U8*)s)[0]<=13) ? 1 :                              \
+( (is_utf8) ?                                                               \
+  ((( ((U8*)s)[0]==194 ) && ( ((U8*)s)[1]==133 )) ? 2 : 0) :                \
+  ( ((U8*)s)[0]==133 ) ) ) :                                                \
+( ( (e) - (s) > 0 ) ?                                                       \
+  ( (10<=((U8*)s)[0] && ((U8*)s)[0]<=13) ? 1 :                              \
+( (!is_utf8) ?                                                              \
+  ( ((U8*)s)[0]==133 ) : 0 ) ) : 0 ) ) )                                    
+
+#define is_VERTWS_utf8(s) /*** Vertical Whitespace: \v \V ***/              \
+( ( ((U8*)s)[0]==194 ) ?                                                    \
+    ( ( ((U8*)s)[1]==133 ) ? 2 : 0 ) :                                      \
+  ( ( ((U8*)s)[0]==226 ) ?                                                  \
+    ((( ((U8*)s)[1]==128 ) && ( ((U8*)s)[2]==168 || ((U8*)s)[2]==169 )) ? 3 : 0) :\
+  (10<=((U8*)s)[0] && ((U8*)s)[0]<=13) ) )                                  
+
+#define is_VERTWS_utf8_safe(s,e) /*** Vertical Whitespace: \v \V ***/       \
+( ( (e) - (s) > 2 ) ?                                                       \
+  ( ( ((U8*)s)[0]==194 ) ?                                                  \
+    ( ( ((U8*)s)[1]==133 ) ? 2 : 0 ) :                                      \
+  ( ( ((U8*)s)[0]==226 ) ?                                                  \
+    ((( ((U8*)s)[1]==128 ) && ( ((U8*)s)[2]==168 || ((U8*)s)[2]==169 )) ? 3 : 0) :\
+  (10<=((U8*)s)[0] && ((U8*)s)[0]<=13) ) ) :                                \
+( ( (e) - (s) > 1 ) ?                                                       \
+  ( ( ((U8*)s)[0]==194 ) ?                                                  \
+    ( ( ((U8*)s)[1]==133 ) ? 2 : 0 ) :                                      \
+  (10<=((U8*)s)[0] && ((U8*)s)[0]<=13) ) :                                  \
+( ( (e) - (s) > 0 ) ?                                                       \
+  (10<=((U8*)s)[0] && ((U8*)s)[0]<=13) : 0 ) ) )                            
+
+#define is_VERTWS_latin1(s) /*** Vertical Whitespace: \v \V ***/            \
+( (10<=((U8*)s)[0] && ((U8*)s)[0]<=13) || ((U8*)s)[0]==133 )                
+
+#define is_VERTWS_latin1_safe(s,e) /*** Vertical Whitespace: \v \V ***/     \
+( ( (e) - (s) > 0 ) ?                                                       \
+  ( (10<=((U8*)s)[0] && ((U8*)s)[0]<=13) || ((U8*)s)[0]==133 ) : 0 )        
+
+#define is_VERTWS_cp(cp) /*** Vertical Whitespace: \v \V ***/               \
+( (10<=cp && cp<=13) || ( cp > 13 && ( cp==133 || ( cp > 133 && ( cp==8232 || ( cp > 8232 && cp==8233 ) ) ) ) ) )
+
index e24d146..48a8a30 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -2400,6 +2400,34 @@ typedef struct scan_frame {
 
 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
 
+#define CASE_SYNST_FNC(nAmE)                                       \
+case nAmE:                                                         \
+    if (flags & SCF_DO_STCLASS_AND) {                              \
+           for (value = 0; value < 256; value++)                  \
+               if (!is_ ## nAmE ## _cp(value))                       \
+                   ANYOF_BITMAP_CLEAR(data->start_class, value);  \
+    }                                                              \
+    else {                                                         \
+           for (value = 0; value < 256; value++)                  \
+               if (is_ ## nAmE ## _cp(value))                        \
+                   ANYOF_BITMAP_SET(data->start_class, value);    \
+    }                                                              \
+    break;                                                         \
+case N ## nAmE:                                                    \
+    if (flags & SCF_DO_STCLASS_AND) {                              \
+           for (value = 0; value < 256; value++)                   \
+               if (is_ ## nAmE ## _cp(value))                         \
+                   ANYOF_BITMAP_CLEAR(data->start_class, value);   \
+    }                                                               \
+    else {                                                          \
+           for (value = 0; value < 256; value++)                   \
+               if (!is_ ## nAmE ## _cp(value))                        \
+                   ANYOF_BITMAP_SET(data->start_class, value);     \
+    }                                                               \
+    break
+
+
+
 STATIC I32
 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                         I32 *minlenp, I32 *deltap,
@@ -3330,6 +3358,34 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                break;
            }
        }
+       else if (OP(scan) == LNBREAK) {
+           if (flags & SCF_DO_STCLASS) {
+               int value = 0;
+               data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
+               if (flags & SCF_DO_STCLASS_AND) {
+                    for (value = 0; value < 256; value++)
+                        if (!is_LNBREAK_cp(value))                   
+                            ANYOF_BITMAP_CLEAR(data->start_class, value);  
+                }                                                              
+                else {                                                         
+                    for (value = 0; value < 256; value++)
+                        if (is_LNBREAK_cp(value))                    
+                            ANYOF_BITMAP_SET(data->start_class, value);           
+                }                                                              
+                if (flags & SCF_DO_STCLASS_OR)
+                   cl_and(data->start_class, and_withp);
+               flags &= ~SCF_DO_STCLASS;
+            }
+           min += 1;
+           delta += 2;
+            if (flags & SCF_DO_SUBSTR) {
+               SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
+               data->pos_min += 1;
+               data->pos_delta += 2;
+               data->longest = &(data->longest_float);
+           }
+           
+       }
        else if (strchr((const char*)PL_simple,OP(scan))) {
            int value = 0;
 
@@ -3524,6 +3580,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                        }
                    }
                    break;
+               CASE_SYNST_FNC(VERTWS);
+               CASE_SYNST_FNC(HORIZWS);
+               
                }
                if (flags & SCF_DO_STCLASS_OR)
                    cl_and(data->start_class, and_withp);
@@ -3894,6 +3953,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
        }
 #endif /* old or new */
 #endif /* TRIE_STUDY_OPT */    
+
        /* Else: zero-length, ignore. */
        scan = regnext(scan);
     }
@@ -6585,15 +6645,25 @@ tryagain:
            ret = reg_node(pRExC_state, NDIGIT);
            *flagp |= HASWIDTH|SIMPLE;
            goto finish_meta_pat;
+       case 'R':
+           ret = reg_node(pRExC_state, LNBREAK);
+           *flagp |= HASWIDTH|SIMPLE;
+           goto finish_meta_pat;
+       case 'h':
+           ret = reg_node(pRExC_state, HORIZWS);
+           *flagp |= HASWIDTH|SIMPLE;
+           goto finish_meta_pat;
+       case 'H':
+           ret = reg_node(pRExC_state, NHORIZWS);
+           *flagp |= HASWIDTH|SIMPLE;
+           goto finish_meta_pat;
        case 'v':
-           ret = reganode(pRExC_state, PRUNE, 0);
-           ret->flags = 1;
-           *flagp |= SIMPLE;
+           ret = reg_node(pRExC_state, VERTWS);
+           *flagp |= HASWIDTH|SIMPLE;
            goto finish_meta_pat;
        case 'V':
-           ret = reganode(pRExC_state, SKIP, 0);
-           ret->flags = 1;
-           *flagp |= SIMPLE;
+           ret = reg_node(pRExC_state, NVERTWS);
+           *flagp |= HASWIDTH|SIMPLE;
          finish_meta_pat:          
            nextchar(pRExC_state);
             Set_Node_Length(ret, 2); /* MJD */
@@ -6815,11 +6885,13 @@ tryagain:
                    case 'C':             /* Single char !DANGEROUS! */
                    case 'd': case 'D':   /* digit class */
                    case 'g': case 'G':   /* generic-backref, pos assertion */
+                   case 'h': case 'H':   /* HORIZWS */
                    case 'k': case 'K':   /* named backref, keep marker */
                    case 'N':             /* named char sequence */
                    case 'p': case 'P':   /* unicode property */
+                             case 'R':   /* LNBREAK */
                    case 's': case 'S':   /* space class */
-                   case 'v': case 'V':   /* (*PRUNE) and (*SKIP) */
+                   case 'v': case 'V':   /* VERTWS */
                    case 'w': case 'W':   /* word class */
                    case 'X':             /* eXtended Unicode "combining character sequence" */
                    case 'z': case 'Z':   /* End of line/string assertion */
@@ -7242,6 +7314,21 @@ case ANYOF_N##NAME:                                     \
     what = WORD;                                        \
     break
 
+#define _C_C_T_NOLOC_(NAME,TEST,WORD)                   \
+ANYOF_##NAME:                                           \
+       for (value = 0; value < 256; value++)           \
+           if (TEST)                                   \
+               ANYOF_BITMAP_SET(ret, value);           \
+    yesno = '+';                                        \
+    what = WORD;                                        \
+    break;                                              \
+case ANYOF_N##NAME:                                     \
+       for (value = 0; value < 256; value++)           \
+           if (!TEST)                                  \
+               ANYOF_BITMAP_SET(ret, value);           \
+    yesno = '!';                                        \
+    what = WORD;                                        \
+    break
 
 /*
    parse a class specification and produce either an ANYOF node that
@@ -7254,10 +7341,10 @@ STATIC regnode *
 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
 {
     dVAR;
-    register UV value = 0;
     register UV nextvalue;
     register IV prevvalue = OOB_UNICODE;
     register IV range = 0;
+    UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
     register regnode *ret;
     STRLEN numlen;
     IV namedclass;
@@ -7360,6 +7447,10 @@ parseit:
            case 'S':   namedclass = ANYOF_NSPACE;      break;
            case 'd':   namedclass = ANYOF_DIGIT;       break;
            case 'D':   namedclass = ANYOF_NDIGIT;      break;
+           case 'v':   namedclass = ANYOF_VERTWS;      break;
+           case 'V':   namedclass = ANYOF_NVERTWS;     break;
+           case 'h':   namedclass = ANYOF_HORIZWS;     break;
+           case 'H':   namedclass = ANYOF_NHORIZWS;    break;
             case 'N':  /* Handle \N{NAME} in class */
                 {
                     /* We only pay attention to the first char of 
@@ -7538,6 +7629,8 @@ parseit:
                case _C_C_T_(SPACE, isSPACE(value), "SpacePerl");
                case _C_C_T_(UPPER, isUPPER(value), "Upper");
                case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit");
+               case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
+               case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
                case ANYOF_ASCII:
                    if (LOC)
                        ANYOF_CLASS_SET(ret, ANYOF_ASCII);
index 1a0916a..70fdeb5 100644 (file)
--- a/regcomp.h
+++ b/regcomp.h
@@ -7,6 +7,7 @@
  *    License or the Artistic License, as specified in the README file.
  *
  */
+#include "regcharclass.h"
 
 typedef OP OP_4tree;                   /* Will be redefined later. */
 
@@ -177,7 +178,7 @@ struct regnode_2 {
 
 
 #define ANYOF_BITMAP_SIZE      32      /* 256 b/(8 b/B) */
-#define ANYOF_CLASSBITMAP_SIZE  4      /* up to 32 (8*4) named classes */
+#define ANYOF_CLASSBITMAP_SIZE  4      /* up to 40 (8*5) named classes */
 
 /* also used by trie */
 struct regnode_charclass {
@@ -345,6 +346,14 @@ struct regnode_charclass_class {   /* has [[:blah:]] classes */
 
 #define ANYOF_MAX      32
 
+/* pseudo classes, not stored in the class bitmap, but used as flags
+   during compilation of char classes */
+
+#define ANYOF_VERTWS   (ANYOF_MAX+1)
+#define ANYOF_NVERTWS  (ANYOF_MAX+2)
+#define ANYOF_HORIZWS  (ANYOF_MAX+3)
+#define ANYOF_NHORIZWS (ANYOF_MAX+4)
+
 /* Backward source code compatibility. */
 
 #define ANYOF_ALNUML    ANYOF_ALNUM
@@ -444,6 +453,8 @@ EXTCONST U8 PL_simple[] = {
     SPACE,     SPACEL,
     NSPACE,    NSPACEL,
     DIGIT,     NDIGIT,
+    VERTWS,     NVERTWS,
+    HORIZWS,    NHORIZWS,
     0
 };
 #endif
@@ -799,3 +810,4 @@ re.pm, especially to the documentation.
 #endif /* DEBUG RELATED DEFINES */
 
 
+
index c57a386..070fe98 100644 (file)
@@ -185,7 +185,14 @@ CUTGROUP   VERB,      no-sv 1      On failure go to the next alternation in the group
 #*Control what to keep in $&.
 KEEPS          KEEPS,  no      $& begins here.
 
-# NEW STUFF ABOVE THIS LINE -- Please update counts below. 
+#*New charclass like patterns
+LNBREAK                LNBREAK,   none         generic newline pattern
+VERTWS         VERTWS,    none         vertical whitespace         (Perl 6)
+NVERTWS                NVERTWS,   none         not vertical whitespace     (Perl 6)
+HORIZWS                HORIZWS,   none         horizontal whitespace       (Perl 6)
+NHORIZWS       NHORIZWS,  none         not horizontal whitespace   (Perl 6)
+
+# NEW STUFF ABOVE THIS LINE  
 
 ################################################################################
 
index 1eb7ff2..fa853a4 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -1110,6 +1110,15 @@ REXEC_FBC_SCAN(                                       \
 if ((!reginfo || regtry(reginfo, &s))) \
     goto got_it
 
+#define REXEC_FBC_CSCAN(CoNdUtF8,CoNd)                         \
+    if (do_utf8) {                                             \
+       REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
+    }                                                          \
+    else {                                                     \
+       REXEC_FBC_CLASS_SCAN(CoNd);                            \
+    }                                                          \
+    break
+    
 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd)      \
     if (do_utf8) {                                             \
        UtFpReLoAd;                                            \
@@ -1425,6 +1434,31 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                !isDIGIT_LC_utf8((U8*)s),
                !isDIGIT_LC(*s)
            );
+       case LNBREAK:
+           REXEC_FBC_CSCAN(
+               is_LNBREAK_utf8(s),
+               is_LNBREAK_latin1(s)
+           );
+       case VERTWS:
+           REXEC_FBC_CSCAN(
+               is_VERTWS_utf8(s),
+               is_VERTWS_latin1(s)
+           );
+       case NVERTWS:
+           REXEC_FBC_CSCAN(
+               !is_VERTWS_utf8(s),
+               !is_VERTWS_latin1(s)
+           );
+       case HORIZWS:
+           REXEC_FBC_CSCAN(
+               is_HORIZWS_utf8(s),
+               is_HORIZWS_latin1(s)
+           );
+       case NHORIZWS:
+           REXEC_FBC_CSCAN(
+               !is_HORIZWS_utf8(s),
+               !is_HORIZWS_latin1(s)
+           );      
        case AHOCORASICKC:
        case AHOCORASICK: 
            {
@@ -3207,8 +3241,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
                      * pack("U0U*", 0xDF) =~ /ss/i,
                      * the 0xC3 0x9F are the UTF-8
                      * byte sequence for the U+00DF. */
+
                     if (!(do_utf8 &&
-                          toLOWER(s[0]) == 's' &&
+                          toLOWER(s[0]) == 's' &&
                           ln >= 2 &&
                           toLOWER(s[1]) == 's' &&
                           (U8)l[0] == 0xC3 &&
@@ -4972,6 +5007,35 @@ NULL
             /* NOTREACHED */
 #undef ST
 
+        case LNBREAK:
+            if ((n=is_LNBREAK(locinput,do_utf8))) {
+                locinput += n;
+                nextchr = UCHARAT(locinput);
+            } else
+                sayNO;
+            break;
+
+#define CASE_CLASS(nAmE)                              \
+        case nAmE:                                    \
+            if ((n=is_##nAmE(locinput,do_utf8))) {    \
+                locinput += n;                        \
+                nextchr = UCHARAT(locinput);          \
+            } else                                    \
+                sayNO;                                \
+            break;                                    \
+        case N##nAmE:                                 \
+            if ((n=is_##nAmE(locinput,do_utf8))) {    \
+                sayNO;                                \
+            } else {                                  \
+                locinput += UTF8SKIP(locinput);       \
+                nextchr = UCHARAT(locinput);          \
+            }                                         \
+            break
+
+        CASE_CLASS(VERTWS);
+        CASE_CLASS(HORIZWS);
+#undef CASE_CLASS
+
        default:
            PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
                          PTR2UV(scan), OP(scan));
@@ -5382,7 +5446,77 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
            while (scan < loceol && !isDIGIT(*scan))
                scan++;
        }
+    case LNBREAK:
+        if (do_utf8) {
+           loceol = PL_regeol;
+           while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
+               scan += c;
+               hardcount++;
+           }
+       } else {
+           /*
+             LNBREAK can match two latin chars, which is ok,
+             because we have a null terminated string, but we
+             have to use hardcount in this situation
+           */
+           while (scan < loceol && (c=is_LNBREAK_latin1(scan)))  {
+               scan+=c;
+               hardcount++;
+           }
+       }       
+       break;
+    case HORIZWS:
+        if (do_utf8) {
+           loceol = PL_regeol;
+           while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
+               scan += c;
+               hardcount++;
+           }
+       } else {
+           while (scan < loceol && is_HORIZWS_latin1(scan)) 
+               scan++;         
+       }       
        break;
+    case NHORIZWS:
+        if (do_utf8) {
+           loceol = PL_regeol;
+           while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
+               scan += UTF8SKIP(scan);
+               hardcount++;
+           }
+       } else {
+           while (scan < loceol && !is_HORIZWS_latin1(scan))
+               scan++;
+
+       }       
+       break;
+    case VERTWS:
+        if (do_utf8) {
+           loceol = PL_regeol;
+           while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
+               scan += c;
+               hardcount++;
+           }
+       } else {
+           while (scan < loceol && is_VERTWS_latin1(scan)) 
+               scan++;
+
+       }       
+       break;
+    case NVERTWS:
+        if (do_utf8) {
+           loceol = PL_regeol;
+           while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
+               scan += UTF8SKIP(scan);
+               hardcount++;
+           }
+       } else {
+           while (scan < loceol && !is_VERTWS_latin1(scan)) 
+               scan++;
+          
+       }       
+       break;
+
     default:           /* Called on something of 0 width. */
        break;          /* So match right here or not at all. */
     }
index e704427..3c3a5d6 100644 (file)
@@ -6,8 +6,8 @@
 
 /* Regops and State definitions */
 
-#define REGNODE_MAX            84
-#define REGMATCH_STATE_MAX     124
+#define REGNODE_MAX            89
+#define REGMATCH_STATE_MAX     129
 
 #define        END                     0       /* 0000 End of program. */
 #define        SUCCEED                 1       /* 0x01 Return from a subroutine, basically. */
 #define        COMMIT                  80      /* 0x50 Pattern fails outright if backtracking through this */
 #define        CUTGROUP                81      /* 0x51 On failure go to the next alternation in the group */
 #define        KEEPS                   82      /* 0x52 $& begins here. */
-#define        OPTIMIZED               83      /* 0x53 Placeholder for dump. */
-#define        PSEUDO                  84      /* 0x54 Pseudo opcode for internal use. */
+#define        LNBREAK                 83      /* 0x53 generic newline pattern */
+#define        VERTWS                  84      /* 0x54 vertical whitespace         (Perl 6) */
+#define        NVERTWS                 85      /* 0x55 not vertical whitespace     (Perl 6) */
+#define        HORIZWS                 86      /* 0x56 horizontal whitespace       (Perl 6) */
+#define        NHORIZWS                87      /* 0x57 not horizontal whitespace   (Perl 6) */
+#define        OPTIMIZED               88      /* 0x58 Placeholder for dump. */
+#define        PSEUDO                  89      /* 0x59 Pseudo opcode for internal use. */
        /* ------------ States ------------- */
 #define        TRIE_next               (REGNODE_MAX + 1)       /* state for TRIE */
 #define        TRIE_next_fail          (REGNODE_MAX + 2)       /* state for TRIE */
@@ -225,6 +230,11 @@ EXTCONST U8 PL_regkind[] = {
        VERB,           /* COMMIT                 */
        VERB,           /* CUTGROUP               */
        KEEPS,          /* KEEPS                  */
+       LNBREAK,        /* LNBREAK                */
+       VERTWS,         /* VERTWS                 */
+       NVERTWS,        /* NVERTWS                */
+       HORIZWS,        /* HORIZWS                */
+       NHORIZWS,       /* NHORIZWS               */
        NOTHING,        /* OPTIMIZED              */
        PSEUDO,         /* PSEUDO                 */
        /* ------------ States ------------- */
@@ -358,6 +368,11 @@ static const U8 regarglen[] = {
        EXTRA_SIZE(struct regnode_1),           /* COMMIT       */
        EXTRA_SIZE(struct regnode_1),           /* CUTGROUP     */
        0,                                      /* KEEPS        */
+       0,                                      /* LNBREAK      */
+       0,                                      /* VERTWS       */
+       0,                                      /* NVERTWS      */
+       0,                                      /* HORIZWS      */
+       0,                                      /* NHORIZWS     */
        0,                                      /* OPTIMIZED    */
        0,                                      /* PSEUDO       */
 };
@@ -448,6 +463,11 @@ static const char reg_off_by_arg[] = {
        0,      /* COMMIT       */
        0,      /* CUTGROUP     */
        0,      /* KEEPS        */
+       0,      /* LNBREAK      */
+       0,      /* VERTWS       */
+       0,      /* NVERTWS      */
+       0,      /* HORIZWS      */
+       0,      /* NHORIZWS     */
        0,      /* OPTIMIZED    */
        0,      /* PSEUDO       */
 };
@@ -543,8 +563,13 @@ EXTCONST char * const PL_reg_name[] = {
        "COMMIT",                       /* 0x50 */
        "CUTGROUP",                     /* 0x51 */
        "KEEPS",                        /* 0x52 */
-       "OPTIMIZED",                    /* 0x53 */
-       "PSEUDO",                       /* 0x54 */
+       "LNBREAK",                      /* 0x53 */
+       "VERTWS",                       /* 0x54 */
+       "NVERTWS",                      /* 0x55 */
+       "HORIZWS",                      /* 0x56 */
+       "NHORIZWS",                     /* 0x57 */
+       "OPTIMIZED",                    /* 0x58 */
+       "PSEUDO",                       /* 0x59 */
        /* ------------ States ------------- */
        "TRIE_next",                    /* REGNODE_MAX +0x01 */
        "TRIE_next_fail",               /* REGNODE_MAX +0x02 */
index 1af8fb3..a5b98f6 100755 (executable)
@@ -3386,7 +3386,7 @@ ok(("foba  ba${s}pxySS$s$s" =~ qr/(b(?:a${s}t|a${s}f|a${s}p)[xy]+$s*)/i)
 }
 
 
-
+print "# set PERL_SKIP_PSYCHO_TEST to skip this test\n";
 if (!$ENV{PERL_SKIP_PSYCHO_TEST}){
     my @normal=qw(these are some normal words);
     my $psycho=join "|",@normal,map chr $_,255..20000;
@@ -3773,6 +3773,7 @@ sub iseq($$;$) {
 if ($ENV{PERL_SKIP_PSYCHO_TEST}){
   printf "ok %d Skip: No psycho tests\n", $test++;
 } else {    
+  print "# set PERL_SKIP_PSYCHO_TEST to skip this test\n";
   my $r = qr/^
            (?:
                ( (?:a|z+)+ )
@@ -3913,25 +3914,6 @@ for my $c ("z", "\0", "!", chr(254), chr(256)) {
     1 while /.(??{'(*PRUNE)'})(?{$count++})(*FAIL)/g;
     iseq($count,4,"/.(*PRUNE)/");
 }
-{   # Test the \v form of the (*PRUNE) pattern
-    our $count = 0;
-    'aaab'=~/a+b?(?{$count++})(*FAIL)/;
-    iseq($count,9,"expect 9 for no \\v");
-    $count = 0;
-    'aaab'=~/a+b?\v(?{$count++})(*FAIL)/;
-    iseq($count,3,"expect 3 with \\v");
-    local $_='aaab';
-    $count=0;
-    1 while /.\v(?{$count++})(*FAIL)/g;
-    iseq($count,4,"/.\\v/");
-    $count = 0;
-    'aaab'=~/a+b?(??{'\v'})(?{$count++})(*FAIL)/;
-    iseq($count,3,"expect 3 with \\v");
-    local $_='aaab';
-    $count=0;
-    1 while /.(??{'\v'})(?{$count++})(*FAIL)/g;
-    iseq($count,4,"/.\\v/");
-}
 {   # Test the (*SKIP) pattern
     our $count = 0;
     'aaab'=~/a+b?(*SKIP)(?{$count++})(*FAIL)/;
@@ -3947,21 +3929,6 @@ for my $c ("z", "\0", "!", chr(254), chr(256)) {
     iseq($count,2,"Expect 2 with (*SKIP)" );
     iseq("@res","aaab aaab","adjacent (*SKIP) works as expected" );
 }
-{   # Test the \V form of the (*SKIP) pattern
-    our $count = 0;
-    'aaab'=~/a+b?\V(?{$count++})(*FAIL)/;
-    iseq($count,1,"expect 1 with \\V");
-    local $_='aaab';
-    $count=0;
-    1 while /.\V(?{$count++})(*FAIL)/g;
-    iseq($count,4,"/.\\V/");
-    $_='aaabaaab';
-    $count=0;
-    our @res=();
-    1 while /(a+b?)\V(?{$count++; push @res,$1})(*FAIL)/g;
-    iseq($count,2,"Expect 2 with \\V" );
-    iseq("@res","aaab aaab","adjacent \\V works as expected" );
-}
 {   # Test the (*SKIP) pattern
     our $count = 0;
     'aaab'=~/a+b?(*MARK:foo)(*SKIP)(?{$count++})(*FAIL)/;
@@ -4345,7 +4312,41 @@ sub kt
         iseq("$1$2",'foooooobaaaaar');    
     }
     iseq("$1$2","foobar");
+}
+{
+    local $_="\t \r\n \n \t".chr(11)."\n";
+    s/\H/H/g;
+    s/\h/h/g;
+    iseq($_,"hhHHhHhhHH");
+    $_="\t \r\n \n \t".chr(11)."\n";
+    utf8::upgrade($_);
+    s/\H/H/g;
+    s/\h/h/g;
+    iseq($_,"hhHHhHhhHH");
 }    
+{
+    my @h=map { chr( $_ ) } (
+        0x09,   0x20,   0xa0,   0x1680, 0x180e, 0x2000, 0x2001, 0x2002,
+        0x2003, 0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200a,
+        0x202f, 0x205f, 0x3000
+    );
+    my @v=map { chr( $_ ) } ( 0x0a, 0x0b, 0x0c, 0x0d, 0x85, 0x2028, 0x2029 );
+    my @lb=( "\x0D\x0A",
+             map { chr( $_ ) } ( 0x0A..0x0D,0x85,0x2028,0x2029 ));
+    foreach my $t ([\@h,qr/\h/,qr/\h+/],[\@v,qr/\v/,qr/\v+/],[\@lb,qr/\R/,qr/\R+/],){
+        my $ary=shift @$t;
+        foreach my $pat (@$t) {
+            foreach my $str (@$ary) {
+                ok($str=~/($pat)/);
+                iseq($1,$str);
+                utf8::upgrade($str);
+                ok($str=~/($pat)/);
+                iseq($1,$str);
+            }
+        }
+    }
+}
+
 # Test counter is at bottom of file. Put new tests above here.
 #-------------------------------------------------------------------
 # Keep the following tests last -- they may crash perl
@@ -4427,7 +4428,8 @@ ok($@=~/\QSequence \k... not terminated in regex;\E/);
 iseq(0+$::test,$::TestCount,"Got the right number of tests!");
 # Don't forget to update this!
 BEGIN {
-    $::TestCount = 1663;
+    $::TestCount = 1928;
     print "1..$::TestCount\n";
 }
 
+
index aa07b56..6eb03a2 100644 (file)
@@ -1283,3 +1283,44 @@ X(\w+)(?=\s)|X(\w+)      Xab     y       [$1-$2] [-ab]
 (?(DEFINE)(?<A>(?&B)+)(?<B>a))(?&A)    a       y       $&      a
 (?(DEFINE)(?<A>(?&B)+)(?<B>a))(?&A)    aa      y       $&      aa
 \x{100}?(??{""})xxx    xxx     y       $&      xxx
+
+foo(\R)bar     foo\r\nbar      y       $1      \r\n
+foo(\R)bar     foo\nbar        y       $1      \n
+foo(\R)bar     foo\rbar        y       $1      \r
+
+foo(\R+)bar    foo\r\n\x{85}\r\n\nbar  y       $1      \r\n\x{85}\r\n\n
+(\V+)(\R)      foo\r\n\x{85}\r\n\nbar  y       $1-$2   foo-\r\n
+(\R+)(\V)      foo\r\n\x{85}\r\n\nbar  y       $1-$2   \r\n\x{85}\r\n\n-b
+foo(\R)bar     foo\x{85}bar    y       $1      \x{85}
+(\V)(\R)       foo\x{85}bar    y       $1-$2   o-\x{85}
+(\R)(\V)       foo\x{85}bar    y       $1-$2   \x{85}-b
+foo(\R)bar     foo\r\nbar      y       $1      \r\n
+(\V)(\R)       foo\r\nbar      y       $1-$2   o-\r\n
+(\R)(\V)       foo\r\nbar      y       $1-$2   \r\n-b
+foo(\R)bar     foo\r\nbar      y       $1      \r\n
+(\V)(\R)       foo\r\nbar      y       $1-$2   o-\r\n
+(\R)(\V)       foo\r\nbar      y       $1-$2   \r\n-b
+foo(\R)bar     foo\rbar        y       $1      \r
+(\V)(\R)       foo\rbar        y       $1-$2   o-\r
+(\R)(\V)       foo\rbar        y       $1-$2   \r-b
+
+foo(\v+)bar    foo\r\n\x{85}\r\n\nbar  y       $1      \r\n\x{85}\r\n\n
+(\V+)(\v)      foo\r\n\x{85}\r\n\nbar  y       $1-$2   foo-\r
+(\v+)(\V)      foo\r\n\x{85}\r\n\nbar  y       $1-$2   \r\n\x{85}\r\n\n-b
+foo(\v)bar     foo\x{85}bar    y       $1      \x{85}
+(\V)(\v)       foo\x{85}bar    y       $1-$2   o-\x{85}
+(\v)(\V)       foo\x{85}bar    y       $1-$2   \x{85}-b
+foo(\v)bar     foo\rbar        y       $1      \r
+(\V)(\v)       foo\rbar        y       $1-$2   o-\r
+(\v)(\V)       foo\rbar        y       $1-$2   \r-b
+
+
+foo(\h+)bar    foo\t\x{A0}bar  y       $1      \t\x{A0}
+(\H+)(\h)      foo\t\x{A0}bar  y       $1-$2   foo-\t
+(\h+)(\H)      foo\t\x{A0}bar  y       $1-$2   \t\x{A0}-b
+foo(\h)bar     foo\x{A0}bar    y       $1      \x{A0}
+(\H)(\h)       foo\x{A0}bar    y       $1-$2   o-\x{A0}
+(\h)(\H)       foo\x{A0}bar    y       $1-$2   \x{A0}-b
+foo(\h)bar     foo\tbar        y       $1      \t
+(\H)(\h)       foo\tbar        y       $1-$2   o-\t
+(\h)(\H)       foo\tbar        y       $1-$2   \t-b
index 919a239..7ad7d89 100755 (executable)
@@ -127,6 +127,9 @@ EOFCODE
                 \$got = "$repl";
 EOFCODE
         }
+        #$code.=qq[\n\$expect="$expect";\n];
+        #use Devel::Peek;
+        #die Dump($code) if $pat=~/\\h/ and $subject=~/\x{A0}/;
        {
            # Probably we should annotate specific tests with which warnings
            # categories they're known to trigger, and hence should be