Document \s change for VT, commit 075b9d7d9a6d4473b240a047655e507c8baa6db3

author Karl Williamson <public@khwilliamson.com>

Sun, 24 Feb 2013 18:17:19 +0000 (11:17 -0700)

committer Karl Williamson <public@khwilliamson.com>

Sun, 24 Feb 2013 19:23:06 +0000 (12:23 -0700)
author Karl Williamson <public@khwilliamson.com>
Sun, 24 Feb 2013 18:17:19 +0000 (11:17 -0700)
committer Karl Williamson <public@khwilliamson.com>
Sun, 24 Feb 2013 19:23:06 +0000 (12:23 -0700)
diff --git a/lib/unicore/mktables b/lib/unicore/mktables

index ae8af02..c364c83 100644 (file)
--- a/lib/unicore/mktables
+++ b/lib/unicore/mktables
@@ -13209,7 +13209,7 @@ sub compile_perl() {
      # No Posix equivalent for vertical space
  
      my $Space = $perl->add_match_table('Space',
-                Description => '\s including beyond ASCII plus vertical tab',
+                Description => '\s including beyond ASCII and vertical tab',
                  Initialize => $Blank + $VertSpace,
      );
      $Space->add_alias('XPosixSpace');
@@ -13218,7 +13218,7 @@ sub compile_perl() {
                              Initialize => $Space & $ASCII,
                              );
  
-    # Perl's traditional space doesn't include Vertical Tab
+    # Perl's traditional space didn't include Vertical Tab prior to v5.18
      my $XPerlSpace = $perl->add_match_table('XPerlSpace',
                                    Description => '\s, including beyond ASCII',
                                    #Initialize => $Space - 0x000B,
diff --git a/pod/perlre.pod b/pod/perlre.pod

index d6a405f..343cbda 100644 (file)
--- a/pod/perlre.pod
+++ b/pod/perlre.pod
@@ -353,7 +353,8 @@ When it appears singly, it causes the sequences C<\d>, C<\s>, C<\w>, and
  the Posix character classes to match only in the ASCII range.  They thus
  revert to their pre-5.6, pre-Unicode meanings.  Under C</a>,  C<\d>
  always means precisely the digits C<"0"> to C<"9">; C<\s> means the five
-characters C<[ \f\n\r\t]>; C<\w> means the 63 characters
+characters C<[ \f\n\r\t]>, and starting in Perl v5.18, experimentally,
+the vertical tab; C<\w> means the 63 characters
  C<[A-Za-z0-9_]>; and likewise, all the Posix classes such as
  C<[[:print:]]> match only the appropriate ASCII-range characters.
  
diff --git a/pod/perlrecharclass.pod b/pod/perlrecharclass.pod

index 7478932..4cc3dfd 100644 (file)
--- a/pod/perlrecharclass.pod
+++ b/pod/perlrecharclass.pod
@@ -209,9 +209,11 @@ C<\s> matches any single character considered whitespace.
  
  =item If the C</a> modifier is in effect ...
  
-C<\s> matches the 5 characters [\t\n\f\r ]; that is, the horizontal tab,
-the newline, the form feed, the carriage return, and the space.  (Note
-that it doesn't match the vertical tab, C<\cK> on ASCII platforms.)
+In all Perl versions, C<\s> matches the 5 characters [\t\n\f\r ]; that
+is, the horizontal tab,
+the newline, the form feed, the carriage return, and the space.
+Starting in Perl v5.18, experimentally, it also matches the vertical tab, C<\cK>.
+See note C<[1]> below for a discussion of this.
  
  =item otherwise ...
  
@@ -228,9 +230,7 @@ in the table below.
  
  =item if locale rules are in effect ...
  
-C<\s> matches whatever the locale considers to be whitespace.  Note that
-this is likely to include the vertical space, unlike non-locale C<\s>
-matching.
+C<\s> matches whatever the locale considers to be whitespace.
  
  =item if Unicode rules are in effect or if on an EBCDIC platform ...
  
@@ -239,7 +239,9 @@ table below.
  
  =item otherwise ...
  
-C<\s> matches [\t\n\f\r ].
+C<\s> matches [\t\n\f\r ] and, starting experimentally in Perl
+v5.18, the vertical tab, C<\cK>.
+(See note C<[1]> below for a discussion of this.)
  Note that this list doesn't include the non-breaking space.
  
  =back
@@ -278,9 +280,9 @@ Note that unlike C<\s> (and C<\d> and C<\w>), C<\h> and C<\v> always match
  the same characters, without regard to other factors, such as the active
  locale or whether the source string is in UTF-8 format.
  
-One might think that C<\s> is equivalent to C<[\h\v]>. This is not true.
-The difference is that the vertical tab (C<"\x0b">) is not matched by
-C<\s>; it is however considered vertical whitespace.
+One might think that C<\s> is equivalent to C<[\h\v]>. This is indeed true
+starting in Perl v5.18, but prior to that, the sole difference was that the
+vertical tab (C<"\cK">) was not matched by C<\s>.
  
  The following table is a complete listing of characters matched by
  C<\s>, C<\h> and C<\v> as of Unicode 6.0.
@@ -292,12 +294,12 @@ page is in effect that changes the C<\s> matching).
  
   0x0009        CHARACTER TABULATION   h s
   0x000a              LINE FEED (LF)    vs
- 0x000b             LINE TABULATION    v
+ 0x000b             LINE TABULATION    vs  [1]
   0x000c              FORM FEED (FF)    vs
   0x000d        CARRIAGE RETURN (CR)    vs
   0x0020                       SPACE   h s
- 0x0085             NEXT LINE (NEL)    vs  [1]
- 0x00a0              NO-BREAK SPACE   h s  [1]
+ 0x0085             NEXT LINE (NEL)    vs  [2]
+ 0x00a0              NO-BREAK SPACE   h s  [2]
   0x1680            OGHAM SPACE MARK   h s
   0x180e   MONGOLIAN VOWEL SEPARATOR   h s
   0x2000                     EN QUAD   h s
@@ -321,6 +323,16 @@ page is in effect that changes the C<\s> matching).
  
  =item [1]
  
+Prior to Perl v5.18, C<\s> did not match the vertical tab.  The change
+in v5.18 is considered an experiment, which means it could be backed out
+in v5.20 or v5.22 if experience indicates that it breaks too much
+existing code.  If this change adversely affects you, send email to
+C<perlbug@perl.org>; if it affects you positively, email
+C<perlthanks@perl.org>.  In the meantime, C<[^\S\cK]> (obscurely)
+matches what C<\s> traditionally did.
+
+=item [2]
+
  NEXT LINE and NO-BREAK SPACE may or may not match C<\s> depending
  on the rules in effect.  See
  L<the beginning of this section|/Whitespace>.
@@ -666,7 +678,8 @@ Perl recognizes the following POSIX character classes:
   lower  Any lowercase character ("[a-z]").
   print  Any printable character, including a space.  See Note [4] below.
   punct  Any graphical character excluding "word" characters.  Note [5].
- space  Any whitespace character. "\s" plus the vertical tab ("\cK").
+ space  Any whitespace character. "\s" including the vertical tab
+        ("\cK").
   upper  Any uppercase character ("[A-Z]").
   word   A Perl extension ("[A-Za-z0-9_]"), equivalent to "\w".
   xdigit Any hexadecimal digit ("[0-9a-fA-F]").
@@ -757,9 +770,10 @@ Unicode considers symbols.
  
  =item [6]
  
-C<\p{SpacePerl}> and C<\p{Space}> differ only in that in non-locale
-matching, C<\p{Space}> additionally
-matches the vertical tab, C<\cK>.   Same for the two ASCII-only range forms.
+C<\p{SpacePerl}> and C<\p{Space}> match identically starting with Perl
+v5.18.  In earlier versions, these differed only in that in non-locale
+matching, C<\p{SpacePerl}> did not match the vertical tab, C<\cK>.
+Same for the two ASCII-only range forms.
  
  =back
  
diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod

index 86db3ec..7a0b915 100644 (file)
--- a/pod/perlunicode.pod
+++ b/pod/perlunicode.pod
@@ -721,7 +721,8 @@ This is a synonym for C<\p{Present_In=*}>
  
  =item B<C<\p{PerlSpace}>>
  
-This is the same as C<\s>, restricted to ASCII, namely C<S<[ \f\n\r\t]>>.
+This is the same as C<\s>, restricted to ASCII, namely C<S<[ \f\n\r\t]>>,
+and, starting in Perl v5.18, experimentally, the vertical tab.
  
  Mnemonic: Perl's (original) space
author	Karl Williamson <public@khwilliamson.com>
	Sun, 24 Feb 2013 18:17:19 +0000 (11:17 -0700)
committer	Karl Williamson <public@khwilliamson.com>
	Sun, 24 Feb 2013 19:23:06 +0000 (12:23 -0700)
lib/unicore/mktables		patch \| blob \| blame \| history
pod/perlre.pod		patch \| blob \| blame \| history
pod/perlrecharclass.pod		patch \| blob \| blame \| history
pod/perlunicode.pod		patch \| blob \| blame \| history