#if defined(PERL_IN_PERL_C)
static const UV _Perl_WB_invlist[] = { /* for ASCII/Latin1 */
- 1524, /* Number of elements */
+ 1535, /* Number of elements */
148565664, /* Version and data structure type */
0, /* 0 if the list starts at 0;
1 if it starts at the element beyond 0 */
0x0,
+ 0x9,
0xA,
0xB,
0xD,
0xE,
+ 0x20,
+ 0x21,
0x22,
0x23,
0x27,
0x7B,
0x85,
0x86,
+ 0xA0,
+ 0xA1,
0xAA,
0xAB,
0xAD,
0x1FF5,
0x1FF6,
0x1FFD,
+ 0x2000,
+ 0x200B,
0x200C,
0x200E,
0x2010,
0x2028,
0x202A,
0x202F,
+ 0x2030,
0x203F,
0x2041,
0x2044,
0x2045,
0x2054,
0x2055,
+ 0x205F,
0x2060,
0x2065,
0x2066,
0x2E00,
0x2E2F,
0x2E30,
+ 0x3000,
+ 0x3001,
0x3005,
0x3006,
0x302A,
#if defined(PERL_IN_REGEXEC_C)
-#define WB_ENUM_COUNT 19
+#define WB_ENUM_COUNT 20
typedef enum {
WB_Other = 0,
WB_MidNumLet = 13,
WB_Newline = 14,
WB_Numeric = 15,
- WB_Regional_Indicator = 16,
- WB_Single_Quote = 17,
- WB_UNKNOWN = 18
+ WB_Perl_Tailored_HSpace = 16,
+ WB_Regional_Indicator = 17,
+ WB_Single_Quote = 18,
+ WB_UNKNOWN = 19
} WB_enum;
static const WB_enum _Perl_WB_invmap[] = { /* for ASCII/Latin1 */
WB_Other,
+ WB_Perl_Tailored_HSpace,
WB_LF,
WB_Newline,
WB_CR,
WB_Other,
+ WB_Perl_Tailored_HSpace,
+ WB_Other,
WB_Double_Quote,
WB_Other,
WB_Single_Quote,
WB_Other,
WB_Newline,
WB_Other,
+ WB_Perl_Tailored_HSpace,
+ WB_Other,
WB_ALetter,
WB_Other,
WB_Format,
WB_ALetter,
WB_Other,
WB_ALetter,
- WB_Other,
+ WB_Perl_Tailored_HSpace,
WB_ALetter,
WB_Other,
WB_ALetter,
WB_Other,
WB_ALetter,
WB_Other,
+ WB_Perl_Tailored_HSpace,
+ WB_Other,
WB_Extend,
WB_Format,
WB_Other,
WB_MidLetter,
WB_Newline,
WB_Format,
+ WB_Perl_Tailored_HSpace,
WB_Other,
WB_ExtendNumLet,
WB_Other,
WB_Other,
WB_ExtendNumLet,
WB_Other,
+ WB_Perl_Tailored_HSpace,
WB_Format,
WB_Other,
WB_Format,
WB_Other,
WB_ALetter,
WB_Other,
+ WB_Perl_Tailored_HSpace,
+ WB_Other,
WB_ALetter,
WB_Other,
WB_Extend,
#if defined(PERL_IN_PERL_C)
static const UV _Perl_WB_invlist[] = { /* for EBCDIC 1047 */
- 1549, /* Number of elements */
+ 1558, /* Number of elements */
148565664, /* Version and data structure type */
0, /* 0 if the list starts at 0;
1 if it starts at the element beyond 0 */
0x0,
+ 0x5,
+ 0x6,
0xB,
0xD,
0xE,
0x16,
0x25,
0x26,
+ 0x40,
0x42,
0x4A,
0x4B,
0x1FF5,
0x1FF6,
0x1FFD,
+ 0x2000,
+ 0x200B,
0x200C,
0x200E,
0x2010,
0x2028,
0x202A,
0x202F,
+ 0x2030,
0x203F,
0x2041,
0x2044,
0x2045,
0x2054,
0x2055,
+ 0x205F,
0x2060,
0x2065,
0x2066,
0x2E00,
0x2E2F,
0x2E30,
+ 0x3000,
+ 0x3001,
0x3005,
0x3006,
0x302A,
#if defined(PERL_IN_REGEXEC_C)
-#define WB_ENUM_COUNT 19
+#define WB_ENUM_COUNT 20
typedef enum {
WB_Other = 0,
WB_MidNumLet = 13,
WB_Newline = 14,
WB_Numeric = 15,
- WB_Regional_Indicator = 16,
- WB_Single_Quote = 17,
- WB_UNKNOWN = 18
+ WB_Perl_Tailored_HSpace = 16,
+ WB_Regional_Indicator = 17,
+ WB_Single_Quote = 18,
+ WB_UNKNOWN = 19
} WB_enum;
static const WB_enum _Perl_WB_invmap[] = { /* for EBCDIC 1047 */
WB_Other,
+ WB_Perl_Tailored_HSpace,
+ WB_Other,
WB_Newline,
WB_CR,
WB_Other,
WB_Other,
WB_Newline,
WB_Other,
+ WB_Perl_Tailored_HSpace,
WB_ALetter,
WB_Other,
WB_MidNumLet,
WB_ALetter,
WB_Other,
WB_ALetter,
- WB_Other,
+ WB_Perl_Tailored_HSpace,
WB_ALetter,
WB_Other,
WB_ALetter,
WB_Other,
WB_ALetter,
WB_Other,
+ WB_Perl_Tailored_HSpace,
+ WB_Other,
WB_Extend,
WB_Format,
WB_Other,
WB_MidLetter,
WB_Newline,
WB_Format,
+ WB_Perl_Tailored_HSpace,
WB_Other,
WB_ExtendNumLet,
WB_Other,
WB_Other,
WB_ExtendNumLet,
WB_Other,
+ WB_Perl_Tailored_HSpace,
WB_Format,
WB_Other,
WB_Format,
WB_Other,
WB_ALetter,
WB_Other,
+ WB_Perl_Tailored_HSpace,
+ WB_Other,
WB_ALetter,
WB_Other,
WB_Extend,
#if defined(PERL_IN_PERL_C)
static const UV _Perl_WB_invlist[] = { /* for EBCDIC 037 */
- 1545, /* Number of elements */
+ 1554, /* Number of elements */
148565664, /* Version and data structure type */
0, /* 0 if the list starts at 0;
1 if it starts at the element beyond 0 */
0x0,
+ 0x5,
+ 0x6,
0xB,
0xD,
0xE,
0x16,
0x25,
0x26,
+ 0x40,
0x42,
0x4A,
0x4B,
0x1FF5,
0x1FF6,
0x1FFD,
+ 0x2000,
+ 0x200B,
0x200C,
0x200E,
0x2010,
0x2028,
0x202A,
0x202F,
+ 0x2030,
0x203F,
0x2041,
0x2044,
0x2045,
0x2054,
0x2055,
+ 0x205F,
0x2060,
0x2065,
0x2066,
0x2E00,
0x2E2F,
0x2E30,
+ 0x3000,
+ 0x3001,
0x3005,
0x3006,
0x302A,
#if defined(PERL_IN_REGEXEC_C)
-#define WB_ENUM_COUNT 19
+#define WB_ENUM_COUNT 20
typedef enum {
WB_Other = 0,
WB_MidNumLet = 13,
WB_Newline = 14,
WB_Numeric = 15,
- WB_Regional_Indicator = 16,
- WB_Single_Quote = 17,
- WB_UNKNOWN = 18
+ WB_Perl_Tailored_HSpace = 16,
+ WB_Regional_Indicator = 17,
+ WB_Single_Quote = 18,
+ WB_UNKNOWN = 19
} WB_enum;
static const WB_enum _Perl_WB_invmap[] = { /* for EBCDIC 037 */
WB_Other,
+ WB_Perl_Tailored_HSpace,
+ WB_Other,
WB_Newline,
WB_CR,
WB_Other,
WB_Other,
WB_LF,
WB_Other,
+ WB_Perl_Tailored_HSpace,
WB_ALetter,
WB_Other,
WB_MidNumLet,
WB_ALetter,
WB_Other,
WB_ALetter,
- WB_Other,
+ WB_Perl_Tailored_HSpace,
WB_ALetter,
WB_Other,
WB_ALetter,
WB_Other,
WB_ALetter,
WB_Other,
+ WB_Perl_Tailored_HSpace,
+ WB_Other,
WB_Extend,
WB_Format,
WB_Other,
WB_MidLetter,
WB_Newline,
WB_Format,
+ WB_Perl_Tailored_HSpace,
WB_Other,
WB_ExtendNumLet,
WB_Other,
WB_Other,
WB_ExtendNumLet,
WB_Other,
+ WB_Perl_Tailored_HSpace,
WB_Format,
WB_Other,
WB_Format,
WB_Other,
WB_ALetter,
WB_Other,
+ WB_Perl_Tailored_HSpace,
+ WB_Other,
WB_ALetter,
WB_Other,
WB_Extend,
#if defined(PERL_IN_PERL_C)
static const UV _Perl_WB_invlist[] = { /* for EBCDIC POSIX-BC */
- 1547, /* Number of elements */
+ 1556, /* Number of elements */
148565664, /* Version and data structure type */
0, /* 0 if the list starts at 0;
1 if it starts at the element beyond 0 */
0x0,
+ 0x5,
+ 0x6,
0xB,
0xD,
0xE,
0x16,
0x25,
0x26,
+ 0x40,
0x42,
0x4A,
0x4B,
0x1FF5,
0x1FF6,
0x1FFD,
+ 0x2000,
+ 0x200B,
0x200C,
0x200E,
0x2010,
0x2028,
0x202A,
0x202F,
+ 0x2030,
0x203F,
0x2041,
0x2044,
0x2045,
0x2054,
0x2055,
+ 0x205F,
0x2060,
0x2065,
0x2066,
0x2E00,
0x2E2F,
0x2E30,
+ 0x3000,
+ 0x3001,
0x3005,
0x3006,
0x302A,
#if defined(PERL_IN_REGEXEC_C)
-#define WB_ENUM_COUNT 19
+#define WB_ENUM_COUNT 20
typedef enum {
WB_Other = 0,
WB_MidNumLet = 13,
WB_Newline = 14,
WB_Numeric = 15,
- WB_Regional_Indicator = 16,
- WB_Single_Quote = 17,
- WB_UNKNOWN = 18
+ WB_Perl_Tailored_HSpace = 16,
+ WB_Regional_Indicator = 17,
+ WB_Single_Quote = 18,
+ WB_UNKNOWN = 19
} WB_enum;
static const WB_enum _Perl_WB_invmap[] = { /* for EBCDIC POSIX-BC */
WB_Other,
+ WB_Perl_Tailored_HSpace,
+ WB_Other,
WB_Newline,
WB_CR,
WB_Other,
WB_Other,
WB_Newline,
WB_Other,
+ WB_Perl_Tailored_HSpace,
WB_ALetter,
WB_Other,
WB_MidNumLet,
WB_ALetter,
WB_Other,
WB_ALetter,
- WB_Other,
+ WB_Perl_Tailored_HSpace,
WB_ALetter,
WB_Other,
WB_ALetter,
WB_Other,
WB_ALetter,
WB_Other,
+ WB_Perl_Tailored_HSpace,
+ WB_Other,
WB_Extend,
WB_Format,
WB_Other,
WB_MidLetter,
WB_Newline,
WB_Format,
+ WB_Perl_Tailored_HSpace,
WB_Other,
WB_ExtendNumLet,
WB_Other,
WB_Other,
WB_ExtendNumLet,
WB_Other,
+ WB_Perl_Tailored_HSpace,
WB_Format,
WB_Other,
WB_Format,
WB_Other,
WB_ALetter,
WB_Other,
+ WB_Perl_Tailored_HSpace,
+ WB_Other,
WB_ALetter,
WB_Other,
WB_Extend,
* 1a0687fb9c6c4567e853913549df0944fe40821279a3e9cdaa6ab8679bc286fd lib/unicore/extracted/DLineBreak.txt
* 40bcfed3ca727c19e1331f6c33806231d5f7eeeabd2e6a9e06a3740c85d0c250 lib/unicore/extracted/DNumType.txt
* a18d502bad39d527ac5586d7bc93e29f565859e3bcc24ada627eff606d6f5fed lib/unicore/extracted/DNumValues.txt
- * 21f614a12bfde0478588228d46f1b594bf7e23c7d1f51492c70b13f7c9b8de09 lib/unicore/mktables
+ * 2b18fcfeafc8e8a26ff1124ad4ca94020f287bc4651be7ea199d69ecd5dcf9c5 lib/unicore/mktables
* 462c9aaa608fb2014cd9649af1c5c009485c60b9c8b15b89401fdc10cf6161c6 lib/unicore/version
* 996abda3c0fbc2bfd575092af09e3b9b0331e624eb2e969a268457f8fd31ecbb regen/charset_translations.pl
- * 8a097f8f726bb1619af2f27f149ab87e60a1602f790147e3a561358be16abd27 regen/mk_invlists.pl
+ * 214ab3909a11fcc57cb6ee0611897342109b5a895b2b42d5227b80d948744a0a regen/mk_invlists.pl
* ex: set ro: */
}
}
+ # Perl tailors the WordBreak property so that \b{wb} doesn't split
+ # adjacent spaces into separate words. First create a copy of the regular
+ # WB property as '_Perl_WB'. (On Unicode releases earlier than the one in
+ # which WB was first defined, this will already have been done by the
+ # substitute file portion of the 'Input_file' code for WB.)
+ my $perl_wb = property_ref('_Perl_WB');
+ if (! defined $perl_wb) {
+ $perl_wb = Property->new('_Perl_WB',
+ Fate => $INTERNAL_ONLY,
+ Perl_Extension => 1,
+ Directory => $map_directory,
+ Type => $STRING);
+ my $wb = property_ref('Word_Break');
+ $perl_wb->initialize($wb);
+ $perl_wb->set_default_map($wb->default_map);
+ }
+
+ # And simply replace the mappings of horizontal space characters that
+ # otherwise would map to the default to instead map to our tailoring.
+ my $default = $perl_wb->default_map;
+ for my $range ($Blank->ranges) {
+ for my $i ($range->start .. $range->end) {
+ next unless $perl_wb->value_of($i) eq $default;
+ $perl_wb->add_map($i, $i, 'Perl_Tailored_HSpace',
+ Replace => $UNCONDITIONALLY);
+ }
+ }
+
# Here done with all the basic stuff. Ready to populate the information
# about each character if annotating them.
if ($annotate) {
Skip => $Documentation,
),
Input_file->new("$AUXILIARY/WordBreakProperty.txt", v4.1.0,
- Early => [ "WBsubst.txt", '_Perl_WB', 'ALetter' ],
+ Early => [ "WBsubst.txt", '_Perl_WB', 'ALetter',
+
+ # Don't use _Perl_WB as a synonym for
+ # Word_Break in later perls, as it is tailored
+ # and isn't the same as Word_Break
+ 'ONLY_EARLY' ],
Property => 'Word_Break',
Has_Missings_Defaults => $NOT_IGNORED,
),
$template =~ s/$breakable_utf8/$breakable/g;
}
+ # Perl customizes wb. So change the official tests accordingly
+ if ($break_type eq 'wb') {
+
+ # Split into elements that alternate between code point and
+ # break/no-break
+ my @line = split / +/, $template;
+
+ # Look at each code point and its following one
+ for (my $i = 1; $i < @line - 1 - 1; $i+=2) {
+
+ # The customization only involves changing some breaks to
+ # non-breaks.
+ next if $line[$i+1] =~ /$nobreak/;
+
+ my $lhs = chr hex $line[$i];
+ my $rhs = chr hex $line[$i+2];
+
+ # And it only affects adjacent space characters.
+ next if $lhs !~ /\s/u;
+
+ # But, we want to make sure to test spaces followed by an Extend
+ # or Format.
+ next if $rhs !~ /[\s\p{WB=Extend}\p{WB=Format}]/;
+
+ # To test the customization, add some white-space before this to
+ # create a span. The $lhs white space may or may not be bound to
+ # that span, and also with the $rhs. If the $rhs is a binding
+ # character, the $lhs is bound to it and not to the span, unless
+ # $lhs is vertical space. In all other cases, the $lhs is bound
+ # to the span. If the $rhs is white space, it is bound to the
+ # $lhs
+ my $bound;
+ my $span;
+ if ($rhs =~ /[\p{WB=Extend}\p{WB=Format}]/) {
+ if ($lhs =~ /\v/) {
+ $bound = $breakable;
+ $span = $nobreak;
+ }
+ else {
+ $bound = $nobreak;
+ $span = $breakable;
+ }
+ }
+ else {
+ $span = $nobreak;
+ $bound = $nobreak;
+ }
+
+ splice @line, $i, 0, ( '0020', $nobreak, '0020', $span);
+ $i += 4;
+ $line[$i+1] = $bound;
+ }
+ $template = join " ", @line;
+ }
+
# The input is just the break/no-break symbols and sequences of Unicode
# code points as hex digits separated by spaces for legibility. e.g.:
# ÷ 0020 × 0308 ÷ 0020 ÷
Expect(0, 0x2028, '\p{Print}', ""); # Bug # 71722
Expect(0, 0x2029, '\p{Print}', ""); # Bug # 71722
Expect(1, 0xFF10, '\p{XDigit}', ""); # Bug # 71726
+
+# Make sure this gets tested; it was not part of the official test suite at
+# the time this was added. Note that this is as it would appear in the
+# official suite, and gets modified to check for the perl tailoring by
+# Test_WB()
+Test_WB("$breakable 0020 $breakable 0020 $breakable 0308 $breakable");
[ List each incompatible change as a =head2 entry ]
+=head2 C<qr/\b{wb}/> is now tailored to Perl expectations
+
+This is now more suited to be a drop-in replacement for plain C<\b>, but
+giving better results for parsing natural language. Previously it
+strictly followed the current Unicode rules, which call for it to match
+between each white space character. Now it doesn't generally match
+within spans of white space, behaving like C<\b> does. See
+L<perlrebackslash/\b{wb}>.
+
=head1 Deprecations
XXX Any deprecated features, syntax, modules etc. should be listed here.
=item C<\b{wb}>
-This matches a Unicode "Word Boundary". This gives better (though not
+This matches a Unicode "Word Boundary", but tailored to Perl
+expectations. This gives better (though not
perfect) results for natural language processing than plain C<\b>
(without braces) does. For example, it understands that apostrophes can
be in the middle of words and that parentheses aren't (see the examples
below). More details are at L<http://www.unicode.org/reports/tr29/>.
+The current Unicode definition of a Word Boundary matches between every
+white space character. Perl tailors this, starting in version 5.24, to
+generally not break up spans of white space, just as plain C<\b> has
+always functioned. This allows C<\b{wb}> to be a drop-in replacement for
+C<\b>, but with generally better results for natural language
+processing. (The exception to this tailoring is when a span of white
+space is immediately followed by something like U+0303, COMBINING TILDE.
+If the final space character in the span is a horizontal white space, it
+is broken out so that it attaches instead to the combining character.
+To be precise, if a span of white space that ends in a horizontal space
+is immediately followed by a character having either of the Word
+Boundary property values "Extend" or "Format", the boundary between the
+final horizontal space character and the rest of the span matches
+C<\b{wb}>. In all other cases the boundary between two white space
+characters matches C<\B{wb}>.)
+
=back
It is important to realize when you use these Unicode boundaries,
* 1a0687fb9c6c4567e853913549df0944fe40821279a3e9cdaa6ab8679bc286fd lib/unicore/extracted/DLineBreak.txt
* 40bcfed3ca727c19e1331f6c33806231d5f7eeeabd2e6a9e06a3740c85d0c250 lib/unicore/extracted/DNumType.txt
* a18d502bad39d527ac5586d7bc93e29f565859e3bcc24ada627eff606d6f5fed lib/unicore/extracted/DNumValues.txt
- * 21f614a12bfde0478588228d46f1b594bf7e23c7d1f51492c70b13f7c9b8de09 lib/unicore/mktables
+ * 2b18fcfeafc8e8a26ff1124ad4ca94020f287bc4651be7ea199d69ecd5dcf9c5 lib/unicore/mktables
* 462c9aaa608fb2014cd9649af1c5c009485c60b9c8b15b89401fdc10cf6161c6 lib/unicore/version
* 996abda3c0fbc2bfd575092af09e3b9b0331e624eb2e969a268457f8fd31ecbb regen/charset_translations.pl
* d9c04ac46bdd81bb3e26519f2b8eb6242cb12337205add3f7cf092b0c58dccc4 regen/regcharclass.pl
'Newline',
'Numeric',
'Other',
+ 'Perl_Tailored_HSpace',
'Regional_Indicator',
'Single_Quote',
'UNKNOWN',
const bool utf8_target)
{
/* Return a boolean as to if the boundary between 'before' and 'after' is
- * a Unicode word break, using their published algorithm. Context may be
+ * a Unicode word break, using their published algorithm, but tailored for
+ * Perl by treating spans of white space as one unit. Context may be
* needed to make this determination. If the value for the character
* before 'before' is known, it is passed as 'previous'; otherwise that
* should be set to WB_UNKNOWN. The other input parameters give the
return TRUE;
}
- /* WB 3: Do not break within CRLF. */
- if (before == WB_CR && after == WB_LF) {
- return FALSE;
+ /* WB 3 is: "Do not break within CRLF." Perl extends this so that all
+ * white space sequences ending in a vertical space are treated as one
+ * unit. */
+
+ if (after == WB_CR || after == WB_LF || after == WB_Newline) {
+ if (before == WB_CR || before == WB_LF || before == WB_Newline
+ || before == WB_Perl_Tailored_HSpace)
+ {
+ return FALSE;
+ }
+
+ /* WB 3a: Otherwise break before Newlines (including CR and LF) */
+ return TRUE;
}
- /* WB 3a and WB 3b: Otherwise break before and after Newlines (including CR
- * and LF) */
+ /* Here, we know that 'after' is not a vertical space character, but
+ * 'before' could be. WB 3b is: "Otherwise break after Newlines (including
+ * CR and LF)." Perl changes that to not break-up spans of white space,
+ * except when horizontal space is followed by an Extend or Format
+ * character. These apply just to the final white space character in the
+ * span, so it is broken away from the rest. (If the Extend or Format
+ * character follows a vertical space character, it is treated as beginning
+ * a line, and doesn't modify the preceding character.) */
if ( before == WB_CR || before == WB_LF || before == WB_Newline
- || after == WB_CR || after == WB_LF || after == WB_Newline)
+ || before == WB_Perl_Tailored_HSpace)
{
- return TRUE;
+ if (after == WB_Perl_Tailored_HSpace) {
+ U8 * temp_pos = (U8 *) curpos;
+ const WB_enum next
+ = advance_one_WB(&temp_pos, strend, utf8_target,
+ FALSE /* Don't skip Extend nor Format */ );
+ return next == WB_Extend || next == WB_Format;
+ }
+ else if (before != WB_Perl_Tailored_HSpace) {
+
+ /* Here, 'before' must be one of the vertical space characters, and
+ * after is not any type of white-space. Follow WB 3b. */
+ return TRUE;
+ }
+
+ /* Here, 'before' is horizontal space, and 'after' is not any kind of
+ * space. Normal rules apply */
}
/* Ignore Format and Extend characters, except when they appear at the