perlunicode.pod: Elaborate unicode bug for POSIX

[perl5.git] / pod / perlunicode.pod
diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod

index 518d239..76dc40d 100644 (file)
--- a/pod/perlunicode.pod
+++ b/pod/perlunicode.pod
@@ -6,106 +6,136 @@ perlunicode - Unicode support in Perl
  
  =head2 Important Caveats
  
-Unicode support is an extensive requirement. While perl does not
+Unicode support is an extensive requirement. While Perl does not
  implement the Unicode standard or the accompanying technical reports
  from cover to cover, Perl does support many Unicode features.
  
+People who want to learn to use Unicode in Perl should probably read
+the L<Perl Unicode tutorial, perlunitut|perlunitut> before reading
+this reference document.
+
+Also, the use of Unicode may present security issues that aren't obvious.
+Read L<Unicode Security Considerations|http://www.unicode.org/reports/tr36>.
+
  =over 4
  
-=item Input and Output Disciplines
+=item Input and Output Layers
  
-A filehandle can be marked as containing perl's internal Unicode
-encoding (UTF-8 or UTF-EBCDIC) by opening it with the ":utf8" layer.
-Other encodings can be converted to perl's encoding on input, or from
-perl's encoding on output by use of the ":encoding(...)" layer.
-See L<open>.
+Perl knows when a filehandle uses Perl's internal Unicode encodings
+(UTF-8, or UTF-EBCDIC if in EBCDIC) if the filehandle is opened with
+the ":utf8" layer.  Other encodings can be converted to Perl's
+encoding on input or from Perl's encoding on output by use of the
+":encoding(...)"  layer.  See L<open>.
  
-To mark the Perl source itself as being in a particular encoding,
-see L<encoding>.
+To indicate that Perl source itself is in UTF-8, use C<use utf8;>.
  
  =item Regular Expressions
  
  The regular expression compiler produces polymorphic opcodes.  That is,
-the pattern adapts to the data and automatically switch to the Unicode
-character scheme when presented with Unicode data, or a traditional
-byte scheme when presented with byte data.
+the pattern adapts to the data and automatically switches to the Unicode
+character scheme when presented with data that is internally encoded in
+UTF-8, or instead uses a traditional byte scheme when presented with
+byte data.
  
  =item C<use utf8> still needed to enable UTF-8/UTF-EBCDIC in scripts
  
-As a compatibility measure, this pragma must be explicitly used to
-enable recognition of UTF-8 in the Perl scripts themselves on ASCII
-based machines, or to recognize UTF-EBCDIC on EBCDIC based machines.
-B<NOTE: this should be the only place where an explicit C<use utf8>
-is needed>.
+As a compatibility measure, the C<use utf8> pragma must be explicitly
+included to enable recognition of UTF-8 in the Perl scripts themselves
+(in string or regular expression literals, or in identifier names) on
+ASCII-based machines or to recognize UTF-EBCDIC on EBCDIC-based
+machines.  B<These are the only times when an explicit C<use utf8>
+is needed.>  See L<utf8>.
+
+=item BOM-marked scripts and UTF-16 scripts autodetected
+
+If a Perl script begins marked with the Unicode BOM (UTF-16LE, UTF-16BE,
+or UTF-8), or if the script looks like non-BOM-marked UTF-16 of either
+endianness, Perl will correctly read in the script as Unicode.
+(BOMless UTF-8 cannot be effectively recognized or differentiated from
+ISO 8859-1 or other eight-bit encodings.)
+
+=item C<use encoding> needed to upgrade non-Latin-1 byte strings
  
-You can also use the C<encoding> pragma to change the default encoding
-of the data in your script; see L<encoding>.
+By default, there is a fundamental asymmetry in Perl's Unicode model:
+implicit upgrading from byte strings to Unicode strings assumes that
+they were encoded in I<ISO 8859-1 (Latin-1)>, but Unicode strings are
+downgraded with UTF-8 encoding.  This happens because the first 256
+codepoints in Unicode happen to agree with Latin-1.
+
+See L</"Byte and Character Semantics"> for more details.
  
  =back
  
-=head2 Byte and Character semantics
+=head2 Byte and Character Semantics
  
-Beginning with version 5.6, Perl uses logically wide characters to
+Beginning with version 5.6, Perl uses logically-wide characters to
  represent strings internally.
  
-In future, Perl-level operations can be expected to work with
-characters rather than bytes, in general.
+In future, Perl-level operations will be expected to work with
+characters rather than bytes.
  
-However, as strictly an interim compatibility measure, Perl aims to
+However, as an interim compatibility measure, Perl aims to
  provide a safe migration path from byte semantics to character
  semantics for programs.  For operations where Perl can unambiguously
-decide that the input data is characters, Perl now switches to
+decide that the input data are characters, Perl switches to
  character semantics.  For operations where this determination cannot
  be made without additional information from the user, Perl decides in
-favor of compatibility, and chooses to use byte semantics.
+favor of compatibility and chooses to use byte semantics.
+
+Under byte semantics, when C<use locale> is in effect, Perl uses the
+semantics associated with the current locale.  Absent a C<use locale>, and
+absent a C<use feature 'unicode_strings'> pragma, Perl currently uses US-ASCII
+(or Basic Latin in Unicode terminology) byte semantics, meaning that characters
+whose ordinal numbers are in the range 128 - 255 are undefined except for their
+ordinal numbers.  This means that none have case (upper and lower), nor are any
+a member of character classes, like C<[:alpha:]> or C<\w>.  (But all do belong
+to the C<\W> class or the Perl regular expression extension C<[:^alpha:]>.)
  
  This behavior preserves compatibility with earlier versions of Perl,
-which allowed byte semantics in Perl operations, but only as long as
-none of the program's inputs are marked as being as source of Unicode
+which allowed byte semantics in Perl operations only if
+none of the program's inputs were marked as being a source of Unicode
  character data.  Such data may come from filehandles, from calls to
  external programs, from information provided by the system (such as %ENV),
  or from literals and constants in the source text.
  
-On Windows platforms, if the C<-C> command line switch is used, (or the
-${^WIDE_SYSTEM_CALLS} global flag is set to C<1>), all system calls
-will use the corresponding wide character APIs.  Note that this is
-currently only implemented on Windows since other platforms lack an
-API standard on this area.
+The C<bytes> pragma will always, regardless of platform, force byte
+semantics in a particular lexical scope.  See L<bytes>.
  
-Regardless of the above, the C<bytes> pragma can always be used to
-force byte semantics in a particular lexical scope.  See L<bytes>.
+The C<use feature 'unicode_strings'> pragma is intended to always, regardless
+of platform, force character (Unicode) semantics in a particular lexical scope.
+In release 5.12, it is partially implemented, applying only to case changes.
+See L</The "Unicode Bug"> below.
  
  The C<utf8> pragma is primarily a compatibility device that enables
  recognition of UTF-(8|EBCDIC) in literals encountered by the parser.
-Note that this pragma is only required until a future version of Perl
-in which character semantics will become the default.  This pragma may
-then become a no-op.  See L<utf8>.
-
-Unless mentioned otherwise, Perl operators will use character semantics
-when they are dealing with Unicode data, and byte semantics otherwise.
-Thus, character semantics for these operations apply transparently; if
-the input data came from a Unicode source (for example, by adding a
-character encoding discipline to the filehandle whence it came, or a
-literal Unicode string constant in the program), character semantics
-apply; otherwise, byte semantics are in effect.  To force byte semantics
-on Unicode data, the C<bytes> pragma should be used.
-
-Notice that if you concatenate strings with byte semantics and strings
-with Unicode character data, the bytes will by default be upgraded
-I<as if they were ISO 8859-1 (Latin-1)> (or if in EBCDIC, after a
-translation to ISO 8859-1). This is done without regard to the
-system's native 8-bit encoding, so to change this for systems with
-non-Latin-1 (or non-EBCDIC) native encodings, use the C<encoding>
-pragma, see L<encoding>.
+Note that this pragma is only required while Perl defaults to byte
+semantics; when character semantics become the default, this pragma
+may become a no-op.  See L<utf8>.
+
+Unless explicitly stated, Perl operators use character semantics
+for Unicode data and byte semantics for non-Unicode data.
+The decision to use character semantics is made transparently.  If
+input data comes from a Unicode source--for example, if a character
+encoding layer is added to a filehandle or a literal Unicode
+string constant appears in a program--character semantics apply.
+Otherwise, byte semantics are in effect.  The C<bytes> pragma should
+be used to force byte semantics on Unicode data, and the C<use feature
+'unicode_strings'> pragma to force Unicode semantics on byte data (though in
+5.12 it isn't fully implemented).
+
+If strings operating under byte semantics and strings with Unicode
+character data are concatenated, the new string will have
+character semantics.  This can cause surprises: See L</BUGS>, below.
+You can choose to be warned when this happens.  See L<encoding::warnings>.
  
  Under character semantics, many operations that formerly operated on
-bytes change to operating on characters. A character in Perl is
+bytes now operate on characters. A character in Perl is
  logically just a number ranging from 0 to 2**31 or so. Larger
-characters may encode to longer sequences of bytes internally, but
-this is just an internal detail which is hidden at the Perl level.
-See L<perluniintro> for more on this.
+characters may encode into longer sequences of bytes internally, but
+this internal detail is mostly hidden from Perl code.
+See L<perluniintro> for more.
  
-=head2 Effects of character semantics
+=head2 Effects of Character Semantics
  
  Character semantics have the following effects:
  
@@ -113,487 +143,850 @@ Character semantics have the following effects:
  
  =item *
  
-Strings and patterns may contain characters that have an ordinal value
-larger than 255.
+Strings--including hash keys--and regular expression patterns may
+contain characters that have an ordinal value larger than 255.
  
-If you use a Unicode editor to edit your program, Unicode characters
-may occur directly within the literal strings in one of the various
-Unicode encodings (UTF-8, UTF-EBCDIC, UCS-2, etc.), but are recognized
-as such (and converted to Perl's internal representation) only if the
-appropriate L<encoding> is specified.
+If you use a Unicode editor to edit your program, Unicode characters may
+occur directly within the literal strings in UTF-8 encoding, or UTF-16.
+(The former requires a BOM or C<use utf8>; the latter requires a BOM.)
  
-You can also get Unicode characters into a string by using the C<\x{...}>
-notation, putting the Unicode code for the desired character, in
-hexadecimal, into the curlies. For instance, a smiley face is C<\x{263A}>.
-This works only for characters with a code 0x100 and above.
+Unicode characters can also be added to a string by using the C<\N{U+...}>
+notation.  The Unicode code for the desired character, in hexadecimal,
+should be placed in the braces, after the C<U>. For instance, a smiley face is
+C<\N{U+263A}>.
+
+Alternatively, you can use the C<\x{...}> notation for characters 0x100 and
+above.  For characters below 0x100 you may get byte semantics instead of
+character semantics;  see L</The "Unicode Bug">.  On EBCDIC machines there is
+the additional problem that the value for such characters gives the EBCDIC
+character rather than the Unicode one.
  
  Additionally, if you
+
     use charnames ':full';
-you can use the C<\N{...}> notation, putting the official Unicode character
-name within the curlies. For example, C<\N{WHITE SMILING FACE}>.
-This works for all characters that have names.
+
+you can use the C<\N{...}> notation and put the official Unicode
+character name within the braces, such as C<\N{WHITE SMILING FACE}>.
+See L<charnames>.
  
  =item *
  
-If an appropriate L<encoding> is specified,
-identifiers within the Perl script may contain Unicode alphanumeric
-characters, including ideographs.  (You are currently on your own when
-it comes to using the canonical forms of characters--Perl doesn't
-(yet) attempt to canonicalize variable names for you.)
+If an appropriate L<encoding> is specified, identifiers within the
+Perl script may contain Unicode alphanumeric characters, including
+ideographs.  Perl does not currently attempt to canonicalize variable
+names.
  
  =item *
  
-Regular expressions match characters instead of bytes.  For instance,
-"." matches a character instead of a byte.  (However, the C<\C> pattern
-is provided to force a match a single byte ("C<char>" in C, hence C<\C>).)
+Regular expressions match characters instead of bytes.  "." matches
+a character instead of a byte.
  
  =item *
  
-Character classes in regular expressions match characters instead of
-bytes, and match against the character properties specified in the
-Unicode properties database.  So C<\w> can be used to match an
+Bracketed character classes in regular expressions match characters instead of
+bytes and match against the character properties specified in the
+Unicode properties database.  C<\w> can be used to match a Japanese
  ideograph, for instance.
  
  =item *
  
-Named Unicode properties, scripts, and block ranges may be used like
-character classes via the new C<\p{}> (matches property) and C<\P{}>
-(doesn't match property) constructs. For instance, C<\p{Lu}> matches any
-character with the Unicode "Lu" (Letter, uppercase) property, while
-C<\p{M}> matches any character with a "M" (mark -- accents and such)
-property. Single letter properties may omit the brackets, so that can be
-written C<\pM> also. Many predefined properties are available, such
-as C<\p{Mirrored}> and C<\p{Tibetan}>.
+Named Unicode properties, scripts, and block ranges may be used (like bracketed
+character classes) by using the C<\p{}> "matches property" construct and
+the C<\P{}> negation, "doesn't match property".
+See L</"Unicode Character Properties"> for more details.
+
+You can define your own character properties and use them
+in the regular expression with the C<\p{}> or C<\P{}> construct.
+See L</"User-Defined Character Properties"> for more details.
+
+=item *
+
+The special pattern C<\X> matches a logical character, an "extended grapheme
+cluster" in Standardese.  In Unicode what appears to the user to be a single
+character, for example an accented C<G>, may in fact be composed of a sequence
+of characters, in this case a C<G> followed by an accent character.  C<\X>
+will match the entire sequence.
+
+=item *
  
-The official Unicode script and block names have spaces and dashes as
-separators, but for convenience you can have dashes, spaces, and underbars
-at every word division, and you need not care about correct casing. It is
-recommended, however, that for consistency you use the following naming:
-the official Unicode script, block, or property name (see below for the
-additional rules that apply to block names), with whitespace and dashes
-removed, and the words "uppercase-first-lowercase-rest". That is, "Latin-1
-Supplement" becomes "Latin1Supplement".
+The C<tr///> operator translates characters instead of bytes.  Note
+that the C<tr///CU> functionality has been removed.  For similar
+functionality see C<pack('U0', ...)> and C<pack('C0', ...)>.
  
-You can also negate both C<\p{}> and C<\P{}> by introducing a caret
-(^) between the first curly and the property name: C<\p{^Tamil}> is
+=item *
+
+Case translation operators use the Unicode case translation tables
+when character input is provided.  Note that C<uc()>, or C<\U> in
+interpolated strings, translates to uppercase, while C<ucfirst>,
+or C<\u> in interpolated strings, translates to titlecase in languages
+that make the distinction (which is equivalent to uppercase in languages
+without the distinction).
+
+=item *
+
+Most operators that deal with positions or lengths in a string will
+automatically switch to using character positions, including
+C<chop()>, C<chomp()>, C<substr()>, C<pos()>, C<index()>, C<rindex()>,
+C<sprintf()>, C<write()>, and C<length()>.  An operator that
+specifically does not switch is C<vec()>.  Operators that really don't
+care include operators that treat strings as a bucket of bits such as
+C<sort()>, and operators dealing with filenames.
+
+=item *
+
+The C<pack()>/C<unpack()> letter C<C> does I<not> change, since it is often
+used for byte-oriented formats.  Again, think C<char> in the C language.
+
+There is a new C<U> specifier that converts between Unicode characters
+and code points. There is also a C<W> specifier that is the equivalent of
+C<chr>/C<ord> and properly handles character values even if they are above 255.
+
+=item *
+
+The C<chr()> and C<ord()> functions work on characters, similar to
+C<pack("W")> and C<unpack("W")>, I<not> C<pack("C")> and
+C<unpack("C")>.  C<pack("C")> and C<unpack("C")> are methods for
+emulating byte-oriented C<chr()> and C<ord()> on Unicode strings.
+While these methods reveal the internal encoding of Unicode strings,
+that is not something one normally needs to care about at all.
+
+=item *
+
+The bit string operators, C<& | ^ ~>, can operate on character data.
+However, for backward compatibility, such as when using bit string
+operations when characters are all less than 256 in ordinal value, one
+should not use C<~> (the bit complement) with characters of both
+values less than 256 and values greater than 256.  Most importantly,
+DeMorgan's laws (C<~($x|$y) eq ~$x&~$y> and C<~($x&$y) eq ~$x|~$y>)
+will not hold.  The reason for this mathematical I<faux pas> is that
+the complement cannot return B<both> the 8-bit (byte-wide) bit
+complement B<and> the full character-wide bit complement.
+
+=item *
+
+You can define your own mappings to be used in C<lc()>,
+C<lcfirst()>, C<uc()>, and C<ucfirst()> (or their double-quoted string inlined
+versions such as C<\U>).
+See L</"User-Defined Case Mappings"> for more details.
+
+=back
+
+=over 4
+
+=item *
+
+And finally, C<scalar reverse()> reverses by character rather than by byte.
+
+=back
+
+=head2 Unicode Character Properties
+
+Most Unicode character properties are accessible by using regular expressions.
+They are used (like bracketed character classes) by using the C<\p{}> "matches
+property" construct and the C<\P{}> negation, "doesn't match property".
+
+Note that the only time that Perl considers a sequence of individual code
+points as a single logical character is in the C<\X> construct, already
+mentioned above.   Therefore "character" in this discussion means a single
+Unicode code point.
+
+For instance, C<\p{Uppercase}> matches any single character with the Unicode
+"Uppercase" property, while C<\p{L}> matches any character with a
+General_Category of "L" (letter) property.  Braces are not
+required for single letter property names, so C<\p{L}> is equivalent to C<\pL>.
+
+More formally, C<\p{Uppercase}> matches any single character whose Unicode
+Uppercase property value is True, and C<\P{Uppercase}> matches any character
+whose Uppercase property value is False, and they could have been written as
+C<\p{Uppercase=True}> and C<\p{Uppercase=False}>, respectively.
+
+This formality is needed when properties are not binary, that is if they can
+take on more values than just True and False.  For example, the Bidi_Class (see
+L</"Bidirectional Character Types"> below), can take on a number of different
+values, such as Left, Right, Whitespace, and others.  To match these, one needs
+to specify the property name (Bidi_Class), and the value being matched against
+(Left, Right, etc.).  This is done, as in the examples above, by having the
+two components separated by an equal sign (or interchangeably, a colon), like
+C<\p{Bidi_Class: Left}>.
+
+All Unicode-defined character properties may be written in these compound forms
+of C<\p{property=value}> or C<\p{property:value}>, but Perl provides some
+additional properties that are written only in the single form, as well as
+single-form short-cuts for all binary properties and certain others described
+below, in which you may omit the property name and the equals or colon
+separator.
+
+Most Unicode character properties have at least two synonyms (or aliases if you
+prefer), a short one that is easier to type, and a longer one that is more
+descriptive and hence easier to understand.  Thus the "L"
+and "Letter" above are equivalent and can be used interchangeably.  Likewise,
+"Upper" is a synonym for "Uppercase", and we could have written
+C<\p{Uppercase}> equivalently as C<\p{Upper}>.  Also, there are typically
+various synonyms for the values the property can be.   For binary properties,
+"True" has 3 synonyms: "T", "Yes", and "Y"; and "False" has correspondingly "F",
+"No", and "N".  But be careful.  A short form of a value for one property may
+not mean the same thing as the same short form for another.  Thus, for the
+General_Category property, "L" means "Letter", but for the Bidi_Class property,
+"L" means "Left".  A complete list of properties and synonyms is in
+L<perluniprops>.
+
+Upper/lower case differences in the property names and values are irrelevant,
+thus C<\p{Upper}> means the same thing as C<\p{upper}> or even C<\p{UpPeR}>.
+Similarly, you can add or subtract underscores anywhere in the middle of a
+word, so that these are also equivalent to C<\p{U_p_p_e_r}>.  And white space
+is irrelevant adjacent to non-word characters, such as the braces and the equals
+or colon separators so C<\p{   Upper  }> and C<\p{ Upper_case : Y }> are
+equivalent to these as well.  In fact, in most cases, white space and even
+hyphens can be added or deleted anywhere.  So even C<\p{ Up-per case = Yes}> is
+equivalent.  All this is called "loose-matching" by Unicode.  The few places
+where stricter matching is employed are in the middle of numbers, and the Perl
+extension properties that begin or end with an underscore.  Stricter matching
+cares about white space (except adjacent to the non-word characters) and
+hyphens, and non-interior underscores.
+
+You can also use negation in both C<\p{}> and C<\P{}> by introducing a caret
+(^) between the first brace and the property name: C<\p{^Tamil}> is
  equal to C<\P{Tamil}>.
  
-Here are the basic Unicode General Category properties, followed by their
-long form (you can use either, e.g. C<\p{Lu}> and C<\p{LowercaseLetter}>
-are identical).
+=head3 B<General_Category>
+
+Every Unicode character is assigned a general category, which is the "most
+usual categorization of a character" (from
+L<http://www.unicode.org/reports/tr44>).
+
+The compound way of writing these is like C<\p{General_Category=Number}>
+(short, C<\p{gc:n}>).  But Perl furnishes shortcuts in which everything up
+through the equal or colon separator is omitted.  So you can instead just write
+C<\pN>.
+
+Here are the short and long forms of the General Category properties:
  
      Short       Long
  
      L           Letter
-    Lu          UppercaseLetter
-    Ll          LowercaseLetter
-    Lt          TitlecaseLetter
-    Lm          ModifierLetter
-    Lo          OtherLetter
+    LC, L&      Cased_Letter (that is: [\p{Ll}\p{Lu}\p{Lt}])
+    Lu          Uppercase_Letter
+    Ll          Lowercase_Letter
+    Lt          Titlecase_Letter
+    Lm          Modifier_Letter
+    Lo          Other_Letter
  
      M           Mark
-    Mn          NonspacingMark
-    Mc          SpacingMark
-    Me          EnclosingMark
+    Mn          Nonspacing_Mark
+    Mc          Spacing_Mark
+    Me          Enclosing_Mark
  
      N           Number
-    Nd          DecimalNumber
-    Nl          LetterNumber
-    No          OtherNumber
-
-    P           Punctuation
-    Pc          ConnectorPunctuation
-    Pd          DashPunctuation
-    Ps          OpenPunctuation
-    Pe          ClosePunctuation
-    Pi          InitialPunctuation
+    Nd          Decimal_Number (also Digit)
+    Nl          Letter_Number
+    No          Other_Number
+
+    P           Punctuation (also Punct)
+    Pc          Connector_Punctuation
+    Pd          Dash_Punctuation
+    Ps          Open_Punctuation
+    Pe          Close_Punctuation
+    Pi          Initial_Punctuation
                  (may behave like Ps or Pe depending on usage)
-    Pf          FinalPunctuation
+    Pf          Final_Punctuation
                  (may behave like Ps or Pe depending on usage)
-    Po          OtherPunctuation
+    Po          Other_Punctuation
  
      S           Symbol
-    Sm          MathSymbol
-    Sc          CurrencySymbol
-    Sk          ModifierSymbol
-    So          OtherSymbol
+    Sm          Math_Symbol
+    Sc          Currency_Symbol
+    Sk          Modifier_Symbol
+    So          Other_Symbol
  
      Z           Separator
-    Zs          SpaceSeparator
-    Zl          LineSeparator
-    Zp          ParagraphSeparator
+    Zs          Space_Separator
+    Zl          Line_Separator
+    Zp          Paragraph_Separator
  
      C           Other
-    Cc          Control
+    Cc          Control (also Cntrl)
      Cf          Format
      Cs          Surrogate   (not usable)
-    Co          PrivateUse
+    Co          Private_Use
      Cn          Unassigned
  
-The single-letter properties match all characters in any of the
+Single-letter properties match all characters in any of the
  two-letter sub-properties starting with the same letter.
-There's also C<L&> which is an alias for C<Ll>, C<Lu>, and C<Lt>.
+C<LC> and C<L&> are special cases, which are both aliases for the set consisting of everything matched by C<Ll>, C<Lu>, and C<Lt>.
  
  Because Perl hides the need for the user to understand the internal
-representation of Unicode characters, it has no need to support the
-somewhat messy concept of surrogates. Therefore, the C<Cs> property is not
+representation of Unicode characters, there is no need to implement
+the somewhat messy concept of surrogates. C<Cs> is therefore not
  supported.
  
-Because scripts differ in their directionality (for example Hebrew is
-written right to left), Unicode supplies these properties:
+=head3 B<Bidirectional Character Types>
+
+Because scripts differ in their directionality (Hebrew is
+written right to left, for example) Unicode supplies these properties in
+the Bidi_Class class:
  
      Property    Meaning
  
-    BidiL       Left-to-Right
-    BidiLRE     Left-to-Right Embedding
-    BidiLRO     Left-to-Right Override
-    BidiR       Right-to-Left
-    BidiAL      Right-to-Left Arabic
-    BidiRLE     Right-to-Left Embedding
-    BidiRLO     Right-to-Left Override
-    BidiPDF     Pop Directional Format
-    BidiEN      European Number
-    BidiES      European Number Separator
-    BidiET      European Number Terminator
-    BidiAN      Arabic Number
-    BidiCS      Common Number Separator
-    BidiNSM     Non-Spacing Mark
-    BidiBN      Boundary Neutral
-    BidiB       Paragraph Separator
-    BidiS       Segment Separator
-    BidiWS      Whitespace
-    BidiON      Other Neutrals
-
-For example, C<\p{BidiR}> matches all characters that are normally
+    L           Left-to-Right
+    LRE         Left-to-Right Embedding
+    LRO         Left-to-Right Override
+    R           Right-to-Left
+    AL          Arabic Letter
+    RLE         Right-to-Left Embedding
+    RLO         Right-to-Left Override
+    PDF         Pop Directional Format
+    EN          European Number
+    ES          European Separator
+    ET          European Terminator
+    AN          Arabic Number
+    CS          Common Separator
+    NSM         Non-Spacing Mark
+    BN          Boundary Neutral
+    B           Paragraph Separator
+    S           Segment Separator
+    WS          Whitespace
+    ON          Other Neutrals
+
+This property is always written in the compound form.
+For example, C<\p{Bidi_Class:R}> matches characters that are normally
  written right to left.
  
+=head3 B<Scripts>
+
+The world's languages are written in a number of scripts.  This sentence
+(unless you're reading it in translation) is written in Latin, while Russian is
+written in Cyrillic, and Greek is written in, well, Greek; Japanese mainly in
+Hiragana or Katakana.  There are many more.
+
+The Unicode Script property gives what script a given character is in,
+and the property can be specified with the compound form like
+C<\p{Script=Hebrew}> (short: C<\p{sc=hebr}>).  Perl furnishes shortcuts for all
+script names.  You can omit everything up through the equals (or colon), and
+simply write C<\p{Latin}> or C<\P{Cyrillic}>.
+
+A complete list of scripts and their shortcuts is in L<perluniprops>.
+
+=head3 B<Use of "Is" Prefix>
+
+For backward compatibility (with Perl 5.6), all properties mentioned
+so far may have C<Is> or C<Is_> prepended to their name, so C<\P{Is_Lu}>, for
+example, is equal to C<\P{Lu}>, and C<\p{IsScript:Arabic}> is equal to
+C<\p{Arabic}>.
+
+=head3 B<Blocks>
+
+In addition to B<scripts>, Unicode also defines B<blocks> of
+characters.  The difference between scripts and blocks is that the
+concept of scripts is closer to natural languages, while the concept
+of blocks is more of an artificial grouping based on groups of Unicode
+characters with consecutive ordinal values. For example, the "Basic Latin"
+block is all characters whose ordinals are between 0 and 127, inclusive, in
+other words, the ASCII characters.  The "Latin" script contains some letters
+from this block as well as several more, like "Latin-1 Supplement",
+"Latin Extended-A", etc., but it does not contain all the characters from
+those blocks. It does not, for example, contain digits, because digits are
+shared across many scripts. Digits and similar groups, like punctuation, are in
+the script called C<Common>.  There is also a script called C<Inherited> for
+characters that modify other characters, and inherit the script value of the
+controlling character.
+
+For more about scripts versus blocks, see UAX#24 "Unicode Script Property":
+L<http://www.unicode.org/reports/tr24>
+
+The Script property is likely to be the one you want to use when processing
+natural language; the Block property may be useful in working with the nuts and
+bolts of Unicode.
+
+Block names are matched in the compound form, like C<\p{Block: Arrows}> or
+C<\p{Blk=Hebrew}>.  Unlike most other properties only a few block names have a
+Unicode-defined short name.  But Perl does provide a (slight) shortcut:  You
+can say, for example C<\p{In_Arrows}> or C<\p{In_Hebrew}>.  For backwards
+compatibility, the C<In> prefix may be omitted if there is no naming conflict
+with a script or any other property, and you can even use an C<Is> prefix
+instead in those cases.  But it is not a good idea to do this, for a couple of
+reasons:
+
+=over 4
+
+=item 1
+
+It is confusing.  There are many naming conflicts, and you may forget some.
+For example, C<\p{Hebrew}> means the I<script> Hebrew, and NOT the I<block>
+Hebrew.  But would you remember that 6 months from now?
+
+=item 2
+
+It is unstable.  A new version of Unicode may pre-empt the current meaning by
+creating a property with the same name.  There was a time in very early Unicode
+releases when C<\p{Hebrew}> would have matched the I<block> Hebrew; now it
+doesn't.
+
  =back
  
-=head2 Scripts
-
-The scripts available via C<\p{...}> and C<\P{...}>, for example
-C<\p{Latin}> or \p{Cyrillic>, are as follows:
-
-    Arabic
-    Armenian
-    Bengali
-    Bopomofo
-    CanadianAboriginal
-    Cherokee
-    Cyrillic
-    Deseret
-    Devanagari
-    Ethiopic
-    Georgian
-    Gothic
-    Greek
-    Gujarati
-    Gurmukhi
-    Han
-    Hangul
-    Hebrew
-    Hiragana
-    Inherited
-    Kannada
-    Katakana
-    Khmer
-    Lao
-    Latin
-    Malayalam
-    Mongolian
-    Myanmar
-    Ogham
-    OldItalic
-    Oriya
-    Runic
-    Sinhala
-    Syriac
-    Tamil
-    Telugu
-    Thaana
-    Thai
-    Tibetan
-    Yi
-
-There are also extended property classes that supplement the basic
-properties, defined by the F<PropList> Unicode database:
-
-    ASCII_Hex_Digit
-    BidiControl
-    Dash
-    Diacritic
-    Extender
-    HexDigit
-    Hyphen
-    Ideographic
-    JoinControl
-    NoncharacterCodePoint
-    OtherAlphabetic
-    OtherLowercase
-    OtherMath
-    OtherUppercase
-    QuotationMark
-    WhiteSpace
-
-and further derived properties:
-
-    Alphabetic      Lu + Ll + Lt + Lm + Lo + OtherAlphabetic
-    Lowercase       Ll + OtherLowercase
-    Uppercase       Lu + OtherUppercase
-    Math            Sm + OtherMath
-
-    ID_Start        Lu + Ll + Lt + Lm + Lo + Nl
-    ID_Continue     ID_Start + Mn + Mc + Nd + Pc
-
-    Any             Any character
-    Assigned        Any non-Cn character (i.e. synonym for C<\P{Cn}>)
-    Unassigned      Synonym for C<\p{Cn}>
-    Common          Any character (or unassigned code point)
-                    not explicitly assigned to a script
-
-For backward compatability, all properties mentioned so far may have C<Is>
-prepended to their name (e.g. C<\P{IsLu}> is equal to C<\P{Lu}>).
-
-=head2 Blocks
-
-In addition to B<scripts>, Unicode also defines B<blocks> of characters.
-The difference between scripts and blocks is that the scripts concept is
-closer to natural languages, while the blocks concept is more an artificial
-grouping based on groups of mostly 256 Unicode characters. For example, the
-C<Latin> script contains letters from many blocks. On the other hand, the
-C<Latin> script does not contain all the characters from those blocks. It
-does not, for example, contain digits because digits are shared across many
-scripts. Digits and other similar groups, like punctuation, are in a
-category called C<Common>.
-
-For more about scripts, see the UTR #24:
-
-   http://www.unicode.org/unicode/reports/tr24/
-
-For more about blocks, see:
-
-   http://www.unicode.org/Public/UNIDATA/Blocks.txt
-
-Blocks names are given with the C<In> prefix. For example, the
-Katakana block is referenced via C<\p{InKatakana}>. The C<In>
-prefix may be omitted if there is no nameing conflict with a script
-or any other property, but it is recommended that C<In> always be used
-to avoid confusion.
-
-These block names are supported:
-
-   InAlphabeticPresentationForms
-   InArabicBlock
-   InArabicPresentationFormsA
-   InArabicPresentationFormsB
-   InArmenianBlock
-   InArrows
-   InBasicLatin
-   InBengaliBlock
-   InBlockElements
-   InBopomofoBlock
-   InBopomofoExtended
-   InBoxDrawing
-   InBraillePatterns
-   InByzantineMusicalSymbols
-   InCJKCompatibility
-   InCJKCompatibilityForms
-   InCJKCompatibilityIdeographs
-   InCJKCompatibilityIdeographsSupplement
-   InCJKRadicalsSupplement
-   InCJKSymbolsAndPunctuation
-   InCJKUnifiedIdeographs
-   InCJKUnifiedIdeographsExtensionA
-   InCJKUnifiedIdeographsExtensionB
-   InCherokeeBlock
-   InCombiningDiacriticalMarks
-   InCombiningHalfMarks
-   InCombiningMarksForSymbols
-   InControlPictures
-   InCurrencySymbols
-   InCyrillicBlock
-   InDeseretBlock
-   InDevanagariBlock
-   InDingbats
-   InEnclosedAlphanumerics
-   InEnclosedCJKLettersAndMonths
-   InEthiopicBlock
-   InGeneralPunctuation
-   InGeometricShapes
-   InGeorgianBlock
-   InGothicBlock
-   InGreekBlock
-   InGreekExtended
-   InGujaratiBlock
-   InGurmukhiBlock
-   InHalfwidthAndFullwidthForms
-   InHangulCompatibilityJamo
-   InHangulJamo
-   InHangulSyllables
-   InHebrewBlock
-   InHighPrivateUseSurrogates
-   InHighSurrogates
-   InHiraganaBlock
-   InIPAExtensions
-   InIdeographicDescriptionCharacters
-   InKanbun
-   InKangxiRadicals
-   InKannadaBlock
-   InKatakanaBlock
-   InKhmerBlock
-   InLaoBlock
-   InLatin1Supplement
-   InLatinExtendedAdditional
-   InLatinExtended-A
-   InLatinExtended-B
-   InLetterlikeSymbols
-   InLowSurrogates
-   InMalayalamBlock
-   InMathematicalAlphanumericSymbols
-   InMathematicalOperators
-   InMiscellaneousSymbols
-   InMiscellaneousTechnical
-   InMongolianBlock
-   InMusicalSymbols
-   InMyanmarBlock
-   InNumberForms
-   InOghamBlock
-   InOldItalicBlock
-   InOpticalCharacterRecognition
-   InOriyaBlock
-   InPrivateUse
-   InRunicBlock
-   InSinhalaBlock
-   InSmallFormVariants
-   InSpacingModifierLetters
-   InSpecials
-   InSuperscriptsAndSubscripts
-   InSyriacBlock
-   InTags
-   InTamilBlock
-   InTeluguBlock
-   InThaanaBlock
-   InThaiBlock
-   InTibetanBlock
-   InUnifiedCanadianAboriginalSyllabics
-   InYiRadicals
-   InYiSyllables
+Some people just prefer to always use C<\p{Block: foo}> and C<\p{Script: bar}>
+instead of the shortcuts, for clarity, and because they can't remember the
+difference between 'In' and 'Is' anyway (or aren't confident that those who
+eventually will read their code will know).
  
-=over 4
+A complete list of blocks and their shortcuts is in L<perluniprops>.
  
-=item *
+=head3 B<Other Properties>
  
-The special pattern C<\X> matches any extended Unicode sequence
-(a "combining character sequence" in Standardese), where the first
-character is a base character and subsequent characters are mark
-characters that apply to the base character.  It is equivalent to
-C<(?:\PM\pM*)>.
+There are many more properties than the very basic ones described here.
+A complete list is in L<perluniprops>.
  
-=item *
+Unicode defines all its properties in the compound form, so all single-form
+properties are Perl extensions.  A number of these are just synonyms for the
+Unicode ones, but some are genuine extensions, including a couple that are in
+the compound form.  And quite a few of these are actually recommended by Unicode
+(in L<http://www.unicode.org/reports/tr18>).
  
-The C<tr///> operator translates characters instead of bytes.  Note
-that the C<tr///CU> functionality has been removed, as the interface
-was a mistake.  For similar functionality see pack('U0', ...) and
-pack('C0', ...).
+This section gives some details on all the extensions that aren't synonyms for
+compound-form Unicode properties (for those, you'll have to refer to the
+L<Unicode Standard|http://www.unicode.org/reports/tr44>).
  
-=item *
+=over
  
-Case translation operators use the Unicode case translation tables
-when provided character input.  Note that C<uc()> (also known as C<\U>
-in doublequoted strings) translates to uppercase, while C<ucfirst>
-(also known as C<\u> in doublequoted strings) translates to titlecase
-(for languages that make the distinction).  Naturally the
-corresponding backslash sequences have the same semantics.
+=item B<C<\p{All}>>
  
-=item *
+This matches any of the 1_114_112 Unicode code points.  It is a synonym for
+C<\p{Any}>.
  
-Most operators that deal with positions or lengths in the string will
-automatically switch to using character positions, including
-C<chop()>, C<substr()>, C<pos()>, C<index()>, C<rindex()>,
-C<sprintf()>, C<write()>, and C<length()>.  Operators that
-specifically don't switch include C<vec()>, C<pack()>, and
-C<unpack()>.  Operators that really don't care include C<chomp()>, as
-well as any other operator that treats a string as a bucket of bits,
-such as C<sort()>, and the operators dealing with filenames.
+=item B<C<\p{Alnum}>>
  
-=item *
+This matches any C<\p{Alphabetic}> or C<\p{Decimal_Number}> character.
  
-The C<pack()>/C<unpack()> letters "C<c>" and "C<C>" do I<not> change,
-since they're often used for byte-oriented formats.  (Again, think
-"C<char>" in the C language.)  However, there is a new "C<U>" specifier
-that will convert between Unicode characters and integers.
+=item B<C<\p{Any}>>
  
-=item *
+This matches any of the 1_114_112 Unicode code points.  It is a synonym for
+C<\p{All}>.
  
-The C<chr()> and C<ord()> functions work on characters.  This is like
-C<pack("U")> and C<unpack("U")>, not like C<pack("C")> and
-C<unpack("C")>.  In fact, the latter are how you now emulate
-byte-oriented C<chr()> and C<ord()> for Unicode strings.
-(Note that this reveals the internal encoding of Unicode strings,
-which is not something one normally needs to care about at all.)
+=item B<C<\p{Assigned}>>
  
-=item *
+This matches any assigned code point; that is, any code point whose general
+category is not Unassigned (or equivalently, not Cn).
  
-The bit string operators C<& | ^ ~> can operate on character data.
-However, for backward compatibility reasons (bit string operations
-when the characters all are less than 256 in ordinal value) one should
-not mix C<~> (the bit complement) and characters both less than 256 and
-equal or greater than 256.  Most importantly, the DeMorgan's laws
-(C<~($x|$y) eq ~$x&~$y>, C<~($x&$y) eq ~$x|~$y>) won't hold.
-Another way to look at this is that the complement cannot return
-B<both> the 8-bit (byte) wide bit complement B<and> the full character
-wide bit complement.
+=item B<C<\p{Blank}>>
  
-=item *
+This is the same as C<\h> and C<\p{HorizSpace}>:  A character that changes the
+spacing horizontally.
  
-lc(), uc(), lcfirst(), and ucfirst() work for the following cases:
+=item B<C<\p{Decomposition_Type: Non_Canonical}>>    (Short: C<\p{Dt=NonCanon}>)
  
-=over 8
+Matches a character that has a non-canonical decomposition.
  
-=item *
+To understand the use of this rarely used property=value combination, it is
+necessary to know some basics about decomposition.
+Consider a character, say H.  It could appear with various marks around it,
+such as an acute accent, or a circumflex, or various hooks, circles, arrows,
+I<etc.>, above, below, to one side and/or the other, etc.  There are many
+possibilities among the world's languages.  The number of combinations is
+astronomical, and if there were a character for each combination, it would
+soon exhaust Unicode's more than a million possible characters.  So Unicode
+took a different approach: there is a character for the base H, and a
+character for each of the possible marks, and they can be combined variously
+to get a final logical character.  So a logical character--what appears to be a
+single character--can be a sequence of more than one individual character.
+This is called an "extended grapheme cluster".  (Perl furnishes the C<\X>
+construct to match such sequences.)
  
-the case mapping is from a single Unicode character to another
-single Unicode character
+But Unicode's intent is to unify the existing character set standards and
+practices, and a number of pre-existing standards have single characters that
+mean the same thing as some of these combinations.  An example is ISO-8859-1,
+which has quite a few of these in the Latin-1 range, an example being "LATIN
+CAPITAL LETTER E WITH ACUTE".  Because this character was in this pre-existing
+standard, Unicode added it to its repertoire.  But this character is considered
+by Unicode to be equivalent to the sequence consisting of first the character
+"LATIN CAPITAL LETTER E", then the character "COMBINING ACUTE ACCENT".
  
-=item *
+"LATIN CAPITAL LETTER E WITH ACUTE" is called a "pre-composed" character, and
+the equivalence with the sequence is called canonical equivalence.  All
+pre-composed characters are said to have a decomposition (into the equivalent
+sequence) and the decomposition type is also called canonical.
+
+However, many more characters have a different type of decomposition, a
+"compatible" or "non-canonical" decomposition.  The sequences that form these
+decompositions are not considered canonically equivalent to the pre-composed
+character.  An example, again in the Latin-1 range, is the "SUPERSCRIPT ONE".
+It is kind of like a regular digit 1, but not exactly; its decomposition
+into the digit 1 is called a "compatible" decomposition, specifically a
+"super" decomposition.  There are several such compatibility
+decompositions (see L<http://www.unicode.org/reports/tr44>), including one
+called "compat" which means some miscellaneous type of decomposition
+that doesn't fit into the decomposition categories that Unicode has chosen. 
+
+Note that most Unicode characters don't have a decomposition, so their
+decomposition type is "None".
+
+Perl has added the C<Non_Canonical> type, for your convenience, to mean any of
+the compatibility decompositions.
+
+=item B<C<\p{Graph}>>
+
+Matches any character that is graphic.  Theoretically, this means a character
+that on a printer would cause ink to be used.
+
+=item B<C<\p{HorizSpace}>>
+
+This is the same as C<\h> and C<\p{Blank}>:  A character that changes the
+spacing horizontally.
+
+=item B<C<\p{In=*}>> 
+
+This is a synonym for C<\p{Present_In=*}>.
+
+=item B<C<\p{PerlSpace}>>
+
+This is the same as C<\s>, restricted to ASCII, namely C<S<[ \f\n\r\t]>>.
+
+Mnemonic: Perl's (original) space.
+
+=item B<C<\p{PerlWord}>>
+
+This is the same as C<\w>, restricted to ASCII, namely C<[A-Za-z0-9_]>.
+
+Mnemonic: Perl's (original) word.
+
+=item B<C<\p{PosixAlnum}>>
+
+This matches any alphanumeric character in the ASCII range, namely
+C<[A-Za-z0-9]>.
+
+=item B<C<\p{PosixAlpha}>>
+
+This matches any alphabetic character in the ASCII range, namely C<[A-Za-z]>.
+
+=item B<C<\p{PosixBlank}>>
+
+This matches any blank character in the ASCII range, namely C<S<[ \t]>>.
+
+=item B<C<\p{PosixCntrl}>>
+
+This matches any control character in the ASCII range, namely
+C<[\x00-\x1F\x7F]>.
+
+=item B<C<\p{PosixDigit}>>
  
-the case mapping is from a single Unicode character to more
-than one Unicode character
+This matches any digit character in the ASCII range, namely C<[0-9]>.
+
+=item B<C<\p{PosixGraph}>>
+
+This matches any graphical character in the ASCII range, namely C<[\x21-\x7E]>.
+
+=item B<C<\p{PosixLower}>>
+
+This matches any lowercase character in the ASCII range, namely C<[a-z]>.
+
+=item B<C<\p{PosixPrint}>>
+
+This matches any printable character in the ASCII range, namely C<[\x20-\x7E]>.
+These are the graphical characters plus SPACE.
+
+=item B<C<\p{PosixPunct}>>
+
+This matches any punctuation character in the ASCII range, namely
+C<[\x21-\x2F\x3A-\x40\x5B-\x60\x7B-\x7E]>.  These are the
+graphical characters that aren't word characters.  Note that the Posix standard
+includes in its definition of punctuation, those characters that Unicode calls
+"symbols."
+
+=item B<C<\p{PosixSpace}>>
+
+This matches any space character in the ASCII range, namely
+C<S<[ \f\n\r\t\x0B]>> (the last being a vertical tab).
+
+=item B<C<\p{PosixUpper}>>
+
+This matches any uppercase character in the ASCII range, namely C<[A-Z]>.
+
+=item B<C<\p{Present_In: *}>>    (Short: C<\p{In=*}>)
+
+This property is used when you need to know in what Unicode version(s) a
+character is.
+
+The "*" above stands for some two-digit Unicode version number, such as
+C<1.1> or C<4.0>; or the "*" can also be C<Unassigned>.  This property will
+match the code points whose final disposition has been settled as of the
+Unicode release given by the version number; C<\p{Present_In: Unassigned}>
+will match those code points whose meaning has yet to be assigned.
+
+For example, C<U+0041> "LATIN CAPITAL LETTER A" was present in the very first
+Unicode release available, which is C<1.1>, so this property is true for all
+valid "*" versions.  On the other hand, C<U+1EFF> was not assigned until version
+5.1 when it became "LATIN SMALL LETTER Y WITH LOOP", so the only "*" values
+that would match it are 5.1, 5.2, and later.
+
+Unicode furnishes the C<Age> property from which this is derived.  The problem
+with Age is that a strict interpretation of it (which Perl takes) has it
+matching the precise release a code point's meaning is introduced in.  Thus
+C<U+0041> would match only 1.1; and C<U+1EFF> only 5.1.  This is not usually what
+you want.
+
+Some non-Perl implementations of the Age property may change its meaning to be
+the same as the Perl Present_In property; just be aware of that.
+
+Another confusion with both these properties is that the definition is not
+that the code point has been assigned, but that the meaning of the code point
+has been determined.  This is because 66 code points will always be
+unassigned, and so the Age for them is the Unicode version the decision to
+make them so was made in.  For example, C<U+FDD0> is to be permanently
+unassigned to a character, and the decision to do that was made in version 3.1,
+so C<\p{Age=3.1}> matches this character and C<\p{Present_In: 3.1}> and up
+matches as well.
+
+=item B<C<\p{Print}>>
+
+This matches any character that is graphical or blank, except controls.
+
+=item B<C<\p{SpacePerl}>>
+
+This is the same as C<\s>, including beyond ASCII.
+
+Mnemonic: Space, as modified by Perl.  (It doesn't include the vertical tab
+which both the Posix standard and Unicode consider to be space.)
+
+=item B<C<\p{VertSpace}>>
+
+This is the same as C<\v>:  A character that changes the spacing vertically.
+
+=item B<C<\p{Word}>>
+
+This is the same as C<\w>, including beyond ASCII.
  
  =back
  
-What doesn't yet work are the following cases:
+=head2 User-Defined Character Properties
+
+You can define your own binary character properties by defining subroutines
+whose names begin with "In" or "Is".  The subroutines can be defined in any
+package.  The user-defined properties can be used in the regular expression
+C<\p> and C<\P> constructs; if you are using a user-defined property from a
+package other than the one you are in, you must specify its package in the
+C<\p> or C<\P> construct.
+
+    # assuming property IsForeign defined in Lang::
+    package main;  # property package name required
+    if ($txt =~ /\p{Lang::IsForeign}+/) { ... }
+
+    package Lang;  # property package name not required
+    if ($txt =~ /\p{IsForeign}+/) { ... }
+
  
-=over 8
+Note that the effect is compile-time and immutable once defined.
+
+The subroutines must return a specially-formatted string, with one
+or more newline-separated lines.  Each line must be one of the following:
+
+=over 4
  
  =item *
  
-the "final sigma" (Greek)
+A single hexadecimal number denoting a Unicode code point to include.
  
  =item *
  
-anything to with locales (Lithuanian, Turkish, Azeri)
+Two hexadecimal numbers separated by horizontal whitespace (space or
+tab characters) denoting a range of Unicode code points to include.
  
-=back
+=item *
  
-See the Unicode Technical Report #21, Case Mappings, for more details.
+Something to include, prefixed by "+": a built-in character
+property (prefixed by "utf8::") or a user-defined character property,
+to represent all the characters in that property; two hexadecimal code
+points for a range; or a single hexadecimal code point.
  
  =item *
  
-And finally, C<scalar reverse()> reverses by character rather than by byte.
+Something to exclude, prefixed by "-": an existing character
+property (prefixed by "utf8::") or a user-defined character property,
+to represent all the characters in that property; two hexadecimal code
+points for a range; or a single hexadecimal code point.
+
+=item *
+
+Something to negate, prefixed by "!": an existing character
+property (prefixed by "utf8::") or a user-defined character property,
+to represent all the characters in that property; two hexadecimal code
+points for a range; or a single hexadecimal code point.
+
+=item *
+
+Something to intersect with, prefixed by "&": an existing character
+property (prefixed by "utf8::") or a user-defined character property,
+to keep only the characters that are also in that property; two
+hexadecimal code points for a range; or a single hexadecimal code point.
  
  =back
  
-=head2 Character encodings for input and output
+For example, to define a property that covers both the Japanese
+syllabaries (hiragana and katakana), you can define
+
+    sub InKana {
+        return <<END;
+    3040\t309F
+    30A0\t30FF
+    END
+    }
+
+Imagine that the here-doc end marker is at the beginning of the line.
+Now you can use C<\p{InKana}> and C<\P{InKana}>.
+
+You could also have used the existing block property names:
+
+    sub InKana {
+        return <<'END';
+    +utf8::InHiragana
+    +utf8::InKatakana
+    END
+    }
+
+Suppose you wanted to match only the allocated characters,
+not the raw block ranges: in other words, you want to remove
+the unassigned code points:
+
+    sub InKana {
+        return <<'END';
+    +utf8::InHiragana
+    +utf8::InKatakana
+    -utf8::IsCn
+    END
+    }
+
+The negation is useful for defining (surprise!) negated classes.
+
+    sub InNotKana {
+        return <<'END';
+    !utf8::InHiragana
+    -utf8::InKatakana
+    +utf8::IsCn
+    END
+    }
+
+Intersection is useful for getting the common characters matched by
+two (or more) classes.
+
+    sub InFooAndBar {
+        return <<'END';
+    +main::Foo
+    &main::Bar
+    END
+    }
+
+It's important to remember not to use "&" for the first set; that
+would be intersecting with nothing (resulting in an empty set).
+
+=head2 User-Defined Case Mappings
+
+You can also define your own mappings to be used in C<lc()>,
+C<lcfirst()>, C<uc()>, and C<ucfirst()> (or their string-inlined versions,
+C<\L>, C<\l>, C<\U>, and C<\u>).
+The principle is similar to that of user-defined character
+properties: to define subroutines
+with names C<ToLower> (for C<lc()> and C<lcfirst()>); C<ToTitle> (for
+C<ucfirst()>); and C<ToUpper> (for C<uc()>).
+
+The string returned by the subroutines needs to be two hexadecimal numbers
+separated by two tab characters: the two numbers being, respectively, the source
+code point and the destination code point.  For example:
+
+    sub ToUpper {
+        return <<END;
+    0061\t\t0041
+    END
+    }
+
+defines a mapping for C<uc()> (and C<\U>) that causes only the character "a"
+to be mapped to "A"; all other characters will remain unchanged.
+
+(For serious hackers only)  The above means you have to furnish a complete
+mapping; you can't just override a couple of characters and leave the rest
+unchanged.  You can find all the official mappings in the directory
+C<$Config{privlib}>F</unicore/To/>.  The mapping data is returned as the
+here-document.  The C<utf8::ToSpecI<Foo>> hashes in those files are special
+exception mappings derived from
+C<$Config{privlib}>F</unicore/SpecialCasing.txt>.  (The "Digit" and
+"Fold" mappings that one can see in the directory are not directly
+user-accessible; one can use either the L<Unicode::UCD> module, or just match
+case-insensitively, which is what uses the "Fold" mapping.  Neither is
+user-overridable.)
+
+If you have many mappings to change, you can take the official mapping data,
+change by hand the affected code points, and place the whole thing into your
+subroutine.  But this will only be valid on Perls that use the same Unicode
+version.  Another option would be to have your subroutine read the official
+mapping file(s) and overwrite the affected code points.
+
+If you have only a few mappings to change, starting in 5.14 you can use the
+following trick, here illustrated for Turkish.
+
+    use Config;
+
+    sub ToUpper {
+        my $official = do "$Config{privlib}/unicore/To/Upper.pl";
+        $utf8::ToSpecUpper{'i'} = 
+                           "\N{LATIN CAPITAL LETTER I WITH DOT ABOVE}";
+        return $official;
+    }
+
+This takes the official mappings and overrides just one, for "LATIN SMALL
+LETTER I".  The keys to the hash must be in UTF-8 (or on EBCDIC platforms,
+UTF-EBCDIC), as illustrated by the inverse function.
+
+    sub ToLower {
+        my $official = do "$Config{privlib}/unicore/To/Lower.pl";
+        $utf8::ToSpecLower{"\xc4\xb0"} = "i";
+        return $official;
+    }
+
+This example is for an ASCII platform, and C<\xc4\xb0> is the UTF-8 string that 
+represents C<\N{LATIN CAPITAL LETTER I WITH DOT ABOVE}>, C<U+0130>.
+
+(The trick illustrated here does work in earlier releases, but only if all the
+characters you want to override have ordinal values of 256 or higher.)
+
+The mappings are in effect only for the package they are defined in, and only
+on scalars that have been marked as having Unicode characters, for example by
+using C<utf8::upgrade()>.  You can get around the latter restriction in the
+scope of a C<S<use subs>>:
+
+    use subs qw(uc ucfirst lc lcfirst);
+
+    sub uc($) {
+        my $string = shift;
+        utf8::upgrade($string);
+        return CORE::uc($string);
+    }
+
+    sub lc($) {
+        my $string = shift;
+        utf8::upgrade($string);
+
+        # Unless an I is before a dot_above, it turns into a dotless i.
+        $string =~ 
+              s/I (?! [^\p{ccc=0}\p{ccc=Above}]* \x{0307} )/\x{131}/gx;
+
+        # But when the I is followed by a dot_above, remove the
+        # dot_above so the end result will be i.
+        $string =~ s/I ([^\p{ccc=0}\p{ccc=Above}]* ) \x{0307}/i$1/gx;
+        return CORE::lc($string);
+    }
+
+These examples (also for Turkish) make sure the input is in UTF-8, and then
+call the corresponding official function, which will use the C<ToUpper()> and
+C<ToLower()> functions you have defined in the package.  The C<lc()> example
+shows how you can add context-dependent casing.  (For Turkish, there are other
+required functions: C<ucfirst>, C<lcfirst>, and C<ToTitle>.  These are very
+similar to the ones given above.)
+
+=head2 Character Encodings for Input and Output
  
  See L<Encode>.
  
  =head2 Unicode Regular Expression Support Level
  
-The following list of Unicode regular expression support describes
-feature by feature the Unicode support implemented in Perl as of Perl
-5.8.0.  The "Level N" and the section numbers refer to the Unicode
-Technical Report 18, "Unicode Regular Expression Guidelines".
+The following list of Unicode support for regular expressions describes
+all the features currently supported.  The references to "Level N"
+and the section numbers refer to the Unicode Technical Standard #18,
+"Unicode Regular Expressions", version 11, in May 2005.
  
  =over 4
  
@@ -601,83 +994,117 @@ Technical Report 18, "Unicode Regular Expression Guidelines".
  
  Level 1 - Basic Unicode Support
  
-        2.1 Hex Notation                        - done          [1]
-            Named Notation                      - done          [2]
-        2.2 Categories                          - done          [3][4]
-        2.3 Subtraction                         - MISSING       [5][6]
-        2.4 Simple Word Boundaries              - done          [7]
-        2.5 Simple Loose Matches                - done          [8]
-        2.6 End of Line                         - MISSING       [9][10]
-
-        [ 1] \x{...}
-        [ 2] \N{...}
-        [ 3] . \p{...} \P{...}
-        [ 4] now scripts (see UTR#24 Script Names) in addition to blocks
-        [ 5] have negation
-        [ 6] can use look-ahead to emulate subtraction (*)
-        [ 7] include Letters in word characters
-        [ 8] note that perl does Full casefolding in matching, not Simple:
-             for example U+1F88 is equivalent with U+1F000 U+03B9,
-             not with 1F80.  This difference matters for certain Greek
-             capital letters with certain modifiers: the Full casefolding
-             decomposes the letter, while the Simple casefolding would map
-             it to a single character.
-        [ 9] see UTR#13 Unicode Newline Guidelines
-        [10] should do ^ and $ also on \x{85}, \x{2028} and \x{2029})
-             (should also affect <>, $., and script line numbers)
-             (the \x{85}, \x{2028} and \x{2029} do match \s)
-
-(*) You can mimic class subtraction using lookahead.
-For example, what TR18 might write as
+        RL1.1   Hex Notation                     - done          [1]
+        RL1.2   Properties                       - done          [2][3]
+        RL1.2a  Compatibility Properties         - done          [4]
+        RL1.3   Subtraction and Intersection     - MISSING       [5]
+        RL1.4   Simple Word Boundaries           - done          [6]
+        RL1.5   Simple Loose Matches             - done          [7]
+        RL1.6   Line Boundaries                  - MISSING       [8]
+        RL1.7   Supplementary Code Points        - done          [9]
+
+        [1]  \x{...}
+        [2]  \p{...} \P{...}
+        [3]  supports not only minimal list, but all Unicode character
+             properties (see L</Unicode Character Properties>)
+        [4]  \d \D \s \S \w \W \X [:prop:] [:^prop:]
+        [5]  can use regular expression look-ahead [a] or
+             user-defined character properties [b] to emulate set
+             operations
+        [6]  \b \B
+        [7]  note that Perl does Full case-folding in matching (but with
+             bugs), not Simple: for example U+1F88 is equivalent to
+             U+1F00 U+03B9, not to U+1F80.  This difference matters
+             mainly for certain Greek capital letters with certain
+             modifiers: the Full case-folding decomposes the letter,
+             while the Simple case-folding would map it to a single
+             character.
+        [8]  should do ^ and $ also on U+000B (\v in C), FF (\f), CR
+             (\r), CRLF (\r\n), NEL (U+0085), LS (U+2028), and PS
+             (U+2029); should also affect <>, $., and script line
+             numbers; should not split lines within CRLF [c] (i.e. there
+             is no empty line between \r and \n)
+        [9]  UTF-8/UTF-EBCDIC used in Perl allows not only U+10000 to
+             U+10FFFF but also beyond U+10FFFF [d]
+
+[a] You can mimic class subtraction using lookahead.
+For example, what UTS#18 might write as
  
      [{Greek}-[{UNASSIGNED}]]
  
  in Perl can be written as:
  
-    (?!\p{Unassigned})\p{InGreek}
-    (?=\p{Assigned})\p{InGreek}
+    (?!\p{Unassigned})\p{InGreekAndCoptic}
+    (?=\p{Assigned})\p{InGreekAndCoptic}
  
  But in this particular example, you probably really want
  
-    \p{Greek}
+    \p{Greek}
  
  which will match assigned characters known to be part of the Greek script.
  
-=item *
+Also see the Unicode::Regex::Set module; it implements the full
+UTS#18 grouping, intersection, union, and removal (subtraction) syntax.
  
-Level 2 - Extended Unicode Support
+[b] '+' for union, '-' for removal (set-difference), '&' for intersection
+(see L</"User-Defined Character Properties">)
  
-        3.1 Surrogates                          - MISSING
-        3.2 Canonical Equivalents               - MISSING       [11][12]
-        3.3 Locale-Independent Graphemes        - MISSING       [13]
-        3.4 Locale-Independent Words            - MISSING       [14]
-        3.5 Locale-Independent Loose Matches    - MISSING       [15]
+[c] Try the C<:crlf> layer (see L<PerlIO>).
  
-        [11] see UTR#15 Unicode Normalization
-        [12] have Unicode::Normalize but not integrated to regexes
-        [13] have \X but at this level . should equal that
-        [14] need three classes, not just \w and \W
-        [15] see UTR#21 Case Mappings
+[d] U+FFFF will currently generate a warning message if 'utf8' warnings are
+    enabled
  
  =item *
  
-Level 3 - Locale-Sensitive Support
+Level 2 - Extended Unicode Support
+
+        RL2.1   Canonical Equivalents           - MISSING       [10][11]
+        RL2.2   Default Grapheme Clusters       - MISSING       [12]
+        RL2.3   Default Word Boundaries         - MISSING       [14]
+        RL2.4   Default Loose Matches           - MISSING       [15]
+        RL2.5   Name Properties                 - MISSING       [16]
+        RL2.6   Wildcard Properties             - MISSING
  
-        4.1 Locale-Dependent Categories         - MISSING
-        4.2 Locale-Dependent Graphemes          - MISSING       [16][17]
-        4.3 Locale-Dependent Words              - MISSING
-        4.4 Locale-Dependent Loose Matches      - MISSING
-        4.5 Locale-Dependent Ranges             - MISSING
+        [10] see UAX#15 "Unicode Normalization Forms"
+        [11] have Unicode::Normalize but not integrated to regexes
+        [12] have \X but we don't have a "Grapheme Cluster Mode"
+        [14] see UAX#29, Word Boundaries
+        [15] see UAX#21 "Case Mappings"
+        [16] missing loose match [e]
  
-        [16] see UTR#10 Unicode Collation Algorithms
-        [17] have Unicode::Collate but not integrated to regexes
+[e] C<\N{...}> allows namespaces (see L<charnames>).
+
+=item *
+
+Level 3 - Tailored Support
+
+        RL3.1   Tailored Punctuation            - MISSING
+        RL3.2   Tailored Grapheme Clusters      - MISSING       [17][18]
+        RL3.3   Tailored Word Boundaries        - MISSING
+        RL3.4   Tailored Loose Matches          - MISSING
+        RL3.5   Tailored Ranges                 - MISSING
+        RL3.6   Context Matching                - MISSING       [19]
+        RL3.7   Incremental Matches             - MISSING
+      ( RL3.8   Unicode Set Sharing )
+        RL3.9   Possible Match Sets             - MISSING
+        RL3.10  Folded Matching                 - MISSING       [20]
+        RL3.11  Submatchers                     - MISSING
+
+        [17] see UTS#10 "Unicode Collation Algorithm"
+        [18] have Unicode::Collate but not integrated to regexes
+        [19] have (?<=x) and (?=x), but look-aheads or look-behinds
+             should see outside of the target substring
+        [20] need insensitive matching for linguistic features other
+             than case; for example, hiragana to katakana, wide and
+             narrow, simplified Han to traditional Han (see UTR#30
+             "Character Foldings")
  
  =back
  
  =head2 Unicode Encodings
  
-Unicode characters are assigned to I<code points> which are abstract
-numbers.  To use these numbers various encodings are needed.
+Unicode characters are assigned to I<code points>, which are abstract
+numbers.  To use these numbers, various encodings are needed.
  
  =over 4
  
@@ -686,27 +1113,32 @@ numbers.  To use these numbers various encodings are needed.
  UTF-8
  
  UTF-8 is a variable-length (1 to 6 bytes, current character allocations
-require 4 bytes), byteorder independent encoding. For ASCII, UTF-8 is
-transparent (and we really do mean 7-bit ASCII, not another 8-bit encoding).
+require 4 bytes), byte-order independent encoding. For ASCII (and we
+really do mean 7-bit ASCII, not another 8-bit encoding), UTF-8 is
+transparent.
  
  The following table is from Unicode 3.2.
  
   Code Points            1st Byte  2nd Byte  3rd Byte  4th Byte
  
     U+0000..U+007F       00..7F
-   U+0080..U+07FF       C2..DF    80..BF
-   U+0800..U+0FFF       E0        A0..BF    80..BF  
-   U+1000..U+CFFF       E1..EC    80..BF    80..BF  
-   U+D000..U+D7FF       ED        80..9F    80..BF  
-   U+D800..U+DFFF       ******* ill-formed *******
-   U+E000..U+FFFF       EE..EF    80..BF    80..BF  
-  U+10000..U+3FFFF      F0        90..BF    80..BF    80..BF
+   U+0080..U+07FF     * C2..DF    80..BF
+   U+0800..U+0FFF       E0      * A0..BF    80..BF
+   U+1000..U+CFFF       E1..EC    80..BF    80..BF
+   U+D000..U+D7FF       ED        80..9F    80..BF
+   U+D800..U+DFFF       +++++++ utf16 surrogates, not legal utf8 +++++++
+   U+E000..U+FFFF       EE..EF    80..BF    80..BF
+  U+10000..U+3FFFF      F0      * 90..BF    80..BF    80..BF
    U+40000..U+FFFFF      F1..F3    80..BF    80..BF    80..BF
   U+100000..U+10FFFF     F4        80..8F    80..BF    80..BF
  
-Note the A0..BF in U+0800..U+0FFF, the 80..9F in U+D000...U+D7FF,
-the 90..BF in U+10000..U+3FFFF, and the 80...8F in U+100000..U+10FFFF.
-Or, another way to look at it, as bits:
+Note the gaps before several of the byte entries above marked by '*'.  These are
+caused by legal UTF-8 avoiding non-shortest encodings: it is technically
+possible to UTF-8-encode a single code point in different ways, but that is
+explicitly forbidden, and the shortest possible encoding should always be used
+(and that is what Perl does).
+
+Another way to look at it is via bits:
  
   Code Points                    1st Byte   2nd Byte  3rd Byte  4th Byte
  
@@ -715,285 +1147,688 @@ Or, another way to look at it, as bits:
              ccccbbbbbbaaaaaa     1110cccc  10bbbbbb  10aaaaaa
    00000dddccccccbbbbbbaaaaaa     11110ddd  10cccccc  10bbbbbb  10aaaaaa
  
-As you can see, the continuation bytes all begin with C<10>, and the
-leading bits of the start byte tell how many bytes the are in the
+As you can see, the continuation bytes all begin with "10", and the
+leading bits of the start byte tell how many bytes there are in the
  encoded character.
  
  =item *
  
  UTF-EBCDIC
  
-Like UTF-8, but EBCDIC-safe, as UTF-8 is ASCII-safe.
+Like UTF-8 but EBCDIC-safe, in the way that UTF-8 is ASCII-safe.
  
  =item *
  
-UTF-16, UTF-16BE, UTF16-LE, Surrogates, and BOMs (Byte Order Marks)
+UTF-16, UTF-16BE, UTF-16LE, Surrogates, and BOMs (Byte Order Marks)
  
-(The followings items are mostly for reference, Perl doesn't
-use them internally.)
+The following items are mostly for reference and general Unicode
+knowledge; Perl doesn't use these constructs internally.
  
  UTF-16 is a 2 or 4 byte encoding.  The Unicode code points
-0x0000..0xFFFF are stored in two 16-bit units, and the code points
-0x010000..0x10FFFF in two 16-bit units.  The latter case is
+C<U+0000..U+FFFF> are stored in a single 16-bit unit, and the code
+points C<U+10000..U+10FFFF> in two 16-bit units.  The latter case is
  using I<surrogates>, the first 16-bit unit being the I<high
  surrogate>, and the second being the I<low surrogate>.
  
-Surrogates are code points set aside to encode the 0x01000..0x10FFFF
+Surrogates are code points set aside to encode the C<U+10000..U+10FFFF>
  range of Unicode code points in pairs of 16-bit units.  The I<high
-surrogates> are the range 0xD800..0xDBFF, and the I<low surrogates>
-are the range 0xDC00..0xDFFFF.  The surrogate encoding is
+surrogates> are the range C<U+D800..U+DBFF> and the I<low surrogates>
+are the range C<U+DC00..U+DFFF>.  The surrogate encoding is
  
-       $hi = ($uni - 0x10000) / 0x400 + 0xD800;
-       $lo = ($uni - 0x10000) % 0x400 + 0xDC00;
+    $hi = ($uni - 0x10000) / 0x400 + 0xD800;
+    $lo = ($uni - 0x10000) % 0x400 + 0xDC00;
  
  and the decoding is
  
-       $uni = 0x10000 + ($hi - 0xD8000) * 0x400 + ($lo - 0xDC00);
+    $uni = 0x10000 + ($hi - 0xD800) * 0x400 + ($lo - 0xDC00);
  
  If you try to generate surrogates (for example by using chr()), you
-will get a warning if warnings are turned on (C<-w> or C<use
-warnings;>) because those code points are not valid for a Unicode
-character.
+will get a warning, if warnings are turned on, because those code
+points are not valid for a Unicode character.
  
-Because of the 16-bitness, UTF-16 is byteorder dependent.  UTF-16
+Because of the 16-bitness, UTF-16 is byte-order dependent.  UTF-16
  itself can be used for in-memory computations, but if storage or
-transfer is required, either UTF-16BE (Big Endian) or UTF-16LE
-(Little Endian) must be chosen.
+transfer is required either UTF-16BE (big-endian) or UTF-16LE
+(little-endian) encodings must be chosen.
  
  This introduces another problem: what if you just know that your data
-is UTF-16, but you don't know which endianness?  Byte Order Marks
-(BOMs) are a solution to this.  A special character has been reserved
+is UTF-16, but you don't know which endianness?  Byte Order Marks, or
+BOMs, are a solution to this.  A special character has been reserved
  in Unicode to function as a byte order marker: the character with the
-code point 0xFEFF is the BOM.
+code point C<U+FEFF> is the BOM.
  
  The trick is that if you read a BOM, you will know the byte order,
-since if it was written on a big endian platform, you will read the
-bytes 0xFE 0xFF, but if it was written on a little endian platform,
-you will read the bytes 0xFF 0xFE.  (And if the originating platform
-was writing in UTF-8, you will read the bytes 0xEF 0xBB 0xBF.)
+since if it was written on a big-endian platform, you will read the
+bytes C<0xFE 0xFF>, but if it was written on a little-endian platform,
+you will read the bytes C<0xFF 0xFE>.  (And if the originating platform
+was writing in UTF-8, you will read the bytes C<0xEF 0xBB 0xBF>.)
  
  The way this trick works is that the character with the code point
-0xFFFE is guaranteed not to be a valid Unicode character, so the
-sequence of bytes 0xFF 0xFE is unambiguously "BOM, represented in
-little-endian format" and cannot be "0xFFFE, represented in big-endian
-format".
+C<U+FFFE> is guaranteed not to be a valid Unicode character, so the
+sequence of bytes C<0xFF 0xFE> is unambiguously "BOM, represented in
+little-endian format" and cannot be "C<U+FFFE>, represented in big-endian
+format".  (Actually, C<U+FFFE> is legal for use by your program, even for
+input/output, but better not use it if you need a BOM.  But it is "illegal for
+interchange", so that an unsuspecting program won't get confused.)
  
  =item *
  
-UTF-32, UTF-32BE, UTF32-LE
+UTF-32, UTF-32BE, UTF-32LE
  
  The UTF-32 family is pretty much like the UTF-16 family, except that
  the units are 32-bit, and therefore the surrogate scheme is not
-needed.  The BOM signatures will be 0x00 0x00 0xFE 0xFF for BE and
-0xFF 0xFE 0x00 0x00 for LE.
+needed.  The BOM signatures will be C<0x00 0x00 0xFE 0xFF> for BE and
+C<0xFF 0xFE 0x00 0x00> for LE.
  
  =item *
  
  UCS-2, UCS-4
  
  Encodings defined by the ISO 10646 standard.  UCS-2 is a 16-bit
-encoding, UCS-4 is a 32-bit encoding.  Unlike UTF-16, UCS-2
-is not extensible beyond 0xFFFF, because it does not use surrogates.
+encoding.  Unlike UTF-16, UCS-2 is not extensible beyond C<U+FFFF>,
+because it does not use surrogates.  UCS-4 is a 32-bit encoding,
+functionally identical to UTF-32.
  
  =item *
  
  UTF-7
  
-A seven-bit safe (non-eight-bit) encoding, useful if the
-transport/storage is not eight-bit safe.  Defined by RFC 2152.
+A seven-bit safe (non-eight-bit) encoding, which is useful if the
+transport or storage is not eight-bit safe.  Defined by RFC 2152.
  
  =back
  
-=head2 Security Implications of Malformed UTF-8
+=head2 Security Implications of Unicode
+
+Read L<Unicode Security Considerations|http://www.unicode.org/reports/tr36>.
+Also, note the following:
+
+=over 4
+
+=item *
+
+Malformed UTF-8
  
  Unfortunately, the specification of UTF-8 leaves some room for
  interpretation of how many bytes of encoded output one should generate
-from one input Unicode character.  Strictly speaking, one is supposed
-to always generate the shortest possible sequence of UTF-8 bytes,
-because otherwise there is potential for input buffer overflow at
+from one input Unicode character.  Strictly speaking, the shortest
+possible sequence of UTF-8 bytes should be generated,
+because otherwise there is potential for an input buffer overflow at
  the receiving end of a UTF-8 connection.  Perl always generates the
-shortest length UTF-8, and with warnings on (C<-w> or C<use
-warnings;>) Perl will warn about non-shortest length UTF-8 (and other
-malformations, too, such as the surrogates, which are not real
-Unicode code points.)
+shortest length UTF-8, and with warnings on, Perl will warn about
+non-shortest length UTF-8 along with other malformations, such as the
+surrogates, which are not real Unicode code points.
+
+=item *
+
+Regular expressions behave slightly differently between byte data and
+character (Unicode) data.  For example, the "word character" character
+class C<\w> will work differently depending on if data is eight-bit bytes
+or Unicode.
+
+In the first case, the set of C<\w> characters is either small--the
+default set of alphabetic characters, digits, and the "_"--or, if you
+are using a locale (see L<perllocale>), the C<\w> might contain a few
+more letters according to your language and country.
+
+In the second case, the C<\w> set of characters is much, much larger.
+Most importantly, even in the set of the first 256 characters, it will
+probably match different characters: unlike most locales, which are
+specific to a language and country pair, Unicode classifies all the
+characters that are letters I<somewhere> as C<\w>.  For example, your
+locale might not think that LATIN SMALL LETTER ETH is a letter (unless
+you happen to speak Icelandic), but Unicode does.
+
+As discussed elsewhere, Perl has one foot (two hooves?) planted in
+each of two worlds: the old world of bytes and the new world of
+characters, upgrading from bytes to characters when necessary.
+If your legacy code does not explicitly use Unicode, no automatic
+switch-over to characters should happen.  Characters shouldn't get
+downgraded to bytes, either.  It is possible to accidentally mix bytes
+and characters, however (see L<perluniintro>), in which case C<\w> in
+regular expressions might start behaving differently.  Review your
+code.  Use warnings and the C<strict> pragma.
+
+=back
  
  =head2 Unicode in Perl on EBCDIC
  
-The way Unicode is handled on EBCDIC platforms is still rather
-experimental.  On such a platform, references to UTF-8 encoding in this
-document and elsewhere should be read as meaning UTF-EBCDIC as
-specified in Unicode Technical Report 16 unless ASCII vs EBCDIC issues
+The way Unicode is handled on EBCDIC platforms is still
+experimental.  On such platforms, references to UTF-8 encoding in this
+document and elsewhere should be read as meaning the UTF-EBCDIC
+specified in Unicode Technical Report 16, unless ASCII vs. EBCDIC issues
  are specifically discussed. There is no C<utfebcdic> pragma or
-":utfebcdic" layer, rather, "utf8" and ":utf8" are re-used to mean
+":utfebcdic" layer; rather, "utf8" and ":utf8" are reused to mean
  the platform's "natural" 8-bit encoding of Unicode. See L<perlebcdic>
  for more discussion of the issues.
  
+=head2 Locales
+
+Usually locale settings and Unicode do not affect each other, but
+there are a couple of exceptions:
+
+=over 4
+
+=item *
+
+You can enable automatic UTF-8-ification of your standard file
+handles, default C<open()> layer, and C<@ARGV> by using either
+the C<-C> command line switch or the C<PERL_UNICODE> environment
+variable; see L<perlrun> for the documentation of the C<-C> switch.
+
+=item *
+
+Perl tries really hard to work both with Unicode and the old
+byte-oriented world. Most often this is nice, but sometimes Perl's
+straddling of the proverbial fence causes problems.
+
+=back
+
+=head2 When Unicode Does Not Happen
+
+While Perl does have extensive ways to input and output in Unicode,
+and a few other 'entry points' like C<@ARGV> which can be interpreted
+as Unicode (UTF-8), there still are many places where Unicode (in some
+encoding or another) could be given as arguments or received as
+results, or both, but it is not.
+
+The following are such interfaces.  Also, see L</The "Unicode Bug">.
+For all of these interfaces Perl
+currently (as of 5.8.3) simply assumes byte strings both as arguments
+and results, or UTF-8 strings if the C<encoding> pragma has been used.
+
+One reason why Perl does not attempt to resolve the role of Unicode in
+these cases is that the answers are highly dependent on the operating
+system and the file system(s).  For example, whether filenames can be
+in Unicode, and in exactly what kind of encoding, is not exactly a
+portable concept.  Similarly for the qx and system: how well will the
+'command line interface' (and which of them?) handle Unicode?
+
+=over 4
+
+=item *
+
+chdir, chmod, chown, chroot, exec, link, lstat, mkdir,
+rename, rmdir, stat, symlink, truncate, unlink, utime, -X
+
+=item *
+
+%ENV
+
+=item *
+
+glob (aka the <*>)
+
+=item *
+
+open, opendir, sysopen
+
+=item *
+
+qx (aka the backtick operator), system
+
+=item *
+
+readdir, readlink
+
+=back
+
+=head2 The "Unicode Bug"
+
+The term "Unicode bug" has been applied to an inconsistency with the
+Unicode characters whose ordinals are in the Latin-1 Supplement block, that
+is, between 128 and 255.  Without a locale specified, unlike all other
+characters or code points, these characters have very different semantics in
+byte semantics versus character semantics.
+
+In character semantics they are interpreted as Unicode code points, which means
+they have the same semantics as Latin-1 (ISO-8859-1).
+
+In byte semantics, they are considered to be unassigned characters, meaning
+that the only semantics they have is their ordinal numbers, and that they are
+not members of various character classes.  None are considered to match C<\w>
+for example, but all match C<\W>.  (On EBCDIC platforms, the behavior may
+be different from this, depending on the underlying C language library
+functions.)
+
+The behavior is known to have effects on these areas:
+
+=over 4
+
+=item *
+
+Changing the case of a scalar, that is, using C<uc()>, C<ucfirst()>, C<lc()>,
+and C<lcfirst()>, or C<\L>, C<\U>, C<\u> and C<\l> in regular expression
+substitutions.
+
+=item *
+
+Using caseless (C</i>) regular expression matching
+
+=item *
+
+Matching a number of properties in regular expressions, namely C<\b>,
+C<\B>, C<\s>, C<\S>, C<\w>, C<\W>, and all the Posix character classes
+I<except> C<[[:ascii:]]>.
+
+=item *
+
+User-defined case change mappings.  You can create a C<ToUpper()> function, for
+example, which overrides Perl's built-in case mappings.  The scalar must be
+encoded in utf8 for your function to actually be invoked.
+
+=back
+
+This behavior can lead to unexpected results in which a string's semantics
+suddenly change if a code point above 255 is appended to or removed from it,
+which changes the string's semantics from byte to character or vice versa.  As
+an example, consider the following program and its output:
+
+ $ perl -le'
+     $s1 = "\xC2";
+     $s2 = "\x{2660}";
+     for ($s1, $s2, $s1.$s2) {
+         print /\w/ || 0;
+     }
+ '
+ 0
+ 0
+ 1
+
+If there's no C<\w> in C<$s1> or in C<$s2>, why does their concatenation have one?
+
+This anomaly stems from Perl's attempt to not disturb older programs that
+didn't use Unicode, and hence had no semantics for characters outside of the
+ASCII range (except in a locale), along with Perl's desire to add Unicode
+support seamlessly.  The result wasn't seamless: these characters were
+orphaned.
+
+Work is being done to correct this, but only some of it was complete in time
+for the 5.12 release.  What has been finished is the important part of the case
+changing component.  Due to concerns, and some evidence, that older code might
+have come to rely on the existing behavior, the new behavior must be explicitly
+enabled by the feature C<unicode_strings> in the L<feature> pragma, even though
+no new syntax is involved.
+
+See L<perlfunc/lc> for details on how this pragma works in combination with
+various others for casing.  Even though the pragma only affects casing
+operations in the 5.12 release, it is planned to have it affect all the
+problematic behaviors in later releases: you can't have one without them all.
+
+In the meantime, a workaround is to always call utf8::upgrade($string), or to
+use the standard module L<Encode>.   Also, a scalar that has any characters
+whose ordinal is 0x100 or above, or which were specified using either of the
+C<\N{...}> notations, will automatically have character semantics.
+
+=head2 Forcing Unicode in Perl (Or Unforcing Unicode in Perl)
+
+Sometimes (see L</"When Unicode Does Not Happen"> or L</The "Unicode Bug">)
+there are situations where you simply need to force a byte
+string into UTF-8, or vice versa.  The low-level calls
+utf8::upgrade($bytestring) and utf8::downgrade($utf8string[, FAIL_OK]) are
+the answers.
+
+Note that utf8::downgrade() can fail if the string contains characters
+that don't fit into a byte.
+
+Calling either function on a string that already is in the desired state is a
+no-op.
+
  =head2 Using Unicode in XS
  
-If you want to handle Perl Unicode in XS extensions, you may find
-the following C APIs useful (see perlapi for details):
+If you want to handle Perl Unicode in XS extensions, you may find the
+following C APIs useful.  See also L<perlguts/"Unicode Support"> for an
+explanation about Unicode at the XS level, and L<perlapi> for the API
+details.
  
  =over 4
  
  =item *
  
-DO_UTF8(sv) returns true if the UTF8 flag is on and the bytes pragma
-is not in effect.  SvUTF8(sv) returns true is the UTF8 flag is on, the
-bytes pragma is ignored.  The UTF8 flag being on does B<not> mean that
-there are any characters of code points greater than 255 (or 127) in
-the scalar, or that there even are any characters in the scalar.
-What the UTF8 flag means is that the sequence of octets in the
-representation of the scalar is the sequence of UTF-8 encoded
-code points of the characters of a string.  The UTF8 flag being
-off means that each octet in this representation encodes a single
-character with codepoint 0..255 within the string.  Perl's Unicode
-model is not to use UTF-8 until it's really necessary.
+C<DO_UTF8(sv)> returns true if the C<UTF8> flag is on and the bytes
+pragma is not in effect.  C<SvUTF8(sv)> returns true if the C<UTF8>
+flag is on; the bytes pragma is ignored.  The C<UTF8> flag being on
+does B<not> mean that there are any characters of code points greater
+than 255 (or 127) in the scalar or that there are even any characters
+in the scalar.  What the C<UTF8> flag means is that the sequence of
+octets in the representation of the scalar is the sequence of UTF-8
+encoded code points of the characters of a string.  The C<UTF8> flag
+being off means that each octet in this representation encodes a
+single character with code point 0..255 within the string.  Perl's
+Unicode model is not to use UTF-8 until it is absolutely necessary.
  
  =item *
  
-uvuni_to_utf8(buf, chr) writes a Unicode character code point into a
-buffer encoding the code point as UTF-8, and returns a pointer
-pointing after the UTF-8 bytes.
+C<uvchr_to_utf8(buf, chr)> writes a Unicode character code point into
+a buffer encoding the code point as UTF-8, and returns a pointer
+pointing after the UTF-8 bytes.  It works appropriately on EBCDIC machines.
  
  =item *
  
-utf8_to_uvuni(buf, lenp) reads UTF-8 encoded bytes from a buffer and
-returns the Unicode character code point (and optionally the length of
-the UTF-8 byte sequence).
+C<utf8_to_uvchr(buf, lenp)> reads UTF-8 encoded bytes from a buffer and
+returns the Unicode character code point and, optionally, the length of
+the UTF-8 byte sequence.  It works appropriately on EBCDIC machines.
  
  =item *
  
-utf8_length(start, end) returns the length of the UTF-8 encoded buffer
-in characters.  sv_len_utf8(sv) returns the length of the UTF-8 encoded
+C<utf8_length(start, end)> returns the length of the UTF-8 encoded buffer
+in characters.  C<sv_len_utf8(sv)> returns the length of the UTF-8 encoded
  scalar.
  
  =item *
  
-sv_utf8_upgrade(sv) converts the string of the scalar to its UTF-8
-encoded form.  sv_utf8_downgrade(sv) does the opposite (if possible).
-sv_utf8_encode(sv) is like sv_utf8_upgrade but the UTF8 flag does not
-get turned on.  sv_utf8_decode() does the opposite of sv_utf8_encode().
-Note that none of these are to be used as general purpose encoding/decoding
-interfaces: use Encode for that.  sv_utf8_upgrade() is affected by the
-encoding pragma, but sv_utf8_downgrade() is not (since the encoding
-pragma is designed to be a one-way street).
+C<sv_utf8_upgrade(sv)> converts the string of the scalar to its UTF-8
+encoded form.  C<sv_utf8_downgrade(sv)> does the opposite, if
+possible.  C<sv_utf8_encode(sv)> is like C<sv_utf8_upgrade()> except that
+it does not set the C<UTF8> flag.  C<sv_utf8_decode()> does the
+opposite of C<sv_utf8_encode()>.  Note that none of these are to be
+used as general-purpose encoding or decoding interfaces: C<use Encode>
+for that.  C<sv_utf8_upgrade()> is affected by the encoding pragma
+but C<sv_utf8_downgrade()> is not (since the encoding pragma is
+designed to be a one-way street).
  
  =item *
  
-is_utf8_char(s) returns true if the pointer points to a valid UTF-8
+C<is_utf8_char(s)> returns true if the pointer points to a valid UTF-8
  character.
  
  =item *
  
-is_utf8_string(buf, len) returns true if the len bytes of the buffer
+C<is_utf8_string(buf, len)> returns true if C<len> bytes of the buffer
  are valid UTF-8.
  
  =item *
  
-UTF8SKIP(buf) will return the number of bytes in the UTF-8 encoded
-character in the buffer.  UNISKIP(chr) will return the number of bytes
-required to UTF-8-encode the Unicode character code point.  UTF8SKIP()
+C<UTF8SKIP(buf)> will return the number of bytes in the UTF-8 encoded
+character in the buffer.  C<UNISKIP(chr)> will return the number of bytes
+required to UTF-8-encode the Unicode character code point.  C<UTF8SKIP()>
  is useful for example for iterating over the characters of a UTF-8
-encoded buffer; UNISKIP() is useful for example in computing
+encoded buffer; C<UNISKIP()> is useful, for example, in computing
  the size required for a UTF-8 encoded buffer.
  
  =item *
  
-utf8_distance(a, b) will tell the distance in characters between the
+C<utf8_distance(a, b)> will tell the distance in characters between the
  two pointers pointing to the same UTF-8 encoded buffer.
  
  =item *
  
-utf8_hop(s, off) will return a pointer to an UTF-8 encoded buffer that
-is C<off> (positive or negative) Unicode characters displaced from the
-UTF-8 buffer C<s>.  Be careful not to overstep the buffer: utf8_hop()
-will merrily run off the end or the beginning if told to do so.
+C<utf8_hop(s, off)> will return a pointer to a UTF-8 encoded buffer
+that is C<off> (positive or negative) Unicode characters displaced
+from the UTF-8 buffer C<s>.  Be careful not to overstep the buffer:
+C<utf8_hop()> will merrily run off the end or the beginning of the
+buffer if told to do so.
  
  =item *
  
-pv_uni_display(dsv, spv, len, pvlim, flags) and sv_uni_display(dsv,
-ssv, pvlim, flags) are useful for debug output of Unicode strings and
-scalars.  By default they are useful only for debug: they display
-B<all> characters as hexadecimal code points, but with the flags
-UNI_DISPLAY_ISPRINT and UNI_DISPLAY_BACKSLASH you can make the output
-more readable.
+C<pv_uni_display(dsv, spv, len, pvlim, flags)> and
+C<sv_uni_display(dsv, ssv, pvlim, flags)> are useful for debugging the
+output of Unicode strings and scalars.  By default they are useful
+only for debugging--they display B<all> characters as hexadecimal code
+points--but with the flags C<UNI_DISPLAY_ISPRINT>,
+C<UNI_DISPLAY_BACKSLASH>, and C<UNI_DISPLAY_QQ> you can make the
+output more readable.
  
  =item *
  
-ibcmp_utf8(s1, pe1, u1, l1, u1, s2, pe2, l2, u2) can be used to
-compare two strings case-insensitively in Unicode.
-(For case-sensitive comparisons you can just use memEQ() and memNE()
-as usual.)
+C<ibcmp_utf8(s1, pe1, l1, u1, s2, pe2, l2, u2)> can be used to
+compare two strings case-insensitively in Unicode.  For case-sensitive
+comparisons you can just use C<memEQ()> and C<memNE()> as usual.
  
  =back
  
  For more information, see L<perlapi>, and F<utf8.c> and F<utf8.h>
  in the Perl source code distribution.
  
+=head2 Hacking Perl to work on earlier Unicode versions (for very serious hackers only)
+
+Perl by default comes with the latest supported Unicode version built in, but
+you can change to use any earlier one.
+
+Download the files in the version of Unicode that you want from the Unicode web
+site L<http://www.unicode.org>.  These should replace the existing files in
+C<\$Config{privlib}>/F<unicore>.  (C<\%Config> is available from the Config
+module.)  Follow the instructions in F<README.perl> in that directory to change
+some of their names, and then run F<make>.
+
+It is even possible to download them to a different directory, and then change
+F<utf8_heavy.pl> in the directory C<\$Config{privlib}> to point to the new
+directory, or maybe make a copy of that directory before making the change, and
+using C<@INC> or the C<-I> run-time flag to switch between versions at will
+(but because of caching, not in the middle of a process), but all this is
+beyond the scope of these instructions.
+
  =head1 BUGS
  
-Use of locales with Unicode data may lead to odd results.  Currently
-there is some attempt to apply 8-bit locale info to characters in the
-range 0..255, but this is demonstrably incorrect for locales that use
-characters above that range when mapped into Unicode.  It will also
-tend to run slower.  Avoidance of locales is strongly encouraged.
+=head2 Interaction with Locales
+
+Use of locales with Unicode data may lead to odd results.  Currently,
+Perl attempts to attach 8-bit locale info to characters in the range
+0..255, but this technique is demonstrably incorrect for locales that
+use characters above that range when mapped into Unicode.  Perl's
+Unicode support will also tend to run slower.  Use of locales with
+Unicode is discouraged.
+
+=head2 Problems with characters in the Latin-1 Supplement range
+
+See L</The "Unicode Bug">
+
+=head2 Problems with case-insensitive regular expression matching
+
+There are problems with case-insensitive matches, including those involving
+character classes (enclosed in [square brackets]), characters whose fold
+is to multiple characters (for example, the single character LATIN SMALL
+LIGATURE FFL matches case-insensitively with the 3-character string C<ffl>), and
+characters in the Latin-1 Supplement.
+
+=head2 Interaction with Extensions
+
+When Perl exchanges data with an extension, the extension should be
+able to understand the UTF8 flag and act accordingly. If the
+extension doesn't know about the flag, it's likely that the extension
+will return incorrectly-flagged data.
+
+So if you're working with Unicode data, consult the documentation of
+every module you're using to see if there are any issues with Unicode data
+exchange. If the documentation does not talk about Unicode at all,
+suspect the worst and probably look at the source to learn how the
+module is implemented. Modules written completely in Perl shouldn't
+cause problems. Modules that directly or indirectly access code written
+in other programming languages are at risk.
+
+For affected functions, the simple strategy to avoid data corruption is
+to always make the encoding of the exchanged data explicit. Choose an
+encoding that you know the extension can handle. Convert arguments passed
+to the extensions to that encoding and convert results back from that
+encoding. Write wrapper functions that do the conversions for you, so
+you can later change the functions when the extension catches up.
+
+To provide an example, let's say the popular Foo::Bar::escape_html
+function doesn't deal with Unicode data yet. The wrapper function
+would convert the argument to raw UTF-8 and convert the result back to
+Perl's internal representation like so:
+
+    sub my_escape_html ($) {
+        my($what) = shift;
+        return unless defined $what;
+        Encode::decode_utf8(Foo::Bar::escape_html(
+                                         Encode::encode_utf8($what)));
+    }
+
+Sometimes, when the extension does not convert data but just stores
+and retrieves them, you will be in a position to use the otherwise
+dangerous Encode::_utf8_on() function. Let's say the popular
+C<Foo::Bar> extension, written in C, provides a C<param> method that
+lets you store and retrieve data according to these prototypes:
+
+    $self->param($name, $value);            # set a scalar
+    $value = $self->param($name);           # retrieve a scalar
+
+If it does not yet provide support for any encoding, one could write a
+derived class with such a C<param> method:
+
+    sub param {
+      my($self,$name,$value) = @_;
+      utf8::upgrade($name);     # make sure it is UTF-8 encoded
+      if (defined $value) {
+        utf8::upgrade($value);  # make sure it is UTF-8 encoded
+        return $self->SUPER::param($name,$value);
+      } else {
+        my $ret = $self->SUPER::param($name);
+        Encode::_utf8_on($ret); # we know, it is UTF-8 encoded
+        return $ret;
+      }
+    }
+
+Some extensions provide filters on data entry/exit points, such as
+DB_File::filter_store_key and family. Look out for such filters in
+the documentation of your extensions; they can make the transition to
+Unicode data much easier.
+
+=head2 Speed
  
  Some functions are slower when working on UTF-8 encoded strings than
-on byte encoded strings. All functions that need to hop over
-characters such as length(), substr() or index() can work B<much>
-faster when the underlying data are byte-encoded. Witness the
-following benchmark:
-  
-  % perl -e '
-  use Benchmark;
-  use strict;
-  our $l = 10000;
-  our $u = our $b = "x" x $l;
-  substr($u,0,1) = "\x{100}";
-  timethese(-2,{
-  LENGTH_B => q{ length($b) },
-  LENGTH_U => q{ length($u) },
-  SUBSTR_B => q{ substr($b, $l/4, $l/2) },
-  SUBSTR_U => q{ substr($u, $l/4, $l/2) },
-  });
-  '
-  Benchmark: running LENGTH_B, LENGTH_U, SUBSTR_B, SUBSTR_U for at least 2 CPU seconds...
-    LENGTH_B:  2 wallclock secs ( 2.36 usr +  0.00 sys =  2.36 CPU) @ 5649983.05/s (n=13333960)
-    LENGTH_U:  2 wallclock secs ( 2.11 usr +  0.00 sys =  2.11 CPU) @ 12155.45/s (n=25648)
-    SUBSTR_B:  3 wallclock secs ( 2.16 usr +  0.00 sys =  2.16 CPU) @ 374480.09/s (n=808877)
-    SUBSTR_U:  2 wallclock secs ( 2.11 usr +  0.00 sys =  2.11 CPU) @ 6791.00/s (n=14329)
-  
-The numbers show an incredible slowness on long UTF-8 strings and you
-should carefully avoid to use these functions within tight loops. For
-example if you want to iterate over characters, it is infinitely
-better to split into an array than to use substr, as the following
-benchmark shows:
-
-  % perl -e '
-  use Benchmark;
-  use strict;
-  our $l = 10000;
-  our $u = our $b = "x" x $l;
-  substr($u,0,1) = "\x{100}";
-  timethese(-5,{
-  SPLIT_B => q{ for my $c (split //, $b){}  },
-  SPLIT_U => q{ for my $c (split //, $u){}  },
-  SUBSTR_B => q{ for my $i (0..length($b)-1){my $c = substr($b,$i,1);} },
-  SUBSTR_U => q{ for my $i (0..length($u)-1){my $c = substr($u,$i,1);} },
-  });
-  '
-  Benchmark: running SPLIT_B, SPLIT_U, SUBSTR_B, SUBSTR_U for at least 5 CPU seconds...
-     SPLIT_B:  6 wallclock secs ( 5.29 usr +  0.00 sys =  5.29 CPU) @ 56.14/s (n=297)
-     SPLIT_U:  5 wallclock secs ( 5.17 usr +  0.01 sys =  5.18 CPU) @ 55.21/s (n=286)
-    SUBSTR_B:  5 wallclock secs ( 5.34 usr +  0.00 sys =  5.34 CPU) @ 123.22/s (n=658)
-    SUBSTR_U:  7 wallclock secs ( 6.20 usr +  0.00 sys =  6.20 CPU) @  0.81/s (n=5)
-
-You see, the algorithm based on substr() was faster with byte encoded
-data but it is pathologically slow with UTF-8 data.
-  
+on byte encoded strings.  All operations that need to hop over
+characters, such as length(), substr(), index(), or matching regular
+expressions, can work B<much> faster when the underlying data are
+byte-encoded.
+
+In Perl 5.8.0 the slowness was often quite spectacular; in Perl 5.8.1
+a caching scheme was introduced which will hopefully make the slowness
+somewhat less spectacular, at least for some operations.  In general,
+operations with UTF-8 encoded strings are still slower. As an example,
+the Unicode properties (character classes) like C<\p{Nd}> are known to
+be quite a bit slower (5-20 times) than their simpler counterparts
+like C<\d> (then again, there are 268 Unicode characters matching C<Nd>
+compared with the 10 ASCII characters matching C<\d>).
+
+=head2 Problems on EBCDIC platforms
+
+There are a number of known problems with Perl on EBCDIC platforms.  If you
+want to use Perl there, send email to perlbug@perl.org.
+
+In earlier versions, when byte and character data were concatenated,
+the new string was sometimes created by
+decoding the byte strings as I<ISO 8859-1 (Latin-1)>, even if the
+old Unicode string used EBCDIC.
+
+If you find any of these, please report them as bugs.
+
+=head2 Porting code from perl-5.6.X
+
+Perl 5.8 has a different Unicode model from 5.6. In 5.6 the programmer
+was required to use the C<utf8> pragma to declare that a given scope
+expected to deal with Unicode data and had to make sure that only
+Unicode data were reaching that scope. If you have code that is
+working with 5.6, you will need some of the following adjustments to
+your code. The examples are written such that the code will continue
+to work under 5.6, so you should be safe to try them out.
+
+=over 4
+
+=item *
+
+A filehandle that should read or write UTF-8
+
+  if ($] > 5.007) {
+    binmode $fh, ":encoding(utf8)";
+  }
+
+=item *
+
+A scalar that is going to be passed to some extension
+
+Be it Compress::Zlib, Apache::Request or any extension that has no
+mention of Unicode in the manpage, you need to make sure that the
+UTF8 flag is stripped off. Note that at the time of this writing
+(October 2002) the mentioned modules are not UTF-8-aware. Please
+check the documentation to verify if this is still true.
+
+  if ($] > 5.007) {
+    require Encode;
+    $val = Encode::encode_utf8($val); # make octets
+  }
+
+=item *
+
+A scalar we got back from an extension
+
+If you believe the scalar comes back as UTF-8, you will most likely
+want the UTF8 flag restored:
+
+  if ($] > 5.007) {
+    require Encode;
+    $val = Encode::decode_utf8($val);
+  }
+
+=item *
+
+Same thing, if you are really sure it is UTF-8
+
+  if ($] > 5.007) {
+    require Encode;
+    Encode::_utf8_on($val);
+  }
+
+=item *
+
+A wrapper for fetchrow_array and fetchrow_hashref
+
+When the database contains only UTF-8, a wrapper function or method is
+a convenient way to replace all your fetchrow_array and
+fetchrow_hashref calls. A wrapper function will also make it easier to
+adapt to future enhancements in your database driver. Note that at the
+time of this writing (October 2002), the DBI has no standardized way
+to deal with UTF-8 data. Please check the documentation to verify if
+that is still true.
+
+  sub fetchrow {
+    # $what is one of fetchrow_{array,hashref}
+    my($self, $sth, $what) = @_;
+    if ($] < 5.007) {
+      return $sth->$what;
+    } else {
+      require Encode;
+      if (wantarray) {
+        my @arr = $sth->$what;
+        for (@arr) {
+          defined && /[^\000-\177]/ && Encode::_utf8_on($_);
+        }
+        return @arr;
+      } else {
+        my $ret = $sth->$what;
+        if (ref $ret) {
+          for my $k (keys %$ret) {
+            defined
+            && /[^\000-\177]/
+            && Encode::_utf8_on($_) for $ret->{$k};
+          }
+          return $ret;
+        } else {
+          defined && /[^\000-\177]/ && Encode::_utf8_on($_) for $ret;
+          return $ret;
+        }
+      }
+    }
+  }
+
+
+=item *
+
+A large scalar that you know can only contain ASCII
+
+Scalars that contain only ASCII and are marked as UTF-8 are sometimes
+a drag on your program. If you recognize such a situation, just remove
+the UTF8 flag:
+
+  utf8::downgrade($val) if $] > 5.007;
+
+=back
+
  =head1 SEE ALSO
  
-L<perluniintro>, L<encoding>, L<Encode>, L<open>, L<utf8>, L<bytes>,
-L<perlretut>, L<perlvar/"${^WIDE_SYSTEM_CALLS}">
+L<perlunitut>, L<perluniintro>, L<perluniprops>, L<Encode>, L<open>, L<utf8>, L<bytes>,
+L<perlretut>, L<perlvar/"${^UNICODE}">,
+L<http://www.unicode.org/reports/tr44>.
  
  =cut