s.

[perl5.git] / pod / perlunicode.pod
diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod

index 7fb473e..af79344 100644 (file)
--- a/pod/perlunicode.pod
+++ b/pod/perlunicode.pod
@@ -113,8 +113,8 @@ Character semantics have the following effects:
  
  =item *
  
  
  =item *
  
-Strings and patterns may contain characters that have an ordinal value
-larger than 255.
+Strings (including hash keys) and regular expression patterns may
+contain characters that have an ordinal value larger than 255.
  
  If you use a Unicode editor to edit your program, Unicode characters
  may occur directly within the literal strings in one of the various
  
  If you use a Unicode editor to edit your program, Unicode characters
  may occur directly within the literal strings in one of the various
@@ -128,18 +128,20 @@ hexadecimal, into the curlies. For instance, a smiley face is C<\x{263A}>.
  This works only for characters with a code 0x100 and above.
  
  Additionally, if you
  This works only for characters with a code 0x100 and above.
  
  Additionally, if you
+
     use charnames ':full';
     use charnames ':full';
+
  you can use the C<\N{...}> notation, putting the official Unicode character
  name within the curlies. For example, C<\N{WHITE SMILING FACE}>.
  This works for all characters that have names.
  
  =item *
  
  you can use the C<\N{...}> notation, putting the official Unicode character
  name within the curlies. For example, C<\N{WHITE SMILING FACE}>.
  This works for all characters that have names.
  
  =item *
  
-If an appropriate L<encoding> is specified,
-identifiers within the Perl script may contain Unicode alphanumeric
-characters, including ideographs.  (You are currently on your own when
-it comes to using the canonical forms of characters--Perl doesn't
-(yet) attempt to canonicalize variable names for you.)
+If an appropriate L<encoding> is specified, identifiers within the
+Perl script may contain Unicode alphanumeric characters, including
+ideographs.  (You are currently on your own when it comes to using the
+canonical forms of characters--Perl doesn't (yet) attempt to
+canonicalize variable names for you.)
  
  =item *
  
  
  =item *
  
@@ -272,12 +274,13 @@ written right to left.
  =head2 Scripts
  
  The scripts available via C<\p{...}> and C<\P{...}>, for example
  =head2 Scripts
  
  The scripts available via C<\p{...}> and C<\P{...}>, for example
-C<\p{Latin}> or \p{Cyrillic>, are as follows:
+C<\p{Latin}> or C<\p{Cyrillic}>, are as follows:
  
      Arabic
      Armenian
      Bengali
      Bopomofo
  
      Arabic
      Armenian
      Bengali
      Bopomofo
+    Buhid
      CanadianAboriginal
      Cherokee
      Cyrillic
      CanadianAboriginal
      Cherokee
      Cyrillic
@@ -291,6 +294,7 @@ C<\p{Latin}> or \p{Cyrillic>, are as follows:
      Gurmukhi
      Han
      Hangul
      Gurmukhi
      Han
      Hangul
+    Hanunoo
      Hebrew
      Hiragana
      Inherited
      Hebrew
      Hiragana
      Inherited
@@ -308,6 +312,8 @@ C<\p{Latin}> or \p{Cyrillic>, are as follows:
      Runic
      Sinhala
      Syriac
      Runic
      Sinhala
      Syriac
+    Tagalog
+    Tagbanwa
      Tamil
      Telugu
      Thaana
      Tamil
      Telugu
      Thaana
@@ -318,21 +324,32 @@ C<\p{Latin}> or \p{Cyrillic>, are as follows:
  There are also extended property classes that supplement the basic
  properties, defined by the F<PropList> Unicode database:
  
  There are also extended property classes that supplement the basic
  properties, defined by the F<PropList> Unicode database:
  
-    ASCII_Hex_Digit
+    ASCIIHexDigit
      BidiControl
      Dash
      BidiControl
      Dash
+    Deprecated
      Diacritic
      Extender
      Diacritic
      Extender
+    GraphemeLink
      HexDigit
      Hyphen
      Ideographic
      HexDigit
      Hyphen
      Ideographic
+    IDSBinaryOperator
+    IDSTrinaryOperator
      JoinControl
      JoinControl
+    LogicalOrderException
      NoncharacterCodePoint
      OtherAlphabetic
      NoncharacterCodePoint
      OtherAlphabetic
+    OtherDefaultIgnorableCodePoint
+    OtherGraphemeExtend
      OtherLowercase
      OtherMath
      OtherUppercase
      QuotationMark
      OtherLowercase
      OtherMath
      OtherUppercase
      QuotationMark
+    Radical
+    SoftDotted
+    TerminalPunctuation
+    UnifiedIdeograph
      WhiteSpace
  
  and further derived properties:
      WhiteSpace
  
  and further derived properties:
@@ -346,12 +363,12 @@ and further derived properties:
      ID_Continue     ID_Start + Mn + Mc + Nd + Pc
  
      Any             Any character
      ID_Continue     ID_Start + Mn + Mc + Nd + Pc
  
      Any             Any character
-    Assigned        Any non-Cn character (i.e. synonym for C<\P{Cn}>)
-    Unassigned      Synonym for C<\p{Cn}>
+    Assigned        Any non-Cn character (i.e. synonym for \P{Cn})
+    Unassigned      Synonym for \p{Cn}
      Common          Any character (or unassigned code point)
                      not explicitly assigned to a script
  
      Common          Any character (or unassigned code point)
                      not explicitly assigned to a script
  
-For backward compatability, all properties mentioned so far may have C<Is>
+For backward compatibility, all properties mentioned so far may have C<Is>
  prepended to their name (e.g. C<\P{IsLu}> is equal to C<\P{Lu}>).
  
  =head2 Blocks
  prepended to their name (e.g. C<\P{IsLu}> is equal to C<\P{Lu}>).
  
  =head2 Blocks
@@ -376,114 +393,128 @@ For more about blocks, see:
  
  Block names are given with the C<In> prefix. For example, the
  Katakana block is referenced via C<\p{InKatakana}>. The C<In>
  
  Block names are given with the C<In> prefix. For example, the
  Katakana block is referenced via C<\p{InKatakana}>. The C<In>
-prefix may be omitted if there is no nameing conflict with a script
+prefix may be omitted if there is no naming conflict with a script
  or any other property, but it is recommended that C<In> always be used
  to avoid confusion.
  
  These block names are supported:
  
  or any other property, but it is recommended that C<In> always be used
  to avoid confusion.
  
  These block names are supported:
  
-   InAlphabeticPresentationForms
-   InArabicBlock
-   InArabicPresentationFormsA
-   InArabicPresentationFormsB
-   InArmenianBlock
-   InArrows
-   InBasicLatin
-   InBengaliBlock
-   InBlockElements
-   InBopomofoBlock
-   InBopomofoExtended
-   InBoxDrawing
-   InBraillePatterns
-   InByzantineMusicalSymbols
-   InCJKCompatibility
-   InCJKCompatibilityForms
-   InCJKCompatibilityIdeographs
-   InCJKCompatibilityIdeographsSupplement
-   InCJKRadicalsSupplement
-   InCJKSymbolsAndPunctuation
-   InCJKUnifiedIdeographs
-   InCJKUnifiedIdeographsExtensionA
-   InCJKUnifiedIdeographsExtensionB
-   InCherokeeBlock
-   InCombiningDiacriticalMarks
-   InCombiningHalfMarks
-   InCombiningMarksForSymbols
-   InControlPictures
-   InCurrencySymbols
-   InCyrillicBlock
-   InDeseretBlock
-   InDevanagariBlock
-   InDingbats
-   InEnclosedAlphanumerics
-   InEnclosedCJKLettersAndMonths
-   InEthiopicBlock
-   InGeneralPunctuation
-   InGeometricShapes
-   InGeorgianBlock
-   InGothicBlock
-   InGreekBlock
-   InGreekExtended
-   InGujaratiBlock
-   InGurmukhiBlock
-   InHalfwidthAndFullwidthForms
-   InHangulCompatibilityJamo
-   InHangulJamo
-   InHangulSyllables
-   InHebrewBlock
-   InHighPrivateUseSurrogates
-   InHighSurrogates
-   InHiraganaBlock
-   InIPAExtensions
-   InIdeographicDescriptionCharacters
-   InKanbun
-   InKangxiRadicals
-   InKannadaBlock
-   InKatakanaBlock
-   InKhmerBlock
-   InLaoBlock
-   InLatin1Supplement
-   InLatinExtendedAdditional
-   InLatinExtended-A
-   InLatinExtended-B
-   InLetterlikeSymbols
-   InLowSurrogates
-   InMalayalamBlock
-   InMathematicalAlphanumericSymbols
-   InMathematicalOperators
-   InMiscellaneousSymbols
-   InMiscellaneousTechnical
-   InMongolianBlock
-   InMusicalSymbols
-   InMyanmarBlock
-   InNumberForms
-   InOghamBlock
-   InOldItalicBlock
-   InOpticalCharacterRecognition
-   InOriyaBlock
-   InPrivateUse
-   InRunicBlock
-   InSinhalaBlock
-   InSmallFormVariants
-   InSpacingModifierLetters
-   InSpecials
-   InSuperscriptsAndSubscripts
-   InSyriacBlock
-   InTags
-   InTamilBlock
-   InTeluguBlock
-   InThaanaBlock
-   InThaiBlock
-   InTibetanBlock
-   InUnifiedCanadianAboriginalSyllabics
-   InYiRadicals
-   InYiSyllables
+    InAlphabeticPresentationForms
+    InArabic
+    InArabicPresentationFormsA
+    InArabicPresentationFormsB
+    InArmenian
+    InArrows
+    InBasicLatin
+    InBengali
+    InBlockElements
+    InBopomofo
+    InBopomofoExtended
+    InBoxDrawing
+    InBraillePatterns
+    InBuhid
+    InByzantineMusicalSymbols
+    InCJKCompatibility
+    InCJKCompatibilityForms
+    InCJKCompatibilityIdeographs
+    InCJKCompatibilityIdeographsSupplement
+    InCJKRadicalsSupplement
+    InCJKSymbolsAndPunctuation
+    InCJKUnifiedIdeographs
+    InCJKUnifiedIdeographsExtensionA
+    InCJKUnifiedIdeographsExtensionB
+    InCherokee
+    InCombiningDiacriticalMarks
+    InCombiningDiacriticalMarksforSymbols
+    InCombiningHalfMarks
+    InControlPictures
+    InCurrencySymbols
+    InCyrillic
+    InCyrillicSupplementary
+    InDeseret
+    InDevanagari
+    InDingbats
+    InEnclosedAlphanumerics
+    InEnclosedCJKLettersAndMonths
+    InEthiopic
+    InGeneralPunctuation
+    InGeometricShapes
+    InGeorgian
+    InGothic
+    InGreekExtended
+    InGreekAndCoptic
+    InGujarati
+    InGurmukhi
+    InHalfwidthAndFullwidthForms
+    InHangulCompatibilityJamo
+    InHangulJamo
+    InHangulSyllables
+    InHanunoo
+    InHebrew
+    InHighPrivateUseSurrogates
+    InHighSurrogates
+    InHiragana
+    InIPAExtensions
+    InIdeographicDescriptionCharacters
+    InKanbun
+    InKangxiRadicals
+    InKannada
+    InKatakana
+    InKatakanaPhoneticExtensions
+    InKhmer
+    InLao
+    InLatin1Supplement
+    InLatinExtendedA
+    InLatinExtendedAdditional
+    InLatinExtendedB
+    InLetterlikeSymbols
+    InLowSurrogates
+    InMalayalam
+    InMathematicalAlphanumericSymbols
+    InMathematicalOperators
+    InMiscellaneousMathematicalSymbolsA
+    InMiscellaneousMathematicalSymbolsB
+    InMiscellaneousSymbols
+    InMiscellaneousTechnical
+    InMongolian
+    InMusicalSymbols
+    InMyanmar
+    InNumberForms
+    InOgham
+    InOldItalic
+    InOpticalCharacterRecognition
+    InOriya
+    InPrivateUseArea
+    InRunic
+    InSinhala
+    InSmallFormVariants
+    InSpacingModifierLetters
+    InSpecials
+    InSuperscriptsAndSubscripts
+    InSupplementalArrowsA
+    InSupplementalArrowsB
+    InSupplementalMathematicalOperators
+    InSupplementaryPrivateUseAreaA
+    InSupplementaryPrivateUseAreaB
+    InSyriac
+    InTagalog
+    InTagbanwa
+    InTags
+    InTamil
+    InTelugu
+    InThaana
+    InThai
+    InTibetan
+    InUnifiedCanadianAboriginalSyllabics
+    InVariationSelectors
+    InYiRadicals
+    InYiSyllables
  
  =over 4
  
  =item *
  
  
  =over 4
  
  =item *
  
-The special pattern C<\X> match matches any extended Unicode sequence
+The special pattern C<\X> matches any extended Unicode sequence
  (a "combining character sequence" in Standardese), where the first
  character is a base character and subsequent characters are mark
  characters that apply to the base character.  It is equivalent to
  (a "combining character sequence" in Standardese), where the first
  character is a base character and subsequent characters are mark
  characters that apply to the base character.  It is equivalent to
@@ -588,18 +619,7 @@ And finally, C<scalar reverse()> reverses by character rather than by byte.
  
  See L<Encode>.
  
  
  See L<Encode>.
  
-=head1 CAVEATS
-
-Whether an arbitrary piece of data will be treated as "characters" or
-"bytes" by internal operations cannot be divined at the current time.
-
-Use of locales with Unicode data may lead to odd results.  Currently
-there is some attempt to apply 8-bit locale info to characters in the
-range 0..255, but this is demonstrably incorrect for locales that use
-characters above that range when mapped into Unicode.  It will also
-tend to run slower.  Avoidance of locales is strongly encouraged.
-
-=head1 UNICODE REGULAR EXPRESSION SUPPORT LEVEL
+=head2 Unicode Regular Expression Support Level
  
  The following list of Unicode regular expression support describes
  feature by feature the Unicode support implemented in Perl as of Perl
  
  The following list of Unicode regular expression support describes
  feature by feature the Unicode support implemented in Perl as of Perl
@@ -645,8 +665,8 @@ For example, what TR18 might write as
  
  in Perl can be written as:
  
  
  in Perl can be written as:
  
-    (?!\p{Unassigned})\p{InGreek}
-    (?=\p{Assigned})\p{InGreek}
+    (?!\p{Unassigned})\p{InGreekAndCoptic}
+    (?=\p{Assigned})\p{InGreekAndCoptic}
  
  But in this particular example, you probably really want
  
  
  But in this particular example, you probably really want
  
@@ -692,7 +712,7 @@ numbers.  To use these numbers various encodings are needed.
  
  =over 4
  
  
  =over 4
  
-=item
+=item *
  
  UTF-8
  
  
  UTF-8
  
@@ -700,18 +720,28 @@ UTF-8 is a variable-length (1 to 6 bytes, current character allocations
  require 4 bytes), byteorder independent encoding. For ASCII, UTF-8 is
  transparent (and we really do mean 7-bit ASCII, not another 8-bit encoding).
  
  require 4 bytes), byteorder independent encoding. For ASCII, UTF-8 is
  transparent (and we really do mean 7-bit ASCII, not another 8-bit encoding).
  
-The following table is from Unicode 3.1.
+The following table is from Unicode 3.2.
  
   Code Points            1st Byte  2nd Byte  3rd Byte  4th Byte
  
  
   Code Points            1st Byte  2nd Byte  3rd Byte  4th Byte
  
-   U+0000..U+007F       00..7F   
-   U+0080..U+07FF       C2..DF    80..BF   
+   U+0000..U+007F       00..7F
+   U+0080..U+07FF       C2..DF    80..BF
     U+0800..U+0FFF       E0        A0..BF    80..BF  
     U+0800..U+0FFF       E0        A0..BF    80..BF  
-   U+1000..U+FFFF       E1..EF    80..BF    80..BF  
+   U+1000..U+CFFF       E1..EC    80..BF    80..BF  
+   U+D000..U+D7FF       ED        80..9F    80..BF  
+   U+D800..U+DFFF       ******* ill-formed *******
+   U+E000..U+FFFF       EE..EF    80..BF    80..BF  
    U+10000..U+3FFFF      F0        90..BF    80..BF    80..BF
    U+40000..U+FFFFF      F1..F3    80..BF    80..BF    80..BF
   U+100000..U+10FFFF     F4        80..8F    80..BF    80..BF
  
    U+10000..U+3FFFF      F0        90..BF    80..BF    80..BF
    U+40000..U+FFFFF      F1..F3    80..BF    80..BF    80..BF
   U+100000..U+10FFFF     F4        80..8F    80..BF    80..BF
  
+Note the A0..BF in U+0800..U+0FFF, the 80..9F in U+D000..U+D7FF,
+the 90..BF in U+10000..U+3FFFF, and the 80..8F in U+100000..U+10FFFF.
+The "gaps" are caused by legal UTF-8 avoiding non-shortest encodings:
+it is technically possible to UTF-8-encode a single code point in different
+ways, but that is explicitly forbidden, and the shortest possible encoding
+should always be used (and that is what Perl does).
+
  Or, another way to look at it, as bits:
  
   Code Points                    1st Byte   2nd Byte  3rd Byte  4th Byte
  Or, another way to look at it, as bits:
  
   Code Points                    1st Byte   2nd Byte  3rd Byte  4th Byte
@@ -722,16 +752,16 @@ Or, another way to look at it, as bits:
    00000dddccccccbbbbbbaaaaaa     11110ddd  10cccccc  10bbbbbb  10aaaaaa
  
  As you can see, the continuation bytes all begin with C<10>, and the
    00000dddccccccbbbbbbaaaaaa     11110ddd  10cccccc  10bbbbbb  10aaaaaa
  
  As you can see, the continuation bytes all begin with C<10>, and the
-leading bits of the start byte tells how many bytes the are in the
+leading bits of the start byte tell how many bytes there are in the
  encoded character.
  
  encoded character.
  
-=item
+=item *
  
  UTF-EBCDIC
  
  Like UTF-8, but EBCDIC-safe, as UTF-8 is ASCII-safe.
  
  
  UTF-EBCDIC
  
  Like UTF-8, but EBCDIC-safe, as UTF-8 is ASCII-safe.
  
-=item
+=item *
  
  UTF-16, UTF-16BE, UTF-16LE, Surrogates, and BOMs (Byte Order Marks)
  
  
  UTF-16, UTF-16BE, UTF-16LE, Surrogates, and BOMs (Byte Order Marks)
  
@@ -754,7 +784,7 @@ are the range 0xDC00..0xDFFFF.  The surrogate encoding is
  
  and the decoding is
  
  
  and the decoding is
  
-       $uni = 0x10000 + ($hi - 0xD8000) * 0x400 + ($lo - 0xDC00);
+       $uni = 0x10000 + ($hi - 0xD800) * 0x400 + ($lo - 0xDC00);
  
  If you try to generate surrogates (for example by using chr()), you
  will get a warning if warnings are turned on (C<-w> or C<use
  
  If you try to generate surrogates (for example by using chr()), you
  will get a warning if warnings are turned on (C<-w> or C<use
@@ -784,7 +814,7 @@ sequence of bytes 0xFF 0xFE is unambiguously "BOM, represented in
  little-endian format" and cannot be "0xFFFE, represented in big-endian
  format".
  
  little-endian format" and cannot be "0xFFFE, represented in big-endian
  format".
  
-=item
+=item *
  
  UTF-32, UTF-32BE, UTF-32LE
  
  
  UTF-32, UTF-32BE, UTF-32LE
  
@@ -793,7 +823,7 @@ the units are 32-bit, and therefore the surrogate scheme is not
  needed.  The BOM signatures will be 0x00 0x00 0xFE 0xFF for BE and
  0xFF 0xFE 0x00 0x00 for LE.
  
  needed.  The BOM signatures will be 0x00 0x00 0xFE 0xFF for BE and
  0xFF 0xFE 0x00 0x00 for LE.
  
-=item
+=item *
  
  UCS-2, UCS-4
  
  
  UCS-2, UCS-4
  
@@ -801,7 +831,7 @@ Encodings defined by the ISO 10646 standard.  UCS-2 is a 16-bit
  encoding, UCS-4 is a 32-bit encoding.  Unlike UTF-16, UCS-2
  is not extensible beyond 0xFFFF, because it does not use surrogates.
  
  encoding, UCS-4 is a 32-bit encoding.  Unlike UTF-16, UCS-2
  is not extensible beyond 0xFFFF, because it does not use surrogates.
  
-=item
+=item *
  
  UTF-7
  
  
  UTF-7
  
@@ -810,7 +840,13 @@ transport/storage is not eight-bit safe.  Defined by RFC 2152.
  
  =back
  
  
  =back
  
-=head2 Security Implications of Malformed UTF-8
+=head2 Security Implications of Unicode
+
+=over 4
+
+=item *
+
+Malformed UTF-8
  
  Unfortunately, the specification of UTF-8 leaves some room for
  interpretation of how many bytes of encoded output one should generate
  
  Unfortunately, the specification of UTF-8 leaves some room for
  interpretation of how many bytes of encoded output one should generate
@@ -823,6 +859,37 @@ warnings;>) Perl will warn about non-shortest length UTF-8 (and other
  malformations, too, such as the surrogates, which are not real
  Unicode code points.)
  
  malformations, too, such as the surrogates, which are not real
  Unicode code points.)
  
+=item *
+
+Regular expressions behave slightly differently between byte data and
+character (Unicode) data.  For example, the "word character" character
+class C<\w> will work differently when the data is all eight-bit bytes
+or when the data is Unicode.
+
+In the first case, the set of C<\w> characters is either small (the
+default set of alphabetic characters, digits, and the "_"), or, if you
+are using a locale (see L<perllocale>), the C<\w> might contain a few
+more letters according to your language and country.
+
+In the second case, the C<\w> set of characters is much, much larger,
+and most importantly, even in the set of the first 256 characters, it
+will most probably be different: as opposed to most locales (which are
+specific to a language and country pair) Unicode classifies all the
+characters that are letters as C<\w>.  For example: your locale might
+not think that LATIN SMALL LETTER ETH is a letter (unless you happen
+to speak Icelandic), but Unicode does.
+
+As discussed elsewhere, Perl tries to stand with one leg (two legs, as
+camels are quadrupeds?) in each of two worlds: the old world of bytes and the new
+world of characters, upgrading from bytes to characters when necessary.
+If your legacy code is not explicitly using Unicode, no automatic
+switchover to characters should happen, and characters shouldn't get
+downgraded back to bytes, either.  It is possible to accidentally mix
+bytes and characters, however (see L<perluniintro>), in which case the
+C<\w> might start behaving differently.  Review your code.
+
+=back
+
  =head2 Unicode in Perl on EBCDIC
  
  The way Unicode is handled on EBCDIC platforms is still rather
  =head2 Unicode in Perl on EBCDIC
  
  The way Unicode is handled on EBCDIC platforms is still rather
@@ -834,6 +901,28 @@ are specifically discussed. There is no C<utfebcdic> pragma or
  the platform's "natural" 8-bit encoding of Unicode. See L<perlebcdic>
  for more discussion of the issues.
  
  the platform's "natural" 8-bit encoding of Unicode. See L<perlebcdic>
  for more discussion of the issues.
  
+=head2 Locales
+
+Usually locale settings and Unicode do not affect each other, but
+there are a couple of exceptions:
+
+=over 4
+
+=item *
+
+If your locale environment variables (LANGUAGE, LC_ALL, LC_CTYPE, LANG)
+contain the strings 'UTF-8' or 'UTF8' (case-insensitive matching),
+the default encoding of your STDIN, STDOUT, and STDERR, and of
+B<any subsequent file open>, is UTF-8.
+
+=item *
+
+Perl tries really hard to work both with Unicode and the old byte
+oriented world: most often this is nice, but sometimes this causes
+problems.
+
+=back
+
  =head2 Using Unicode in XS
  
  If you want to handle Perl Unicode in XS extensions, you may find
  =head2 Using Unicode in XS
  
  If you want to handle Perl Unicode in XS extensions, you may find
@@ -843,16 +932,17 @@ the following C APIs useful (see perlapi for details):
  
  =item *
  
  
  =item *
  
-DO_UTF8(sv) returns true if the UTF8 flag is on and the bytes
-pragma is not in effect.  SvUTF8(sv) returns true is the UTF8
-flag is on, the bytes pragma is ignored.  Remember that UTF8
-flag being on does not mean that there would be any characters
-of code points greater than 255 or 127 in the scalar, or that
-there even are any characters in the scalar.  The UTF8 flag
-means that any characters added to the string will be encoded
-in UTF8 if the code points of the characters are greater than
-255.  Not "if greater than 127", since Perl's Unicode model
-is not to use UTF-8 until it's really necessary.
+DO_UTF8(sv) returns true if the UTF8 flag is on and the bytes pragma
+is not in effect.  SvUTF8(sv) returns true if the UTF8 flag is on; the
+bytes pragma is ignored.  The UTF8 flag being on does B<not> mean that
+there are any characters of code points greater than 255 (or 127) in
+the scalar, or that there even are any characters in the scalar.
+What the UTF8 flag means is that the sequence of octets in the
+representation of the scalar is the sequence of UTF-8 encoded
+code points of the characters of a string.  The UTF8 flag being
+off means that each octet in this representation encodes a single
+character with code point 0..255 within the string.  Perl's Unicode
+model is not to use UTF-8 until it's really necessary.
  
  =item *
  
  
  =item *
  
@@ -878,6 +968,10 @@ sv_utf8_upgrade(sv) converts the string of the scalar to its UTF-8
  encoded form.  sv_utf8_downgrade(sv) does the opposite (if possible).
  sv_utf8_encode(sv) is like sv_utf8_upgrade but the UTF8 flag does not
  get turned on.  sv_utf8_decode() does the opposite of sv_utf8_encode().
  encoded form.  sv_utf8_downgrade(sv) does the opposite (if possible).
  sv_utf8_encode(sv) is like sv_utf8_upgrade but the UTF8 flag does not
  get turned on.  sv_utf8_decode() does the opposite of sv_utf8_encode().
+Note that none of these are to be used as general purpose encoding/decoding
+interfaces: use Encode for that.  sv_utf8_upgrade() is affected by the
+encoding pragma, but sv_utf8_downgrade() is not (since the encoding
+pragma is designed to be a one-way street).
  
  =item *
  
  
  =item *
  
@@ -931,6 +1025,134 @@ as usual.)
  For more information, see L<perlapi>, and F<utf8.c> and F<utf8.h>
  in the Perl source code distribution.
  
  For more information, see L<perlapi>, and F<utf8.c> and F<utf8.h>
  in the Perl source code distribution.
  
+=head1 BUGS
+
+=head2 Interaction with locales
+
+Use of locales with Unicode data may lead to odd results.  Currently
+there is some attempt to apply 8-bit locale info to characters in the
+range 0..255, but this is demonstrably incorrect for locales that use
+characters above that range when mapped into Unicode.  It will also
+tend to run slower.  Use of locales with Unicode is discouraged.
+
+=head2 Interaction with extensions
+
+When perl exchanges data with an extension, the extension should be
+able to understand the UTF-8 flag and act accordingly. If the
+extension doesn't know about the flag, the risk is high that it will
+return data that are incorrectly flagged.
+
+So if you're working with Unicode data, consult the documentation of
+every module you're using to see if there are any issues with Unicode data
+exchange. If the documentation does not talk about Unicode at all,
+suspect the worst and probably look at the source to learn how the
+module is implemented. Modules written completely in perl shouldn't
+cause problems. Modules that directly or indirectly access code written
+in other programming languages are at risk.
+
+For affected functions the simple strategy to avoid data corruption is
+to always make the encoding of the exchanged data explicit. Choose an
+encoding you know the extension can handle. Convert arguments passed
+to the extensions to that encoding and convert results back from that
+encoding. Write wrapper functions that do the conversions for you, so
+you can later change the functions when the extension catches up.
+
+To provide an example let's say the popular Foo::Bar::escape_html
+function doesn't deal with Unicode data yet. The wrapper function
+would convert the argument to raw UTF-8 and convert the result back to
+perl's internal representation like so:
+
+    sub my_escape_html ($) {
+      my($what) = shift;
+      return unless defined $what;
+      Encode::decode_utf8(Foo::Bar::escape_html(Encode::encode_utf8($what)));
+    }
+
+Sometimes, when the extension does not convert data but just stores
+and retrieves them, you will be in a position to use the otherwise
+dangerous Encode::_utf8_on() function. Let's say the popular
+C<Foo::Bar> extension, written in C, provides a C<param> method that
+lets you store and retrieve data according to these prototypes:
+
+    $self->param($name, $value);            # set a scalar
+    $value = $self->param($name);           # retrieve a scalar
+
+If it does not yet provide support for any encoding, one could write a
+derived class with such a C<param> method:
+
+    sub param {
+      my($self,$name,$value) = @_;
+      utf8::upgrade($name);     # make sure it is UTF-8 encoded
+      if (defined $value) {
+        utf8::upgrade($value);  # make sure it is UTF-8 encoded
+        return $self->SUPER::param($name,$value);
+      } else {
+        my $ret = $self->SUPER::param($name);
+        Encode::_utf8_on($ret); # we know, it is UTF-8 encoded
+        return $ret;
+      }
+    }
+
+Some extensions provide filters on data entry/exit points, such as
+DB_File::filter_store_key and family. Look out for such filters in
+the documentation of your extensions, they can make the transition to
+Unicode data much easier.
+
+=head2 Speed
+
+Some functions are slower when working on UTF-8 encoded strings than
+on byte encoded strings.  All functions that need to hop over
+characters such as length(), substr() or index() can work B<much>
+faster when the underlying data are byte-encoded. Witness the
+following benchmark:
+
+  % perl -e '
+  use Benchmark;
+  use strict;
+  our $l = 10000;
+  our $u = our $b = "x" x $l;
+  substr($u,0,1) = "\x{100}";
+  timethese(-2,{
+  LENGTH_B => q{ length($b) },
+  LENGTH_U => q{ length($u) },
+  SUBSTR_B => q{ substr($b, $l/4, $l/2) },
+  SUBSTR_U => q{ substr($u, $l/4, $l/2) },
+  });
+  '
+  Benchmark: running LENGTH_B, LENGTH_U, SUBSTR_B, SUBSTR_U for at least 2 CPU seconds...
+    LENGTH_B:  2 wallclock secs ( 2.36 usr +  0.00 sys =  2.36 CPU) @ 5649983.05/s (n=13333960)
+    LENGTH_U:  2 wallclock secs ( 2.11 usr +  0.00 sys =  2.11 CPU) @ 12155.45/s (n=25648)
+    SUBSTR_B:  3 wallclock secs ( 2.16 usr +  0.00 sys =  2.16 CPU) @ 374480.09/s (n=808877)
+    SUBSTR_U:  2 wallclock secs ( 2.11 usr +  0.00 sys =  2.11 CPU) @ 6791.00/s (n=14329)
+
+The numbers show an incredible slowness on long UTF-8 strings and you
+should carefully avoid using these functions within tight loops. For
+example if you want to iterate over characters, it is infinitely
+better to split into an array than to use substr, as the following
+benchmark shows:
+
+  % perl -e '
+  use Benchmark;
+  use strict;
+  our $l = 10000;
+  our $u = our $b = "x" x $l;
+  substr($u,0,1) = "\x{100}";
+  timethese(-5,{
+  SPLIT_B => q{ for my $c (split //, $b){}  },
+  SPLIT_U => q{ for my $c (split //, $u){}  },
+  SUBSTR_B => q{ for my $i (0..length($b)-1){my $c = substr($b,$i,1);} },
+  SUBSTR_U => q{ for my $i (0..length($u)-1){my $c = substr($u,$i,1);} },
+  });
+  '
+  Benchmark: running SPLIT_B, SPLIT_U, SUBSTR_B, SUBSTR_U for at least 5 CPU seconds...
+     SPLIT_B:  6 wallclock secs ( 5.29 usr +  0.00 sys =  5.29 CPU) @ 56.14/s (n=297)
+     SPLIT_U:  5 wallclock secs ( 5.17 usr +  0.01 sys =  5.18 CPU) @ 55.21/s (n=286)
+    SUBSTR_B:  5 wallclock secs ( 5.34 usr +  0.00 sys =  5.34 CPU) @ 123.22/s (n=658)
+    SUBSTR_U:  7 wallclock secs ( 6.20 usr +  0.00 sys =  6.20 CPU) @  0.81/s (n=5)
+
+You see, the algorithm based on substr() was faster with byte encoded
+data but it is pathologically slow with UTF-8 data.
+
  =head1 SEE ALSO
  
  L<perluniintro>, L<encoding>, L<Encode>, L<open>, L<utf8>, L<bytes>,
  =head1 SEE ALSO
  
  L<perluniintro>, L<encoding>, L<Encode>, L<open>, L<utf8>, L<bytes>,