add parallel support 4 Win32 dmake-COREDIR parallelism part 2

[perl5.git] / lib / charnames.pm
diff --git a/lib/charnames.pm b/lib/charnames.pm

index 07c1b70..2efe3d5 100644 (file)
--- a/lib/charnames.pm
+++ b/lib/charnames.pm
@@ -1,7 +1,7 @@
  package charnames;
  use strict;
  use warnings;
-our $VERSION = '1.29';
+our $VERSION = '1.43';
  use unicore::Name;    # mktables-generated algorithmically-defined names
  use _charnames ();    # The submodule for this where most of the work gets done
  
@@ -49,7 +49,7 @@ sub vianame
      # can't change it because of backward compatibility.  New code can use
      # string_vianame() instead.
      my $ord = CORE::hex $1;
-    return chr $ord if $ord <= 255 || ! ((caller 0)[8] & $bytes::hint_bits);
+    return pack("U", $ord) if $ord <= 255 || ! ((caller 0)[8] & $bytes::hint_bits);
      _charnames::carp _charnames::not_legal_use_bytes_msg($arg, chr $ord);
      return;
    }
@@ -74,7 +74,7 @@ sub string_vianame {
    if ($arg =~ /^U\+([0-9a-fA-F]+)$/) {
  
      my $ord = CORE::hex $1;
-    return chr $ord if $ord <= 255 || ! ((caller 0)[8] & $bytes::hint_bits);
+    return pack("U", $ord) if $ord <= 255 || ! ((caller 0)[8] & $bytes::hint_bits);
  
      _charnames::carp _charnames::not_legal_use_bytes_msg($arg, chr $ord);
      return;
@@ -88,6 +88,8 @@ sub string_vianame {
  1;
  __END__
  
+=encoding utf8
+
  =head1 NAME
  
  charnames - access to Unicode character names and named character sequences; also define character names
@@ -110,12 +112,16 @@ charnames - access to Unicode character names and named character sequences; als
   use charnames qw(cyrillic greek);
   print "\N{sigma} is Greek sigma, and \N{be} is Cyrillic b.\n";
  
+ use utf8;
   use charnames ":full", ":alias" => {
     e_ACUTE => "LATIN SMALL LETTER E WITH ACUTE",
     mychar => 0xE000,  # Private use area
+   "自転車に乗る人" => "BICYCLIST"
   };
   print "\N{e_ACUTE} is a small letter e with an acute.\n";
   print "\N{mychar} allows me to name private use characters.\n";
+ print "And I can create synonyms in other languages,",
+       " such as \N{自転車に乗る人} for BICYCLIST (U+1F6B4)\n";
  
   use charnames ();
   print charnames::viacode(0x1234); # prints "ETHIOPIC SYLLABLE SEE"
@@ -165,7 +171,7 @@ charnames ();">> did not enable C<\N{I<CHARNAME>}>.)
  
  Note that C<\N{U+I<...>}>, where the I<...> is a hexadecimal number,
  also inserts a character into a string.
-The character it inserts is the one whose code point
+The character it inserts is the one whose Unicode code point
  (ordinal value) is equal to the number.  For example, C<"\N{U+263a}"> is
  the Unicode (white background, black foreground) smiley face
  equivalent to C<"\N{WHITE SMILING FACE}">.
@@ -211,18 +217,18 @@ use variables inside the C<\N{...}>.  If you want similar run-time
  functionality, use
  L<charnames::string_vianame()|/charnames::string_vianame(I<name>)>.
  
-Since Unicode 6.0, it is deprecated to use C<BELL>.  Instead use C<ALERT> (but
-C<BEL> will continue to work).
+Note, starting in Perl 5.18, the name C<BELL> refers to the Unicode character
+U+1F514, instead of the traditional U+0007.  For the latter, use C<ALERT>
+or C<BEL>.
  
-If the input name is unknown, C<\N{NAME}> raises a warning and
-substitutes the Unicode REPLACEMENT CHARACTER (U+FFFD).
+It is a syntax error to use C<\N{NAME}> where C<NAME> is unknown.
  
  For C<\N{NAME}>, it is a fatal error if C<use bytes> is in effect and the
  input name is that of a character that won't fit into a byte (i.e., whose
  ordinal is above 255).
  
  Otherwise, any string that includes a C<\N{I<charname>}> or
-C<S<\N{U+I<code point>}>> will automatically have Unicode semantics (see
+C<S<\N{U+I<code point>}>> will automatically have Unicode rules (see
  L<perlunicode/Byte and Character Semantics>).
  
  =head1 LOOSE MATCHES
@@ -248,8 +254,8 @@ C<:loose> slows down look-ups by a factor of 2 to 3 versus
  C<:full>, but the trade-off may be worth it to you.  Each individual look-up
  takes very little time, and the results are cached, so the speed difference
  would become a factor only in programs that do look-ups of many different
-spellings, and probably only when those look-ups are through vianame() and
-string_vianame(), since C<\N{...}> look-ups are done at compile time.
+spellings, and probably only when those look-ups are through C<vianame()> and
+C<string_vianame()>, since C<\N{...}> look-ups are done at compile time.
  
  =head1 ALIASES
  
@@ -270,24 +276,36 @@ conventions.  The aliases override any standard definitions, so, if
  you're twisted enough, you can change C<"\N{LATIN CAPITAL LETTER A}"> to
  mean C<"B">, etc.
  
-Note that an alias should not be something that is a legal curly
-brace-enclosed quantifier (see L<perlreref/QUANTIFIERS>).  For example
-C<\N{123}> means to match 123 non-newline characters, and is not treated as a
-charnames alias.  Aliases are discouraged from beginning with anything
-other than an alphabetic character and from containing anything other
-than alphanumerics, spaces, dashes, parentheses, and underscores.
-Currently they must be ASCII.
+Aliases must begin with a character that is alphabetic.  After that, each may
+contain any combination of word (C<\w>) characters, SPACE (U+0020),
+HYPHEN-MINUS (U+002D), LEFT PARENTHESIS (U+0028), RIGHT PARENTHESIS (U+0029),
+and NO-BREAK SPACE (U+00A0).  These last three should never have been allowed
+in names, and are retained for backwards compatibility only; NO-BREAK SPACE is
+currently deprecated and scheduled for removal in Perl v5.26; the other two
+may also be
+deprecated and removed in future releases of Perl, so don't use them for new
+names.  (More precisely, the first character of a name you specify must be
+something that matches all of C<\p{ID_Start}>, C<\p{Alphabetic}>, and
+C<\p{Gc=Letter}>.  This makes sure it is what any reasonable person would view
+as an alphabetic character.  And, the continuation characters that match C<\w>
+must also match C<\p{ID_Continue}>.)  Starting with Perl v5.18, any Unicode
+characters meeting the above criteria may be used; prior to that only
+Latin1-range characters were acceptable.
  
  An alias can map to either an official Unicode character name (not a loose
  matched name) or to a
  numeric code point (ordinal).  The latter is useful for assigning names
  to code points in Unicode private use areas such as U+E000 through
  U+F8FF.
-A numeric code point must be a non-negative integer or a string beginning
+A numeric code point must be a non-negative integer, or a string beginning
  with C<"U+"> or C<"0x"> with the remainder considered to be a
  hexadecimal integer.  A literal numeric constant must be unsigned; it
  will be interpreted as hex if it has a leading zero or contains
  non-decimal hex digits; otherwise it will be interpreted as decimal.
+If it begins with C<"U+">, it is interpreted as the Unicode code point;
+otherwise it is interpreted as native.  (Only code points below 256 can
+differ between Unicode and native.)  Thus C<U+41> is always the Latin letter
+"A"; but C<0x41> can be "NO-BREAK SPACE" on EBCDIC platforms.
  
  Aliases are added either by the use of anonymous hashes:
  
@@ -328,43 +346,6 @@ Also, both these methods currently allow only single characters to be named.
  To name a sequence of characters, use a
  L<custom translator|/CUSTOM TRANSLATORS> (described below).
  
-=head1 charnames::viacode(I<code>)
-
-Returns the full name of the character indicated by the numeric code.
-For example,
-
-    print charnames::viacode(0x2722);
-
-prints "FOUR TEARDROP-SPOKED ASTERISK".
-
-The name returned is the official name for the code point, if
-available; otherwise your custom alias for it.  This means that your
-alias will only be returned for code points that don't have an official
-Unicode name (nor alias) such as private use code points.
-Until Unicode 6.1, the 4 control characters U+0080, U+0081, U+0084, and U+0099
-did not have names (actually, to be precise they still don't, but they do have
-aliases, which for most purposes are indistiunguishable from true names).
-To preserve backwards compatibility, any alias you define for these code
-points will be returned by this function, in preference to the official alias.
-
-If you define more than one name for the code point, it is indeterminate
-which one will be returned.
-
-The function returns C<undef> if no name is known for the code point.
-In Unicode the proper name of these is the empty string, which
-C<undef> stringifies to.  (If you ask for a code point past the legal
-Unicode maximum of U+10FFFF that you haven't assigned an alias to, you
-get C<undef> plus a warning.)
-
-The input number must be a non-negative integer, or a string beginning
-with C<"U+"> or C<"0x"> with the remainder considered to be a
-hexadecimal integer.  A literal numeric constant must be unsigned; it
-will be interpreted as hex if it has a leading zero or contains
-non-decimal hex digits; otherwise it will be interpreted as decimal.
-
-Notice that the name returned for U+FEFF is "ZERO WIDTH NO-BREAK
-SPACE", not "BYTE ORDER MARK".
-
  =head1 charnames::string_vianame(I<name>)
  
  This is a runtime equivalent to C<\N{...}>.  I<name> can be any expression
@@ -374,14 +355,15 @@ controlling C<"use charnames"> in the same scope apply, like C<:loose> or any
  L<script list, C<:short> option|/DESCRIPTION>, or L<custom aliases|/CUSTOM
  ALIASES> you may have defined.
  
-The only difference is that if the input name is unknown, C<string_vianame>
-returns C<undef> instead of the REPLACEMENT CHARACTER and does not raise a
-warning message.
+The only differences are due to the fact that C<string_vianame> is run-time
+and C<\N{}> is compile-time.  You can't interpolate inside a C<\N{}> (so
+C<\N{$variable}> doesn't work); and if the input name is unknown,
+C<string_vianame> returns C<undef> instead of it being a syntax error.
  
  =head1 charnames::vianame(I<name>)
  
  This is similar to C<string_vianame>.  The main difference is that under most
-circumstances, vianame returns an ordinal code
+circumstances, C<vianame> returns an ordinal code
  point, whereas C<string_vianame> returns a string.  For example,
  
     printf "U+%04X", charnames::vianame("FOUR TEARDROP-SPOKED ASTERISK");
@@ -397,6 +379,80 @@ character, even ones that aren't legal under the C<S<use bytes>> pragma,
  See L</BUGS> for the circumstances in which the behavior differs
  from that described above.
  
+=head1 charnames::viacode(I<code>)
+
+Returns the full name of the character indicated by the numeric code.
+For example,
+
+    print charnames::viacode(0x2722);
+
+prints "FOUR TEARDROP-SPOKED ASTERISK".
+
+The name returned is the "best" (defined below) official name or alias
+for the code point, if
+available; otherwise your custom alias for it, if defined; otherwise C<undef>.
+This means that your alias will only be returned for code points that don't
+have an official Unicode name (nor alias) such as private use code points.
+
+If you define more than one name for the code point, it is indeterminate
+which one will be returned.
+
+As mentioned, the function returns C<undef> if no name is known for the code
+point.  In Unicode the proper name for these is the empty string, which
+C<undef> stringifies to.  (If you ask for a code point past the legal
+Unicode maximum of U+10FFFF that you haven't assigned an alias to, you
+get C<undef> plus a warning.)
+
+The input number must be a non-negative integer, or a string beginning
+with C<"U+"> or C<"0x"> with the remainder considered to be a
+hexadecimal integer.  A literal numeric constant must be unsigned; it
+will be interpreted as hex if it has a leading zero or contains
+non-decimal hex digits; otherwise it will be interpreted as decimal.
+If it begins with C<"U+">, it is interpreted as the Unicode code point;
+otherwise it is interpreted as native.  (Only code points below 256 can
+differ between Unicode and native.)  Thus C<U+41> is always the Latin letter
+"A"; but C<0x41> can be "NO-BREAK SPACE" on EBCDIC platforms.
+
+As mentioned above under L</ALIASES>, Unicode 6.1 defines extra names
+(synonyms or aliases) for some code points, most of which were already
+available as Perl extensions.  All these are accepted by C<\N{...}> and the
+other functions in this module, but C<viacode> has to choose which one
+name to return for a given input code point, so it returns the "best" name.
+To understand how this works, it is helpful to know more about the Unicode
+name properties.  All code points actually have only a single name, which
+(starting in Unicode 2.0) can never change once a character has been assigned
+to the code point.  But mistakes have been made in assigning names; for
+example, sometimes a clerical error was made during the publishing of the
+Standard which caused words to be misspelled, and there was no way to correct
+those.  The Name_Alias property was eventually created to handle these
+situations.  If a name was wrong, a corrected synonym would be published for
+it, using Name_Alias.  C<viacode> will return that corrected synonym as the
+"best" name for a code point.  (It is even possible, though it hasn't happened
+yet, that the correction itself will need to be corrected, and so another
+Name_Alias can be created for that code point; C<viacode> will return the
+most recent correction.)
+
+The Unicode name for each of the control characters (such as LINE FEED) is the
+empty string.  However almost all had names assigned by other standards, such
+as the ASCII Standard, or were in common use.  C<viacode> returns these names
+as the "best" ones available.  Unicode 6.1 has created Name_Aliases for each
+of them, including alternate names, like NEW LINE.  C<viacode> uses the
+original name, "LINE FEED", in preference to the alternate.  Similarly, the
+name returned for U+FEFF is "ZERO WIDTH NO-BREAK SPACE", not "BYTE ORDER
+MARK".
+
+Until Unicode 6.1, the 4 control characters U+0080, U+0081, U+0084, and U+0099
+did not have names or aliases.
+To preserve backwards compatibility, any alias you define for these code
+points will be returned by this function, in preference to the official name.
+
+Some code points also have abbreviated names, such as "LF" or "NL".
+C<viacode> never returns these.
+
+Because a name correction may be added in future Unicode releases, the name
+that C<viacode> returns may change as a result.  This is a rare event, but it
+does happen.
+
  =head1 CUSTOM TRANSLATORS
  
  The mechanism of translation of C<\N{...}> escapes is general and not
@@ -441,10 +497,6 @@ the form C<U+...>, it returns a chr instead.  In this case, if C<use bytes> is
  in effect and the character won't fit into a byte, it returns C<undef> and
  raises a warning.
  
-Names must be ASCII characters only, which means that you are out of luck if
-you want to create aliases in a language where some or all the characters of
-the desired aliases are non-ASCII.
-
  Since evaluation of the translation function (see L</CUSTOM
  TRANSLATORS>) happens in the middle of compilation (of a string
  literal), the translation function should not do any C<eval>s or