lib/charnames.pm

   1 package charnames;
   2 use strict;
   3 use warnings;
   4 our $VERSION = '1.28';
   5 use unicore::Name;    # mktables-generated algorithmically-defined names
   6 use _charnames ();    # The submodule for this where most of the work gets done
   7
   8 use bytes ();          # for $bytes::hint_bits
   9 use re "/aa";          # Everything in here should be ASCII
  10
  11 # Translate between Unicode character names and their code points.
  12 # This is a wrapper around the submodule C<_charnames>.  This design allows
  13 # C<_charnames> to be autoloaded to enable use of \N{...}, but requires this
  14 # module to be explicitly requested for the functions API.
  15
  16 $Carp::Internal{ (__PACKAGE__) } = 1;
  17
  18 sub import {
  19   # Entry point for 'use charnames ...'; the submodule does the real work.
  20   my $own_class = shift;    # invocant only -- not forwarded
  21   return _charnames->import(@_);
  22 }
  23
  24 # Cache of already looked-up official values (user aliases can't override
  25 # them, so scoping is not an issue).  NOTE(review): nothing in this file
  26 # reads or writes it; viacode() below delegates straight to _charnames.
  27 my %viacode;
  28
  29 sub viacode {    # code point => name; thin wrapper around the submodule
  30   return _charnames::viacode(@_);    # args forwarded unchecked and unchanged
  31 }
  32
  33 sub vianame {
  34
  35   # Run-time name-to-code-point lookup.  Takes exactly one name and returns
  36   # its ordinal, or undef when the name is not found.
  37
  38   unless (@_ == 1) {
  39     _charnames::carp("charnames::vianame() expects one name argument");
  40     return;
  41   }
  42
  43   my ($name) = @_;
  44
  45   if (my ($hex_digits) = $name =~ /^U\+([0-9a-fA-F]+)$/) {
  46     # Backward-compatibility wart: a "U+..." input yields the character
  47     # itself (a chr), not an ordinal.  New code can use string_vianame()
  48     # instead.  If the caller's scope has 'use bytes' in effect, a code
  49     # point above 255 is refused with a warning and nothing is returned.
  50     my $code = CORE::hex $hex_digits;
  51     if ($code > 255 && ((caller 0)[8] & $bytes::hint_bits)) {
  52       _charnames::carp(_charnames::not_legal_use_bytes_msg($name, chr $code));
  53       return;
  54     }
  55     return chr $code;
  56   }
  57
  58   # Args: 1 = wants an ordinal; 1 = run time, first-level call from the user
  59   return _charnames::lookup_name($name, 1, 1);
  60 } # vianame
  61
  62 sub string_vianame
  63 {
  64   # Run-time name-to-string lookup.  Takes exactly one name and returns the
  65   # string it stands for, or undef when the name is not found.
  66
  67   unless (@_ == 1) {
  68     _charnames::carp("charnames::string_vianame() expects one name argument");
  69     return;
  70   }
  71
  72   my ($name) = @_;
  73
  74   # Names not of the "U+hex" form go to the general lookup.  Its arguments:
  75   # 0 = wants a string back; 1 = run time, first-level call from the user.
  76   my ($hex_digits) = $name =~ /^U\+([0-9a-fA-F]+)$/
  77     or return _charnames::lookup_name($name, 0, 1);
  78
  79   # If the caller's scope has 'use bytes' on, refuse code points above 255.
  80   my $code = CORE::hex $hex_digits;
  81   if ($code > 255 && ((caller 0)[8] & $bytes::hint_bits)) {
  82     _charnames::carp(_charnames::not_legal_use_bytes_msg($name, chr $code));
  83     return;
  84   }
  85   return chr $code;
  86 } # string_vianame
  87
  88 1;
  89 __END__
  90
  91 =head1 NAME
  92
  93 charnames - access to Unicode character names and named character sequences; also define character names
  94
  95 =head1 SYNOPSIS
  96
  97  use charnames ':full';
  98  print "\N{GREEK SMALL LETTER SIGMA} is called sigma.\n";
  99  print "\N{LATIN CAPITAL LETTER E WITH VERTICAL LINE BELOW}",
 100        " is an officially named sequence of two Unicode characters\n";
 101
 102  use charnames ':loose';
 103  print "\N{Greek small-letter  sigma}",
 104         " can be used to ignore case, underscores, most blanks,",
 105         " and when you aren't sure if the official name has hyphens\n";
 106
 107  use charnames ':short';
 108  print "\N{greek:Sigma} is an upper-case sigma.\n";
 109
 110  use charnames qw(cyrillic greek);
 111  print "\N{sigma} is Greek sigma, and \N{be} is Cyrillic b.\n";
 112
 113  use charnames ":full", ":alias" => {
 114    e_ACUTE => "LATIN SMALL LETTER E WITH ACUTE",
 115    mychar => 0xE8000,  # Private use area
 116  };
 117  print "\N{e_ACUTE} is a small letter e with an acute.\n";
 118  print "\N{mychar} allows me to name private use characters.\n";
 119
 120  use charnames ();
 121  print charnames::viacode(0x1234); # prints "ETHIOPIC SYLLABLE SEE"
 122  printf "%04X", charnames::vianame("GOTHIC LETTER AHSA"); # prints
 123                                                           # "10330"
 124  print charnames::vianame("LATIN CAPITAL LETTER A"); # prints 65 on
 125                                                      # ASCII platforms;
 126                                                      # 193 on EBCDIC
 127  print charnames::string_vianame("LATIN CAPITAL LETTER A"); # prints "A"
 128
 129 =head1 DESCRIPTION
 130
 131 Pragma C<use charnames> is used to gain access to the names of the
 132 Unicode characters and named character sequences, and to allow you to define
 133 your own character and character sequence names.
 134
 135 All forms of the pragma enable use of the following 3 functions:
 136
 137 =over
 138
 139 =item *
 140
 141 L</charnames::string_vianame(I<name>)> for run-time lookup of
 142 either a character name or a named character sequence, returning its string
 143 representation
 144
 145 =item *
 146
 147 L</charnames::vianame(I<name>)> for run-time lookup of a
 148 character name (but not a named character sequence) to get its ordinal value
 149 (code point)
 150
 151 =item *
 152
 153 L</charnames::viacode(I<code>)> for run-time lookup of a code point to get its
 154 Unicode name.
 155
 156 =back
 157
 158 Starting in Perl 5.16, any occurrence of C<\N{I<CHARNAME>}> sequences
 159 in a double-quotish string automatically loads this module with arguments
 160 C<:full> and C<:short> (described below) if it hasn't already been loaded with
 161 different arguments, in order to compile the named Unicode character into
 162 position in the string.  Prior to 5.16, an explicit S<C<use charnames>> was
 163 required to enable this usage.  (However, prior to 5.16, the form C<S<"use
 164 charnames ();">> did not enable C<\N{I<CHARNAME>}>.)
 165
 166 Note that C<\N{U+I<...>}>, where the I<...> is a hexadecimal number,
 167 also inserts a character into a string.
 168 The character it inserts is the one whose code point
 169 (ordinal value) is equal to the number.  For example, C<"\N{U+263a}"> is
 170 the Unicode (white background, black foreground) smiley face
 171 equivalent to C<"\N{WHITE SMILING FACE}">.
 172 Also note, C<\N{I<...>}> can mean a regex quantifier instead of a character
 173 name, when the I<...> is a number (or a comma-separated pair of numbers;
 174 see L<perlreref/QUANTIFIERS>), and is not related to this pragma.
 175
 176 The C<charnames> pragma supports arguments C<:full>, C<:loose>, C<:short>,
 177 script names and L<customized aliases|/CUSTOM ALIASES>.
 178
 179 If C<:full> is present, for expansion of
 180 C<\N{I<CHARNAME>}>, the string I<CHARNAME> is first looked up in the list of
 181 standard Unicode character names.
 182
 183 C<:loose> is a variant of C<:full> which allows I<CHARNAME> to be less
 184 precisely specified.  Details are in L</LOOSE MATCHES>.
 185
 186 If C<:short> is present, and
 187 I<CHARNAME> has the form C<I<SCRIPT>:I<CNAME>>, then I<CNAME> is looked up
 188 as a letter in script I<SCRIPT>, as described in the next paragraph.
 189 Or, if C<use charnames> is used
 190 with script name arguments, then for C<\N{I<CHARNAME>}> the name
 191 I<CHARNAME> is looked up as a letter in the given scripts (in the
 192 specified order). Customized aliases can override these, and are explained in
 193 L</CUSTOM ALIASES>.
 194
 195 For lookup of I<CHARNAME> inside a given script I<SCRIPTNAME>
 196 this pragma looks in the table of standard Unicode names for the names
 197
 198   SCRIPTNAME CAPITAL LETTER CHARNAME
 199   SCRIPTNAME SMALL LETTER CHARNAME
 200   SCRIPTNAME LETTER CHARNAME
 201
 202 If I<CHARNAME> is all lowercase,
 203 then the C<CAPITAL> variant is ignored, otherwise the C<SMALL> variant
 204 is ignored, and both I<CHARNAME> and I<SCRIPTNAME> are converted to all
 205 uppercase for look-up.  Other than that, both of them follow L<loose|/LOOSE
 206 MATCHES> rules if C<:loose> is also specified; strict otherwise.
 207
 208 Note that C<\N{...}> is compile-time; it's a special form of string
 209 constant used inside double-quotish strings; this means that you cannot
 210 use variables inside the C<\N{...}>.  If you want similar run-time
 211 functionality, use
 212 L<charnames::string_vianame()|/charnames::string_vianame(I<name>)>.
 213
 214 For the C0 and C1 control characters (U+0000..U+001F, U+0080..U+009F)
 215 there are no official Unicode names but you can use instead the ISO 6429
 216 names (LINE FEED, ESCAPE, and so forth, and their abbreviations, LF,
 217 ESC, ...).  In Unicode 3.2 (as of Perl 5.8) some naming changes took
 218 place, and ISO 6429 was updated, see L</ALIASES>.  Since Unicode 6.0, it
 219 is deprecated to use C<BELL>.  Instead use C<ALERT> (but C<BEL> will continue
 220 to work).
 221
 222 If the input name is unknown, C<\N{NAME}> raises a warning and
 223 substitutes the Unicode REPLACEMENT CHARACTER (U+FFFD).
 224
 225 For C<\N{NAME}>, it is a fatal error if C<use bytes> is in effect and the
 226 input name is that of a character that won't fit into a byte (i.e., whose
 227 ordinal is above 255).
 228
 229 Otherwise, any string that includes a C<\N{I<charname>}> or
 230 C<S<\N{U+I<code point>}>> will automatically have Unicode semantics (see
 231 L<perlunicode/Byte and Character Semantics>).
 232
 233 =head1 LOOSE MATCHES
 234
 235 By specifying C<:loose>, Unicode's L<loose character name
 236 matching|http://www.unicode.org/reports/tr44#Matching_Rules> rules are
 237 selected instead of the strict exact match used otherwise.
 238 That means that I<CHARNAME> doesn't have to be so precisely specified.
 239 Upper/lower case doesn't matter (except with scripts as mentioned above), nor
 240 do any underscores, and the only hyphens that matter are those at the
 241 beginning or end of a word in the name (with one exception:  the hyphen in
 242 U+1180 C<HANGUL JUNGSEONG O-E> does matter).
 243 Also, blanks not adjacent to hyphens don't matter.
 244 The official Unicode names are quite variable as to where they use hyphens
 245 versus spaces to separate word-like units, and this option allows you to not
 246 have to care as much.
 247 The reason non-medial hyphens matter is because of cases like
 248 U+0F60 C<TIBETAN LETTER -A> versus U+0F68 C<TIBETAN LETTER A>.
 249 The hyphen here is significant, as is the space before it, and so both must be
 250 included.
 251
 252 C<:loose> slows down look-ups by a factor of 2 to 3 versus
 253 C<:full>, but the trade-off may be worth it to you.  Each individual look-up
 254 takes very little time, and the results are cached, so the speed difference
 255 would become a factor only in programs that do look-ups of many different
 256 spellings, and probably only when those look-ups are through vianame() and
 257 string_vianame(), since C<\N{...}> look-ups are done at compile time.
 258
 259 =head1 ALIASES
 260
 261 A few aliases have been defined for convenience; instead of having
 262 to use the official names,
 263
 264     LINE FEED (LF)
 265     FORM FEED (FF)
 266     CARRIAGE RETURN (CR)
 267     NEXT LINE (NEL)
 268
 269 (yes, with parentheses), one can use
 270
 271     LINE FEED
 272     FORM FEED
 273     CARRIAGE RETURN
 274     NEXT LINE
 275     LF
 276     FF
 277     CR
 278     NEL
 279
 280 All the other standard abbreviations for the controls, such as C<ACK> for
 281 C<ACKNOWLEDGE> also can be used.
 282
 283 One can also use
 284
 285     BYTE ORDER MARK
 286     BOM
 287
 288 and these abbreviations
 289
 290     Abbreviation        Full Name
 291
 292     CGJ                 COMBINING GRAPHEME JOINER
 293     FVS1                MONGOLIAN FREE VARIATION SELECTOR ONE
 294     FVS2                MONGOLIAN FREE VARIATION SELECTOR TWO
 295     FVS3                MONGOLIAN FREE VARIATION SELECTOR THREE
 296     LRE                 LEFT-TO-RIGHT EMBEDDING
 297     LRM                 LEFT-TO-RIGHT MARK
 298     LRO                 LEFT-TO-RIGHT OVERRIDE
 299     MMSP                MEDIUM MATHEMATICAL SPACE
 300     MVS                 MONGOLIAN VOWEL SEPARATOR
 301     NBSP                NO-BREAK SPACE
 302     NNBSP               NARROW NO-BREAK SPACE
 303     PDF                 POP DIRECTIONAL FORMATTING
 304     RLE                 RIGHT-TO-LEFT EMBEDDING
 305     RLM                 RIGHT-TO-LEFT MARK
 306     RLO                 RIGHT-TO-LEFT OVERRIDE
 307     SHY                 SOFT HYPHEN
 308     VS1                 VARIATION SELECTOR-1
 309     .
 310     .
 311     .
 312     VS256               VARIATION SELECTOR-256
 313     WJ                  WORD JOINER
 314     ZWJ                 ZERO WIDTH JOINER
 315     ZWNJ                ZERO WIDTH NON-JOINER
 316     ZWSP                ZERO WIDTH SPACE
 317
 318 For backward compatibility one can use the old names for
 319 certain C0 and C1 controls
 320
 321     old                         new
 322
 323     FILE SEPARATOR              INFORMATION SEPARATOR FOUR
 324     GROUP SEPARATOR             INFORMATION SEPARATOR THREE
 325     HORIZONTAL TABULATION       CHARACTER TABULATION
 326     HORIZONTAL TABULATION SET   CHARACTER TABULATION SET
 327     HORIZONTAL TABULATION WITH JUSTIFICATION    CHARACTER TABULATION
 328                                                 WITH JUSTIFICATION
 329     PARTIAL LINE DOWN           PARTIAL LINE FORWARD
 330     PARTIAL LINE UP             PARTIAL LINE BACKWARD
 331     RECORD SEPARATOR            INFORMATION SEPARATOR TWO
 332     REVERSE INDEX               REVERSE LINE FEED
 333     UNIT SEPARATOR              INFORMATION SEPARATOR ONE
 334     VERTICAL TABULATION         LINE TABULATION
 335     VERTICAL TABULATION SET     LINE TABULATION SET
 336
 337 but the old names in addition to giving the character
 338 will also give a warning about being deprecated.
 339
 340 And finally, certain published variants are usable, including some for
 341 controls that have no Unicode names:
 342
 343     name                                   character
 344
 345     END OF PROTECTED AREA                  END OF GUARDED AREA, U+0097
 346     HIGH OCTET PRESET                      U+0081
 347     HOP                                    U+0081
 348     IND                                    U+0084
 349     INDEX                                  U+0084
 350     PAD                                    U+0080
 351     PADDING CHARACTER                      U+0080
 352     PRIVATE USE 1                          PRIVATE USE ONE, U+0091
 353     PRIVATE USE 2                          PRIVATE USE TWO, U+0092
 354     SGC                                    U+0099
 355     SINGLE GRAPHIC CHARACTER INTRODUCER    U+0099
 356     SINGLE-SHIFT 2                         SINGLE SHIFT TWO, U+008E
 357     SINGLE-SHIFT 3                         SINGLE SHIFT THREE, U+008F
 358     START OF PROTECTED AREA                START OF GUARDED AREA, U+0096
 359
 360 =head1 CUSTOM ALIASES
 361
 362 You can add customized aliases to standard (C<:full>) Unicode naming
 363 conventions.  The aliases override any standard definitions, so, if
 364 you're twisted enough, you can change C<"\N{LATIN CAPITAL LETTER A}"> to
 365 mean C<"B">, etc.
 366
 367 Note that an alias should not be something that is a legal curly
 368 brace-enclosed quantifier (see L<perlreref/QUANTIFIERS>).  For example
 369 C<\N{123}> means to match 123 non-newline characters, and is not treated as a
 370 charnames alias.  Aliases are discouraged from beginning with anything
 371 other than an alphabetic character and from containing anything other
 372 than alphanumerics, spaces, dashes, parentheses, and underscores.
 373 Currently they must be ASCII.
 374
 375 An alias can map to either an official Unicode character name (not a loose
 376 matched name) or to a
 377 numeric code point (ordinal).  The latter is useful for assigning names
 378 to code points in Unicode private use areas such as U+E800 through
 379 U+F8FF.
 380 A numeric code point must be a non-negative integer or a string beginning
 381 with C<"U+"> or C<"0x"> with the remainder considered to be a
 382 hexadecimal integer.  A literal numeric constant must be unsigned; it
 383 will be interpreted as hex if it has a leading zero or contains
 384 non-decimal hex digits; otherwise it will be interpreted as decimal.
 385
 386 Aliases are added either by the use of anonymous hashes:
 387
 388     use charnames ":alias" => {
 389         e_ACUTE => "LATIN SMALL LETTER E WITH ACUTE",
 390         mychar1 => 0xE8000,
 391         };
 392     my $str = "\N{e_ACUTE}";
 393
 394 or by using a file containing aliases:
 395
 396     use charnames ":alias" => "pro";
 397
 398 This will try to read C<"unicore/pro_alias.pl"> from the C<@INC> path. This
 399 file should return a list in plain perl:
 400
 401     (
 402     A_GRAVE         => "LATIN CAPITAL LETTER A WITH GRAVE",
 403     A_CIRCUM        => "LATIN CAPITAL LETTER A WITH CIRCUMFLEX",
 404     A_DIAERES       => "LATIN CAPITAL LETTER A WITH DIAERESIS",
 405     A_TILDE         => "LATIN CAPITAL LETTER A WITH TILDE",
 406     A_BREVE         => "LATIN CAPITAL LETTER A WITH BREVE",
 407     A_RING          => "LATIN CAPITAL LETTER A WITH RING ABOVE",
 408     A_MACRON        => "LATIN CAPITAL LETTER A WITH MACRON",
 409     mychar2         => "U+E8001",
 410     );
 411
 412 Both these methods insert C<":full"> automatically as the first argument (if no
 413 other argument is given), and you can give the C<":full"> explicitly as
 414 well, like
 415
 416     use charnames ":full", ":alias" => "pro";
 417
 418 C<":loose"> has no effect with these.  Input names must match exactly, using
 419 C<":full"> rules.
 420
 421 Also, both these methods currently allow only single characters to be named.
 422 To name a sequence of characters, use a
 423 L<custom translator|/CUSTOM TRANSLATORS> (described below).
 424
 425 =head1 charnames::viacode(I<code>)
 426
 427 Returns the full name of the character indicated by the numeric code.
 428 For example,
 429
 430     print charnames::viacode(0x2722);
 431
 432 prints "FOUR TEARDROP-SPOKED ASTERISK".
 433
 434 The name returned is the official name for the code point, if
 435 available; otherwise your custom alias for it.  This means that your
 436 alias will only be returned for code points that don't have an official
 437 Unicode name (nor a Unicode version 1 name), such as private use code
 438 points, and the 4 control characters U+0080, U+0081, U+0084, and U+0099.
 439 If you define more than one name for the code point, it is indeterminate
 440 which one will be returned.
 441
 442 The function returns C<undef> if no name is known for the code point.
 443 In Unicode the proper name of these is the empty string, which
 444 C<undef> stringifies to.  (If you ask for a code point past the legal
 445 Unicode maximum of U+10FFFF that you haven't assigned an alias to, you
 446 get C<undef> plus a warning.)
 447
 448 The input number must be a non-negative integer or a string beginning
 449 with C<"U+"> or C<"0x"> with the remainder considered to be a
 450 hexadecimal integer.  A literal numeric constant must be unsigned; it
 451 will be interpreted as hex if it has a leading zero or contains
 452 non-decimal hex digits; otherwise it will be interpreted as decimal.
 453
 454 Notice that the name returned for U+FEFF is "ZERO WIDTH NO-BREAK
 455 SPACE", not "BYTE ORDER MARK".
 456
 457 =head1 charnames::string_vianame(I<name>)
 458
 459 This is a runtime equivalent to C<\N{...}>.  I<name> can be any expression
 460 that evaluates to a name accepted by C<\N{...}> under the L<C<:full>
 461 option|/DESCRIPTION> to C<charnames>.  In addition, any other options for the
 462 controlling C<"use charnames"> in the same scope apply, like C<:loose> or any
 463 L<script list, C<:short> option|/DESCRIPTION>, or L<custom aliases|/CUSTOM
 464 ALIASES> you may have defined.
 465
 466 The only difference is that if the input name is unknown, C<string_vianame>
 467 returns C<undef> instead of the REPLACEMENT CHARACTER and does not raise a
 468 warning message.
 469
 470 =head1 charnames::vianame(I<name>)
 471
 472 This is similar to C<string_vianame>.  The main difference is that under most
 473 circumstances, vianame returns an ordinal code
 474 point, whereas C<string_vianame> returns a string.  For example,
 475
 476    printf "U+%04X", charnames::vianame("FOUR TEARDROP-SPOKED ASTERISK");
 477
 478 prints "U+2722".
 479
 480 This leads to the other two differences.  Since a single code point is
 481 returned, the function can't handle named character sequences, as these are
 482 composed of multiple characters (it returns C<undef> for these).  And, the
 483 code point can be that of any
 484 character, even ones that aren't legal under the C<S<use bytes>> pragma.
 485
 486 See L</BUGS> for the circumstances in which the behavior differs
 487 from  that described above.
 488
 489 =head1 CUSTOM TRANSLATORS
 490
 491 The mechanism of translation of C<\N{...}> escapes is general and not
 492 hardwired into F<charnames.pm>.  A module can install custom
 493 translations (inside the scope which C<use>s the module) with the
 494 following magic incantation:
 495
 496     sub import {
 497         shift;
 498         $^H{charnames} = \&translator;
 499     }
 500
 501 Here translator() is a subroutine which takes I<CHARNAME> as an
 502 argument, and returns text to insert into the string instead of the
 503 C<\N{I<CHARNAME>}> escape.
 504
 505 This is the only way you can create a custom named sequence of code points.
 506
 507 Since the text to insert should be different
 508 in C<bytes> mode and out of it, the function should check the current
 509 state of C<bytes>-flag as in:
 510
 511     use bytes ();                      # for $bytes::hint_bits
 512     sub translator {
 513         if ($^H & $bytes::hint_bits) {
 514             return bytes_translator(@_);
 515         }
 516         else {
 517             return utf8_translator(@_);
 518         }
 519     }
 520
 521 See L</CUSTOM ALIASES> above for restrictions on I<CHARNAME>.
 522
 523 Of course, C<vianame>, C<viacode>, and C<string_vianame> would need to be
 524 overridden as well.
 525
 526 =head1 BUGS
 527
 528 vianame() normally returns an ordinal code point, but when the input name is of
 529 the form C<U+...>, it returns a chr instead.  In this case, if C<use bytes> is
 530 in effect and the character won't fit into a byte, it returns C<undef> and
 531 raises a warning.
 532
 533 Names must be ASCII characters only, which means that you are out of luck if
 534 you want to create aliases in a language where some or all the characters of
 535 the desired aliases are non-ASCII.
 536
 537 Since evaluation of the translation function (see L</CUSTOM
 538 TRANSLATORS>) happens in the middle of compilation (of a string
 539 literal), the translation function should not do any C<eval>s or
 540 C<require>s.  This restriction should be lifted (but is low priority) in
 541 a future version of Perl.
 542
 543 =cut
 544
 545 # ex: set ts=8 sts=2 sw=2 et: