pod/perlunicook.pod

   1
   2 =encoding utf8
   3
   4 =head1 NAME
   5
   6 perlunicook - cookbookish examples of handling Unicode in Perl
   7
   8 =head1 DESCRIPTION
   9
  10 This manpage contains short recipes demonstrating how to handle common Unicode
  11 operations in Perl, plus one complete program at the end. Any undeclared
  12 variables in individual recipes are assumed to have a previous appropriate
  13 value in them.
  14
  15 =head1 EXAMPLES
  16
  17 =head2 ℞ 0: Standard preamble
  18
  19 Unless otherwise noted, all examples below require this standard preamble
  20 to work correctly, with the C<#!> adjusted to work on your system:
  21
  22  #!/usr/bin/env perl
  23
  24  use utf8;      # so literals and identifiers can be in UTF-8
  25  use v5.12;     # or later to get "unicode_strings" feature
  26  use strict;    # quote strings, declare variables
  27  use warnings;  # on by default
  28  use warnings  qw(FATAL utf8);    # fatalize encoding glitches
  29  use open      qw(:std :utf8);    # undeclared streams in UTF-8
  30  use charnames qw(:full :short);  # unneeded in v5.16
  31
  32 This I<does> make even Unix programmers C<binmode> your binary streams,
  33 or open them with C<:raw>, but that's the only way to get at them
  34 portably anyway.
  35
  36 B<WARNING>: C<use autodie> and C<use open> do not get along with each other.
  37
  38 =head2 ℞ 1: Generic Unicode-savvy filter
  39
  40 Always decompose on the way in, then recompose on the way out.
  41
  42  use Unicode::Normalize;
  43
  44  while (<>) {
  45      $_ = NFD($_);   # decompose + reorder canonically
  46      ...
  47  } continue {
  48      print NFC($_);  # recompose (where possible) + reorder canonically
  49  }
  50
  51 =head2 ℞ 2: Fine-tuning Unicode warnings
  52
  53 As of v5.14, Perl distinguishes three subclasses of UTF‑8 warnings.
  54
  55  use v5.14;                  # subwarnings unavailable any earlier
  56  no warnings "nonchar";      # the 66 forbidden non-characters
  57  no warnings "surrogate";    # UTF-16/CESU-8 nonsense
  58  no warnings "non_unicode";  # for codepoints over 0x10_FFFF
  59
  60 =head2 ℞ 3: Declare source in utf8 for identifiers and literals
  61
  62 Without the all-critical C<use utf8> declaration, putting UTF‑8 in your
  63 literals and identifiers won’t work right.  If you used the standard
  64 preamble just given above, this already happened.  If you did, you can
  65 do things like this:
  66
  67  use utf8;
  68
  69  my $measure   = "Ångström";
  70  my @μsoft     = qw( cp852 cp1251 cp1252 );
  71  my @ὑπέρμεγας = qw( ὑπέρ  μεγας );
  72  my @鯉        = qw( koi8-f koi8-u koi8-r );
  73  my $motto     = "👪 💗 🐪"; # FAMILY, GROWING HEART, DROMEDARY CAMEL
  74
  75 If you forget C<use utf8>, high bytes will be misunderstood as
  76 separate characters, and nothing will work right.
  77
  78 =head2 ℞ 4: Characters and their numbers
  79
  80 The C<ord> and C<chr> functions work transparently on all codepoints,
  81 not just on ASCII alone — nor in fact, not even just on Unicode alone.
  82
  83  # ASCII characters
  84  ord("A")
  85  chr(65)
  86
  87  # characters from the Basic Multilingual Plane
  88  ord("Σ")
  89  chr(0x3A3)
  90
  91  # beyond the BMP
  92  ord("𝑛")               # MATHEMATICAL ITALIC SMALL N
  93  chr(0x1D45B)
  94
  95  # beyond Unicode! (up to MAXINT)
  96  ord("\x{20_0000}")
  97  chr(0x20_0000)
  98
  99 =head2 ℞ 5: Unicode literals by character number
 100
 101 In an interpolated literal, whether a double-quoted string or a
 102 regex, you may specify a character by its number using the
 103 C<\x{I<HHHHHH>}> escape.
 104
 105  String: "\x{3a3}"
 106  Regex:  /\x{3a3}/
 107
 108  String: "\x{1d45b}"
 109  Regex:  /\x{1d45b}/
 110
 111  # even non-BMP ranges in regex work fine
 112  /[\x{1D434}-\x{1D467}]/
 113
 114 =head2 ℞ 6: Get character name by number
 115
 116  use charnames ();
 117  my $name = charnames::viacode(0x03A3);
 118
 119 =head2 ℞ 7: Get character number by name
 120
 121  use charnames ();
 122  my $number = charnames::vianame("GREEK CAPITAL LETTER SIGMA");
 123
 124 =head2 ℞ 8: Unicode named characters
 125
 126 Use the C<< \N{I<charname>} >> notation to get the character
 127 by that name for use in interpolated literals (double-quoted
 128 strings and regexes).  In v5.16, there is an implicit
 129
 130  use charnames qw(:full :short);
 131
 132 But prior to v5.16, you must be explicit about which set of charnames you
 133 want.  The C<:full> names are the official Unicode character name, alias, or
 134 sequence, which all share a namespace.
 135
 136  use charnames qw(:full :short latin greek);
 137
 138  "\N{MATHEMATICAL ITALIC SMALL N}"      # :full
 139  "\N{GREEK CAPITAL LETTER SIGMA}"       # :full
 140
 141 Anything else is a Perl-specific convenience abbreviation.  Specify one or
 142 more scripts by names if you want short names that are script-specific.
 143
 144  "\N{Greek:Sigma}"                      # :short
 145  "\N{ae}"                               #  latin
 146  "\N{epsilon}"                          #  greek
 147
 148 The v5.16 release also supports a C<:loose> import for loose matching of
 149 character names, which works just like loose matching of property names:
 150 that is, it disregards case, whitespace, and underscores:
 151
 152  "\N{euro sign}"                        # :loose (from v5.16)
 153
 154 =head2 ℞ 9: Unicode named sequences
 155
 156 These look just like character names but return multiple codepoints.
 157 Notice the C<%vx> vector-print functionality in C<printf>.
 158
 159  use charnames qw(:full);
 160  my $seq = "\N{LATIN CAPITAL LETTER A WITH MACRON AND GRAVE}";
 161  printf "U+%v04X\n", $seq;
 162  U+0100.0300
 163
 164 =head2 ℞ 10: Custom named characters
 165
 166 Use C<:alias> to give your own lexically scoped nicknames to existing
 167 characters, or even to give unnamed private-use characters useful names.
 168
 169  use charnames ":full", ":alias" => {
 170      ecute => "LATIN SMALL LETTER E WITH ACUTE",
 171      "APPLE LOGO" => 0xF8FF, # private use character
 172  };
 173
 174  "\N{ecute}"
 175  "\N{APPLE LOGO}"
 176
 177 =head2 ℞ 11: Names of CJK codepoints
 178
 179 Sinograms like “東京” come back with character names of
 180 C<CJK UNIFIED IDEOGRAPH-6771> and C<CJK UNIFIED IDEOGRAPH-4EAC>,
 181 because their “names” vary.  The CPAN C<Unicode::Unihan> module
 182 has a large database for decoding these (and a whole lot more), provided you
 183 know how to understand its output.
 184
 185  # cpan -i Unicode::Unihan
 186  use Unicode::Unihan;
 187  my $str = "東京";
 188  my $unhan = new Unicode::Unihan;
 189  for my $lang (qw(Mandarin Cantonese Korean JapaneseOn JapaneseKun)) {
 190      printf "CJK $str in %-12s is ", $lang;
 191      say $unhan->$lang($str);
 192  }
 193
 194 prints:
 195
 196  CJK 東京 in Mandarin     is DONG1JING1
 197  CJK 東京 in Cantonese    is dung1ging1
 198  CJK 東京 in Korean       is TONGKYENG
 199  CJK 東京 in JapaneseOn   is TOUKYOU KEI KIN
 200  CJK 東京 in JapaneseKun  is HIGASHI AZUMAMIYAKO
 201
 202 If you have a specific romanization scheme in mind,
 203 use the specific module:
 204
 205  # cpan -i Lingua::JA::Romanize::Japanese
 206  use Lingua::JA::Romanize::Japanese;
 207  my $k2r = new Lingua::JA::Romanize::Japanese;
 208  my $str = "東京";
 209  say "Japanese for $str is ", $k2r->chars($str);
 210
 211 prints
 212
 213  Japanese for 東京 is toukyou
 214
 215 =head2 ℞ 12: Explicit encode/decode
 216
 217 On rare occasion, such as a database read, you may be
 218 given encoded text you need to decode.
 219
 220   use Encode qw(encode decode);
 221
 222   my $chars = decode("shiftjis", $bytes, 1);
 223  # OR
 224   my $bytes = encode("MIME-Header-ISO_2022_JP", $chars, 1);
 225
 226 For streams all in the same encoding, don't use encode/decode; instead
 227 set the file encoding when you open the file or immediately after with
 228 C<binmode> as described later below.
 229
 230 =head2 ℞ 13: Decode program arguments as utf8
 231
 232      $ perl -CA ...
 233  or
 234      $ export PERL_UNICODE=A
 235  or
 236     use Encode qw(decode_utf8);
 237     @ARGV = map { decode_utf8($_, 1) } @ARGV;
 238
 239 =head2 ℞ 14: Decode program arguments as locale encoding
 240
 241     # cpan -i Encode::Locale
  242     use Encode qw(decode);
 243     use Encode::Locale;
 244
 245     # use "locale" as an arg to encode/decode
 246     @ARGV = map { decode(locale => $_, 1) } @ARGV;
 247
 248 =head2 ℞ 15: Declare STD{IN,OUT,ERR} to be utf8
 249
 250 Use a command-line option, an environment variable, or else
 251 call C<binmode> explicitly:
 252
 253      $ perl -CS ...
 254  or
 255      $ export PERL_UNICODE=S
 256  or
 257      use open qw(:std :utf8);
 258  or
 259      binmode(STDIN,  ":utf8");
 260      binmode(STDOUT, ":utf8");
 261      binmode(STDERR, ":utf8");
 262
 263 =head2 ℞ 16: Declare STD{IN,OUT,ERR} to be in locale encoding
 264
 265     # cpan -i Encode::Locale
 266     use Encode;
 267     use Encode::Locale;
 268
 269     # or as a stream for binmode or open
 270     binmode STDIN,  ":encoding(console_in)"  if -t STDIN;
 271     binmode STDOUT, ":encoding(console_out)" if -t STDOUT;
 272     binmode STDERR, ":encoding(console_out)" if -t STDERR;
 273
 274 =head2 ℞ 17: Make file I/O default to utf8
 275
 276 Files opened without an encoding argument will be in UTF-8:
 277
 278      $ perl -CD ...
 279  or
 280      $ export PERL_UNICODE=D
 281  or
 282      use open qw(:utf8);
 283
 284 =head2 ℞ 18: Make all I/O and args default to utf8
 285
 286      $ perl -CSDA ...
 287  or
 288      $ export PERL_UNICODE=SDA
 289  or
 290      use open qw(:std :utf8);
 291      use Encode qw(decode_utf8);
 292      @ARGV = map { decode_utf8($_, 1) } @ARGV;
 293
 294 =head2 ℞ 19: Open file with specific encoding
 295
 296 Specify stream encoding.  This is the normal way
 297 to deal with encoded text, not by calling low-level
 298 functions.
 299
 300  # input file
 301      open(my $in_file, "< :encoding(UTF-16)", "wintext");
 302  OR
 303      open(my $in_file, "<", "wintext");
 304      binmode($in_file, ":encoding(UTF-16)");
 305  THEN
 306      my $line = <$in_file>;
 307
 308  # output file
  309      open(my $out_file, "> :encoding(cp1252)", "wintext");
 310  OR
 311      open(my $out_file, ">", "wintext");
 312      binmode($out_file, ":encoding(cp1252)");
 313  THEN
 314      print $out_file "some text\n";
 315
 316 More layers than just the encoding can be specified here. For example,
 317 the incantation C<":raw :encoding(UTF-16LE) :crlf"> includes implicit
 318 CRLF handling.
 319
 320 =head2 ℞ 20: Unicode casing
 321
 322 Unicode casing is very different from ASCII casing.
 323
 324  uc("henry ⅷ")  # "HENRY Ⅷ"
 325  uc("tschüß")   # "TSCHÜSS"  notice ß => SS
 326
 327  # both are true:
 328  "tschüß"  =~ /TSCHÜSS/i   # notice ß => SS
 329  "Σίσυφος" =~ /ΣΊΣΥΦΟΣ/i   # notice Σ,σ,ς sameness
 330
 331 =head2 ℞ 21: Unicode case-insensitive comparisons
 332
 333 Also available in the CPAN L<Unicode::CaseFold> module,
 334 the new C<fc> “foldcase” function from v5.16 grants
 335 access to the same Unicode casefolding as the C</i>
 336 pattern modifier has always used:
 337
 338  use feature "fc"; # fc() function is from v5.16
 339
 340  # sort case-insensitively
 341  my @sorted = sort { fc($a) cmp fc($b) } @list;
 342
 343  # both are true:
 344  fc("tschüß")  eq fc("TSCHÜSS")
 345  fc("Σίσυφος") eq fc("ΣΊΣΥΦΟΣ")
 346
 347 =head2 ℞ 22: Match Unicode linebreak sequence in regex
 348
 349 A Unicode linebreak matches the two-character CRLF
 350 grapheme or any of seven vertical whitespace characters.
 351 Good for dealing with textfiles coming from different
 352 operating systems.
 353
 354  \R
 355
 356  s/\R/\n/g;  # normalize all linebreaks to \n
 357
 358 =head2 ℞ 23: Get character category
 359
 360 Find the general category of a numeric codepoint.
 361
 362  use Unicode::UCD qw(charinfo);
 363  my $cat = charinfo(0x3A3)->{category};  # "Lu"
 364
 365 =head2 ℞ 24: Disabling Unicode-awareness in builtin charclasses
 366
 367 Disable C<\w>, C<\b>, C<\s>, C<\d>, and the POSIX
 368 classes from working correctly on Unicode either in this
 369 scope, or in just one regex.
 370
 371  use v5.14;
 372  use re "/a";
 373
 374  # OR
 375
 376  my($num) = $str =~ /(\d+)/a;
 377
 378 Or use specific un-Unicode properties, like C<\p{ahex}>
  379 and C<\p{POSIX_Digit}>.  Properties still work normally
  380 no matter what charset modifiers (C</d /u /l /a /aa>)
  381 should be in effect.
 382
 383 =head2 ℞ 25: Match Unicode properties in regex with \p, \P
 384
 385 These all match a single codepoint with the given
 386 property.  Use C<\P> in place of C<\p> to match
 387 one codepoint lacking that property.
 388
 389  \pL, \pN, \pS, \pP, \pM, \pZ, \pC
 390  \p{Sk}, \p{Ps}, \p{Lt}
 391  \p{alpha}, \p{upper}, \p{lower}
 392  \p{Latin}, \p{Greek}
 393  \p{script=Latin}, \p{script=Greek}
 394  \p{East_Asian_Width=Wide}, \p{EA=W}
 395  \p{Line_Break=Hyphen}, \p{LB=HY}
 396  \p{Numeric_Value=4}, \p{NV=4}
 397
 398 =head2 ℞ 26: Custom character properties
 399
 400 Define at compile-time your own custom character
 401 properties for use in regexes.
 402
 403  # using private-use characters
 404  sub In_Tengwar { "E000\tE07F\n" }
 405
 406  if (/\p{In_Tengwar}/) { ... }
 407
 408  # blending existing properties
 409  sub Is_GraecoRoman_Title {<<'END_OF_SET'}
 410  +utf8::IsLatin
 411  +utf8::IsGreek
 412  &utf8::IsTitle
 413  END_OF_SET
 414
  415  if (/\p{Is_GraecoRoman_Title}/) { ... }
 416
 417 =head2 ℞ 27: Unicode normalization
 418
 419 Typically render into NFD on input and NFC on output. Using NFKC or NFKD
  420 functions improves recall on searches, assuming you've already done the same
  421 to the text to be searched. Note that this is about much more than just pre-
 422 combined compatibility glyphs; it also reorders marks according to their
 423 canonical combining classes and weeds out singletons.
 424
 425  use Unicode::Normalize;
 426  my $nfd  = NFD($orig);
 427  my $nfc  = NFC($orig);
 428  my $nfkd = NFKD($orig);
 429  my $nfkc = NFKC($orig);
 430
 431 =head2 ℞ 28: Convert non-ASCII Unicode numerics
 432
 433 Unless you’ve used C</a> or C</aa>, C<\d> matches more than
 434 ASCII digits only, but Perl’s implicit string-to-number
  435 conversion does not currently recognize these.  Here’s how to
 436 convert such strings manually.
 437
 438  use v5.14;  # needed for num() function
 439  use Unicode::UCD qw(num);
 440  my $str = "got Ⅻ and ४५६७ and ⅞ and here";
 441  my @nums = ();
  442  while ($str =~ /(\d+|\N)/g) {  # not just ASCII!
 443     push @nums, num($1);
 444  }
 445  say "@nums";   #     12      4567      0.875
 446
 447  use charnames qw(:full);
 448  my $nv = num("\N{RUMI DIGIT ONE}\N{RUMI DIGIT TWO}");
 449
 450 =head2 ℞ 29: Match Unicode grapheme cluster in regex
 451
 452 Programmer-visible “characters” are codepoints matched by C</./s>,
 453 but user-visible “characters” are graphemes matched by C</\X/>.
 454
 455  # Find vowel *plus* any combining diacritics,underlining,etc.
 456  my $nfd = NFD($orig);
 457  $nfd =~ / (?=[aeiou]) \X /xi
 458
 459 =head2 ℞ 30: Extract by grapheme instead of by codepoint (regex)
 460
 461  # match and grab five first graphemes
 462  my($first_five) = $str =~ /^ ( \X{5} ) /x;
 463
 464 =head2 ℞ 31: Extract by grapheme instead of by codepoint (substr)
 465
 466  # cpan -i Unicode::GCString
 467  use Unicode::GCString;
 468  my $gcs = Unicode::GCString->new($str);
 469  my $first_five = $gcs->substr(0, 5);
 470
 471 =head2 ℞ 32: Reverse string by grapheme
 472
 473 Reversing by codepoint messes up diacritics, mistakenly converting
 474 C<crème brûlée> into C<éel̂urb em̀erc> instead of into C<eélûrb emèrc>;
 475 so reverse by grapheme instead.  Both these approaches work
 476 right no matter what normalization the string is in:
 477
 478  $str = join("", reverse $str =~ /\X/g);
 479
 480  # OR: cpan -i Unicode::GCString
 481  use Unicode::GCString;
 482  $str = reverse Unicode::GCString->new($str);
 483
 484 =head2 ℞ 33: String length in graphemes
 485
 486 The string C<brûlée> has six graphemes but up to eight codepoints.
 487 This counts by grapheme, not by codepoint:
 488
 489  my $str = "brûlée";
 490  my $count = 0;
 491  while ($str =~ /\X/g) { $count++ }
 492
 493   # OR: cpan -i Unicode::GCString
 494  use Unicode::GCString;
 495  my $gcs = Unicode::GCString->new($str);
 496  my $count = $gcs->length;
 497
 498 =head2 ℞ 34: Unicode column-width for printing
 499
 500 Perl’s C<printf>, C<sprintf>, and C<format> think all
 501 codepoints take up 1 print column, but many take 0 or 2.
 502 Here to show that normalization makes no difference,
 503 we print out both forms:
 504
 505  use Unicode::GCString;
 506  use Unicode::Normalize;
 507
 508  my @words = qw/crème brûlée/;
 509  @words = map { NFC($_), NFD($_) } @words;
 510
 511  for my $str (@words) {
 512      my $gcs = Unicode::GCString->new($str);
 513      my $cols = $gcs->columns;
 514      my $pad = " " x (10 - $cols);
  515      say $str, $pad, " |";
 516  }
 517
 518 generates this to show that it pads correctly no matter
 519 the normalization:
 520
 521  crème      |
 522  crème      |
 523  brûlée     |
 524  brûlée     |
 525
 526 =head2 ℞ 35: Unicode collation
 527
 528 Text sorted by numeric codepoint follows no reasonable alphabetic order;
 529 use the UCA for sorting text.
 530
 531  use Unicode::Collate;
 532  my $col = Unicode::Collate->new();
 533  my @list = $col->sort(@old_list);
 534
 535 See the I<ucsort> program from the L<Unicode::Tussle> CPAN module
 536 for a convenient command-line interface to this module.
 537
 538 =head2 ℞ 36: Case- I<and> accent-insensitive Unicode sort
 539
 540 Specify a collation strength of level 1 to ignore case and
 541 diacritics, only looking at the basic character.
 542
 543  use Unicode::Collate;
 544  my $col = Unicode::Collate->new(level => 1);
 545  my @list = $col->sort(@old_list);
 546
 547 =head2 ℞ 37: Unicode locale collation
 548
 549 Some locales have special sorting rules.
 550
 551  # either use v5.12, OR: cpan -i Unicode::Collate::Locale
 552  use Unicode::Collate::Locale;
 553  my $col = Unicode::Collate::Locale->new(locale => "de__phonebook");
 554  my @list = $col->sort(@old_list);
 555
 556 The I<ucsort> program mentioned above accepts a C<--locale> parameter.
 557
 558 =head2 ℞ 38: Making C<cmp> work on text instead of codepoints
 559
 560 Instead of this:
 561
 562  @srecs = sort {
 563      $b->{AGE}   <=>  $a->{AGE}
 564                  ||
 565      $a->{NAME}  cmp  $b->{NAME}
 566  } @recs;
 567
 568 Use this:
 569
 570  my $coll = Unicode::Collate->new();
 571  for my $rec (@recs) {
 572      $rec->{NAME_key} = $coll->getSortKey( $rec->{NAME} );
 573  }
 574  @srecs = sort {
 575      $b->{AGE}       <=>  $a->{AGE}
 576                      ||
 577      $a->{NAME_key}  cmp  $b->{NAME_key}
 578  } @recs;
 579
 580 =head2 ℞ 39: Case- I<and> accent-insensitive comparisons
 581
 582 Use a collator object to compare Unicode text by character
 583 instead of by codepoint.
 584
 585  use Unicode::Collate;
 586  my $es = Unicode::Collate->new(
 587      level => 1,
 588      normalization => undef
 589  );
 590
 591   # now both are true:
 592  $es->eq("García",  "GARCIA" );
 593  $es->eq("Márquez", "MARQUEZ");
 594
 595 =head2 ℞ 40: Case- I<and> accent-insensitive locale comparisons
 596
 597 Same, but in a specific locale.
 598
 599  my $de = Unicode::Collate::Locale->new(
 600             locale => "de__phonebook",
 601           );
 602
 603  # now this is true:
 604  $de->eq("tschüß", "TSCHUESS");  # notice ü => UE, ß => SS
 605
 606 =head2 ℞ 41: Unicode linebreaking
 607
 608 Break up text into lines according to Unicode rules.
 609
 610  # cpan -i Unicode::LineBreak
 611  use Unicode::LineBreak;
 612  use charnames qw(:full);
 613
 614  my $para = "This is a super\N{HYPHEN}long string. " x 20;
 615  my $fmt = new Unicode::LineBreak;
 616  print $fmt->break($para), "\n";
 617
 618 =head2 ℞ 42: Unicode text in DBM hashes, the tedious way
 619
 620 Using a regular Perl string as a key or value for a DBM
 621 hash will trigger a wide character exception if any codepoints
 622 won’t fit into a byte.  Here’s how to manually manage the translation:
 623
 624     use DB_File;
 625     use Encode qw(encode decode);
 626     tie %dbhash, "DB_File", "pathname";
 627
 628  # STORE
 629
 630     # assume $uni_key and $uni_value are abstract Unicode strings
 631     my $enc_key   = encode("UTF-8", $uni_key, 1);
 632     my $enc_value = encode("UTF-8", $uni_value, 1);
 633     $dbhash{$enc_key} = $enc_value;
 634
 635  # FETCH
 636
 637     # assume $uni_key holds a normal Perl string (abstract Unicode)
 638     my $enc_key   = encode("UTF-8", $uni_key, 1);
 639     my $enc_value = $dbhash{$enc_key};
  640     my $uni_value = decode("UTF-8", $enc_value, 1);
 641
 642 =head2 ℞ 43: Unicode text in DBM hashes, the easy way
 643
 644 Here’s how to implicitly manage the translation; all encoding
 645 and decoding is done automatically, just as with streams that
 646 have a particular encoding attached to them:
 647
 648     use DB_File;
 649     use DBM_Filter;
 650
 651     my $dbobj = tie %dbhash, "DB_File", "pathname";
 652     $dbobj->Filter_Value("utf8");  # this is the magic bit
 653
 654  # STORE
 655
 656     # assume $uni_key and $uni_value are abstract Unicode strings
 657     $dbhash{$uni_key} = $uni_value;
 658
 659   # FETCH
 660
 661     # $uni_key holds a normal Perl string (abstract Unicode)
 662     my $uni_value = $dbhash{$uni_key};
 663
 664 =head2 ℞ 44: PROGRAM: Demo of Unicode collation and printing
 665
 666 Here’s a full program showing how to make use of locale-sensitive
 667 sorting, Unicode casing, and managing print widths when some of the
 668 characters take up zero or two columns, not just one column each time.
 669 When run, the following program produces this nicely aligned output:
 670
 671     Crème Brûlée....... €2.00
 672     Éclair............. €1.60
 673     Fideuà............. €4.20
 674     Hamburger.......... €6.00
 675     Jamón Serrano...... €4.45
 676     Linguiça........... €7.00
 677     Pâté............... €4.15
 678     Pears.............. €2.00
 679     Pêches............. €2.25
 680     Smørbrød........... €5.75
 681     Spätzle............ €5.50
 682     Xoriço............. €3.00
 683     Γύρος.............. €6.50
 684     막걸리............. €4.00
 685     おもち............. €2.65
 686     お好み焼き......... €8.00
 687     シュークリーム..... €1.85
 688     寿司............... €9.99
 689     包子............... €7.50
 690
 691 Here's that program; tested on v5.14.
 692
 693  #!/usr/bin/env perl
 694  # umenu - demo sorting and printing of Unicode food
 695  #
 696  # (obligatory and increasingly long preamble)
 697  #
 698  use utf8;
 699  use v5.14;                       # for locale sorting
 700  use strict;
 701  use warnings;
 702  use warnings  qw(FATAL utf8);    # fatalize encoding faults
 703  use open      qw(:std :utf8);    # undeclared streams in UTF-8
 704  use charnames qw(:full :short);  # unneeded in v5.16
 705
 706  # std modules
 707  use Unicode::Normalize;          # std perl distro as of v5.8
 708  use List::Util qw(max);          # std perl distro as of v5.10
 709  use Unicode::Collate::Locale;    # std perl distro as of v5.14
 710
 711  # cpan modules
 712  use Unicode::GCString;           # from CPAN
 713
 714  # forward defs
 715  sub pad($$$);
 716  sub colwidth(_);
 717  sub entitle(_);
 718
 719  my %price = (
 720      "γύρος"             => 6.50, # gyros
 721      "pears"             => 2.00, # like um, pears
 722      "linguiça"          => 7.00, # spicy sausage, Portuguese
 723      "xoriço"            => 3.00, # chorizo sausage, Catalan
 724      "hamburger"         => 6.00, # burgermeister meisterburger
 725      "éclair"            => 1.60, # dessert, French
 726      "smørbrød"          => 5.75, # sandwiches, Norwegian
 727      "spätzle"           => 5.50, # Bayerisch noodles, little sparrows
 728      "包子"              => 7.50, # bao1 zi5, steamed pork buns, Mandarin
 729      "jamón serrano"     => 4.45, # country ham, Spanish
 730      "pêches"            => 2.25, # peaches, French
 731      "シュークリーム"    => 1.85, # cream-filled pastry like eclair
 732      "막걸리"            => 4.00, # makgeolli, Korean rice wine
 733      "寿司"              => 9.99, # sushi, Japanese
 734      "おもち"            => 2.65, # omochi, rice cakes, Japanese
 735      "crème brûlée"      => 2.00, # crema catalana
 736      "fideuà"            => 4.20, # more noodles, Valencian
 737                                   # (Catalan=fideuada)
 738      "pâté"              => 4.15, # gooseliver paste, French
 739      "お好み焼き"        => 8.00, # okonomiyaki, Japanese
 740  );
 741
 742  my $width = 5 + max map { colwidth } keys %price;
 743
 744  # So the Asian stuff comes out in an order that someone
 745  # who reads those scripts won't freak out over; the
 746  # CJK stuff will be in JIS X 0208 order that way.
 747  my $coll  = new Unicode::Collate::Locale locale => "ja";
 748
 749  for my $item ($coll->sort(keys %price)) {
 750      print pad(entitle($item), $width, ".");
 751      printf " €%.2f\n", $price{$item};
 752  }
 753
 754  sub pad($$$) {
 755      my($str, $width, $padchar) = @_;
 756      return $str . ($padchar x ($width - colwidth($str)));
 757  }
 758
 759  sub colwidth(_) {
 760      my($str) = @_;
 761      return Unicode::GCString->new($str)->columns;
 762  }
 763
 764  sub entitle(_) {
 765      my($str) = @_;
 766      $str =~ s{ (?=\pL)(\S)     (\S*) }
 767               { ucfirst($1) . lc($2)  }xge;
 768      return $str;
 769  }
 770
 771 =head1 SEE ALSO
 772
 773 See these manpages, some of which are CPAN modules:
 774 L<perlunicode>, L<perluniprops>,
 775 L<perlre>, L<perlrecharclass>,
 776 L<perluniintro>, L<perlunitut>, L<perlunifaq>,
 777 L<PerlIO>, L<DB_File>, L<DBM_Filter>, L<DBM_Filter::utf8>,
 778 L<Encode>, L<Encode::Locale>,
 779 L<Unicode::UCD>,
 780 L<Unicode::Normalize>,
 781 L<Unicode::GCString>, L<Unicode::LineBreak>,
 782 L<Unicode::Collate>, L<Unicode::Collate::Locale>,
 783 L<Unicode::Unihan>,
 784 L<Unicode::CaseFold>,
 785 L<Unicode::Tussle>,
 786 L<Lingua::JA::Romanize::Japanese>,
 787 L<Lingua::ZH::Romanize::Pinyin>,
 788 L<Lingua::KO::Romanize::Hangul>.
 789
 790 The L<Unicode::Tussle> CPAN module includes many programs
 791 to help with working with Unicode, including
 792 these programs to fully or partly replace standard utilities:
 793 I<tcgrep> instead of I<egrep>,
 794 I<uniquote> instead of I<cat -v> or I<hexdump>,
 795 I<uniwc> instead of I<wc>,
 796 I<unilook> instead of I<look>,
 797 I<unifmt> instead of I<fmt>,
 798 and
 799 I<ucsort> instead of I<sort>.
 800 For exploring Unicode character names and character properties,
 801 see its I<uniprops>, I<unichars>, and I<uninames> programs.
 802 It also supplies these programs, all of which are general filters that do Unicode-y things:
 803 I<unititle> and I<unicaps>;
 804 I<uniwide> and I<uninarrow>;
 805 I<unisupers> and I<unisubs>;
 806 I<nfd>, I<nfc>, I<nfkd>, and I<nfkc>;
 807 and I<uc>, I<lc>, and I<tc>.
 808
 809 Finally, see the published Unicode Standard (page numbers are from version
 810 6.0.0), including these specific annexes and technical reports:
 811
 812 =over
 813
 814 =item §3.13 Default Case Algorithms, page 113;
 815 §4.2  Case, pages 120–122;
  816 Case Mappings, pages 166–172, especially Caseless Matching starting on page 170.
 817
 818 =item UAX #44: Unicode Character Database
 819
 820 =item UTS #18: Unicode Regular Expressions
 821
 822 =item UAX #15: Unicode Normalization Forms
 823
 824 =item UTS #10: Unicode Collation Algorithm
 825
 826 =item UAX #29: Unicode Text Segmentation
 827
 828 =item UAX #14: Unicode Line Breaking Algorithm
 829
 830 =item UAX #11: East Asian Width
 831
 832 =back
 833
 834 =head1 AUTHOR
 835
 836 Tom Christiansen E<lt>tchrist@perl.comE<gt> wrote this, with occasional
 837 kibbitzing from Larry Wall and Jeffrey Friedl in the background.
 838
 839 =head1 COPYRIGHT AND LICENCE
 840
 841 Copyright © 2012 Tom Christiansen.
 842
 843 This program is free software; you may redistribute it and/or modify it
 844 under the same terms as Perl itself.
 845
 846 Most of these examples taken from the current edition of the “Camel Book”;
 847 that is, from the 4ᵗʰ Edition of I<Programming Perl>, Copyright © 2012 Tom
  848 Christiansen I<et al.>, 2012-02-13 by O’Reilly Media.  The code itself is
 849 freely redistributable, and you are encouraged to transplant, fold,
 850 spindle, and mutilate any of the examples in this manpage however you please
 851 for inclusion into your own programs without any encumbrance whatsoever.
 852 Acknowledgement via code comment is polite but not required.
 853
 854 =head1 REVISION HISTORY
 855
 856 v1.0.0 – first public release, 2012-02-27
 857