pod/perlunicook.pod

   1
   2 =encoding utf8
   3
   4 =head1 NAME
   5
   6 perlunicook - cookbookish examples of handling Unicode in Perl
   7
   8 =head1 DESCRIPTION
   9
  10 This manpage contains short recipes demonstrating how to handle common Unicode
  11 operations in Perl, plus one complete program at the end. Any undeclared
  12 variables in individual recipes are assumed to have a previous appropriate
  13 value in them.
  14
  15 =head1 EXAMPLES
  16
  17 =head2 ℞ 0: Standard preamble
  18
  19 Unless otherwise notes, all examples below require this standard preamble
  20 to work correctly, with the C<#!> adjusted to work on your system:
  21
  22  #!/usr/bin/env perl
  23
  24  use utf8;      # so literals and identifiers can be in UTF-8
  25  use v5.12;     # or later to get "unicode_strings" feature
  26  use strict;    # quote strings, declare variables
  27  use warnings;  # on by default
  28  use warnings  qw(FATAL utf8);    # fatalize encoding glitches
  29  use open      qw(:std :encoding(UTF-8)); # undeclared streams in UTF-8
  30  use charnames qw(:full :short);  # unneeded in v5.16
  31
  32 This I<does> make even Unix programmers C<binmode> your binary streams,
  33 or open them with C<:raw>, but that's the only way to get at them
  34 portably anyway.
  35
  36 B<WARNING>: C<use autodie> (pre 2.26) and C<use open> do not get along with each
  37 other.
  38
  39 =head2 ℞ 1: Generic Unicode-savvy filter
  40
  41 Always decompose on the way in, then recompose on the way out.
  42
  43  use Unicode::Normalize;
  44
  45  while (<>) {
  46      $_ = NFD($_);   # decompose + reorder canonically
  47      ...
  48  } continue {
  49      print NFC($_);  # recompose (where possible) + reorder canonically
  50  }
  51
  52 =head2 ℞ 2: Fine-tuning Unicode warnings
  53
  54 As of v5.14, Perl distinguishes three subclasses of UTF‑8 warnings.
  55
  56  use v5.14;                  # subwarnings unavailable any earlier
  57  no warnings "nonchar";      # the 66 forbidden non-characters
  58  no warnings "surrogate";    # UTF-16/CESU-8 nonsense
  59  no warnings "non_unicode";  # for codepoints over 0x10_FFFF
  60
  61 =head2 ℞ 3: Declare source in utf8 for identifiers and literals
  62
  63 Without the all-critical C<use utf8> declaration, putting UTF‑8 in your
  64 literals and identifiers won’t work right.  If you used the standard
  65 preamble just given above, this already happened.  If you did, you can
  66 do things like this:
  67
  68  use utf8;
  69
  70  my $measure   = "Ångström";
  71  my @μsoft     = qw( cp852 cp1251 cp1252 );
  72  my @ὑπέρμεγας = qw( ὑπέρ  μεγας );
  73  my @鯉        = qw( koi8-f koi8-u koi8-r );
  74  my $motto     = "👪 💗 🐪"; # FAMILY, GROWING HEART, DROMEDARY CAMEL
  75
  76 If you forget C<use utf8>, high bytes will be misunderstood as
  77 separate characters, and nothing will work right.
  78
  79 =head2 ℞ 4: Characters and their numbers
  80
  81 The C<ord> and C<chr> functions work transparently on all codepoints,
  82 not just on ASCII alone — nor in fact, not even just on Unicode alone.
  83
  84  # ASCII characters
  85  ord("A")
  86  chr(65)
  87
  88  # characters from the Basic Multilingual Plane
  89  ord("Σ")
  90  chr(0x3A3)
  91
  92  # beyond the BMP
  93  ord("𝑛")               # MATHEMATICAL ITALIC SMALL N
  94  chr(0x1D45B)
  95
  96  # beyond Unicode! (up to MAXINT)
  97  ord("\x{20_0000}")
  98  chr(0x20_0000)
  99
 100 =head2 ℞ 5: Unicode literals by character number
 101
 102 In an interpolated literal, whether a double-quoted string or a
 103 regex, you may specify a character by its number using the
 104 C<\x{I<HHHHHH>}> escape.
 105
 106  String: "\x{3a3}"
 107  Regex:  /\x{3a3}/
 108
 109  String: "\x{1d45b}"
 110  Regex:  /\x{1d45b}/
 111
 112  # even non-BMP ranges in regex work fine
 113  /[\x{1D434}-\x{1D467}]/
 114
 115 =head2 ℞ 6: Get character name by number
 116
 117  use charnames ();
 118  my $name = charnames::viacode(0x03A3);
 119
 120 =head2 ℞ 7: Get character number by name
 121
 122  use charnames ();
 123  my $number = charnames::vianame("GREEK CAPITAL LETTER SIGMA");
 124
 125 =head2 ℞ 8: Unicode named characters
 126
 127 Use the C<< \N{I<charname>} >> notation to get the character
 128 by that name for use in interpolated literals (double-quoted
 129 strings and regexes).  In v5.16, there is an implicit
 130
 131  use charnames qw(:full :short);
 132
 133 But prior to v5.16, you must be explicit about which set of charnames you
 134 want.  The C<:full> names are the official Unicode character name, alias, or
 135 sequence, which all share a namespace.
 136
 137  use charnames qw(:full :short latin greek);
 138
 139  "\N{MATHEMATICAL ITALIC SMALL N}"      # :full
 140  "\N{GREEK CAPITAL LETTER SIGMA}"       # :full
 141
 142 Anything else is a Perl-specific convenience abbreviation.  Specify one or
 143 more scripts by names if you want short names that are script-specific.
 144
 145  "\N{Greek:Sigma}"                      # :short
 146  "\N{ae}"                               #  latin
 147  "\N{epsilon}"                          #  greek
 148
 149 The v5.16 release also supports a C<:loose> import for loose matching of
 150 character names, which works just like loose matching of property names:
 151 that is, it disregards case, whitespace, and underscores:
 152
 153  "\N{euro sign}"                        # :loose (from v5.16)
 154
 155 =head2 ℞ 9: Unicode named sequences
 156
 157 These look just like character names but return multiple codepoints.
 158 Notice the C<%vx> vector-print functionality in C<printf>.
 159
 160  use charnames qw(:full);
 161  my $seq = "\N{LATIN CAPITAL LETTER A WITH MACRON AND GRAVE}";
 162  printf "U+%v04X\n", $seq;
 163  U+0100.0300
 164
 165 =head2 ℞ 10: Custom named characters
 166
 167 Use C<:alias> to give your own lexically scoped nicknames to existing
 168 characters, or even to give unnamed private-use characters useful names.
 169
 170  use charnames ":full", ":alias" => {
 171      ecute => "LATIN SMALL LETTER E WITH ACUTE",
 172      "APPLE LOGO" => 0xF8FF, # private use character
 173  };
 174
 175  "\N{ecute}"
 176  "\N{APPLE LOGO}"
 177
 178 =head2 ℞ 11: Names of CJK codepoints
 179
 180 Sinograms like “東京” come back with character names of
 181 C<CJK UNIFIED IDEOGRAPH-6771> and C<CJK UNIFIED IDEOGRAPH-4EAC>,
 182 because their “names” vary.  The CPAN C<Unicode::Unihan> module
 183 has a large database for decoding these (and a whole lot more), provided you
 184 know how to understand its output.
 185
 186  # cpan -i Unicode::Unihan
 187  use Unicode::Unihan;
 188  my $str = "東京";
 189  my $unhan = Unicode::Unihan->new;
 190  for my $lang (qw(Mandarin Cantonese Korean JapaneseOn JapaneseKun)) {
 191      printf "CJK $str in %-12s is ", $lang;
 192      say $unhan->$lang($str);
 193  }
 194
 195 prints:
 196
 197  CJK 東京 in Mandarin     is DONG1JING1
 198  CJK 東京 in Cantonese    is dung1ging1
 199  CJK 東京 in Korean       is TONGKYENG
 200  CJK 東京 in JapaneseOn   is TOUKYOU KEI KIN
 201  CJK 東京 in JapaneseKun  is HIGASHI AZUMAMIYAKO
 202
 203 If you have a specific romanization scheme in mind,
 204 use the specific module:
 205
 206  # cpan -i Lingua::JA::Romanize::Japanese
 207  use Lingua::JA::Romanize::Japanese;
 208  my $k2r = Lingua::JA::Romanize::Japanese->new;
 209  my $str = "東京";
 210  say "Japanese for $str is ", $k2r->chars($str);
 211
 212 prints
 213
 214  Japanese for 東京 is toukyou
 215
 216 =head2 ℞ 12: Explicit encode/decode
 217
 218 On rare occasion, such as a database read, you may be
 219 given encoded text you need to decode.
 220
 221   use Encode qw(encode decode);
 222
 223   my $chars = decode("shiftjis", $bytes, 1);
 224  # OR
 225   my $bytes = encode("MIME-Header-ISO_2022_JP", $chars, 1);
 226
 227 For streams all in the same encoding, don't use encode/decode; instead
 228 set the file encoding when you open the file or immediately after with
 229 C<binmode> as described later below.
 230
 231 =head2 ℞ 13: Decode program arguments as utf8
 232
 233      $ perl -CA ...
 234  or
 235      $ export PERL_UNICODE=A
 236  or
 237     use Encode qw(decode);
 238     @ARGV = map { decode('UTF-8', $_, 1) } @ARGV;
 239
 240 =head2 ℞ 14: Decode program arguments as locale encoding
 241
 242     # cpan -i Encode::Locale
 243     use Encode qw(locale);
 244     use Encode::Locale;
 245
 246     # use "locale" as an arg to encode/decode
 247     @ARGV = map { decode(locale => $_, 1) } @ARGV;
 248
 249 =head2 ℞ 15: Declare STD{IN,OUT,ERR} to be utf8
 250
 251 Use a command-line option, an environment variable, or else
 252 call C<binmode> explicitly:
 253
 254      $ perl -CS ...
 255  or
 256      $ export PERL_UNICODE=S
 257  or
 258      use open qw(:std :encoding(UTF-8));
 259  or
 260      binmode(STDIN,  ":encoding(UTF-8)");
 261      binmode(STDOUT, ":utf8");
 262      binmode(STDERR, ":utf8");
 263
 264 =head2 ℞ 16: Declare STD{IN,OUT,ERR} to be in locale encoding
 265
 266     # cpan -i Encode::Locale
 267     use Encode;
 268     use Encode::Locale;
 269
 270     # or as a stream for binmode or open
 271     binmode STDIN,  ":encoding(console_in)"  if -t STDIN;
 272     binmode STDOUT, ":encoding(console_out)" if -t STDOUT;
 273     binmode STDERR, ":encoding(console_out)" if -t STDERR;
 274
 275 =head2 ℞ 17: Make file I/O default to utf8
 276
 277 Files opened without an encoding argument will be in UTF-8:
 278
 279      $ perl -CD ...
 280  or
 281      $ export PERL_UNICODE=D
 282  or
 283      use open qw(:encoding(UTF-8));
 284
 285 =head2 ℞ 18: Make all I/O and args default to utf8
 286
 287      $ perl -CSDA ...
 288  or
 289      $ export PERL_UNICODE=SDA
 290  or
 291      use open qw(:std :encoding(UTF-8));
 292      use Encode qw(decode);
 293      @ARGV = map { decode('UTF-8', $_, 1) } @ARGV;
 294
 295 =head2 ℞ 19: Open file with specific encoding
 296
 297 Specify stream encoding.  This is the normal way
 298 to deal with encoded text, not by calling low-level
 299 functions.
 300
 301  # input file
 302      open(my $in_file, "< :encoding(UTF-16)", "wintext");
 303  OR
 304      open(my $in_file, "<", "wintext");
 305      binmode($in_file, ":encoding(UTF-16)");
 306  THEN
 307      my $line = <$in_file>;
 308
 309  # output file
 310      open($out_file, "> :encoding(cp1252)", "wintext");
 311  OR
 312      open(my $out_file, ">", "wintext");
 313      binmode($out_file, ":encoding(cp1252)");
 314  THEN
 315      print $out_file "some text\n";
 316
 317 More layers than just the encoding can be specified here. For example,
 318 the incantation C<":raw :encoding(UTF-16LE) :crlf"> includes implicit
 319 CRLF handling.
 320
 321 =head2 ℞ 20: Unicode casing
 322
 323 Unicode casing is very different from ASCII casing.
 324
 325  uc("henry ⅷ")  # "HENRY Ⅷ"
 326  uc("tschüß")   # "TSCHÜSS"  notice ß => SS
 327
 328  # both are true:
 329  "tschüß"  =~ /TSCHÜSS/i   # notice ß => SS
 330  "Σίσυφος" =~ /ΣΊΣΥΦΟΣ/i   # notice Σ,σ,ς sameness
 331
 332 =head2 ℞ 21: Unicode case-insensitive comparisons
 333
 334 Also available in the CPAN L<Unicode::CaseFold> module,
 335 the new C<fc> “foldcase” function from v5.16 grants
 336 access to the same Unicode casefolding as the C</i>
 337 pattern modifier has always used:
 338
 339  use feature "fc"; # fc() function is from v5.16
 340
 341  # sort case-insensitively
 342  my @sorted = sort { fc($a) cmp fc($b) } @list;
 343
 344  # both are true:
 345  fc("tschüß")  eq fc("TSCHÜSS")
 346  fc("Σίσυφος") eq fc("ΣΊΣΥΦΟΣ")
 347
 348 =head2 ℞ 22: Match Unicode linebreak sequence in regex
 349
 350 A Unicode linebreak matches the two-character CRLF
 351 grapheme or any of seven vertical whitespace characters.
 352 Good for dealing with textfiles coming from different
 353 operating systems.
 354
 355  \R
 356
 357  s/\R/\n/g;  # normalize all linebreaks to \n
 358
 359 =head2 ℞ 23: Get character category
 360
 361 Find the general category of a numeric codepoint.
 362
 363  use Unicode::UCD qw(charinfo);
 364  my $cat = charinfo(0x3A3)->{category};  # "Lu"
 365
 366 =head2 ℞ 24: Disabling Unicode-awareness in builtin charclasses
 367
 368 Disable C<\w>, C<\b>, C<\s>, C<\d>, and the POSIX
 369 classes from working correctly on Unicode either in this
 370 scope, or in just one regex.
 371
 372  use v5.14;
 373  use re "/a";
 374
 375  # OR
 376
 377  my($num) = $str =~ /(\d+)/a;
 378
 379 Or use specific un-Unicode properties, like C<\p{ahex}>
 380 and C<\p{POSIX_Digit>}.  Properties still work normally
 381 no matter what charset modifiers (C</d /u /l /a /aa>)
 382 should be effect.
 383
 384 =head2 ℞ 25: Match Unicode properties in regex with \p, \P
 385
 386 These all match a single codepoint with the given
 387 property.  Use C<\P> in place of C<\p> to match
 388 one codepoint lacking that property.
 389
 390  \pL, \pN, \pS, \pP, \pM, \pZ, \pC
 391  \p{Sk}, \p{Ps}, \p{Lt}
 392  \p{alpha}, \p{upper}, \p{lower}
 393  \p{Latin}, \p{Greek}
 394  \p{script_extensions=Latin}, \p{scx=Greek}
 395  \p{East_Asian_Width=Wide}, \p{EA=W}
 396  \p{Line_Break=Hyphen}, \p{LB=HY}
 397  \p{Numeric_Value=4}, \p{NV=4}
 398
 399 =head2 ℞ 26: Custom character properties
 400
 401 Define at compile-time your own custom character
 402 properties for use in regexes.
 403
 404  # using private-use characters
 405  sub In_Tengwar { "E000\tE07F\n" }
 406
 407  if (/\p{In_Tengwar}/) { ... }
 408
 409  # blending existing properties
 410  sub Is_GraecoRoman_Title {<<'END_OF_SET'}
 411  +utf8::IsLatin
 412  +utf8::IsGreek
 413  &utf8::IsTitle
 414  END_OF_SET
 415
 416  if (/\p{Is_GraecoRoman_Title}/ { ... }
 417
 418 =head2 ℞ 27: Unicode normalization
 419
 420 Typically render into NFD on input and NFC on output. Using NFKC or NFKD
 421 functions improves recall on searches, assuming you've already done to the
 422 same text to be searched. Note that this is about much more than just pre-
 423 combined compatibility glyphs; it also reorders marks according to their
 424 canonical combining classes and weeds out singletons.
 425
 426  use Unicode::Normalize;
 427  my $nfd  = NFD($orig);
 428  my $nfc  = NFC($orig);
 429  my $nfkd = NFKD($orig);
 430  my $nfkc = NFKC($orig);
 431
 432 =head2 ℞ 28: Convert non-ASCII Unicode numerics
 433
 434 Unless you’ve used C</a> or C</aa>, C<\d> matches more than
 435 ASCII digits only, but Perl’s implicit string-to-number
 436 conversion does not current recognize these.  Here’s how to
 437 convert such strings manually.
 438
 439  use v5.14;  # needed for num() function
 440  use Unicode::UCD qw(num);
 441  my $str = "got Ⅻ and ४५६७ and ⅞ and here";
 442  my @nums = ();
 443  while ($str =~ /(\d+|\N)/g) {  # not just ASCII!
 444     push @nums, num($1);
 445  }
 446  say "@nums";   #     12      4567      0.875
 447
 448  use charnames qw(:full);
 449  my $nv = num("\N{RUMI DIGIT ONE}\N{RUMI DIGIT TWO}");
 450
 451 =head2 ℞ 29: Match Unicode grapheme cluster in regex
 452
 453 Programmer-visible “characters” are codepoints matched by C</./s>,
 454 but user-visible “characters” are graphemes matched by C</\X/>.
 455
 456  # Find vowel *plus* any combining diacritics,underlining,etc.
 457  my $nfd = NFD($orig);
 458  $nfd =~ / (?=[aeiou]) \X /xi
 459
 460 =head2 ℞ 30: Extract by grapheme instead of by codepoint (regex)
 461
 462  # match and grab five first graphemes
 463  my($first_five) = $str =~ /^ ( \X{5} ) /x;
 464
 465 =head2 ℞ 31: Extract by grapheme instead of by codepoint (substr)
 466
 467  # cpan -i Unicode::GCString
 468  use Unicode::GCString;
 469  my $gcs = Unicode::GCString->new($str);
 470  my $first_five = $gcs->substr(0, 5);
 471
 472 =head2 ℞ 32: Reverse string by grapheme
 473
 474 Reversing by codepoint messes up diacritics, mistakenly converting
 475 C<crème brûlée> into C<éel̂urb em̀erc> instead of into C<eélûrb emèrc>;
 476 so reverse by grapheme instead.  Both these approaches work
 477 right no matter what normalization the string is in:
 478
 479  $str = join("", reverse $str =~ /\X/g);
 480
 481  # OR: cpan -i Unicode::GCString
 482  use Unicode::GCString;
 483  $str = reverse Unicode::GCString->new($str);
 484
 485 =head2 ℞ 33: String length in graphemes
 486
 487 The string C<brûlée> has six graphemes but up to eight codepoints.
 488 This counts by grapheme, not by codepoint:
 489
 490  my $str = "brûlée";
 491  my $count = 0;
 492  while ($str =~ /\X/g) { $count++ }
 493
 494   # OR: cpan -i Unicode::GCString
 495  use Unicode::GCString;
 496  my $gcs = Unicode::GCString->new($str);
 497  my $count = $gcs->length;
 498
 499 =head2 ℞ 34: Unicode column-width for printing
 500
 501 Perl’s C<printf>, C<sprintf>, and C<format> think all
 502 codepoints take up 1 print column, but many take 0 or 2.
 503 Here to show that normalization makes no difference,
 504 we print out both forms:
 505
 506  use Unicode::GCString;
 507  use Unicode::Normalize;
 508
 509  my @words = qw/crème brûlée/;
 510  @words = map { NFC($_), NFD($_) } @words;
 511
 512  for my $str (@words) {
 513      my $gcs = Unicode::GCString->new($str);
 514      my $cols = $gcs->columns;
 515      my $pad = " " x (10 - $cols);
 516      say str, $pad, " |";
 517  }
 518
 519 generates this to show that it pads correctly no matter
 520 the normalization:
 521
 522  crème      |
 523  crème      |
 524  brûlée     |
 525  brûlée     |
 526
 527 =head2 ℞ 35: Unicode collation
 528
 529 Text sorted by numeric codepoint follows no reasonable alphabetic order;
 530 use the UCA for sorting text.
 531
 532  use Unicode::Collate;
 533  my $col = Unicode::Collate->new();
 534  my @list = $col->sort(@old_list);
 535
 536 See the I<ucsort> program from the L<Unicode::Tussle> CPAN module
 537 for a convenient command-line interface to this module.
 538
 539 =head2 ℞ 36: Case- I<and> accent-insensitive Unicode sort
 540
 541 Specify a collation strength of level 1 to ignore case and
 542 diacritics, only looking at the basic character.
 543
 544  use Unicode::Collate;
 545  my $col = Unicode::Collate->new(level => 1);
 546  my @list = $col->sort(@old_list);
 547
 548 =head2 ℞ 37: Unicode locale collation
 549
 550 Some locales have special sorting rules.
 551
 552  # either use v5.12, OR: cpan -i Unicode::Collate::Locale
 553  use Unicode::Collate::Locale;
 554  my $col = Unicode::Collate::Locale->new(locale => "de__phonebook");
 555  my @list = $col->sort(@old_list);
 556
 557 The I<ucsort> program mentioned above accepts a C<--locale> parameter.
 558
 559 =head2 ℞ 38: Making C<cmp> work on text instead of codepoints
 560
 561 Instead of this:
 562
 563  @srecs = sort {
 564      $b->{AGE}   <=>  $a->{AGE}
 565                  ||
 566      $a->{NAME}  cmp  $b->{NAME}
 567  } @recs;
 568
 569 Use this:
 570
 571  my $coll = Unicode::Collate->new();
 572  for my $rec (@recs) {
 573      $rec->{NAME_key} = $coll->getSortKey( $rec->{NAME} );
 574  }
 575  @srecs = sort {
 576      $b->{AGE}       <=>  $a->{AGE}
 577                      ||
 578      $a->{NAME_key}  cmp  $b->{NAME_key}
 579  } @recs;
 580
 581 =head2 ℞ 39: Case- I<and> accent-insensitive comparisons
 582
 583 Use a collator object to compare Unicode text by character
 584 instead of by codepoint.
 585
 586  use Unicode::Collate;
 587  my $es = Unicode::Collate->new(
 588      level => 1,
 589      normalization => undef
 590  );
 591
 592   # now both are true:
 593  $es->eq("García",  "GARCIA" );
 594  $es->eq("Márquez", "MARQUEZ");
 595
 596 =head2 ℞ 40: Case- I<and> accent-insensitive locale comparisons
 597
 598 Same, but in a specific locale.
 599
 600  my $de = Unicode::Collate::Locale->new(
 601             locale => "de__phonebook",
 602           );
 603
 604  # now this is true:
 605  $de->eq("tschüß", "TSCHUESS");  # notice ü => UE, ß => SS
 606
 607 =head2 ℞ 41: Unicode linebreaking
 608
 609 Break up text into lines according to Unicode rules.
 610
 611  # cpan -i Unicode::LineBreak
 612  use Unicode::LineBreak;
 613  use charnames qw(:full);
 614
 615  my $para = "This is a super\N{HYPHEN}long string. " x 20;
 616  my $fmt = Unicode::LineBreak->new;
 617  print $fmt->break($para), "\n";
 618
 619 =head2 ℞ 42: Unicode text in DBM hashes, the tedious way
 620
 621 Using a regular Perl string as a key or value for a DBM
 622 hash will trigger a wide character exception if any codepoints
 623 won’t fit into a byte.  Here’s how to manually manage the translation:
 624
 625     use DB_File;
 626     use Encode qw(encode decode);
 627     tie %dbhash, "DB_File", "pathname";
 628
 629  # STORE
 630
 631     # assume $uni_key and $uni_value are abstract Unicode strings
 632     my $enc_key   = encode("UTF-8", $uni_key, 1);
 633     my $enc_value = encode("UTF-8", $uni_value, 1);
 634     $dbhash{$enc_key} = $enc_value;
 635
 636  # FETCH
 637
 638     # assume $uni_key holds a normal Perl string (abstract Unicode)
 639     my $enc_key   = encode("UTF-8", $uni_key, 1);
 640     my $enc_value = $dbhash{$enc_key};
 641     my $uni_value = decode("UTF-8", $enc_value, 1);
 642
 643 =head2 ℞ 43: Unicode text in DBM hashes, the easy way
 644
 645 Here’s how to implicitly manage the translation; all encoding
 646 and decoding is done automatically, just as with streams that
 647 have a particular encoding attached to them:
 648
 649     use DB_File;
 650     use DBM_Filter;
 651
 652     my $dbobj = tie %dbhash, "DB_File", "pathname";
 653     $dbobj->Filter_Value("utf8");  # this is the magic bit
 654
 655  # STORE
 656
 657     # assume $uni_key and $uni_value are abstract Unicode strings
 658     $dbhash{$uni_key} = $uni_value;
 659
 660   # FETCH
 661
 662     # $uni_key holds a normal Perl string (abstract Unicode)
 663     my $uni_value = $dbhash{$uni_key};
 664
 665 =head2 ℞ 44: PROGRAM: Demo of Unicode collation and printing
 666
 667 Here’s a full program showing how to make use of locale-sensitive
 668 sorting, Unicode casing, and managing print widths when some of the
 669 characters take up zero or two columns, not just one column each time.
 670 When run, the following program produces this nicely aligned output:
 671
 672     Crème Brûlée....... €2.00
 673     Éclair............. €1.60
 674     Fideuà............. €4.20
 675     Hamburger.......... €6.00
 676     Jamón Serrano...... €4.45
 677     Linguiça........... €7.00
 678     Pâté............... €4.15
 679     Pears.............. €2.00
 680     Pêches............. €2.25
 681     Smørbrød........... €5.75
 682     Spätzle............ €5.50
 683     Xoriço............. €3.00
 684     Γύρος.............. €6.50
 685     막걸리............. €4.00
 686     おもち............. €2.65
 687     お好み焼き......... €8.00
 688     シュークリーム..... €1.85
 689     寿司............... €9.99
 690     包子............... €7.50
 691
 692 Here's that program; tested on v5.14.
 693
 694  #!/usr/bin/env perl
 695  # umenu - demo sorting and printing of Unicode food
 696  #
 697  # (obligatory and increasingly long preamble)
 698  #
 699  use utf8;
 700  use v5.14;                       # for locale sorting
 701  use strict;
 702  use warnings;
 703  use warnings  qw(FATAL utf8);    # fatalize encoding faults
 704  use open      qw(:std :encoding(UTF-8)); # undeclared streams in UTF-8
 705  use charnames qw(:full :short);  # unneeded in v5.16
 706
 707  # std modules
 708  use Unicode::Normalize;          # std perl distro as of v5.8
 709  use List::Util qw(max);          # std perl distro as of v5.10
 710  use Unicode::Collate::Locale;    # std perl distro as of v5.14
 711
 712  # cpan modules
 713  use Unicode::GCString;           # from CPAN
 714
 715  # forward defs
 716  sub pad($$$);
 717  sub colwidth(_);
 718  sub entitle(_);
 719
 720  my %price = (
 721      "γύρος"             => 6.50, # gyros
 722      "pears"             => 2.00, # like um, pears
 723      "linguiça"          => 7.00, # spicy sausage, Portuguese
 724      "xoriço"            => 3.00, # chorizo sausage, Catalan
 725      "hamburger"         => 6.00, # burgermeister meisterburger
 726      "éclair"            => 1.60, # dessert, French
 727      "smørbrød"          => 5.75, # sandwiches, Norwegian
 728      "spätzle"           => 5.50, # Bayerisch noodles, little sparrows
 729      "包子"              => 7.50, # bao1 zi5, steamed pork buns, Mandarin
 730      "jamón serrano"     => 4.45, # country ham, Spanish
 731      "pêches"            => 2.25, # peaches, French
 732      "シュークリーム"    => 1.85, # cream-filled pastry like eclair
 733      "막걸리"            => 4.00, # makgeolli, Korean rice wine
 734      "寿司"              => 9.99, # sushi, Japanese
 735      "おもち"            => 2.65, # omochi, rice cakes, Japanese
 736      "crème brûlée"      => 2.00, # crema catalana
 737      "fideuà"            => 4.20, # more noodles, Valencian
 738                                   # (Catalan=fideuada)
 739      "pâté"              => 4.15, # gooseliver paste, French
 740      "お好み焼き"        => 8.00, # okonomiyaki, Japanese
 741  );
 742
 743  my $width = 5 + max map { colwidth } keys %price;
 744
 745  # So the Asian stuff comes out in an order that someone
 746  # who reads those scripts won't freak out over; the
 747  # CJK stuff will be in JIS X 0208 order that way.
 748  my $coll  = Unicode::Collate::Locale->new(locale => "ja");
 749
 750  for my $item ($coll->sort(keys %price)) {
 751      print pad(entitle($item), $width, ".");
 752      printf " €%.2f\n", $price{$item};
 753  }
 754
 755  sub pad($$$) {
 756      my($str, $width, $padchar) = @_;
 757      return $str . ($padchar x ($width - colwidth($str)));
 758  }
 759
 760  sub colwidth(_) {
 761      my($str) = @_;
 762      return Unicode::GCString->new($str)->columns;
 763  }
 764
 765  sub entitle(_) {
 766      my($str) = @_;
 767      $str =~ s{ (?=\pL)(\S)     (\S*) }
 768               { ucfirst($1) . lc($2)  }xge;
 769      return $str;
 770  }
 771
 772 =head1 SEE ALSO
 773
 774 See these manpages, some of which are CPAN modules:
 775 L<perlunicode>, L<perluniprops>,
 776 L<perlre>, L<perlrecharclass>,
 777 L<perluniintro>, L<perlunitut>, L<perlunifaq>,
 778 L<PerlIO>, L<DB_File>, L<DBM_Filter>, L<DBM_Filter::utf8>,
 779 L<Encode>, L<Encode::Locale>,
 780 L<Unicode::UCD>,
 781 L<Unicode::Normalize>,
 782 L<Unicode::GCString>, L<Unicode::LineBreak>,
 783 L<Unicode::Collate>, L<Unicode::Collate::Locale>,
 784 L<Unicode::Unihan>,
 785 L<Unicode::CaseFold>,
 786 L<Unicode::Tussle>,
 787 L<Lingua::JA::Romanize::Japanese>,
 788 L<Lingua::ZH::Romanize::Pinyin>,
 789 L<Lingua::KO::Romanize::Hangul>.
 790
 791 The L<Unicode::Tussle> CPAN module includes many programs
 792 to help with working with Unicode, including
 793 these programs to fully or partly replace standard utilities:
 794 I<tcgrep> instead of I<egrep>,
 795 I<uniquote> instead of I<cat -v> or I<hexdump>,
 796 I<uniwc> instead of I<wc>,
 797 I<unilook> instead of I<look>,
 798 I<unifmt> instead of I<fmt>,
 799 and
 800 I<ucsort> instead of I<sort>.
 801 For exploring Unicode character names and character properties,
 802 see its I<uniprops>, I<unichars>, and I<uninames> programs.
 803 It also supplies these programs, all of which are general filters that do Unicode-y things:
 804 I<unititle> and I<unicaps>;
 805 I<uniwide> and I<uninarrow>;
 806 I<unisupers> and I<unisubs>;
 807 I<nfd>, I<nfc>, I<nfkd>, and I<nfkc>;
 808 and I<uc>, I<lc>, and I<tc>.
 809
 810 Finally, see the published Unicode Standard (page numbers are from version
 811 6.0.0), including these specific annexes and technical reports:
 812
 813 =over
 814
 815 =item §3.13 Default Case Algorithms, page 113;
 816 §4.2  Case, pages 120–122;
 817 Case Mappings, page 166–172, especially Caseless Matching starting on page 170.
 818
 819 =item UAX #44: Unicode Character Database
 820
 821 =item UTS #18: Unicode Regular Expressions
 822
 823 =item UAX #15: Unicode Normalization Forms
 824
 825 =item UTS #10: Unicode Collation Algorithm
 826
 827 =item UAX #29: Unicode Text Segmentation
 828
 829 =item UAX #14: Unicode Line Breaking Algorithm
 830
 831 =item UAX #11: East Asian Width
 832
 833 =back
 834
 835 =head1 AUTHOR
 836
 837 Tom Christiansen E<lt>tchrist@perl.comE<gt> wrote this, with occasional
 838 kibbitzing from Larry Wall and Jeffrey Friedl in the background.
 839
 840 =head1 COPYRIGHT AND LICENCE
 841
 842 Copyright © 2012 Tom Christiansen.
 843
 844 This program is free software; you may redistribute it and/or modify it
 845 under the same terms as Perl itself.
 846
 847 Most of these examples taken from the current edition of the “Camel Book”;
 848 that is, from the 4ᵗʰ Edition of I<Programming Perl>, Copyright © 2012 Tom
 849 Christiansen <et al.>, 2012-02-13 by O’Reilly Media.  The code itself is
 850 freely redistributable, and you are encouraged to transplant, fold,
 851 spindle, and mutilate any of the examples in this manpage however you please
 852 for inclusion into your own programs without any encumbrance whatsoever.
 853 Acknowledgement via code comment is polite but not required.
 854
 855 =head1 REVISION HISTORY
 856
 857 v1.0.0 – first public release, 2012-02-27