pod/perlunicook.pod

   1
   2 =encoding utf8
   3
   4 =head1 NAME
   5
   6 perlunicook - cookbookish examples of handling Unicode in Perl
   7
   8 =head1 DESCRIPTION
   9
  10 This manpage contains short recipes demonstrating how to handle common Unicode
  11 operations in Perl, plus one complete program at the end. Any undeclared
  12 variables in individual recipes are assumed to have a previous appropriate
  13 value in them.
  14
  15 =head1 EXAMPLES
  16
  17 =head2 ℞ 0: Standard preamble
  18
  19 Unless otherwise notes, all examples below require this standard preamble
  20 to work correctly, with the C<#!> adjusted to work on your system:
  21
  22  #!/usr/bin/env perl
  23
  24  use utf8;      # so literals and identifiers can be in UTF-8
  25  use v5.12;     # or later to get "unicode_strings" feature
  26  use strict;    # quote strings, declare variables
  27  use warnings;  # on by default
  28  use warnings  qw(FATAL utf8);    # fatalize encoding glitches
  29  use open      qw(:std :encoding(UTF-8)); # undeclared streams in UTF-8
  30  use charnames qw(:full :short);  # unneeded in v5.16
  31
  32 This I<does> make even Unix programmers C<binmode> your binary streams,
  33 or open them with C<:raw>, but that's the only way to get at them
  34 portably anyway.
  35
  36 B<WARNING>: C<use autodie> (pre 2.26) and C<use open> do not get along with each
  37 other.
  38
  39 =head2 ℞ 1: Generic Unicode-savvy filter
  40
  41 Always decompose on the way in, then recompose on the way out.
  42
  43  use Unicode::Normalize;
  44
  45  while (<>) {
  46      $_ = NFD($_);   # decompose + reorder canonically
  47      ...
  48  } continue {
  49      print NFC($_);  # recompose (where possible) + reorder canonically
  50  }
  51
  52 =head2 ℞ 2: Fine-tuning Unicode warnings
  53
  54 As of v5.14, Perl distinguishes three subclasses of UTF‑8 warnings.
  55
  56  use v5.14;                  # subwarnings unavailable any earlier
  57  no warnings "nonchar";      # the 66 forbidden non-characters
  58  no warnings "surrogate";    # UTF-16/CESU-8 nonsense
  59  no warnings "non_unicode";  # for codepoints over 0x10_FFFF
  60
  61 =head2 ℞ 3: Declare source in utf8 for identifiers and literals
  62
  63 Without the all-critical C<use utf8> declaration, putting UTF‑8 in your
  64 literals and identifiers won’t work right.  If you used the standard
  65 preamble just given above, this already happened.  If you did, you can
  66 do things like this:
  67
  68  use utf8;
  69
  70  my $measure   = "Ångström";
  71  my @μsoft     = qw( cp852 cp1251 cp1252 );
  72  my @ὑπέρμεγας = qw( ὑπέρ  μεγας );
  73  my @鯉        = qw( koi8-f koi8-u koi8-r );
  74  my $motto     = "👪 💗 🐪"; # FAMILY, GROWING HEART, DROMEDARY CAMEL
  75
  76 If you forget C<use utf8>, high bytes will be misunderstood as
  77 separate characters, and nothing will work right.
  78
  79 =head2 ℞ 4: Characters and their numbers
  80
  81 The C<ord> and C<chr> functions work transparently on all codepoints,
  82 not just on ASCII alone — nor in fact, not even just on Unicode alone.
  83
  84  # ASCII characters
  85  ord("A")
  86  chr(65)
  87
  88  # characters from the Basic Multilingual Plane
  89  ord("Σ")
  90  chr(0x3A3)
  91
  92  # beyond the BMP
  93  ord("𝑛")               # MATHEMATICAL ITALIC SMALL N
  94  chr(0x1D45B)
  95
  96  # beyond Unicode! (up to MAXINT)
  97  ord("\x{20_0000}")
  98  chr(0x20_0000)
  99
 100 =head2 ℞ 5: Unicode literals by character number
 101
 102 In an interpolated literal, whether a double-quoted string or a
 103 regex, you may specify a character by its number using the
 104 C<\x{I<HHHHHH>}> escape.
 105
 106  String: "\x{3a3}"
 107  Regex:  /\x{3a3}/
 108
 109  String: "\x{1d45b}"
 110  Regex:  /\x{1d45b}/
 111
 112  # even non-BMP ranges in regex work fine
 113  /[\x{1D434}-\x{1D467}]/
 114
 115 =head2 ℞ 6: Get character name by number
 116
 117  use charnames ();
 118  my $name = charnames::viacode(0x03A3);
 119
 120 =head2 ℞ 7: Get character number by name
 121
 122  use charnames ();
 123  my $number = charnames::vianame("GREEK CAPITAL LETTER SIGMA");
 124
 125 =head2 ℞ 8: Unicode named characters
 126
 127 Use the C<< \N{I<charname>} >> notation to get the character
 128 by that name for use in interpolated literals (double-quoted
 129 strings and regexes).  In v5.16, there is an implicit
 130
 131  use charnames qw(:full :short);
 132
 133 But prior to v5.16, you must be explicit about which set of charnames you
 134 want.  The C<:full> names are the official Unicode character name, alias, or
 135 sequence, which all share a namespace.
 136
 137  use charnames qw(:full :short latin greek);
 138
 139  "\N{MATHEMATICAL ITALIC SMALL N}"      # :full
 140  "\N{GREEK CAPITAL LETTER SIGMA}"       # :full
 141
 142 Anything else is a Perl-specific convenience abbreviation.  Specify one or
 143 more scripts by names if you want short names that are script-specific.
 144
 145  "\N{Greek:Sigma}"                      # :short
 146  "\N{ae}"                               #  latin
 147  "\N{epsilon}"                          #  greek
 148
 149 The v5.16 release also supports a C<:loose> import for loose matching of
 150 character names, which works just like loose matching of property names:
 151 that is, it disregards case, whitespace, and underscores:
 152
 153  "\N{euro sign}"                        # :loose (from v5.16)
 154
 155 Starting in v5.32, you can also use
 156
 157  qr/\p{name=euro sign}/
 158
 159 to get official Unicode named characters in regular expressions.  Loose
 160 matching is always done for these.
 161
 162 =head2 ℞ 9: Unicode named sequences
 163
 164 These look just like character names but return multiple codepoints.
 165 Notice the C<%vx> vector-print functionality in C<printf>.
 166
 167  use charnames qw(:full);
 168  my $seq = "\N{LATIN CAPITAL LETTER A WITH MACRON AND GRAVE}";
 169  printf "U+%v04X\n", $seq;
 170  U+0100.0300
 171
 172 =head2 ℞ 10: Custom named characters
 173
 174 Use C<:alias> to give your own lexically scoped nicknames to existing
 175 characters, or even to give unnamed private-use characters useful names.
 176
 177  use charnames ":full", ":alias" => {
 178      ecute => "LATIN SMALL LETTER E WITH ACUTE",
 179      "APPLE LOGO" => 0xF8FF, # private use character
 180  };
 181
 182  "\N{ecute}"
 183  "\N{APPLE LOGO}"
 184
 185 =head2 ℞ 11: Names of CJK codepoints
 186
 187 Sinograms like “東京” come back with character names of
 188 C<CJK UNIFIED IDEOGRAPH-6771> and C<CJK UNIFIED IDEOGRAPH-4EAC>,
 189 because their “names” vary.  The CPAN C<Unicode::Unihan> module
 190 has a large database for decoding these (and a whole lot more), provided you
 191 know how to understand its output.
 192
 193  # cpan -i Unicode::Unihan
 194  use Unicode::Unihan;
 195  my $str = "東京";
 196  my $unhan = Unicode::Unihan->new;
 197  for my $lang (qw(Mandarin Cantonese Korean JapaneseOn JapaneseKun)) {
 198      printf "CJK $str in %-12s is ", $lang;
 199      say $unhan->$lang($str);
 200  }
 201
 202 prints:
 203
 204  CJK 東京 in Mandarin     is DONG1JING1
 205  CJK 東京 in Cantonese    is dung1ging1
 206  CJK 東京 in Korean       is TONGKYENG
 207  CJK 東京 in JapaneseOn   is TOUKYOU KEI KIN
 208  CJK 東京 in JapaneseKun  is HIGASHI AZUMAMIYAKO
 209
 210 If you have a specific romanization scheme in mind,
 211 use the specific module:
 212
 213  # cpan -i Lingua::JA::Romanize::Japanese
 214  use Lingua::JA::Romanize::Japanese;
 215  my $k2r = Lingua::JA::Romanize::Japanese->new;
 216  my $str = "東京";
 217  say "Japanese for $str is ", $k2r->chars($str);
 218
 219 prints
 220
 221  Japanese for 東京 is toukyou
 222
 223 =head2 ℞ 12: Explicit encode/decode
 224
 225 On rare occasion, such as a database read, you may be
 226 given encoded text you need to decode.
 227
 228   use Encode qw(encode decode);
 229
 230   my $chars = decode("shiftjis", $bytes, 1);
 231  # OR
 232   my $bytes = encode("MIME-Header-ISO_2022_JP", $chars, 1);
 233
 234 For streams all in the same encoding, don't use encode/decode; instead
 235 set the file encoding when you open the file or immediately after with
 236 C<binmode> as described later below.
 237
 238 =head2 ℞ 13: Decode program arguments as utf8
 239
 240      $ perl -CA ...
 241  or
 242      $ export PERL_UNICODE=A
 243  or
 244     use Encode qw(decode);
 245     @ARGV = map { decode('UTF-8', $_, 1) } @ARGV;
 246
 247 =head2 ℞ 14: Decode program arguments as locale encoding
 248
 249     # cpan -i Encode::Locale
 250     use Encode qw(locale);
 251     use Encode::Locale;
 252
 253     # use "locale" as an arg to encode/decode
 254     @ARGV = map { decode(locale => $_, 1) } @ARGV;
 255
 256 =head2 ℞ 15: Declare STD{IN,OUT,ERR} to be utf8
 257
 258 Use a command-line option, an environment variable, or else
 259 call C<binmode> explicitly:
 260
 261      $ perl -CS ...
 262  or
 263      $ export PERL_UNICODE=S
 264  or
 265      use open qw(:std :encoding(UTF-8));
 266  or
 267      binmode(STDIN,  ":encoding(UTF-8)");
 268      binmode(STDOUT, ":utf8");
 269      binmode(STDERR, ":utf8");
 270
 271 =head2 ℞ 16: Declare STD{IN,OUT,ERR} to be in locale encoding
 272
 273     # cpan -i Encode::Locale
 274     use Encode;
 275     use Encode::Locale;
 276
 277     # or as a stream for binmode or open
 278     binmode STDIN,  ":encoding(console_in)"  if -t STDIN;
 279     binmode STDOUT, ":encoding(console_out)" if -t STDOUT;
 280     binmode STDERR, ":encoding(console_out)" if -t STDERR;
 281
 282 =head2 ℞ 17: Make file I/O default to utf8
 283
 284 Files opened without an encoding argument will be in UTF-8:
 285
 286      $ perl -CD ...
 287  or
 288      $ export PERL_UNICODE=D
 289  or
 290      use open qw(:encoding(UTF-8));
 291
 292 =head2 ℞ 18: Make all I/O and args default to utf8
 293
 294      $ perl -CSDA ...
 295  or
 296      $ export PERL_UNICODE=SDA
 297  or
 298      use open qw(:std :encoding(UTF-8));
 299      use Encode qw(decode);
 300      @ARGV = map { decode('UTF-8', $_, 1) } @ARGV;
 301
 302 =head2 ℞ 19: Open file with specific encoding
 303
 304 Specify stream encoding.  This is the normal way
 305 to deal with encoded text, not by calling low-level
 306 functions.
 307
 308  # input file
 309      open(my $in_file, "< :encoding(UTF-16)", "wintext");
 310  OR
 311      open(my $in_file, "<", "wintext");
 312      binmode($in_file, ":encoding(UTF-16)");
 313  THEN
 314      my $line = <$in_file>;
 315
 316  # output file
 317      open($out_file, "> :encoding(cp1252)", "wintext");
 318  OR
 319      open(my $out_file, ">", "wintext");
 320      binmode($out_file, ":encoding(cp1252)");
 321  THEN
 322      print $out_file "some text\n";
 323
 324 More layers than just the encoding can be specified here. For example,
 325 the incantation C<":raw :encoding(UTF-16LE) :crlf"> includes implicit
 326 CRLF handling.
 327
 328 =head2 ℞ 20: Unicode casing
 329
 330 Unicode casing is very different from ASCII casing.
 331
 332  uc("henry ⅷ")  # "HENRY Ⅷ"
 333  uc("tschüß")   # "TSCHÜSS"  notice ß => SS
 334
 335  # both are true:
 336  "tschüß"  =~ /TSCHÜSS/i   # notice ß => SS
 337  "Σίσυφος" =~ /ΣΊΣΥΦΟΣ/i   # notice Σ,σ,ς sameness
 338
 339 =head2 ℞ 21: Unicode case-insensitive comparisons
 340
 341 Also available in the CPAN L<Unicode::CaseFold> module,
 342 the new C<fc> “foldcase” function from v5.16 grants
 343 access to the same Unicode casefolding as the C</i>
 344 pattern modifier has always used:
 345
 346  use feature "fc"; # fc() function is from v5.16
 347
 348  # sort case-insensitively
 349  my @sorted = sort { fc($a) cmp fc($b) } @list;
 350
 351  # both are true:
 352  fc("tschüß")  eq fc("TSCHÜSS")
 353  fc("Σίσυφος") eq fc("ΣΊΣΥΦΟΣ")
 354
 355 =head2 ℞ 22: Match Unicode linebreak sequence in regex
 356
 357 A Unicode linebreak matches the two-character CRLF
 358 grapheme or any of seven vertical whitespace characters.
 359 Good for dealing with textfiles coming from different
 360 operating systems.
 361
 362  \R
 363
 364  s/\R/\n/g;  # normalize all linebreaks to \n
 365
 366 =head2 ℞ 23: Get character category
 367
 368 Find the general category of a numeric codepoint.
 369
 370  use Unicode::UCD qw(charinfo);
 371  my $cat = charinfo(0x3A3)->{category};  # "Lu"
 372
 373 =head2 ℞ 24: Disabling Unicode-awareness in builtin charclasses
 374
 375 Disable C<\w>, C<\b>, C<\s>, C<\d>, and the POSIX
 376 classes from working correctly on Unicode either in this
 377 scope, or in just one regex.
 378
 379  use v5.14;
 380  use re "/a";
 381
 382  # OR
 383
 384  my($num) = $str =~ /(\d+)/a;
 385
 386 Or use specific un-Unicode properties, like C<\p{ahex}>
 387 and C<\p{POSIX_Digit>}.  Properties still work normally
 388 no matter what charset modifiers (C</d /u /l /a /aa>)
 389 should be effect.
 390
 391 =head2 ℞ 25: Match Unicode properties in regex with \p, \P
 392
 393 These all match a single codepoint with the given
 394 property.  Use C<\P> in place of C<\p> to match
 395 one codepoint lacking that property.
 396
 397  \pL, \pN, \pS, \pP, \pM, \pZ, \pC
 398  \p{Sk}, \p{Ps}, \p{Lt}
 399  \p{alpha}, \p{upper}, \p{lower}
 400  \p{Latin}, \p{Greek}
 401  \p{script_extensions=Latin}, \p{scx=Greek}
 402  \p{East_Asian_Width=Wide}, \p{EA=W}
 403  \p{Line_Break=Hyphen}, \p{LB=HY}
 404  \p{Numeric_Value=4}, \p{NV=4}
 405
 406 =head2 ℞ 26: Custom character properties
 407
 408 Define at compile-time your own custom character
 409 properties for use in regexes.
 410
 411  # using private-use characters
 412  sub In_Tengwar { "E000\tE07F\n" }
 413
 414  if (/\p{In_Tengwar}/) { ... }
 415
 416  # blending existing properties
 417  sub Is_GraecoRoman_Title {<<'END_OF_SET'}
 418  +utf8::IsLatin
 419  +utf8::IsGreek
 420  &utf8::IsTitle
 421  END_OF_SET
 422
 423  if (/\p{Is_GraecoRoman_Title}/ { ... }
 424
 425 =head2 ℞ 27: Unicode normalization
 426
 427 Typically render into NFD on input and NFC on output. Using NFKC or NFKD
 428 functions improves recall on searches, assuming you've already done to the
 429 same text to be searched. Note that this is about much more than just pre-
 430 combined compatibility glyphs; it also reorders marks according to their
 431 canonical combining classes and weeds out singletons.
 432
 433  use Unicode::Normalize;
 434  my $nfd  = NFD($orig);
 435  my $nfc  = NFC($orig);
 436  my $nfkd = NFKD($orig);
 437  my $nfkc = NFKC($orig);
 438
 439 =head2 ℞ 28: Convert non-ASCII Unicode numerics
 440
 441 Unless you’ve used C</a> or C</aa>, C<\d> matches more than
 442 ASCII digits only, but Perl’s implicit string-to-number
 443 conversion does not current recognize these.  Here’s how to
 444 convert such strings manually.
 445
 446  use v5.14;  # needed for num() function
 447  use Unicode::UCD qw(num);
 448  my $str = "got Ⅻ and ४५६७ and ⅞ and here";
 449  my @nums = ();
 450  while ($str =~ /(\d+|\N)/g) {  # not just ASCII!
 451     push @nums, num($1);
 452  }
 453  say "@nums";   #     12      4567      0.875
 454
 455  use charnames qw(:full);
 456  my $nv = num("\N{RUMI DIGIT ONE}\N{RUMI DIGIT TWO}");
 457
 458 =head2 ℞ 29: Match Unicode grapheme cluster in regex
 459
 460 Programmer-visible “characters” are codepoints matched by C</./s>,
 461 but user-visible “characters” are graphemes matched by C</\X/>.
 462
 463  # Find vowel *plus* any combining diacritics,underlining,etc.
 464  my $nfd = NFD($orig);
 465  $nfd =~ / (?=[aeiou]) \X /xi
 466
 467 =head2 ℞ 30: Extract by grapheme instead of by codepoint (regex)
 468
 469  # match and grab five first graphemes
 470  my($first_five) = $str =~ /^ ( \X{5} ) /x;
 471
 472 =head2 ℞ 31: Extract by grapheme instead of by codepoint (substr)
 473
 474  # cpan -i Unicode::GCString
 475  use Unicode::GCString;
 476  my $gcs = Unicode::GCString->new($str);
 477  my $first_five = $gcs->substr(0, 5);
 478
 479 =head2 ℞ 32: Reverse string by grapheme
 480
 481 Reversing by codepoint messes up diacritics, mistakenly converting
 482 C<crème brûlée> into C<éel̂urb em̀erc> instead of into C<eélûrb emèrc>;
 483 so reverse by grapheme instead.  Both these approaches work
 484 right no matter what normalization the string is in:
 485
 486  $str = join("", reverse $str =~ /\X/g);
 487
 488  # OR: cpan -i Unicode::GCString
 489  use Unicode::GCString;
 490  $str = reverse Unicode::GCString->new($str);
 491
 492 =head2 ℞ 33: String length in graphemes
 493
 494 The string C<brûlée> has six graphemes but up to eight codepoints.
 495 This counts by grapheme, not by codepoint:
 496
 497  my $str = "brûlée";
 498  my $count = 0;
 499  while ($str =~ /\X/g) { $count++ }
 500
 501   # OR: cpan -i Unicode::GCString
 502  use Unicode::GCString;
 503  my $gcs = Unicode::GCString->new($str);
 504  my $count = $gcs->length;
 505
 506 =head2 ℞ 34: Unicode column-width for printing
 507
 508 Perl’s C<printf>, C<sprintf>, and C<format> think all
 509 codepoints take up 1 print column, but many take 0 or 2.
 510 Here to show that normalization makes no difference,
 511 we print out both forms:
 512
 513  use Unicode::GCString;
 514  use Unicode::Normalize;
 515
 516  my @words = qw/crème brûlée/;
 517  @words = map { NFC($_), NFD($_) } @words;
 518
 519  for my $str (@words) {
 520      my $gcs = Unicode::GCString->new($str);
 521      my $cols = $gcs->columns;
 522      my $pad = " " x (10 - $cols);
 523      say str, $pad, " |";
 524  }
 525
 526 generates this to show that it pads correctly no matter
 527 the normalization:
 528
 529  crème      |
 530  crème      |
 531  brûlée     |
 532  brûlée     |
 533
 534 =head2 ℞ 35: Unicode collation
 535
 536 Text sorted by numeric codepoint follows no reasonable alphabetic order;
 537 use the UCA for sorting text.
 538
 539  use Unicode::Collate;
 540  my $col = Unicode::Collate->new();
 541  my @list = $col->sort(@old_list);
 542
 543 See the I<ucsort> program from the L<Unicode::Tussle> CPAN module
 544 for a convenient command-line interface to this module.
 545
 546 =head2 ℞ 36: Case- I<and> accent-insensitive Unicode sort
 547
 548 Specify a collation strength of level 1 to ignore case and
 549 diacritics, only looking at the basic character.
 550
 551  use Unicode::Collate;
 552  my $col = Unicode::Collate->new(level => 1);
 553  my @list = $col->sort(@old_list);
 554
 555 =head2 ℞ 37: Unicode locale collation
 556
 557 Some locales have special sorting rules.
 558
 559  # either use v5.12, OR: cpan -i Unicode::Collate::Locale
 560  use Unicode::Collate::Locale;
 561  my $col = Unicode::Collate::Locale->new(locale => "de__phonebook");
 562  my @list = $col->sort(@old_list);
 563
 564 The I<ucsort> program mentioned above accepts a C<--locale> parameter.
 565
 566 =head2 ℞ 38: Making C<cmp> work on text instead of codepoints
 567
 568 Instead of this:
 569
 570  @srecs = sort {
 571      $b->{AGE}   <=>  $a->{AGE}
 572                  ||
 573      $a->{NAME}  cmp  $b->{NAME}
 574  } @recs;
 575
 576 Use this:
 577
 578  my $coll = Unicode::Collate->new();
 579  for my $rec (@recs) {
 580      $rec->{NAME_key} = $coll->getSortKey( $rec->{NAME} );
 581  }
 582  @srecs = sort {
 583      $b->{AGE}       <=>  $a->{AGE}
 584                      ||
 585      $a->{NAME_key}  cmp  $b->{NAME_key}
 586  } @recs;
 587
 588 =head2 ℞ 39: Case- I<and> accent-insensitive comparisons
 589
 590 Use a collator object to compare Unicode text by character
 591 instead of by codepoint.
 592
 593  use Unicode::Collate;
 594  my $es = Unicode::Collate->new(
 595      level => 1,
 596      normalization => undef
 597  );
 598
 599   # now both are true:
 600  $es->eq("García",  "GARCIA" );
 601  $es->eq("Márquez", "MARQUEZ");
 602
 603 =head2 ℞ 40: Case- I<and> accent-insensitive locale comparisons
 604
 605 Same, but in a specific locale.
 606
 607  my $de = Unicode::Collate::Locale->new(
 608             locale => "de__phonebook",
 609           );
 610
 611  # now this is true:
 612  $de->eq("tschüß", "TSCHUESS");  # notice ü => UE, ß => SS
 613
 614 =head2 ℞ 41: Unicode linebreaking
 615
 616 Break up text into lines according to Unicode rules.
 617
 618  # cpan -i Unicode::LineBreak
 619  use Unicode::LineBreak;
 620  use charnames qw(:full);
 621
 622  my $para = "This is a super\N{HYPHEN}long string. " x 20;
 623  my $fmt = Unicode::LineBreak->new;
 624  print $fmt->break($para), "\n";
 625
 626 =head2 ℞ 42: Unicode text in DBM hashes, the tedious way
 627
 628 Using a regular Perl string as a key or value for a DBM
 629 hash will trigger a wide character exception if any codepoints
 630 won’t fit into a byte.  Here’s how to manually manage the translation:
 631
 632     use DB_File;
 633     use Encode qw(encode decode);
 634     tie %dbhash, "DB_File", "pathname";
 635
 636  # STORE
 637
 638     # assume $uni_key and $uni_value are abstract Unicode strings
 639     my $enc_key   = encode("UTF-8", $uni_key, 1);
 640     my $enc_value = encode("UTF-8", $uni_value, 1);
 641     $dbhash{$enc_key} = $enc_value;
 642
 643  # FETCH
 644
 645     # assume $uni_key holds a normal Perl string (abstract Unicode)
 646     my $enc_key   = encode("UTF-8", $uni_key, 1);
 647     my $enc_value = $dbhash{$enc_key};
 648     my $uni_value = decode("UTF-8", $enc_value, 1);
 649
 650 =head2 ℞ 43: Unicode text in DBM hashes, the easy way
 651
 652 Here’s how to implicitly manage the translation; all encoding
 653 and decoding is done automatically, just as with streams that
 654 have a particular encoding attached to them:
 655
 656     use DB_File;
 657     use DBM_Filter;
 658
 659     my $dbobj = tie %dbhash, "DB_File", "pathname";
 660     $dbobj->Filter_Value("utf8");  # this is the magic bit
 661
 662  # STORE
 663
 664     # assume $uni_key and $uni_value are abstract Unicode strings
 665     $dbhash{$uni_key} = $uni_value;
 666
 667   # FETCH
 668
 669     # $uni_key holds a normal Perl string (abstract Unicode)
 670     my $uni_value = $dbhash{$uni_key};
 671
 672 =head2 ℞ 44: PROGRAM: Demo of Unicode collation and printing
 673
 674 Here’s a full program showing how to make use of locale-sensitive
 675 sorting, Unicode casing, and managing print widths when some of the
 676 characters take up zero or two columns, not just one column each time.
 677 When run, the following program produces this nicely aligned output:
 678
 679     Crème Brûlée....... €2.00
 680     Éclair............. €1.60
 681     Fideuà............. €4.20
 682     Hamburger.......... €6.00
 683     Jamón Serrano...... €4.45
 684     Linguiça........... €7.00
 685     Pâté............... €4.15
 686     Pears.............. €2.00
 687     Pêches............. €2.25
 688     Smørbrød........... €5.75
 689     Spätzle............ €5.50
 690     Xoriço............. €3.00
 691     Γύρος.............. €6.50
 692     막걸리............. €4.00
 693     おもち............. €2.65
 694     お好み焼き......... €8.00
 695     シュークリーム..... €1.85
 696     寿司............... €9.99
 697     包子............... €7.50
 698
 699 Here's that program; tested on v5.14.
 700
 701  #!/usr/bin/env perl
 702  # umenu - demo sorting and printing of Unicode food
 703  #
 704  # (obligatory and increasingly long preamble)
 705  #
 706  use utf8;
 707  use v5.14;                       # for locale sorting
 708  use strict;
 709  use warnings;
 710  use warnings  qw(FATAL utf8);    # fatalize encoding faults
 711  use open      qw(:std :encoding(UTF-8)); # undeclared streams in UTF-8
 712  use charnames qw(:full :short);  # unneeded in v5.16
 713
 714  # std modules
 715  use Unicode::Normalize;          # std perl distro as of v5.8
 716  use List::Util qw(max);          # std perl distro as of v5.10
 717  use Unicode::Collate::Locale;    # std perl distro as of v5.14
 718
 719  # cpan modules
 720  use Unicode::GCString;           # from CPAN
 721
 722  # forward defs
 723  sub pad($$$);
 724  sub colwidth(_);
 725  sub entitle(_);
 726
 727  my %price = (
 728      "γύρος"             => 6.50, # gyros
 729      "pears"             => 2.00, # like um, pears
 730      "linguiça"          => 7.00, # spicy sausage, Portuguese
 731      "xoriço"            => 3.00, # chorizo sausage, Catalan
 732      "hamburger"         => 6.00, # burgermeister meisterburger
 733      "éclair"            => 1.60, # dessert, French
 734      "smørbrød"          => 5.75, # sandwiches, Norwegian
 735      "spätzle"           => 5.50, # Bayerisch noodles, little sparrows
 736      "包子"              => 7.50, # bao1 zi5, steamed pork buns, Mandarin
 737      "jamón serrano"     => 4.45, # country ham, Spanish
 738      "pêches"            => 2.25, # peaches, French
 739      "シュークリーム"    => 1.85, # cream-filled pastry like eclair
 740      "막걸리"            => 4.00, # makgeolli, Korean rice wine
 741      "寿司"              => 9.99, # sushi, Japanese
 742      "おもち"            => 2.65, # omochi, rice cakes, Japanese
 743      "crème brûlée"      => 2.00, # crema catalana
 744      "fideuà"            => 4.20, # more noodles, Valencian
 745                                   # (Catalan=fideuada)
 746      "pâté"              => 4.15, # gooseliver paste, French
 747      "お好み焼き"        => 8.00, # okonomiyaki, Japanese
 748  );
 749
 750  my $width = 5 + max map { colwidth } keys %price;
 751
 752  # So the Asian stuff comes out in an order that someone
 753  # who reads those scripts won't freak out over; the
 754  # CJK stuff will be in JIS X 0208 order that way.
 755  my $coll  = Unicode::Collate::Locale->new(locale => "ja");
 756
 757  for my $item ($coll->sort(keys %price)) {
 758      print pad(entitle($item), $width, ".");
 759      printf " €%.2f\n", $price{$item};
 760  }
 761
 762  sub pad($$$) {
 763      my($str, $width, $padchar) = @_;
 764      return $str . ($padchar x ($width - colwidth($str)));
 765  }
 766
 767  sub colwidth(_) {
 768      my($str) = @_;
 769      return Unicode::GCString->new($str)->columns;
 770  }
 771
 772  sub entitle(_) {
 773      my($str) = @_;
 774      $str =~ s{ (?=\pL)(\S)     (\S*) }
 775               { ucfirst($1) . lc($2)  }xge;
 776      return $str;
 777  }
 778
 779 =head1 SEE ALSO
 780
 781 See these manpages, some of which are CPAN modules:
 782 L<perlunicode>, L<perluniprops>,
 783 L<perlre>, L<perlrecharclass>,
 784 L<perluniintro>, L<perlunitut>, L<perlunifaq>,
 785 L<PerlIO>, L<DB_File>, L<DBM_Filter>, L<DBM_Filter::utf8>,
 786 L<Encode>, L<Encode::Locale>,
 787 L<Unicode::UCD>,
 788 L<Unicode::Normalize>,
 789 L<Unicode::GCString>, L<Unicode::LineBreak>,
 790 L<Unicode::Collate>, L<Unicode::Collate::Locale>,
 791 L<Unicode::Unihan>,
 792 L<Unicode::CaseFold>,
 793 L<Unicode::Tussle>,
 794 L<Lingua::JA::Romanize::Japanese>,
 795 L<Lingua::ZH::Romanize::Pinyin>,
 796 L<Lingua::KO::Romanize::Hangul>.
 797
 798 The L<Unicode::Tussle> CPAN module includes many programs
 799 to help with working with Unicode, including
 800 these programs to fully or partly replace standard utilities:
 801 I<tcgrep> instead of I<egrep>,
 802 I<uniquote> instead of I<cat -v> or I<hexdump>,
 803 I<uniwc> instead of I<wc>,
 804 I<unilook> instead of I<look>,
 805 I<unifmt> instead of I<fmt>,
 806 and
 807 I<ucsort> instead of I<sort>.
 808 For exploring Unicode character names and character properties,
 809 see its I<uniprops>, I<unichars>, and I<uninames> programs.
 810 It also supplies these programs, all of which are general filters that do Unicode-y things:
 811 I<unititle> and I<unicaps>;
 812 I<uniwide> and I<uninarrow>;
 813 I<unisupers> and I<unisubs>;
 814 I<nfd>, I<nfc>, I<nfkd>, and I<nfkc>;
 815 and I<uc>, I<lc>, and I<tc>.
 816
 817 Finally, see the published Unicode Standard (page numbers are from version
 818 6.0.0), including these specific annexes and technical reports:
 819
 820 =over
 821
 822 =item §3.13 Default Case Algorithms, page 113;
 823 §4.2  Case, pages 120–122;
 824 Case Mappings, page 166–172, especially Caseless Matching starting on page 170.
 825
 826 =item UAX #44: Unicode Character Database
 827
 828 =item UTS #18: Unicode Regular Expressions
 829
 830 =item UAX #15: Unicode Normalization Forms
 831
 832 =item UTS #10: Unicode Collation Algorithm
 833
 834 =item UAX #29: Unicode Text Segmentation
 835
 836 =item UAX #14: Unicode Line Breaking Algorithm
 837
 838 =item UAX #11: East Asian Width
 839
 840 =back
 841
 842 =head1 AUTHOR
 843
 844 Tom Christiansen E<lt>tchrist@perl.comE<gt> wrote this, with occasional
 845 kibbitzing from Larry Wall and Jeffrey Friedl in the background.
 846
 847 =head1 COPYRIGHT AND LICENCE
 848
 849 Copyright © 2012 Tom Christiansen.
 850
 851 This program is free software; you may redistribute it and/or modify it
 852 under the same terms as Perl itself.
 853
 854 Most of these examples taken from the current edition of the “Camel Book”;
 855 that is, from the 4ᵗʰ Edition of I<Programming Perl>, Copyright © 2012 Tom
 856 Christiansen <et al.>, 2012-02-13 by O’Reilly Media.  The code itself is
 857 freely redistributable, and you are encouraged to transplant, fold,
 858 spindle, and mutilate any of the examples in this manpage however you please
 859 for inclusion into your own programs without any encumbrance whatsoever.
 860 Acknowledgement via code comment is polite but not required.
 861
 862 =head1 REVISION HISTORY
 863
 864 v1.0.0 – first public release, 2012-02-27