This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Move Pod::Plainer from lib to ext
[perl5.git] / lib / Pod / Escapes.pm
CommitLineData
d7fcd4ce
SP
1
2require 5;
3# The documentation is at the end.
4# Time-stamp: "2004-05-07 15:31:25 ADT"
5package Pod::Escapes;
6require Exporter;
7@ISA = ('Exporter');
8$VERSION = '1.04';
9@EXPORT_OK = qw(
10 %Code2USASCII
11 %Name2character
12 %Name2character_number
13 %Latin1Code_to_fallback
14 %Latin1Char_to_fallback
15 e2char
16 e2charnum
17);
18%EXPORT_TAGS = ('ALL' => \@EXPORT_OK);
19
20#==========================================================================
21
22use strict;
23use vars qw(
24 %Code2USASCII
25 %Name2character
26 %Name2character_number
27 %Latin1Code_to_fallback
28 %Latin1Char_to_fallback
29 $FAR_CHAR
30 $FAR_CHAR_NUMBER
31 $NOT_ASCII
32);
33
34$FAR_CHAR = "?" unless defined $FAR_CHAR;
35$FAR_CHAR_NUMBER = ord($FAR_CHAR) unless defined $FAR_CHAR_NUMBER;
36
37$NOT_ASCII = 'A' ne chr(65) unless defined $NOT_ASCII;
38
39#--------------------------------------------------------------------------
40sub e2char {
41 my $in = $_[0];
42 return undef unless defined $in and length $in;
43
44 # Convert to decimal:
45 if($in =~ m/^(0[0-7]*)$/s ) {
46 $in = oct $in;
47 } elsif($in =~ m/^0?x([0-9a-fA-F]+)$/s ) {
48 $in = hex $1;
49 } # else it's decimal, or named
50
51 if($NOT_ASCII) {
52 # We're in bizarro world of not-ASCII!
53 # Cope with US-ASCII codes, use fallbacks for Latin-1, or use FAR_CHAR.
54 unless($in =~ m/^\d+$/s) {
55 # It's a named character reference. Get its numeric Unicode value.
56 $in = $Name2character{$in};
57 return undef unless defined $in; # (if there's no such name)
58 $in = ord $in; # (All ents must be one character long.)
59 # ...So $in holds the char's US-ASCII numeric value, which we'll
60 # now go get the local equivalent for.
61 }
62
63 # It's numeric, whether by origin or by mutation from a known name
64 return $Code2USASCII{$in} # so "65" => "A" everywhere
65 || $Latin1Code_to_fallback{$in} # Fallback.
66 || $FAR_CHAR; # Fall further back
67 }
68
69 # Normal handling:
70 if($in =~ m/^\d+$/s) {
71 if($] < 5.007 and $in > 255) { # can't be trusted with Unicode
72 return $FAR_CHAR;
73 } else {
74 return chr($in);
75 }
76 } else {
77 return $Name2character{$in}; # returns undef if unknown
78 }
79}
80
81#--------------------------------------------------------------------------
82sub e2charnum {
83 my $in = $_[0];
84 return undef unless defined $in and length $in;
85
86 # Convert to decimal:
87 if($in =~ m/^(0[0-7]*)$/s ) {
88 $in = oct $in;
89 } elsif($in =~ m/^0?x([0-9a-fA-F]+)$/s ) {
90 $in = hex $1;
91 } # else it's decimal, or named
92
93 if($in =~ m/^\d+$/s) {
94 return 0 + $in;
95 } else {
96 return $Name2character_number{$in}; # returns undef if unknown
97 }
98}
99
100#--------------------------------------------------------------------------
101
102%Name2character_number = (
103 # General XML/XHTML:
104 'lt' => 60,
105 'gt' => 62,
106 'quot' => 34,
107 'amp' => 38,
108 'apos' => 39,
109
110 # POD-specific:
111 'sol' => 47,
112 'verbar' => 124,
113
114 'lchevron' => 171, # legacy for laquo
115 'rchevron' => 187, # legacy for raquo
116
117 # Remember, grave looks like \ (as in virtu\)
118 # acute looks like / (as in re/sume/)
119 # circumflex looks like ^ (as in papier ma^che/)
120 # umlaut/dieresis looks like " (as in nai"ve, Chloe")
121
122 # From the XHTML 1 .ent files:
123 'nbsp' , 160,
124 'iexcl' , 161,
125 'cent' , 162,
126 'pound' , 163,
127 'curren' , 164,
128 'yen' , 165,
129 'brvbar' , 166,
130 'sect' , 167,
131 'uml' , 168,
132 'copy' , 169,
133 'ordf' , 170,
134 'laquo' , 171,
135 'not' , 172,
136 'shy' , 173,
137 'reg' , 174,
138 'macr' , 175,
139 'deg' , 176,
140 'plusmn' , 177,
141 'sup2' , 178,
142 'sup3' , 179,
143 'acute' , 180,
144 'micro' , 181,
145 'para' , 182,
146 'middot' , 183,
147 'cedil' , 184,
148 'sup1' , 185,
149 'ordm' , 186,
150 'raquo' , 187,
151 'frac14' , 188,
152 'frac12' , 189,
153 'frac34' , 190,
154 'iquest' , 191,
155 'Agrave' , 192,
156 'Aacute' , 193,
157 'Acirc' , 194,
158 'Atilde' , 195,
159 'Auml' , 196,
160 'Aring' , 197,
161 'AElig' , 198,
162 'Ccedil' , 199,
163 'Egrave' , 200,
164 'Eacute' , 201,
165 'Ecirc' , 202,
166 'Euml' , 203,
167 'Igrave' , 204,
168 'Iacute' , 205,
169 'Icirc' , 206,
170 'Iuml' , 207,
171 'ETH' , 208,
172 'Ntilde' , 209,
173 'Ograve' , 210,
174 'Oacute' , 211,
175 'Ocirc' , 212,
176 'Otilde' , 213,
177 'Ouml' , 214,
178 'times' , 215,
179 'Oslash' , 216,
180 'Ugrave' , 217,
181 'Uacute' , 218,
182 'Ucirc' , 219,
183 'Uuml' , 220,
184 'Yacute' , 221,
185 'THORN' , 222,
186 'szlig' , 223,
187 'agrave' , 224,
188 'aacute' , 225,
189 'acirc' , 226,
190 'atilde' , 227,
191 'auml' , 228,
192 'aring' , 229,
193 'aelig' , 230,
194 'ccedil' , 231,
195 'egrave' , 232,
196 'eacute' , 233,
197 'ecirc' , 234,
198 'euml' , 235,
199 'igrave' , 236,
200 'iacute' , 237,
201 'icirc' , 238,
202 'iuml' , 239,
203 'eth' , 240,
204 'ntilde' , 241,
205 'ograve' , 242,
206 'oacute' , 243,
207 'ocirc' , 244,
208 'otilde' , 245,
209 'ouml' , 246,
210 'divide' , 247,
211 'oslash' , 248,
212 'ugrave' , 249,
213 'uacute' , 250,
214 'ucirc' , 251,
215 'uuml' , 252,
216 'yacute' , 253,
217 'thorn' , 254,
218 'yuml' , 255,
219
220 'fnof' , 402,
221 'Alpha' , 913,
222 'Beta' , 914,
223 'Gamma' , 915,
224 'Delta' , 916,
225 'Epsilon' , 917,
226 'Zeta' , 918,
227 'Eta' , 919,
228 'Theta' , 920,
229 'Iota' , 921,
230 'Kappa' , 922,
231 'Lambda' , 923,
232 'Mu' , 924,
233 'Nu' , 925,
234 'Xi' , 926,
235 'Omicron' , 927,
236 'Pi' , 928,
237 'Rho' , 929,
238 'Sigma' , 931,
239 'Tau' , 932,
240 'Upsilon' , 933,
241 'Phi' , 934,
242 'Chi' , 935,
243 'Psi' , 936,
244 'Omega' , 937,
245 'alpha' , 945,
246 'beta' , 946,
247 'gamma' , 947,
248 'delta' , 948,
249 'epsilon' , 949,
250 'zeta' , 950,
251 'eta' , 951,
252 'theta' , 952,
253 'iota' , 953,
254 'kappa' , 954,
255 'lambda' , 955,
256 'mu' , 956,
257 'nu' , 957,
258 'xi' , 958,
259 'omicron' , 959,
260 'pi' , 960,
261 'rho' , 961,
262 'sigmaf' , 962,
263 'sigma' , 963,
264 'tau' , 964,
265 'upsilon' , 965,
266 'phi' , 966,
267 'chi' , 967,
268 'psi' , 968,
269 'omega' , 969,
270 'thetasym' , 977,
271 'upsih' , 978,
272 'piv' , 982,
273 'bull' , 8226,
274 'hellip' , 8230,
275 'prime' , 8242,
276 'Prime' , 8243,
277 'oline' , 8254,
278 'frasl' , 8260,
279 'weierp' , 8472,
280 'image' , 8465,
281 'real' , 8476,
282 'trade' , 8482,
283 'alefsym' , 8501,
284 'larr' , 8592,
285 'uarr' , 8593,
286 'rarr' , 8594,
287 'darr' , 8595,
288 'harr' , 8596,
289 'crarr' , 8629,
290 'lArr' , 8656,
291 'uArr' , 8657,
292 'rArr' , 8658,
293 'dArr' , 8659,
294 'hArr' , 8660,
295 'forall' , 8704,
296 'part' , 8706,
297 'exist' , 8707,
298 'empty' , 8709,
299 'nabla' , 8711,
300 'isin' , 8712,
301 'notin' , 8713,
302 'ni' , 8715,
303 'prod' , 8719,
304 'sum' , 8721,
305 'minus' , 8722,
306 'lowast' , 8727,
307 'radic' , 8730,
308 'prop' , 8733,
309 'infin' , 8734,
310 'ang' , 8736,
311 'and' , 8743,
312 'or' , 8744,
313 'cap' , 8745,
314 'cup' , 8746,
315 'int' , 8747,
316 'there4' , 8756,
317 'sim' , 8764,
318 'cong' , 8773,
319 'asymp' , 8776,
320 'ne' , 8800,
321 'equiv' , 8801,
322 'le' , 8804,
323 'ge' , 8805,
324 'sub' , 8834,
325 'sup' , 8835,
326 'nsub' , 8836,
327 'sube' , 8838,
328 'supe' , 8839,
329 'oplus' , 8853,
330 'otimes' , 8855,
331 'perp' , 8869,
332 'sdot' , 8901,
333 'lceil' , 8968,
334 'rceil' , 8969,
335 'lfloor' , 8970,
336 'rfloor' , 8971,
337 'lang' , 9001,
338 'rang' , 9002,
339 'loz' , 9674,
340 'spades' , 9824,
341 'clubs' , 9827,
342 'hearts' , 9829,
343 'diams' , 9830,
344 'OElig' , 338,
345 'oelig' , 339,
346 'Scaron' , 352,
347 'scaron' , 353,
348 'Yuml' , 376,
349 'circ' , 710,
350 'tilde' , 732,
351 'ensp' , 8194,
352 'emsp' , 8195,
353 'thinsp' , 8201,
354 'zwnj' , 8204,
355 'zwj' , 8205,
356 'lrm' , 8206,
357 'rlm' , 8207,
358 'ndash' , 8211,
359 'mdash' , 8212,
360 'lsquo' , 8216,
361 'rsquo' , 8217,
362 'sbquo' , 8218,
363 'ldquo' , 8220,
364 'rdquo' , 8221,
365 'bdquo' , 8222,
366 'dagger' , 8224,
367 'Dagger' , 8225,
368 'permil' , 8240,
369 'lsaquo' , 8249,
370 'rsaquo' , 8250,
371 'euro' , 8364,
372);
373
374
375# Fill out %Name2character...
376{
377 %Name2character = ();
378 my($name, $number);
379 while( ($name, $number) = each %Name2character_number) {
380 if($] < 5.007 and $number > 255) {
381 $Name2character{$name} = $FAR_CHAR;
382 # substitute for Unicode characters, for perls
383 # that can't reliable handle them
384 } else {
385 $Name2character{$name} = chr $number;
386 # normal case
387 }
388 }
389 # So they resolve 'right' even in EBCDIC-land
390 $Name2character{'lt' } = '<';
391 $Name2character{'gt' } = '>';
392 $Name2character{'quot'} = '"';
393 $Name2character{'amp' } = '&';
394 $Name2character{'apos'} = "'";
395 $Name2character{'sol' } = '/';
396 $Name2character{'verbar'} = '|';
397}
398
399#--------------------------------------------------------------------------
400
401%Code2USASCII = (
402# mostly generated by
403# perl -e "printf qq{ \x25 3s, '\x25s',\n}, $_, chr($_) foreach (32 .. 126)"
404 32, ' ',
405 33, '!',
406 34, '"',
407 35, '#',
408 36, '$',
409 37, '%',
410 38, '&',
411 39, "'", #!
412 40, '(',
413 41, ')',
414 42, '*',
415 43, '+',
416 44, ',',
417 45, '-',
418 46, '.',
419 47, '/',
420 48, '0',
421 49, '1',
422 50, '2',
423 51, '3',
424 52, '4',
425 53, '5',
426 54, '6',
427 55, '7',
428 56, '8',
429 57, '9',
430 58, ':',
431 59, ';',
432 60, '<',
433 61, '=',
434 62, '>',
435 63, '?',
436 64, '@',
437 65, 'A',
438 66, 'B',
439 67, 'C',
440 68, 'D',
441 69, 'E',
442 70, 'F',
443 71, 'G',
444 72, 'H',
445 73, 'I',
446 74, 'J',
447 75, 'K',
448 76, 'L',
449 77, 'M',
450 78, 'N',
451 79, 'O',
452 80, 'P',
453 81, 'Q',
454 82, 'R',
455 83, 'S',
456 84, 'T',
457 85, 'U',
458 86, 'V',
459 87, 'W',
460 88, 'X',
461 89, 'Y',
462 90, 'Z',
463 91, '[',
464 92, "\\", #!
465 93, ']',
466 94, '^',
467 95, '_',
468 96, '`',
469 97, 'a',
470 98, 'b',
471 99, 'c',
472 100, 'd',
473 101, 'e',
474 102, 'f',
475 103, 'g',
476 104, 'h',
477 105, 'i',
478 106, 'j',
479 107, 'k',
480 108, 'l',
481 109, 'm',
482 110, 'n',
483 111, 'o',
484 112, 'p',
485 113, 'q',
486 114, 'r',
487 115, 's',
488 116, 't',
489 117, 'u',
490 118, 'v',
491 119, 'w',
492 120, 'x',
493 121, 'y',
494 122, 'z',
495 123, '{',
496 124, '|',
497 125, '}',
498 126, '~',
499);
500
501#--------------------------------------------------------------------------
502
503%Latin1Code_to_fallback = ();
504@Latin1Code_to_fallback{0xA0 .. 0xFF} = (
505# Copied from Text/Unidecode/x00.pm:
506
507' ', qq{!}, qq{C/}, 'PS', qq{\$?}, qq{Y=}, qq{|}, 'SS', qq{"}, qq{(c)}, 'a', qq{<<}, qq{!}, "", qq{(r)}, qq{-},
508'deg', qq{+-}, '2', '3', qq{'}, 'u', 'P', qq{*}, qq{,}, '1', 'o', qq{>>}, qq{1/4}, qq{1/2}, qq{3/4}, qq{?},
509'A', 'A', 'A', 'A', 'A', 'A', 'AE', 'C', 'E', 'E', 'E', 'E', 'I', 'I', 'I', 'I',
510'D', 'N', 'O', 'O', 'O', 'O', 'O', 'x', 'O', 'U', 'U', 'U', 'U', 'U', 'Th', 'ss',
511'a', 'a', 'a', 'a', 'a', 'a', 'ae', 'c', 'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i',
512'd', 'n', 'o', 'o', 'o', 'o', 'o', qq{/}, 'o', 'u', 'u', 'u', 'u', 'y', 'th', 'y',
513
514);
515
516{
517 # Now stuff %Latin1Char_to_fallback:
518 %Latin1Char_to_fallback = ();
519 my($k,$v);
520 while( ($k,$v) = each %Latin1Code_to_fallback) {
521 $Latin1Char_to_fallback{chr $k} = $v;
522 #print chr($k), ' => ', $v, "\n";
523 }
524}
525
526#--------------------------------------------------------------------------
5271;
528__END__
529
530=head1 NAME
531
532Pod::Escapes -- for resolving Pod EE<lt>...E<gt> sequences
533
534=head1 SYNOPSIS
535
536 use Pod::Escapes qw(e2char);
537 ...la la la, parsing POD, la la la...
538 $text = e2char($e_node->label);
539 unless(defined $text) {
540 print "Unknown E sequence \"", $e_node->label, "\"!";
541 }
542 ...else print/interpolate $text...
543
544=head1 DESCRIPTION
545
546This module provides things that are useful in decoding
547Pod EE<lt>...E<gt> sequences. Presumably, it should be used
548only by Pod parsers and/or formatters.
549
550By default, Pod::Escapes exports none of its symbols. But
551you can request any of them to be exported.
552Either request them individually, as with
553C<use Pod::Escapes qw(symbolname symbolname2...);>,
554or you can do C<use Pod::Escapes qw(:ALL);> to get all
555exportable symbols.
556
557=head1 GOODIES
558
559=over
560
561=item e2char($e_content)
562
563Given a name or number that could appear in a
564C<EE<lt>name_or_numE<gt>> sequence, this returns the string that
565it stands for. For example, C<e2char('sol')>, C<e2char('47')>,
566C<e2char('0x2F')>, and C<e2char('057')> all return "/",
567because C<EE<lt>solE<gt>>, C<EE<lt>47E<gt>>, C<EE<lt>0x2fE<gt>>,
568and C<EE<lt>057E<gt>>, all mean "/". If
569the name has no known value (as with a name of "qacute") or is
570syntactally invalid (as with a name of "1/4"), this returns undef.
571
572=item e2charnum($e_content)
573
574Given a name or number that could appear in a
575C<EE<lt>name_or_numE<gt>> sequence, this returns the number of
576the Unicode character that this stands for. For example,
577C<e2char('sol')>, C<e2char('47')>,
578C<e2char('0x2F')>, and C<e2char('057')> all return 47,
579because C<EE<lt>solE<gt>>, C<EE<lt>47E<gt>>, C<EE<lt>0x2fE<gt>>,
580and C<EE<lt>057E<gt>>, all mean "/", whose Unicode number is 47. If
581the name has no known value (as with a name of "qacute") or is
582syntactally invalid (as with a name of "1/4"), this returns undef.
583
584=item $Name2character{I<name>}
585
586Maps from names (as in C<EE<lt>I<name>E<gt>>) like "eacute" or "sol"
587to the string that each stands for. Note that this does not
588include numerics (like "64" or "x981c"). Under old Perl versions
589(before 5.7) you get a "?" in place of characters whose Unicode
590value is over 255.
591
592=item $Name2character_number{I<name>}
593
594Maps from names (as in C<EE<lt>I<name>E<gt>>) like "eacute" or "sol"
595to the Unicode value that each stands for. For example,
596C<$Name2character_number{'eacute'}> is 201, and
597C<$Name2character_number{'eacute'}> is 8364. You get the correct
598Unicode value, regardless of the version of Perl you're using --
599which differs from C<%Name2character>'s behavior under pre-5.7 Perls.
600
601Note that this hash does not
602include numerics (like "64" or "x981c").
603
604=item $Latin1Code_to_fallback{I<integer>}
605
606For numbers in the range 160 (0x00A0) to 255 (0x00FF), this maps
607from the character code for a Latin-1 character (like 233 for
608lowercase e-acute) to the US-ASCII character that best aproximates
609it (like "e"). You may find this useful if you are rendering
610POD in a format that you think deals well only with US-ASCII
611characters.
612
613=item $Latin1Char_to_fallback{I<character>}
614
615Just as above, but maps from characters (like "\xE9",
616lowercase e-acute) to characters (like "e").
617
618=item $Code2USASCII{I<integer>}
619
620This maps from US-ASCII codes (like 32) to the corresponding
621character (like space, for 32). Only characters 32 to 126 are
622defined. This is meant for use by C<e2char($x)> when it senses
623that it's running on a non-ASCII platform (where chr(32) doesn't
624get you a space -- but $Code2USASCII{32} will). It's
625documented here just in case you might find it useful.
626
627=back
628
629=head1 CAVEATS
630
631On Perl versions before 5.7, Unicode characters with a value
632over 255 (like lambda or emdash) can't be conveyed. This
633module does work under such early Perl versions, but in the
634place of each such character, you get a "?". Latin-1
635characters (characters 160-255) are unaffected.
636
637Under EBCDIC platforms, C<e2char($n)> may not always be the
638same as C<chr(e2charnum($n))>, and ditto for
639C<$Name2character{$name}> and
640C<chr($Name2character_number{$name})>.
641
642=head1 SEE ALSO
643
644L<perlpod|perlpod>
645
646L<perlpodspec|perlpodspec>
647
648L<Text::Unidecode|Text::Unidecode>
649
650=head1 COPYRIGHT AND DISCLAIMERS
651
652Copyright (c) 2001-2004 Sean M. Burke. All rights reserved.
653
654This library is free software; you can redistribute it and/or modify
655it under the same terms as Perl itself.
656
657This program is distributed in the hope that it will be useful, but
658without any warranty; without even the implied warranty of
659merchantability or fitness for a particular purpose.
660
661Portions of the data tables in this module are derived from the
662entity declarations in the W3C XHTML specification.
663
664Currently (October 2001), that's these three:
665
666 http://www.w3.org/TR/xhtml1/DTD/xhtml-lat1.ent
667 http://www.w3.org/TR/xhtml1/DTD/xhtml-special.ent
668 http://www.w3.org/TR/xhtml1/DTD/xhtml-symbol.ent
669
670=head1 AUTHOR
671
672Sean M. Burke C<sburke@cpan.org>
673
674=cut
675
676#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
677# What I used for reading the XHTML .ent files:
678
679use strict;
680my(@norms, @good, @bad);
681my $dir = 'c:/sgml/docbook/';
682my %escapes;
683foreach my $file (qw(
684 xhtml-symbol.ent
685 xhtml-lat1.ent
686 xhtml-special.ent
687)) {
688 open(IN, "<$dir$file") or die "can't read-open $dir$file: $!";
689 print "Reading $file...\n";
690 while(<IN>) {
691 if(m/<!ENTITY\s+(\S+)\s+"&#([^;]+);">/) {
692 my($name, $value) = ($1,$2);
693 next if $name eq 'quot' or $name eq 'apos' or $name eq 'gt';
694
695 $value = hex $1 if $value =~ m/^x([a-fA-F0-9]+)$/s;
696 print "ILLEGAL VALUE $value" unless $value =~ m/^\d+$/s;
697 if($value > 255) {
698 push @good , sprintf " %-10s , chr(%s),\n", "'$name'", $value;
699 push @bad , sprintf " %-10s , \$bad,\n", "'$name'", $value;
700 } else {
701 push @norms, sprintf " %-10s , chr(%s),\n", "'$name'", $value;
702 }
703 } elsif(m/<!ENT/) {
704 print "# Skipping $_";
705 }
706
707 }
708 close(IN);
709}
710
711print @norms;
712print "\n ( \$] .= 5.006001 ? (\n";
713print @good;
714print " ) : (\n";
715print @bad;
716print " )\n);\n";
717
718__END__
719#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
720
721