Commit | Line | Data |
---|---|---|
d7fcd4ce SP |
1 | |
2 | require 5; | |
3 | # The documentation is at the end. | |
4 | # Time-stamp: "2004-05-07 15:31:25 ADT" | |
5 | package Pod::Escapes; | |
6 | require Exporter; | |
7 | @ISA = ('Exporter'); | |
8 | $VERSION = '1.04'; | |
9 | @EXPORT_OK = qw( | |
10 | %Code2USASCII | |
11 | %Name2character | |
12 | %Name2character_number | |
13 | %Latin1Code_to_fallback | |
14 | %Latin1Char_to_fallback | |
15 | e2char | |
16 | e2charnum | |
17 | ); | |
18 | %EXPORT_TAGS = ('ALL' => \@EXPORT_OK); | |
19 | ||
20 | #========================================================================== | |
21 | ||
22 | use strict; | |
23 | use vars qw( | |
24 | %Code2USASCII | |
25 | %Name2character | |
26 | %Name2character_number | |
27 | %Latin1Code_to_fallback | |
28 | %Latin1Char_to_fallback | |
29 | $FAR_CHAR | |
30 | $FAR_CHAR_NUMBER | |
31 | $NOT_ASCII | |
32 | ); | |
33 | ||
34 | $FAR_CHAR = "?" unless defined $FAR_CHAR; | |
35 | $FAR_CHAR_NUMBER = ord($FAR_CHAR) unless defined $FAR_CHAR_NUMBER; | |
36 | ||
37 | $NOT_ASCII = 'A' ne chr(65) unless defined $NOT_ASCII; | |
38 | ||
39 | #-------------------------------------------------------------------------- | |
40 | sub e2char { | |
41 | my $in = $_[0]; | |
42 | return undef unless defined $in and length $in; | |
43 | ||
44 | # Convert to decimal: | |
45 | if($in =~ m/^(0[0-7]*)$/s ) { | |
46 | $in = oct $in; | |
47 | } elsif($in =~ m/^0?x([0-9a-fA-F]+)$/s ) { | |
48 | $in = hex $1; | |
49 | } # else it's decimal, or named | |
50 | ||
51 | if($NOT_ASCII) { | |
52 | # We're in bizarro world of not-ASCII! | |
53 | # Cope with US-ASCII codes, use fallbacks for Latin-1, or use FAR_CHAR. | |
54 | unless($in =~ m/^\d+$/s) { | |
55 | # It's a named character reference. Get its numeric Unicode value. | |
56 | $in = $Name2character{$in}; | |
57 | return undef unless defined $in; # (if there's no such name) | |
58 | $in = ord $in; # (All ents must be one character long.) | |
59 | # ...So $in holds the char's US-ASCII numeric value, which we'll | |
60 | # now go get the local equivalent for. | |
61 | } | |
62 | ||
63 | # It's numeric, whether by origin or by mutation from a known name | |
64 | return $Code2USASCII{$in} # so "65" => "A" everywhere | |
65 | || $Latin1Code_to_fallback{$in} # Fallback. | |
66 | || $FAR_CHAR; # Fall further back | |
67 | } | |
68 | ||
69 | # Normal handling: | |
70 | if($in =~ m/^\d+$/s) { | |
71 | if($] < 5.007 and $in > 255) { # can't be trusted with Unicode | |
72 | return $FAR_CHAR; | |
73 | } else { | |
74 | return chr($in); | |
75 | } | |
76 | } else { | |
77 | return $Name2character{$in}; # returns undef if unknown | |
78 | } | |
79 | } | |
80 | ||
81 | #-------------------------------------------------------------------------- | |
82 | sub e2charnum { | |
83 | my $in = $_[0]; | |
84 | return undef unless defined $in and length $in; | |
85 | ||
86 | # Convert to decimal: | |
87 | if($in =~ m/^(0[0-7]*)$/s ) { | |
88 | $in = oct $in; | |
89 | } elsif($in =~ m/^0?x([0-9a-fA-F]+)$/s ) { | |
90 | $in = hex $1; | |
91 | } # else it's decimal, or named | |
92 | ||
93 | if($in =~ m/^\d+$/s) { | |
94 | return 0 + $in; | |
95 | } else { | |
96 | return $Name2character_number{$in}; # returns undef if unknown | |
97 | } | |
98 | } | |
99 | ||
100 | #-------------------------------------------------------------------------- | |
101 | ||
102 | %Name2character_number = ( | |
103 | # General XML/XHTML: | |
104 | 'lt' => 60, | |
105 | 'gt' => 62, | |
106 | 'quot' => 34, | |
107 | 'amp' => 38, | |
108 | 'apos' => 39, | |
109 | ||
110 | # POD-specific: | |
111 | 'sol' => 47, | |
112 | 'verbar' => 124, | |
113 | ||
114 | 'lchevron' => 171, # legacy for laquo | |
115 | 'rchevron' => 187, # legacy for raquo | |
116 | ||
117 | # Remember, grave looks like \ (as in virtu\) | |
118 | # acute looks like / (as in re/sume/) | |
119 | # circumflex looks like ^ (as in papier ma^che/) | |
120 | # umlaut/dieresis looks like " (as in nai"ve, Chloe") | |
121 | ||
122 | # From the XHTML 1 .ent files: | |
123 | 'nbsp' , 160, | |
124 | 'iexcl' , 161, | |
125 | 'cent' , 162, | |
126 | 'pound' , 163, | |
127 | 'curren' , 164, | |
128 | 'yen' , 165, | |
129 | 'brvbar' , 166, | |
130 | 'sect' , 167, | |
131 | 'uml' , 168, | |
132 | 'copy' , 169, | |
133 | 'ordf' , 170, | |
134 | 'laquo' , 171, | |
135 | 'not' , 172, | |
136 | 'shy' , 173, | |
137 | 'reg' , 174, | |
138 | 'macr' , 175, | |
139 | 'deg' , 176, | |
140 | 'plusmn' , 177, | |
141 | 'sup2' , 178, | |
142 | 'sup3' , 179, | |
143 | 'acute' , 180, | |
144 | 'micro' , 181, | |
145 | 'para' , 182, | |
146 | 'middot' , 183, | |
147 | 'cedil' , 184, | |
148 | 'sup1' , 185, | |
149 | 'ordm' , 186, | |
150 | 'raquo' , 187, | |
151 | 'frac14' , 188, | |
152 | 'frac12' , 189, | |
153 | 'frac34' , 190, | |
154 | 'iquest' , 191, | |
155 | 'Agrave' , 192, | |
156 | 'Aacute' , 193, | |
157 | 'Acirc' , 194, | |
158 | 'Atilde' , 195, | |
159 | 'Auml' , 196, | |
160 | 'Aring' , 197, | |
161 | 'AElig' , 198, | |
162 | 'Ccedil' , 199, | |
163 | 'Egrave' , 200, | |
164 | 'Eacute' , 201, | |
165 | 'Ecirc' , 202, | |
166 | 'Euml' , 203, | |
167 | 'Igrave' , 204, | |
168 | 'Iacute' , 205, | |
169 | 'Icirc' , 206, | |
170 | 'Iuml' , 207, | |
171 | 'ETH' , 208, | |
172 | 'Ntilde' , 209, | |
173 | 'Ograve' , 210, | |
174 | 'Oacute' , 211, | |
175 | 'Ocirc' , 212, | |
176 | 'Otilde' , 213, | |
177 | 'Ouml' , 214, | |
178 | 'times' , 215, | |
179 | 'Oslash' , 216, | |
180 | 'Ugrave' , 217, | |
181 | 'Uacute' , 218, | |
182 | 'Ucirc' , 219, | |
183 | 'Uuml' , 220, | |
184 | 'Yacute' , 221, | |
185 | 'THORN' , 222, | |
186 | 'szlig' , 223, | |
187 | 'agrave' , 224, | |
188 | 'aacute' , 225, | |
189 | 'acirc' , 226, | |
190 | 'atilde' , 227, | |
191 | 'auml' , 228, | |
192 | 'aring' , 229, | |
193 | 'aelig' , 230, | |
194 | 'ccedil' , 231, | |
195 | 'egrave' , 232, | |
196 | 'eacute' , 233, | |
197 | 'ecirc' , 234, | |
198 | 'euml' , 235, | |
199 | 'igrave' , 236, | |
200 | 'iacute' , 237, | |
201 | 'icirc' , 238, | |
202 | 'iuml' , 239, | |
203 | 'eth' , 240, | |
204 | 'ntilde' , 241, | |
205 | 'ograve' , 242, | |
206 | 'oacute' , 243, | |
207 | 'ocirc' , 244, | |
208 | 'otilde' , 245, | |
209 | 'ouml' , 246, | |
210 | 'divide' , 247, | |
211 | 'oslash' , 248, | |
212 | 'ugrave' , 249, | |
213 | 'uacute' , 250, | |
214 | 'ucirc' , 251, | |
215 | 'uuml' , 252, | |
216 | 'yacute' , 253, | |
217 | 'thorn' , 254, | |
218 | 'yuml' , 255, | |
219 | ||
220 | 'fnof' , 402, | |
221 | 'Alpha' , 913, | |
222 | 'Beta' , 914, | |
223 | 'Gamma' , 915, | |
224 | 'Delta' , 916, | |
225 | 'Epsilon' , 917, | |
226 | 'Zeta' , 918, | |
227 | 'Eta' , 919, | |
228 | 'Theta' , 920, | |
229 | 'Iota' , 921, | |
230 | 'Kappa' , 922, | |
231 | 'Lambda' , 923, | |
232 | 'Mu' , 924, | |
233 | 'Nu' , 925, | |
234 | 'Xi' , 926, | |
235 | 'Omicron' , 927, | |
236 | 'Pi' , 928, | |
237 | 'Rho' , 929, | |
238 | 'Sigma' , 931, | |
239 | 'Tau' , 932, | |
240 | 'Upsilon' , 933, | |
241 | 'Phi' , 934, | |
242 | 'Chi' , 935, | |
243 | 'Psi' , 936, | |
244 | 'Omega' , 937, | |
245 | 'alpha' , 945, | |
246 | 'beta' , 946, | |
247 | 'gamma' , 947, | |
248 | 'delta' , 948, | |
249 | 'epsilon' , 949, | |
250 | 'zeta' , 950, | |
251 | 'eta' , 951, | |
252 | 'theta' , 952, | |
253 | 'iota' , 953, | |
254 | 'kappa' , 954, | |
255 | 'lambda' , 955, | |
256 | 'mu' , 956, | |
257 | 'nu' , 957, | |
258 | 'xi' , 958, | |
259 | 'omicron' , 959, | |
260 | 'pi' , 960, | |
261 | 'rho' , 961, | |
262 | 'sigmaf' , 962, | |
263 | 'sigma' , 963, | |
264 | 'tau' , 964, | |
265 | 'upsilon' , 965, | |
266 | 'phi' , 966, | |
267 | 'chi' , 967, | |
268 | 'psi' , 968, | |
269 | 'omega' , 969, | |
270 | 'thetasym' , 977, | |
271 | 'upsih' , 978, | |
272 | 'piv' , 982, | |
273 | 'bull' , 8226, | |
274 | 'hellip' , 8230, | |
275 | 'prime' , 8242, | |
276 | 'Prime' , 8243, | |
277 | 'oline' , 8254, | |
278 | 'frasl' , 8260, | |
279 | 'weierp' , 8472, | |
280 | 'image' , 8465, | |
281 | 'real' , 8476, | |
282 | 'trade' , 8482, | |
283 | 'alefsym' , 8501, | |
284 | 'larr' , 8592, | |
285 | 'uarr' , 8593, | |
286 | 'rarr' , 8594, | |
287 | 'darr' , 8595, | |
288 | 'harr' , 8596, | |
289 | 'crarr' , 8629, | |
290 | 'lArr' , 8656, | |
291 | 'uArr' , 8657, | |
292 | 'rArr' , 8658, | |
293 | 'dArr' , 8659, | |
294 | 'hArr' , 8660, | |
295 | 'forall' , 8704, | |
296 | 'part' , 8706, | |
297 | 'exist' , 8707, | |
298 | 'empty' , 8709, | |
299 | 'nabla' , 8711, | |
300 | 'isin' , 8712, | |
301 | 'notin' , 8713, | |
302 | 'ni' , 8715, | |
303 | 'prod' , 8719, | |
304 | 'sum' , 8721, | |
305 | 'minus' , 8722, | |
306 | 'lowast' , 8727, | |
307 | 'radic' , 8730, | |
308 | 'prop' , 8733, | |
309 | 'infin' , 8734, | |
310 | 'ang' , 8736, | |
311 | 'and' , 8743, | |
312 | 'or' , 8744, | |
313 | 'cap' , 8745, | |
314 | 'cup' , 8746, | |
315 | 'int' , 8747, | |
316 | 'there4' , 8756, | |
317 | 'sim' , 8764, | |
318 | 'cong' , 8773, | |
319 | 'asymp' , 8776, | |
320 | 'ne' , 8800, | |
321 | 'equiv' , 8801, | |
322 | 'le' , 8804, | |
323 | 'ge' , 8805, | |
324 | 'sub' , 8834, | |
325 | 'sup' , 8835, | |
326 | 'nsub' , 8836, | |
327 | 'sube' , 8838, | |
328 | 'supe' , 8839, | |
329 | 'oplus' , 8853, | |
330 | 'otimes' , 8855, | |
331 | 'perp' , 8869, | |
332 | 'sdot' , 8901, | |
333 | 'lceil' , 8968, | |
334 | 'rceil' , 8969, | |
335 | 'lfloor' , 8970, | |
336 | 'rfloor' , 8971, | |
337 | 'lang' , 9001, | |
338 | 'rang' , 9002, | |
339 | 'loz' , 9674, | |
340 | 'spades' , 9824, | |
341 | 'clubs' , 9827, | |
342 | 'hearts' , 9829, | |
343 | 'diams' , 9830, | |
344 | 'OElig' , 338, | |
345 | 'oelig' , 339, | |
346 | 'Scaron' , 352, | |
347 | 'scaron' , 353, | |
348 | 'Yuml' , 376, | |
349 | 'circ' , 710, | |
350 | 'tilde' , 732, | |
351 | 'ensp' , 8194, | |
352 | 'emsp' , 8195, | |
353 | 'thinsp' , 8201, | |
354 | 'zwnj' , 8204, | |
355 | 'zwj' , 8205, | |
356 | 'lrm' , 8206, | |
357 | 'rlm' , 8207, | |
358 | 'ndash' , 8211, | |
359 | 'mdash' , 8212, | |
360 | 'lsquo' , 8216, | |
361 | 'rsquo' , 8217, | |
362 | 'sbquo' , 8218, | |
363 | 'ldquo' , 8220, | |
364 | 'rdquo' , 8221, | |
365 | 'bdquo' , 8222, | |
366 | 'dagger' , 8224, | |
367 | 'Dagger' , 8225, | |
368 | 'permil' , 8240, | |
369 | 'lsaquo' , 8249, | |
370 | 'rsaquo' , 8250, | |
371 | 'euro' , 8364, | |
372 | ); | |
373 | ||
374 | ||
375 | # Fill out %Name2character... | |
376 | { | |
377 | %Name2character = (); | |
378 | my($name, $number); | |
379 | while( ($name, $number) = each %Name2character_number) { | |
380 | if($] < 5.007 and $number > 255) { | |
381 | $Name2character{$name} = $FAR_CHAR; | |
382 | # substitute for Unicode characters, for perls | |
383 | # that can't reliable handle them | |
384 | } else { | |
385 | $Name2character{$name} = chr $number; | |
386 | # normal case | |
387 | } | |
388 | } | |
389 | # So they resolve 'right' even in EBCDIC-land | |
390 | $Name2character{'lt' } = '<'; | |
391 | $Name2character{'gt' } = '>'; | |
392 | $Name2character{'quot'} = '"'; | |
393 | $Name2character{'amp' } = '&'; | |
394 | $Name2character{'apos'} = "'"; | |
395 | $Name2character{'sol' } = '/'; | |
396 | $Name2character{'verbar'} = '|'; | |
397 | } | |
398 | ||
399 | #-------------------------------------------------------------------------- | |
400 | ||
401 | %Code2USASCII = ( | |
402 | # mostly generated by | |
403 | # perl -e "printf qq{ \x25 3s, '\x25s',\n}, $_, chr($_) foreach (32 .. 126)" | |
404 | 32, ' ', | |
405 | 33, '!', | |
406 | 34, '"', | |
407 | 35, '#', | |
408 | 36, '$', | |
409 | 37, '%', | |
410 | 38, '&', | |
411 | 39, "'", #! | |
412 | 40, '(', | |
413 | 41, ')', | |
414 | 42, '*', | |
415 | 43, '+', | |
416 | 44, ',', | |
417 | 45, '-', | |
418 | 46, '.', | |
419 | 47, '/', | |
420 | 48, '0', | |
421 | 49, '1', | |
422 | 50, '2', | |
423 | 51, '3', | |
424 | 52, '4', | |
425 | 53, '5', | |
426 | 54, '6', | |
427 | 55, '7', | |
428 | 56, '8', | |
429 | 57, '9', | |
430 | 58, ':', | |
431 | 59, ';', | |
432 | 60, '<', | |
433 | 61, '=', | |
434 | 62, '>', | |
435 | 63, '?', | |
436 | 64, '@', | |
437 | 65, 'A', | |
438 | 66, 'B', | |
439 | 67, 'C', | |
440 | 68, 'D', | |
441 | 69, 'E', | |
442 | 70, 'F', | |
443 | 71, 'G', | |
444 | 72, 'H', | |
445 | 73, 'I', | |
446 | 74, 'J', | |
447 | 75, 'K', | |
448 | 76, 'L', | |
449 | 77, 'M', | |
450 | 78, 'N', | |
451 | 79, 'O', | |
452 | 80, 'P', | |
453 | 81, 'Q', | |
454 | 82, 'R', | |
455 | 83, 'S', | |
456 | 84, 'T', | |
457 | 85, 'U', | |
458 | 86, 'V', | |
459 | 87, 'W', | |
460 | 88, 'X', | |
461 | 89, 'Y', | |
462 | 90, 'Z', | |
463 | 91, '[', | |
464 | 92, "\\", #! | |
465 | 93, ']', | |
466 | 94, '^', | |
467 | 95, '_', | |
468 | 96, '`', | |
469 | 97, 'a', | |
470 | 98, 'b', | |
471 | 99, 'c', | |
472 | 100, 'd', | |
473 | 101, 'e', | |
474 | 102, 'f', | |
475 | 103, 'g', | |
476 | 104, 'h', | |
477 | 105, 'i', | |
478 | 106, 'j', | |
479 | 107, 'k', | |
480 | 108, 'l', | |
481 | 109, 'm', | |
482 | 110, 'n', | |
483 | 111, 'o', | |
484 | 112, 'p', | |
485 | 113, 'q', | |
486 | 114, 'r', | |
487 | 115, 's', | |
488 | 116, 't', | |
489 | 117, 'u', | |
490 | 118, 'v', | |
491 | 119, 'w', | |
492 | 120, 'x', | |
493 | 121, 'y', | |
494 | 122, 'z', | |
495 | 123, '{', | |
496 | 124, '|', | |
497 | 125, '}', | |
498 | 126, '~', | |
499 | ); | |
500 | ||
501 | #-------------------------------------------------------------------------- | |
502 | ||
503 | %Latin1Code_to_fallback = (); | |
504 | @Latin1Code_to_fallback{0xA0 .. 0xFF} = ( | |
505 | # Copied from Text/Unidecode/x00.pm: | |
506 | ||
507 | ' ', qq{!}, qq{C/}, 'PS', qq{\$?}, qq{Y=}, qq{|}, 'SS', qq{"}, qq{(c)}, 'a', qq{<<}, qq{!}, "", qq{(r)}, qq{-}, | |
508 | 'deg', qq{+-}, '2', '3', qq{'}, 'u', 'P', qq{*}, qq{,}, '1', 'o', qq{>>}, qq{1/4}, qq{1/2}, qq{3/4}, qq{?}, | |
509 | 'A', 'A', 'A', 'A', 'A', 'A', 'AE', 'C', 'E', 'E', 'E', 'E', 'I', 'I', 'I', 'I', | |
510 | 'D', 'N', 'O', 'O', 'O', 'O', 'O', 'x', 'O', 'U', 'U', 'U', 'U', 'U', 'Th', 'ss', | |
511 | 'a', 'a', 'a', 'a', 'a', 'a', 'ae', 'c', 'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i', | |
512 | 'd', 'n', 'o', 'o', 'o', 'o', 'o', qq{/}, 'o', 'u', 'u', 'u', 'u', 'y', 'th', 'y', | |
513 | ||
514 | ); | |
515 | ||
516 | { | |
517 | # Now stuff %Latin1Char_to_fallback: | |
518 | %Latin1Char_to_fallback = (); | |
519 | my($k,$v); | |
520 | while( ($k,$v) = each %Latin1Code_to_fallback) { | |
521 | $Latin1Char_to_fallback{chr $k} = $v; | |
522 | #print chr($k), ' => ', $v, "\n"; | |
523 | } | |
524 | } | |
525 | ||
526 | #-------------------------------------------------------------------------- | |
527 | 1; | |
528 | __END__ | |
529 | ||
530 | =head1 NAME | |
531 | ||
532 | Pod::Escapes -- for resolving Pod EE<lt>...E<gt> sequences | |
533 | ||
534 | =head1 SYNOPSIS | |
535 | ||
536 | use Pod::Escapes qw(e2char); | |
537 | ...la la la, parsing POD, la la la... | |
538 | $text = e2char($e_node->label); | |
539 | unless(defined $text) { | |
540 | print "Unknown E sequence \"", $e_node->label, "\"!"; | |
541 | } | |
542 | ...else print/interpolate $text... | |
543 | ||
544 | =head1 DESCRIPTION | |
545 | ||
546 | This module provides things that are useful in decoding | |
547 | Pod EE<lt>...E<gt> sequences. Presumably, it should be used | |
548 | only by Pod parsers and/or formatters. | |
549 | ||
550 | By default, Pod::Escapes exports none of its symbols. But | |
551 | you can request any of them to be exported. | |
552 | Either request them individually, as with | |
553 | C<use Pod::Escapes qw(symbolname symbolname2...);>, | |
554 | or you can do C<use Pod::Escapes qw(:ALL);> to get all | |
555 | exportable symbols. | |
556 | ||
557 | =head1 GOODIES | |
558 | ||
559 | =over | |
560 | ||
561 | =item e2char($e_content) | |
562 | ||
563 | Given a name or number that could appear in a | |
564 | C<EE<lt>name_or_numE<gt>> sequence, this returns the string that | |
565 | it stands for. For example, C<e2char('sol')>, C<e2char('47')>, | |
566 | C<e2char('0x2F')>, and C<e2char('057')> all return "/", | |
567 | because C<EE<lt>solE<gt>>, C<EE<lt>47E<gt>>, C<EE<lt>0x2fE<gt>>, | |
568 | and C<EE<lt>057E<gt>>, all mean "/". If | |
569 | the name has no known value (as with a name of "qacute") or is | |
570 | syntactally invalid (as with a name of "1/4"), this returns undef. | |
571 | ||
572 | =item e2charnum($e_content) | |
573 | ||
574 | Given a name or number that could appear in a | |
575 | C<EE<lt>name_or_numE<gt>> sequence, this returns the number of | |
576 | the Unicode character that this stands for. For example, | |
577 | C<e2char('sol')>, C<e2char('47')>, | |
578 | C<e2char('0x2F')>, and C<e2char('057')> all return 47, | |
579 | because C<EE<lt>solE<gt>>, C<EE<lt>47E<gt>>, C<EE<lt>0x2fE<gt>>, | |
580 | and C<EE<lt>057E<gt>>, all mean "/", whose Unicode number is 47. If | |
581 | the name has no known value (as with a name of "qacute") or is | |
582 | syntactally invalid (as with a name of "1/4"), this returns undef. | |
583 | ||
584 | =item $Name2character{I<name>} | |
585 | ||
586 | Maps from names (as in C<EE<lt>I<name>E<gt>>) like "eacute" or "sol" | |
587 | to the string that each stands for. Note that this does not | |
588 | include numerics (like "64" or "x981c"). Under old Perl versions | |
589 | (before 5.7) you get a "?" in place of characters whose Unicode | |
590 | value is over 255. | |
591 | ||
592 | =item $Name2character_number{I<name>} | |
593 | ||
594 | Maps from names (as in C<EE<lt>I<name>E<gt>>) like "eacute" or "sol" | |
595 | to the Unicode value that each stands for. For example, | |
596 | C<$Name2character_number{'eacute'}> is 201, and | |
597 | C<$Name2character_number{'eacute'}> is 8364. You get the correct | |
598 | Unicode value, regardless of the version of Perl you're using -- | |
599 | which differs from C<%Name2character>'s behavior under pre-5.7 Perls. | |
600 | ||
601 | Note that this hash does not | |
602 | include numerics (like "64" or "x981c"). | |
603 | ||
604 | =item $Latin1Code_to_fallback{I<integer>} | |
605 | ||
606 | For numbers in the range 160 (0x00A0) to 255 (0x00FF), this maps | |
607 | from the character code for a Latin-1 character (like 233 for | |
608 | lowercase e-acute) to the US-ASCII character that best aproximates | |
609 | it (like "e"). You may find this useful if you are rendering | |
610 | POD in a format that you think deals well only with US-ASCII | |
611 | characters. | |
612 | ||
613 | =item $Latin1Char_to_fallback{I<character>} | |
614 | ||
615 | Just as above, but maps from characters (like "\xE9", | |
616 | lowercase e-acute) to characters (like "e"). | |
617 | ||
618 | =item $Code2USASCII{I<integer>} | |
619 | ||
620 | This maps from US-ASCII codes (like 32) to the corresponding | |
621 | character (like space, for 32). Only characters 32 to 126 are | |
622 | defined. This is meant for use by C<e2char($x)> when it senses | |
623 | that it's running on a non-ASCII platform (where chr(32) doesn't | |
624 | get you a space -- but $Code2USASCII{32} will). It's | |
625 | documented here just in case you might find it useful. | |
626 | ||
627 | =back | |
628 | ||
629 | =head1 CAVEATS | |
630 | ||
631 | On Perl versions before 5.7, Unicode characters with a value | |
632 | over 255 (like lambda or emdash) can't be conveyed. This | |
633 | module does work under such early Perl versions, but in the | |
634 | place of each such character, you get a "?". Latin-1 | |
635 | characters (characters 160-255) are unaffected. | |
636 | ||
637 | Under EBCDIC platforms, C<e2char($n)> may not always be the | |
638 | same as C<chr(e2charnum($n))>, and ditto for | |
639 | C<$Name2character{$name}> and | |
640 | C<chr($Name2character_number{$name})>. | |
641 | ||
642 | =head1 SEE ALSO | |
643 | ||
644 | L<perlpod|perlpod> | |
645 | ||
646 | L<perlpodspec|perlpodspec> | |
647 | ||
648 | L<Text::Unidecode|Text::Unidecode> | |
649 | ||
650 | =head1 COPYRIGHT AND DISCLAIMERS | |
651 | ||
652 | Copyright (c) 2001-2004 Sean M. Burke. All rights reserved. | |
653 | ||
654 | This library is free software; you can redistribute it and/or modify | |
655 | it under the same terms as Perl itself. | |
656 | ||
657 | This program is distributed in the hope that it will be useful, but | |
658 | without any warranty; without even the implied warranty of | |
659 | merchantability or fitness for a particular purpose. | |
660 | ||
661 | Portions of the data tables in this module are derived from the | |
662 | entity declarations in the W3C XHTML specification. | |
663 | ||
664 | Currently (October 2001), that's these three: | |
665 | ||
666 | http://www.w3.org/TR/xhtml1/DTD/xhtml-lat1.ent | |
667 | http://www.w3.org/TR/xhtml1/DTD/xhtml-special.ent | |
668 | http://www.w3.org/TR/xhtml1/DTD/xhtml-symbol.ent | |
669 | ||
670 | =head1 AUTHOR | |
671 | ||
672 | Sean M. Burke C<sburke@cpan.org> | |
673 | ||
674 | =cut | |
675 | ||
676 | #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | |
677 | # What I used for reading the XHTML .ent files: | |
678 | ||
679 | use strict; | |
680 | my(@norms, @good, @bad); | |
681 | my $dir = 'c:/sgml/docbook/'; | |
682 | my %escapes; | |
683 | foreach my $file (qw( | |
684 | xhtml-symbol.ent | |
685 | xhtml-lat1.ent | |
686 | xhtml-special.ent | |
687 | )) { | |
688 | open(IN, "<$dir$file") or die "can't read-open $dir$file: $!"; | |
689 | print "Reading $file...\n"; | |
690 | while(<IN>) { | |
691 | if(m/<!ENTITY\s+(\S+)\s+"&#([^;]+);">/) { | |
692 | my($name, $value) = ($1,$2); | |
693 | next if $name eq 'quot' or $name eq 'apos' or $name eq 'gt'; | |
694 | ||
695 | $value = hex $1 if $value =~ m/^x([a-fA-F0-9]+)$/s; | |
696 | print "ILLEGAL VALUE $value" unless $value =~ m/^\d+$/s; | |
697 | if($value > 255) { | |
698 | push @good , sprintf " %-10s , chr(%s),\n", "'$name'", $value; | |
699 | push @bad , sprintf " %-10s , \$bad,\n", "'$name'", $value; | |
700 | } else { | |
701 | push @norms, sprintf " %-10s , chr(%s),\n", "'$name'", $value; | |
702 | } | |
703 | } elsif(m/<!ENT/) { | |
704 | print "# Skipping $_"; | |
705 | } | |
706 | ||
707 | } | |
708 | close(IN); | |
709 | } | |
710 | ||
711 | print @norms; | |
712 | print "\n ( \$] .= 5.006001 ? (\n"; | |
713 | print @good; | |
714 | print " ) : (\n"; | |
715 | print @bad; | |
716 | print " )\n);\n"; | |
717 | ||
718 | __END__ | |
719 | #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | |
720 | ||
721 |