This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Add clarifying comments to charnames.t
[perl5.git] / lib / charnames.pm
CommitLineData
423cee85 1package charnames;
b177ca84
JF
2use strict;
3use warnings;
51cf30b6 4use File::Spec;
16036bcd 5our $VERSION = '1.09';
b75c8c73 6
d5448623 7use bytes (); # for $bytes::hint_bits
423cee85 8
# Extra names accepted in addition to those in the Unicode name table.
# Each key maps straight to a code point; charnames() consults this table
# after the user aliases and uses the value without issuing any warning.
my %alias1 = (
    # Icky 3.2 names with parentheses.
    'LINE FEED'               => 0x0A, # LINE FEED (LF)
    'FORM FEED'               => 0x0C, # FORM FEED (FF)
    'CARRIAGE RETURN'         => 0x0D, # CARRIAGE RETURN (CR)
    'NEXT LINE'               => 0x85, # NEXT LINE (NEL)

    # Some variant names from Wikipedia
    'SINGLE-SHIFT 2'          => 0x8E,
    'SINGLE-SHIFT 3'          => 0x8F,
    'PRIVATE USE 1'           => 0x91,
    'PRIVATE USE 2'           => 0x92,
    'START OF PROTECTED AREA' => 0x96,
    'END OF PROTECTED AREA'   => 0x97,

    # Convenience.  Standard abbreviations for the controls
    'NUL' => 0x00, # NULL
    'SOH' => 0x01, # START OF HEADING
    'STX' => 0x02, # START OF TEXT
    'ETX' => 0x03, # END OF TEXT
    'EOT' => 0x04, # END OF TRANSMISSION
    'ENQ' => 0x05, # ENQUIRY
    'ACK' => 0x06, # ACKNOWLEDGE
    'BEL' => 0x07, # BELL
    'BS'  => 0x08, # BACKSPACE
    'HT'  => 0x09, # HORIZONTAL TABULATION
    'LF'  => 0x0A, # LINE FEED (LF)
    'VT'  => 0x0B, # VERTICAL TABULATION
    'FF'  => 0x0C, # FORM FEED (FF)
    'CR'  => 0x0D, # CARRIAGE RETURN (CR)
    'SO'  => 0x0E, # SHIFT OUT
    'SI'  => 0x0F, # SHIFT IN
    'DLE' => 0x10, # DATA LINK ESCAPE
    'DC1' => 0x11, # DEVICE CONTROL ONE
    'DC2' => 0x12, # DEVICE CONTROL TWO
    'DC3' => 0x13, # DEVICE CONTROL THREE
    'DC4' => 0x14, # DEVICE CONTROL FOUR
    'NAK' => 0x15, # NEGATIVE ACKNOWLEDGE
    'SYN' => 0x16, # SYNCHRONOUS IDLE
    'ETB' => 0x17, # END OF TRANSMISSION BLOCK
    'CAN' => 0x18, # CANCEL
    'EOM' => 0x19, # END OF MEDIUM
    'SUB' => 0x1A, # SUBSTITUTE
    'ESC' => 0x1B, # ESCAPE
    'FS'  => 0x1C, # FILE SEPARATOR
    'GS'  => 0x1D, # GROUP SEPARATOR
    'RS'  => 0x1E, # RECORD SEPARATOR
    'US'  => 0x1F, # UNIT SEPARATOR
    'DEL' => 0x7F, # DELETE
    'BPH' => 0x82, # BREAK PERMITTED HERE
    'NBH' => 0x83, # NO BREAK HERE
    'NEL' => 0x85, # NEXT LINE (NEL)
    'SSA' => 0x86, # START OF SELECTED AREA
    'ESA' => 0x87, # END OF SELECTED AREA
    'HTS' => 0x88, # CHARACTER TABULATION SET
    'HTJ' => 0x89, # CHARACTER TABULATION WITH JUSTIFICATION
    'VTS' => 0x8A, # LINE TABULATION SET
    'PLD' => 0x8B, # PARTIAL LINE FORWARD
    'PLU' => 0x8C, # PARTIAL LINE BACKWARD
    # The two-letter abbreviations below must not carry a trailing space
    # inside the quotes, or the name can never be matched.
    'RI'  => 0x8D, # REVERSE LINE FEED
    'SS2' => 0x8E, # SINGLE SHIFT TWO
    'SS3' => 0x8F, # SINGLE SHIFT THREE
    'DCS' => 0x90, # DEVICE CONTROL STRING
    'PU1' => 0x91, # PRIVATE USE ONE
    'PU2' => 0x92, # PRIVATE USE TWO
    'STS' => 0x93, # SET TRANSMIT STATE
    'CCH' => 0x94, # CANCEL CHARACTER
    'MW'  => 0x95, # MESSAGE WAITING
    'SPA' => 0x96, # START OF GUARDED AREA
    'EPA' => 0x97, # END OF GUARDED AREA
    'SOS' => 0x98, # START OF STRING
    'SCI' => 0x9A, # SINGLE CHARACTER INTRODUCER
    'CSI' => 0x9B, # CONTROL SEQUENCE INTRODUCER
    'ST'  => 0x9C, # STRING TERMINATOR
    'OSC' => 0x9D, # OPERATING SYSTEM COMMAND
    'PM'  => 0x9E, # PRIVACY MESSAGE
    'APC' => 0x9F, # APPLICATION PROGRAM COMMAND

    # There are no names for these in the Unicode standard;
    # perhaps should be deprecated, but then again there are
    # no alternative names, so am not deprecating.  And if
    # did, the code would have to change to not recommend an
    # alternative for these.
    'PADDING CHARACTER'                   => 0x80,
    'PAD'                                 => 0x80,
    'HIGH OCTET PRESET'                   => 0x81,
    'HOP'                                 => 0x81,
    'INDEX'                               => 0x84,
    'IND'                                 => 0x84,
    'SINGLE GRAPHIC CHARACTER INTRODUCER' => 0x99,
    'SGC'                                 => 0x99,

    # More convenience.  (For further convenience, it is suggested some
    # way of using the NamesList aliases be implemented, but there are
    # ambiguities in NamesList.txt.)
    'BOM'             => 0xFEFF, # BYTE ORDER MARK
    'BYTE ORDER MARK' => 0xFEFF,
    'CGJ'             => 0x034F, # COMBINING GRAPHEME JOINER
    'FVS1'            => 0x180B, # MONGOLIAN FREE VARIATION SELECTOR ONE
    'FVS2'            => 0x180C, # MONGOLIAN FREE VARIATION SELECTOR TWO
    'FVS3'            => 0x180D, # MONGOLIAN FREE VARIATION SELECTOR THREE
    'LRE'             => 0x202A, # LEFT-TO-RIGHT EMBEDDING
    'LRM'             => 0x200E, # LEFT-TO-RIGHT MARK
    'LRO'             => 0x202D, # LEFT-TO-RIGHT OVERRIDE
    'MMSP'            => 0x205F, # MEDIUM MATHEMATICAL SPACE
    'MVS'             => 0x180E, # MONGOLIAN VOWEL SEPARATOR
    'NBSP'            => 0x00A0, # NO-BREAK SPACE
    'NNBSP'           => 0x202F, # NARROW NO-BREAK SPACE
    'PDF'             => 0x202C, # POP DIRECTIONAL FORMATTING
    'RLE'             => 0x202B, # RIGHT-TO-LEFT EMBEDDING
    'RLM'             => 0x200F, # RIGHT-TO-LEFT MARK
    'RLO'             => 0x202E, # RIGHT-TO-LEFT OVERRIDE
    'SHY'             => 0x00AD, # SOFT HYPHEN

    # VS1 .. VS16 are VARIATION SELECTOR-1 .. -16 at U+FE00 .. U+FE0F;
    # VS17 .. VS256 are VARIATION SELECTOR-17 .. -256 at
    # U+E0100 .. U+E01EF.  Both runs are contiguous, so generate them.
    (map { ("VS$_" => 0xFE00  + $_ - 1)  }  1 ..  16),
    (map { ("VS$_" => 0xE0100 + $_ - 17) } 17 .. 256),

    'WJ'              => 0x2060, # WORD JOINER
    'ZWJ'             => 0x200D, # ZERO WIDTH JOINER
    'ZWNJ'            => 0x200C, # ZERO WIDTH NON-JOINER
    'ZWSP'            => 0x200B, # ZERO WIDTH SPACE
);
52ea3e69
JH
384
# Pre-3.2 compatibility names (only for the first 256 characters).
# charnames() still resolves these, but emits a 'deprecated' warning that
# recommends the current name shown in the trailing comment.
my %alias2 = (
    'HORIZONTAL TABULATION'                    => 0x09, # CHARACTER TABULATION
    'VERTICAL TABULATION'                      => 0x0B, # LINE TABULATION
    'FILE SEPARATOR'                           => 0x1C, # INFORMATION SEPARATOR FOUR
    'GROUP SEPARATOR'                          => 0x1D, # INFORMATION SEPARATOR THREE
    'RECORD SEPARATOR'                         => 0x1E, # INFORMATION SEPARATOR TWO
    'UNIT SEPARATOR'                           => 0x1F, # INFORMATION SEPARATOR ONE
    'HORIZONTAL TABULATION SET'                => 0x88, # CHARACTER TABULATION SET
    'HORIZONTAL TABULATION WITH JUSTIFICATION' => 0x89, # CHARACTER TABULATION WITH JUSTIFICATION
    'VERTICAL TABULATION SET'                  => 0x8A, # LINE TABULATION SET
    'PARTIAL LINE DOWN'                        => 0x8B, # PARTIAL LINE FORWARD
    'PARTIAL LINE UP'                          => 0x8C, # PARTIAL LINE BACKWARD
    'REVERSE INDEX'                            => 0x8D, # REVERSE LINE FEED
);
52ea3e69 401
# User defined aliases.  Even more convenient :)
# Starts out empty; alias() adds entries, and charnames() checks this table
# before the built-in ones so that user definitions win.
my %alias3;

# Result of evaluating unicore/Name.pl, loaded on first use and then reused
# by every lookup routine in this file.
my $txt;
406
8878f897
T
# Die from the caller's perspective.  Carp is loaded only when an error is
# actually raised, and goto &sub is used so that this wrapper does not show
# up as an extra frame when Carp works out where to report from.
sub croak
{
    require Carp;
    goto &Carp::croak;
} # croak
411
# Warn from the caller's perspective.  Carp is loaded only when a warning is
# actually issued, and goto &sub is used so that this wrapper does not show
# up as an extra frame when Carp works out where to report from.
sub carp
{
    require Carp;
    goto &Carp::carp;
} # carp
416
35c0985d
MB
# alias()                 - return the current user alias table as a flat
#                           key/value list.
# alias(\%pairs)          - merge the given hash into the user alias table.
# alias(name => target...) - same, with the pairs passed as a flat list.
#
# Existing entries with the same name are overwritten.
sub alias (@)
{
    return %alias3 unless @_;

    # Accept either a hash reference or a flat list of pairs.
    my $alias = ref $_[0] ? $_[0] : { @_ };
    @alias3{keys %$alias} = values %$alias;
} # alias
423
# alias_file($arg) - load user aliases from a Perl file.
#
# $arg is either the absolute path of an existing plain file, or a bare
# identifier NAME, in which case "unicore/NAME_alias.pl" is evaluated with
# do().  Anything else croaks.
#
# The file must evaluate to an even-sized list of alias pairs, which is
# handed to alias().  Returns 1 when aliases were loaded and 0 when the file
# produced an empty list.  Croaks when the file yields a single undefined
# value (which is also what do() gives when it cannot read or compile the
# file) or an odd number of elements.
sub alias_file ($)
{
    my ($arg, $file) = @_;
    if (-f $arg && File::Spec->file_name_is_absolute ($arg)) {
        $file = $arg;
    }
    elsif ($arg =~ m/^\w+$/) {
        $file = "unicore/${arg}_alias.pl";
    }
    else {
        croak("Charnames alias files can only have identifier characters");
    }
    if (my @alias = do $file) {
        @alias == 1 && !defined $alias[0] and
            croak("$file cannot be used as alias file for charnames");
        @alias % 2 and
            croak("$file did not return a (valid) list of alias pairs");
        alias (@alias);
        return (1);
    }
    0;
} # alias_file
446
423cee85 447# This is not optimized in any way yet
b177ca84
JF
# charnames($name) - translate a character name into the character itself.
# import() installs a reference to this routine in $^H{charnames}.
#
# Resolution order:
#   1. user aliases (%alias3): the name is replaced by its target and then
#      looked up in the name table like any other name;
#   2. built-in aliases (%alias1): give the code point directly;
#   3. deprecated names (%alias2): give the code point directly, after a
#      'deprecated' warning naming the replacement;
#   4. the unicore/Name.pl table: exact name if :full is in effect, then
#      "script:name" if :short is in effect, then each imported script.
#
# Returns the character, built with pack "U".  An unknown name carps and
# returns "\x{FFFD}".  When "use bytes" is in effect a code point up to 255
# is returned via chr, and anything larger croaks.
sub charnames
{
    my $name = shift;
    my $ord;
    my $fname;

    if (exists $alias3{$name}) { # User alias should be checked first, or else
                                 # can't override ours, and if we add any,
                                 # could conflict with theirs.
        $name = $alias3{$name};
    }
    elsif (exists $alias1{$name}) {
        $ord = $alias1{$name};
        $fname = $name;
    }
    elsif (exists $alias2{$name}) {
        require warnings;
        warnings::warnif('deprecated', "Unicode character name \"$name\" is deprecated, use \"" . viacode($alias2{$name}) . "\" instead");
        $ord = $alias2{$name};
        $fname = $name;
    }

    my @off;

    if (! defined $ord) {
        ## Suck in the code/name list as a big string.
        ## Lines look like:
        ##     "0052\t\tLATIN CAPITAL LETTER R\n"
        $txt = do "unicore/Name.pl" unless $txt;

        ## @off will hold the index into the code/name string of the start and
        ## end of the name as we find it.

        ## If :full, look for the name exactly
        if ($^H{charnames_full} and $txt =~ /\t\t\Q$name\E$/m) {
            @off = ($-[0], $+[0]);
        }

        ## If we didn't get above, and :short allowed, look for the short name.
        ## The short name is like "greek:Sigma"
        unless (@off) {
            if ($^H{charnames_short} and $name =~ /^(.+?):(.+)/s) {
                my ($script, $cname) = ($1, $2);
                my $case = $cname =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL";
                ## The script part is caller-supplied text, not a pattern:
                ## quote it so metacharacters cannot break or widen the match.
                my $script_re = quotemeta uc $script;
                if ($txt =~ m/\t\t$script_re (?:$case )?LETTER \U\Q$cname\E$/m) {
                    @off = ($-[0], $+[0]);
                }
            }
        }

        ## If we still don't have it, check for the name among the loaded
        ## scripts.
        if (not @off) {
            my $case = $name =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL";
            for my $script (@{$^H{charnames_scripts}}) {
                ## Script names are matched literally, as above.
                my $script_re = quotemeta $script;
                if ($txt =~ m/\t\t$script_re (?:$case )?LETTER \U\Q$name\E$/m) {
                    @off = ($-[0], $+[0]);
                    last;
                }
            }
        }

        ## If we don't have it by now, give up.
        unless (@off) {
            carp("Unknown charname '$name'");
            return "\x{FFFD}";
        }

        ##
        ## Now know where in the string the name starts.
        ## The code, in hex, is before that.
        ##
        ## The code can be 4-6 characters long, so we've got to sort of
        ## go look for it, just after the newline that comes before $off[0].
        ##
        ## This would be much easier if unicore/Name.pl had info in
        ## a name/code order, instead of code/name order.
        ##
        ## The +1 after the rindex() is to skip past the newline we're finding,
        ## or, if the rindex() fails, to put us to an offset of zero.
        ##
        my $hexstart = rindex($txt, "\n", $off[0]) + 1;

        ## we know where it starts, so turn into number -
        ## the ordinal for the char.
        $ord = CORE::hex substr($txt, $hexstart, $off[0] - $hexstart);
    }

    if ($^H & $bytes::hint_bits) {      # "use bytes" in effect?
        use bytes;
        return chr $ord if $ord <= 255;
        my $hex = sprintf "%04x", $ord;
        if (not defined $fname) {
            ## @off brackets "\t\tNAME"; skip the two tabs to get the name.
            $fname = substr $txt, $off[0] + 2, $off[1] - $off[0] - 2;
        }
        croak("Character 0x$hex with name '$fname' is above 0xFF");
    }

    no warnings 'utf8'; # allow even illegal characters
    return pack "U", $ord;
} # charnames
423cee85 549
b177ca84
JF
# import - handle "use charnames LIST".
#
# Installs charnames() as the \N{...} handler in %^H and records the
# requested lookup modes there:
#   :full            exact Unicode names ($^H{charnames_full})
#   :short           "script:name" forms ($^H{charnames_short})
#   :alias => ARG    ARG is a hash reference of aliases, or names an alias
#                    file to load with alias_file(); a ":name" ARG whose
#                    file loads successfully turns on :full when nothing
#                    else was requested
#   anything else    a script name, upper-cased into $^H{charnames_scripts}
#
# Carps when called with an empty list, croaks on a malformed :alias, warns
# about any other ":something" argument, and, when 'utf8' warnings are
# enabled, warns about script names that match no letter in the name table.
sub import
{
    shift; ## ignore class name

    if (not @_) {
        carp("`use charnames' needs explicit imports list");
    }
    $^H{charnames} = \&charnames ;

    ##
    ## fill %h keys with our @_ args.
    ##
    my ($promote, %h, @args) = (0);
    while (my $arg = shift) {
        if ($arg eq ":alias") {
            @_ or
                croak(":alias needs an argument in charnames");
            my $alias = shift;
            if (ref $alias) {
                ref $alias eq "HASH" or
                    croak("Only HASH reference supported as argument to :alias");
                alias ($alias);
                next;
            }
            if ($alias =~ m{:(\w+)$}) {
                $1 eq "full" || $1 eq "short" and
                    croak(":alias cannot use existing pragma :$1 (reversed order?)");
                alias_file ($1) and $promote = 1;
                next;
            }
            alias_file ($alias);
            next;
        }
        if (substr($arg, 0, 1) eq ':' and ! ($arg eq ":full" || $arg eq ":short")) {
            warn "unsupported special '$arg' in charnames";
            next;
        }
        push @args, $arg;
    }
    @args == 0 && $promote and @args = (":full");
    @h{@args} = (1) x @args;

    $^H{charnames_full} = delete $h{':full'};
    $^H{charnames_short} = delete $h{':short'};
    $^H{charnames_scripts} = [map uc, keys %h];

    ##
    ## If utf8? warnings are enabled, and some scripts were given,
    ## see if at least we can find one letter of each script.
    ##
    if (warnings::enabled('utf8') && @{$^H{charnames_scripts}}) {
        $txt = do "unicore/Name.pl" unless $txt;

        for my $script (@{$^H{charnames_scripts}}) {
            ## \Q: the script name is literal text, not a pattern.
            if (not $txt =~ m/\t\t\Q$script\E (?:CAPITAL |SMALL )?LETTER /) {
                warnings::warn('utf8', "No such script: '$script'");
            }
        }
    }
} # import
423cee85 610
4e2cda5d
JH
# Cache of names already found by viacode(), keyed by the zero-padded
# upper-case hex form of the code point.
my %viacode;

# viacode($code) - return the name recorded for a code point in
# unicore/Name.pl.
#
# $code may be a decimal number with no leading zero, or hex digits with an
# optional "U+", "u+", "0x" or "0X" prefix.  A string of digits only, not
# starting with 0, is therefore read as decimal.
#
# Returns nothing (after a carp) for a wrong argument count, an unparsable
# argument, or a value above U+10FFFF, and returns nothing silently when
# the table has no entry for the code point.
sub viacode
{
    if (@_ != 1) {
        carp("charnames::viacode() expects one argument");
        return;
    }

    my $arg = shift;

    # this is derived from Unicode::UCD, where it is nearly the same as the
    # function _getcode(), but it makes sure that even a hex argument has the
    # proper number of leading zeros, which is critical in matching against $txt
    # below
    my $hex;
    if ($arg =~ /^[1-9]\d*$/) {
        $hex = sprintf "%04X", $arg;
    } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) {
        # Below is the line that differs from the _getcode() source
        $hex = sprintf "%04X", hex $1;
    } else {
        carp("unexpected arg \"$arg\" to charnames::viacode()");
        return;
    }

    # checking the length first is slightly faster
    if (length($hex) > 5 && hex($hex) > 0x10FFFF) {
        carp("Unicode characters only allocated up to U+10FFFF (you asked for U+$hex)");
        return;
    }

    return $viacode{$hex} if exists $viacode{$hex};

    $txt = do "unicore/Name.pl" unless $txt;

    return unless $txt =~ m/^$hex\t\t(.+)/m;

    # Remember the name for next time and hand it back.
    return $viacode{$hex} = $1;
} # viacode
daf0d493 651
4e2cda5d
JH
# Cache of code points already found by vianame(), keyed by name.
my %vianame;

# vianame($name) - run-time lookup of a character by its full name.
#
# For a name found in unicore/Name.pl, returns the numeric code point.
# For an argument of the form "U+HHHH", returns the character itself (via
# chr) rather than a number.  Only the name table is searched; the alias
# tables used by \N{...} are not consulted here.
#
# Returns nothing when the name is not found; a wrong argument count carps
# and returns an empty list.
sub vianame
{
    if (@_ != 1) {
        carp("charnames::vianame() expects one name argument");
        return ();
    }

    my $arg = shift;

    return chr CORE::hex $1 if $arg =~ /^U\+([0-9a-fA-F]+)$/;

    return $vianame{$arg} if exists $vianame{$arg};

    $txt = do "unicore/Name.pl" unless $txt;

    my $pos = index $txt, "\t\t$arg\n";
    return if $pos < 0;

    # If $pos is at the 1st line, $posLF must be -1 (not found);
    # then $posLF + 1 equals to 0 (at the beginning of $txt).
    # Otherwise $posLF is the position of "\n";
    # then $posLF + 1 must be the position of the next to "\n"
    # (the beginning of the line).
    # substr($txt, $posLF + 1, 6) may be "0000\t\t", "00A1\t\t",
    # "10300\t", "100000", etc. So we can get the code via removing TAB.
    my $posLF = rindex $txt, "\n", $pos;
    (my $code = substr $txt, $posLF + 1, 6) =~ tr/\t//d;
    return $vianame{$arg} = CORE::hex $code;
} # vianame
b177ca84 686
423cee85
JH
687
6881;
689__END__
690
691=head1 NAME
692
274085e3 693charnames - define character names for C<\N{named}> string literal escapes
423cee85
JH
694
695=head1 SYNOPSIS
696
697 use charnames ':full';
4a2d328f 698 print "\N{GREEK SMALL LETTER SIGMA} is called sigma.\n";
423cee85
JH
699
700 use charnames ':short';
4a2d328f 701 print "\N{greek:Sigma} is an upper-case sigma.\n";
423cee85
JH
702
703 use charnames qw(cyrillic greek);
4a2d328f 704 print "\N{sigma} is Greek sigma, and \N{be} is Cyrillic b.\n";
423cee85 705
35c0985d
MB
706 use charnames ":full", ":alias" => {
707 e_ACUTE => "LATIN SMALL LETTER E WITH ACUTE",
76ae0c45 708 };
35c0985d
MB
709 print "\N{e_ACUTE} is a small letter e with an acute.\n";
710
76ae0c45 711 use charnames ();
a23c04e4 712 print charnames::viacode(0x1234); # prints "ETHIOPIC SYLLABLE SEE"
16036bcd
KW
713 printf "%04X", charnames::vianame("GOTHIC LETTER AHSA"); # prints
714 # "10330"
b177ca84 715
423cee85
JH
716=head1 DESCRIPTION
717
35c0985d
MB
718Pragma C<use charnames> supports arguments C<:full>, C<:short>, script
719names and customized aliases. If C<:full> is present, for expansion of
76ae0c45
RGS
720C<\N{CHARNAME}>, the string C<CHARNAME> is first looked up in the list of
721standard Unicode character names. If C<:short> is present, and
423cee85
JH
722C<CHARNAME> has the form C<SCRIPT:CNAME>, then C<CNAME> is looked up
723as a letter in script C<SCRIPT>. If pragma C<use charnames> is used
a191c821 724with script name arguments, then for C<\N{CHARNAME}> the name
423cee85 725C<CHARNAME> is looked up as a letter in the given scripts (in the
16036bcd
KW
726specified order). Customized aliases can override these, and are explained in
727L</CUSTOM ALIASES>.
423cee85
JH
728
729For lookup of C<CHARNAME> inside a given script C<SCRIPTNAME>
d5448623 730this pragma looks for the names
423cee85
JH
731
732 SCRIPTNAME CAPITAL LETTER CHARNAME
733 SCRIPTNAME SMALL LETTER CHARNAME
734 SCRIPTNAME LETTER CHARNAME
735
736in the table of standard Unicode names. If C<CHARNAME> is lowercase,
daf0d493
JH
737then the C<CAPITAL> variant is ignored, otherwise the C<SMALL> variant
738is ignored.
739
740Note that C<\N{...}> is compile-time, it's a special form of string
741constant used inside double-quoted strings: in other words, you cannot
4e2cda5d 742use variables inside the C<\N{...}>. If you want similar run-time
daf0d493 743functionality, use charnames::vianame().
423cee85 744
301a3cda 745For the C0 and C1 control characters (U+0000..U+001F, U+0080..U+009F)
dbc0d4f2 746as of Unicode 3.1, there are no official Unicode names but you can use
16036bcd
KW
747instead the ISO 6429 names (LINE FEED, ESCAPE, and so forth, and their
748abbreviations, LF, ESC, ...). In
dbc0d4f2 749Unicode 3.2 (as of Perl 5.8) some naming changes take place because ISO 6429
16036bcd 750has been updated; see L</ALIASES>.
dbc0d4f2
JH
751
752Since the Unicode standard uses "U+HHHH", so can you: "\N{U+263a}"
753is the Unicode smiley face, or "\N{WHITE SMILING FACE}".
301a3cda 754
5ffe0e96 755=head1 ALIASES
423cee85 756
5ffe0e96
MB
757A few aliases have been defined for convenience: instead of having
758to use the official names
423cee85 759
5ffe0e96
MB
760 LINE FEED (LF)
761 FORM FEED (FF)
762 CARRIAGE RETURN (CR)
763 NEXT LINE (NEL)
423cee85 764
5ffe0e96 765(yes, with parentheses) one can use
d5448623 766
5ffe0e96
MB
767 LINE FEED
768 FORM FEED
769 CARRIAGE RETURN
770 NEXT LINE
771 LF
772 FF
773 CR
774 NEL
775
16036bcd
KW
776All the other standard abbreviations for the controls, such as C<ACK> for
777C<ACKNOWLEDGE> also can be used.
778
5ffe0e96
MB
779One can also use
780
781 BYTE ORDER MARK
782 BOM
783
16036bcd
KW
784and these abbreviations
785
786 Abbreviation Full Name
787
788 CGJ COMBINING GRAPHEME JOINER
789 FVS1 MONGOLIAN FREE VARIATION SELECTOR ONE
790 FVS2 MONGOLIAN FREE VARIATION SELECTOR TWO
791 FVS3 MONGOLIAN FREE VARIATION SELECTOR THREE
792 LRE LEFT-TO-RIGHT EMBEDDING
793 LRM LEFT-TO-RIGHT MARK
794 LRO LEFT-TO-RIGHT OVERRIDE
795 MMSP MEDIUM MATHEMATICAL SPACE
796 MVS MONGOLIAN VOWEL SEPARATOR
797 NBSP NO-BREAK SPACE
798 NNBSP NARROW NO-BREAK SPACE
799 PDF POP DIRECTIONAL FORMATTING
800 RLE RIGHT-TO-LEFT EMBEDDING
801 RLM RIGHT-TO-LEFT MARK
802 RLO RIGHT-TO-LEFT OVERRIDE
803 SHY SOFT HYPHEN
804 VS1 VARIATION SELECTOR-1
805 .
806 .
807 .
808 VS256 VARIATION SELECTOR-256
809 WJ WORD JOINER
810 ZWJ ZERO WIDTH JOINER
811 ZWNJ ZERO WIDTH NON-JOINER
812 ZWSP ZERO WIDTH SPACE
5ffe0e96
MB
813
814For backward compatibility one can use the old names for
815certain C0 and C1 controls
816
817 old new
818
5ffe0e96
MB
819 FILE SEPARATOR INFORMATION SEPARATOR FOUR
820 GROUP SEPARATOR INFORMATION SEPARATOR THREE
16036bcd
KW
821 HORIZONTAL TABULATION CHARACTER TABULATION
822 HORIZONTAL TABULATION SET CHARACTER TABULATION SET
823 HORIZONTAL TABULATION WITH JUSTIFICATION CHARACTER TABULATION
824 WITH JUSTIFICATION
5ffe0e96
MB
825 PARTIAL LINE DOWN PARTIAL LINE FORWARD
826 PARTIAL LINE UP PARTIAL LINE BACKWARD
16036bcd
KW
827 RECORD SEPARATOR INFORMATION SEPARATOR TWO
828 REVERSE INDEX REVERSE LINE FEED
829 UNIT SEPARATOR INFORMATION SEPARATOR ONE
830 VERTICAL TABULATION LINE TABULATION
831 VERTICAL TABULATION SET LINE TABULATION SET
5ffe0e96
MB
832
833but the old names, in addition to giving the character,
834will also give a warning about being deprecated.
423cee85 835
16036bcd
KW
836And finally, certain published variants are usable, including some for
837controls that have no Unicode names:
838
839 END OF PROTECTED AREA
840 HIGH OCTET PRESET
841 HOP
842 IND
843 INDEX
844 PAD
845 PADDING CHARACTER
846 PRIVATE USE 1
847 PRIVATE USE 2
848 SGC
849 SINGLE GRAPHIC CHARACTER INTRODUCER
850 SINGLE-SHIFT 2
851 SINGLE-SHIFT 3
852 START OF PROTECTED AREA
853
35c0985d
MB
854=head1 CUSTOM ALIASES
855
856This version of charnames supports three mechanisms for adding local
55bc7d3c 857or customized aliases to standard Unicode naming conventions (:full).
16036bcd
KW
858The aliases override any standard definitions, so, if you're twisted enough,
859you can change C<"\N{LATIN CAPITAL LETTER A}"> to mean C<"B">, etc.
55bc7d3c
KW
860
861Note that an alias should not be something that is a legal curly
862brace-enclosed quantifier (see L<perlreref/QUANTIFIERS>). For example
863C<\N{123}> means to match 123 non-newline characters, and is not treated as an
864alias. Aliases are discouraged from beginning with anything other than an
865alphabetic character and from containing anything other than alphanumerics,
bee80e93
KW
866spaces, dashes, colons, parentheses, and underscores. Currently they must be
867ASCII.
35c0985d
MB
868
869=head2 Anonymous hashes
870
871 use charnames ":full", ":alias" => {
872 e_ACUTE => "LATIN SMALL LETTER E WITH ACUTE",
873 };
874 my $str = "\N{e_ACUTE}";
875
876=head2 Alias file
877
878 use charnames ":full", ":alias" => "pro";
879
880 will try to read "unicore/pro_alias.pl" from the @INC path. This
881 file should return a list in plain perl:
882
883 (
884 A_GRAVE => "LATIN CAPITAL LETTER A WITH GRAVE",
885 A_CIRCUM => "LATIN CAPITAL LETTER A WITH CIRCUMFLEX",
886 A_DIAERES => "LATIN CAPITAL LETTER A WITH DIAERESIS",
887 A_TILDE => "LATIN CAPITAL LETTER A WITH TILDE",
888 A_BREVE => "LATIN CAPITAL LETTER A WITH BREVE",
889 A_RING => "LATIN CAPITAL LETTER A WITH RING ABOVE",
890 A_MACRON => "LATIN CAPITAL LETTER A WITH MACRON",
891 );
892
893=head2 Alias shortcut
894
895 use charnames ":alias" => ":pro";
896
897 works exactly the same as the alias pairs, only this time,
898 ":full" is inserted automatically as first argument (if no
899 other argument is given).
900
b177ca84
JF
901=head1 charnames::viacode(code)
902
903Returns the full name of the character indicated by the numeric code.
904The example
905
906 print charnames::viacode(0x2722);
907
908prints "FOUR TEARDROP-SPOKED ASTERISK".
909
daf0d493
JH
910Returns undef if no name is known for the code.
911
35c0985d 912This works only for the standard names, and does not yet apply
daf0d493
JH
913to custom translators.
914
274085e3
PN
915Notice that the name returned for U+FEFF is "ZERO WIDTH NO-BREAK
916SPACE", not "BYTE ORDER MARK".
917
eb6a2339 918=head1 charnames::vianame(name)
daf0d493
JH
919
920Returns the code point indicated by the name.
921The example
922
923 printf "%04X", charnames::vianame("FOUR TEARDROP-SPOKED ASTERISK");
924
925prints "2722".
926
eb6a2339 927Returns undef if the name is unknown.
b177ca84 928
35c0985d 929This works only for the standard names, and does not yet apply
b177ca84
JF
930to custom translators.
931
5ffe0e96 932=head1 CUSTOM TRANSLATORS
52ea3e69 933
5ffe0e96
MB
934The mechanism of translation of C<\N{...}> escapes is general and not
935hardwired into F<charnames.pm>. A module can install custom
936translations (inside the scope which C<use>s the module) with the
937following magic incantation:
52ea3e69 938
5ffe0e96
MB
939 sub import {
940 shift;
941 $^H{charnames} = \&translator;
942 }
52ea3e69 943
5ffe0e96
MB
944Here translator() is a subroutine which takes C<CHARNAME> as an
945argument, and returns text to insert into the string instead of the
946C<\N{CHARNAME}> escape. Since the text to insert should be different
947in C<bytes> mode and out of it, the function should check the current
948state of the C<bytes> flag, as in:
52ea3e69 949
5ffe0e96
MB
950 use bytes (); # for $bytes::hint_bits
951 sub translator {
952 if ($^H & $bytes::hint_bits) {
953 return bytes_translator(@_);
954 }
955 else {
956 return utf8_translator(@_);
957 }
958 }
52ea3e69 959
55bc7d3c
KW
960See L</CUSTOM ALIASES> above for restrictions on C<CHARNAME>.
961
f0175764
JH
962=head1 ILLEGAL CHARACTERS
963
55bc7d3c
KW
964If you ask by name for a character that does not exist, a warning is given and
965the Unicode I<replacement character> "\x{FFFD}" is returned.
00d835f2 966
55bc7d3c 967If you ask by code for a character that is unassigned, no warning is
00d835f2 968given and C<undef> is returned. (Though if you ask for a code point
55bc7d3c 969past U+10FFFF you do get a warning.) See L</BUGS> below.
f0175764 970
423cee85
JH
971=head1 BUGS
972
55bc7d3c
KW
973viacode should return an empty string for unassigned in-range Unicode code
974points, as that is their correct current name.
975
55bc7d3c 976vianame returns a chr if the input name is of the form C<U+...>, and an ord
a0a3bc7f 977otherwise. It is proposed to change this to always return an ord. Send email
16036bcd 978to C<perl5-porters@perl.org> to comment on this proposal.
55bc7d3c
KW
979
980The functions fail on almost all of the Hangul syllable and CJK Unicode
981characters that have their code points as part of their names.
982
16036bcd
KW
983Names must be ASCII characters only, which means that you are out of luck if
984you want to create aliases in a language where some or all the characters of
985the desired aliases are non-ASCII.
bee80e93 986
fe749c9a
KW
987Unicode standard named sequences are not recognized, such as
988C<LATIN CAPITAL LETTER A WITH MACRON AND GRAVE>
989(which should mean C<LATIN CAPITAL LETTER A WITH MACRON> with an additional
990C<COMBINING GRAVE ACCENT>).
991
55bc7d3c 992Since evaluation of the translation function happens in the middle of
423cee85
JH
993compilation (of a string literal), the translation function should not
994do any C<eval>s or C<require>s. This restriction should be lifted in
995a future version of Perl.
996
997=cut