This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Clean up viacode, accept large aliases
[perl5.git] / lib / charnames.pm
CommitLineData
423cee85 1package charnames;
b177ca84
JF
2use strict;
3use warnings;
51cf30b6 4use File::Spec;
63098191 5our $VERSION = '1.11';
b75c8c73 6
d5448623 7use bytes (); # for $bytes::hint_bits
423cee85 8
# Aliases supplied by charnames itself, mapping alias name => code point.
# They are consulted after the user's own aliases and before the deprecated
# ones (see lookup_name()).  Keys must match the \N{...} text exactly, so
# they must not carry any leading or trailing white space.
my %system_aliases = (
    # Icky 3.2 names with parentheses.
    'LINE FEED'               => 0x0A, # LINE FEED (LF)
    'FORM FEED'               => 0x0C, # FORM FEED (FF)
    'CARRIAGE RETURN'         => 0x0D, # CARRIAGE RETURN (CR)
    'NEXT LINE'               => 0x85, # NEXT LINE (NEL)

    # Some variant names from Wikipedia
    'SINGLE-SHIFT 2'          => 0x8E,
    'SINGLE-SHIFT 3'          => 0x8F,
    'PRIVATE USE 1'           => 0x91,
    'PRIVATE USE 2'           => 0x92,
    'START OF PROTECTED AREA' => 0x96,
    'END OF PROTECTED AREA'   => 0x97,

    # Convenience.  Standard abbreviations for the controls
    'NUL' => 0x00, # NULL
    'SOH' => 0x01, # START OF HEADING
    'STX' => 0x02, # START OF TEXT
    'ETX' => 0x03, # END OF TEXT
    'EOT' => 0x04, # END OF TRANSMISSION
    'ENQ' => 0x05, # ENQUIRY
    'ACK' => 0x06, # ACKNOWLEDGE
    'BEL' => 0x07, # BELL
    'BS'  => 0x08, # BACKSPACE
    'HT'  => 0x09, # HORIZONTAL TABULATION
    'LF'  => 0x0A, # LINE FEED (LF)
    'VT'  => 0x0B, # VERTICAL TABULATION
    'FF'  => 0x0C, # FORM FEED (FF)
    'CR'  => 0x0D, # CARRIAGE RETURN (CR)
    'SO'  => 0x0E, # SHIFT OUT
    'SI'  => 0x0F, # SHIFT IN
    'DLE' => 0x10, # DATA LINK ESCAPE
    'DC1' => 0x11, # DEVICE CONTROL ONE
    'DC2' => 0x12, # DEVICE CONTROL TWO
    'DC3' => 0x13, # DEVICE CONTROL THREE
    'DC4' => 0x14, # DEVICE CONTROL FOUR
    'NAK' => 0x15, # NEGATIVE ACKNOWLEDGE
    'SYN' => 0x16, # SYNCHRONOUS IDLE
    'ETB' => 0x17, # END OF TRANSMISSION BLOCK
    'CAN' => 0x18, # CANCEL
    'EOM' => 0x19, # END OF MEDIUM
    'SUB' => 0x1A, # SUBSTITUTE
    'ESC' => 0x1B, # ESCAPE
    'FS'  => 0x1C, # FILE SEPARATOR
    'GS'  => 0x1D, # GROUP SEPARATOR
    'RS'  => 0x1E, # RECORD SEPARATOR
    'US'  => 0x1F, # UNIT SEPARATOR
    'DEL' => 0x7F, # DELETE
    'BPH' => 0x82, # BREAK PERMITTED HERE
    'NBH' => 0x83, # NO BREAK HERE
    'NEL' => 0x85, # NEXT LINE (NEL)
    'SSA' => 0x86, # START OF SELECTED AREA
    'ESA' => 0x87, # END OF SELECTED AREA
    'HTS' => 0x88, # CHARACTER TABULATION SET
    'HTJ' => 0x89, # CHARACTER TABULATION WITH JUSTIFICATION
    'VTS' => 0x8A, # LINE TABULATION SET
    'PLD' => 0x8B, # PARTIAL LINE FORWARD
    'PLU' => 0x8C, # PARTIAL LINE BACKWARD
    'RI'  => 0x8D, # REVERSE LINE FEED
    'SS2' => 0x8E, # SINGLE SHIFT TWO
    'SS3' => 0x8F, # SINGLE SHIFT THREE
    'DCS' => 0x90, # DEVICE CONTROL STRING
    'PU1' => 0x91, # PRIVATE USE ONE
    'PU2' => 0x92, # PRIVATE USE TWO
    'STS' => 0x93, # SET TRANSMIT STATE
    'CCH' => 0x94, # CANCEL CHARACTER
    'MW'  => 0x95, # MESSAGE WAITING
    'SPA' => 0x96, # START OF GUARDED AREA
    'EPA' => 0x97, # END OF GUARDED AREA
    'SOS' => 0x98, # START OF STRING
    'SCI' => 0x9A, # SINGLE CHARACTER INTRODUCER
    'CSI' => 0x9B, # CONTROL SEQUENCE INTRODUCER
    'ST'  => 0x9C, # STRING TERMINATOR
    'OSC' => 0x9D, # OPERATING SYSTEM COMMAND
    'PM'  => 0x9E, # PRIVACY MESSAGE
    'APC' => 0x9F, # APPLICATION PROGRAM COMMAND

    # There are no names for these in the Unicode standard;
    # perhaps should be deprecated, but then again there are
    # no alternative names, so am not deprecating.  And if
    # did, the code would have to change to not recommend an
    # alternative for these.
    'PADDING CHARACTER'                   => 0x80,
    'PAD'                                 => 0x80,
    'HIGH OCTET PRESET'                   => 0x81,
    'HOP'                                 => 0x81,
    'INDEX'                               => 0x84,
    'IND'                                 => 0x84,
    'SINGLE GRAPHIC CHARACTER INTRODUCER' => 0x99,
    'SGC'                                 => 0x99,

    # More convenience.  For further convenience,
    # it is suggested some way of using the NamesList
    # aliases be implemented, but there are ambiguities in
    # NamesList.txt
    'BOM'             => 0xFEFF, # BYTE ORDER MARK
    'BYTE ORDER MARK' => 0xFEFF,
    'CGJ'             => 0x034F, # COMBINING GRAPHEME JOINER
    'FVS1'            => 0x180B, # MONGOLIAN FREE VARIATION SELECTOR ONE
    'FVS2'            => 0x180C, # MONGOLIAN FREE VARIATION SELECTOR TWO
    'FVS3'            => 0x180D, # MONGOLIAN FREE VARIATION SELECTOR THREE
    'LRE'             => 0x202A, # LEFT-TO-RIGHT EMBEDDING
    'LRM'             => 0x200E, # LEFT-TO-RIGHT MARK
    'LRO'             => 0x202D, # LEFT-TO-RIGHT OVERRIDE
    'MMSP'            => 0x205F, # MEDIUM MATHEMATICAL SPACE
    'MVS'             => 0x180E, # MONGOLIAN VOWEL SEPARATOR
    'NBSP'            => 0x00A0, # NO-BREAK SPACE
    'NNBSP'           => 0x202F, # NARROW NO-BREAK SPACE
    'PDF'             => 0x202C, # POP DIRECTIONAL FORMATTING
    'RLE'             => 0x202B, # RIGHT-TO-LEFT EMBEDDING
    'RLM'             => 0x200F, # RIGHT-TO-LEFT MARK
    'RLO'             => 0x202E, # RIGHT-TO-LEFT OVERRIDE
    'SHY'             => 0x00AD, # SOFT HYPHEN
    'WJ'              => 0x2060, # WORD JOINER
    'ZWJ'             => 0x200D, # ZERO WIDTH JOINER
    'ZWNJ'            => 0x200C, # ZERO WIDTH NON-JOINER
    'ZWSP'            => 0x200B, # ZERO WIDTH SPACE

    # VS1 .. VS16 are VARIATION SELECTOR-1 .. VARIATION SELECTOR-16,
    # which occupy the contiguous range U+FE00 .. U+FE0F.
    ( map { ( "VS$_" => 0xFE00 + $_ - 1 ) } 1 .. 16 ),

    # VS17 .. VS256 are VARIATION SELECTOR-17 .. VARIATION SELECTOR-256,
    # which occupy the contiguous range U+E0100 .. U+E01EF.
    ( map { ( "VS$_" => 0xE0100 + $_ - 17 ) } 17 .. 256 ),
);
52ea3e69 384
# Pre-3.2 compatibility names (only for the first 256 characters), mapping
# old name => code point.  lookup_name() still resolves them but issues a
# 'deprecated' warning that recommends the current name, which is shown in
# the trailing comment of each entry.  Listed in code point order.
my %deprecated_aliases = (
    'HORIZONTAL TABULATION'                    => 0x09, # CHARACTER TABULATION
    'VERTICAL TABULATION'                      => 0x0B, # LINE TABULATION
    'FILE SEPARATOR'                           => 0x1C, # INFORMATION SEPARATOR FOUR
    'GROUP SEPARATOR'                          => 0x1D, # INFORMATION SEPARATOR THREE
    'RECORD SEPARATOR'                         => 0x1E, # INFORMATION SEPARATOR TWO
    'UNIT SEPARATOR'                           => 0x1F, # INFORMATION SEPARATOR ONE
    'HORIZONTAL TABULATION SET'                => 0x88, # CHARACTER TABULATION SET
    'HORIZONTAL TABULATION WITH JUSTIFICATION' => 0x89, # CHARACTER TABULATION WITH JUSTIFICATION
    'VERTICAL TABULATION SET'                  => 0x8A, # LINE TABULATION SET
    'PARTIAL LINE DOWN'                        => 0x8B, # PARTIAL LINE FORWARD
    'PARTIAL LINE UP'                          => 0x8C, # PARTIAL LINE BACKWARD
    'REVERSE INDEX'                            => 0x8D, # REVERSE LINE FEED
);
52ea3e69 401
# User defined aliases.  Even more convenient :)
# All three tables start out empty and are filled in by alias().

# Aliases whose value is another character name (alias => name).
my %user_name_aliases;

# Aliases whose value resolves directly to a code point (alias => ordinal).
my %user_numeric_aliases;

# Reverse map for the numeric aliases, keyed by the code point written as
# upper-case hex padded to at least four digits (e.g. "00E9" => alias).
my %inverse_user_aliases;

# Holds the contents of unicore/Name.pl once some sub has needed it.
my $txt;

# A decimal number with no leading zero (so plain "0" is not matched here).
my $decimal_qr = qr/^[1-9]\d*$/;

# A hex number with an optional "U+", "u+", "0x" or "0X" prefix.
# Returns the hex number in $1.
my $hex_qr = qr/^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/;
423cee85 418
8878f897
T
# Die with a message reported from the caller's point of view.
# Carp is only loaded the first time this is actually needed; the goto
# hands @_ straight to Carp::croak and removes this frame from the stack.
sub croak {
    require Carp;
    goto &Carp::croak;
}
423
# Warn with a message reported from the caller's point of view.
# Carp is only loaded the first time this is actually needed; the goto
# hands @_ straight to Carp::carp and removes this frame from the stack.
sub carp {
    require Carp;
    goto &Carp::carp;
}
428
35c0985d
MB
# Registers user-defined aliases.
#
# Takes either a single hash reference or a flat list of name => value
# pairs.  A value that looks like a decimal or hex number (see $decimal_qr
# and $hex_qr) makes the alias resolve directly to that code point; any
# other value is taken to be a character name to look up instead.
#
# An alias with an undefined value is ignored.  Defining an alias again
# replaces the earlier definition completely, whichever kind it was.
sub alias (@)
{
    my $alias = ref $_[0] ? $_[0] : { @_ };

    foreach my $name (keys %$alias) {
        my $value = $alias->{$name};

        # An undefined value cannot name anything; skip it instead of
        # warning on the matches below and storing a useless entry.
        next unless defined $value;

        # Work out the code point, if the value is numeric.
        my $ord;
        if ($value =~ $decimal_qr) {
            $ord = $value;
        }
        elsif ($value =~ $hex_qr) {
            $ord = CORE::hex $1;
        }

        # Forget any previous definition of this alias.  lookup_name()
        # consults the numeric table first, so without this a stale
        # numeric entry would shadow a newer name alias forever.
        if (exists $user_numeric_aliases{$name}) {
            my $old_hex = sprintf("%04X", $user_numeric_aliases{$name});
            delete $inverse_user_aliases{$old_hex}
                if defined $inverse_user_aliases{$old_hex}
                && $inverse_user_aliases{$old_hex} eq $name;
            delete $user_numeric_aliases{$name};
        }
        delete $user_name_aliases{$name};

        if (defined $ord) {
            $user_numeric_aliases{$name} = $ord;

            # Use a canonical form, whatever notation the caller used.
            $inverse_user_aliases{sprintf("%04X", $ord)} = $name;
        }
        else {
            $user_name_aliases{$name} = $value;
        }
    }
} # alias
452
# Loads aliases from a file and registers them with alias().
#
# The argument is either the absolute path of an existing plain file, or a
# bare word made of identifier characters, in which case the file
# "unicore/<word>_alias.pl" is used.  Anything else is fatal.
#
# The file is evaluated with do() and must return a list of name => value
# pairs.  Returns 1 if aliases were loaded, 0 if the file returned an empty
# list.  Croaks if the file returned a single undefined value or an odd
# number of elements.
sub alias_file ($)
{
    my ($arg) = @_;

    my $file;
    if (-f $arg && File::Spec->file_name_is_absolute($arg)) {
        $file = $arg;
    }
    elsif ($arg =~ m/^\w+$/) {
        $file = "unicore/${arg}_alias.pl";
    }
    else {
        croak("Charnames alias files can only have identifier characters");
    }

    my @pairs = do $file;
    return 0 unless @pairs;

    # A lone undef is what do() hands back when it could not use the file.
    croak("$file cannot be used as alias file for charnames")
        if @pairs == 1 && !defined $pairs[0];

    croak("$file did not return a (valid) list of alias pairs")
        if @pairs % 2;

    alias(@pairs);
    return 1;
} # alias_file
475
63098191
KW
476
# Finds the ordinal (code point) of a character name.
#
# Arguments:
#   $name     the name as written, e.g. inside \N{...}
#   $runtime  true when called at run time (vianame), false at compile time
#
# The aliases are tried first: user numeric, user name, system, then
# deprecated (which warns).  Otherwise the big table in unicore/Name.pl is
# searched.
#
# Returns the ordinal.  If the name is not found, returns nothing at run
# time; at compile time it complains and returns the ordinal of the Unicode
# REPLACEMENT CHARACTER, 0xFFFD.  Croaks at compile time if "use bytes" is in
# effect and the character found is above 0xFF.
#
# This is not optimized in any way yet.
sub lookup_name {
    my $name    = shift;
    my $runtime = shift;    # compile vs run time

    my $ord;

    # User alias should be checked first or else can't override ours, and if
    # we add any, could conflict with theirs.
    if (exists $user_numeric_aliases{$name}) {
        $ord = $user_numeric_aliases{$name};
    }
    elsif (exists $user_name_aliases{$name}) {

        # Only the name changes; it still has to be found in the table.
        $name = $user_name_aliases{$name};
    }
    elsif (exists $system_aliases{$name}) {
        $ord = $system_aliases{$name};
    }
    elsif (exists $deprecated_aliases{$name}) {
        require warnings;
        warnings::warnif('deprecated', "Unicode character name \"$name\" is deprecated, use \"" . viacode($deprecated_aliases{$name}) . "\" instead");
        $ord = $deprecated_aliases{$name};
    }

    my @off;

    if (! defined $ord) {

        ## Suck in the code/name list as a big string.
        ## Lines look like:
        ##     "0052\t\tLATIN CAPITAL LETTER R\n"
        $txt = do "unicore/Name.pl" unless $txt;

        ## @off will hold the index into the code/name string of the start
        ## and end of the name as we find it.

        ## If :full, look for the name exactly; runtime implies full
        if (($runtime || $^H{charnames_full}) && $txt =~ /\t\t\Q$name\E$/m) {
            @off = ($-[0] + 2, $+[0]);    # The 2 is for the 2 tabs
        }

        ## If we didn't get above, and :short allowed, look for the short
        ## name.  The short name is like "greek:Sigma"
        unless (@off) {
            if (($runtime || $^H{charnames_short}) && $name =~ /^(.+?):(.+)/s) {
                my ($script, $cname) = ($1, $2);
                my $case = $cname =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL";
                if ($txt =~ m/\t\t\U$script\E (?:$case )?LETTER \U\Q$cname\E$/m) {
                    @off = ($-[0] + 2, $+[0]);
                }
            }
        }

        ## If we still don't have it, check for the name among the loaded
        ## scripts.
        if (! $runtime && not @off) {
            my $case = $name =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL";
            for my $script (@{$^H{charnames_scripts}}) {
                if ($txt =~ m/\t\t$script (?:$case )?LETTER \U\Q$name\E$/m) {
                    @off = ($-[0] + 2, $+[0]);
                    last;
                }
            }
        }

        ## If we don't have it by now, give up.
        unless (@off) {
            return if $runtime;
            carp("Unknown charname '$name'");

            # Callers turn the result into a character themselves (chr or
            # pack "U"), so this has to be the ordinal of the replacement
            # character, not the character itself.
            return 0xFFFD;
        }

        # Get the official name in case need to output a message
        $name = substr($txt, $off[0], $off[1] - $off[0]);

        ##
        ## Now know where in the string the name starts.
        ## The code, in hex, is before that.
        ##
        ## The code can be 4-6 characters long, so we've got to sort of
        ## go look for it, just after the newline that comes before $off[0].
        ##
        ## This would be much easier if unicore/Name.pl had info in
        ## a name/code order, instead of code/name order.
        ##
        ## The +1 after the rindex() is to skip past the newline we're
        ## finding, or, if the rindex() fails, to put us to an offset of
        ## zero.
        ##
        my $hexstart = rindex($txt, "\n", $off[0]) + 1;

        ## we know where it starts, so turn into number -
        ## the ordinal for the char.
        $ord = CORE::hex substr($txt, $hexstart, $off[0] - 2 - $hexstart);
    }

    return $ord if $runtime || $ord <= 255 || ! ($^H & $bytes::hint_bits);

    # Here is compile time, "use bytes" is in effect, and the character
    # won't fit in a byte

    croak(sprintf("Character 0x%04x with name '$name' is above 0xFF", $ord));
} # lookup_name
582
# Handler for \N{...}, installed as $^H{charnames} by import().
#
# Looks the name up as a compile-time request and returns the character
# for the ordinal found, or nothing if lookup_name() returned undef.
# Under "use bytes" the character comes from chr(); otherwise it is built
# with pack "U", which forces it into utf8.
sub charnames {
    my ($name) = @_;

    my $ord = lookup_name($name, 0);    # 0 means compile-time
    return unless defined $ord;

    if ($^H & $bytes::hint_bits) {
        return chr $ord;
    }

    no warnings 'utf8';    # allow even illegal characters
    return pack "U", $ord;
}
423cee85 596
b177ca84
JF
# Implements "use charnames LIST".
#
# Installs the \N{...} handler and records the options in %^H:
#   charnames_full     true if ":full" was given
#   charnames_short    true if ":short" was given
#   charnames_scripts  array ref of upper-cased script names, kept in the
#                      order in which they were first given
#
# ":alias" consumes the following argument, which is either a hash
# reference of aliases, a ":name" tag naming an alias file, or an alias
# file specification passed to alias_file() as is.  If a ":name" alias file
# loaded successfully and no other option was given, ":full" is assumed.
#
# Warns if the list is empty, on any other ":special", and (when 'utf8'
# warnings are enabled) for each script that has no letter in the name table.
sub import
{
    shift;    ## ignore class name

    carp("`use charnames' needs explicit imports list") unless @_;

    $^H{charnames} = \&charnames;

    ##
    ## fill %h keys with our @_ args.
    ##
    my $promote = 0;
    my (%h, @args);

    # Note that a false argument ("" or 0) ends the list.
    while (my $arg = shift) {
        if ($arg eq ":alias") {
            @_ or croak(":alias needs an argument in charnames");
            my $alias = shift;

            if (ref $alias) {
                ref($alias) eq "HASH"
                    or croak("Only HASH reference supported as argument to :alias");
                alias($alias);
            }
            elsif ($alias =~ m{:(\w+)$}) {
                my $tag = $1;
                ($tag eq "full" || $tag eq "short")
                    and croak(":alias cannot use existing pragma :$tag (reversed order?)");
                alias_file($tag) and $promote = 1;
            }
            else {
                alias_file($alias);
            }
            next;
        }

        if (substr($arg, 0, 1) eq ':' and ! ($arg eq ":full" || $arg eq ":short")) {
            warn "unsupported special '$arg' in charnames";
            next;
        }

        push @args, $arg;
    }

    @args == 0 && $promote and @args = (":full");
    @h{@args} = (1) x @args;

    $^H{charnames_full}  = delete $h{':full'};
    $^H{charnames_short} = delete $h{':short'};

    # What is left in %h are the script names.  Take them from @args rather
    # than from keys(), whose order is unspecified, so that scripts are
    # searched in the order the caller listed them.
    my %seen;
    $^H{charnames_scripts} =
        [ map { uc $_ } grep { exists $h{$_} && !$seen{$_}++ } @args ];

    ##
    ## If utf8? warnings are enabled, and some scripts were given,
    ## see if at least we can find one letter of each script.
    ##
    if (warnings::enabled('utf8') && @{$^H{charnames_scripts}}) {
        $txt = do "unicore/Name.pl" unless $txt;

        for my $script (@{$^H{charnames_scripts}}) {
            if (not $txt =~ m/\t\t$script (?:CAPITAL |SMALL )?LETTER /) {
                warnings::warn('utf8', "No such script: '$script'");
            }
        }
    }
} # import
423cee85 657
63098191
KW
# Cache of already-found codes, keyed by the canonical hex form.
my %viacode;

# Returns the name of the code point argument.
#
# The argument is a decimal number, or a hex number optionally prefixed
# with "U+" or "0x".  The official name from unicore/Name.pl is preferred;
# failing that, a user-defined numeric alias for the code point is
# returned.  Returns nothing (after warning where appropriate) if the
# argument count or form is wrong, or if no name is known.
sub viacode {

    unless (@_ == 1) {
        carp("charnames::viacode() expects one argument");
        return;
    }

    my ($arg) = @_;

    # This is derived from Unicode::UCD, where it is nearly the same as the
    # function _getcode(), but here it makes sure that even a hex argument
    # has the proper number of leading zeros, which is critical in
    # matching against $txt below
    my $code;
    if ($arg =~ $decimal_qr) {
        $code = $arg;
    }
    elsif ($arg =~ $hex_qr) {
        $code = hex $1;
    }
    else {
        carp("unexpected arg \"$arg\" to charnames::viacode()");
        return;
    }
    my $hex = sprintf "%04X", $code;

    return $viacode{$hex} if exists $viacode{$hex};

    # If the code point is above the max in the table, there's no point
    # looking through it.  Checking the length first is slightly faster
    if (length($hex) <= 5 || CORE::hex($hex) <= 0x10FFFF) {
        $txt = do "unicore/Name.pl" unless $txt;

        # Return the official name, if exists.  It's unclear to me (khw) at
        # this juncture if it is better to return a user-defined override,
        # so leaving it as is for now.
        if ($txt =~ m/^$hex\t\t(.+)/m) {
            my $official = $1;
            $viacode{$hex} = $official;
            return $official;
        }
    }

    # See if there is a user name for it, before giving up completely.
    if (exists $inverse_user_aliases{$hex}) {
        my $user_name = $inverse_user_aliases{$hex};
        $viacode{$hex} = $user_name;
        return $user_name;
    }

    if (CORE::hex($hex) > 0x10FFFF) {
        carp("Unicode characters only allocated up to U+10FFFF (you asked for U+$hex)");
    }
    return;
} # viacode
daf0d493 713
# Cache of already-found names (name => result of the run-time lookup).
my %vianame;

# Run-time counterpart of \N{...}: looks up the character name and returns
# its ordinal if found, undef otherwise.  Results, including failures, are
# remembered in %vianame.
#
# An argument of the form "U+HEX" is special: the *character* with that
# code point is returned rather than its ordinal.
sub vianame
{
    unless (@_ == 1) {
        carp("charnames::vianame() expects one name argument");
        return ();
    }

    my ($arg) = @_;

    # khw claims that this is bad.  The function should return either an
    # ord or a chr for all inputs; not be bipolar.  Also, under 'use
    # bytes', can create a chr above 255.
    return chr CORE::hex $1 if $arg =~ /^U\+([0-9a-fA-F]+)$/;

    $vianame{$arg} = lookup_name($arg, 1)    # 1 means run-time
        unless exists $vianame{$arg};

    return $vianame{$arg};
} # vianame
b177ca84 742
423cee85
JH
743
7441;
745__END__
746
747=head1 NAME
748
274085e3 749charnames - define character names for C<\N{named}> string literal escapes
423cee85
JH
750
751=head1 SYNOPSIS
752
753 use charnames ':full';
4a2d328f 754 print "\N{GREEK SMALL LETTER SIGMA} is called sigma.\n";
423cee85
JH
755
756 use charnames ':short';
4a2d328f 757 print "\N{greek:Sigma} is an upper-case sigma.\n";
423cee85
JH
758
759 use charnames qw(cyrillic greek);
4a2d328f 760 print "\N{sigma} is Greek sigma, and \N{be} is Cyrillic b.\n";
423cee85 761
35c0985d
MB
762 use charnames ":full", ":alias" => {
763 e_ACUTE => "LATIN SMALL LETTER E WITH ACUTE",
e5432b89 764 mychar => 0xE8000, # Private use area
76ae0c45 765 };
35c0985d 766 print "\N{e_ACUTE} is a small letter e with an acute.\n";
e5432b89 767 print "\\N{mychar} allows me to name and use private use characters.\n";
35c0985d 768
76ae0c45 769 use charnames ();
a23c04e4 770 print charnames::viacode(0x1234); # prints "ETHIOPIC SYLLABLE SEE"
16036bcd
KW
771 printf "%04X", charnames::vianame("GOTHIC LETTER AHSA"); # prints
772 # "10330"
b177ca84 773
423cee85
JH
774=head1 DESCRIPTION
775
e5432b89
KW
776Pragma C<use charnames> enables the use of C<\N{CHARNAME}> sequences to
777insert a Unicode character into a string based on its name. (However,
778you don't need this pragma to use C<\N{U+...}> where the C<...> is a
779hexadecimal ordinal number.)
780
781The pragma supports arguments C<:full>, C<:short>, script names and
782customized aliases. If C<:full> is present, for expansion of
76ae0c45
RGS
783C<\N{CHARNAME}>, the string C<CHARNAME> is first looked up in the list of
784standard Unicode character names. If C<:short> is present, and
423cee85
JH
785C<CHARNAME> has the form C<SCRIPT:CNAME>, then C<CNAME> is looked up
786as a letter in script C<SCRIPT>. If pragma C<use charnames> is used
a191c821 787with script name arguments, then for C<\N{CHARNAME}> the name
423cee85 788C<CHARNAME> is looked up as a letter in the given scripts (in the
16036bcd
KW
789specified order). Customized aliases can override these, and are explained in
790L</CUSTOM ALIASES>.
423cee85
JH
791
792For lookup of C<CHARNAME> inside a given script C<SCRIPTNAME>
d5448623 793this pragma looks for the names
423cee85
JH
794
795 SCRIPTNAME CAPITAL LETTER CHARNAME
796 SCRIPTNAME SMALL LETTER CHARNAME
797 SCRIPTNAME LETTER CHARNAME
798
799in the table of standard Unicode names. If C<CHARNAME> is lowercase,
daf0d493
JH
800then the C<CAPITAL> variant is ignored, otherwise the C<SMALL> variant
801is ignored.
802
803Note that C<\N{...}> is compile-time; it's a special form of string
804constant used inside double-quoted strings: in other words, you cannot
4e2cda5d 805use variables inside the C<\N{...}>. If you want similar run-time
daf0d493 806functionality, use charnames::vianame().
423cee85 807
301a3cda 808For the C0 and C1 control characters (U+0000..U+001F, U+0080..U+009F)
dbc0d4f2 809as of Unicode 3.1, there are no official Unicode names but you can use
16036bcd
KW
810instead the ISO 6429 names (LINE FEED, ESCAPE, and so forth, and their
811abbreviations, LF, ESC, ...). In
dbc0d4f2 812Unicode 3.2 (as of Perl 5.8) some naming changes take place: ISO 6429
16036bcd 813has been updated; see L</ALIASES>.
dbc0d4f2
JH
814
815Since the Unicode standard uses "U+HHHH", so can you: "\N{U+263a}"
816is the Unicode smiley face, or "\N{WHITE SMILING FACE}".
301a3cda 817
e5432b89
KW
818If the input name is unknown, C<\N{NAME}> raises a warning and
819substitutes the Unicode REPLACEMENT CHARACTER (U+FFFD).
820
821It is a fatal error if C<use bytes> is in effect and the input name is
822that of a character that won't fit into a byte (i.e., whose ordinal is
823above 255).
824
5ffe0e96 825=head1 ALIASES
423cee85 826
5ffe0e96
MB
827A few aliases have been defined for convenience: instead of having
828to use the official names
423cee85 829
5ffe0e96
MB
830 LINE FEED (LF)
831 FORM FEED (FF)
832 CARRIAGE RETURN (CR)
833 NEXT LINE (NEL)
423cee85 834
e5432b89 835(yes, with parentheses), one can use
d5448623 836
5ffe0e96
MB
837 LINE FEED
838 FORM FEED
839 CARRIAGE RETURN
840 NEXT LINE
841 LF
842 FF
843 CR
844 NEL
845
16036bcd
KW
846All the other standard abbreviations for the controls, such as C<ACK> for
847C<ACKNOWLEDGE>, can also be used.
848
5ffe0e96
MB
849One can also use
850
851 BYTE ORDER MARK
852 BOM
853
16036bcd
KW
854and these abbreviations
855
856 Abbreviation Full Name
857
858 CGJ COMBINING GRAPHEME JOINER
859 FVS1 MONGOLIAN FREE VARIATION SELECTOR ONE
860 FVS2 MONGOLIAN FREE VARIATION SELECTOR TWO
861 FVS3 MONGOLIAN FREE VARIATION SELECTOR THREE
862 LRE LEFT-TO-RIGHT EMBEDDING
863 LRM LEFT-TO-RIGHT MARK
864 LRO LEFT-TO-RIGHT OVERRIDE
865 MMSP MEDIUM MATHEMATICAL SPACE
866 MVS MONGOLIAN VOWEL SEPARATOR
867 NBSP NO-BREAK SPACE
868 NNBSP NARROW NO-BREAK SPACE
869 PDF POP DIRECTIONAL FORMATTING
870 RLE RIGHT-TO-LEFT EMBEDDING
871 RLM RIGHT-TO-LEFT MARK
872 RLO RIGHT-TO-LEFT OVERRIDE
873 SHY SOFT HYPHEN
874 VS1 VARIATION SELECTOR-1
875 .
876 .
877 .
878 VS256 VARIATION SELECTOR-256
879 WJ WORD JOINER
880 ZWJ ZERO WIDTH JOINER
881 ZWNJ ZERO WIDTH NON-JOINER
882 ZWSP ZERO WIDTH SPACE
5ffe0e96
MB
883
884For backward compatibility one can use the old names for
885certain C0 and C1 controls
886
887 old new
888
5ffe0e96
MB
889 FILE SEPARATOR INFORMATION SEPARATOR FOUR
890 GROUP SEPARATOR INFORMATION SEPARATOR THREE
16036bcd
KW
891 HORIZONTAL TABULATION CHARACTER TABULATION
892 HORIZONTAL TABULATION SET CHARACTER TABULATION SET
893 HORIZONTAL TABULATION WITH JUSTIFICATION CHARACTER TABULATION
894 WITH JUSTIFICATION
5ffe0e96
MB
895 PARTIAL LINE DOWN PARTIAL LINE FORWARD
896 PARTIAL LINE UP PARTIAL LINE BACKWARD
16036bcd
KW
897 RECORD SEPARATOR INFORMATION SEPARATOR TWO
898 REVERSE INDEX REVERSE LINE FEED
899 UNIT SEPARATOR INFORMATION SEPARATOR ONE
900 VERTICAL TABULATION LINE TABULATION
901 VERTICAL TABULATION SET LINE TABULATION SET
5ffe0e96
MB
902
903but the old names in addition to giving the character
904will also give a warning about being deprecated.
423cee85 905
16036bcd
KW
906And finally, certain published variants are usable, including some for
907controls that have no Unicode names:
908
909 END OF PROTECTED AREA
910 HIGH OCTET PRESET
911 HOP
912 IND
913 INDEX
914 PAD
915 PADDING CHARACTER
916 PRIVATE USE 1
917 PRIVATE USE 2
918 SGC
919 SINGLE GRAPHIC CHARACTER INTRODUCER
920 SINGLE-SHIFT 2
921 SINGLE-SHIFT 3
922 START OF PROTECTED AREA
923
35c0985d
MB
924=head1 CUSTOM ALIASES
925
926This version of charnames supports three mechanisms of adding local
55bc7d3c 927or customized aliases to standard Unicode naming conventions (:full).
e5432b89
KW
928The aliases override any standard definitions, so, if you're twisted
929enough, you can change C<"\N{LATIN CAPITAL LETTER A}"> to mean C<"B">,
930etc.
55bc7d3c
KW
931
932Note that an alias should not be something that is a legal curly
933brace-enclosed quantifier (see L<perlreref/QUANTIFIERS>). For example
e5432b89
KW
934C<\N{123}> means to match 123 non-newline characters, and is not treated as a
935charnames alias. Aliases are discouraged from beginning with anything
936other than an alphabetic character and from containing anything other
937than alphanumerics, spaces, dashes, parentheses, and underscores.
938Currently they must be ASCII.
939
940An alias can map to either an official Unicode character name or to a
941numeric code point (ordinal). The latter is useful for assigning names
942to code points in Unicode private use areas such as U+E000 through
943U+F8FF. The number must look like an unsigned decimal integer, or a
944hexadecimal constant beginning with C<0x>, or C<U+>.
232cbbee 945
35c0985d
MB
946=head2 Anonymous hashes
947
948 use charnames ":full", ":alias" => {
949 e_ACUTE => "LATIN SMALL LETTER E WITH ACUTE",
232cbbee 950 mychar1 => 0xE8000,
35c0985d
MB
951 };
952 my $str = "\N{e_ACUTE}";
953
954=head2 Alias file
955
956 use charnames ":full", ":alias" => "pro";
957
958 will try to read "unicore/pro_alias.pl" from the @INC path. This
959 file should return a list in plain perl:
960
961 (
962 A_GRAVE => "LATIN CAPITAL LETTER A WITH GRAVE",
963 A_CIRCUM => "LATIN CAPITAL LETTER A WITH CIRCUMFLEX",
964 A_DIAERES => "LATIN CAPITAL LETTER A WITH DIAERESIS",
965 A_TILDE => "LATIN CAPITAL LETTER A WITH TILDE",
966 A_BREVE => "LATIN CAPITAL LETTER A WITH BREVE",
967 A_RING => "LATIN CAPITAL LETTER A WITH RING ABOVE",
968 A_MACRON => "LATIN CAPITAL LETTER A WITH MACRON",
232cbbee 969 mychar2 => "U+E8001",
35c0985d
MB
970 );
971
972=head2 Alias shortcut
973
974 use charnames ":alias" => ":pro";
975
232cbbee
KW
976works exactly the same as the alias pairs, only this time,
977":full" is inserted automatically as the first argument (if no
978other argument is given).
35c0985d 979
b177ca84
JF
980=head1 charnames::viacode(code)
981
982Returns the full name of the character indicated by the numeric code.
983The example
984
985 print charnames::viacode(0x2722);
986
987prints "FOUR TEARDROP-SPOKED ASTERISK".
988
daf0d493
JH
989Returns undef if no name is known for the code.
990
232cbbee
KW
991The name returned is the official name for the code point, if
992available, otherwise your custom alias for it. This means that your
993alias will only be returned for code points that don't have an official
994Unicode name (nor Unicode version 1 name), such as private use code
995points, and the 4 control characters U+0080, U+0081, U+0084, and U+0099.
daf0d493 996
274085e3
PN
997Notice that the name returned for U+FEFF is "ZERO WIDTH NO-BREAK
998SPACE", not "BYTE ORDER MARK".
999
eb6a2339 1000=head1 charnames::vianame(name)
daf0d493
JH
1001
1002Returns the code point indicated by the name.
1003The example
1004
1005 printf "%04X", charnames::vianame("FOUR TEARDROP-SPOKED ASTERISK");
1006
1007prints "2722".
1008
eb6a2339 1009Returns undef if the name is unknown.
b177ca84 1010
35c0985d 1011This works only for the standard names, and does not yet apply
b177ca84
JF
1012to custom translators.
1013
5ffe0e96 1014=head1 CUSTOM TRANSLATORS
52ea3e69 1015
5ffe0e96
MB
1016The mechanism of translation of C<\N{...}> escapes is general and not
1017hardwired into F<charnames.pm>. A module can install custom
1018translations (inside the scope which C<use>s the module) with the
1019following magic incantation:
52ea3e69 1020
5ffe0e96
MB
1021 sub import {
1022 shift;
1023 $^H{charnames} = \&translator;
1024 }
52ea3e69 1025
5ffe0e96
MB
1026Here translator() is a subroutine which takes C<CHARNAME> as an
1027argument, and returns text to insert into the string instead of the
1028C<\N{CHARNAME}> escape. Since the text to insert should be different
1029in C<bytes> mode and out of it, the function should check the current
1030state of C<bytes>-flag as in:
52ea3e69 1031
5ffe0e96
MB
1032 use bytes (); # for $bytes::hint_bits
1033 sub translator {
1034 if ($^H & $bytes::hint_bits) {
1035 return bytes_translator(@_);
1036 }
1037 else {
1038 return utf8_translator(@_);
1039 }
1040 }
52ea3e69 1041
55bc7d3c
KW
1042See L</CUSTOM ALIASES> above for restrictions on C<CHARNAME>.
1043
f0175764
JH
1044=head1 ILLEGAL CHARACTERS
1045
55bc7d3c
KW
1046If you ask by name for a character that does not exist, a warning is given and
1047the Unicode I<replacement character> "\x{FFFD}" is returned.
00d835f2 1048
0320cda0
KW
1049If you ask by code (C<charnames::viacode()>) for a character that is
1050unassigned, no warning is given and C<undef> is returned. In Unicode
1051the proper name of these is the empty string, which C<undef> stringifies
1052to. (If you ask for a code point past the legal Unicode maximum of
1053U+10FFFF you do get C<undef> and a warning.)
f0175764 1054
423cee85
JH
1055=head1 BUGS
1056
55bc7d3c 1057vianame returns a chr if the input name is of the form C<U+...>, and an ord
a0a3bc7f 1058otherwise. It is proposed to change this to always return an ord. Send email
16036bcd 1059to C<perl5-porters@perl.org> to comment on this proposal.
55bc7d3c
KW
1060
1061These functions do not work on almost all of the Hangul syllable and CJK Unicode
1062characters that have their code points as part of their names.
1063
16036bcd
KW
1064Names must be ASCII characters only, which means that you are out of luck if
1065you want to create aliases in a language where some or all the characters of
1066the desired aliases are non-ASCII.
bee80e93 1067
fe749c9a
KW
1068Unicode standard named sequences are not recognized, such as
1069C<LATIN CAPITAL LETTER A WITH MACRON AND GRAVE>
1070(which should mean C<LATIN CAPITAL LETTER A WITH MACRON> with an additional
1071C<COMBINING GRAVE ACCENT>).
1072
55bc7d3c 1073Since evaluation of the translation function happens in the middle of
423cee85
JH
1074compilation (of a string literal), the translation function should not
1075do any C<eval>s or C<require>s. This restriction should be lifted in
1076a future version of Perl.
1077
1078=cut