This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
charnames: check for use bytes in vianame; efficiency
[perl5.git] / lib / charnames.pm
CommitLineData
423cee85 1package charnames;
b177ca84
JF
2use strict;
3use warnings;
51cf30b6 4use File::Spec;
63098191 5our $VERSION = '1.11';
b75c8c73 6
d5448623 7use bytes (); # for $bytes::hint_bits
423cee85 8
# Aliases built into this module, mapping each extra name to its code point.
# lookup_name() consults this table after the user-defined aliases and
# before the deprecated ones.
my %system_aliases = (
    # Icky 3.2 names with parentheses.
    'LINE FEED'       => 0x0A, # LINE FEED (LF)
    'FORM FEED'       => 0x0C, # FORM FEED (FF)
    'CARRIAGE RETURN' => 0x0D, # CARRIAGE RETURN (CR)
    'NEXT LINE'       => 0x85, # NEXT LINE (NEL)

    # Some variant names from Wikipedia
    'SINGLE-SHIFT 2'          => 0x8E,
    'SINGLE-SHIFT 3'          => 0x8F,
    'PRIVATE USE 1'           => 0x91,
    'PRIVATE USE 2'           => 0x92,
    'START OF PROTECTED AREA' => 0x96,
    'END OF PROTECTED AREA'   => 0x97,

    # Convenience.  Standard abbreviations for the controls
    'NUL' => 0x00, # NULL
    'SOH' => 0x01, # START OF HEADING
    'STX' => 0x02, # START OF TEXT
    'ETX' => 0x03, # END OF TEXT
    'EOT' => 0x04, # END OF TRANSMISSION
    'ENQ' => 0x05, # ENQUIRY
    'ACK' => 0x06, # ACKNOWLEDGE
    'BEL' => 0x07, # BELL
    'BS'  => 0x08, # BACKSPACE
    'HT'  => 0x09, # HORIZONTAL TABULATION
    'LF'  => 0x0A, # LINE FEED (LF)
    'VT'  => 0x0B, # VERTICAL TABULATION
    'FF'  => 0x0C, # FORM FEED (FF)
    'CR'  => 0x0D, # CARRIAGE RETURN (CR)
    'SO'  => 0x0E, # SHIFT OUT
    'SI'  => 0x0F, # SHIFT IN
    'DLE' => 0x10, # DATA LINK ESCAPE
    'DC1' => 0x11, # DEVICE CONTROL ONE
    'DC2' => 0x12, # DEVICE CONTROL TWO
    'DC3' => 0x13, # DEVICE CONTROL THREE
    'DC4' => 0x14, # DEVICE CONTROL FOUR
    'NAK' => 0x15, # NEGATIVE ACKNOWLEDGE
    'SYN' => 0x16, # SYNCHRONOUS IDLE
    'ETB' => 0x17, # END OF TRANSMISSION BLOCK
    'CAN' => 0x18, # CANCEL
    'EOM' => 0x19, # END OF MEDIUM
    'SUB' => 0x1A, # SUBSTITUTE
    'ESC' => 0x1B, # ESCAPE
    'FS'  => 0x1C, # FILE SEPARATOR
    'GS'  => 0x1D, # GROUP SEPARATOR
    'RS'  => 0x1E, # RECORD SEPARATOR
    'US'  => 0x1F, # UNIT SEPARATOR
    'DEL' => 0x7F, # DELETE
    'BPH' => 0x82, # BREAK PERMITTED HERE
    'NBH' => 0x83, # NO BREAK HERE
    'NEL' => 0x85, # NEXT LINE (NEL)
    'SSA' => 0x86, # START OF SELECTED AREA
    'ESA' => 0x87, # END OF SELECTED AREA
    'HTS' => 0x88, # CHARACTER TABULATION SET
    'HTJ' => 0x89, # CHARACTER TABULATION WITH JUSTIFICATION
    'VTS' => 0x8A, # LINE TABULATION SET
    'PLD' => 0x8B, # PARTIAL LINE FORWARD
    'PLU' => 0x8C, # PARTIAL LINE BACKWARD
    'RI'  => 0x8D, # REVERSE LINE FEED  (key formerly had a trailing blank)
    'SS2' => 0x8E, # SINGLE SHIFT TWO
    'SS3' => 0x8F, # SINGLE SHIFT THREE
    'DCS' => 0x90, # DEVICE CONTROL STRING
    'PU1' => 0x91, # PRIVATE USE ONE
    'PU2' => 0x92, # PRIVATE USE TWO
    'STS' => 0x93, # SET TRANSMIT STATE
    'CCH' => 0x94, # CANCEL CHARACTER
    'MW'  => 0x95, # MESSAGE WAITING  (key formerly had a trailing blank)
    'SPA' => 0x96, # START OF GUARDED AREA
    'EPA' => 0x97, # END OF GUARDED AREA
    'SOS' => 0x98, # START OF STRING
    'SCI' => 0x9A, # SINGLE CHARACTER INTRODUCER
    'CSI' => 0x9B, # CONTROL SEQUENCE INTRODUCER
    'ST'  => 0x9C, # STRING TERMINATOR  (key formerly had a trailing blank)
    'OSC' => 0x9D, # OPERATING SYSTEM COMMAND
    'PM'  => 0x9E, # PRIVACY MESSAGE  (key formerly had a trailing blank)
    'APC' => 0x9F, # APPLICATION PROGRAM COMMAND

    # There are no names for these in the Unicode standard;
    # perhaps should be deprecated, but then again there are
    # no alternative names, so am not deprecating.  And if
    # did, the code would have to change to not recommend an
    # alternative for these.
    'PADDING CHARACTER'                   => 0x80,
    'PAD'                                 => 0x80,
    'HIGH OCTET PRESET'                   => 0x81,
    'HOP'                                 => 0x81,
    'INDEX'                               => 0x84,
    'IND'                                 => 0x84,
    'SINGLE GRAPHIC CHARACTER INTRODUCER' => 0x99,
    'SGC'                                 => 0x99,

    # More convenience.  For further convenience,
    # it is suggested some way of using the NamesList
    # aliases be implemented, but there are ambiguities in
    # NamesList.txt
    'BOM'             => 0xFEFF, # BYTE ORDER MARK
    'BYTE ORDER MARK' => 0xFEFF,
    'CGJ'             => 0x034F, # COMBINING GRAPHEME JOINER
    'FVS1'            => 0x180B, # MONGOLIAN FREE VARIATION SELECTOR ONE
    'FVS2'            => 0x180C, # MONGOLIAN FREE VARIATION SELECTOR TWO
    'FVS3'            => 0x180D, # MONGOLIAN FREE VARIATION SELECTOR THREE
    'LRE'             => 0x202A, # LEFT-TO-RIGHT EMBEDDING
    'LRM'             => 0x200E, # LEFT-TO-RIGHT MARK
    'LRO'             => 0x202D, # LEFT-TO-RIGHT OVERRIDE
    'MMSP'            => 0x205F, # MEDIUM MATHEMATICAL SPACE
    'MVS'             => 0x180E, # MONGOLIAN VOWEL SEPARATOR
    'NBSP'            => 0x00A0, # NO-BREAK SPACE
    'NNBSP'           => 0x202F, # NARROW NO-BREAK SPACE
    'PDF'             => 0x202C, # POP DIRECTIONAL FORMATTING
    'RLE'             => 0x202B, # RIGHT-TO-LEFT EMBEDDING
    'RLM'             => 0x200F, # RIGHT-TO-LEFT MARK
    'RLO'             => 0x202E, # RIGHT-TO-LEFT OVERRIDE
    'SHY'             => 0x00AD, # SOFT HYPHEN

    # VS1 .. VS16 are VARIATION SELECTOR-1 .. -16, at 0xFE00 .. 0xFE0F
    ( map { ( "VS$_" => 0xFE00 + $_ - 1 ) } 1 .. 16 ),

    # VS17 .. VS256 are VARIATION SELECTOR-17 .. -256, at 0xE0100 .. 0xE01EF
    ( map { ( "VS$_" => 0xE0100 + $_ - 17 ) } 17 .. 256 ),

    'WJ'   => 0x2060, # WORD JOINER
    'ZWJ'  => 0x200D, # ZERO WIDTH JOINER
    'ZWNJ' => 0x200C, # ZERO WIDTH NON-JOINER
    'ZWSP' => 0x200B, # ZERO WIDTH SPACE
);
52ea3e69 384
# Names from before Unicode 3.2, kept for compatibility and covering only
# the first 256 code points.  lookup_name() still resolves them but issues
# a 'deprecated' warning.  Listed in code point order; the trailing comment
# on each entry gives the name that is current for that code point.
my %deprecated_aliases = (
    'HORIZONTAL TABULATION'                    => 0x09, # CHARACTER TABULATION
    'VERTICAL TABULATION'                      => 0x0B, # LINE TABULATION
    'FILE SEPARATOR'                           => 0x1C, # INFORMATION SEPARATOR FOUR
    'GROUP SEPARATOR'                          => 0x1D, # INFORMATION SEPARATOR THREE
    'RECORD SEPARATOR'                         => 0x1E, # INFORMATION SEPARATOR TWO
    'UNIT SEPARATOR'                           => 0x1F, # INFORMATION SEPARATOR ONE
    'HORIZONTAL TABULATION SET'                => 0x88, # CHARACTER TABULATION SET
    'HORIZONTAL TABULATION WITH JUSTIFICATION' => 0x89, # CHARACTER TABULATION WITH JUSTIFICATION
    'VERTICAL TABULATION SET'                  => 0x8A, # LINE TABULATION SET
    'PARTIAL LINE DOWN'                        => 0x8B, # PARTIAL LINE FORWARD
    'PARTIAL LINE UP'                          => 0x8C, # PARTIAL LINE BACKWARD
    'REVERSE INDEX'                            => 0x8D, # REVERSE LINE FEED
);
52ea3e69 401
# User defined aliases.  Even more convenient :)  All three tables start
# out empty and are filled in by alias().
my %user_name_aliases;      # alias => another character name
my %user_numeric_aliases;   # alias => code point
my %inverse_user_aliases;   # code point as "%04X" hex string => alias

# Holds what 'do "unicore/Name.pl"' returned, once some sub has needed it.
my $txt;

# A decimal number with no leading zero.
my $decimal_qr = qr/ ^ [1-9] \d* $ /x;

# A hex number with an optional "U+", "u+", "0x" or "0X" prefix.
# Returns the hex number in $1.
my $hex_qr = qr/ ^ (?: [Uu] \+ | 0 [xX] )? ( [[:xdigit:]]+ ) $ /x;
423cee85 418
8878f897
T
# Die with the given message via Carp::croak.  Carp is loaded only when an
# error actually has to be reported.
sub croak
{
    require Carp;

    # Transfer control with the current @_ instead of making a nested call.
    goto &Carp::croak;
} # croak
423
# Warn with the given message via Carp::carp.  Carp is loaded only when a
# warning actually has to be issued.
sub carp
{
    require Carp;

    # Transfer control with the current @_ instead of making a nested call.
    goto &Carp::carp;
} # carp
428
35c0985d
MB
# Registers user-defined aliases.  Takes either a single hash reference or
# a flat list of name => value pairs.  A value that is a decimal or a hex
# number makes the name an alias for that code point; any other value is
# stored as the character name that the alias stands for.
sub alias (@)
{
    my $table = ref $_[0] ? $_[0] : { @_ };

    foreach my $alias_name (keys %$table) {
        my $target = $table->{$alias_name};

        # Work out the code point, if the target is numeric.
        my $code_point;
        if ($target =~ $decimal_qr) {
            $code_point = $target;
        }
        elsif ($target =~ $hex_qr) {
            $code_point = CORE::hex $1;
        }

        if (! defined $code_point) {
            # Not a number, so the alias stands for another name.
            $user_name_aliases{$alias_name} = $target;
            next;
        }

        $user_numeric_aliases{$alias_name} = $code_point;

        # The inverse table is keyed by the canonical form of the code
        # point: upper case hex, zero padded to at least four digits.
        $inverse_user_aliases{sprintf("%04X", $code_point)} = $alias_name;
    }
} # alias
452
5a7fb30a
KW
# Builds the message used when a character above 0xFF is requested while
# 'use bytes' is in effect.
#
#   $name - the name (or U+XXXX string) the caller asked for
#   $ord  - the code point that the name resolved to
#
# Returns the message text.  The name is passed to sprintf as an argument
# rather than interpolated into the format, so that a '%' in a name is
# output literally instead of being taken as a conversion.
sub not_legal_use_bytes_msg {
    my ($name, $ord) = @_;
    return sprintf("Character 0x%04x with name '%s' is above 0xFF with 'use bytes' in effect", $ord, $name);
}
457
35c0985d
MB
# Loads aliases from a file.  The argument is either an absolute path to an
# existing file, or a word made up of identifier characters, in which case
# "unicore/<word>_alias.pl" is loaded.  The file is run with 'do' and has
# to return a list of alias pairs, which is handed to alias().
#
# Returns 1 if aliases were loaded, 0 if the file returned nothing.
# Croaks on an unusable argument or an unusable return value.
sub alias_file ($)
{
    my ($arg) = @_;

    my $file;
    if (-f $arg && File::Spec->file_name_is_absolute($arg)) {
        $file = $arg;
    }
    elsif ($arg =~ m/^\w+$/) {
        $file = "unicore/${arg}_alias.pl";
    }
    else {
        croak("Charnames alias files can only have identifier characters");
    }

    my @pairs = do $file;
    return 0 unless @pairs;

    # A lone undef is what 'do' hands back when it could not run the file.
    croak("$file cannot be used as alias file for charnames")
        if @pairs == 1 && !defined $pairs[0];

    croak("$file did not return a (valid) list of alias pairs")
        if @pairs % 2;

    alias(@pairs);
    return 1;
} # alias_file
480
63098191
KW
481
# Finds the ordinal of a character name, first in the aliases, then in
# the large table.
#
#   $name    - the name to look up
#   $runtime - true when called at run time (from vianame()), false when
#              called at compile time (from charnames(), for \N{...})
#
# Returns the ordinal.  If the name is not found, returns nothing at run
# time; at compile time complains and returns the ordinal of the Unicode
# REPLACEMENT CHARACTER.  Croaks at compile time if 'use bytes' is in
# effect and the character won't fit in a byte.
#
# This is not optimized in any way yet
sub lookup_name {
    my $name = shift;
    my $runtime = shift;    # compile vs run time

    my $ord;

    # User alias should be checked first or else can't override ours, and if we
    # add any, could conflict with theirs.
    if (exists $user_numeric_aliases{$name}) {
        $ord = $user_numeric_aliases{$name};
    }
    elsif (exists $user_name_aliases{$name}) {
        # The alias stands for another name; that one is looked up below.
        $name = $user_name_aliases{$name};
    }
    elsif (exists $system_aliases{$name}) {
        $ord = $system_aliases{$name};
    }
    elsif (exists $deprecated_aliases{$name}) {
        require warnings;
        warnings::warnif('deprecated', "Unicode character name \"$name\" is deprecated, use \"" . viacode($deprecated_aliases{$name}) . "\" instead");
        $ord = $deprecated_aliases{$name};
    }

    my @off;

    if (! defined $ord) {
        ## Suck in the code/name list as a big string.
        ## Lines look like:
        ##     "0052\t\tLATIN CAPITAL LETTER R\n"
        $txt = do "unicore/Name.pl" unless $txt;

        ## @off will hold the index into the code/name string of the start and
        ## end of the name as we find it.

        ## If :full, look for the name exactly; runtime implies full
        if (($runtime || $^H{charnames_full}) && $txt =~ /\t\t\Q$name\E$/m) {
            @off = ($-[0] + 2, $+[0]);    # The 2 is for the 2 tabs
        }

        ## If we didn't get above, and :short allowed, look for the short name.
        ## The short name is like "greek:Sigma"
        unless (@off) {
            if (($runtime || $^H{charnames_short}) && $name =~ /^(.+?):(.+)/s) {
                my ($script, $cname) = ($1, $2);
                my $case = $cname =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL";

                # The script part comes straight from the caller, so it is
                # quoted to be matched literally rather than as a pattern.
                my $script_pat = quotemeta uc $script;
                if ($txt =~ m/\t\t$script_pat (?:$case )?LETTER \U\Q$cname\E$/m) {
                    @off = ($-[0] + 2, $+[0]);
                }
            }
        }

        ## If we still don't have it, check for the name among the loaded
        ## scripts.
        if (! $runtime && not @off) {
            my $case = $name =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL";
            for my $script (@{$^H{charnames_scripts}}) {
                if ($txt =~ m/\t\t$script (?:$case )?LETTER \U\Q$name\E$/m) {
                    @off = ($-[0] + 2, $+[0]);
                    last;
                }
            }
        }

        ## If we don't have it by now, give up.
        unless (@off) {
            return if $runtime;
            carp("Unknown charname '$name'");

            # This sub returns ordinals, so hand back the ordinal of the
            # REPLACEMENT CHARACTER, not the character itself.
            return 0xFFFD;
        }

        ##
        ## Now know where in the string the name starts.
        ## The code, in hex, is before that.
        ##
        ## The code can be 4-6 characters long, so we've got to sort of
        ## go look for it, just after the newline that comes before $off[0].
        ##
        ## This would be much easier if unicore/Name.pl had info in
        ## a name/code order, instead of code/name order.
        ##
        ## The +1 after the rindex() is to skip past the newline we're finding,
        ## or, if the rindex() fails, to put us to an offset of zero.
        ##
        my $hexstart = rindex($txt, "\n", $off[0]) + 1;

        ## we know where it starts, so turn into number -
        ## the ordinal for the char.
        $ord = CORE::hex substr($txt, $hexstart, $off[0] - 2 - $hexstart);
    }

    return $ord if $runtime || $ord <= 255 || ! ($^H & $bytes::hint_bits);

    # Here is compile time, "use bytes" is in effect, and the character
    # won't fit in a byte

    # Get the official name if have one for the message
    $name = substr($txt, $off[0], $off[1] - $off[0]) if @off;

    croak(not_legal_use_bytes_msg($name, $ord));
} # lookup_name
588
# For \N{...}.  Looks the name up as a compile-time request and returns
# the character itself, or nothing if the lookup yields no ordinal.  If
# not in 'use bytes', forces the result into utf8.
sub charnames {
    my ($charname) = @_;

    my $code_point = lookup_name($charname, 0);    # 0 means compile-time
    if (! defined $code_point) {
        return;
    }

    if ($^H & $bytes::hint_bits) {
        return chr $code_point;
    }

    no warnings 'utf8';    # allow even illegal characters
    return pack("U", $code_point);
}
423cee85 602
b177ca84
JF
# Implements 'use charnames LIST'.  Installs the \N{...} handler in %^H and
# records which lookups are enabled for the scope being compiled:
#
#   :full, :short  - stored as $^H{charnames_full} / $^H{charnames_short}
#   :alias => ARG  - ARG is a hash reference of aliases, or names an alias
#                    file (optionally written with a leading ':')
#   anything else  - taken as a script name; stored upper cased in
#                    $^H{charnames_scripts}, in the order given, so that
#                    lookup_name() tries the scripts in that order
#
# Carps when called without arguments; croaks on a malformed :alias.
sub import
{
    shift; ## ignore class name

    if (not @_) {
        carp("`use charnames' needs explicit imports list");
    }
    $^H{charnames} = \&charnames;

    ##
    ## Sort the arguments into aliases to load and options/scripts to keep.
    ##
    my $promote = 0;
    my @args;

    # Loop on the argument count, not on the truth of the argument, so that
    # a false argument doesn't silently discard everything after it.
    while (@_) {
        my $arg = shift;
        next unless defined $arg && length $arg;

        if ($arg eq ":alias") {
            @_ or
                croak(":alias needs an argument in charnames");
            my $alias = shift;
            if (ref $alias) {
                ref $alias eq "HASH" or
                    croak("Only HASH reference supported as argument to :alias");
                alias($alias);
                next;
            }
            if ($alias =~ m{:(\w+)$}) {
                $1 eq "full" || $1 eq "short" and
                    croak(":alias cannot use existing pragma :$1 (reversed order?)");

                # An alias file given in the ':name' form implies :full if
                # nothing else was asked for.
                alias_file($1) and $promote = 1;
                next;
            }
            alias_file($alias);
            next;
        }
        if (substr($arg, 0, 1) eq ':' and ! ($arg eq ":full" || $arg eq ":short")) {
            warn "unsupported special '$arg' in charnames";
            next;
        }
        push @args, $arg;
    }
    @args == 0 && $promote and @args = (":full");

    ##
    ## Separate the options from the script names.  The scripts are kept in
    ## the order they were given, each listed once.
    ##
    my (%option, %seen, @scripts);
    for my $arg (@args) {
        if ($arg eq ':full' || $arg eq ':short') {
            $option{$arg} = 1;
            next;
        }
        my $script = uc $arg;
        push @scripts, $script unless $seen{$script}++;
    }

    $^H{charnames_full}    = $option{':full'};
    $^H{charnames_short}   = $option{':short'};
    $^H{charnames_scripts} = \@scripts;

    ##
    ## If utf8? warnings are enabled, and some scripts were given,
    ## see if at least we can find one letter of each script.
    ##
    if (warnings::enabled('utf8') && @{$^H{charnames_scripts}}) {
        $txt = do "unicore/Name.pl" unless $txt;

        for my $script (@{$^H{charnames_scripts}}) {
            if (not $txt =~ m/\t\t$script (?:CAPITAL |SMALL )?LETTER /) {
                warnings::warn('utf8', "No such script: '$script'");
            }
        }
    }
} # import
423cee85 663
63098191
KW
# Names already found by viacode(), keyed by the canonical hex string.
my %viacode;

# Returns the name of the code point argument, which may be a decimal
# number or a hex number with an optional U+ or 0x prefix.  The official
# name is preferred; a user-defined alias for the code point is used only
# when there is no official name.  Returns nothing (after carping, where
# that applies) for a wrong argument count, an unparsable argument, or a
# code point without a name.
sub viacode {

    unless (@_ == 1) {
        carp("charnames::viacode() expects one argument");
        return;
    }

    my $arg = shift;

    # This is derived from Unicode::UCD, where it is nearly the same as the
    # function _getcode(), but here it makes sure that even a hex argument
    # has the proper number of leading zeros, which is critical in
    # matching against $txt below
    my $hex;
    if ($arg =~ $decimal_qr) {
        $hex = sprintf("%04X", $arg);
    }
    elsif ($arg =~ $hex_qr) {
        # Below is the line that differs from the _getcode() source
        $hex = sprintf("%04X", hex($1));
    }
    else {
        carp("unexpected arg \"$arg\" to charnames::viacode()");
        return;
    }

    if (exists $viacode{$hex}) {
        return $viacode{$hex};
    }

    # If the code point is above the max in the table, there's no point
    # looking through it.  Checking the length first is slightly faster
    if (length($hex) <= 5 || CORE::hex($hex) <= 0x10FFFF) {
        $txt = do "unicore/Name.pl" unless $txt;

        # Return the official name, if exists.  It's unclear to me (khw) at
        # this juncture if it is better to return a user-defined override, so
        # leaving it as is for now.
        if ($txt =~ m/^$hex\t\t(.+)/m) {
            my $official = $1;
            $viacode{$hex} = $official;
            return $official;
        }
    }

    # See if there is a user name for it, before giving up completely.
    if (exists $inverse_user_aliases{$hex}) {
        $viacode{$hex} = $inverse_user_aliases{$hex};
        return $inverse_user_aliases{$hex};
    }

    if (CORE::hex($hex) > 0x10FFFF) {
        carp("Unicode characters only allocated up to U+10FFFF (you asked for U+$hex)");
    }
    return;
} # viacode
daf0d493 719
# Results of earlier vianame() lookups, keyed by the name asked for.
my %vianame;

# Looks up the character name and returns its ordinal if found, undef
# otherwise.  The exception is an argument of the form "U+hex": for that
# the character itself is returned, or nothing (with a carp) if it is
# above 0xFF and the caller has 'use bytes' in effect.
sub vianame
{
    unless (@_ == 1) {
        carp("charnames::vianame() expects one name argument");
        return;
    }

    my $arg = shift;

    if ($arg =~ /^U\+([0-9a-fA-F]+)$/) {

        # khw claims that this is bad.  The function should return either a
        # an ord or a chr for all inputs; not be bipolar.  Also, under 'use
        # bytes', can create a chr above 255.
        my $ord = CORE::hex $1;

        # Element 8 of caller's list holds the hints our caller was
        # compiled with.
        if ($ord > 255 && ((caller 0)[8] & $bytes::hint_bits)) {
            carp(not_legal_use_bytes_msg($arg, $ord));
            return;
        }
        return chr $ord;
    }

    # Failed lookups are remembered too, as undef.
    $vianame{$arg} = lookup_name($arg, 1)    # 1 means run-time
        unless exists $vianame{$arg};

    return $vianame{$arg};
} # vianame
b177ca84 751
423cee85
JH
752
7531;
754__END__
755
756=head1 NAME
757
da9dec57 758charnames - access to Unicode character names and define character names for C<\N{named}> string literal escapes
423cee85
JH
759
760=head1 SYNOPSIS
761
762 use charnames ':full';
4a2d328f 763 print "\N{GREEK SMALL LETTER SIGMA} is called sigma.\n";
423cee85
JH
764
765 use charnames ':short';
4a2d328f 766 print "\N{greek:Sigma} is an upper-case sigma.\n";
423cee85
JH
767
768 use charnames qw(cyrillic greek);
4a2d328f 769 print "\N{sigma} is Greek sigma, and \N{be} is Cyrillic b.\n";
423cee85 770
35c0985d
MB
771 use charnames ":full", ":alias" => {
772 e_ACUTE => "LATIN SMALL LETTER E WITH ACUTE",
e5432b89 773 mychar => 0xF0000, # Private use area
76ae0c45 774 };
35c0985d 775 print "\N{e_ACUTE} is a small letter e with an acute.\n";
da9dec57 776 print "\N{mychar} allows me to name private use characters.\n";
35c0985d 777
76ae0c45 778 use charnames ();
a23c04e4 779 print charnames::viacode(0x1234); # prints "ETHIOPIC SYLLABLE SEE"
16036bcd
KW
780 printf "%04X", charnames::vianame("GOTHIC LETTER AHSA"); # prints
781 # "10330"
b177ca84 782
423cee85
JH
783=head1 DESCRIPTION
784
da9dec57
KW
785Pragma C<use charnames> is used to gain access to the names of the
786Unicode characters, and to allow you to define your own character names.
787
788All forms of the pragma enable use of the
789L</charnames::vianame(I<name>)> function for run-time lookup of a
790character name to get its ordinal (code point), and the inverse
791function, L</charnames::viacode(I<code>)>.
792
793Forms other than C<S<"use charnames ();">> enable the use of
794C<\N{I<CHARNAME>}> sequences to compile a Unicode character into a
795string based on its name.
796
797Note that C<\N{U+I<...>}>, where the I<...> is a hexadecimal number,
798also inserts a character into a string, but doesn't require the use of
799this pragma. The character it inserts is the one whose code point
800(ordinal value) is equal to the number. For example, C<"\N{U+263a}"> is
801the Unicode (white background, black foreground) smiley face; it doesn't
802require this pragma, whereas the equivalent, C<"\N{WHITE SMILING FACE}">
803does.
804Also, C<\N{I<...>}> can mean a regex quantifier instead of a character
805name, when the I<...> is a number (or comma separated pair of numbers;
806see L<perlreref/QUANTIFIERS>), and is not related to this pragma.
807
808The C<charnames> pragma supports arguments C<:full>, C<:short>, script
809names and customized aliases. If C<:full> is present, for expansion of
810C<\N{I<CHARNAME>}>, the string I<CHARNAME> is first looked up in the list of
76ae0c45 811standard Unicode character names. If C<:short> is present, and
da9dec57
KW
812I<CHARNAME> has the form C<I<SCRIPT>:I<CNAME>>, then I<CNAME> is looked up
813as a letter in script I<SCRIPT>. If C<use charnames> is used
814with script name arguments, then for C<\N{I<CHARNAME>}> the name
815I<CHARNAME> is looked up as a letter in the given scripts (in the
16036bcd
KW
816specified order). Customized aliases can override these, and are explained in
817L</CUSTOM ALIASES>.
423cee85 818
da9dec57 819For lookup of I<CHARNAME> inside a given script I<SCRIPTNAME>
d5448623 820this pragma looks for the names
423cee85
JH
821
822 SCRIPTNAME CAPITAL LETTER CHARNAME
823 SCRIPTNAME SMALL LETTER CHARNAME
824 SCRIPTNAME LETTER CHARNAME
825
da9dec57 826in the table of standard Unicode names. If I<CHARNAME> is lowercase,
daf0d493
JH
827then the C<CAPITAL> variant is ignored, otherwise the C<SMALL> variant
828is ignored.
829
da9dec57
KW
830Note that C<\N{...}> is compile-time; it's a special form of string
831constant used inside double-quotish strings; this means that you cannot
4e2cda5d 832use variables inside the C<\N{...}>. If you want similar run-time
da9dec57 833functionality, use L<charnames::vianame()|/charnames::vianame(I<name>)>.
423cee85 834
301a3cda 835For the C0 and C1 control characters (U+0000..U+001F, U+0080..U+009F)
da9dec57
KW
836there are no official Unicode names but you can use instead the ISO 6429
837names (LINE FEED, ESCAPE, and so forth, and their abbreviations, LF,
838ESC, ...). In Unicode 3.2 (as of Perl 5.8) some naming changes take
839place, and ISO 6429 has been updated; see L</ALIASES>.
301a3cda 840
e5432b89
KW
841If the input name is unknown, C<\N{NAME}> raises a warning and
842substitutes the Unicode REPLACEMENT CHARACTER (U+FFFD).
843
844It is a fatal error if C<use bytes> is in effect and the input name is
845that of a character that won't fit into a byte (i.e., whose ordinal is
846above 255).
847
da9dec57
KW
848Otherwise, any string that includes a C<\N{I<charname>}> or
849C<S<\N{U+I<code point>}>> will automatically have Unicode semantics (see
850L<perlunicode/Byte and Character Semantics>).
851
5ffe0e96 852=head1 ALIASES
423cee85 853
5ffe0e96
MB
854A few aliases have been defined for convenience: instead of having
855to use the official names
423cee85 856
5ffe0e96
MB
857 LINE FEED (LF)
858 FORM FEED (FF)
859 CARRIAGE RETURN (CR)
860 NEXT LINE (NEL)
423cee85 861
e5432b89 862(yes, with parentheses), one can use
d5448623 863
5ffe0e96
MB
864 LINE FEED
865 FORM FEED
866 CARRIAGE RETURN
867 NEXT LINE
868 LF
869 FF
870 CR
871 NEL
872
16036bcd
KW
873All the other standard abbreviations for the controls, such as C<ACK> for
874C<ACKNOWLEDGE>, can also be used.
875
5ffe0e96
MB
876One can also use
877
878 BYTE ORDER MARK
879 BOM
880
16036bcd
KW
881and these abbreviations
882
883 Abbreviation Full Name
884
885 CGJ COMBINING GRAPHEME JOINER
886 FVS1 MONGOLIAN FREE VARIATION SELECTOR ONE
887 FVS2 MONGOLIAN FREE VARIATION SELECTOR TWO
888 FVS3 MONGOLIAN FREE VARIATION SELECTOR THREE
889 LRE LEFT-TO-RIGHT EMBEDDING
890 LRM LEFT-TO-RIGHT MARK
891 LRO LEFT-TO-RIGHT OVERRIDE
892 MMSP MEDIUM MATHEMATICAL SPACE
893 MVS MONGOLIAN VOWEL SEPARATOR
894 NBSP NO-BREAK SPACE
895 NNBSP NARROW NO-BREAK SPACE
896 PDF POP DIRECTIONAL FORMATTING
897 RLE RIGHT-TO-LEFT EMBEDDING
898 RLM RIGHT-TO-LEFT MARK
899 RLO RIGHT-TO-LEFT OVERRIDE
900 SHY SOFT HYPHEN
901 VS1 VARIATION SELECTOR-1
902 .
903 .
904 .
905 VS256 VARIATION SELECTOR-256
906 WJ WORD JOINER
907 ZWJ ZERO WIDTH JOINER
908 ZWNJ ZERO WIDTH NON-JOINER
909 ZWSP ZERO WIDTH SPACE
5ffe0e96
MB
910
911For backward compatibility one can use the old names for
912certain C0 and C1 controls
913
914 old new
915
5ffe0e96
MB
916 FILE SEPARATOR INFORMATION SEPARATOR FOUR
917 GROUP SEPARATOR INFORMATION SEPARATOR THREE
16036bcd
KW
918 HORIZONTAL TABULATION CHARACTER TABULATION
919 HORIZONTAL TABULATION SET CHARACTER TABULATION SET
920 HORIZONTAL TABULATION WITH JUSTIFICATION CHARACTER TABULATION
921 WITH JUSTIFICATION
5ffe0e96
MB
922 PARTIAL LINE DOWN PARTIAL LINE FORWARD
923 PARTIAL LINE UP PARTIAL LINE BACKWARD
16036bcd
KW
924 RECORD SEPARATOR INFORMATION SEPARATOR TWO
925 REVERSE INDEX REVERSE LINE FEED
926 UNIT SEPARATOR INFORMATION SEPARATOR ONE
927 VERTICAL TABULATION LINE TABULATION
928 VERTICAL TABULATION SET LINE TABULATION SET
5ffe0e96
MB
929
930but the old names in addition to giving the character
931will also give a warning about being deprecated.
423cee85 932
16036bcd
KW
933And finally, certain published variants are usable, including some for
934controls that have no Unicode names:
935
936 END OF PROTECTED AREA
937 HIGH OCTET PRESET
938 HOP
939 IND
940 INDEX
941 PAD
942 PADDING CHARACTER
943 PRIVATE USE 1
944 PRIVATE USE 2
945 SGC
946 SINGLE GRAPHIC CHARACTER INTRODUCER
947 SINGLE-SHIFT 2
948 SINGLE-SHIFT 3
949 START OF PROTECTED AREA
950
35c0985d
MB
951=head1 CUSTOM ALIASES
952
da9dec57
KW
953You can add customized aliases to standard Unicode naming conventions
954(C<:full>). The aliases override any standard definitions, so, if
955you're twisted enough, you can change C<"\N{LATIN CAPITAL LETTER A}"> to
956mean C<"B">, etc.
55bc7d3c
KW
957
958Note that an alias should not be something that is a legal curly
959brace-enclosed quantifier (see L<perlreref/QUANTIFIERS>). For example
e5432b89
KW
960C<\N{123}> means to match 123 non-newline characters, and is not treated as a
961charnames alias. Aliases are discouraged from beginning with anything
962other than an alphabetic character and from containing anything other
963than alphanumerics, spaces, dashes, parentheses, and underscores.
964Currently they must be ASCII.
965
966An alias can map to either an official Unicode character name or to a
967numeric code point (ordinal). The latter is useful for assigning names
968to code points in Unicode private use areas such as U+E000 through
969U+F8FF. The number must look like an unsigned decimal integer, or a
970hexadecimal constant beginning with C<0x>, or C<U+>.
232cbbee 971
da9dec57 972Aliases are added either by the use of anonymous hashes:
35c0985d 973
da9dec57 974 use charnames ":alias" => {
35c0985d 975 e_ACUTE => "LATIN SMALL LETTER E WITH ACUTE",
232cbbee 976 mychar1 => 0xE8000,
35c0985d
MB
977 };
978 my $str = "\N{e_ACUTE}";
979
da9dec57 980or by using a file containing aliases:
35c0985d 981
da9dec57 982 use charnames ":alias" => "pro";
35c0985d 983
da9dec57
KW
984will try to read C<"unicore/pro_alias.pl"> from the C<@INC> path. This
985file should return a list in plain perl:
35c0985d
MB
986
987 (
988 A_GRAVE => "LATIN CAPITAL LETTER A WITH GRAVE",
989 A_CIRCUM => "LATIN CAPITAL LETTER A WITH CIRCUMFLEX",
990 A_DIAERES => "LATIN CAPITAL LETTER A WITH DIAERESIS",
991 A_TILDE => "LATIN CAPITAL LETTER A WITH TILDE",
992 A_BREVE => "LATIN CAPITAL LETTER A WITH BREVE",
993 A_RING => "LATIN CAPITAL LETTER A WITH RING ABOVE",
994 A_MACRON => "LATIN CAPITAL LETTER A WITH MACRON",
232cbbee 995 mychar2 => "U+E8001",
35c0985d
MB
996 );
997
da9dec57
KW
998Both these methods insert C<":full"> automatically as the first argument (if no
999other argument is given), and you can give the C<":full"> explicitly as
1000well, like
35c0985d 1001
da9dec57 1002 use charnames ":full", ":alias" => "pro";
35c0985d 1003
da9dec57 1004=head1 charnames::viacode(I<code>)
b177ca84
JF
1005
1006Returns the full name of the character indicated by the numeric code.
da9dec57 1007For example,
b177ca84
JF
1008
1009 print charnames::viacode(0x2722);
1010
1011prints "FOUR TEARDROP-SPOKED ASTERISK".
1012
232cbbee
KW
1013The name returned is the official name for the code point, if
1014available, otherwise your custom alias for it. This means that your
1015alias will only be returned for code points that don't have an official
1016Unicode name (nor Unicode version 1 name), such as private use code
1017points, and the 4 control characters U+0080, U+0081, U+0084, and U+0099.
da9dec57
KW
1018If you define more than one name for the code point, it is indeterminate
1019which one will be returned.
1020
1021The function returns C<undef> if no name is known for the code point.
1022In Unicode the proper name of these is the empty string, which
1023C<undef> stringifies to. (If you ask for a code point past the legal
1024Unicode maximum of U+10FFFF that you haven't assigned an alias to, you
1025get C<undef> and a warning.)
daf0d493 1026
274085e3
PN
1027Notice that the name returned for U+FEFF is "ZERO WIDTH NO-BREAK
1028SPACE", not "BYTE ORDER MARK".
1029
da9dec57 1030=head1 charnames::vianame(I<name>)
daf0d493
JH
1031
1032Returns the code point indicated by the name.
1033The example
1034
1035 printf "%04X", charnames::vianame("FOUR TEARDROP-SPOKED ASTERISK");
1036
1037prints "2722".
1038
da9dec57
KW
1039C<vianame> takes the identical inputs that C<\N{...}> does under the
1040L<C<:full> and C<:short>|/DESCRIPTION> options to the C<charnames>
1041pragma, including any L<custom aliases|/CUSTOM ALIASES> you may have
1042defined.
b177ca84 1043
da9dec57
KW
1044There are just two differences. The first is that if the input name is
1045unknown it returns C<undef> instead of the REPLACEMENT CHARACTER, and
1046does not raise a warning message.
1047The second is that the C<S<use bytes>> pragma has no effect on this function.
b177ca84 1048
5ffe0e96 1049=head1 CUSTOM TRANSLATORS
52ea3e69 1050
5ffe0e96
MB
1051The mechanism of translation of C<\N{...}> escapes is general and not
1052hardwired into F<charnames.pm>. A module can install custom
1053translations (inside the scope which C<use>s the module) with the
1054following magic incantation:
52ea3e69 1055
5ffe0e96
MB
1056 sub import {
1057 shift;
1058 $^H{charnames} = \&translator;
1059 }
52ea3e69 1060
da9dec57 1061Here translator() is a subroutine which takes I<CHARNAME> as an
5ffe0e96 1062argument, and returns text to insert into the string instead of the
da9dec57 1063C<\N{I<CHARNAME>}> escape. Since the text to insert should be different
5ffe0e96
MB
1064in C<bytes> mode and out of it, the function should check the current
1065state of C<bytes>-flag as in:
52ea3e69 1066
5ffe0e96
MB
1067 use bytes (); # for $bytes::hint_bits
1068 sub translator {
1069 if ($^H & $bytes::hint_bits) {
1070 return bytes_translator(@_);
1071 }
1072 else {
1073 return utf8_translator(@_);
1074 }
1075 }
52ea3e69 1076
da9dec57 1077See L</CUSTOM ALIASES> above for restrictions on I<CHARNAME>.
f0175764 1078
423cee85
JH
1079=head1 BUGS
1080
55bc7d3c 1081vianame returns a chr if the input name is of the form C<U+...>, and an ord
a0a3bc7f 1082otherwise. It is proposed to change this to always return an ord. Send email
16036bcd 1083to C<perl5-porters@perl.org> to comment on this proposal.
55bc7d3c 1084
da9dec57
KW
1085All the Hangul syllable characters are treated as having no names, as
1086are almost all the CJK Unicode characters that have their code points as
1087part of their names.
55bc7d3c 1088
16036bcd
KW
1089Names must be ASCII characters only, which means that you are out of luck if
1090you want to create aliases in a language where some or all the characters of
1091the desired aliases are non-ASCII.
bee80e93 1092
fe749c9a
KW
1093Unicode standard named sequences are not recognized, such as
1094C<LATIN CAPITAL LETTER A WITH MACRON AND GRAVE>
1095(which should mean C<LATIN CAPITAL LETTER A WITH MACRON> with an additional
1096C<COMBINING GRAVE ACCENT>).
1097
55bc7d3c 1098Since evaluation of the translation function happens in the middle of
423cee85 1099compilation (of a string literal), the translation function should not
da9dec57
KW
1100do any C<eval>s or C<require>s. This restriction should be lifted (but
1101is low priority) in a future version of Perl.
423cee85
JH
1102
1103=cut