This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
charnames.pm: More refactoring for performance
[perl5.git] / lib / charnames.pm
CommitLineData
423cee85 1package charnames;
b177ca84
JF
2use strict;
3use warnings;
51cf30b6 4use File::Spec;
e79869e1 5our $VERSION = '1.14';
b75c8c73 6
52fb7278 7use bytes (); # for $bytes::hint_bits
423cee85 8
# Names that charnames itself accepts in addition to the official Unicode
# names.  Each key is compared literally against the requested name, so a
# key must be spelled exactly as the user would write it, with no leading
# or trailing blanks; each value is the code point the name stands for.
my %system_aliases = (

    # Icky 3.2 names with parentheses.
    'LINE FEED'       => 0x0A, # LINE FEED (LF)
    'FORM FEED'       => 0x0C, # FORM FEED (FF)
    'CARRIAGE RETURN' => 0x0D, # CARRIAGE RETURN (CR)
    'NEXT LINE'       => 0x85, # NEXT LINE (NEL)

    # Some variant names from Wikipedia
    'SINGLE-SHIFT 2'          => 0x8E,
    'SINGLE-SHIFT 3'          => 0x8F,
    'PRIVATE USE 1'           => 0x91,
    'PRIVATE USE 2'           => 0x92,
    'START OF PROTECTED AREA' => 0x96,
    'END OF PROTECTED AREA'   => 0x97,

    # Convenience.  Standard abbreviations for the controls
    'NUL' => 0x00, # NULL
    'SOH' => 0x01, # START OF HEADING
    'STX' => 0x02, # START OF TEXT
    'ETX' => 0x03, # END OF TEXT
    'EOT' => 0x04, # END OF TRANSMISSION
    'ENQ' => 0x05, # ENQUIRY
    'ACK' => 0x06, # ACKNOWLEDGE
    'BEL' => 0x07, # BELL
    'BS'  => 0x08, # BACKSPACE
    'HT'  => 0x09, # HORIZONTAL TABULATION
    'LF'  => 0x0A, # LINE FEED (LF)
    'VT'  => 0x0B, # VERTICAL TABULATION
    'FF'  => 0x0C, # FORM FEED (FF)
    'CR'  => 0x0D, # CARRIAGE RETURN (CR)
    'SO'  => 0x0E, # SHIFT OUT
    'SI'  => 0x0F, # SHIFT IN
    'DLE' => 0x10, # DATA LINK ESCAPE
    'DC1' => 0x11, # DEVICE CONTROL ONE
    'DC2' => 0x12, # DEVICE CONTROL TWO
    'DC3' => 0x13, # DEVICE CONTROL THREE
    'DC4' => 0x14, # DEVICE CONTROL FOUR
    'NAK' => 0x15, # NEGATIVE ACKNOWLEDGE
    'SYN' => 0x16, # SYNCHRONOUS IDLE
    'ETB' => 0x17, # END OF TRANSMISSION BLOCK
    'CAN' => 0x18, # CANCEL
    'EOM' => 0x19, # END OF MEDIUM
    'SUB' => 0x1A, # SUBSTITUTE
    'ESC' => 0x1B, # ESCAPE
    'FS'  => 0x1C, # FILE SEPARATOR
    'GS'  => 0x1D, # GROUP SEPARATOR
    'RS'  => 0x1E, # RECORD SEPARATOR
    'US'  => 0x1F, # UNIT SEPARATOR
    'DEL' => 0x7F, # DELETE
    'BPH' => 0x82, # BREAK PERMITTED HERE
    'NBH' => 0x83, # NO BREAK HERE
    'NEL' => 0x85, # NEXT LINE (NEL)
    'SSA' => 0x86, # START OF SELECTED AREA
    'ESA' => 0x87, # END OF SELECTED AREA
    'HTS' => 0x88, # CHARACTER TABULATION SET
    'HTJ' => 0x89, # CHARACTER TABULATION WITH JUSTIFICATION
    'VTS' => 0x8A, # LINE TABULATION SET
    'PLD' => 0x8B, # PARTIAL LINE FORWARD
    'PLU' => 0x8C, # PARTIAL LINE BACKWARD
    'RI'  => 0x8D, # REVERSE LINE FEED
    'SS2' => 0x8E, # SINGLE SHIFT TWO
    'SS3' => 0x8F, # SINGLE SHIFT THREE
    'DCS' => 0x90, # DEVICE CONTROL STRING
    'PU1' => 0x91, # PRIVATE USE ONE
    'PU2' => 0x92, # PRIVATE USE TWO
    'STS' => 0x93, # SET TRANSMIT STATE
    'CCH' => 0x94, # CANCEL CHARACTER
    'MW'  => 0x95, # MESSAGE WAITING
    'SPA' => 0x96, # START OF GUARDED AREA
    'EPA' => 0x97, # END OF GUARDED AREA
    'SOS' => 0x98, # START OF STRING
    'SCI' => 0x9A, # SINGLE CHARACTER INTRODUCER
    'CSI' => 0x9B, # CONTROL SEQUENCE INTRODUCER
    'ST'  => 0x9C, # STRING TERMINATOR
    'OSC' => 0x9D, # OPERATING SYSTEM COMMAND
    'PM'  => 0x9E, # PRIVACY MESSAGE
    'APC' => 0x9F, # APPLICATION PROGRAM COMMAND

    # There are no names for these in the Unicode standard;
    # perhaps should be deprecated, but then again there are
    # no alternative names, so am not deprecating.  And if
    # did, the code would have to change to not recommend an
    # alternative for these.
    'PADDING CHARACTER'                   => 0x80,
    'PAD'                                 => 0x80,
    'HIGH OCTET PRESET'                   => 0x81,
    'HOP'                                 => 0x81,
    'INDEX'                               => 0x84,
    'IND'                                 => 0x84,
    'SINGLE GRAPHIC CHARACTER INTRODUCER' => 0x99,
    'SGC'                                 => 0x99,

    # More convenience.  For further convenience,
    # it is suggested some way of using the NamesList
    # aliases be implemented, but there are ambiguities in
    # NamesList.txt
    'BOM'             => 0xFEFF, # BYTE ORDER MARK
    'BYTE ORDER MARK' => 0xFEFF,
    'CGJ'             => 0x034F, # COMBINING GRAPHEME JOINER
    'FVS1'            => 0x180B, # MONGOLIAN FREE VARIATION SELECTOR ONE
    'FVS2'            => 0x180C, # MONGOLIAN FREE VARIATION SELECTOR TWO
    'FVS3'            => 0x180D, # MONGOLIAN FREE VARIATION SELECTOR THREE
    'LRE'             => 0x202A, # LEFT-TO-RIGHT EMBEDDING
    'LRM'             => 0x200E, # LEFT-TO-RIGHT MARK
    'LRO'             => 0x202D, # LEFT-TO-RIGHT OVERRIDE
    'MMSP'            => 0x205F, # MEDIUM MATHEMATICAL SPACE
    'MVS'             => 0x180E, # MONGOLIAN VOWEL SEPARATOR
    'NBSP'            => 0x00A0, # NO-BREAK SPACE
    'NNBSP'           => 0x202F, # NARROW NO-BREAK SPACE
    'PDF'             => 0x202C, # POP DIRECTIONAL FORMATTING
    'RLE'             => 0x202B, # RIGHT-TO-LEFT EMBEDDING
    'RLM'             => 0x200F, # RIGHT-TO-LEFT MARK
    'RLO'             => 0x202E, # RIGHT-TO-LEFT OVERRIDE
    'SHY'             => 0x00AD, # SOFT HYPHEN

    # VS1 .. VS16 are VARIATION SELECTOR-1 .. -16, which occupy the
    # contiguous range U+FE00 .. U+FE0F.
    ( map { ( "VS$_" => 0xFE00 + $_ - 1 ) } 1 .. 16 ),

    # VS17 .. VS256 are VARIATION SELECTOR-17 .. -256, which occupy the
    # contiguous range U+E0100 .. U+E01EF.
    ( map { ( "VS$_" => 0xE0100 + $_ - 17 ) } 17 .. 256 ),

    'WJ'   => 0x2060, # WORD JOINER
    'ZWJ'  => 0x200D, # ZERO WIDTH JOINER
    'ZWNJ' => 0x200C, # ZERO WIDTH NON-JOINER
    'ZWSP' => 0x200B, # ZERO WIDTH SPACE
);
52ea3e69 384
# Pre-3.2 names, kept for compatibility (only for the first 256
# characters).  Looking one of these up still works, but produces a
# 'deprecated' warning that recommends the current official name, which is
# the one shown in the trailing comment.
my %deprecated_aliases = (

    # C0 controls
    'HORIZONTAL TABULATION' => 0x09,    # CHARACTER TABULATION
    'VERTICAL TABULATION'   => 0x0B,    # LINE TABULATION
    'FILE SEPARATOR'        => 0x1C,    # INFORMATION SEPARATOR FOUR
    'GROUP SEPARATOR'       => 0x1D,    # INFORMATION SEPARATOR THREE
    'RECORD SEPARATOR'      => 0x1E,    # INFORMATION SEPARATOR TWO
    'UNIT SEPARATOR'        => 0x1F,    # INFORMATION SEPARATOR ONE

    # C1 controls
    'HORIZONTAL TABULATION SET'
                            => 0x88,    # CHARACTER TABULATION SET
    'HORIZONTAL TABULATION WITH JUSTIFICATION'
                            => 0x89,    # CHARACTER TABULATION WITH JUSTIFICATION
    'VERTICAL TABULATION SET'
                            => 0x8A,    # LINE TABULATION SET
    'PARTIAL LINE DOWN'     => 0x8B,    # PARTIAL LINE FORWARD
    'PARTIAL LINE UP'       => 0x8C,    # PARTIAL LINE BACKWARD
    'REVERSE INDEX'         => 0x8D,    # REVERSE LINE FEED
);
52ea3e69 401
84374e30 402
my $txt;  # The table of official character names

# Cache of names that have already been looked up, so that they need not be
# looked up again.
#
# Earlier versions of charnames had scoping bugs.  For example, if script A
# is in use in one scope and we find and cache what Z resolves to, that
# cached answer cannot be used in another scope that uses script B instead,
# because Z might be an entirely different letter there.  Likewise,
# different aliases may be in effect in different scopes, :short may or may
# not be in effect, or any combination of these.  This version solves most
# of that by moving things into %^H, but some things could not be moved
# there.  One of them is the cache of names looked up at run time, in part
# because %^H is read-only at run time.
#
# I (khw) don't know why the cache was run-time only in previous versions:
# perhaps an oversight; perhaps because compile-time lookups don't happen
# in a loop and so caching them wasn't thought worthwhile; perhaps to keep
# the cache from growing too large.  I decided to fill it at compile time
# as well; this could easily be changed.
#
# This hash is not scoped, and is added to at run time.  It has no scoping
# problems because the data in it is restricted to official names, which
# are always invariant, and because it is only set and consulted during
# :full lookups, so it is unaffected by any other scoped option.  It is
# here to maintain parity with the older version.  If desired, a
# %short_names cache could be added too, as well as one per script (say in
# %script_names_cache, with each key being a hash for a script named in a
# 'use charnames' statement).  That was not done, because it adds
# complication and the aim is parity, not extension.
my %full_names_cache;

# The two patterns below are designed to be tried in this order: decimal
# first, and only then hex.  Leading zeros imply non-decimal, as do
# non-[0-9] characters.
my $decimal_qr = qr/^[1-9]\d*$/;

# On a successful match the hex digits, without any U+ or 0x prefix, are
# left in $1.
my $hex_qr = qr/^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/;
423cee85 438
8878f897
T
# Dies via Carp::croak.  Carp is loaded only when first needed, and the
# call is made with goto so that this wrapper is replaced on the call
# stack and the error is attributed as if Carp::croak had been called
# directly.
sub croak {
    require Carp;
    goto &Carp::croak;
}    # croak
443
# Warns via Carp::carp.  Carp is loaded only when first needed, and the
# call is made with goto so that this wrapper is replaced on the call
# stack and the warning is attributed as if Carp::carp had been called
# directly.
sub carp {
    require Carp;
    goto &Carp::carp;
}    # carp
448
# Registers user-defined aliases in the compile-time hints hash %^H.
#
# Takes either a single hash reference or a flat list of name => value
# pairs.  A value that denotes a code point (decimal, or hex with an
# optional U+ or 0x prefix) is recorded in charnames_ord_aliases, and the
# reverse mapping in charnames_inverse_ords; any other value is taken to be
# a character name and is recorded in charnames_name_aliases.  Pairs whose
# value is undefined are skipped.
sub alias (@) # Set up a single alias
{
    my $alias = ref $_[0] ? $_[0] : { @_ };
    foreach my $name (keys %$alias) {
        my $value = $alias->{$name};
        next unless defined $value;          # Omit if screwed up.

        # Is slightly slower to just after this statement see if it is
        # decimal, since we already know it is after having converted from
        # hex, but makes the code easier to maintain, and is called
        # infrequently, only at compile-time
        my $converted_from_hex = 0;
        if ($value !~ $decimal_qr && $value =~ $hex_qr) {
            $value = CORE::hex $1;
            $converted_from_hex = 1;
        }

        # $decimal_qr insists on a leading non-zero digit, so code point 0
        # (written "0", "0x0", "U+0000", ...) can never match it once it
        # has been converted; accept that case explicitly, as otherwise it
        # would be stored as an alias for a character *named* "0".
        if ($value =~ $decimal_qr || ($converted_from_hex && $value == 0)) {
            $^H{charnames_ord_aliases}{$name} = $value;

            # Use a canonical form.
            $^H{charnames_inverse_ords}{sprintf("%04X", $value)} = $name;
        }
        else {
            # XXX validate syntax when deprecation cycle complete. ie. start
            # with an alpha only, etc.
            $^H{charnames_name_aliases}{$name} = $value;
        }
    }
} # alias
476
5a7fb30a
KW
# Builds the error text used when a character whose code point does not
# fit in a byte is requested while 'use bytes' is in effect.
#
#   $name - the character name to show in the message
#   $ord  - the character's code point
#
# Returns the message string.  The name is passed to sprintf as an
# argument rather than interpolated into the format, so that a '%' in it
# is printed literally instead of being read as a conversion.
sub not_legal_use_bytes_msg {
    my ($name, $ord) = @_;
    return sprintf("Character 0x%04x with name '%s' is above 0xFF with 'use bytes' in effect", $ord, $name);
}
481
# Loads alias definitions from a file and registers them with alias().
#
# The argument is either the absolute path of an existing plain file, or a
# bare identifier NAME, which selects "unicore/NAME_alias.pl"; anything
# else is fatal.  The file is evaluated with 'do' and must return a list
# of alias pairs.
#
# Returns 1 if aliases were loaded, 0 if the file yielded an empty list.
# Croaks if the file yields a single undefined value or an odd number of
# elements.
sub alias_file ($) # Reads a file containing alias definitions
{
    my $arg = shift;

    my $file;
    if (-f $arg && File::Spec->file_name_is_absolute ($arg)) {
        $file = $arg;
    }
    elsif ($arg =~ m/^\w+$/) {
        $file = "unicore/${arg}_alias.pl";
    }
    else {
        croak "Charnames alias files can only have identifier characters";
    }

    # Nothing came back at all: report "no aliases loaded".
    my @alias = do $file or return 0;

    croak "$file cannot be used as alias file for charnames"
        if @alias == 1 && !defined $alias[0];
    croak "$file did not return a (valid) list of alias pairs"
        if @alias % 2;

    alias (@alias);
    return (1);
} # alias_file
504
63098191 505
sub lookup_name ($;$) {

    # Finds the ordinal of a character name: first among the user's
    # aliases, then among the aliases this module supplies, and finally in
    # the large table of official names.
    #
    # The first argument is the name.  The presence of a second argument,
    # a reference to the caller's hints hash, marks the call as a run-time
    # one; with only one argument the call is a compile-time one.
    #
    # If the name is not found, returns undef at run time; at compile time
    # complains and returns the Unicode replacement character.  At compile
    # time with 'use bytes' in effect, croaks if the character found does
    # not fit in a byte.

    my ($name, $hints_ref) = @_;
    my $runtime = (@_ > 1);     # compile vs run time

    my $ord;
    my $alias_target;   # What a user name-alias resolved to, if one was used

    if ($runtime) {

        # At runtime, but currently not at compile time, $^H gets
        # stringified, so un-stringify back to the original data
        # structures.  These get thrown away by perl before the next
        # invocation.  Also fill in the hash with the non-stringified data.
        %{$^H{charnames_name_aliases}} = split ',', $hints_ref->{charnames_stringified_names};
        %{$^H{charnames_ord_aliases}} = split ',', $hints_ref->{charnames_stringified_ords};
        $^H{$_} = $hints_ref->{$_}
            for qw(charnames_scripts charnames_full charnames_short);
    }

    # User alias should be checked first or else can't override ours, and
    # if we add any, could conflict with theirs.
    if (exists $^H{charnames_ord_aliases}{$name}) {
        $ord = $^H{charnames_ord_aliases}{$name};
    }
    elsif (exists $^H{charnames_name_aliases}{$name}) {
        $name = $^H{charnames_name_aliases}{$name};
        $alias_target = $name;  # Cache the result for any error message
    }
    elsif (exists $system_aliases{$name}) {
        $ord = $system_aliases{$name};
    }
    elsif (exists $deprecated_aliases{$name}) {
        $ord = $deprecated_aliases{$name};
        require warnings;
        warnings::warnif('deprecated', "Unicode character name \"$name\" is deprecated, use \"" . viacode($ord) . "\" instead");
    }

    # When the name is located in the big table, holds the offsets into
    # $txt of the start and of the end of the official name.
    my @span;

    if (! defined $ord) {

        # See if has looked this up earlier.
        if ($^H{charnames_full} && exists $full_names_cache{$name}) {
            $ord = $full_names_cache{$name};
        }
        else {

            ## Suck in the code/name list as a big string.
            ## Lines look like:
            ##     "0052\t\tLATIN CAPITAL LETTER R\n"
            $txt = do "unicore/Name.pl" unless $txt;

            ## If :full, look for the name exactly; runtime implies full
            my $exact_hit = 0;  # Tells us if can cache the result
            if ($^H{charnames_full} && $txt =~ /\t\t\Q$name\E$/m) {
                @span = ($-[0] + 2, $+[0]);    # The 2 is for the 2 tabs
                $exact_hit = 1;
            }

            # If we didn't get it above, keep looking
            unless ($exact_hit) {

                # If :short is allowed, see if input is like "greek:Sigma".
                my $scripts_trie;
                if (($^H{charnames_short})
                    && $name =~ /^ \s* (.+?) \s* : \s* (.+?) \s* $ /xs)
                {
                    $scripts_trie = "\U\Q$1";
                    $name = $2;
                }
                else {
                    $scripts_trie = $^H{charnames_scripts};
                }

                my $case = $name =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL";
                if ($txt !~
                    /\t\t (?: $scripts_trie ) \ (?:$case\ )? LETTER \ \U\Q$name\E $/xm)
                {
                    # Here we still don't have it, give up.
                    return if $runtime;

                    # May have zapped input name, get it again.
                    $name = (defined $alias_target) ? $alias_target : $_[0];
                    carp "Unknown charname '$name'";
                    return 0xFFFD;
                }

                @span = ($-[0] + 2, $+[0]);
            }

            ##
            ## Now know where in the string the name starts.
            ## The code, in hex, is before that.
            ##
            ## The code can be 4-6 characters long, so we've got to sort of
            ## go look for it, just after the newline that comes before
            ## $span[0].
            ##
            ## This would be much easier if unicore/Name.pl had info in
            ## a name/code order, instead of code/name order.
            ##
            ## The +1 after the rindex() is to skip past the newline we're
            ## finding, or, if the rindex() fails, to put us to an offset
            ## of zero.
            ##
            my $code_start = rindex($txt, "\n", $span[0]) + 1;

            ## we know where it starts, so turn into number -
            ## the ordinal for the char.
            $ord = CORE::hex substr($txt, $code_start, $span[0] - 2 - $code_start);

            # Cache the input so as to not have to search the large table
            # again, but only if it came from the one search that we cache.
            $full_names_cache{$name} = $ord if $exact_hit;
        }
    }

    return $ord if $runtime || $ord <= 255 || ! ($^H & $bytes::hint_bits);

    # Here is compile time, "use bytes" is in effect, and the character
    # won't fit in a byte
    # Prefer any official name over the input one.
    $name = @span                  ? substr($txt, $span[0], $span[1] - $span[0])
          : (defined $alias_target) ? $alias_target
          :                          $_[0];
    croak not_legal_use_bytes_msg($name, $ord);
} # lookup_name
646
sub charnames {

    # For \N{...}.  Looks up the character name and returns the character
    # itself if found, nothing otherwise.  Under 'use bytes' the character
    # is produced with chr; otherwise it is forced into utf8.

    my ($name) = @_;

    my $ord = lookup_name($name);
    defined $ord or return;

    if ($^H & $bytes::hint_bits) {
        return chr $ord;
    }

    no warnings 'utf8'; # allow even illegal characters
    return pack "U", $ord;
}
423cee85 660
b177ca84
JF
# Implements 'use charnames LIST'.
#
# Installs the \N{...} handler and fresh alias tables in the hints hash
# %^H, then processes the import list:
#   :alias => HASHREF  registers the aliases in the hash
#   :alias => STRING   loads an alias file; if STRING ends in ":NAME", the
#                      file for NAME is loaded and, when no other import
#                      was given, :full is turned on
#   :full, :short      select the corresponding lookup modes
#   anything else      is taken to be a script name; other arguments
#                      starting with ':' are warned about and ignored
#
# Finally the alias tables and the script list are serialized into %^H so
# that run-time lookups can recover them.
sub import
{
    shift; ## ignore class name

    if (not @_) {
        carp("`use charnames' needs explicit imports list");
    }
    $^H{charnames} = \&charnames ;
    $^H{charnames_ord_aliases} = {};
    $^H{charnames_name_aliases} = {};
    $^H{charnames_inverse_ords} = {};

    ##
    ## fill %h keys with our @_ args.
    ##
    my $promote = 0;
    my (%h, @args);
    while (my $arg = shift) {
        if ($arg eq ":alias") {
            @_ or
                croak ":alias needs an argument in charnames";
            my $alias = shift;
            if (ref $alias) {
                ref $alias eq "HASH" or
                    croak "Only HASH reference supported as argument to :alias";
                alias ($alias);
                next;
            }
            if ($alias =~ m{:(\w+)$}) {
                $1 eq "full" || $1 eq "short" and
                    croak ":alias cannot use existing pragma :$1 (reversed order?)";
                alias_file ($1) and $promote = 1;
                next;
            }
            alias_file ($alias);
            next;
        }
        if (substr($arg, 0, 1) eq ':' and ! ($arg eq ":full" || $arg eq ":short")) {
            warn "unsupported special '$arg' in charnames";
            next;
        }
        push @args, $arg;
    }
    @args == 0 && $promote and @args = (":full");
    @h{@args} = (1) x @args;

    $^H{charnames_full} = delete $h{':full'};
    $^H{charnames_short} = delete $h{':short'};
    my @scripts = map uc, keys %h;

    ##
    ## If utf8? warnings are enabled, and some scripts were given,
    ## see if at least we can find one letter from each script.
    ##
    if (warnings::enabled('utf8') && @scripts) {
        $txt = do "unicore/Name.pl" unless $txt;

        for my $script (@scripts) {

            # The name is user input: match it literally.
            if ($txt !~ m/\t\t\Q$script\E (?:CAPITAL |SMALL )?LETTER /) {
                warnings::warn('utf8', "No such script: '$script'");
            }
        }
    }

    # %^H gets stringified, so serialize it ourselves so can extract the
    # real data back later.
    $^H{charnames_stringified_ords} = join ",", %{$^H{charnames_ord_aliases}};
    $^H{charnames_stringified_names} = join ",", %{$^H{charnames_name_aliases}};
    $^H{charnames_stringified_inverse_ords} = join ",", %{$^H{charnames_inverse_ords}};

    # Stringify the scripts as a trie.  Every name is escaped, whether or
    # not it was checked above, because the trie is later interpolated
    # into a pattern compiled with /x: unescaped, a blank inside a name
    # would be ignored and any metacharacter would be taken as regex
    # syntax.
    $^H{charnames_scripts} = join "|", map { quotemeta } @scripts;
} # import
423cee85 732
84374e30
KW
# Cache of the names viacode() has already found, keyed by the canonical
# hex form of the code point.  Only official names are ever stored here,
# and user aliases can't override them, so scoping is not an issue.
my %viacode;

sub viacode {

    # Returns the name of the code point argument, which may be given in
    # decimal or in hex (optionally prefixed by U+ or 0x).  The official
    # name is returned if there is one; failing that, a user-defined alias
    # for the code point that is in scope at the caller.  Returns nothing,
    # after complaining, if not called with exactly one argument or if the
    # argument isn't recognizable as a code point; also returns nothing if
    # no name is found.

    unless (@_ == 1) {
        carp "charnames::viacode() expects one argument";
        return;
    }

    my $arg = shift;

    # This is derived from Unicode::UCD, where it is nearly the same as the
    # function _getcode(), but here it makes sure that even a hex argument
    # has the proper number of leading zeros, which is critical in
    # matching against $txt below
    # Must check if decimal first; see comments at that definition
    my $hex;
    if ($arg =~ $decimal_qr) {
        $hex = sprintf "%04X", $arg;
    }
    elsif ($arg =~ $hex_qr) {
        # Below is the line that differs from the _getcode() source
        $hex = sprintf "%04X", hex $1;
    }
    else {
        carp("unexpected arg \"$arg\" to charnames::viacode()");
        return;
    }

    return $viacode{$hex} if exists $viacode{$hex};

    # If the code point is above the max in the table, there's no point
    # looking through it.  Checking the length first is slightly faster
    if (length($hex) <= 5 || CORE::hex($hex) <= 0x10FFFF) {
        $txt = do "unicore/Name.pl" unless $txt;

        # Return the official name, if exists.  It's unclear to me (khw) at
        # this juncture if it is better to return a user-defined override,
        # so leaving it as is for now.
        if ($txt =~ m/^$hex\t\t/m) {

            # The name starts with the next character and goes up to the
            # next new-line.  Using capturing parentheses above instead of
            # @+ more than doubles the execution time in Perl 5.13
            my $name_start = $+[0];
            my $name_end = index($txt, "\n", $name_start);
            $viacode{$hex} = substr($txt, $name_start, $name_end - $name_start);
            return $viacode{$hex};
        }
    }

    # See if there is a user name for it, before giving up completely.
    # First get the scoped aliases.
    my %code_point_aliases = split ',',
                            (caller(0))[10]->{charnames_stringified_inverse_ords};

    return $code_point_aliases{$hex} if exists $code_point_aliases{$hex};

    # No name of any kind; say why if the code point cannot exist at all.
    if (CORE::hex($hex) > 0x10FFFF) {
        carp "Unicode characters only allocated up to U+10FFFF (you asked for U+$hex)";
    }
    return;
} # viacode
daf0d493
JH
798
sub vianame
{
  # Looks up the character name and returns its ordinal if
  # found, undef otherwise.  A name of the form "U+<hex digits>" is
  # handled specially and yields a chr rather than an ord (see below).

  unless (@_ == 1) {
    carp "charnames::vianame() expects one name argument";
    return ();
  }

  my $name = shift;

  # The ordinary case: anything that is not "U+<hex>" is resolved by
  # lookup_name() against the hints hash of our caller's scope.
  return lookup_name($name, (caller(0))[10])
    unless $name =~ /^U\+([0-9a-fA-F]+)$/;

  # khw claims that this is bad.  The function should return either an
  # ord or a chr for all inputs; not be bipolar.
  my $code_point = CORE::hex $1;

  # Under 'use bytes' in the caller, a character that does not fit in a
  # single byte cannot be returned: complain and return nothing.
  if ($code_point > 255 && ((caller 0)[8] & $bytes::hint_bits)) {
    carp not_legal_use_bytes_msg($name, $code_point);
    return;
  }

  return chr $code_point;
} # vianame
b177ca84 823
423cee85
JH
824
8251;
826__END__
827
828=head1 NAME
829
f12d74c0 830charnames - access to Unicode character names; define character names for C<\N{named}> string literal escapes
423cee85
JH
831
832=head1 SYNOPSIS
833
834 use charnames ':full';
4a2d328f 835 print "\N{GREEK SMALL LETTER SIGMA} is called sigma.\n";
423cee85
JH
836
837 use charnames ':short';
4a2d328f 838 print "\N{greek:Sigma} is an upper-case sigma.\n";
423cee85
JH
839
840 use charnames qw(cyrillic greek);
4a2d328f 841 print "\N{sigma} is Greek sigma, and \N{be} is Cyrillic b.\n";
423cee85 842
35c0985d
MB
843 use charnames ":full", ":alias" => {
844 e_ACUTE => "LATIN SMALL LETTER E WITH ACUTE",
e5432b89 845 mychar => 0xE8000, # Private use area
76ae0c45 846 };
35c0985d 847 print "\N{e_ACUTE} is a small letter e with an acute.\n";
da9dec57 848 print "\\N{mychar} allows me to name private use characters.\n";
35c0985d 849
76ae0c45 850 use charnames ();
a23c04e4 851 print charnames::viacode(0x1234); # prints "ETHIOPIC SYLLABLE SEE"
16036bcd
KW
852 printf "%04X", charnames::vianame("GOTHIC LETTER AHSA"); # prints
853 # "10330"
b177ca84 854
423cee85
JH
855=head1 DESCRIPTION
856
da9dec57
KW
857Pragma C<use charnames> is used to gain access to the names of the
858Unicode characters, and to allow you to define your own character names.
859
860All forms of the pragma enable use of the
861L</charnames::vianame(I<name>)> function for run-time lookup of a
862character name to get its ordinal (code point), and the inverse
863function, L</charnames::viacode(I<code>)>.
864
865Forms other than C<S<"use charnames ();">> enable the use of
866C<\N{I<CHARNAME>}> sequences to compile a Unicode character into a
867string based on its name.
868
869Note that C<\N{U+I<...>}>, where the I<...> is a hexadecimal number,
870also inserts a character into a string, but doesn't require the use of
871this pragma. The character it inserts is the one whose code point
872(ordinal value) is equal to the number. For example, C<"\N{U+263a}"> is
873the Unicode (white background, black foreground) smiley face; it doesn't
874require this pragma, whereas the equivalent, C<"\N{WHITE SMILING FACE}">
875does.
876Also, C<\N{I<...>}> can mean a regex quantifier instead of a character
877name, when the I<...> is a number (or comma separated pair of numbers;
878see L<perlreref/QUANTIFIERS>), and is not related to this pragma.
879
880The C<charnames> pragma supports arguments C<:full>, C<:short>, script
881names and customized aliases. If C<:full> is present, for expansion of
882C<\N{I<CHARNAME>}>, the string I<CHARNAME> is first looked up in the list of
76ae0c45 883standard Unicode character names. If C<:short> is present, and
da9dec57
KW
884I<CHARNAME> has the form C<I<SCRIPT>:I<CNAME>>, then I<CNAME> is looked up
885as a letter in script I<SCRIPT>. If C<use charnames> is used
886with script name arguments, then for C<\N{I<CHARNAME>}> the name
887I<CHARNAME> is looked up as a letter in the given scripts (in the
16036bcd
KW
888specified order). Customized aliases can override these, and are explained in
889L</CUSTOM ALIASES>.
423cee85 890
da9dec57 891For lookup of I<CHARNAME> inside a given script I<SCRIPTNAME>
d5448623 892this pragma looks for the names
423cee85
JH
893
894 SCRIPTNAME CAPITAL LETTER CHARNAME
895 SCRIPTNAME SMALL LETTER CHARNAME
896 SCRIPTNAME LETTER CHARNAME
897
da9dec57 898in the table of standard Unicode names. If I<CHARNAME> is lowercase,
daf0d493
JH
899then the C<CAPITAL> variant is ignored, otherwise the C<SMALL> variant
900is ignored.
901
da9dec57
KW
902Note that C<\N{...}> is compile-time; it's a special form of string
903constant used inside double-quotish strings; this means that you cannot
4e2cda5d 904use variables inside the C<\N{...}>. If you want similar run-time
da9dec57 905functionality, use L<charnames::vianame()|/charnames::vianame(I<name>)>.
423cee85 906
301a3cda 907For the C0 and C1 control characters (U+0000..U+001F, U+0080..U+009F)
da9dec57
KW
908there are no official Unicode names but you can use instead the ISO 6429
909names (LINE FEED, ESCAPE, and so forth, and their abbreviations, LF,
1f31fcd4
KW
910ESC, ...). In Unicode 3.2 (as of Perl 5.8) some naming changes took
911place, and ISO 6429 was updated, see L</ALIASES>.
301a3cda 912
e5432b89
KW
913If the input name is unknown, C<\N{NAME}> raises a warning and
914substitutes the Unicode REPLACEMENT CHARACTER (U+FFFD).
915
916It is a fatal error if C<use bytes> is in effect and the input name is
917that of a character that won't fit into a byte (i.e., whose ordinal is
918above 255).
919
da9dec57
KW
920Otherwise, any string that includes a C<\N{I<charname>}> or
921C<S<\N{U+I<code point>}>> will automatically have Unicode semantics (see
922L<perlunicode/Byte and Character Semantics>).
923
5ffe0e96 924=head1 ALIASES
423cee85 925
5ffe0e96
MB
926A few aliases have been defined for convenience: instead of having
927to use the official names
423cee85 928
5ffe0e96
MB
929 LINE FEED (LF)
930 FORM FEED (FF)
931 CARRIAGE RETURN (CR)
932 NEXT LINE (NEL)
423cee85 933
e5432b89 934(yes, with parentheses), one can use
d5448623 935
5ffe0e96
MB
936 LINE FEED
937 FORM FEED
938 CARRIAGE RETURN
939 NEXT LINE
940 LF
941 FF
942 CR
943 NEL
944
16036bcd
KW
945All the other standard abbreviations for the controls, such as C<ACK> for
946C<ACKNOWLEDGE> also can be used.
947
5ffe0e96
MB
948One can also use
949
950 BYTE ORDER MARK
951 BOM
952
16036bcd
KW
953and these abbreviations
954
955 Abbreviation Full Name
956
957 CGJ COMBINING GRAPHEME JOINER
958 FVS1 MONGOLIAN FREE VARIATION SELECTOR ONE
959 FVS2 MONGOLIAN FREE VARIATION SELECTOR TWO
960 FVS3 MONGOLIAN FREE VARIATION SELECTOR THREE
961 LRE LEFT-TO-RIGHT EMBEDDING
962 LRM LEFT-TO-RIGHT MARK
963 LRO LEFT-TO-RIGHT OVERRIDE
964 MMSP MEDIUM MATHEMATICAL SPACE
965 MVS MONGOLIAN VOWEL SEPARATOR
966 NBSP NO-BREAK SPACE
967 NNBSP NARROW NO-BREAK SPACE
968 PDF POP DIRECTIONAL FORMATTING
969 RLE RIGHT-TO-LEFT EMBEDDING
970 RLM RIGHT-TO-LEFT MARK
971 RLO RIGHT-TO-LEFT OVERRIDE
972 SHY SOFT HYPHEN
973 VS1 VARIATION SELECTOR-1
974 .
975 .
976 .
977 VS256 VARIATION SELECTOR-256
978 WJ WORD JOINER
979 ZWJ ZERO WIDTH JOINER
980 ZWNJ ZERO WIDTH NON-JOINER
981 ZWSP ZERO WIDTH SPACE
5ffe0e96
MB
982
983For backward compatibility one can use the old names for
984certain C0 and C1 controls
985
986 old new
987
5ffe0e96
MB
988 FILE SEPARATOR INFORMATION SEPARATOR FOUR
989 GROUP SEPARATOR INFORMATION SEPARATOR THREE
16036bcd
KW
990 HORIZONTAL TABULATION CHARACTER TABULATION
991 HORIZONTAL TABULATION SET CHARACTER TABULATION SET
992 HORIZONTAL TABULATION WITH JUSTIFICATION CHARACTER TABULATION
993 WITH JUSTIFICATION
5ffe0e96
MB
994 PARTIAL LINE DOWN PARTIAL LINE FORWARD
995 PARTIAL LINE UP PARTIAL LINE BACKWARD
16036bcd
KW
996 RECORD SEPARATOR INFORMATION SEPARATOR TWO
997 REVERSE INDEX REVERSE LINE FEED
998 UNIT SEPARATOR INFORMATION SEPARATOR ONE
999 VERTICAL TABULATION LINE TABULATION
1000 VERTICAL TABULATION SET LINE TABULATION SET
5ffe0e96
MB
1001
1002but the old names in addition to giving the character
1003will also give a warning about being deprecated.
423cee85 1004
16036bcd
KW
1005And finally, certain published variants are usable, including some for
1006controls that have no Unicode names:
1007
1f31fcd4
KW
1008 name character
1009
52fb7278 1010 END OF PROTECTED AREA END OF GUARDED AREA, U+0097
1f31fcd4
KW
1011 HIGH OCTET PRESET U+0081
1012 HOP U+0081
1013 IND U+0084
1014 INDEX U+0084
1015 PAD U+0080
1016 PADDING CHARACTER U+0080
1017 PRIVATE USE 1 PRIVATE USE ONE, U+0091
1018 PRIVATE USE 2 PRIVATE USE TWO, U+0092
1019 SGC U+0099
1020 SINGLE GRAPHIC CHARACTER INTRODUCER U+0099
1021 SINGLE-SHIFT 2 SINGLE SHIFT TWO, U+008E
1022 SINGLE-SHIFT 3 SINGLE SHIFT THREE, U+008F
1023 START OF PROTECTED AREA START OF GUARDED AREA, U+0096
16036bcd 1024
35c0985d
MB
1025=head1 CUSTOM ALIASES
1026
1f31fcd4
KW
1027You can add customized aliases to standard (C<:full>) Unicode naming
1028conventions. The aliases override any standard definitions, so, if
da9dec57
KW
1029you're twisted enough, you can change C<"\N{LATIN CAPITAL LETTER A}"> to
1030mean C<"B">, etc.
55bc7d3c
KW
1031
1032Note that an alias should not be something that is a legal curly
1033brace-enclosed quantifier (see L<perlreref/QUANTIFIERS>). For example
e5432b89
KW
1034C<\N{123}> means to match 123 non-newline characters, and is not treated as a
1035charnames alias. Aliases are discouraged from beginning with anything
1036other than an alphabetic character and from containing anything other
1037than alphanumerics, spaces, dashes, parentheses, and underscores.
1038Currently they must be ASCII.
1039
1040An alias can map to either an official Unicode character name or to a
1041numeric code point (ordinal). The latter is useful for assigning names
1042to code points in Unicode private use areas such as U+E800 through
f12d74c0
KW
1043U+F8FF.
1044A numeric code point must be a non-negative integer or a string beginning
1045with C<"U+"> or C<"0x"> with the remainder considered to be a
1046hexadecimal integer. A literal numeric constant must be unsigned; it
1047will be interpreted as hex if it has a leading zero or contains
1048non-decimal hex digits; otherwise it will be interpreted as decimal.
232cbbee 1049
da9dec57 1050Aliases are added either by the use of anonymous hashes:
35c0985d 1051
da9dec57 1052 use charnames ":alias" => {
35c0985d 1053 e_ACUTE => "LATIN SMALL LETTER E WITH ACUTE",
232cbbee 1054 mychar1 => 0xE8000,
35c0985d
MB
1055 };
1056 my $str = "\N{e_ACUTE}";
1057
da9dec57 1058or by using a file containing aliases:
35c0985d 1059
da9dec57 1060 use charnames ":alias" => "pro";
35c0985d 1061
da9dec57
KW
1062will try to read C<"unicore/pro_alias.pl"> from the C<@INC> path. This
1063file should return a list in plain perl:
35c0985d
MB
1064
1065 (
1066 A_GRAVE => "LATIN CAPITAL LETTER A WITH GRAVE",
1067 A_CIRCUM => "LATIN CAPITAL LETTER A WITH CIRCUMFLEX",
1068 A_DIAERES => "LATIN CAPITAL LETTER A WITH DIAERESIS",
1069 A_TILDE => "LATIN CAPITAL LETTER A WITH TILDE",
1070 A_BREVE => "LATIN CAPITAL LETTER A WITH BREVE",
1071 A_RING => "LATIN CAPITAL LETTER A WITH RING ABOVE",
1072 A_MACRON => "LATIN CAPITAL LETTER A WITH MACRON",
f12d74c0 1073 mychar2 => "U+E8001",
35c0985d
MB
1074 );
1075
da9dec57
KW
1076Both these methods insert C<":full"> automatically as the first argument (if no
1077other argument is given), and you can give the C<":full"> explicitly as
1078well, like
35c0985d 1079
da9dec57 1080 use charnames ":full", ":alias" => "pro";
35c0985d 1081
da9dec57 1082=head1 charnames::viacode(I<code>)
b177ca84
JF
1083
1084Returns the full name of the character indicated by the numeric code.
da9dec57 1085For example,
b177ca84
JF
1086
1087 print charnames::viacode(0x2722);
1088
1089prints "FOUR TEARDROP-SPOKED ASTERISK".
1090
232cbbee
KW
1091The name returned is the official name for the code point, if
1092available, otherwise your custom alias for it. This means that your
1093alias will only be returned for code points that don't have an official
1094Unicode name (nor Unicode version 1 name), such as private use code
1095points, and the 4 control characters U+0080, U+0081, U+0084, and U+0099.
da9dec57
KW
1096If you define more than one name for the code point, it is indeterminate
1097which one will be returned.
1098
1099The function returns C<undef> if no name is known for the code point.
1100In Unicode the proper name of these is the empty string, which
1101C<undef> stringifies to. (If you ask for a code point past the legal
1102Unicode maximum of U+10FFFF that you haven't assigned an alias to, you
f12d74c0
KW
1103get C<undef> plus a warning.)
1104
1105The input number must be a non-negative integer or a string beginning
1106with C<"U+"> or C<"0x"> with the remainder considered to be a
1107hexadecimal integer. A literal numeric constant must be unsigned; it
1108will be interpreted as hex if it has a leading zero or contains
1109non-decimal hex digits; otherwise it will be interpreted as decimal.
daf0d493 1110
274085e3
PN
1111Notice that the name returned for U+FEFF is "ZERO WIDTH NO-BREAK
1112SPACE", not "BYTE ORDER MARK".
1113
da9dec57 1114=head1 charnames::vianame(I<name>)
daf0d493
JH
1115
1116Returns the code point indicated by the name.
1f31fcd4 1117For example,
daf0d493
JH
1118
1119 printf "%04X", charnames::vianame("FOUR TEARDROP-SPOKED ASTERISK");
1120
1121prints "2722".
1122
da9dec57 1123C<vianame> takes the identical inputs that C<\N{...}> does under the
84374e30
KW
1124L<C<:full> option|/DESCRIPTION> to C<charnames>. In addition, any other
1125options for the controlling C<"use charnames"> in the same scope apply,
1126like any L<script list, C<:short> option|/DESCRIPTION>, or L<custom
1127aliases|/CUSTOM ALIASES> you may have defined.
b177ca84 1128
1f31fcd4 1129There are just a few differences. The main one is that under
84374e30 1130most (see L</BUGS> for the others) circumstances, vianame returns
1f31fcd4
KW
1131an ord, whereas C<\N{...}> is seamlessly placed as a chr into the
1132string in which it appears. This leads to a second difference.
1133Since an ord is returned, it can be that of any character, even one
f12d74c0 1134that isn't legal under the C<S<use bytes>> pragma.
1f31fcd4
KW
1135
1136The final difference is that if the input name is unknown C<vianame>
1137returns C<undef> instead of the REPLACEMENT CHARACTER, and it does not
1138raise a warning message.
b177ca84 1139
5ffe0e96 1140=head1 CUSTOM TRANSLATORS
52ea3e69 1141
5ffe0e96
MB
1142The mechanism of translation of C<\N{...}> escapes is general and not
1143hardwired into F<charnames.pm>. A module can install custom
1144translations (inside the scope which C<use>s the module) with the
1145following magic incantation:
52ea3e69 1146
5ffe0e96 1147 sub import {
52fb7278
KW
1148 shift;
1149 $^H{charnames} = \&translator;
5ffe0e96 1150 }
52ea3e69 1151
da9dec57 1152Here translator() is a subroutine which takes I<CHARNAME> as an
5ffe0e96 1153argument, and returns text to insert into the string instead of the
da9dec57 1154C<\N{I<CHARNAME>}> escape. Since the text to insert should be different
5ffe0e96
MB
1155in C<bytes> mode and out of it, the function should check the current
1156state of C<bytes>-flag as in:
52ea3e69 1157
52fb7278 1158 use bytes (); # for $bytes::hint_bits
5ffe0e96 1159 sub translator {
52fb7278
KW
1160 if ($^H & $bytes::hint_bits) {
1161 return bytes_translator(@_);
1162 }
1163 else {
1164 return utf8_translator(@_);
1165 }
5ffe0e96 1166 }
52ea3e69 1167
da9dec57 1168See L</CUSTOM ALIASES> above for restrictions on I<CHARNAME>.
f0175764 1169
1f31fcd4
KW
1170Of course, C<vianame> and C<viacode> would need to be overridden as
1171well.
1172
423cee85
JH
1173=head1 BUGS
1174
55bc7d3c 1175vianame returns a chr if the input name is of the form C<U+...>, and an ord
a0a3bc7f 1176otherwise. It is proposed to change this to always return an ord. Send email
1f31fcd4
KW
1177to C<perl5-porters@perl.org> to comment on this proposal. If S<C<use
1178bytes>> is in effect when a chr is returned, and if that chr won't fit
1179into a byte, C<undef> is returned instead.
55bc7d3c 1180
16036bcd
KW
1181Names must be ASCII characters only, which means that you are out of luck if
1182you want to create aliases in a language where some or all the characters of
1183the desired aliases are non-ASCII.
bee80e93 1184
fe749c9a
KW
1185Unicode standard named sequences are not recognized, such as
1186C<LATIN CAPITAL LETTER A WITH MACRON AND GRAVE>
1187(which should mean C<LATIN CAPITAL LETTER A WITH MACRON> with an additional
1188C<COMBINING GRAVE ACCENT>).
1189
f12d74c0
KW
1190Since evaluation of the translation function (see L</CUSTOM
1191TRANSLATORS>) happens in the middle of compilation (of a string
1192literal), the translation function should not do any C<eval>s or
1193C<require>s. This restriction should be lifted (but is low priority) in
1194a future version of Perl.
423cee85
JH
1195
1196=cut
0eacc33e 1197
52fb7278 1198# ex: set ts=8 sts=2 sw=2 et: