This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
perlpragma.pod: Clarify that runtime %^H is ro
[perl5.git] / lib / charnames.pm
CommitLineData
423cee85 1package charnames;
b177ca84
JF
2use strict;
3use warnings;
51cf30b6 4use File::Spec;
63098191 5our $VERSION = '1.11';
b75c8c73 6
d5448623 7use bytes (); # for $bytes::hint_bits
423cee85 8
# Aliases that this module itself provides on top of the official Unicode
# names.  Each key is the name as written inside \N{...}; each value is the
# code point it stands for.  lookup_name() consults this table with an exact,
# case-sensitive hash lookup, so keys must not carry stray whitespace.
my %system_aliases = (
    # Icky 3.2 names with parentheses.
    'LINE FEED'       => 0x0A, # LINE FEED (LF)
    'FORM FEED'       => 0x0C, # FORM FEED (FF)
    'CARRIAGE RETURN' => 0x0D, # CARRIAGE RETURN (CR)
    'NEXT LINE'       => 0x85, # NEXT LINE (NEL)

    # Some variant names from Wikipedia
    'SINGLE-SHIFT 2'          => 0x8E,
    'SINGLE-SHIFT 3'          => 0x8F,
    'PRIVATE USE 1'           => 0x91,
    'PRIVATE USE 2'           => 0x92,
    'START OF PROTECTED AREA' => 0x96,
    'END OF PROTECTED AREA'   => 0x97,

    # Convenience.  Standard abbreviations for the controls
    'NUL' => 0x00, # NULL
    'SOH' => 0x01, # START OF HEADING
    'STX' => 0x02, # START OF TEXT
    'ETX' => 0x03, # END OF TEXT
    'EOT' => 0x04, # END OF TRANSMISSION
    'ENQ' => 0x05, # ENQUIRY
    'ACK' => 0x06, # ACKNOWLEDGE
    'BEL' => 0x07, # BELL
    'BS'  => 0x08, # BACKSPACE
    'HT'  => 0x09, # HORIZONTAL TABULATION
    'LF'  => 0x0A, # LINE FEED (LF)
    'VT'  => 0x0B, # VERTICAL TABULATION
    'FF'  => 0x0C, # FORM FEED (FF)
    'CR'  => 0x0D, # CARRIAGE RETURN (CR)
    'SO'  => 0x0E, # SHIFT OUT
    'SI'  => 0x0F, # SHIFT IN
    'DLE' => 0x10, # DATA LINK ESCAPE
    'DC1' => 0x11, # DEVICE CONTROL ONE
    'DC2' => 0x12, # DEVICE CONTROL TWO
    'DC3' => 0x13, # DEVICE CONTROL THREE
    'DC4' => 0x14, # DEVICE CONTROL FOUR
    'NAK' => 0x15, # NEGATIVE ACKNOWLEDGE
    'SYN' => 0x16, # SYNCHRONOUS IDLE
    'ETB' => 0x17, # END OF TRANSMISSION BLOCK
    'CAN' => 0x18, # CANCEL
    'EOM' => 0x19, # END OF MEDIUM
    'SUB' => 0x1A, # SUBSTITUTE
    'ESC' => 0x1B, # ESCAPE
    'FS'  => 0x1C, # FILE SEPARATOR
    'GS'  => 0x1D, # GROUP SEPARATOR
    'RS'  => 0x1E, # RECORD SEPARATOR
    'US'  => 0x1F, # UNIT SEPARATOR
    'DEL' => 0x7F, # DELETE
    'BPH' => 0x82, # BREAK PERMITTED HERE
    'NBH' => 0x83, # NO BREAK HERE
    'NEL' => 0x85, # NEXT LINE (NEL)
    'SSA' => 0x86, # START OF SELECTED AREA
    'ESA' => 0x87, # END OF SELECTED AREA
    'HTS' => 0x88, # CHARACTER TABULATION SET
    'HTJ' => 0x89, # CHARACTER TABULATION WITH JUSTIFICATION
    'VTS' => 0x8A, # LINE TABULATION SET
    'PLD' => 0x8B, # PARTIAL LINE FORWARD
    'PLU' => 0x8C, # PARTIAL LINE BACKWARD
    'RI'  => 0x8D, # REVERSE LINE FEED
    'SS2' => 0x8E, # SINGLE SHIFT TWO
    'SS3' => 0x8F, # SINGLE SHIFT THREE
    'DCS' => 0x90, # DEVICE CONTROL STRING
    'PU1' => 0x91, # PRIVATE USE ONE
    'PU2' => 0x92, # PRIVATE USE TWO
    'STS' => 0x93, # SET TRANSMIT STATE
    'CCH' => 0x94, # CANCEL CHARACTER
    'MW'  => 0x95, # MESSAGE WAITING
    'SPA' => 0x96, # START OF GUARDED AREA
    'EPA' => 0x97, # END OF GUARDED AREA
    'SOS' => 0x98, # START OF STRING
    'SCI' => 0x9A, # SINGLE CHARACTER INTRODUCER
    'CSI' => 0x9B, # CONTROL SEQUENCE INTRODUCER
    'ST'  => 0x9C, # STRING TERMINATOR
    'OSC' => 0x9D, # OPERATING SYSTEM COMMAND
    'PM'  => 0x9E, # PRIVACY MESSAGE
    'APC' => 0x9F, # APPLICATION PROGRAM COMMAND

    # There are no names for these in the Unicode standard;
    # perhaps should be deprecated, but then again there are
    # no alternative names, so am not deprecating.  And if
    # did, the code would have to change to not recommend an
    # alternative for these.
    'PADDING CHARACTER'                   => 0x80,
    'PAD'                                 => 0x80,
    'HIGH OCTET PRESET'                   => 0x81,
    'HOP'                                 => 0x81,
    'INDEX'                               => 0x84,
    'IND'                                 => 0x84,
    'SINGLE GRAPHIC CHARACTER INTRODUCER' => 0x99,
    'SGC'                                 => 0x99,

    # More convenience.  For further convenience,
    # it is suggested some way of using the NamesList
    # aliases be implemented, but there are ambiguities in
    # NamesList.txt
    'BOM'             => 0xFEFF, # BYTE ORDER MARK
    'BYTE ORDER MARK' => 0xFEFF,
    'CGJ'   => 0x034F, # COMBINING GRAPHEME JOINER
    'FVS1'  => 0x180B, # MONGOLIAN FREE VARIATION SELECTOR ONE
    'FVS2'  => 0x180C, # MONGOLIAN FREE VARIATION SELECTOR TWO
    'FVS3'  => 0x180D, # MONGOLIAN FREE VARIATION SELECTOR THREE
    'LRE'   => 0x202A, # LEFT-TO-RIGHT EMBEDDING
    'LRM'   => 0x200E, # LEFT-TO-RIGHT MARK
    'LRO'   => 0x202D, # LEFT-TO-RIGHT OVERRIDE
    'MMSP'  => 0x205F, # MEDIUM MATHEMATICAL SPACE
    'MVS'   => 0x180E, # MONGOLIAN VOWEL SEPARATOR
    'NBSP'  => 0x00A0, # NO-BREAK SPACE
    'NNBSP' => 0x202F, # NARROW NO-BREAK SPACE
    'PDF'   => 0x202C, # POP DIRECTIONAL FORMATTING
    'RLE'   => 0x202B, # RIGHT-TO-LEFT EMBEDDING
    'RLM'   => 0x200F, # RIGHT-TO-LEFT MARK
    'RLO'   => 0x202E, # RIGHT-TO-LEFT OVERRIDE
    'SHY'   => 0x00AD, # SOFT HYPHEN

    # VS1 .. VS16 are VARIATION SELECTOR-1 .. -16 at U+FE00 .. U+FE0F.
    (map { ("VS$_" => 0xFE00 + $_ - 1) } 1 .. 16),

    # VS17 .. VS256 are VARIATION SELECTOR-17 .. -256 at
    # U+E0100 .. U+E01EF.
    (map { ("VS$_" => 0xE0100 + $_ - 17) } 17 .. 256),

    'WJ'   => 0x2060, # WORD JOINER
    'ZWJ'  => 0x200D, # ZERO WIDTH JOINER
    'ZWNJ' => 0x200C, # ZERO WIDTH NON-JOINER
    'ZWSP' => 0x200B, # ZERO WIDTH SPACE
);
52ea3e69 384
# Names kept only for compatibility with pre-3.2 Unicode, and only for code
# points below 256.  lookup_name() still resolves them, but issues a
# 'deprecated' warning that recommends the name viacode() reports for the
# same code point.  The trailing comment on each entry is the current name.
my %deprecated_aliases = (
    # C0 controls
    'HORIZONTAL TABULATION' => 0x09,    # CHARACTER TABULATION
    'VERTICAL TABULATION'   => 0x0B,    # LINE TABULATION
    'FILE SEPARATOR'        => 0x1C,    # INFORMATION SEPARATOR FOUR
    'GROUP SEPARATOR'       => 0x1D,    # INFORMATION SEPARATOR THREE
    'RECORD SEPARATOR'      => 0x1E,    # INFORMATION SEPARATOR TWO
    'UNIT SEPARATOR'        => 0x1F,    # INFORMATION SEPARATOR ONE

    # C1 controls
    'HORIZONTAL TABULATION SET'
        => 0x88,                        # CHARACTER TABULATION SET
    'HORIZONTAL TABULATION WITH JUSTIFICATION'
        => 0x89,                        # CHARACTER TABULATION WITH JUSTIFICATION
    'VERTICAL TABULATION SET'
        => 0x8A,                        # LINE TABULATION SET
    'PARTIAL LINE DOWN'     => 0x8B,    # PARTIAL LINE FORWARD
    'PARTIAL LINE UP'       => 0x8C,    # PARTIAL LINE BACKWARD
    'REVERSE INDEX'         => 0x8D,    # REVERSE LINE FEED
);
52ea3e69 401
# Holds the table of official character names once it has been loaded; the
# subs in this file fill it on first use with the value of
# do "unicore/Name.pl".
my $txt;

# Values are tested against the decimal pattern first and the hex pattern
# second.  A leading zero, or any character outside 0-9, therefore means
# the value is not decimal.
my $decimal_qr = qr/^[1-9]\d*$/;

# Accepts an optional "U+"/"u+" or "0x"/"0X" prefix followed by hex digits.
# On a successful match the digits (without the prefix) are left in $1.
my $hex_qr = qr/^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/;
423cee85 410
8878f897
T
# Local stand-in for Carp::croak.  Carp is loaded only when an error
# actually has to be raised; control is then handed to Carp::croak via
# goto, which passes along the current @_ unchanged.
sub croak {
    require Carp;
    goto &Carp::croak;
}    # croak
415
# Local stand-in for Carp::carp.  Carp is loaded only when a warning
# actually has to be issued; control is then handed to Carp::carp via
# goto, which passes along the current @_ unchanged.
sub carp {
    require Carp;
    goto &Carp::carp;
}    # carp
420
# Registers user-defined aliases.
#
# Takes either a single hash reference or a flat list of name => value
# pairs.  A value that looks like a number (decimal is tried first, then
# hex with an optional U+ / 0x prefix) makes the name an alias for that
# code point; any other value makes the name an alias for another
# character name.
#
# Numeric aliases are also recorded in the reverse direction, keyed by the
# code point formatted with "%04X", so that the key has one canonical
# spelling however the caller wrote the number.
#
# NOTE(review): %user_numeric_aliases, %inverse_user_aliases and
# %user_name_aliases are not declared inside this sub; they are expected
# to be declared at file level.
sub alias (@)
{
    my $aliases = ref $_[0] ? $_[0] : { @_ };

    for my $name (keys %$aliases) {
        my $target = $aliases->{$name};

        # Work out the code point, if the target is numeric at all.
        my $code;
        if ($target =~ $decimal_qr) {
            $code = $target;
        }
        elsif ($target =~ $hex_qr) {
            $code = CORE::hex $1;
        }

        # Not a number: the target is itself a character name.
        unless (defined $code) {
            $user_name_aliases{$name} = $target;
            next;
        }

        $user_numeric_aliases{$name} = $code;
        $inverse_user_aliases{sprintf("%04X", $code)} = $name;
    }
}    # alias
444
5a7fb30a
KW
# Builds the message used when a character above 0xFF is requested while
# 'use bytes' is in effect.
#
#   $name - the character name (or other spelling) the caller asked for
#   $ord  - the code point it resolved to
#
# Returns the formatted message; it neither warns nor dies itself.
#
# $name is passed to sprintf as an argument rather than interpolated into
# the format string, so a '%' in the name is printed literally instead of
# being read as a conversion specification.
sub not_legal_use_bytes_msg {
    my ($name, $ord) = @_;
    return sprintf(
        "Character 0x%04x with name '%s' is above 0xFF with 'use bytes' in effect",
        $ord, $name);
}
449
# Loads alias definitions from a file and registers them through alias().
#
# The argument is either the absolute path of an existing plain file, or a
# bare identifier NAME, which selects "unicore/NAME_alias.pl".  Anything
# else is rejected with croak.
#
# The file is run with do() and must return a list of alias pairs.
# Returns 1 when aliases were loaded and 0 when the file returned an empty
# list.  Croaks when the file yields a lone undef (which is also what a
# failed do() produces) or an odd number of elements.
sub alias_file ($)
{
    my ($arg) = @_;

    my $file;
    if (-f $arg && File::Spec->file_name_is_absolute($arg)) {
        $file = $arg;
    }
    elsif ($arg =~ m/^\w+$/) {
        $file = "unicore/${arg}_alias.pl";
    }
    else {
        croak("Charnames alias files can only have identifier characters");
    }

    my @pairs = do $file;
    return 0 unless @pairs;

    croak("$file cannot be used as alias file for charnames")
        if @pairs == 1 && !defined $pairs[0];
    croak("$file did not return a (valid) list of alias pairs")
        if @pairs % 2;

    alias(@pairs);
    return 1;
}    # alias_file
472
63098191
KW
473
# Finds the ordinal of a character name, first in the aliases, then in the
# large table of official names.
#
#   $name    - the name to resolve
#   $runtime - true for a run-time lookup, false for compile time (\N{...})
#
# Returns the code point.  If the name is not found: at run time returns
# nothing; at compile time complains with carp and returns 0xFFFD, the
# Unicode replacement character.  At compile time, with 'use bytes' in
# effect, croaks if the character does not fit in a byte.
#
# The script part of a "script:name" short name, and each loaded script
# name, is quoted before being placed in a regex, just as the letter name
# is, so metacharacters in it are matched literally.
sub lookup_name {
    my $name = shift;
    my $runtime = shift;    # compile vs run time

    my $ord;

    # User alias should be checked first or else can't override ours, and
    # if we add any, could conflict with theirs.
    if (exists $user_numeric_aliases{$name}) {
        $ord = $user_numeric_aliases{$name};
    }
    elsif (exists $user_name_aliases{$name}) {
        # A name-to-name alias: carry on with the name it points at.
        $name = $user_name_aliases{$name};
    }
    elsif (exists $system_aliases{$name}) {
        $ord = $system_aliases{$name};
    }
    elsif (exists $deprecated_aliases{$name}) {
        require warnings;
        warnings::warnif('deprecated', "Unicode character name \"$name\" is deprecated, use \"" . viacode($deprecated_aliases{$name}) . "\" instead");
        $ord = $deprecated_aliases{$name};
    }

    my @off;

    if (! defined $ord) {
        ## Suck in the code/name list as a big string.
        ## Lines look like:
        ##     "0052\t\tLATIN CAPITAL LETTER R\n"
        $txt = do "unicore/Name.pl" unless $txt;

        ## @off will hold the index into the code/name string of the start
        ## and end of the name as we find it.

        ## If :full, look for the name exactly; runtime implies full
        if (($runtime || $^H{charnames_full}) && $txt =~ /\t\t\Q$name\E$/m) {
            @off = ($-[0] + 2, $+[0]);    # The 2 is for the 2 tabs
        }

        ## If we didn't get above, and :short allowed, look for the short
        ## name.  The short name is like "greek:Sigma"
        unless (@off) {
            if (($runtime || $^H{charnames_short}) && $name =~ /^(.+?):(.+)/s) {
                my ($script, $cname) = ($1, $2);
                my $case = $cname =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL";

                # Upper-case and quote the script name up front; this
                # avoids stacking \U and \Q escapes inside the pattern.
                my $script_re = quotemeta uc $script;
                if ($txt =~ m/\t\t$script_re (?:$case )?LETTER \U\Q$cname\E$/m) {
                    @off = ($-[0] + 2, $+[0]);
                }
            }
        }

        ## If we still don't have it, check for the name among the loaded
        ## scripts.
        if (! $runtime && not @off) {
            my $case = $name =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL";
            for my $script (@{$^H{charnames_scripts}}) {
                if ($txt =~ m/\t\t\Q$script\E (?:$case )?LETTER \U\Q$name\E$/m) {
                    @off = ($-[0] + 2, $+[0]);
                    last;
                }
            }
        }

        ## If we don't have it by now, give up.
        unless (@off) {
            return if $runtime;
            carp("Unknown charname '$name'");
            return 0xFFFD;
        }

        ##
        ## Now know where in the string the name starts.
        ## The code, in hex, is before that.
        ##
        ## The code can be 4-6 characters long, so we've got to sort of
        ## go look for it, just after the newline that comes before
        ## $off[0].
        ##
        ## The +1 after the rindex() is to skip past the newline we're
        ## finding, or, if the rindex() fails, to put us to an offset of
        ## zero.
        ##
        my $hexstart = rindex($txt, "\n", $off[0]) + 1;

        ## we know where it starts, so turn into number -
        ## the ordinal for the char.
        $ord = CORE::hex substr($txt, $hexstart, $off[0] - 2 - $hexstart);
    }

    return $ord if $runtime || $ord <= 255 || ! ($^H & $bytes::hint_bits);

    # Here is compile time, "use bytes" is in effect, and the character
    # won't fit in a byte.
    # Get the official name if have one
    $name = substr($txt, $off[0], $off[1] - $off[0]) if @off;
    croak(not_legal_use_bytes_msg($name, $ord));
}    # lookup_name
576
# Handler for \N{...}: import() stores a reference to this sub in
# $^H{charnames}.
#
# Resolves the name as a compile-time lookup and returns the character as
# a string, or nothing if lookup_name() returned undef.  With 'use bytes'
# in effect the result is built with chr(); otherwise it is built with
# pack "U", which forces it into utf8.
sub charnames {
    my ($name) = @_;

    my $ord = lookup_name($name, 0);    # 0 means compile-time
    defined $ord or return;

    if ($^H & $bytes::hint_bits) {
        return chr $ord;
    }

    no warnings 'utf8';    # allow even illegal characters
    return pack "U", $ord;
}
423cee85 590
b177ca84
JF
# Implements "use charnames LIST".
#
# Installs \&charnames as the \N{...} handler in %^H, processes any
# ":alias" arguments (a hash reference, a ":name" tag, or a file argument),
# and records the remaining arguments in %^H as
#   charnames_full    - set if ":full" was given
#   charnames_short   - set if ":short" was given
#   charnames_scripts - array ref of the upper-cased script names
#
# Carps if the import list is empty, croaks on malformed ":alias" usage,
# and warns about any other unknown ":special".  If 'utf8' warnings are
# enabled, also warns about each script for which the name table holds no
# letter at all.
sub import
{
    shift;    ## ignore class name

    if (not @_) {
        carp("`use charnames' needs explicit imports list");
    }
    $^H{charnames} = \&charnames ;

    ##
    ## fill %h keys with our @_ args.
    ##
    my ($promote, %h, @args) = (0);
    while (my $arg = shift) {
        if ($arg eq ":alias") {
            @_ or
                croak(":alias needs an argument in charnames");
            my $alias = shift;
            if (ref $alias) {
                ref $alias eq "HASH" or
                    croak("Only HASH reference supported as argument to :alias");
                alias ($alias);
                next;
            }
            if ($alias =~ m{:(\w+)$}) {
                $1 eq "full" || $1 eq "short" and
                    croak(":alias cannot use existing pragma :$1 (reversed order?)");

                # A successfully loaded ":name" alias file implies :full
                # when no other argument is given (see below).
                alias_file ($1) and $promote = 1;
                next;
            }
            alias_file ($alias);
            next;
        }
        if (substr($arg, 0, 1) eq ':' and ! ($arg eq ":full" || $arg eq ":short")) {
            warn "unsupported special '$arg' in charnames";
            next;
        }
        push @args, $arg;
    }
    @args == 0 && $promote and @args = (":full");
    @h{@args} = (1) x @args;

    $^H{charnames_full} = delete $h{':full'};
    $^H{charnames_short} = delete $h{':short'};
    $^H{charnames_scripts} = [map uc, keys %h];

    ##
    ## If utf8? warnings are enabled, and some scripts were given,
    ## see if at least we can find one letter from each script.
    ##
    if (warnings::enabled('utf8') && @{$^H{charnames_scripts}}) {
        $txt = do "unicore/Name.pl" unless $txt;

        for my $script (@{$^H{charnames_scripts}}) {
            # The script name comes straight from the import list, so
            # quote it: metacharacters are matched literally.
            if (not $txt =~ m/\t\t\Q$script\E (?:CAPITAL |SMALL )?LETTER /) {
                warnings::warn('utf8', "No such script: '$script'");
            }
        }
    }
}    # import
423cee85 651
63098191
KW
# Cache of names already found, keyed by the canonical hex form of the
# code point.
my %viacode;

# Returns the name of the code point given as the single argument.
#
# The argument may be decimal, or hex with an optional U+ / 0x prefix; it
# is tested as decimal first.  The official name from the name table is
# preferred; failing that, a user-defined numeric alias for the code point
# is returned.  Returns nothing, after a carp, for a wrong argument count
# or an unparsable argument.  Also returns nothing when no name is known,
# carping in that case only if the code point is above U+10FFFF.
sub viacode {

    unless (@_ == 1) {
        carp("charnames::viacode() expects one argument");
        return;
    }

    my ($arg) = @_;

    # Turn the argument into a number.  The table is searched by text, so
    # the hex form built below must always have the same number of leading
    # zeros as the table uses, even when the caller passed hex.
    my $code;
    if ($arg =~ $decimal_qr) {
        $code = $arg;
    }
    elsif ($arg =~ $hex_qr) {
        $code = hex $1;
    }
    else {
        carp("unexpected arg \"$arg\" to charnames::viacode()");
        return;
    }
    my $hex = sprintf "%04X", $code;

    return $viacode{$hex} if exists $viacode{$hex};

    # Code points above the table's maximum cannot be in it, so skip the
    # search for those.  The length test comes first because it is cheaper.
    if (length($hex) <= 5 || CORE::hex($hex) <= 0x10FFFF) {
        $txt = do "unicore/Name.pl" unless $txt;

        # The official name wins over any user-defined alias.
        if ($txt =~ m/^$hex\t\t/m) {

            # The name runs from the end of the match up to the next
            # newline.
            my $name_start = $+[0];
            my $name_end   = index($txt, "\n", $name_start);
            return $viacode{$hex}
                = substr($txt, $name_start, $name_end - $name_start);
        }
    }

    # No official name; fall back on a user-defined one if there is any.
    if (exists $inverse_user_aliases{$hex}) {
        return $viacode{$hex} = $inverse_user_aliases{$hex};
    }

    carp("Unicode characters only allocated up to U+10FFFF (you asked for U+$hex)")
        if CORE::hex($hex) > 0x10FFFF;
    return;
}    # viacode
daf0d493 712
# Cache of names already looked up, keyed by the name as given.
my %vianame;

# Run-time lookup of a character name.
#
# For an ordinary name, returns its ordinal, or undef if lookup_name()
# does not find it; the answer is cached either way.
#
# For an argument of the form "U+HEX" it instead returns the character
# itself (via chr), not the ordinal.  If that character is above 0xFF and
# the caller has 'use bytes' in effect, it carps and returns nothing.
# (The original author's note: this is bad; the function should return
# either an ord or a chr for all inputs, not a mixture.)
#
# Carps and returns the empty list unless called with exactly one
# argument.
sub vianame
{
    unless (@_ == 1) {
        carp("charnames::vianame() expects one name argument");
        return ()
    }

    my ($arg) = @_;

    if ($arg =~ /^U\+([0-9a-fA-F]+)$/) {
        my $ord = CORE::hex $1;

        # Element 8 of caller(0) holds the compile-time hints of the code
        # that called us; it is only examined for characters that do not
        # fit in a byte.
        if ($ord > 255 && ((caller 0)[8] & $bytes::hint_bits)) {
            carp(not_legal_use_bytes_msg($arg, $ord));
            return;
        }
        return chr $ord;
    }

    $vianame{$arg} = lookup_name($arg, 1)    # 1 means run-time
        unless exists $vianame{$arg};

    return $vianame{$arg};
}    # vianame
b177ca84 743
423cee85
JH
744
7451;
746__END__
747
748=head1 NAME
749
f12d74c0 750charnames - access to Unicode character names; define character names for C<\N{named}> string literal escapes
423cee85
JH
751
752=head1 SYNOPSIS
753
754 use charnames ':full';
4a2d328f 755 print "\N{GREEK SMALL LETTER SIGMA} is called sigma.\n";
423cee85
JH
756
757 use charnames ':short';
4a2d328f 758 print "\N{greek:Sigma} is an upper-case sigma.\n";
423cee85
JH
759
760 use charnames qw(cyrillic greek);
4a2d328f 761 print "\N{sigma} is Greek sigma, and \N{be} is Cyrillic b.\n";
423cee85 762
35c0985d
MB
763 use charnames ":full", ":alias" => {
764 e_ACUTE => "LATIN SMALL LETTER E WITH ACUTE",
e5432b89 765 mychar => 0xE8000, # Private use area
76ae0c45 766 };
35c0985d 767 print "\N{e_ACUTE} is a small letter e with an acute.\n";
da9dec57 768 print "\\N{mychar} allows me to name private use characters.\n";
35c0985d 769
76ae0c45 770 use charnames ();
a23c04e4 771 print charnames::viacode(0x1234); # prints "ETHIOPIC SYLLABLE SEE"
16036bcd
KW
772 printf "%04X", charnames::vianame("GOTHIC LETTER AHSA"); # prints
773 # "10330"
b177ca84 774
423cee85
JH
775=head1 DESCRIPTION
776
da9dec57
KW
777Pragma C<use charnames> is used to gain access to the names of the
778Unicode characters, and to allow you to define your own character names.
779
780All forms of the pragma enable use of the
781L</charnames::vianame(I<name>)> function for run-time lookup of a
782character name to get its ordinal (code point), and the inverse
783function, L</charnames::viacode(I<code>)>.
784
785Forms other than C<S<"use charnames ();">> enable the use of
786C<\N{I<CHARNAME>}> sequences to compile a Unicode character into a
787string based on its name.
788
789Note that C<\N{U+I<...>}>, where the I<...> is a hexadecimal number,
790also inserts a character into a string, but doesn't require the use of
791this pragma. The character it inserts is the one whose code point
792(ordinal value) is equal to the number. For example, C<"\N{U+263a}"> is
793the Unicode (white background, black foreground) smiley face; it doesn't
794require this pragma, whereas the equivalent, C<"\N{WHITE SMILING FACE}">
795does.
796Also, C<\N{I<...>}> can mean a regex quantifier instead of a character
797name, when the I<...> is a number (or comma separated pair of numbers;
798see L<perlreref/QUANTIFIERS>), and is not related to this pragma.
799
800The C<charnames> pragma supports arguments C<:full>, C<:short>, script
801names and customized aliases. If C<:full> is present, for expansion of
802C<\N{I<CHARNAME>}>, the string I<CHARNAME> is first looked up in the list of
76ae0c45 803standard Unicode character names. If C<:short> is present, and
da9dec57
KW
804I<CHARNAME> has the form C<I<SCRIPT>:I<CNAME>>, then I<CNAME> is looked up
805as a letter in script I<SCRIPT>. If C<use charnames> is used
806with script name arguments, then for C<\N{I<CHARNAME>}> the name
807I<CHARNAME> is looked up as a letter in the given scripts (in the
16036bcd
KW
808specified order). Customized aliases can override these, and are explained in
809L</CUSTOM ALIASES>.
423cee85 810
da9dec57 811For lookup of I<CHARNAME> inside a given script I<SCRIPTNAME>
d5448623 812this pragma looks for the names
423cee85
JH
813
814 SCRIPTNAME CAPITAL LETTER CHARNAME
815 SCRIPTNAME SMALL LETTER CHARNAME
816 SCRIPTNAME LETTER CHARNAME
817
da9dec57 818in the table of standard Unicode names. If I<CHARNAME> is lowercase,
daf0d493
JH
819then the C<CAPITAL> variant is ignored, otherwise the C<SMALL> variant
820is ignored.
821
da9dec57
KW
822Note that C<\N{...}> is compile-time; it's a special form of string
823constant used inside double-quotish strings; this means that you cannot
4e2cda5d 824use variables inside the C<\N{...}>. If you want similar run-time
da9dec57 825functionality, use L<charnames::vianame()|/charnames::vianame(I<name>)>.
423cee85 826
301a3cda 827For the C0 and C1 control characters (U+0000..U+001F, U+0080..U+009F)
da9dec57
KW
828there are no official Unicode names but you can use instead the ISO 6429
829names (LINE FEED, ESCAPE, and so forth, and their abbreviations, LF,
1f31fcd4
KW
830ESC, ...). In Unicode 3.2 (as of Perl 5.8) some naming changes took
831place, and ISO 6429 was updated; see L</ALIASES>.
301a3cda 832
e5432b89
KW
833If the input name is unknown, C<\N{NAME}> raises a warning and
834substitutes the Unicode REPLACEMENT CHARACTER (U+FFFD).
835
836It is a fatal error if C<use bytes> is in effect and the input name is
837that of a character that won't fit into a byte (i.e., whose ordinal is
838above 255).
839
da9dec57
KW
840Otherwise, any string that includes a C<\N{I<charname>}> or
841C<S<\N{U+I<code point>}>> will automatically have Unicode semantics (see
842L<perlunicode/Byte and Character Semantics>).
843
5ffe0e96 844=head1 ALIASES
423cee85 845
5ffe0e96
MB
846A few aliases have been defined for convenience: instead of having
847to use the official names
423cee85 848
5ffe0e96
MB
849 LINE FEED (LF)
850 FORM FEED (FF)
851 CARRIAGE RETURN (CR)
852 NEXT LINE (NEL)
423cee85 853
e5432b89 854(yes, with parentheses), one can use
d5448623 855
5ffe0e96
MB
856 LINE FEED
857 FORM FEED
858 CARRIAGE RETURN
859 NEXT LINE
860 LF
861 FF
862 CR
863 NEL
864
16036bcd
KW
865All the other standard abbreviations for the controls, such as C<ACK> for
866C<ACKNOWLEDGE>, can also be used.
867
5ffe0e96
MB
868One can also use
869
870 BYTE ORDER MARK
871 BOM
872
16036bcd
KW
873and these abbreviations
874
875 Abbreviation Full Name
876
877 CGJ COMBINING GRAPHEME JOINER
878 FVS1 MONGOLIAN FREE VARIATION SELECTOR ONE
879 FVS2 MONGOLIAN FREE VARIATION SELECTOR TWO
880 FVS3 MONGOLIAN FREE VARIATION SELECTOR THREE
881 LRE LEFT-TO-RIGHT EMBEDDING
882 LRM LEFT-TO-RIGHT MARK
883 LRO LEFT-TO-RIGHT OVERRIDE
884 MMSP MEDIUM MATHEMATICAL SPACE
885 MVS MONGOLIAN VOWEL SEPARATOR
886 NBSP NO-BREAK SPACE
887 NNBSP NARROW NO-BREAK SPACE
888 PDF POP DIRECTIONAL FORMATTING
889 RLE RIGHT-TO-LEFT EMBEDDING
890 RLM RIGHT-TO-LEFT MARK
891 RLO RIGHT-TO-LEFT OVERRIDE
892 SHY SOFT HYPHEN
893 VS1 VARIATION SELECTOR-1
894 .
895 .
896 .
897 VS256 VARIATION SELECTOR-256
898 WJ WORD JOINER
899 ZWJ ZERO WIDTH JOINER
900 ZWNJ ZERO WIDTH NON-JOINER
901 ZWSP ZERO WIDTH SPACE
5ffe0e96
MB
902
903For backward compatibility one can use the old names for
904certain C0 and C1 controls
905
906 old new
907
5ffe0e96
MB
908 FILE SEPARATOR INFORMATION SEPARATOR FOUR
909 GROUP SEPARATOR INFORMATION SEPARATOR THREE
16036bcd
KW
910 HORIZONTAL TABULATION CHARACTER TABULATION
911 HORIZONTAL TABULATION SET CHARACTER TABULATION SET
912 HORIZONTAL TABULATION WITH JUSTIFICATION CHARACTER TABULATION
913 WITH JUSTIFICATION
5ffe0e96
MB
914 PARTIAL LINE DOWN PARTIAL LINE FORWARD
915 PARTIAL LINE UP PARTIAL LINE BACKWARD
16036bcd
KW
916 RECORD SEPARATOR INFORMATION SEPARATOR TWO
917 REVERSE INDEX REVERSE LINE FEED
918 UNIT SEPARATOR INFORMATION SEPARATOR ONE
919 VERTICAL TABULATION LINE TABULATION
920 VERTICAL TABULATION SET LINE TABULATION SET
5ffe0e96
MB
921
922but the old names, in addition to giving the character,
923will also give a warning about being deprecated.
423cee85 924
16036bcd
KW
925And finally, certain published variants are usable, including some for
926controls that have no Unicode names:
927
1f31fcd4
KW
928 name character
929
930 END OF PROTECTED AREA END OF GUARDED AREA, U+0097
931 HIGH OCTET PRESET U+0081
932 HOP U+0081
933 IND U+0084
934 INDEX U+0084
935 PAD U+0080
936 PADDING CHARACTER U+0080
937 PRIVATE USE 1 PRIVATE USE ONE, U+0091
938 PRIVATE USE 2 PRIVATE USE TWO, U+0092
939 SGC U+0099
940 SINGLE GRAPHIC CHARACTER INTRODUCER U+0099
941 SINGLE-SHIFT 2 SINGLE SHIFT TWO, U+008E
942 SINGLE-SHIFT 3 SINGLE SHIFT THREE, U+008F
943 START OF PROTECTED AREA START OF GUARDED AREA, U+0096
16036bcd 944
35c0985d
MB
945=head1 CUSTOM ALIASES
946
1f31fcd4
KW
947You can add customized aliases to standard (C<:full>) Unicode naming
948conventions. The aliases override any standard definitions, so, if
da9dec57
KW
949you're twisted enough, you can change C<"\N{LATIN CAPITAL LETTER A}"> to
950mean C<"B">, etc.
55bc7d3c
KW
951
952Note that an alias should not be something that is a legal curly
953brace-enclosed quantifier (see L<perlreref/QUANTIFIERS>). For example
e5432b89
KW
954C<\N{123}> means to match 123 non-newline characters, and is not treated as a
955charnames alias. Aliases are discouraged from beginning with anything
956other than an alphabetic character and from containing anything other
957than alphanumerics, spaces, dashes, parentheses, and underscores.
958Currently they must be ASCII.
959
960An alias can map to either an official Unicode character name or to a
961numeric code point (ordinal). The latter is useful for assigning names
962to code points in Unicode private use areas such as U+E000 through
f12d74c0
KW
963U+F8FF.
964A numeric code point must be a non-negative integer or a string beginning
965with C<"U+"> or C<"0x"> with the remainder considered to be a
966hexadecimal integer. A literal numeric constant must be unsigned; it
967will be interpreted as hex if it has a leading zero or contains
968non-decimal hex digits; otherwise it will be interpreted as decimal.
232cbbee 969
da9dec57 970Aliases are added either by the use of anonymous hashes:
35c0985d 971
da9dec57 972 use charnames ":alias" => {
35c0985d 973 e_ACUTE => "LATIN SMALL LETTER E WITH ACUTE",
232cbbee 974 mychar1 => 0xE8000,
35c0985d
MB
975 };
976 my $str = "\N{e_ACUTE}";
977
da9dec57 978or by using a file containing aliases:
35c0985d 979
da9dec57 980 use charnames ":alias" => "pro";
35c0985d 981
da9dec57
KW
982will try to read C<"unicore/pro_alias.pl"> from the C<@INC> path. This
983file should return a list in plain perl:
35c0985d
MB
984
985 (
986 A_GRAVE => "LATIN CAPITAL LETTER A WITH GRAVE",
987 A_CIRCUM => "LATIN CAPITAL LETTER A WITH CIRCUMFLEX",
988 A_DIAERES => "LATIN CAPITAL LETTER A WITH DIAERESIS",
989 A_TILDE => "LATIN CAPITAL LETTER A WITH TILDE",
990 A_BREVE => "LATIN CAPITAL LETTER A WITH BREVE",
991 A_RING => "LATIN CAPITAL LETTER A WITH RING ABOVE",
992 A_MACRON => "LATIN CAPITAL LETTER A WITH MACRON",
f12d74c0 993 mychar2 => "U+E8001",
35c0985d
MB
994 );
995
da9dec57
KW
996Both these methods insert C<":full"> automatically as the first argument (if no
997other argument is given), and you can give the C<":full"> explicitly as
998well, like
35c0985d 999
da9dec57 1000 use charnames ":full", ":alias" => "pro";
35c0985d 1001
da9dec57 1002=head1 charnames::viacode(I<code>)
b177ca84
JF
1003
1004Returns the full name of the character indicated by the numeric code.
da9dec57 1005For example,
b177ca84
JF
1006
1007 print charnames::viacode(0x2722);
1008
1009prints "FOUR TEARDROP-SPOKED ASTERISK".
1010
232cbbee
KW
1011The name returned is the official name for the code point, if
1012available, otherwise your custom alias for it. This means that your
1013alias will only be returned for code points that don't have an official
1014Unicode name (nor Unicode version 1 name), such as private use code
1015points, and the 4 control characters U+0080, U+0081, U+0084, and U+0099.
da9dec57
KW
1016If you define more than one name for the code point, it is indeterminate
1017which one will be returned.
1018
1019The function returns C<undef> if no name is known for the code point.
1020In Unicode the proper name of such code points is the empty string, which
1021C<undef> stringifies to. (If you ask for a code point past the legal
1022Unicode maximum of U+10FFFF that you haven't assigned an alias to, you
f12d74c0
KW
1023get C<undef> plus a warning.)
1024
1025The input number must be a non-negative integer or a string beginning
1026with C<"U+"> or C<"0x"> with the remainder considered to be a
1027hexadecimal integer. A literal numeric constant must be unsigned; it
1028will be interpreted as hex if it has a leading zero or contains
1029non-decimal hex digits; otherwise it will be interpreted as decimal.
daf0d493 1030
274085e3
PN
1031Notice that the name returned for U+FEFF is "ZERO WIDTH NO-BREAK
1032SPACE", not "BYTE ORDER MARK".
1033
da9dec57 1034=head1 charnames::vianame(I<name>)
daf0d493
JH
1035
1036Returns the code point indicated by the name.
1f31fcd4 1037For example,
daf0d493
JH
1038
1039 printf "%04X", charnames::vianame("FOUR TEARDROP-SPOKED ASTERISK");
1040
1041prints "2722".
1042
da9dec57
KW
1043C<vianame> takes the identical inputs that C<\N{...}> does under the
1044L<C<:full> and C<:short>|/DESCRIPTION> options to the C<charnames>
1045pragma, including any L<custom aliases|/CUSTOM ALIASES> you may have
1046defined.
b177ca84 1047
1f31fcd4
KW
1048There are just a few differences. The main one is that under
1049most circumstances (see L</BUGS> for the other ones), C<vianame> returns
1050an ord, whereas C<\N{...}> is seamlessly placed as a chr into the
1051string in which it appears. This leads to a second difference.
1052Since an ord is returned, it can be that of any character, even one
f12d74c0 1053that isn't legal under the C<S<use bytes>> pragma.
1f31fcd4
KW
1054
1055The final difference is that if the input name is unknown C<vianame>
1056returns C<undef> instead of the REPLACEMENT CHARACTER, and it does not
1057raise a warning message.
b177ca84 1058
5ffe0e96 1059=head1 CUSTOM TRANSLATORS
52ea3e69 1060
5ffe0e96
MB
1061The mechanism of translation of C<\N{...}> escapes is general and not
1062hardwired into F<charnames.pm>. A module can install custom
1063translations (inside the scope which C<use>s the module) with the
1064following magic incantation:
52ea3e69 1065
5ffe0e96
MB
1066 sub import {
1067 shift;
1068 $^H{charnames} = \&translator;
1069 }
52ea3e69 1070
da9dec57 1071Here translator() is a subroutine which takes I<CHARNAME> as an
5ffe0e96 1072argument, and returns text to insert into the string instead of the
da9dec57 1073C<\N{I<CHARNAME>}> escape. Since the text to insert should be different
5ffe0e96
MB
1074in C<bytes> mode and out of it, the function should check the current
1075state of C<bytes>-flag as in:
52ea3e69 1076
5ffe0e96
MB
1077 use bytes (); # for $bytes::hint_bits
1078 sub translator {
1079 if ($^H & $bytes::hint_bits) {
1080 return bytes_translator(@_);
1081 }
1082 else {
1083 return utf8_translator(@_);
1084 }
1085 }
52ea3e69 1086
da9dec57 1087See L</CUSTOM ALIASES> above for restrictions on I<CHARNAME>.
f0175764 1088
1f31fcd4
KW
1089Of course, C<vianame> and C<viacode> would need to be overridden as
1090well.
1091
423cee85
JH
1092=head1 BUGS
1093
55bc7d3c 1094vianame returns a chr if the input name is of the form C<U+...>, and an ord
a0a3bc7f 1095otherwise. It is proposed to change this to always return an ord. Send email
1f31fcd4
KW
1096to C<perl5-porters@perl.org> to comment on this proposal. If S<C<use
1097bytes>> is in effect when a chr is returned, and if that chr won't fit
1098into a byte, C<undef> is returned instead.
55bc7d3c 1099
da9dec57
KW
1100All the Hangul syllable characters are treated as having no names, as
1101are almost all the CJK Unicode characters that have their code points as
1102part of their names.
55bc7d3c 1103
16036bcd
KW
1104Names must be ASCII characters only, which means that you are out of luck if
1105you want to create aliases in a language where some or all the characters of
1106the desired aliases are non-ASCII.
bee80e93 1107
fe749c9a
KW
1108Unicode standard named sequences are not recognized, such as
1109C<LATIN CAPITAL LETTER A WITH MACRON AND GRAVE>
1110(which should mean C<LATIN CAPITAL LETTER A WITH MACRON> with an additional
1111C<COMBINING GRAVE ACCENT>).
1112
f12d74c0
KW
1113Since evaluation of the translation function (see L</CUSTOM
1114TRANSLATORS>) happens in the middle of compilation (of a string
1115literal), the translation function should not do any C<eval>s or
1116C<require>s. This restriction should be lifted (but is low priority) in
1117a future version of Perl.
423cee85
JH
1118
1119=cut
0eacc33e
KW
1120
1121# ex: set ts=8 sts=2 sw=2 noet: