This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Teach Perl about Unicode named character sequences
[perl5.git] / lib / charnames.pm
CommitLineData
423cee85 1package charnames;
b177ca84
JF
2use strict;
3use warnings;
51cf30b6 4use File::Spec;
fb121860 5our $VERSION = '1.16';
b75c8c73 6
52fb7278 7use bytes (); # for $bytes::hint_bits
423cee85 8
fb121860
KW
9# The hashes are stored as utf8 strings. This makes it easier to deal with
10# sequences. I (khw) also tried making Name.pl utf8, but it slowed things
11# down by a factor of 7. I then tried making Name.pl store the ut8
12# equivalents but not calling them utf8. That led to similar speed as leaving
13# it alone, but since that is harder for a human to parse, I left it as-is.
14
232cbbee 15my %system_aliases = (
16036bcd 16 # Icky 3.2 names with parentheses.
fb121860
KW
17 'LINE FEED' => pack("U", 0x0A), # LINE FEED (LF)
18 'FORM FEED' => pack("U", 0x0C), # FORM FEED (FF)
19 'CARRIAGE RETURN' => pack("U", 0x0D), # CARRIAGE RETURN (CR)
20 'NEXT LINE' => pack("U", 0x85), # NEXT LINE (NEL)
16036bcd
KW
21
22 # Some variant names from Wikipedia
fb121860
KW
23 'SINGLE-SHIFT 2' => pack("U", 0x8E),
24 'SINGLE-SHIFT 3' => pack("U", 0x8F),
25 'PRIVATE USE 1' => pack("U", 0x91),
26 'PRIVATE USE 2' => pack("U", 0x92),
27 'START OF PROTECTED AREA' => pack("U", 0x96),
28 'END OF PROTECTED AREA' => pack("U", 0x97),
16036bcd
KW
29
30 # Convenience. Standard abbreviations for the controls
fb121860
KW
31 'NUL' => pack("U", 0x00), # NULL
32 'SOH' => pack("U", 0x01), # START OF HEADING
33 'STX' => pack("U", 0x02), # START OF TEXT
34 'ETX' => pack("U", 0x03), # END OF TEXT
35 'EOT' => pack("U", 0x04), # END OF TRANSMISSION
36 'ENQ' => pack("U", 0x05), # ENQUIRY
37 'ACK' => pack("U", 0x06), # ACKNOWLEDGE
38 'BEL' => pack("U", 0x07), # BELL
39 'BS' => pack("U", 0x08), # BACKSPACE
40 'HT' => pack("U", 0x09), # HORIZONTAL TABULATION
41 'LF' => pack("U", 0x0A), # LINE FEED (LF)
42 'VT' => pack("U", 0x0B), # VERTICAL TABULATION
43 'FF' => pack("U", 0x0C), # FORM FEED (FF)
44 'CR' => pack("U", 0x0D), # CARRIAGE RETURN (CR)
45 'SO' => pack("U", 0x0E), # SHIFT OUT
46 'SI' => pack("U", 0x0F), # SHIFT IN
47 'DLE' => pack("U", 0x10), # DATA LINK ESCAPE
48 'DC1' => pack("U", 0x11), # DEVICE CONTROL ONE
49 'DC2' => pack("U", 0x12), # DEVICE CONTROL TWO
50 'DC3' => pack("U", 0x13), # DEVICE CONTROL THREE
51 'DC4' => pack("U", 0x14), # DEVICE CONTROL FOUR
52 'NAK' => pack("U", 0x15), # NEGATIVE ACKNOWLEDGE
53 'SYN' => pack("U", 0x16), # SYNCHRONOUS IDLE
54 'ETB' => pack("U", 0x17), # END OF TRANSMISSION BLOCK
55 'CAN' => pack("U", 0x18), # CANCEL
56 'EOM' => pack("U", 0x19), # END OF MEDIUM
57 'SUB' => pack("U", 0x1A), # SUBSTITUTE
58 'ESC' => pack("U", 0x1B), # ESCAPE
59 'FS' => pack("U", 0x1C), # FILE SEPARATOR
60 'GS' => pack("U", 0x1D), # GROUP SEPARATOR
61 'RS' => pack("U", 0x1E), # RECORD SEPARATOR
62 'US' => pack("U", 0x1F), # UNIT SEPARATOR
63 'DEL' => pack("U", 0x7F), # DELETE
64 'BPH' => pack("U", 0x82), # BREAK PERMITTED HERE
65 'NBH' => pack("U", 0x83), # NO BREAK HERE
66 'NEL' => pack("U", 0x85), # NEXT LINE (NEL)
67 'SSA' => pack("U", 0x86), # START OF SELECTED AREA
68 'ESA' => pack("U", 0x87), # END OF SELECTED AREA
69 'HTS' => pack("U", 0x88), # CHARACTER TABULATION SET
70 'HTJ' => pack("U", 0x89), # CHARACTER TABULATION WITH JUSTIFICATION
71 'VTS' => pack("U", 0x8A), # LINE TABULATION SET
72 'PLD' => pack("U", 0x8B), # PARTIAL LINE FORWARD
73 'PLU' => pack("U", 0x8C), # PARTIAL LINE BACKWARD
74 'RI ' => pack("U", 0x8D), # REVERSE LINE FEED
75 'SS2' => pack("U", 0x8E), # SINGLE SHIFT TWO
76 'SS3' => pack("U", 0x8F), # SINGLE SHIFT THREE
77 'DCS' => pack("U", 0x90), # DEVICE CONTROL STRING
78 'PU1' => pack("U", 0x91), # PRIVATE USE ONE
79 'PU2' => pack("U", 0x92), # PRIVATE USE TWO
80 'STS' => pack("U", 0x93), # SET TRANSMIT STATE
81 'CCH' => pack("U", 0x94), # CANCEL CHARACTER
82 'MW ' => pack("U", 0x95), # MESSAGE WAITING
83 'SPA' => pack("U", 0x96), # START OF GUARDED AREA
84 'EPA' => pack("U", 0x97), # END OF GUARDED AREA
85 'SOS' => pack("U", 0x98), # START OF STRING
86 'SCI' => pack("U", 0x9A), # SINGLE CHARACTER INTRODUCER
87 'CSI' => pack("U", 0x9B), # CONTROL SEQUENCE INTRODUCER
88 'ST ' => pack("U", 0x9C), # STRING TERMINATOR
89 'OSC' => pack("U", 0x9D), # OPERATING SYSTEM COMMAND
90 'PM ' => pack("U", 0x9E), # PRIVACY MESSAGE
91 'APC' => pack("U", 0x9F), # APPLICATION PROGRAM COMMAND
16036bcd
KW
92
93 # There are no names for these in the Unicode standard;
94 # perhaps should be deprecated, but then again there are
95 # no alternative names, so am not deprecating. And if
96 # did, the code would have to change to not recommend an
97 # alternative for these.
fb121860
KW
98 'PADDING CHARACTER' => pack("U", 0x80),
99 'PAD' => pack("U", 0x80),
100 'HIGH OCTET PRESET' => pack("U", 0x81),
101 'HOP' => pack("U", 0x81),
102 'INDEX' => pack("U", 0x84),
103 'IND' => pack("U", 0x84),
104 'SINGLE GRAPHIC CHARACTER INTRODUCER' => pack("U", 0x99),
105 'SGC' => pack("U", 0x99),
16036bcd
KW
106
107 # More convenience. For further convenience,
108 # it is suggested some way of using the NamesList
109 # aliases be implemented, but there are ambiguities in
232cbbee 110 # NamesList.txt
fb121860
KW
111 'BOM' => pack("U", 0xFEFF), # BYTE ORDER MARK
112 'BYTE ORDER MARK'=> pack("U", 0xFEFF),
113 'CGJ' => pack("U", 0x034F), # COMBINING GRAPHEME JOINER
114 'FVS1' => pack("U", 0x180B), # MONGOLIAN FREE VARIATION SELECTOR ONE
115 'FVS2' => pack("U", 0x180C), # MONGOLIAN FREE VARIATION SELECTOR TWO
116 'FVS3' => pack("U", 0x180D), # MONGOLIAN FREE VARIATION SELECTOR THREE
117 'LRE' => pack("U", 0x202A), # LEFT-TO-RIGHT EMBEDDING
118 'LRM' => pack("U", 0x200E), # LEFT-TO-RIGHT MARK
119 'LRO' => pack("U", 0x202D), # LEFT-TO-RIGHT OVERRIDE
120 'MMSP' => pack("U", 0x205F), # MEDIUM MATHEMATICAL SPACE
121 'MVS' => pack("U", 0x180E), # MONGOLIAN VOWEL SEPARATOR
122 'NBSP' => pack("U", 0x00A0), # NO-BREAK SPACE
123 'NNBSP' => pack("U", 0x202F), # NARROW NO-BREAK SPACE
124 'PDF' => pack("U", 0x202C), # POP DIRECTIONAL FORMATTING
125 'RLE' => pack("U", 0x202B), # RIGHT-TO-LEFT EMBEDDING
126 'RLM' => pack("U", 0x200F), # RIGHT-TO-LEFT MARK
127 'RLO' => pack("U", 0x202E), # RIGHT-TO-LEFT OVERRIDE
128 'SHY' => pack("U", 0x00AD), # SOFT HYPHEN
129 'VS1' => pack("U", 0xFE00), # VARIATION SELECTOR-1
130 'VS2' => pack("U", 0xFE01), # VARIATION SELECTOR-2
131 'VS3' => pack("U", 0xFE02), # VARIATION SELECTOR-3
132 'VS4' => pack("U", 0xFE03), # VARIATION SELECTOR-4
133 'VS5' => pack("U", 0xFE04), # VARIATION SELECTOR-5
134 'VS6' => pack("U", 0xFE05), # VARIATION SELECTOR-6
135 'VS7' => pack("U", 0xFE06), # VARIATION SELECTOR-7
136 'VS8' => pack("U", 0xFE07), # VARIATION SELECTOR-8
137 'VS9' => pack("U", 0xFE08), # VARIATION SELECTOR-9
138 'VS10' => pack("U", 0xFE09), # VARIATION SELECTOR-10
139 'VS11' => pack("U", 0xFE0A), # VARIATION SELECTOR-11
140 'VS12' => pack("U", 0xFE0B), # VARIATION SELECTOR-12
141 'VS13' => pack("U", 0xFE0C), # VARIATION SELECTOR-13
142 'VS14' => pack("U", 0xFE0D), # VARIATION SELECTOR-14
143 'VS15' => pack("U", 0xFE0E), # VARIATION SELECTOR-15
144 'VS16' => pack("U", 0xFE0F), # VARIATION SELECTOR-16
145 'VS17' => pack("U", 0xE0100), # VARIATION SELECTOR-17
146 'VS18' => pack("U", 0xE0101), # VARIATION SELECTOR-18
147 'VS19' => pack("U", 0xE0102), # VARIATION SELECTOR-19
148 'VS20' => pack("U", 0xE0103), # VARIATION SELECTOR-20
149 'VS21' => pack("U", 0xE0104), # VARIATION SELECTOR-21
150 'VS22' => pack("U", 0xE0105), # VARIATION SELECTOR-22
151 'VS23' => pack("U", 0xE0106), # VARIATION SELECTOR-23
152 'VS24' => pack("U", 0xE0107), # VARIATION SELECTOR-24
153 'VS25' => pack("U", 0xE0108), # VARIATION SELECTOR-25
154 'VS26' => pack("U", 0xE0109), # VARIATION SELECTOR-26
155 'VS27' => pack("U", 0xE010A), # VARIATION SELECTOR-27
156 'VS28' => pack("U", 0xE010B), # VARIATION SELECTOR-28
157 'VS29' => pack("U", 0xE010C), # VARIATION SELECTOR-29
158 'VS30' => pack("U", 0xE010D), # VARIATION SELECTOR-30
159 'VS31' => pack("U", 0xE010E), # VARIATION SELECTOR-31
160 'VS32' => pack("U", 0xE010F), # VARIATION SELECTOR-32
161 'VS33' => pack("U", 0xE0110), # VARIATION SELECTOR-33
162 'VS34' => pack("U", 0xE0111), # VARIATION SELECTOR-34
163 'VS35' => pack("U", 0xE0112), # VARIATION SELECTOR-35
164 'VS36' => pack("U", 0xE0113), # VARIATION SELECTOR-36
165 'VS37' => pack("U", 0xE0114), # VARIATION SELECTOR-37
166 'VS38' => pack("U", 0xE0115), # VARIATION SELECTOR-38
167 'VS39' => pack("U", 0xE0116), # VARIATION SELECTOR-39
168 'VS40' => pack("U", 0xE0117), # VARIATION SELECTOR-40
169 'VS41' => pack("U", 0xE0118), # VARIATION SELECTOR-41
170 'VS42' => pack("U", 0xE0119), # VARIATION SELECTOR-42
171 'VS43' => pack("U", 0xE011A), # VARIATION SELECTOR-43
172 'VS44' => pack("U", 0xE011B), # VARIATION SELECTOR-44
173 'VS45' => pack("U", 0xE011C), # VARIATION SELECTOR-45
174 'VS46' => pack("U", 0xE011D), # VARIATION SELECTOR-46
175 'VS47' => pack("U", 0xE011E), # VARIATION SELECTOR-47
176 'VS48' => pack("U", 0xE011F), # VARIATION SELECTOR-48
177 'VS49' => pack("U", 0xE0120), # VARIATION SELECTOR-49
178 'VS50' => pack("U", 0xE0121), # VARIATION SELECTOR-50
179 'VS51' => pack("U", 0xE0122), # VARIATION SELECTOR-51
180 'VS52' => pack("U", 0xE0123), # VARIATION SELECTOR-52
181 'VS53' => pack("U", 0xE0124), # VARIATION SELECTOR-53
182 'VS54' => pack("U", 0xE0125), # VARIATION SELECTOR-54
183 'VS55' => pack("U", 0xE0126), # VARIATION SELECTOR-55
184 'VS56' => pack("U", 0xE0127), # VARIATION SELECTOR-56
185 'VS57' => pack("U", 0xE0128), # VARIATION SELECTOR-57
186 'VS58' => pack("U", 0xE0129), # VARIATION SELECTOR-58
187 'VS59' => pack("U", 0xE012A), # VARIATION SELECTOR-59
188 'VS60' => pack("U", 0xE012B), # VARIATION SELECTOR-60
189 'VS61' => pack("U", 0xE012C), # VARIATION SELECTOR-61
190 'VS62' => pack("U", 0xE012D), # VARIATION SELECTOR-62
191 'VS63' => pack("U", 0xE012E), # VARIATION SELECTOR-63
192 'VS64' => pack("U", 0xE012F), # VARIATION SELECTOR-64
193 'VS65' => pack("U", 0xE0130), # VARIATION SELECTOR-65
194 'VS66' => pack("U", 0xE0131), # VARIATION SELECTOR-66
195 'VS67' => pack("U", 0xE0132), # VARIATION SELECTOR-67
196 'VS68' => pack("U", 0xE0133), # VARIATION SELECTOR-68
197 'VS69' => pack("U", 0xE0134), # VARIATION SELECTOR-69
198 'VS70' => pack("U", 0xE0135), # VARIATION SELECTOR-70
199 'VS71' => pack("U", 0xE0136), # VARIATION SELECTOR-71
200 'VS72' => pack("U", 0xE0137), # VARIATION SELECTOR-72
201 'VS73' => pack("U", 0xE0138), # VARIATION SELECTOR-73
202 'VS74' => pack("U", 0xE0139), # VARIATION SELECTOR-74
203 'VS75' => pack("U", 0xE013A), # VARIATION SELECTOR-75
204 'VS76' => pack("U", 0xE013B), # VARIATION SELECTOR-76
205 'VS77' => pack("U", 0xE013C), # VARIATION SELECTOR-77
206 'VS78' => pack("U", 0xE013D), # VARIATION SELECTOR-78
207 'VS79' => pack("U", 0xE013E), # VARIATION SELECTOR-79
208 'VS80' => pack("U", 0xE013F), # VARIATION SELECTOR-80
209 'VS81' => pack("U", 0xE0140), # VARIATION SELECTOR-81
210 'VS82' => pack("U", 0xE0141), # VARIATION SELECTOR-82
211 'VS83' => pack("U", 0xE0142), # VARIATION SELECTOR-83
212 'VS84' => pack("U", 0xE0143), # VARIATION SELECTOR-84
213 'VS85' => pack("U", 0xE0144), # VARIATION SELECTOR-85
214 'VS86' => pack("U", 0xE0145), # VARIATION SELECTOR-86
215 'VS87' => pack("U", 0xE0146), # VARIATION SELECTOR-87
216 'VS88' => pack("U", 0xE0147), # VARIATION SELECTOR-88
217 'VS89' => pack("U", 0xE0148), # VARIATION SELECTOR-89
218 'VS90' => pack("U", 0xE0149), # VARIATION SELECTOR-90
219 'VS91' => pack("U", 0xE014A), # VARIATION SELECTOR-91
220 'VS92' => pack("U", 0xE014B), # VARIATION SELECTOR-92
221 'VS93' => pack("U", 0xE014C), # VARIATION SELECTOR-93
222 'VS94' => pack("U", 0xE014D), # VARIATION SELECTOR-94
223 'VS95' => pack("U", 0xE014E), # VARIATION SELECTOR-95
224 'VS96' => pack("U", 0xE014F), # VARIATION SELECTOR-96
225 'VS97' => pack("U", 0xE0150), # VARIATION SELECTOR-97
226 'VS98' => pack("U", 0xE0151), # VARIATION SELECTOR-98
227 'VS99' => pack("U", 0xE0152), # VARIATION SELECTOR-99
228 'VS100' => pack("U", 0xE0153), # VARIATION SELECTOR-100
229 'VS101' => pack("U", 0xE0154), # VARIATION SELECTOR-101
230 'VS102' => pack("U", 0xE0155), # VARIATION SELECTOR-102
231 'VS103' => pack("U", 0xE0156), # VARIATION SELECTOR-103
232 'VS104' => pack("U", 0xE0157), # VARIATION SELECTOR-104
233 'VS105' => pack("U", 0xE0158), # VARIATION SELECTOR-105
234 'VS106' => pack("U", 0xE0159), # VARIATION SELECTOR-106
235 'VS107' => pack("U", 0xE015A), # VARIATION SELECTOR-107
236 'VS108' => pack("U", 0xE015B), # VARIATION SELECTOR-108
237 'VS109' => pack("U", 0xE015C), # VARIATION SELECTOR-109
238 'VS110' => pack("U", 0xE015D), # VARIATION SELECTOR-110
239 'VS111' => pack("U", 0xE015E), # VARIATION SELECTOR-111
240 'VS112' => pack("U", 0xE015F), # VARIATION SELECTOR-112
241 'VS113' => pack("U", 0xE0160), # VARIATION SELECTOR-113
242 'VS114' => pack("U", 0xE0161), # VARIATION SELECTOR-114
243 'VS115' => pack("U", 0xE0162), # VARIATION SELECTOR-115
244 'VS116' => pack("U", 0xE0163), # VARIATION SELECTOR-116
245 'VS117' => pack("U", 0xE0164), # VARIATION SELECTOR-117
246 'VS118' => pack("U", 0xE0165), # VARIATION SELECTOR-118
247 'VS119' => pack("U", 0xE0166), # VARIATION SELECTOR-119
248 'VS120' => pack("U", 0xE0167), # VARIATION SELECTOR-120
249 'VS121' => pack("U", 0xE0168), # VARIATION SELECTOR-121
250 'VS122' => pack("U", 0xE0169), # VARIATION SELECTOR-122
251 'VS123' => pack("U", 0xE016A), # VARIATION SELECTOR-123
252 'VS124' => pack("U", 0xE016B), # VARIATION SELECTOR-124
253 'VS125' => pack("U", 0xE016C), # VARIATION SELECTOR-125
254 'VS126' => pack("U", 0xE016D), # VARIATION SELECTOR-126
255 'VS127' => pack("U", 0xE016E), # VARIATION SELECTOR-127
256 'VS128' => pack("U", 0xE016F), # VARIATION SELECTOR-128
257 'VS129' => pack("U", 0xE0170), # VARIATION SELECTOR-129
258 'VS130' => pack("U", 0xE0171), # VARIATION SELECTOR-130
259 'VS131' => pack("U", 0xE0172), # VARIATION SELECTOR-131
260 'VS132' => pack("U", 0xE0173), # VARIATION SELECTOR-132
261 'VS133' => pack("U", 0xE0174), # VARIATION SELECTOR-133
262 'VS134' => pack("U", 0xE0175), # VARIATION SELECTOR-134
263 'VS135' => pack("U", 0xE0176), # VARIATION SELECTOR-135
264 'VS136' => pack("U", 0xE0177), # VARIATION SELECTOR-136
265 'VS137' => pack("U", 0xE0178), # VARIATION SELECTOR-137
266 'VS138' => pack("U", 0xE0179), # VARIATION SELECTOR-138
267 'VS139' => pack("U", 0xE017A), # VARIATION SELECTOR-139
268 'VS140' => pack("U", 0xE017B), # VARIATION SELECTOR-140
269 'VS141' => pack("U", 0xE017C), # VARIATION SELECTOR-141
270 'VS142' => pack("U", 0xE017D), # VARIATION SELECTOR-142
271 'VS143' => pack("U", 0xE017E), # VARIATION SELECTOR-143
272 'VS144' => pack("U", 0xE017F), # VARIATION SELECTOR-144
273 'VS145' => pack("U", 0xE0180), # VARIATION SELECTOR-145
274 'VS146' => pack("U", 0xE0181), # VARIATION SELECTOR-146
275 'VS147' => pack("U", 0xE0182), # VARIATION SELECTOR-147
276 'VS148' => pack("U", 0xE0183), # VARIATION SELECTOR-148
277 'VS149' => pack("U", 0xE0184), # VARIATION SELECTOR-149
278 'VS150' => pack("U", 0xE0185), # VARIATION SELECTOR-150
279 'VS151' => pack("U", 0xE0186), # VARIATION SELECTOR-151
280 'VS152' => pack("U", 0xE0187), # VARIATION SELECTOR-152
281 'VS153' => pack("U", 0xE0188), # VARIATION SELECTOR-153
282 'VS154' => pack("U", 0xE0189), # VARIATION SELECTOR-154
283 'VS155' => pack("U", 0xE018A), # VARIATION SELECTOR-155
284 'VS156' => pack("U", 0xE018B), # VARIATION SELECTOR-156
285 'VS157' => pack("U", 0xE018C), # VARIATION SELECTOR-157
286 'VS158' => pack("U", 0xE018D), # VARIATION SELECTOR-158
287 'VS159' => pack("U", 0xE018E), # VARIATION SELECTOR-159
288 'VS160' => pack("U", 0xE018F), # VARIATION SELECTOR-160
289 'VS161' => pack("U", 0xE0190), # VARIATION SELECTOR-161
290 'VS162' => pack("U", 0xE0191), # VARIATION SELECTOR-162
291 'VS163' => pack("U", 0xE0192), # VARIATION SELECTOR-163
292 'VS164' => pack("U", 0xE0193), # VARIATION SELECTOR-164
293 'VS165' => pack("U", 0xE0194), # VARIATION SELECTOR-165
294 'VS166' => pack("U", 0xE0195), # VARIATION SELECTOR-166
295 'VS167' => pack("U", 0xE0196), # VARIATION SELECTOR-167
296 'VS168' => pack("U", 0xE0197), # VARIATION SELECTOR-168
297 'VS169' => pack("U", 0xE0198), # VARIATION SELECTOR-169
298 'VS170' => pack("U", 0xE0199), # VARIATION SELECTOR-170
299 'VS171' => pack("U", 0xE019A), # VARIATION SELECTOR-171
300 'VS172' => pack("U", 0xE019B), # VARIATION SELECTOR-172
301 'VS173' => pack("U", 0xE019C), # VARIATION SELECTOR-173
302 'VS174' => pack("U", 0xE019D), # VARIATION SELECTOR-174
303 'VS175' => pack("U", 0xE019E), # VARIATION SELECTOR-175
304 'VS176' => pack("U", 0xE019F), # VARIATION SELECTOR-176
305 'VS177' => pack("U", 0xE01A0), # VARIATION SELECTOR-177
306 'VS178' => pack("U", 0xE01A1), # VARIATION SELECTOR-178
307 'VS179' => pack("U", 0xE01A2), # VARIATION SELECTOR-179
308 'VS180' => pack("U", 0xE01A3), # VARIATION SELECTOR-180
309 'VS181' => pack("U", 0xE01A4), # VARIATION SELECTOR-181
310 'VS182' => pack("U", 0xE01A5), # VARIATION SELECTOR-182
311 'VS183' => pack("U", 0xE01A6), # VARIATION SELECTOR-183
312 'VS184' => pack("U", 0xE01A7), # VARIATION SELECTOR-184
313 'VS185' => pack("U", 0xE01A8), # VARIATION SELECTOR-185
314 'VS186' => pack("U", 0xE01A9), # VARIATION SELECTOR-186
315 'VS187' => pack("U", 0xE01AA), # VARIATION SELECTOR-187
316 'VS188' => pack("U", 0xE01AB), # VARIATION SELECTOR-188
317 'VS189' => pack("U", 0xE01AC), # VARIATION SELECTOR-189
318 'VS190' => pack("U", 0xE01AD), # VARIATION SELECTOR-190
319 'VS191' => pack("U", 0xE01AE), # VARIATION SELECTOR-191
320 'VS192' => pack("U", 0xE01AF), # VARIATION SELECTOR-192
321 'VS193' => pack("U", 0xE01B0), # VARIATION SELECTOR-193
322 'VS194' => pack("U", 0xE01B1), # VARIATION SELECTOR-194
323 'VS195' => pack("U", 0xE01B2), # VARIATION SELECTOR-195
324 'VS196' => pack("U", 0xE01B3), # VARIATION SELECTOR-196
325 'VS197' => pack("U", 0xE01B4), # VARIATION SELECTOR-197
326 'VS198' => pack("U", 0xE01B5), # VARIATION SELECTOR-198
327 'VS199' => pack("U", 0xE01B6), # VARIATION SELECTOR-199
328 'VS200' => pack("U", 0xE01B7), # VARIATION SELECTOR-200
329 'VS201' => pack("U", 0xE01B8), # VARIATION SELECTOR-201
330 'VS202' => pack("U", 0xE01B9), # VARIATION SELECTOR-202
331 'VS203' => pack("U", 0xE01BA), # VARIATION SELECTOR-203
332 'VS204' => pack("U", 0xE01BB), # VARIATION SELECTOR-204
333 'VS205' => pack("U", 0xE01BC), # VARIATION SELECTOR-205
334 'VS206' => pack("U", 0xE01BD), # VARIATION SELECTOR-206
335 'VS207' => pack("U", 0xE01BE), # VARIATION SELECTOR-207
336 'VS208' => pack("U", 0xE01BF), # VARIATION SELECTOR-208
337 'VS209' => pack("U", 0xE01C0), # VARIATION SELECTOR-209
338 'VS210' => pack("U", 0xE01C1), # VARIATION SELECTOR-210
339 'VS211' => pack("U", 0xE01C2), # VARIATION SELECTOR-211
340 'VS212' => pack("U", 0xE01C3), # VARIATION SELECTOR-212
341 'VS213' => pack("U", 0xE01C4), # VARIATION SELECTOR-213
342 'VS214' => pack("U", 0xE01C5), # VARIATION SELECTOR-214
343 'VS215' => pack("U", 0xE01C6), # VARIATION SELECTOR-215
344 'VS216' => pack("U", 0xE01C7), # VARIATION SELECTOR-216
345 'VS217' => pack("U", 0xE01C8), # VARIATION SELECTOR-217
346 'VS218' => pack("U", 0xE01C9), # VARIATION SELECTOR-218
347 'VS219' => pack("U", 0xE01CA), # VARIATION SELECTOR-219
348 'VS220' => pack("U", 0xE01CB), # VARIATION SELECTOR-220
349 'VS221' => pack("U", 0xE01CC), # VARIATION SELECTOR-221
350 'VS222' => pack("U", 0xE01CD), # VARIATION SELECTOR-222
351 'VS223' => pack("U", 0xE01CE), # VARIATION SELECTOR-223
352 'VS224' => pack("U", 0xE01CF), # VARIATION SELECTOR-224
353 'VS225' => pack("U", 0xE01D0), # VARIATION SELECTOR-225
354 'VS226' => pack("U", 0xE01D1), # VARIATION SELECTOR-226
355 'VS227' => pack("U", 0xE01D2), # VARIATION SELECTOR-227
356 'VS228' => pack("U", 0xE01D3), # VARIATION SELECTOR-228
357 'VS229' => pack("U", 0xE01D4), # VARIATION SELECTOR-229
358 'VS230' => pack("U", 0xE01D5), # VARIATION SELECTOR-230
359 'VS231' => pack("U", 0xE01D6), # VARIATION SELECTOR-231
360 'VS232' => pack("U", 0xE01D7), # VARIATION SELECTOR-232
361 'VS233' => pack("U", 0xE01D8), # VARIATION SELECTOR-233
362 'VS234' => pack("U", 0xE01D9), # VARIATION SELECTOR-234
363 'VS235' => pack("U", 0xE01DA), # VARIATION SELECTOR-235
364 'VS236' => pack("U", 0xE01DB), # VARIATION SELECTOR-236
365 'VS237' => pack("U", 0xE01DC), # VARIATION SELECTOR-237
366 'VS238' => pack("U", 0xE01DD), # VARIATION SELECTOR-238
367 'VS239' => pack("U", 0xE01DE), # VARIATION SELECTOR-239
368 'VS240' => pack("U", 0xE01DF), # VARIATION SELECTOR-240
369 'VS241' => pack("U", 0xE01E0), # VARIATION SELECTOR-241
370 'VS242' => pack("U", 0xE01E1), # VARIATION SELECTOR-242
371 'VS243' => pack("U", 0xE01E2), # VARIATION SELECTOR-243
372 'VS244' => pack("U", 0xE01E3), # VARIATION SELECTOR-244
373 'VS245' => pack("U", 0xE01E4), # VARIATION SELECTOR-245
374 'VS246' => pack("U", 0xE01E5), # VARIATION SELECTOR-246
375 'VS247' => pack("U", 0xE01E6), # VARIATION SELECTOR-247
376 'VS248' => pack("U", 0xE01E7), # VARIATION SELECTOR-248
377 'VS249' => pack("U", 0xE01E8), # VARIATION SELECTOR-249
378 'VS250' => pack("U", 0xE01E9), # VARIATION SELECTOR-250
379 'VS251' => pack("U", 0xE01EA), # VARIATION SELECTOR-251
380 'VS252' => pack("U", 0xE01EB), # VARIATION SELECTOR-252
381 'VS253' => pack("U", 0xE01EC), # VARIATION SELECTOR-253
382 'VS254' => pack("U", 0xE01ED), # VARIATION SELECTOR-254
383 'VS255' => pack("U", 0xE01EE), # VARIATION SELECTOR-255
384 'VS256' => pack("U", 0xE01EF), # VARIATION SELECTOR-256
385 'WJ' => pack("U", 0x2060), # WORD JOINER
386 'ZWJ' => pack("U", 0x200D), # ZERO WIDTH JOINER
387 'ZWNJ' => pack("U", 0x200C), # ZERO WIDTH NON-JOINER
388 'ZWSP' => pack("U", 0x200B), # ZERO WIDTH SPACE
16036bcd 389 );
52ea3e69 390
232cbbee 391my %deprecated_aliases = (
16036bcd
KW
392 # Pre-3.2 compatibility (only for the first 256 characters).
393 # Use of these gives deprecated message.
fb121860
KW
394 'HORIZONTAL TABULATION' => pack("U", 0x09), # CHARACTER TABULATION
395 'VERTICAL TABULATION' => pack("U", 0x0B), # LINE TABULATION
396 'FILE SEPARATOR' => pack("U", 0x1C), # INFORMATION SEPARATOR FOUR
397 'GROUP SEPARATOR' => pack("U", 0x1D), # INFORMATION SEPARATOR THREE
398 'RECORD SEPARATOR' => pack("U", 0x1E), # INFORMATION SEPARATOR TWO
399 'UNIT SEPARATOR' => pack("U", 0x1F), # INFORMATION SEPARATOR ONE
400 'HORIZONTAL TABULATION SET' => pack("U", 0x88), # CHARACTER TABULATION SET
401 'HORIZONTAL TABULATION WITH JUSTIFICATION' => pack("U", 0x89), # CHARACTER TABULATION WITH JUSTIFICATION
402 'PARTIAL LINE DOWN' => pack("U", 0x8B), # PARTIAL LINE FORWARD
403 'PARTIAL LINE UP' => pack("U", 0x8C), # PARTIAL LINE BACKWARD
404 'VERTICAL TABULATION SET' => pack("U", 0x8A), # LINE TABULATION SET
405 'REVERSE INDEX' => pack("U", 0x8D), # REVERSE LINE FEED
16036bcd 406 );
52ea3e69 407
84374e30 408
cc26ddeb 409my $txt; # The table of official character names
281aa49e 410
84374e30
KW
411my %full_names_cache; # Holds already-looked-up names, so don't have to
412# re-look them up again. The previous versions of charnames had scoping
413# bugs. For example if we use script A in one scope and find and cache
414# what Z resolves to, we can't use that cache in a different scope that
415# uses script B instead of A, as Z might be an entirely different letter
416# there; or there might be different aliases in effect in different
417# scopes, or :short may be in effect or not effect in different scopes,
418# or various combinations thereof. This was solved in this version
419# mostly by moving things to %^H. But some things couldn't be moved
420# there. One of them was the cache of runtime looked-up names, in part
421# because %^H is read-only at runtime. I (khw) don't know why the cache
422# was run-time only in the previous versions: perhaps oversight; perhaps
423# that compile time looking doesn't happen in a loop so didn't think it
424# was worthwhile; perhaps not wanting to make the cache too large. But
425# I decided to make it compile time as well; this could easily be
426# changed.
427# Anyway, this hash is not scoped, and is added to at runtime. It
428# doesn't have scoping problems because the data in it is restricted to
429# official names, which are always invariant, and we only set it and
430# look at it at during :full lookups, so is unaffected by any other
431# scoped options. I put this in to maintain parity with the older
432# version. If desired, a %short_names cache could also be made, as well
433# as one for each script, say in %script_names_cache, with each key
434# being a hash for a script named in a 'use charnames' statement. I
435# decided not to do that for now, just because it's added complication,
436# and because I'm just trying to maintain parity, not extend it.
437
281aa49e
KW
438# Designed so that test decimal first, and then hex. Leading zeros
439# imply non-decimal, as do non-[0-9]
232cbbee
KW
440my $decimal_qr = qr/^[1-9]\d*$/;
441
442# Returns the hex number in $1.
443my $hex_qr = qr/^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/;
423cee85 444
8878f897
T
445sub croak
446{
447 require Carp; goto &Carp::croak;
448} # croak
449
450sub carp
451{
452 require Carp; goto &Carp::carp;
453} # carp
454
cc26ddeb 455sub alias (@) # Set up a single alias
35c0985d 456{
35c0985d 457 my $alias = ref $_[0] ? $_[0] : { @_ };
232cbbee
KW
458 foreach my $name (keys %$alias) {
459 my $value = $alias->{$name};
52fb7278 460 next unless defined $value; # Omit if screwed up.
84374e30
KW
461
462 # Is slightly slower to just after this statement see if it is
463 # decimal, since we already know it is after having converted from
464 # hex, but makes the code easier to maintain, and is called
465 # infrequently, only at compile-time
466 if ($value !~ $decimal_qr && $value =~ $hex_qr) {
467 $value = CORE::hex $1;
468 }
232cbbee 469 if ($value =~ $decimal_qr) {
fb121860
KW
470 no warnings 'utf8'; # Allow even illegal characters
471 $^H{charnames_ord_aliases}{$name} = pack("U", $value);
232cbbee
KW
472
473 # Use a canonical form.
b1c167a3 474 $^H{charnames_inverse_ords}{sprintf("%05X", $value)} = $name;
232cbbee
KW
475 }
476 else {
52fb7278
KW
477 # XXX validate syntax when deprecation cycle complete. ie. start
478 # with an alpha only, etc.
479 $^H{charnames_name_aliases}{$name} = $value;
232cbbee
KW
480 }
481 }
35c0985d
MB
482} # alias
483
5a7fb30a 484sub not_legal_use_bytes_msg {
fb121860
KW
485 my ($name, $utf8) = @_;
486 my $return;
487
488 if (length($utf8) == 1) {
489 $return = sprintf("Character 0x%04x with name '%s' is", ord $utf8, $name);
490 } else {
491 $return = sprintf("String with name '%s' (and ordinals %s) contains character(s)", $name, join(" ", map { sprintf "0x%04X", ord $_ } split(//, $utf8)));
492 }
493 return $return . " above 0xFF with 'use bytes' in effect";
5a7fb30a
KW
494}
495
281aa49e 496sub alias_file ($) # Reads a file containing alias definitions
35c0985d 497{
51cf30b6
MB
498 my ($arg, $file) = @_;
499 if (-f $arg && File::Spec->file_name_is_absolute ($arg)) {
500 $file = $arg;
501 }
502 elsif ($arg =~ m/^\w+$/) {
503 $file = "unicore/${arg}_alias.pl";
504 }
505 else {
506 croak "Charnames alias files can only have identifier characters";
507 }
35c0985d 508 if (my @alias = do $file) {
51cf30b6
MB
509 @alias == 1 && !defined $alias[0] and
510 croak "$file cannot be used as alias file for charnames";
511 @alias % 2 and
512 croak "$file did not return a (valid) list of alias pairs";
35c0985d
MB
513 alias (@alias);
514 return (1);
515 }
516 0;
517} # alias_file
518
03f95285
KW
519# For use when don't import anything. This structure must be kept in
520# sync with the one that import() fills up.
521my %dummy_H = (
522 charnames_stringified_names => "",
523 charnames_stringified_ords => "",
524 charnames_scripts => "",
525 charnames_full => 1,
526 charnames_short => 0,
527 );
528
63098191 529
fb121860
KW
530sub lookup_name ($$$) {
531 my ($name, $wants_ord, $runtime) = @_;
63098191 532
fb121860
KW
533 # Lookup the name or sequence $name in the tables. If $wants_ord is false,
534 # returns the string equivalent of $name; if true, returns the ordinal value
535 # instead, but in this case $name must not be a sequence; otherwise undef is
536 # returned and a warning raised. $runtime is 0 if compiletime, otherwise
537 # gives the number of stack frames to go back to get the application caller
538 # info.
539 # If $name is not found, returns undef in runtime with no warning; and in
540 # compiletime, the Unicode replacement character, with a warning.
63098191 541
fb121860
KW
542 # It looks first in the aliases, then in the large table of official Unicode
543 # names.
84374e30 544
9deebca3 545 my $utf8; # The string result
e79869e1 546 my $save_input;
b177ca84 547
84374e30 548 if ($runtime) {
03f95285 549
fb121860
KW
550 my $hints_ref = (caller($runtime))[10];
551
03f95285
KW
552 # If we didn't import anything (which happens with 'use charnames ()',
553 # substitute a dummy structure.
554 $hints_ref = \%dummy_H if ! defined $hints_ref
555 || ! defined $hints_ref->{charnames_full};
556
84374e30
KW
557 # At runtime, but currently not at compile time, $^H gets
558 # stringified, so un-stringify back to the original data structures.
559 # These get thrown away by perl before the next invocation
560 # Also fill in the hash with the non-stringified data.
03f95285 561 # N.B. New fields must be also added to %dummy_H
84374e30 562
03f95285
KW
563 %{$^H{charnames_name_aliases}} = split ',',
564 $hints_ref->{charnames_stringified_names};
565 %{$^H{charnames_ord_aliases}} = split ',',
566 $hints_ref->{charnames_stringified_ords};
e79869e1 567 $^H{charnames_scripts} = $hints_ref->{charnames_scripts};
84374e30
KW
568 $^H{charnames_full} = $hints_ref->{charnames_full};
569 $^H{charnames_short} = $hints_ref->{charnames_short};
570 }
571
232cbbee 572 # User alias should be checked first or else can't override ours, and if we
9deebca3 573 # were to add any, could conflict with theirs.
84374e30 574 if (exists $^H{charnames_ord_aliases}{$name}) {
f1ccd77d 575 $utf8 = $^H{charnames_ord_aliases}{$name};
16036bcd 576 }
84374e30
KW
577 elsif (exists $^H{charnames_name_aliases}{$name}) {
578 $name = $^H{charnames_name_aliases}{$name};
e79869e1 579 $save_input = $name; # Cache the result for any error message
232cbbee
KW
580 }
581 elsif (exists $system_aliases{$name}) {
f1ccd77d 582 $utf8 = $system_aliases{$name};
52ea3e69 583 }
232cbbee 584 elsif (exists $deprecated_aliases{$name}) {
35c0985d 585 require warnings;
fb121860 586 warnings::warnif('deprecated', "Unicode character name \"$name\" is deprecated, use \"" . viacode(ord $deprecated_aliases{$name}) . "\" instead");
f1ccd77d 587 $utf8 = $deprecated_aliases{$name};
52ea3e69 588 }
b177ca84 589
423cee85 590 my @off;
52ea3e69 591
f1ccd77d 592 if (! defined $utf8) {
35c0985d 593
9deebca3 594 # See if has looked this input up earlier.
84374e30 595 if ($^H{charnames_full} && exists $full_names_cache{$name}) {
f1ccd77d 596 $utf8 = $full_names_cache{$name};
35c0985d 597 }
84374e30 598 else {
35c0985d 599
84374e30
KW
600 ## Suck in the code/name list as a big string.
601 ## Lines look like:
73d9566f 602 ## "00052\tLATIN CAPITAL LETTER R\n"
fb121860
KW
603 # or
604 # "0052 0303\tLATIN CAPITAL LETTER R WITH TILDE\n"
84374e30
KW
605 $txt = do "unicore/Name.pl" unless $txt;
606
607 ## @off will hold the index into the code/name string of the start and
608 ## end of the name as we find it.
609
610 ## If :full, look for the name exactly; runtime implies full
8a684a5b 611 my $found_full_in_table = 0; # Tells us if can cache the result
84374e30 612 if ($^H{charnames_full}) {
5bd59e57
KW
613
614 # See if the name is one which is algorithmically determinable.
615 # The subroutine is included in Name.pl. The table contained in
616 # $txt doesn't contain these. Experiments show that checking
617 # for these before checking for the regular names has no
618 # noticeable impact on performance for the regular names, but
619 # the other way around slows down finding these immensely.
620 # Algorithmically determinables are not placed in the cache (that
621 # $found_full_in_table indicates) because that uses up memory,
622 # and finding these again is fast.
fb121860
KW
623 if (defined (my $ord = name_to_code_point_special($name))) {
624 $utf8 = pack("U", $ord);
625 }
626 else {
5bd59e57
KW
627
628 # Not algorthmically determinable; look up in the table.
73d9566f
KW
629 if ($txt =~ /\t\Q$name\E$/m) {
630 @off = ($-[0] + 1, $+[0]); # The 1 is for the tab
5bd59e57
KW
631 $found_full_in_table = 1;
632 }
52fb7278 633 }
423cee85 634 }
b177ca84 635
e79869e1 636 # If we didn't get it above, keep looking
f1ccd77d 637 if (! $found_full_in_table && ! defined $utf8) {
84374e30 638
dc023ef4 639 # If :short is allowed, see if input is like "greek:Sigma".
e79869e1 640 my $scripts_trie;
52fb7278 641 if (($^H{charnames_short})
dc023ef4
KW
642 && $name =~ /^ \s* (.+?) \s* : \s* (.+?) \s* $ /xs)
643 {
e79869e1
KW
644 $scripts_trie = "\U\Q$1";
645 $name = $2;
dc023ef4 646 }
9deebca3 647 else { # Otherwise look in allowed scripts
e79869e1 648 $scripts_trie = $^H{charnames_scripts};
dc023ef4
KW
649 }
650
e79869e1
KW
651 my $case = $name =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL";
652 if ($txt !~
73d9566f 653 /\t (?: $scripts_trie ) \ (?:$case\ )? LETTER \ \U\Q$name\E $/xm)
e79869e1
KW
654 {
655 # Here we still don't have it, give up.
656 return if $runtime;
52fb7278 657
e79869e1
KW
658 # May have zapped input name, get it again.
659 $name = (defined $save_input) ? $save_input : $_[0];
660 carp "Unknown charname '$name'";
fb121860 661 return ($wants_ord) ? 0xFFFD : pack("U", 0xFFFD);
e79869e1 662 }
52fb7278 663
73d9566f 664 @off = ($-[0] + 1, $+[0]); # The 1 is for the tab
52ea3e69 665 }
35c0985d 666
f1ccd77d 667 if (! defined $utf8) {
b1c167a3 668
fb121860
KW
669 # Here, we haven't set up the output, but we know where in the string
670 # the name starts. The string is set up so that for single characters
671 # (and not named sequences), the name is preceeded immediately by a
672 # tab and 5 hex digits for its code, with a \n before those. Named
673 # sequences won't have the 7th preceeding character be a \n.
674 # (Actually, for the very first entry in the table this isn't strictly
675 # true: subtracting 7 will yield -1, and the substr below will
676 # therefore yield the very last character in the table, which should
677 # also be a \n, so the statement works anyway.)
678 if (substr($txt, $off[0] - 7, 1) eq "\n") {
679 $utf8 = pack("U", CORE::hex substr($txt, $off[0] - 6, 5));
680 }
681 else {
682
683 # Here, is a named sequence. Need to go looking for the beginning,
684 # which is just after the \n from the previous entry in the table.
685 # The +1 skips past that newline, or, if the rindex() fails, to put
686 # us to an offset of zero.
687 my $charstart = rindex($txt, "\n", $off[0] - 7) + 1;
688 $utf8 = pack("U*", map { CORE::hex }
689 split " ", substr($txt, $charstart, $off[0] - $charstart - 1));
690 }
5bd59e57 691 }
84374e30
KW
692
693 # Cache the input so as to not have to search the large table
694 # again, but only if it came from the one search that we cache.
f1ccd77d 695 $full_names_cache{$name} = $utf8 if $found_full_in_table;
35c0985d 696 }
423cee85 697 }
b177ca84 698
63098191 699
fb121860
KW
700 # Here, have the utf8. If the return is to be an ord, must be any single
701 # character.
702 if ($wants_ord) {
703 return ord($utf8) if length $utf8 == 1;
704 }
705 else {
706
707 # Here, wants string output. If utf8 is acceptable, just return what
708 # we've got; otherwise attempt to convert it to non-utf8 and return that.
709 my $in_bytes = ($runtime)
710 ? (caller $runtime)[8] & $bytes::hint_bits
711 : $^H & $bytes::hint_bits;
712 return $utf8 if (! $in_bytes || utf8::downgrade($utf8, 1)) # The 1 arg
713 # means don't die on failure
714 }
715
716 # Here, there is an error: either there are too many characters, or the
717 # result string needs to be non-utf8, and at least one character requires
718 # utf8. Prefer any official name over the input one for the error message.
e79869e1
KW
719 if (@off) {
720 $name = substr($txt, $off[0], $off[1] - $off[0]) if @off;
721 }
722 else {
723 $name = (defined $save_input) ? $save_input : $_[0];
724 }
fb121860
KW
725
726 if ($wants_ord) {
727 # Only way to get here in this case is if result too long. Message
728 # assumes that our only caller that requires single char result is
729 # vianame.
730 carp "charnames::vianame() doesn't handle named sequences ($name). Use charnames::string_vianame() instead";
731 return;
732 }
733
734 # Only other possible failure here is from use bytes.
735 if ($runtime) {
736 carp not_legal_use_bytes_msg($name, $utf8);
737 return;
738 } else {
739 croak not_legal_use_bytes_msg($name, $utf8);
740 }
741
63098191
KW
742} # lookup_name
743
744sub charnames {
63098191 745
9deebca3
KW
746 # For \N{...}. Looks up the character name and returns the string
747 # representation of it.
63098191 748
fb121860
KW
749 # The first 0 arg means wants a string returned; the second that we are in
750 # compile time
751 return lookup_name($_[0], 0, 0);
63098191 752}
423cee85 753
b177ca84
JF
754sub import
755{
756 shift; ## ignore class name
757
35c0985d
MB
758 if (not @_) {
759 carp("`use charnames' needs explicit imports list");
b177ca84 760 }
423cee85 761 $^H{charnames} = \&charnames ;
84374e30
KW
762 $^H{charnames_ord_aliases} = {};
763 $^H{charnames_name_aliases} = {};
764 $^H{charnames_inverse_ords} = {};
03f95285
KW
765 # New fields must be added to %dummy_H, and the code in lookup_name()
766 # that copies fields from the runtime structure
b177ca84
JF
767
768 ##
769 ## fill %h keys with our @_ args.
770 ##
35c0985d 771 my ($promote, %h, @args) = (0);
e5c3f898
MG
772 while (my $arg = shift) {
773 if ($arg eq ":alias") {
51cf30b6 774 @_ or
52fb7278 775 croak ":alias needs an argument in charnames";
35c0985d
MB
776 my $alias = shift;
777 if (ref $alias) {
52fb7278
KW
778 ref $alias eq "HASH" or
779 croak "Only HASH reference supported as argument to :alias";
780 alias ($alias);
781 next;
35c0985d 782 }
51cf30b6 783 if ($alias =~ m{:(\w+)$}) {
52fb7278
KW
784 $1 eq "full" || $1 eq "short" and
785 croak ":alias cannot use existing pragma :$1 (reversed order?)";
786 alias_file ($1) and $promote = 1;
787 next;
35c0985d 788 }
51cf30b6
MB
789 alias_file ($alias);
790 next;
791 }
e5c3f898
MG
792 if (substr($arg, 0, 1) eq ':' and ! ($arg eq ":full" || $arg eq ":short")) {
793 warn "unsupported special '$arg' in charnames";
51cf30b6 794 next;
35c0985d 795 }
e5c3f898 796 push @args, $arg;
35c0985d
MB
797 }
798 @args == 0 && $promote and @args = (":full");
799 @h{@args} = (1) x @args;
b177ca84 800
03f95285
KW
801 $^H{charnames_full} = delete $h{':full'} || 0; # Don't leave undefined,
802 # as tested for in
803 # lookup_names
804 $^H{charnames_short} = delete $h{':short'} || 0;
e79869e1 805 my @scripts = map uc, keys %h;
b177ca84
JF
806
807 ##
808 ## If utf8? warnings are enabled, and some scripts were given,
281aa49e 809 ## see if at least we can find one letter from each script.
b177ca84 810 ##
e79869e1 811 if (warnings::enabled('utf8') && @scripts) {
35c0985d
MB
812 $txt = do "unicore/Name.pl" unless $txt;
813
e79869e1 814 for my $script (@scripts) {
73d9566f 815 if (not $txt =~ m/\t$script (?:CAPITAL |SMALL )?LETTER /) {
52fb7278 816 warnings::warn('utf8', "No such script: '$script'");
e79869e1 817 $script = quotemeta $script; # Escape it, for use in the re.
b177ca84 818 }
35c0985d 819 }
bd62941a 820 }
84374e30
KW
821
822 # %^H gets stringified, so serialize it ourselves so can extract the
823 # real data back later.
824 $^H{charnames_stringified_ords} = join ",", %{$^H{charnames_ord_aliases}};
825 $^H{charnames_stringified_names} = join ",", %{$^H{charnames_name_aliases}};
826 $^H{charnames_stringified_inverse_ords} = join ",", %{$^H{charnames_inverse_ords}};
e79869e1 827 $^H{charnames_scripts} = join "|", @scripts; # Stringifiy them as a trie
35c0985d 828} # import
423cee85 829
84374e30
KW
830# Cache of already looked-up values. This is set to only contain
831# official values, and user aliases can't override them, so scoping is
832# not an issue.
833my %viacode;
63098191
KW
834
835sub viacode {
836
837 # Returns the name of the code point argument
4e2cda5d 838
35c0985d
MB
839 if (@_ != 1) {
840 carp "charnames::viacode() expects one argument";
bd5c3bd9 841 return;
35c0985d 842 }
f0175764 843
35c0985d 844 my $arg = shift;
b177ca84 845
e5432b89
KW
846 # This is derived from Unicode::UCD, where it is nearly the same as the
847 # function _getcode(), but here it makes sure that even a hex argument
848 # has the proper number of leading zeros, which is critical in
849 # matching against $txt below
281aa49e 850 # Must check if decimal first; see comments at that definition
35c0985d 851 my $hex;
232cbbee 852 if ($arg =~ $decimal_qr) {
b1c167a3 853 $hex = sprintf "%05X", $arg;
232cbbee 854 } elsif ($arg =~ $hex_qr) {
e10d7780 855 # Below is the line that differs from the _getcode() source
b1c167a3 856 $hex = sprintf "%05X", hex $1;
35c0985d
MB
857 } else {
858 carp("unexpected arg \"$arg\" to charnames::viacode()");
859 return;
860 }
b177ca84 861
35c0985d 862 return $viacode{$hex} if exists $viacode{$hex};
4e2cda5d 863
ac046fe1
KW
864 # If the code point is above the max in the table, there's no point
865 # looking through it. Checking the length first is slightly faster
866 if (length($hex) <= 5 || CORE::hex($hex) <= 0x10FFFF) {
867 $txt = do "unicore/Name.pl" unless $txt;
b177ca84 868
5bd59e57
KW
869 # See if the name is algorithmically determinable.
870 my $algorithmic = code_point_to_name_special(CORE::hex $hex);
871 if (defined $algorithmic) {
872 $viacode{$hex} = $algorithmic;
873 return $algorithmic;
874 }
875
ac046fe1
KW
876 # Return the official name, if exists. It's unclear to me (khw) at
877 # this juncture if it is better to return a user-defined override, so
878 # leaving it as is for now.
73d9566f 879 if ($txt =~ m/^$hex\t/m) {
f3227b74 880
52fb7278
KW
881 # The name starts with the next character and goes up to the
882 # next new-line. Using capturing parentheses above instead of
883 # @+ more than doubles the execution time in Perl 5.13
f3227b74 884 $viacode{$hex} = substr($txt, $+[0], index($txt, "\n", $+[0]) - $+[0]);
52fb7278 885 return $viacode{$hex};
ac046fe1 886 }
232cbbee
KW
887 }
888
889 # See if there is a user name for it, before giving up completely.
03f95285
KW
890 # First get the scoped aliases, give up if have none.
891 my $H_ref = (caller(0))[10];
892 return if ! defined $H_ref
893 || ! exists $H_ref->{charnames_stringified_inverse_ords};
894
84374e30 895 my %code_point_aliases = split ',',
03f95285 896 $H_ref->{charnames_stringified_inverse_ords};
84374e30 897 if (! exists $code_point_aliases{$hex}) {
ac046fe1
KW
898 if (CORE::hex($hex) > 0x10FFFF) {
899 carp "Unicode characters only allocated up to U+10FFFF (you asked for U+$hex)";
900 }
901 return;
902 }
bd5c3bd9 903
84374e30 904 return $code_point_aliases{$hex};
35c0985d 905} # viacode
daf0d493
JH
906
907sub vianame
908{
35c0985d
MB
909 if (@_ != 1) {
910 carp "charnames::vianame() expects one name argument";
911 return ()
912 }
daf0d493 913
63098191
KW
914 # Looks up the character name and returns its ordinal if
915 # found, undef otherwise.
daf0d493 916
63098191 917 my $arg = shift;
dbc0d4f2 918
63098191 919 if ($arg =~ /^U\+([0-9a-fA-F]+)$/) {
4e2cda5d 920
fb121860
KW
921 # khw claims that this is poor interface design. The function should
922 # return either a an ord or a chr for all inputs; not be bipolar. But
923 # can't change it because of backward compatibility. New code can use
924 # string_vianame() instead.
5a7fb30a
KW
925 my $ord = CORE::hex $1;
926 return chr $ord if $ord <= 255 || ! ((caller 0)[8] & $bytes::hint_bits);
fb121860 927 carp not_legal_use_bytes_msg($arg, chr $ord);
5a7fb30a 928 return;
63098191 929 }
daf0d493 930
fb121860
KW
931 # The first 1 arg means wants an ord returned; the second that we are in
932 # runtime, and this is the first level routine called from the user
933 return lookup_name($arg, 1, 1);
35c0985d 934} # vianame
b177ca84 935
fb121860
KW
936sub string_vianame {
937
938 # Looks up the character name and returns its string representation if
939 # found, undef otherwise.
940
941 if (@_ != 1) {
942 carp "charnames::string_vianame() expects one name argument";
943 return;
944 }
945
946 my $arg = shift;
947
948 if ($arg =~ /^U\+([0-9a-fA-F]+)$/) {
949
950 my $ord = CORE::hex $1;
951 return chr $ord if $ord <= 255 || ! ((caller 0)[8] & $bytes::hint_bits);
952
953 carp not_legal_use_bytes_msg($arg, chr $ord);
954 return;
955 }
956
957 # The 0 arg means wants a string returned; the 1 arg means that we are in
958 # runtime, and this is the first level routine called from the user
959 return lookup_name($arg, 0, 1);
960} # string_vianame
961
962
423cee85
JH
963
9641;
965__END__
966
967=head1 NAME
968
fb121860 969charnames - access to Unicode character names and named character sequences; also define character names
423cee85
JH
970
971=head1 SYNOPSIS
972
973 use charnames ':full';
4a2d328f 974 print "\N{GREEK SMALL LETTER SIGMA} is called sigma.\n";
fb121860
KW
975 print "\N{LATIN CAPITAL LETTER E WITH VERTICAL LINE BELOW}",
976 " is an officially named sequence of two Unicode characters\n";
423cee85
JH
977
978 use charnames ':short';
4a2d328f 979 print "\N{greek:Sigma} is an upper-case sigma.\n";
423cee85
JH
980
981 use charnames qw(cyrillic greek);
4a2d328f 982 print "\N{sigma} is Greek sigma, and \N{be} is Cyrillic b.\n";
423cee85 983
35c0985d
MB
984 use charnames ":full", ":alias" => {
985 e_ACUTE => "LATIN SMALL LETTER E WITH ACUTE",
e5432b89 986 mychar => 0xE8000, # Private use area
76ae0c45 987 };
35c0985d 988 print "\N{e_ACUTE} is a small letter e with an acute.\n";
da9dec57 989 print "\\N{mychar} allows me to name private use characters.\n";
35c0985d 990
76ae0c45 991 use charnames ();
a23c04e4 992 print charnames::viacode(0x1234); # prints "ETHIOPIC SYLLABLE SEE"
16036bcd
KW
993 printf "%04X", charnames::vianame("GOTHIC LETTER AHSA"); # prints
994 # "10330"
fb121860
KW
995 print charnames::vianame("LATIN CAPITAL LETTER A"); # prints 65 on
996 # ASCII platforms;
997 # 193 on EBCDIC
998 print charnames::string_vianame("LATIN CAPITAL LETTER A"); # prints "A"
b177ca84 999
423cee85
JH
1000=head1 DESCRIPTION
1001
da9dec57 1002Pragma C<use charnames> is used to gain access to the names of the
fb121860
KW
1003Unicode characters and named character sequences, and to allow you to define
1004your own character and character sequence names.
1005
1006All forms of the pragma enable use of the following 3 functions:
1007
1008=over
1009
1010=item *
1011
1012L</charnames::string_vianame(I<name>)> for run-time lookup of a
1013either a character name or a named character sequence, returning its string
1014representation
1015
1016=item *
1017
1018L</charnames::vianame(I<name>)> for run-time lookup of a
1019character name (but not a named character sequence) to get its ordinal value
1020(code point)
da9dec57 1021
fb121860 1022=item *
da9dec57 1023
fb121860
KW
1024L</charnames::viacode(I<code>)> for run-time lookup of a code point to get its
1025Unicode name.
1026
1027=back
1028
1029All forms other than C<S<"use charnames ();">> also enable the use of
da9dec57 1030C<\N{I<CHARNAME>}> sequences to compile a Unicode character into a
8ebef31d 1031string, based on its name.
da9dec57
KW
1032
1033Note that C<\N{U+I<...>}>, where the I<...> is a hexadecimal number,
1034also inserts a character into a string, but doesn't require the use of
1035this pragma. The character it inserts is the one whose code point
1036(ordinal value) is equal to the number. For example, C<"\N{U+263a}"> is
1037the Unicode (white background, black foreground) smiley face; it doesn't
1038require this pragma, whereas the equivalent, C<"\N{WHITE SMILING FACE}">
1039does.
1040Also, C<\N{I<...>}> can mean a regex quantifier instead of a character
8ebef31d
KW
1041name, when the I<...> is a number (or comma separated pair of numbers
1042(see L<perlreref/QUANTIFIERS>), and is not related to this pragma.
da9dec57
KW
1043
1044The C<charnames> pragma supports arguments C<:full>, C<:short>, script
1045names and customized aliases. If C<:full> is present, for expansion of
1046C<\N{I<CHARNAME>}>, the string I<CHARNAME> is first looked up in the list of
76ae0c45 1047standard Unicode character names. If C<:short> is present, and
da9dec57
KW
1048I<CHARNAME> has the form C<I<SCRIPT>:I<CNAME>>, then I<CNAME> is looked up
1049as a letter in script I<SCRIPT>. If C<use charnames> is used
1050with script name arguments, then for C<\N{I<CHARNAME>}> the name
1051I<CHARNAME> is looked up as a letter in the given scripts (in the
16036bcd
KW
1052specified order). Customized aliases can override these, and are explained in
1053L</CUSTOM ALIASES>.
423cee85 1054
da9dec57 1055For lookup of I<CHARNAME> inside a given script I<SCRIPTNAME>
d5448623 1056this pragma looks for the names
423cee85
JH
1057
1058 SCRIPTNAME CAPITAL LETTER CHARNAME
1059 SCRIPTNAME SMALL LETTER CHARNAME
1060 SCRIPTNAME LETTER CHARNAME
1061
da9dec57 1062in the table of standard Unicode names. If I<CHARNAME> is lowercase,
daf0d493
JH
1063then the C<CAPITAL> variant is ignored, otherwise the C<SMALL> variant
1064is ignored.
1065
da9dec57
KW
1066Note that C<\N{...}> is compile-time; it's a special form of string
1067constant used inside double-quotish strings; this means that you cannot
4e2cda5d 1068use variables inside the C<\N{...}>. If you want similar run-time
fb121860
KW
1069functionality, use
1070L<charnames::string_vianame()|/charnames::string_vianame(I<name>)>.
423cee85 1071
301a3cda 1072For the C0 and C1 control characters (U+0000..U+001F, U+0080..U+009F)
da9dec57
KW
1073there are no official Unicode names but you can use instead the ISO 6429
1074names (LINE FEED, ESCAPE, and so forth, and their abbreviations, LF,
1f31fcd4
KW
1075ESC, ...). In Unicode 3.2 (as of Perl 5.8) some naming changes took
1076place, and ISO 6429 was updated, see L</ALIASES>.
301a3cda 1077
e5432b89
KW
1078If the input name is unknown, C<\N{NAME}> raises a warning and
1079substitutes the Unicode REPLACEMENT CHARACTER (U+FFFD).
1080
8ebef31d
KW
1081For C<\N{NAME}>, it is a fatal error if C<use bytes> is in effect and the
1082input name is that of a character that won't fit into a byte (i.e., whose
1083ordinal is above 255).
e5432b89 1084
da9dec57
KW
1085Otherwise, any string that includes a C<\N{I<charname>}> or
1086C<S<\N{U+I<code point>}>> will automatically have Unicode semantics (see
1087L<perlunicode/Byte and Character Semantics>).
1088
5ffe0e96 1089=head1 ALIASES
423cee85 1090
5ffe0e96
MB
1091A few aliases have been defined for convenience: instead of having
1092to use the official names
423cee85 1093
5ffe0e96
MB
1094 LINE FEED (LF)
1095 FORM FEED (FF)
1096 CARRIAGE RETURN (CR)
1097 NEXT LINE (NEL)
423cee85 1098
e5432b89 1099(yes, with parentheses), one can use
d5448623 1100
5ffe0e96
MB
1101 LINE FEED
1102 FORM FEED
1103 CARRIAGE RETURN
1104 NEXT LINE
1105 LF
1106 FF
1107 CR
1108 NEL
1109
16036bcd
KW
1110All the other standard abbreviations for the controls, such as C<ACK> for
1111C<ACKNOWLEDGE> also can be used.
1112
5ffe0e96
MB
1113One can also use
1114
1115 BYTE ORDER MARK
1116 BOM
1117
16036bcd
KW
1118and these abbreviations
1119
1120 Abbreviation Full Name
1121
1122 CGJ COMBINING GRAPHEME JOINER
1123 FVS1 MONGOLIAN FREE VARIATION SELECTOR ONE
1124 FVS2 MONGOLIAN FREE VARIATION SELECTOR TWO
1125 FVS3 MONGOLIAN FREE VARIATION SELECTOR THREE
1126 LRE LEFT-TO-RIGHT EMBEDDING
1127 LRM LEFT-TO-RIGHT MARK
1128 LRO LEFT-TO-RIGHT OVERRIDE
1129 MMSP MEDIUM MATHEMATICAL SPACE
1130 MVS MONGOLIAN VOWEL SEPARATOR
1131 NBSP NO-BREAK SPACE
1132 NNBSP NARROW NO-BREAK SPACE
1133 PDF POP DIRECTIONAL FORMATTING
1134 RLE RIGHT-TO-LEFT EMBEDDING
1135 RLM RIGHT-TO-LEFT MARK
1136 RLO RIGHT-TO-LEFT OVERRIDE
1137 SHY SOFT HYPHEN
1138 VS1 VARIATION SELECTOR-1
1139 .
1140 .
1141 .
1142 VS256 VARIATION SELECTOR-256
1143 WJ WORD JOINER
1144 ZWJ ZERO WIDTH JOINER
1145 ZWNJ ZERO WIDTH NON-JOINER
1146 ZWSP ZERO WIDTH SPACE
5ffe0e96
MB
1147
1148For backward compatibility one can use the old names for
1149certain C0 and C1 controls
1150
1151 old new
1152
5ffe0e96
MB
1153 FILE SEPARATOR INFORMATION SEPARATOR FOUR
1154 GROUP SEPARATOR INFORMATION SEPARATOR THREE
16036bcd
KW
1155 HORIZONTAL TABULATION CHARACTER TABULATION
1156 HORIZONTAL TABULATION SET CHARACTER TABULATION SET
1157 HORIZONTAL TABULATION WITH JUSTIFICATION CHARACTER TABULATION
1158 WITH JUSTIFICATION
5ffe0e96
MB
1159 PARTIAL LINE DOWN PARTIAL LINE FORWARD
1160 PARTIAL LINE UP PARTIAL LINE BACKWARD
16036bcd
KW
1161 RECORD SEPARATOR INFORMATION SEPARATOR TWO
1162 REVERSE INDEX REVERSE LINE FEED
1163 UNIT SEPARATOR INFORMATION SEPARATOR ONE
1164 VERTICAL TABULATION LINE TABULATION
1165 VERTICAL TABULATION SET LINE TABULATION SET
5ffe0e96
MB
1166
1167but the old names in addition to giving the character
1168will also give a warning about being deprecated.
423cee85 1169
16036bcd
KW
1170And finally, certain published variants are usable, including some for
1171controls that have no Unicode names:
1172
1f31fcd4
KW
1173 name character
1174
52fb7278 1175 END OF PROTECTED AREA END OF GUARDED AREA, U+0097
1f31fcd4
KW
1176 HIGH OCTET PRESET U+0081
1177 HOP U+0081
1178 IND U+0084
1179 INDEX U+0084
1180 PAD U+0080
1181 PADDING CHARACTER U+0080
1182 PRIVATE USE 1 PRIVATE USE ONE, U+0091
1183 PRIVATE USE 2 PRIVATE USE TWO, U+0092
1184 SGC U+0099
1185 SINGLE GRAPHIC CHARACTER INTRODUCER U+0099
1186 SINGLE-SHIFT 2 SINGLE SHIFT TWO, U+008E
1187 SINGLE-SHIFT 3 SINGLE SHIFT THREE, U+008F
1188 START OF PROTECTED AREA START OF GUARDED AREA, U+0096
16036bcd 1189
35c0985d
MB
1190=head1 CUSTOM ALIASES
1191
1f31fcd4
KW
1192You can add customized aliases to standard (C<:full>) Unicode naming
1193conventions. The aliases override any standard definitions, so, if
da9dec57
KW
1194you're twisted enough, you can change C<"\N{LATIN CAPITAL LETTER A}"> to
1195mean C<"B">, etc.
55bc7d3c
KW
1196
1197Note that an alias should not be something that is a legal curly
1198brace-enclosed quantifier (see L<perlreref/QUANTIFIERS>). For example
e5432b89
KW
1199C<\N{123}> means to match 123 non-newline characters, and is not treated as a
1200charnames alias. Aliases are discouraged from beginning with anything
1201other than an alphabetic character and from containing anything other
1202than alphanumerics, spaces, dashes, parentheses, and underscores.
1203Currently they must be ASCII.
1204
1205An alias can map to either an official Unicode character name or to a
1206numeric code point (ordinal). The latter is useful for assigning names
1207to code points in Unicode private use areas such as U+E800 through
f12d74c0
KW
1208U+F8FF.
1209A numeric code point must be a non-negative integer or a string beginning
1210with C<"U+"> or C<"0x"> with the remainder considered to be a
1211hexadecimal integer. A literal numeric constant must be unsigned; it
1212will be interpreted as hex if it has a leading zero or contains
1213non-decimal hex digits; otherwise it will be interpreted as decimal.
232cbbee 1214
da9dec57 1215Aliases are added either by the use of anonymous hashes:
35c0985d 1216
da9dec57 1217 use charnames ":alias" => {
35c0985d 1218 e_ACUTE => "LATIN SMALL LETTER E WITH ACUTE",
232cbbee 1219 mychar1 => 0xE8000,
35c0985d
MB
1220 };
1221 my $str = "\N{e_ACUTE}";
1222
da9dec57 1223or by using a file containing aliases:
35c0985d 1224
da9dec57 1225 use charnames ":alias" => "pro";
35c0985d 1226
8ebef31d 1227This will try to read C<"unicore/pro_alias.pl"> from the C<@INC> path. This
da9dec57 1228file should return a list in plain perl:
35c0985d
MB
1229
1230 (
1231 A_GRAVE => "LATIN CAPITAL LETTER A WITH GRAVE",
1232 A_CIRCUM => "LATIN CAPITAL LETTER A WITH CIRCUMFLEX",
1233 A_DIAERES => "LATIN CAPITAL LETTER A WITH DIAERESIS",
1234 A_TILDE => "LATIN CAPITAL LETTER A WITH TILDE",
1235 A_BREVE => "LATIN CAPITAL LETTER A WITH BREVE",
1236 A_RING => "LATIN CAPITAL LETTER A WITH RING ABOVE",
1237 A_MACRON => "LATIN CAPITAL LETTER A WITH MACRON",
f12d74c0 1238 mychar2 => "U+E8001",
35c0985d
MB
1239 );
1240
da9dec57
KW
1241Both these methods insert C<":full"> automatically as the first argument (if no
1242other argument is given), and you can give the C<":full"> explicitly as
1243well, like
35c0985d 1244
da9dec57 1245 use charnames ":full", ":alias" => "pro";
35c0985d 1246
8ebef31d
KW
1247Also, both these methods currently allow only a single character to be named.
1248To name a sequence of characters, use a
1249L<custom translator|/CUSTOM TRANSLATORS> (described below).
1250
da9dec57 1251=head1 charnames::viacode(I<code>)
b177ca84
JF
1252
1253Returns the full name of the character indicated by the numeric code.
da9dec57 1254For example,
b177ca84
JF
1255
1256 print charnames::viacode(0x2722);
1257
1258prints "FOUR TEARDROP-SPOKED ASTERISK".
1259
232cbbee 1260The name returned is the official name for the code point, if
8ebef31d 1261available; otherwise your custom alias for it. This means that your
232cbbee
KW
1262alias will only be returned for code points that don't have an official
1263Unicode name (nor Unicode version 1 name), such as private use code
1264points, and the 4 control characters U+0080, U+0081, U+0084, and U+0099.
da9dec57
KW
1265If you define more than one name for the code point, it is indeterminate
1266which one will be returned.
1267
1268The function returns C<undef> if no name is known for the code point.
1269In Unicode the proper name of these is the empty string, which
1270C<undef> stringifies to. (If you ask for a code point past the legal
1271Unicode maximum of U+10FFFF that you haven't assigned an alias to, you
f12d74c0
KW
1272get C<undef> plus a warning.)
1273
1274The input number must be a non-negative integer or a string beginning
1275with C<"U+"> or C<"0x"> with the remainder considered to be a
1276hexadecimal integer. A literal numeric constant must be unsigned; it
1277will be interpreted as hex if it has a leading zero or contains
1278non-decimal hex digits; otherwise it will be interpreted as decimal.
daf0d493 1279
274085e3
PN
1280Notice that the name returned for of U+FEFF is "ZERO WIDTH NO-BREAK
1281SPACE", not "BYTE ORDER MARK".
1282
fb121860 1283=head1 charnames::string_vianame(I<name>)
daf0d493 1284
fb121860
KW
1285This is a runtime equivalent to C<\N{...}>. I<name> can be any expression
1286that evaluates to a name accepted by C<\N{...}> under the L<C<:full>
1287option|/DESCRIPTION> to C<charnames>. In addition, any other options for the
1288controlling C<"use charnames"> in the same scope apply, like any L<script
1289list, C<:short> option|/DESCRIPTION>, or L<custom aliases|/CUSTOM ALIASES> you
1290may have defined.
daf0d493 1291
fb121860
KW
1292The only difference is that if the input name is unknown, C<string_vianame>
1293returns C<undef> instead of the REPLACEMENT CHARACTER and does not raise a
1294warning message.
daf0d493 1295
fb121860
KW
1296=head1 charnames::vianame(I<name>)
1297
1298This is similar to C<string_vianame>. The main difference is that under most
1299circumstances (see L</BUGS> for the others), vianame returns an ordinal code
1300point, whereas C<string_vianame> returns a string. For example,
daf0d493 1301
fb121860 1302 printf "U+%04X", charnames::vianame("FOUR TEARDROP-SPOKED ASTERISK");
b177ca84 1303
fb121860 1304prints "U+2722".
1f31fcd4 1305
fb121860
KW
1306This leads to the other two differences. Since a single code point is
1307returned, the function can't handle named character sequences, as these are
1308composed of multiple characters. And, the code point can be that of any
1309character, even ones that aren't legal under the C<S<use bytes>> pragma,
b177ca84 1310
5ffe0e96 1311=head1 CUSTOM TRANSLATORS
52ea3e69 1312
5ffe0e96
MB
1313The mechanism of translation of C<\N{...}> escapes is general and not
1314hardwired into F<charnames.pm>. A module can install custom
1315translations (inside the scope which C<use>s the module) with the
1316following magic incantation:
52ea3e69 1317
5ffe0e96 1318 sub import {
52fb7278
KW
1319 shift;
1320 $^H{charnames} = \&translator;
5ffe0e96 1321 }
52ea3e69 1322
da9dec57 1323Here translator() is a subroutine which takes I<CHARNAME> as an
5ffe0e96 1324argument, and returns text to insert into the string instead of the
da9dec57 1325C<\N{I<CHARNAME>}> escape. Since the text to insert should be different
5ffe0e96
MB
1326in C<bytes> mode and out of it, the function should check the current
1327state of C<bytes>-flag as in:
52ea3e69 1328
52fb7278 1329 use bytes (); # for $bytes::hint_bits
5ffe0e96 1330 sub translator {
52fb7278
KW
1331 if ($^H & $bytes::hint_bits) {
1332 return bytes_translator(@_);
1333 }
1334 else {
1335 return utf8_translator(@_);
1336 }
5ffe0e96 1337 }
52ea3e69 1338
da9dec57 1339See L</CUSTOM ALIASES> above for restrictions on I<CHARNAME>.
f0175764 1340
1f31fcd4
KW
1341Of course, C<vianame> and C<viacode> would need to be overridden as
1342well.
1343
423cee85
JH
1344=head1 BUGS
1345
8ebef31d
KW
1346vianame normally returns an ordinal code point, but when the input name is of
1347the form C<U+...>, it returns a chr instead. In this case, if C<use bytes> is
1348in effect and the character won't fit into a byte, it returns C<undef> and
1349raises a warning.
55bc7d3c 1350
16036bcd
KW
1351Names must be ASCII characters only, which means that you are out of luck if
1352you want to create aliases in a language where some or all the characters of
1353the desired aliases are non-ASCII.
bee80e93 1354
f12d74c0
KW
1355Since evaluation of the translation function (see L</CUSTOM
1356TRANSLATORS>) happens in the middle of compilation (of a string
1357literal), the translation function should not do any C<eval>s or
1358C<require>s. This restriction should be lifted (but is low priority) in
1359a future version of Perl.
423cee85
JH
1360
1361=cut
0eacc33e 1362
52fb7278 1363# ex: set ts=8 sts=2 sw=2 et: