This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
don't depend on threads to do a watchdog when testing threads
[perl5.git] / lib / charnames.pm
CommitLineData
423cee85 1package charnames;
b177ca84
JF
2use strict;
3use warnings;
51cf30b6 4use File::Spec;
e3ec0a15 5our $VERSION = '1.25';
a03f0b9f 6use unicore::Name; # mktables-generated algorithmically-defined names
b75c8c73 7
52fb7278 8use bytes (); # for $bytes::hint_bits
423cee85 9
38f4139d
KW
10# Translate between Unicode character names and their code points.
11#
12# The official names with their code points are stored in a table in
13# lib/unicore/Name.pl which is read in as a large string (almost 3/4 Mb in
14# Unicode 6.0). Each code point/name combination is separated by a \n in the
15# string. (Some of the CJK and the Hangul syllable names are determined
a03f0b9f
KW
16# instead algorithmically via subroutines stored in
17# lib/unicore/Name.pm). Because of the large size of this table, it isn't
18# converted into hashes for faster lookup.
38f4139d
KW
19#
20# But, user defined aliases are stored in their own hashes, as are Perl
21# extensions to the official names. These are checked first before looking at
22# the official table.
23#
24# Basically, the table is grepped for the input code point (viacode()) or
25# name (the other functions), and the corresponding value on the same line is
26# returned. The grepping is done by turning the input into a regular
27# expression. Thus, the same table does double duty, used by both name and
28# code point lookup. (If we were to have hashes, we would need two, one for
29# each lookup direction.)
30#
31# For loose name matching, the logical thing would be to have a table
32# with all the ignorable characters squeezed out, and then grep it with the
33# similarly-squeezed input name. (And this is in fact how the lookups are
34# done with the small Perl extension hashes.) But since we need to be able to
35# go from code point to official name, the original table would still need to
36# exist. Due to the large size of the table, it was decided to not read
37# another very large string into memory for a second table. Instead, the
38# regular expression of the input name is modified to have optional spaces and
39# dashes between characters. For example, in strict matching, the regular
40# expression would be:
41# qr/\tDIGIT ONE$/m
42# Under loose matching, the blank would be squeezed out, and the re would be:
43# qr/\tD[- ]?I[- ]?G[- ]?I[- ]?T[- ]?O[- ]?N[- ]?E$/m
44# which matches a blank or dash between any characters in the official table.
45#
46# This is also how script lookup is done. Basically the re looks like
47# qr/ (?:LATIN|GREEK|CYRILLIC) (?:SMALL )?LETTER $name/
48# where $name is the loose or strict regex for the remainder of the name.
49
fb121860
KW
50# The hashes are stored as utf8 strings. This makes it easier to deal with
51# sequences. I (khw) also tried making Name.pl utf8, but it slowed things
52# down by a factor of 7. I then tried making Name.pl store the utf8
53# equivalents but not calling them utf8. That led to similar speed as leaving
54# it alone, but since that is harder for a human to parse, I left it as-is.
55
# Perl's own additions to the official Unicode names.  The table is written
# as NAME => code point; every value is turned into the corresponding
# one-character string with pack("U", ...) in a single place at the end.
my %system_aliases = do {
  my %code_point_for = (
    # Synonyms for the icky 3.2 names that have parentheses.
    'LINE FEED'       => 0x0A,  # LINE FEED (LF)
    'FORM FEED'       => 0x0C,  # FORM FEED (FF)
    'CARRIAGE RETURN' => 0x0D,  # CARRIAGE RETURN (CR)
    'NEXT LINE'       => 0x85,  # NEXT LINE (NEL)

    # Some variant names from Wikipedia
    'SINGLE-SHIFT 2'          => 0x8E,
    'SINGLE-SHIFT 3'          => 0x8F,
    'PRIVATE USE 1'           => 0x91,
    'PRIVATE USE 2'           => 0x92,
    'START OF PROTECTED AREA' => 0x96,
    'END OF PROTECTED AREA'   => 0x97,

    # Convenience.  Standard abbreviations for the controls
    'NUL' => 0x00,  # NULL
    'SOH' => 0x01,  # START OF HEADING
    'STX' => 0x02,  # START OF TEXT
    'ETX' => 0x03,  # END OF TEXT
    'EOT' => 0x04,  # END OF TRANSMISSION
    'ENQ' => 0x05,  # ENQUIRY
    'ACK' => 0x06,  # ACKNOWLEDGE
    'BEL' => 0x07,  # ALERT; formerly BELL
    'BS'  => 0x08,  # BACKSPACE
    'HT'  => 0x09,  # HORIZONTAL TABULATION
    'LF'  => 0x0A,  # LINE FEED (LF)
    'VT'  => 0x0B,  # VERTICAL TABULATION
    'FF'  => 0x0C,  # FORM FEED (FF)
    'CR'  => 0x0D,  # CARRIAGE RETURN (CR)
    'SO'  => 0x0E,  # SHIFT OUT
    'SI'  => 0x0F,  # SHIFT IN
    'DLE' => 0x10,  # DATA LINK ESCAPE
    'DC1' => 0x11,  # DEVICE CONTROL ONE
    'DC2' => 0x12,  # DEVICE CONTROL TWO
    'DC3' => 0x13,  # DEVICE CONTROL THREE
    'DC4' => 0x14,  # DEVICE CONTROL FOUR
    'NAK' => 0x15,  # NEGATIVE ACKNOWLEDGE
    'SYN' => 0x16,  # SYNCHRONOUS IDLE
    'ETB' => 0x17,  # END OF TRANSMISSION BLOCK
    'CAN' => 0x18,  # CANCEL
    'EOM' => 0x19,  # END OF MEDIUM
    'SUB' => 0x1A,  # SUBSTITUTE
    'ESC' => 0x1B,  # ESCAPE
    'FS'  => 0x1C,  # FILE SEPARATOR
    'GS'  => 0x1D,  # GROUP SEPARATOR
    'RS'  => 0x1E,  # RECORD SEPARATOR
    'US'  => 0x1F,  # UNIT SEPARATOR
    'DEL' => 0x7F,  # DELETE
    'BPH' => 0x82,  # BREAK PERMITTED HERE
    'NBH' => 0x83,  # NO BREAK HERE
    'NEL' => 0x85,  # NEXT LINE (NEL)
    'SSA' => 0x86,  # START OF SELECTED AREA
    'ESA' => 0x87,  # END OF SELECTED AREA
    'HTS' => 0x88,  # CHARACTER TABULATION SET
    'HTJ' => 0x89,  # CHARACTER TABULATION WITH JUSTIFICATION
    'VTS' => 0x8A,  # LINE TABULATION SET
    'PLD' => 0x8B,  # PARTIAL LINE FORWARD
    'PLU' => 0x8C,  # PARTIAL LINE BACKWARD
    'RI'  => 0x8D,  # REVERSE LINE FEED
    'SS2' => 0x8E,  # SINGLE SHIFT TWO
    'SS3' => 0x8F,  # SINGLE SHIFT THREE
    'DCS' => 0x90,  # DEVICE CONTROL STRING
    'PU1' => 0x91,  # PRIVATE USE ONE
    'PU2' => 0x92,  # PRIVATE USE TWO
    'STS' => 0x93,  # SET TRANSMIT STATE
    'CCH' => 0x94,  # CANCEL CHARACTER
    'MW'  => 0x95,  # MESSAGE WAITING
    'SPA' => 0x96,  # START OF GUARDED AREA
    'EPA' => 0x97,  # END OF GUARDED AREA
    'SOS' => 0x98,  # START OF STRING
    'SCI' => 0x9A,  # SINGLE CHARACTER INTRODUCER
    'CSI' => 0x9B,  # CONTROL SEQUENCE INTRODUCER
    'ST'  => 0x9C,  # STRING TERMINATOR
    'OSC' => 0x9D,  # OPERATING SYSTEM COMMAND
    'PM'  => 0x9E,  # PRIVACY MESSAGE
    'APC' => 0x9F,  # APPLICATION PROGRAM COMMAND

    # There are no names for these in the Unicode standard; perhaps should be
    # deprecated, but then again there are no alternative names, so am not
    # deprecating.  And if did, the code would have to change to not recommend
    # an alternative for these.
    'PADDING CHARACTER'                   => 0x80,
    'PAD'                                 => 0x80,
    'HIGH OCTET PRESET'                   => 0x81,
    'HOP'                                 => 0x81,
    'INDEX'                               => 0x84,
    'IND'                                 => 0x84,
    'SINGLE GRAPHIC CHARACTER INTRODUCER' => 0x99,
    'SGC'                                 => 0x99,

    # More convenience.  For further convenience, it is suggested some way of
    # using the NamesList aliases be implemented, but there are ambiguities in
    # NamesList.txt
    'BOM'             => 0xFEFF,  # BYTE ORDER MARK
    'BYTE ORDER MARK' => 0xFEFF,
    'CGJ'             => 0x034F,  # COMBINING GRAPHEME JOINER
    'FVS1'            => 0x180B,  # MONGOLIAN FREE VARIATION SELECTOR ONE
    'FVS2'            => 0x180C,  # MONGOLIAN FREE VARIATION SELECTOR TWO
    'FVS3'            => 0x180D,  # MONGOLIAN FREE VARIATION SELECTOR THREE
    'LRE'             => 0x202A,  # LEFT-TO-RIGHT EMBEDDING
    'LRM'             => 0x200E,  # LEFT-TO-RIGHT MARK
    'LRO'             => 0x202D,  # LEFT-TO-RIGHT OVERRIDE
    'MMSP'            => 0x205F,  # MEDIUM MATHEMATICAL SPACE
    'MVS'             => 0x180E,  # MONGOLIAN VOWEL SEPARATOR
    'NBSP'            => 0x00A0,  # NO-BREAK SPACE
    'NNBSP'           => 0x202F,  # NARROW NO-BREAK SPACE
    'PDF'             => 0x202C,  # POP DIRECTIONAL FORMATTING
    'RLE'             => 0x202B,  # RIGHT-TO-LEFT EMBEDDING
    'RLM'             => 0x200F,  # RIGHT-TO-LEFT MARK
    'RLO'             => 0x202E,  # RIGHT-TO-LEFT OVERRIDE
    'SHY'             => 0x00AD,  # SOFT HYPHEN
    'WJ'              => 0x2060,  # WORD JOINER
    'ZWJ'             => 0x200D,  # ZERO WIDTH JOINER
    'ZWNJ'            => 0x200C,  # ZERO WIDTH NON-JOINER
    'ZWSP'            => 0x200B,  # ZERO WIDTH SPACE
  );

  # VS1 .. VS16 abbreviate VARIATION SELECTOR-1 .. -16, which occupy the
  # consecutive code points U+FE00 .. U+FE0F.
  $code_point_for{"VS$_"} = 0xFE00 + $_ - 1 for 1 .. 16;

  # VS17 .. VS256 abbreviate VARIATION SELECTOR-17 .. -256, which occupy the
  # consecutive code points U+E0100 .. U+E01EF.
  $code_point_for{"VS$_"} = 0xE0100 + $_ - 17 for 17 .. 256;

  map { ( $_ => pack("U", $code_point_for{$_}) ) } keys %code_point_for;
};
52ea3e69 429
38f4139d
KW
430# These are the aliases above that differ under :loose and :full matching
431# because the :full versions have blanks or hyphens in them.
# The :loose spellings (blanks and hyphens squeezed out) of the system
# aliases whose :full spelling contains a blank or hyphen.  Written as
# NAME => code point and converted to strings with pack("U", ...) at the end.
my %loose_system_aliases = do {
  my %code_point_for = (
    'LINEFEED'                         => 0x0A,
    'FORMFEED'                         => 0x0C,
    'CARRIAGERETURN'                   => 0x0D,
    'NEXTLINE'                         => 0x85,
    'SINGLESHIFT2'                     => 0x8E,
    'SINGLESHIFT3'                     => 0x8F,
    'PRIVATEUSE1'                      => 0x91,
    'PRIVATEUSE2'                      => 0x92,
    'STARTOFPROTECTEDAREA'             => 0x96,
    'ENDOFPROTECTEDAREA'               => 0x97,
    'PADDINGCHARACTER'                 => 0x80,
    'HIGHOCTETPRESET'                  => 0x81,
    'SINGLEGRAPHICCHARACTERINTRODUCER' => 0x99,
    'BYTEORDERMARK'                    => 0xFEFF,
  );

  map { ( $_ => pack("U", $code_point_for{$_}) ) } keys %code_point_for;
};
448
# Old names that are still accepted.  Written as NAME => code point and
# converted to strings with pack("U", ...) at the end; the trailing comment
# on each entry gives the current official name.
my %deprecated_aliases = do {
  my %code_point_for = (
    # Pre-3.2 compatibility (only for the first 256 characters).
    # Use of these gives deprecated message.
    'HORIZONTAL TABULATION'                    => 0x09,  # CHARACTER TABULATION
    'VERTICAL TABULATION'                      => 0x0B,  # LINE TABULATION
    'FILE SEPARATOR'                           => 0x1C,  # INFORMATION SEPARATOR FOUR
    'GROUP SEPARATOR'                          => 0x1D,  # INFORMATION SEPARATOR THREE
    'RECORD SEPARATOR'                         => 0x1E,  # INFORMATION SEPARATOR TWO
    'UNIT SEPARATOR'                           => 0x1F,  # INFORMATION SEPARATOR ONE
    'HORIZONTAL TABULATION SET'                => 0x88,  # CHARACTER TABULATION SET
    'HORIZONTAL TABULATION WITH JUSTIFICATION' => 0x89,  # CHARACTER TABULATION WITH JUSTIFICATION
    'PARTIAL LINE DOWN'                        => 0x8B,  # PARTIAL LINE FORWARD
    'PARTIAL LINE UP'                          => 0x8C,  # PARTIAL LINE BACKWARD
    'VERTICAL TABULATION SET'                  => 0x8A,  # LINE TABULATION SET
    'REVERSE INDEX'                            => 0x8D,  # REVERSE LINE FEED

    # Unicode 6.0 co-opted this for U+1F514, so deprecate it for now.
    'BELL'                                     => 0x07,
  );

  map { ( $_ => pack("U", $code_point_for{$_}) ) } keys %code_point_for;
};
52ea3e69 468
38f4139d
KW
# The :loose spellings (blanks squeezed out) of the multi-word deprecated
# names.  Written as NAME => code point and converted to strings with
# pack("U", ...) at the end.
my %loose_deprecated_aliases = do {
  my %code_point_for = (
    'HORIZONTALTABULATION'                  => 0x09,
    'VERTICALTABULATION'                    => 0x0B,
    'FILESEPARATOR'                         => 0x1C,
    'GROUPSEPARATOR'                        => 0x1D,
    'RECORDSEPARATOR'                       => 0x1E,
    'UNITSEPARATOR'                         => 0x1F,
    'HORIZONTALTABULATIONSET'               => 0x88,
    'HORIZONTALTABULATIONWITHJUSTIFICATION' => 0x89,
    'PARTIALLINEDOWN'                       => 0x8B,
    'PARTIALLINEUP'                         => 0x8C,
    'VERTICALTABULATIONSET'                 => 0x8A,
    'REVERSEINDEX'                          => 0x8D,
  );

  map { ( $_ => pack("U", $code_point_for{$_}) ) } keys %code_point_for;
};
483
484# These are special cased in :loose matching, differing only in a medial
485# hyphen
486my $HANGUL_JUNGSEONG_O_E_utf8 = pack("U", 0x1180);
487my $HANGUL_JUNGSEONG_OE_utf8 = pack("U", 0x116C);
488
84374e30 489
cc26ddeb 490my $txt; # The table of official character names
281aa49e 491
84374e30
KW
492my %full_names_cache; # Holds already-looked-up names, so don't have to
493# re-look them up again. The previous versions of charnames had scoping
494# bugs. For example if we use script A in one scope and find and cache
495# what Z resolves to, we can't use that cache in a different scope that
496# uses script B instead of A, as Z might be an entirely different letter
497# there; or there might be different aliases in effect in different
498# scopes, or :short may be in effect or not in effect in different scopes,
499# or various combinations thereof. This was solved in this version
500# mostly by moving things to %^H. But some things couldn't be moved
501# there. One of them was the cache of runtime looked-up names, in part
502# because %^H is read-only at runtime. I (khw) don't know why the cache
503# was run-time only in the previous versions: perhaps oversight; perhaps
504# that compile time looking doesn't happen in a loop so didn't think it
505# was worthwhile; perhaps not wanting to make the cache too large. But
506# I decided to make it compile time as well; this could easily be
507# changed.
508# Anyway, this hash is not scoped, and is added to at runtime. It
509# doesn't have scoping problems because the data in it is restricted to
510# official names, which are always invariant, and we only set it and
511# look at it during :full lookups, so it is unaffected by any other
512# scoped options. I put this in to maintain parity with the older
513# version. If desired, a %short_names cache could also be made, as well
514# as one for each script, say in %script_names_cache, with each key
515# being a hash for a script named in a 'use charnames' statement. I
516# decided not to do that for now, just because it's added complication,
517# and because I'm just trying to maintain parity, not extend it.
518
38f4139d
KW
519# Like %full_names_cache, but for use when :loose is in effect. There needs
520# to be two caches because :loose may not be in effect for a scope, and a
521# loose name could inappropriately be returned when only exact matching is
522# called for.
523my %loose_names_cache;
524
281aa49e
KW
# Recognizes a decimal number.  The two patterns are designed so that a value
# is tested against this one first and against the hex one second: a leading
# zero, or any character outside [0-9], means the value is not decimal.
my $decimal_qr = qr/^[1-9]\d*$/;

# Recognizes a hex number, optionally preceded by "U+", "u+", "0x" or "0X".
# The hex digits themselves (without the prefix) are returned in $1.
my $hex_qr = qr/^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/;
423cee85 531
8878f897
T
# Die with the caller-supplied message via Carp::croak.  Carp is only loaded
# when this is first called, and the tail call passes the current @_ on
# unchanged.
sub croak
{
  require Carp;
  my $carp_croak = \&Carp::croak;
  goto &$carp_croak;
} # croak
536
# Warn with the caller-supplied message via Carp::carp.  Carp is only loaded
# when this is first called, and the tail call passes the current @_ on
# unchanged.
sub carp
{
  require Carp;
  my $carp_carp = \&Carp::carp;
  goto &$carp_carp;
} # carp
541
# Record the given aliases in %^H.
#
# The aliases are passed either as a single hash reference or as a flat list
# of NAME => VALUE pairs.  For each pair:
#   * an undefined VALUE is silently skipped;
#   * a VALUE that is a decimal number, or a hex number (optionally prefixed
#     by U+ or 0x), is a code point: NAME is mapped to that character in
#     $^H{charnames_ord_aliases}, and the code point (as at least five
#     upper-case hex digits) is mapped back to NAME in
#     $^H{charnames_inverse_ords};
#   * any other VALUE is taken to be another character name, and is stored
#     in $^H{charnames_name_aliases}.
sub alias (@) # Set up a single alias
{
  my $alias = ref $_[0] ? $_[0] : { @_ };
  foreach my $name (keys %$alias) {
    my $value = $alias->{$name};
    next unless defined $value;          # Omit if screwed up.

    # Test for decimal first, and only then for hex; see the definitions of
    # the two patterns for why the order matters.
    my $is_code_point = $value =~ $decimal_qr;
    if (! $is_code_point && $value =~ $hex_qr) {
      $value = CORE::hex $1;

      # A hex value of zero (e.g. "U+0000") converts to 0, which $decimal_qr
      # rejects because it requires a leading non-zero digit.  It is
      # nonetheless a code point, not the name "0".  Anything else is
      # re-checked against $decimal_qr exactly as before.
      $is_code_point = $value == 0 || $value =~ $decimal_qr;
    }

    if ($is_code_point) {
      no warnings qw(non_unicode surrogate nonchar); # Allow any non-malformed
      $^H{charnames_ord_aliases}{$name} = pack("U", $value);

      # Use a canonical form.
      $^H{charnames_inverse_ords}{sprintf("%05X", $value)} = $name;
    }
    else {
      # XXX validate syntax when deprecation cycle complete. ie. start
      # with an alpha only, etc.
      $^H{charnames_name_aliases}{$name} = $value;
    }
  }
} # alias
570
# Build the text of the message saying that the character or string that
# $name resolved to cannot be used while 'use bytes' is in effect.
#   $name  the character name that was looked up
#   $utf8  the string it resolved to
# Returns the message; nothing is raised here.
sub not_legal_use_bytes_msg {
  my ($name, $utf8) = @_;

  my $subject;
  if (length($utf8) != 1) {
    # Not a single character: list the ordinal of every character.
    my @ordinals = map { sprintf "0x%04X", ord $_ } split(//, $utf8);
    $subject = sprintf("String with name '%s' (and ordinals %s) contains character(s)", $name, join(" ", @ordinals));
  }
  else {
    $subject = sprintf("Character 0x%04x with name '%s' is", ord $utf8, $name);
  }

  return $subject . " above 0xFF with 'use bytes' in effect";
}
582
# Load alias definitions from a file and install them with alias().
#
# $arg is either the absolute path of an existing plain file, or a bare
# identifier NAME, in which case "unicore/NAME_alias.pl" is loaded.  Anything
# else croaks.  The file is evaluated with 'do' and must return a list of
# alias pairs.
#
# Returns 1 if aliases were installed, 0 if the file yielded an empty list.
# Croaks if the file yielded a single undefined value or an odd number of
# elements.
sub alias_file ($) # Reads a file containing alias definitions
{
  my ($arg, $file) = @_;

  my $is_absolute_file = -f $arg && File::Spec->file_name_is_absolute ($arg);
  unless ($is_absolute_file || $arg =~ m/^\w+$/) {
    croak("Charnames alias files can only have identifier characters");
  }
  $file = $is_absolute_file ? $arg : "unicore/${arg}_alias.pl";

  my @pairs = do $file;
  return 0 unless @pairs;

  if (@pairs == 1 && !defined $pairs[0]) {
    croak("$file cannot be used as alias file for charnames");
  }
  if (@pairs % 2) {
    croak("$file did not return a (valid) list of alias pairs");
  }

  alias (@pairs);
  return (1);
} # alias_file
605
03f95285
KW
# Stand-in for the caller's hints hash.  lookup_name() substitutes this at run
# time when the calling scope has no usable charnames hints (which happens
# with 'use charnames ()').  This structure must be kept in sync with the one
# that import() fills up.
my %dummy_H = (
                charnames_stringified_names => "",
                charnames_stringified_ords => "",
                charnames_scripts => "",
                charnames_full => 1,
                charnames_loose => 0,
                charnames_short => 0,
              );
616
63098191 617
fb121860
KW
sub lookup_name ($$$) {
  my ($name, $wants_ord, $runtime) = @_;

  # Lookup the name or sequence $name in the tables.  If $wants_ord is false,
  # returns the string equivalent of $name; if true, returns the ordinal value
  # instead, but in this case $name must not be a sequence; otherwise undef is
  # returned and a warning raised.  $runtime is 0 if compiletime, otherwise
  # gives the number of stack frames to go back to get the application caller
  # info.
  # If $name is not found, returns undef in runtime with no warning; and in
  # compiletime, the Unicode replacement character, with a warning.
  # If 'use bytes' is in effect in the relevant scope and the string result
  # can't be downgraded to bytes, this carps and returns nothing at runtime,
  # and croaks at compiletime.

  # It looks first in the aliases, then in the large table of official Unicode
  # names.

  my $utf8;       # The string result
  my $save_input; # Name to use in error messages once $name gets changed

  if ($runtime) {

    # Element 10 of caller() is the %^H hints hash that was in effect when
    # the calling frame was compiled.
    my $hints_ref = (caller($runtime))[10];

    # If we didn't import anything (which happens with 'use charnames ()'),
    # substitute a dummy structure.
    $hints_ref = \%dummy_H if ! defined $hints_ref
                              || (! defined $hints_ref->{charnames_full}
                                  && ! defined $hints_ref->{charnames_loose});

    # At runtime, but currently not at compile time, $^H gets
    # stringified, so un-stringify back to the original data structures.
    # These get thrown away by perl before the next invocation
    # Also fill in the hash with the non-stringified data.
    # N.B.  New fields must be also added to %dummy_H

    %{$^H{charnames_name_aliases}} = split ',',
                                      $hints_ref->{charnames_stringified_names};
    %{$^H{charnames_ord_aliases}} = split ',',
                                      $hints_ref->{charnames_stringified_ords};
    $^H{charnames_scripts} = $hints_ref->{charnames_scripts};
    $^H{charnames_full} = $hints_ref->{charnames_full};
    $^H{charnames_loose} = $hints_ref->{charnames_loose};
    $^H{charnames_short} = $hints_ref->{charnames_short};
  }

  my $loose = $^H{charnames_loose};
  my $lookup_name;  # Input name suitably modified for grepping for in the
                    # table

  # User alias should be checked first or else can't override ours, and if we
  # were to add any, could conflict with theirs.
  if (exists $^H{charnames_ord_aliases}{$name}) {
    $utf8 = $^H{charnames_ord_aliases}{$name};
  }
  elsif (exists $^H{charnames_name_aliases}{$name}) {
    $name = $^H{charnames_name_aliases}{$name};
    $save_input = $lookup_name = $name;  # Cache the result for any error
                                         # message
    # The aliases are documented to not match loosely, so change loose match
    # into full.
    if ($loose) {
      $loose = 0;
      $^H{charnames_full} = 1;
    }
  }
  else {

    # Here, not a user alias.  That means that loose matching may be in
    # effect; will have to modify the input name.
    $lookup_name = $name;
    if ($loose) {
      $lookup_name = uc $lookup_name;

      # Squeeze out all underscores
      $lookup_name =~ s/_//g;

      # Remove all medial hyphens
      $lookup_name =~ s/ (?<= \S  ) - (?= \S  )//gx;

      # Squeeze out all spaces
      $lookup_name =~ s/\s//g;
    }

    # Here, $lookup_name has been modified as necessary for looking in the
    # hashes.  Check the system alias files next.  Most of these aliases are
    # the same for both strict and loose matching.  To save space, the ones
    # which differ are in their own separate hash, which is checked if loose
    # matching is selected and the regular match fails.  To save time, the
    # loose hashes could be expanded to include all aliases, and there would
    # only have to be one check.  But if someone specifies :loose, they are
    # interested in convenience over speed, and the time for this second check
    # is minuscule compared to the rest of the routine.
    if (exists $system_aliases{$lookup_name}) {
      $utf8 = $system_aliases{$lookup_name};
    }
    elsif ($loose && exists $loose_system_aliases{$lookup_name}) {
      $utf8 = $loose_system_aliases{$lookup_name};
    }
    elsif (exists $deprecated_aliases{$lookup_name}) {
      require warnings;
      warnings::warnif('deprecated',
                       "Unicode character name \"$name\" is deprecated, use \""
                       . viacode(ord $deprecated_aliases{$lookup_name})
                       . "\" instead");
      $utf8 = $deprecated_aliases{$lookup_name};
    }
    elsif ($loose && exists $loose_deprecated_aliases{$lookup_name}) {
      require warnings;
      warnings::warnif('deprecated',
                       "Unicode character name \"$name\" is deprecated, use \""
                       . viacode(ord $loose_deprecated_aliases{$lookup_name})
                       . "\" instead");
      $utf8 = $loose_deprecated_aliases{$lookup_name};
    }
  }

  my @off;  # Offsets into table of pattern match begin and end

  # If haven't found it yet...
  if (! defined $utf8) {

    # See if has looked this input up earlier.
    if (! $loose && $^H{charnames_full} && exists $full_names_cache{$name}) {
      $utf8 = $full_names_cache{$name};
    }
    elsif ($loose && exists $loose_names_cache{$name}) {
      $utf8 = $loose_names_cache{$name};
    }
    else { # Here, must do a look-up

      # If full or loose matching succeeded, points to where to cache the
      # result
      my $cache_ref;

      ## Suck in the code/name list as a big string.
      ## Lines look like:
      ##     "00052\tLATIN CAPITAL LETTER R\n"
      # or
      #      "0052 0303\tLATIN CAPITAL LETTER R WITH TILDE\n"
      $txt = do "unicore/Name.pl" unless $txt;

      ## @off will hold the index into the code/name string of the start and
      ## end of the name as we find it.

      ## If :loose, look for a loose match; if :full, look for the name
      ## exactly
      # First, see if the name is one which is algorithmically determinable.
      # The subroutine comes from unicore::Name (lib/unicore/Name.pm).  The
      # table contained in $txt doesn't contain these.  Experiments show that
      # checking for these before checking for the regular names has no
      # noticeable impact on performance for the regular names, but
      # the other way around slows down finding these immensely.
      # Algorithmically determinables are not placed in the cache because
      # that uses up memory, and finding these again is fast.
      if (($loose || $^H{charnames_full})
          && (defined (my $ord = name_to_code_point_special($lookup_name, $loose))))
      {
        $utf8 = pack("U", $ord);
      }
      else {

        # Not algorithmically determinable; look up in the table.  The name
        # will be turned into a regex, so quote any meta characters.
        $lookup_name = quotemeta $lookup_name;

        if ($loose) {

          # For loose matches, $lookup_name has already squeezed out the
          # non-essential characters.  We have to add in code to make the
          # squeezed version match the non-squeezed equivalent in the table.
          # The only remaining hyphens are ones that start or end a word in
          # the original.  They have been quoted in $lookup_name so they look
          # like "\-".  Change all other characters except the backslash
          # quotes for any metacharacters, and the final character, so that
          # e.g., COLON gets transformed into: /C[- ]?O[- ]?L[- ]?O[- ]?N/
          $lookup_name =~ s/ (?! \\ -)    # Don't do this to the \- sequence
                             ( [^-\\] )   # Nor the "-" within that sequence,
                                          # nor the "\" that quotes metachars,
                                          # but otherwise put the char into $1
                             (?=.)        # And don't do it for the final char
                           /$1\[- \]?/gx; # And add an optional blank or
                                          # '-' after each $1 char

          # Those remaining hyphens were originally at the beginning or end of
          # a word, so they can match either a blank before or after, but not
          # both.  (Keep in mind that they have been quoted, so are a '\-'
          # sequence)
          $lookup_name =~ s/\\ -/(?:- | -)/xg;
        }

        # Do the lookup in the full table if asked for, and if succeeds
        # save the offsets and set where to cache the result.
        if (($loose || $^H{charnames_full}) && $txt =~ /\t$lookup_name$/m) {
          @off = ($-[0] + 1, $+[0]);    # The 1 is for the tab
          $cache_ref = ($loose) ? \%loose_names_cache : \%full_names_cache;
        }
        else {

          # Here, didn't look for, or didn't find the name.
          # If :short is allowed, see if input is like "greek:Sigma".
          # Keep in mind that $lookup_name has had the metas quoted.
          my $scripts_trie = "";
          my $name_has_uppercase;
          if (($^H{charnames_short})
              && $lookup_name =~ /^ (?: \\ \s)*   # Quoted space
                                    (.+?)         # $1 = the script
                                    (?: \\ \s)*
                                    \\ :          # Quoted colon
                                    (?: \\ \s)*
                                    (.+?)         # $2 = the name
                                    (?: \\ \s)* $
                                  /xs)
          {
            # Even in non-loose matching, the script traditionally has been
            # case insensitive
            $scripts_trie = "\U$1";
            $lookup_name = $2;

            # Use original name to find its input casing, but ignore the
            # script part of that to make the determination.
            $save_input = $name if ! defined $save_input;
            $name =~ s/.*?://;
            $name_has_uppercase = $name =~ /[[:upper:]]/;
          }
          else { # Otherwise look in allowed scripts
            $scripts_trie = $^H{charnames_scripts};

            # Use original name to find its input casing
            $name_has_uppercase = $name =~ /[[:upper:]]/;
          }

          # Any upper case in the input selects the CAPITAL variant,
          # otherwise the SMALL one; names with neither word ("script LETTER
          # name") match either way because the case word is optional below.
          my $case = $name_has_uppercase ? "CAPITAL" : "SMALL";
          if (! $scripts_trie
              || $txt !~
              /\t (?: $scripts_trie ) \ (?:$case\ )? LETTER \ \U$lookup_name $/xm)
          {
            # Here we still don't have it, give up.
            return if $runtime;

            # May have zapped input name, get it again.
            $name = (defined $save_input) ? $save_input : $_[0];
            carp "Unknown charname '$name'";
            return ($wants_ord) ? 0xFFFD : pack("U", 0xFFFD);
          }

          # Here have found the input name in the table.
          @off = ($-[0] + 1, $+[0]);  # The 1 is for the tab
        }

        # Here, the input name has been found; we haven't set up the output,
        # but we know where in the string
        # the name starts.  The string is set up so that for single characters
        # (and not named sequences), the name is preceded immediately by a
        # tab and 5 hex digits for its code, with a \n before those.  Named
        # sequences won't have the 7th preceding character be a \n.
        # (Actually, for the very first entry in the table this isn't strictly
        # true: subtracting 7 will yield -1, and the substr below will
        # therefore yield the very last character in the table, which should
        # also be a \n, so the statement works anyway.)
        if (substr($txt, $off[0] - 7, 1) eq "\n") {
          $utf8 = pack("U", CORE::hex substr($txt, $off[0] - 6, 5));

          # Handle the single loose matching special case, in which two names
          # differ only by a single medial hyphen.  If the original had a
          # hyphen (or more) in the right place, then it is that one.
          $utf8 = $HANGUL_JUNGSEONG_O_E_utf8
            if $loose
               && $utf8 eq $HANGUL_JUNGSEONG_OE_utf8
               && $name =~ m/O \s* - [-\s]* E/ix;
               # Note that this wouldn't work if there were a 2nd
               # OE in the name
        }
        else {

          # Here, is a named sequence.  Need to go looking for the beginning,
          # which is just after the \n from the previous entry in the table.
          # The +1 skips past that newline, or, if the rindex() fails, to put
          # us to an offset of zero.
          my $charstart = rindex($txt, "\n", $off[0] - 7) + 1;
          $utf8 = pack("U*", map { CORE::hex }
              split " ", substr($txt, $charstart, $off[0] - $charstart - 1));
        }
      }

      # Cache the input so as to not have to search the large table
      # again, but only if it came from the one search that we cache.
      # (Haven't bothered with the pain of sorting out scoping issues for the
      # scripts searches.)
      $cache_ref->{$name} = $utf8 if defined $cache_ref;
    }
  }


  # Here, have the utf8.  If the return is to be an ord, must be any single
  # character.
  if ($wants_ord) {
    return ord($utf8) if length $utf8 == 1;
  }
  else {

    # Here, wants string output.  If utf8 is acceptable, just return what
    # we've got; otherwise attempt to convert it to non-utf8 and return that.
    # (Element 8 of caller() is the $^H hint bits of the calling frame.)
    my $in_bytes = ($runtime)
                   ? (caller $runtime)[8] & $bytes::hint_bits
                   : $^H & $bytes::hint_bits;
    return $utf8 if (! $in_bytes || utf8::downgrade($utf8, 1)) # The 1 arg
                                                  # means don't die on failure
  }

  # Here, there is an error:  either there are too many characters, or the
  # result string needs to be non-utf8, and at least one character requires
  # utf8.  Prefer any official name over the input one for the error message.
  if (@off) {
    $name = substr($txt, $off[0], $off[1] - $off[0]) if @off;
  }
  else {
    $name = (defined $save_input) ? $save_input : $_[0];
  }

  if ($wants_ord) {
    # Only way to get here in this case is if result too long.  Message
    # assumes that our only caller that requires single char result is
    # vianame.
    carp "charnames::vianame() doesn't handle named sequences ($name).  Use charnames::string_vianame() instead";
    return;
  }

  # Only other possible failure here is from use bytes.
  if ($runtime) {
    carp not_legal_use_bytes_msg($name, $utf8);
    return;
  } else {
    croak not_legal_use_bytes_msg($name, $utf8);
  }

} # lookup_name
953
sub charnames {

  # The \N{...} handler: import() stores a reference to this sub in
  # $^H{charnames}.  Looks up the character name in $_[0] and returns the
  # string representation of it.

  my $wants_ord = 0;  # A string is wanted back, not an ordinal
  my $runtime   = 0;  # This look-up happens at compile time

  return lookup_name($_[0], $wants_ord, $runtime);
}
423cee85 963
b177ca84
JF
sub import
{
  # Implements 'use charnames LIST'.  Installs the \N{...} handler and the
  # per-scope settings in %^H.  Each element of LIST is one of:
  #   ":full", ":loose", ":short"  to select the matching modes;
  #   ":alias" followed by a hash reference of aliases, or by the name of
  #            an alias file (see alias() and alias_file());
  #   anything else not starting with a colon is taken as a script name.
  # Carps if LIST is empty; croaks on a malformed ":alias"; warns about any
  # other argument that starts with a colon.

  shift; ## ignore class name

  if (not @_) {
    carp("'use charnames' needs explicit imports list");
  }
  $^H{charnames} = \&charnames ;
  $^H{charnames_ord_aliases} = {};
  $^H{charnames_name_aliases} = {};
  $^H{charnames_inverse_ords} = {};
  # New fields must be added to %dummy_H, and the code in lookup_name()
  # that copies fields from the runtime structure

  ##
  ## fill %h keys with our @_ args.
  ##
  my ($promote, %h, @args) = (0);
  while (my $arg = shift) {
    if ($arg eq ":alias") {
      @_ or
        croak ":alias needs an argument in charnames";
      my $alias = shift;
      if (ref $alias) {
        ref $alias eq "HASH" or
          croak "Only HASH reference supported as argument to :alias";
        alias ($alias);
        next;
      }
      if ($alias =~ m{:(\w+)$}) {
        $1 eq "full" || $1 eq "loose" || $1 eq "short" and
          croak ":alias cannot use existing pragma :$1 (reversed order?)";

        # An alias file given as ":NAME" also turns on :full, if nothing
        # else ends up being requested.
        alias_file ($1) and $promote = 1;
        next;
      }
      alias_file ($alias);
      next;
    }
    if (substr($arg, 0, 1) eq ':'
      and ! ($arg eq ":full" || $arg eq ":short" || $arg eq ":loose"))
    {
      warn "unsupported special '$arg' in charnames";
      next;
    }
    push @args, $arg;
  }
  @args == 0 && $promote and @args = (":full");
  @h{@args} = (1) x @args;

  # Don't leave these undefined as are tested for in lookup_name()
  $^H{charnames_full} = delete $h{':full'} || 0;
  $^H{charnames_loose} = delete $h{':loose'} || 0;
  $^H{charnames_short} = delete $h{':short'} || 0;

  # Whatever is left are script names.  They are interpolated into regular
  # expressions, so uppercase them and quote any metacharacters, once, here.
  my @scripts = map { uc quotemeta } keys %h;

  ##
  ## If utf8? warnings are enabled, and some scripts were given,
  ## see if at least we can find one letter from each script.
  ##
  if (warnings::enabled('utf8') && @scripts) {
    $txt = do "unicore/Name.pl" unless $txt;

    for my $script (@scripts) {

      # $script has already been quotemeta'd above, so it is safe in the
      # pattern as-is.  (It used to be escaped a second time here when not
      # found, which left stray backslashes in the stored scripts pattern.)
      if (not $txt =~ m/\t$script (?:CAPITAL |SMALL )?LETTER /) {
        warnings::warn('utf8',  "No such script: '$script'");
      }
    }
  }

  # %^H gets stringified, so serialize it ourselves so can extract the
  # real data back later.
  $^H{charnames_stringified_ords} = join ",", %{$^H{charnames_ord_aliases}};
  $^H{charnames_stringified_names} = join ",", %{$^H{charnames_name_aliases}};
  $^H{charnames_stringified_inverse_ords} = join ",", %{$^H{charnames_inverse_ords}};

  # Modify the input script names for loose name matching if that is also
  # specified, similar to the way the base character name is prepared.  They
  # don't (currently, and hopefully never will) have dashes.  These go into a
  # regex, and have already been uppercased and quotemeta'd.  Squeeze out all
  # input underscores, blanks, and dashes.  Then convert so will match a blank
  # between any characters.
  if ($^H{charnames_loose}) {
    for my $script (@scripts) {   # $script aliases the array element
      $script =~ s/[_ -]//g;
      $script =~ s/ ( [^\\] ) (?= . ) /$1\\ ?/gx;
    }
  }

  $^H{charnames_scripts} = join "|", @scripts;  # Stringify them as a trie
} # import
423cee85 1055
84374e30
KW
# Memoizes code point => official name.  Only official (and algorithmically
# generated) names are ever stored here, never user aliases, so the cache
# does not need to be scoped.
my %viacode;

sub viacode {

  # Returns the name of the code point argument, which may be given as a
  # decimal number or in a hexadecimal form.  Official names win; failing
  # that, a user-defined name from the calling scope is returned.  Returns
  # nothing (after carping where appropriate) if there is no name or the
  # argument is unusable.

  unless (@_ == 1) {
    carp("charnames::viacode() expects one argument");
    return;
  }

  my ($arg) = @_;

  # This is derived from Unicode::UCD's _getcode(), except that the result
  # is always padded to at least 5 hex digits, which is how the code points
  # are written in $txt.  The decimal test must come first; see the comments
  # where $decimal_qr is defined.
  my $hex;
  if ($arg =~ $decimal_qr) {
    $hex = sprintf("%05X", $arg);
  }
  elsif ($arg =~ $hex_qr) {
    $hex = sprintf("%05X", hex $1);
  }
  else {
    carp("unexpected arg \"$arg\" to charnames::viacode()");
    return;
  }

  return $viacode{$hex} if exists $viacode{$hex};

  # Nothing above U+10FFFF is in the table, so don't bother searching for
  # such a code point.  The length test is a cheap way of usually avoiding
  # the numeric conversion.
  if (length($hex) <= 5 || CORE::hex($hex) <= 0x10FFFF) {
    $txt ||= do "unicore/Name.pl";

    # Names that are generated by rule are not in the table.
    my $generated = code_point_to_name_special(CORE::hex $hex);
    return $viacode{$hex} = $generated if defined $generated;

    # Otherwise use the official name, if there is one.  (Whether a
    # user-defined override ought to take precedence instead has been left
    # undecided.)
    if ($txt =~ m/^$hex\t/m) {

      # The name runs from just past the tab up to the next new-line.
      # Using the match offsets is much faster than capturing.
      my $start = $+[0];
      my $end   = index($txt, "\n", $start);
      return $viacode{$hex} = substr($txt, $start, $end - $start);
    }
  }

  # No official name.  Try the aliases defined in the caller's scope, which
  # import() serialized into the hints hash; give up if there aren't any.
  my $hints_ref = (caller(0))[10];
  return unless defined $hints_ref
                && exists $hints_ref->{charnames_stringified_inverse_ords};

  my %user_name_for = split ',',
                      $hints_ref->{charnames_stringified_inverse_ords};
  return $user_name_for{$hex} if exists $user_name_for{$hex};

  carp("Unicode characters only allocated up to U+10FFFF (you asked for U+$hex)")
    if CORE::hex($hex) > 0x10FFFF;
  return;
} # viacode
daf0d493
JH
1132
sub vianame
{
  # Looks up the character name and returns its ordinal if found, undef
  # otherwise.  The exception is an argument of the form "U+XXXX", for which
  # the character itself (not its ordinal) is returned.

  unless (@_ == 1) {
    carp("charnames::vianame() expects one name argument");
    return ();
  }

  my ($arg) = @_;

  my ($hex_digits) = $arg =~ /^U\+([0-9a-fA-F]+)$/;

  # The usual case.  The first 1 means an ordinal is wanted; the second that
  # this is run time, with the user's code one stack frame up from here.
  return lookup_name($arg, 1, 1) unless defined $hex_digits;

  # Here the input was "U+...".  khw considers it poor interface design that
  # this returns a chr when every other input yields an ord, but backward
  # compatibility prevents changing it.  New code can use string_vianame()
  # instead.
  my $ord  = CORE::hex $hex_digits;
  my $char = chr $ord;

  # Anything that fits in a byte is always fine; anything larger is fine only
  # if the caller doesn't have 'use bytes' in effect.
  my $caller_in_bytes = $ord > 255 && ((caller 0)[8] & $bytes::hint_bits);
  return $char unless $caller_in_bytes;

  carp(not_legal_use_bytes_msg($arg, $char));
  return;
} # vianame
b177ca84 1161
fb121860
KW
sub string_vianame {

  # Looks up the character name (or named sequence) and returns its string
  # representation if found, undef otherwise.  An argument of the form
  # "U+XXXX" yields the character with that code point.

  unless (@_ == 1) {
    carp("charnames::string_vianame() expects one name argument");
    return;
  }

  my ($arg) = @_;

  my ($hex_digits) = $arg =~ /^U\+([0-9a-fA-F]+)$/;

  # The usual case.  The 0 means a string is wanted; the 1 that this is run
  # time, with the user's code one stack frame up from here.
  return lookup_name($arg, 0, 1) unless defined $hex_digits;

  my $ord  = CORE::hex $hex_digits;
  my $char = chr $ord;

  # Anything that fits in a byte is always fine; anything larger is fine only
  # if the caller doesn't have 'use bytes' in effect.
  my $caller_in_bytes = $ord > 255 && ((caller 0)[8] & $bytes::hint_bits);
  return $char unless $caller_in_bytes;

  carp(not_legal_use_bytes_msg($arg, $char));
  return;
} # string_vianame
1187
1188
423cee85
JH
1189
11901;
1191__END__
1192
1193=head1 NAME
1194
fb121860 1195charnames - access to Unicode character names and named character sequences; also define character names
423cee85
JH
1196
1197=head1 SYNOPSIS
1198
bcc08981
KW
1199 use charnames ':full';
1200 print "\N{GREEK SMALL LETTER SIGMA} is called sigma.\n";
1201 print "\N{LATIN CAPITAL LETTER E WITH VERTICAL LINE BELOW}",
1202 " is an officially named sequence of two Unicode characters\n";
1203
38f4139d
KW
 use charnames ':loose';
 print "\N{Greek small-letter sigma}",
       " can be used to ignore case, underscores, most blanks,",
       " and when you aren't sure if the official name has hyphens\n";
1208
bcc08981
KW
1209 use charnames ':short';
1210 print "\N{greek:Sigma} is an upper-case sigma.\n";
1211
1212 use charnames qw(cyrillic greek);
1213 print "\N{sigma} is Greek sigma, and \N{be} is Cyrillic b.\n";
1214
1215 use charnames ":full", ":alias" => {
1216 e_ACUTE => "LATIN SMALL LETTER E WITH ACUTE",
1217 mychar => 0xE8000, # Private use area
1218 };
1219 print "\N{e_ACUTE} is a small letter e with an acute.\n";
14aeae98 1220 print "\N{mychar} allows me to name private use characters.\n";
bcc08981
KW
1221
1222 use charnames ();
1223 print charnames::viacode(0x1234); # prints "ETHIOPIC SYLLABLE SEE"
1224 printf "%04X", charnames::vianame("GOTHIC LETTER AHSA"); # prints
1225 # "10330"
1226 print charnames::vianame("LATIN CAPITAL LETTER A"); # prints 65 on
1227 # ASCII platforms;
1228 # 193 on EBCDIC
1229 print charnames::string_vianame("LATIN CAPITAL LETTER A"); # prints "A"
b177ca84 1230
423cee85
JH
1231=head1 DESCRIPTION
1232
da9dec57 1233Pragma C<use charnames> is used to gain access to the names of the
fb121860
KW
1234Unicode characters and named character sequences, and to allow you to define
1235your own character and character sequence names.
1236
1237All forms of the pragma enable use of the following 3 functions:
1238
1239=over
1240
1241=item *
1242
L</charnames::string_vianame(I<name>)> for run-time lookup of
either a character name or a named character sequence, returning its string
representation
1246
1247=item *
1248
1249L</charnames::vianame(I<name>)> for run-time lookup of a
1250character name (but not a named character sequence) to get its ordinal value
1251(code point)
da9dec57 1252
fb121860 1253=item *
da9dec57 1254
fb121860
KW
1255L</charnames::viacode(I<code>)> for run-time lookup of a code point to get its
1256Unicode name.
1257
1258=back
1259
1260All forms other than C<S<"use charnames ();">> also enable the use of
da9dec57 1261C<\N{I<CHARNAME>}> sequences to compile a Unicode character into a
8ebef31d 1262string, based on its name.
da9dec57
KW
1263
1264Note that C<\N{U+I<...>}>, where the I<...> is a hexadecimal number,
1265also inserts a character into a string, but doesn't require the use of
1266this pragma. The character it inserts is the one whose code point
1267(ordinal value) is equal to the number. For example, C<"\N{U+263a}"> is
1268the Unicode (white background, black foreground) smiley face; it doesn't
1269require this pragma, whereas the equivalent, C<"\N{WHITE SMILING FACE}">
1270does.
Also, C<\N{I<...>}> can mean a regex quantifier instead of a character
name, when the I<...> is a number (or comma separated pair of numbers);
see L<perlreref/QUANTIFIERS>.  That usage is not related to this pragma.
da9dec57 1274
38f4139d
KW
1275The C<charnames> pragma supports arguments C<:full>, C<:loose>, C<:short>,
1276script names and L<customized aliases|/CUSTOM ALIASES>.
1277
1278If C<:full> is present, for expansion of
da9dec57 1279C<\N{I<CHARNAME>}>, the string I<CHARNAME> is first looked up in the list of
38f4139d
KW
1280standard Unicode character names.
1281
1282C<:loose> is a variant of C<:full> which allows I<CHARNAME> to be less
1283precisely specified. Details are in L</LOOSE MATCHES>.
1284
1285If C<:short> is present, and
da9dec57 1286I<CHARNAME> has the form C<I<SCRIPT>:I<CNAME>>, then I<CNAME> is looked up
14aeae98
KW
1287as a letter in script I<SCRIPT>, as described in the next paragraph.
1288Or, if C<use charnames> is used
da9dec57
KW
1289with script name arguments, then for C<\N{I<CHARNAME>}> the name
1290I<CHARNAME> is looked up as a letter in the given scripts (in the
16036bcd
KW
1291specified order). Customized aliases can override these, and are explained in
1292L</CUSTOM ALIASES>.
423cee85 1293
da9dec57 1294For lookup of I<CHARNAME> inside a given script I<SCRIPTNAME>
14aeae98 1295this pragma looks in the table of standard Unicode names for the names
423cee85
JH
1296
1297 SCRIPTNAME CAPITAL LETTER CHARNAME
1298 SCRIPTNAME SMALL LETTER CHARNAME
1299 SCRIPTNAME LETTER CHARNAME
1300
14aeae98 1301If I<CHARNAME> is all lowercase,
daf0d493 1302then the C<CAPITAL> variant is ignored, otherwise the C<SMALL> variant
14aeae98 1303is ignored, and both I<CHARNAME> and I<SCRIPTNAME> are converted to all
38f4139d
KW
1304uppercase for look-up. Other than that, both of them follow L<loose|/LOOSE
1305MATCHES> rules if C<:loose> is also specified; strict otherwise.
daf0d493 1306
da9dec57
KW
1307Note that C<\N{...}> is compile-time; it's a special form of string
1308constant used inside double-quotish strings; this means that you cannot
4e2cda5d 1309use variables inside the C<\N{...}>. If you want similar run-time
fb121860
KW
1310functionality, use
1311L<charnames::string_vianame()|/charnames::string_vianame(I<name>)>.
423cee85 1312
301a3cda 1313For the C0 and C1 control characters (U+0000..U+001F, U+0080..U+009F)
da9dec57
KW
1314there are no official Unicode names but you can use instead the ISO 6429
1315names (LINE FEED, ESCAPE, and so forth, and their abbreviations, LF,
1f31fcd4 1316ESC, ...). In Unicode 3.2 (as of Perl 5.8) some naming changes took
b59ae8bb
KW
1317place, and ISO 6429 was updated, see L</ALIASES>. Since Unicode 6.0, it
1318is deprecated to use C<BELL>. Instead use C<ALERT> (but C<BEL> works).
301a3cda 1319
e5432b89
KW
1320If the input name is unknown, C<\N{NAME}> raises a warning and
1321substitutes the Unicode REPLACEMENT CHARACTER (U+FFFD).
1322
8ebef31d
KW
1323For C<\N{NAME}>, it is a fatal error if C<use bytes> is in effect and the
1324input name is that of a character that won't fit into a byte (i.e., whose
1325ordinal is above 255).
e5432b89 1326
da9dec57
KW
1327Otherwise, any string that includes a C<\N{I<charname>}> or
1328C<S<\N{U+I<code point>}>> will automatically have Unicode semantics (see
1329L<perlunicode/Byte and Character Semantics>).
1330
38f4139d
KW
1331=head1 LOOSE MATCHES
1332
1333By specifying C<:loose>, Unicode's L<loose character name
5ef88e32 1334matching|http://www.unicode.org/reports/tr44#Matching_Rules> rules are
38f4139d
KW
1335selected instead of the strict exact match used otherwise.
1336That means that I<CHARNAME> doesn't have to be so precisely specified.
1337Upper/lower case doesn't matter (except with scripts as mentioned above), nor
1338do any underscores, and the only hyphens that matter are those at the
1339beginning or end of a word in the name (with one exception: the hyphen in
1340U+1180 C<HANGUL JUNGSEONG O-E> does matter).
1341Also, blanks not adjacent to hyphens don't matter.
1342The official Unicode names are quite variable as to where they use hyphens
1343versus spaces to separate word-like units, and this option allows you to not
1344have to care as much.
1345The reason non-medial hyphens matter is because of cases like
1346U+0F60 C<TIBETAN LETTER -A> versus U+0F68 C<TIBETAN LETTER A>.
1347The hyphen here is significant, as is the space before it, and so both must be
1348included.
1349
1350C<:loose> slows down look-ups by a factor of 2 to 3 versus
1351C<:full>, but the trade-off may be worth it to you. Each individual look-up
1352takes very little time, and the results are cached, so the speed difference
1353would become a factor only in programs that do look-ups of many different
1354spellings, and probably only when those look-ups are through vianame() and
1355string_vianame(), since C<\N{...}> look-ups are done at compile time.
1356
5ffe0e96 1357=head1 ALIASES
423cee85 1358
14aeae98
KW
1359A few aliases have been defined for convenience; instead of having
1360to use the official names,
423cee85 1361
5ffe0e96
MB
1362 LINE FEED (LF)
1363 FORM FEED (FF)
1364 CARRIAGE RETURN (CR)
1365 NEXT LINE (NEL)
423cee85 1366
e5432b89 1367(yes, with parentheses), one can use
d5448623 1368
5ffe0e96
MB
1369 LINE FEED
1370 FORM FEED
1371 CARRIAGE RETURN
1372 NEXT LINE
1373 LF
1374 FF
1375 CR
1376 NEL
1377
16036bcd
KW
1378All the other standard abbreviations for the controls, such as C<ACK> for
1379C<ACKNOWLEDGE> also can be used.
1380
5ffe0e96
MB
1381One can also use
1382
1383 BYTE ORDER MARK
1384 BOM
1385
16036bcd
KW
1386and these abbreviations
1387
1388 Abbreviation Full Name
1389
1390 CGJ COMBINING GRAPHEME JOINER
1391 FVS1 MONGOLIAN FREE VARIATION SELECTOR ONE
1392 FVS2 MONGOLIAN FREE VARIATION SELECTOR TWO
1393 FVS3 MONGOLIAN FREE VARIATION SELECTOR THREE
1394 LRE LEFT-TO-RIGHT EMBEDDING
1395 LRM LEFT-TO-RIGHT MARK
1396 LRO LEFT-TO-RIGHT OVERRIDE
1397 MMSP MEDIUM MATHEMATICAL SPACE
1398 MVS MONGOLIAN VOWEL SEPARATOR
1399 NBSP NO-BREAK SPACE
1400 NNBSP NARROW NO-BREAK SPACE
1401 PDF POP DIRECTIONAL FORMATTING
1402 RLE RIGHT-TO-LEFT EMBEDDING
1403 RLM RIGHT-TO-LEFT MARK
1404 RLO RIGHT-TO-LEFT OVERRIDE
1405 SHY SOFT HYPHEN
1406 VS1 VARIATION SELECTOR-1
1407 .
1408 .
1409 .
1410 VS256 VARIATION SELECTOR-256
1411 WJ WORD JOINER
1412 ZWJ ZERO WIDTH JOINER
1413 ZWNJ ZERO WIDTH NON-JOINER
1414 ZWSP ZERO WIDTH SPACE
5ffe0e96
MB
1415
1416For backward compatibility one can use the old names for
1417certain C0 and C1 controls
1418
1419 old new
1420
5ffe0e96
MB
1421 FILE SEPARATOR INFORMATION SEPARATOR FOUR
1422 GROUP SEPARATOR INFORMATION SEPARATOR THREE
16036bcd
KW
1423 HORIZONTAL TABULATION CHARACTER TABULATION
1424 HORIZONTAL TABULATION SET CHARACTER TABULATION SET
1425 HORIZONTAL TABULATION WITH JUSTIFICATION CHARACTER TABULATION
1426 WITH JUSTIFICATION
5ffe0e96
MB
1427 PARTIAL LINE DOWN PARTIAL LINE FORWARD
1428 PARTIAL LINE UP PARTIAL LINE BACKWARD
16036bcd
KW
1429 RECORD SEPARATOR INFORMATION SEPARATOR TWO
1430 REVERSE INDEX REVERSE LINE FEED
1431 UNIT SEPARATOR INFORMATION SEPARATOR ONE
1432 VERTICAL TABULATION LINE TABULATION
1433 VERTICAL TABULATION SET LINE TABULATION SET
5ffe0e96
MB
1434
1435but the old names in addition to giving the character
1436will also give a warning about being deprecated.
423cee85 1437
16036bcd
KW
1438And finally, certain published variants are usable, including some for
1439controls that have no Unicode names:
1440
1f31fcd4
KW
1441 name character
1442
52fb7278 1443 END OF PROTECTED AREA END OF GUARDED AREA, U+0097
1f31fcd4
KW
1444 HIGH OCTET PRESET U+0081
1445 HOP U+0081
1446 IND U+0084
1447 INDEX U+0084
1448 PAD U+0080
1449 PADDING CHARACTER U+0080
1450 PRIVATE USE 1 PRIVATE USE ONE, U+0091
1451 PRIVATE USE 2 PRIVATE USE TWO, U+0092
1452 SGC U+0099
1453 SINGLE GRAPHIC CHARACTER INTRODUCER U+0099
1454 SINGLE-SHIFT 2 SINGLE SHIFT TWO, U+008E
1455 SINGLE-SHIFT 3 SINGLE SHIFT THREE, U+008F
1456 START OF PROTECTED AREA START OF GUARDED AREA, U+0096
16036bcd 1457
35c0985d
MB
1458=head1 CUSTOM ALIASES
1459
1f31fcd4
KW
1460You can add customized aliases to standard (C<:full>) Unicode naming
1461conventions. The aliases override any standard definitions, so, if
da9dec57
KW
1462you're twisted enough, you can change C<"\N{LATIN CAPITAL LETTER A}"> to
1463mean C<"B">, etc.
55bc7d3c
KW
1464
1465Note that an alias should not be something that is a legal curly
1466brace-enclosed quantifier (see L<perlreref/QUANTIFIERS>). For example
e5432b89
KW
1467C<\N{123}> means to match 123 non-newline characters, and is not treated as a
1468charnames alias. Aliases are discouraged from beginning with anything
1469other than an alphabetic character and from containing anything other
1470than alphanumerics, spaces, dashes, parentheses, and underscores.
1471Currently they must be ASCII.
1472
38f4139d
KW
1473An alias can map to either an official Unicode character name (not a loose
1474matched name) or to a
e5432b89
KW
1475numeric code point (ordinal). The latter is useful for assigning names
1476to code points in Unicode private use areas such as U+E800 through
f12d74c0
KW
1477U+F8FF.
1478A numeric code point must be a non-negative integer or a string beginning
1479with C<"U+"> or C<"0x"> with the remainder considered to be a
1480hexadecimal integer. A literal numeric constant must be unsigned; it
1481will be interpreted as hex if it has a leading zero or contains
1482non-decimal hex digits; otherwise it will be interpreted as decimal.
232cbbee 1483
da9dec57 1484Aliases are added either by the use of anonymous hashes:
35c0985d 1485
da9dec57 1486 use charnames ":alias" => {
35c0985d 1487 e_ACUTE => "LATIN SMALL LETTER E WITH ACUTE",
232cbbee 1488 mychar1 => 0xE8000,
35c0985d
MB
1489 };
1490 my $str = "\N{e_ACUTE}";
1491
da9dec57 1492or by using a file containing aliases:
35c0985d 1493
da9dec57 1494 use charnames ":alias" => "pro";
35c0985d 1495
8ebef31d 1496This will try to read C<"unicore/pro_alias.pl"> from the C<@INC> path. This
da9dec57 1497file should return a list in plain perl:
35c0985d
MB
1498
1499 (
1500 A_GRAVE => "LATIN CAPITAL LETTER A WITH GRAVE",
1501 A_CIRCUM => "LATIN CAPITAL LETTER A WITH CIRCUMFLEX",
1502 A_DIAERES => "LATIN CAPITAL LETTER A WITH DIAERESIS",
1503 A_TILDE => "LATIN CAPITAL LETTER A WITH TILDE",
1504 A_BREVE => "LATIN CAPITAL LETTER A WITH BREVE",
1505 A_RING => "LATIN CAPITAL LETTER A WITH RING ABOVE",
1506 A_MACRON => "LATIN CAPITAL LETTER A WITH MACRON",
f12d74c0 1507 mychar2 => "U+E8001",
35c0985d
MB
1508 );
1509
da9dec57
KW
1510Both these methods insert C<":full"> automatically as the first argument (if no
1511other argument is given), and you can give the C<":full"> explicitly as
1512well, like
35c0985d 1513
da9dec57 1514 use charnames ":full", ":alias" => "pro";
35c0985d 1515
38f4139d
KW
1516C<":loose"> has no effect with these. Input names must match exactly, using
1517C<":full"> rules.
1518
14aeae98 1519Also, both these methods currently allow only single characters to be named.
8ebef31d
KW
1520To name a sequence of characters, use a
1521L<custom translator|/CUSTOM TRANSLATORS> (described below).
1522
da9dec57 1523=head1 charnames::viacode(I<code>)
b177ca84
JF
1524
1525Returns the full name of the character indicated by the numeric code.
da9dec57 1526For example,
b177ca84
JF
1527
1528 print charnames::viacode(0x2722);
1529
1530prints "FOUR TEARDROP-SPOKED ASTERISK".
1531
232cbbee 1532The name returned is the official name for the code point, if
8ebef31d 1533available; otherwise your custom alias for it. This means that your
232cbbee 1534alias will only be returned for code points that don't have an official
14aeae98 1535Unicode name (nor a Unicode version 1 name), such as private use code
232cbbee 1536points, and the 4 control characters U+0080, U+0081, U+0084, and U+0099.
da9dec57
KW
1537If you define more than one name for the code point, it is indeterminate
1538which one will be returned.
1539
1540The function returns C<undef> if no name is known for the code point.
1541In Unicode the proper name of such code points is the empty string, which
1542C<undef> stringifies to. (If you ask for a code point past the legal
1543Unicode maximum of U+10FFFF that you haven't assigned an alias to, you
f12d74c0
KW
1544get C<undef> plus a warning.)
1545
1546The input number must be a non-negative integer or a string beginning
1547with C<"U+"> or C<"0x"> with the remainder considered to be a
1548hexadecimal integer. A literal numeric constant must be unsigned; it
1549will be interpreted as hex if it has a leading zero or contains
1550non-decimal hex digits; otherwise it will be interpreted as decimal.
daf0d493 1551
274085e3
PN
1552Notice that the name returned for U+FEFF is "ZERO WIDTH NO-BREAK
1553SPACE", not "BYTE ORDER MARK".
1554
fb121860 1555=head1 charnames::string_vianame(I<name>)
daf0d493 1556
fb121860
KW
1557This is a runtime equivalent to C<\N{...}>. I<name> can be any expression
1558that evaluates to a name accepted by C<\N{...}> under the L<C<:full>
1559option|/DESCRIPTION> to C<charnames>. In addition, any other options for the
38f4139d
KW
1560controlling C<"use charnames"> in the same scope apply, like C<:loose> or any
1561L<script list, C<:short> option|/DESCRIPTION>, or L<custom aliases|/CUSTOM
1562ALIASES> you may have defined.
daf0d493 1563
fb121860
KW
1564The only difference is that if the input name is unknown, C<string_vianame>
1565returns C<undef> instead of the REPLACEMENT CHARACTER and does not raise a
1566warning message.
daf0d493 1567
fb121860
KW
1568=head1 charnames::vianame(I<name>)
1569
1570This is similar to C<string_vianame>. The main difference is that under most
5ef88e32 1571circumstances, C<vianame> returns an ordinal code
fb121860 1572point, whereas C<string_vianame> returns a string. For example,
daf0d493 1573
fb121860 1574 printf "U+%04X", charnames::vianame("FOUR TEARDROP-SPOKED ASTERISK");
b177ca84 1575
fb121860 1576prints "U+2722".
1f31fcd4 1577
fb121860
KW
1578This leads to the other two differences. Since a single code point is
1579returned, the function can't handle named character sequences, as these are
14aeae98
KW
1580composed of multiple characters (it returns C<undef> for these). And, the code
1581point can be that of any
fb121860 1582character, even ones that aren't legal under the C<S<use bytes>> pragma.
b177ca84 1583
5ef88e32
KW
1584See L</BUGS> for the circumstances in which the behavior differs
1585from that described above.
1586
5ffe0e96 1587=head1 CUSTOM TRANSLATORS
52ea3e69 1588
5ffe0e96 1589The mechanism of translation of C<\N{...}> escapes is general and not
5ef88e32 1590hardwired into F<charnames.pm>. A module can install custom
5ffe0e96
MB
1591translations (inside the scope which C<use>s the module) with the
1592following magic incantation:
52ea3e69 1593
5ffe0e96 1594 sub import {
52fb7278
KW
1595 shift;
1596 $^H{charnames} = \&translator;
5ffe0e96 1597 }
52ea3e69 1598
da9dec57 1599Here translator() is a subroutine which takes I<CHARNAME> as an
5ffe0e96 1600argument, and returns text to insert into the string instead of the
5ef88e32
KW
1601C<\N{I<CHARNAME>}> escape.
1602
1603This is the only way you can create a custom named sequence of code points.
1604
1605Since the text to insert should be different
5ffe0e96
MB
1606in C<bytes> mode and out of it, the function should check the current
1607state of C<bytes>-flag as in:
52ea3e69 1608
52fb7278 1609 use bytes (); # for $bytes::hint_bits
5ffe0e96 1610 sub translator {
52fb7278
KW
1611 if ($^H & $bytes::hint_bits) {
1612 return bytes_translator(@_);
1613 }
1614 else {
1615 return utf8_translator(@_);
1616 }
5ffe0e96 1617 }
52ea3e69 1618
da9dec57 1619See L</CUSTOM ALIASES> above for restrictions on I<CHARNAME>.
f0175764 1620
9e808deb
KW
1621Of course, C<vianame>, C<viacode>, and C<string_vianame> would need to be
1622overridden as well.
1f31fcd4 1623
423cee85
JH
1624=head1 BUGS
1625
14aeae98 1626vianame() normally returns an ordinal code point, but when the input name is of
8ebef31d
KW
1627the form C<U+...>, it returns a chr instead. In this case, if C<use bytes> is
1628in effect and the character won't fit into a byte, it returns C<undef> and
1629raises a warning.
55bc7d3c 1630
16036bcd
KW
1631Names must be ASCII characters only, which means that you are out of luck if
1632you want to create aliases in a language where some or all the characters of
1633the desired aliases are non-ASCII.
bee80e93 1634
f12d74c0
KW
1635Since evaluation of the translation function (see L</CUSTOM
1636TRANSLATORS>) happens in the middle of compilation (of a string
1637literal), the translation function should not do any C<eval>s or
1638C<require>s. This restriction should be lifted (but is low priority) in
1639a future version of Perl.
423cee85
JH
1640
1641=cut
0eacc33e 1642
52fb7278 1643# ex: set ts=8 sts=2 sw=2 et: