This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
charnames and perlapi: pod nits
[perl5.git] / lib / charnames.pm
CommitLineData
423cee85 1package charnames;
b177ca84
JF
2use strict;
3use warnings;
51cf30b6 4use File::Spec;
123148a1 5our $VERSION = '1.26';
a03f0b9f 6use unicore::Name; # mktables-generated algorithmically-defined names
b75c8c73 7
52fb7278 8use bytes (); # for $bytes::hint_bits
123148a1 9use re "/aa"; # Everything in here should be ASCII
423cee85 10
38f4139d
KW
11# Translate between Unicode character names and their code points.
12#
13# The official names with their code points are stored in a table in
14# lib/unicore/Name.pl which is read in as a large string (almost 3/4 Mb in
15# Unicode 6.0). Each code point/name combination is separated by a \n in the
16# string. (Some of the CJK and the Hangul syllable names are determined
a03f0b9f
KW
17# instead algorithmically via subroutines stored in
18# lib/unicore/Name.pm). Because of the large size of this table, it isn't
19# converted into hashes for faster lookup.
38f4139d
KW
20#
21# But, user defined aliases are stored in their own hashes, as are Perl
22# extensions to the official names. These are checked first before looking at
23# the official table.
24#
25# Basically, the table is grepped for the input code point (viacode()) or
26# name (the other functions), and the corresponding value on the same line is
27# returned. The grepping is done by turning the input into a regular
28# expression. Thus, the same table does double duty, used by both name and
29# code point lookup. (If we were to have hashes, we would need two, one for
30# each lookup direction.)
31#
32# For loose name matching, the logical thing would be to have a table
33# with all the ignorable characters squeezed out, and then grep it with the
34# similarly-squeezed input name. (And this is in fact how the lookups are
35# done with the small Perl extension hashes.) But since we need to be able to
36# go from code point to official name, the original table would still need to
37# exist. Due to the large size of the table, it was decided to not read
38# another very large string into memory for a second table. Instead, the
39# regular expression of the input name is modified to have optional spaces and
40# dashes between characters. For example, in strict matching, the regular
41# expression would be:
42# qr/\tDIGIT ONE$/m
43# Under loose matching, the blank would be squeezed out, and the re would be:
44# qr/\tD[- ]?I[- ]?G[- ]?I[- ]?T[- ]?O[- ]?N[- ]?E$/m
45# which matches a blank or dash between any characters in the official table.
46#
47# This is also how script lookup is done. Basically the re looks like
48# qr/ (?:LATIN|GREEK|CYRILLIC) (?:SMALL )?LETTER $name/
49# where $name is the loose or strict regex for the remainder of the name.
50
fb121860
KW
51# The hashes are stored as utf8 strings. This makes it easier to deal with
52# sequences. I (khw) also tried making Name.pl utf8, but it slowed things
53# down by a factor of 7. I then tried making Name.pl store the utf8
54# equivalents but not calling them utf8. That led to similar speed as leaving
55# it alone, but since that is harder for a human to parse, I left it as-is.
56
# Perl extensions to the official Unicode names.  Each alias maps to the
# character it denotes, stored as a utf8 string (see pack "U").
my %system_aliases = do {

  # Alias => code point.  The conversion to strings happens once, at the
  # bottom of this block.
  my %code_point_for = (

    # Synonyms for the icky 3.2 names that have parentheses.
    'LINE FEED'               => 0x0A, # LINE FEED (LF)
    'FORM FEED'               => 0x0C, # FORM FEED (FF)
    'CARRIAGE RETURN'         => 0x0D, # CARRIAGE RETURN (CR)
    'NEXT LINE'               => 0x85, # NEXT LINE (NEL)

    # Some variant names from Wikipedia
    'SINGLE-SHIFT 2'          => 0x8E,
    'SINGLE-SHIFT 3'          => 0x8F,
    'PRIVATE USE 1'           => 0x91,
    'PRIVATE USE 2'           => 0x92,
    'START OF PROTECTED AREA' => 0x96,
    'END OF PROTECTED AREA'   => 0x97,

    # Convenience.  Standard abbreviations for the controls
    'NUL' => 0x00, # NULL
    'SOH' => 0x01, # START OF HEADING
    'STX' => 0x02, # START OF TEXT
    'ETX' => 0x03, # END OF TEXT
    'EOT' => 0x04, # END OF TRANSMISSION
    'ENQ' => 0x05, # ENQUIRY
    'ACK' => 0x06, # ACKNOWLEDGE
    'BEL' => 0x07, # ALERT; formerly BELL
    'BS'  => 0x08, # BACKSPACE
    'HT'  => 0x09, # HORIZONTAL TABULATION
    'LF'  => 0x0A, # LINE FEED (LF)
    'VT'  => 0x0B, # VERTICAL TABULATION
    'FF'  => 0x0C, # FORM FEED (FF)
    'CR'  => 0x0D, # CARRIAGE RETURN (CR)
    'SO'  => 0x0E, # SHIFT OUT
    'SI'  => 0x0F, # SHIFT IN
    'DLE' => 0x10, # DATA LINK ESCAPE
    'DC1' => 0x11, # DEVICE CONTROL ONE
    'DC2' => 0x12, # DEVICE CONTROL TWO
    'DC3' => 0x13, # DEVICE CONTROL THREE
    'DC4' => 0x14, # DEVICE CONTROL FOUR
    'NAK' => 0x15, # NEGATIVE ACKNOWLEDGE
    'SYN' => 0x16, # SYNCHRONOUS IDLE
    'ETB' => 0x17, # END OF TRANSMISSION BLOCK
    'CAN' => 0x18, # CANCEL
    'EOM' => 0x19, # END OF MEDIUM
    'SUB' => 0x1A, # SUBSTITUTE
    'ESC' => 0x1B, # ESCAPE
    'FS'  => 0x1C, # FILE SEPARATOR
    'GS'  => 0x1D, # GROUP SEPARATOR
    'RS'  => 0x1E, # RECORD SEPARATOR
    'US'  => 0x1F, # UNIT SEPARATOR
    'DEL' => 0x7F, # DELETE
    'BPH' => 0x82, # BREAK PERMITTED HERE
    'NBH' => 0x83, # NO BREAK HERE
    'NEL' => 0x85, # NEXT LINE (NEL)
    'SSA' => 0x86, # START OF SELECTED AREA
    'ESA' => 0x87, # END OF SELECTED AREA
    'HTS' => 0x88, # CHARACTER TABULATION SET
    'HTJ' => 0x89, # CHARACTER TABULATION WITH JUSTIFICATION
    'VTS' => 0x8A, # LINE TABULATION SET
    'PLD' => 0x8B, # PARTIAL LINE FORWARD
    'PLU' => 0x8C, # PARTIAL LINE BACKWARD
    'RI'  => 0x8D, # REVERSE LINE FEED
    'SS2' => 0x8E, # SINGLE SHIFT TWO
    'SS3' => 0x8F, # SINGLE SHIFT THREE
    'DCS' => 0x90, # DEVICE CONTROL STRING
    'PU1' => 0x91, # PRIVATE USE ONE
    'PU2' => 0x92, # PRIVATE USE TWO
    'STS' => 0x93, # SET TRANSMIT STATE
    'CCH' => 0x94, # CANCEL CHARACTER
    'MW'  => 0x95, # MESSAGE WAITING
    'SPA' => 0x96, # START OF GUARDED AREA
    'EPA' => 0x97, # END OF GUARDED AREA
    'SOS' => 0x98, # START OF STRING
    'SCI' => 0x9A, # SINGLE CHARACTER INTRODUCER
    'CSI' => 0x9B, # CONTROL SEQUENCE INTRODUCER
    'ST'  => 0x9C, # STRING TERMINATOR
    'OSC' => 0x9D, # OPERATING SYSTEM COMMAND
    'PM'  => 0x9E, # PRIVACY MESSAGE
    'APC' => 0x9F, # APPLICATION PROGRAM COMMAND

    # There are no names for these in the Unicode standard; perhaps should
    # be deprecated, but then again there are no alternative names, so am
    # not deprecating.  And if did, the code would have to change to not
    # recommend an alternative for these.
    'PADDING CHARACTER'                   => 0x80,
    'PAD'                                 => 0x80,
    'HIGH OCTET PRESET'                   => 0x81,
    'HOP'                                 => 0x81,
    'INDEX'                               => 0x84,
    'IND'                                 => 0x84,
    'SINGLE GRAPHIC CHARACTER INTRODUCER' => 0x99,
    'SGC'                                 => 0x99,

    # More convenience.  For further convenience, it is suggested some way
    # of using the NamesList aliases be implemented, but there are
    # ambiguities in NamesList.txt
    'BOM'             => 0xFEFF, # BYTE ORDER MARK
    'BYTE ORDER MARK' => 0xFEFF,
    'CGJ'             => 0x034F, # COMBINING GRAPHEME JOINER
    'FVS1'            => 0x180B, # MONGOLIAN FREE VARIATION SELECTOR ONE
    'FVS2'            => 0x180C, # MONGOLIAN FREE VARIATION SELECTOR TWO
    'FVS3'            => 0x180D, # MONGOLIAN FREE VARIATION SELECTOR THREE
    'LRE'             => 0x202A, # LEFT-TO-RIGHT EMBEDDING
    'LRM'             => 0x200E, # LEFT-TO-RIGHT MARK
    'LRO'             => 0x202D, # LEFT-TO-RIGHT OVERRIDE
    'MMSP'            => 0x205F, # MEDIUM MATHEMATICAL SPACE
    'MVS'             => 0x180E, # MONGOLIAN VOWEL SEPARATOR
    'NBSP'            => 0x00A0, # NO-BREAK SPACE
    'NNBSP'           => 0x202F, # NARROW NO-BREAK SPACE
    'PDF'             => 0x202C, # POP DIRECTIONAL FORMATTING
    'RLE'             => 0x202B, # RIGHT-TO-LEFT EMBEDDING
    'RLM'             => 0x200F, # RIGHT-TO-LEFT MARK
    'RLO'             => 0x202E, # RIGHT-TO-LEFT OVERRIDE
    'SHY'             => 0x00AD, # SOFT HYPHEN
    'WJ'              => 0x2060, # WORD JOINER
    'ZWJ'             => 0x200D, # ZERO WIDTH JOINER
    'ZWNJ'            => 0x200C, # ZERO WIDTH NON-JOINER
    'ZWSP'            => 0x200B, # ZERO WIDTH SPACE
  );

  # VSn abbreviates VARIATION SELECTOR-n.  The selectors occupy two
  # contiguous runs of code points, so generate them instead of listing all
  # 256: VS1..VS16 start at U+FE00, VS17..VS256 start at U+E0100.
  $code_point_for{"VS$_"} = 0xFE00  + ($_ - 1)  for 1 .. 16;
  $code_point_for{"VS$_"} = 0xE0100 + ($_ - 17) for 17 .. 256;

  map { ( $_ => pack("U", $code_point_for{$_}) ) } keys %code_point_for;
};
52ea3e69 430
38f4139d
KW
# The %system_aliases names that contain blanks or hyphens need a separate
# spelling under :loose matching.  These are those names with the blanks and
# hyphens squeezed out; the other aliases are the same under :loose and :full.
my %loose_system_aliases = do {
  my %code_point_for = (
    'LINEFEED'                         => 0x0A,
    'FORMFEED'                         => 0x0C,
    'CARRIAGERETURN'                   => 0x0D,
    'NEXTLINE'                         => 0x85,
    'SINGLESHIFT2'                     => 0x8E,
    'SINGLESHIFT3'                     => 0x8F,
    'PRIVATEUSE1'                      => 0x91,
    'PRIVATEUSE2'                      => 0x92,
    'STARTOFPROTECTEDAREA'             => 0x96,
    'ENDOFPROTECTEDAREA'               => 0x97,
    'PADDINGCHARACTER'                 => 0x80,
    'HIGHOCTETPRESET'                  => 0x81,
    'SINGLEGRAPHICCHARACTERINTRODUCER' => 0x99,
    'BYTEORDERMARK'                    => 0xFEFF,
  );

  # Store the character itself, as a utf8 string, not its ordinal.
  map { ( $_ => pack("U", $code_point_for{$_}) ) } keys %code_point_for;
};
449
# Names whose use gives a deprecated message.  Each maps to the character it
# denotes, stored as a utf8 string.
my %deprecated_aliases = do {
  my %code_point_for = (

    # Pre-3.2 compatibility (only for the first 256 characters).
    # The trailing comment on each line is the current name.
    'HORIZONTAL TABULATION'     => 0x09, # CHARACTER TABULATION
    'VERTICAL TABULATION'       => 0x0B, # LINE TABULATION
    'FILE SEPARATOR'            => 0x1C, # INFORMATION SEPARATOR FOUR
    'GROUP SEPARATOR'           => 0x1D, # INFORMATION SEPARATOR THREE
    'RECORD SEPARATOR'          => 0x1E, # INFORMATION SEPARATOR TWO
    'UNIT SEPARATOR'            => 0x1F, # INFORMATION SEPARATOR ONE
    'HORIZONTAL TABULATION SET' => 0x88, # CHARACTER TABULATION SET
    'HORIZONTAL TABULATION WITH JUSTIFICATION'
                                => 0x89, # CHARACTER TABULATION WITH
                                         # JUSTIFICATION
    'PARTIAL LINE DOWN'         => 0x8B, # PARTIAL LINE FORWARD
    'PARTIAL LINE UP'           => 0x8C, # PARTIAL LINE BACKWARD
    'VERTICAL TABULATION SET'   => 0x8A, # LINE TABULATION SET
    'REVERSE INDEX'             => 0x8D, # REVERSE LINE FEED

    # Unicode 6.0 co-opted this for U+1F514, so deprecate it for now.
    'BELL'                      => 0x07,
  );

  map { ( $_ => pack("U", $code_point_for{$_}) ) } keys %code_point_for;
};
52ea3e69 469
38f4139d
KW
# The %deprecated_aliases names that contain blanks, with the blanks squeezed
# out, as needed for :loose matching.
my %loose_deprecated_aliases = do {
  my %code_point_for = (
    'HORIZONTALTABULATION'                  => 0x09,
    'VERTICALTABULATION'                    => 0x0B,
    'FILESEPARATOR'                         => 0x1C,
    'GROUPSEPARATOR'                        => 0x1D,
    'RECORDSEPARATOR'                       => 0x1E,
    'UNITSEPARATOR'                         => 0x1F,
    'HORIZONTALTABULATIONSET'               => 0x88,
    'HORIZONTALTABULATIONWITHJUSTIFICATION' => 0x89,
    'PARTIALLINEDOWN'                       => 0x8B,
    'PARTIALLINEUP'                         => 0x8C,
    'VERTICALTABULATIONSET'                 => 0x8A,
    'REVERSEINDEX'                          => 0x8D,
  );

  map { ( $_ => pack("U", $code_point_for{$_}) ) } keys %code_point_for;
};
484
# Two characters that get special handling in :loose matching because their
# names differ only in a medial hyphen.  Both are stored as utf8 strings.
my ($HANGUL_JUNGSEONG_O_E_utf8, $HANGUL_JUNGSEONG_OE_utf8)
    = map { pack("U", $_) } (0x1180, 0x116C);
489
84374e30 490
cc26ddeb 491my $txt; # The table of official character names
281aa49e 492
84374e30
KW
493my %full_names_cache; # Holds already-looked-up names, so don't have to
494# re-look them up again. The previous versions of charnames had scoping
495# bugs. For example if we use script A in one scope and find and cache
496# what Z resolves to, we can't use that cache in a different scope that
497# uses script B instead of A, as Z might be an entirely different letter
498# there; or there might be different aliases in effect in different
499# scopes, or :short may be in effect or not in effect in different scopes,
500# or various combinations thereof. This was solved in this version
501# mostly by moving things to %^H. But some things couldn't be moved
502# there. One of them was the cache of runtime looked-up names, in part
503# because %^H is read-only at runtime. I (khw) don't know why the cache
504# was run-time only in the previous versions: perhaps oversight; perhaps
505# that compile time looking doesn't happen in a loop so didn't think it
506# was worthwhile; perhaps not wanting to make the cache too large. But
507# I decided to make it compile time as well; this could easily be
508# changed.
509# Anyway, this hash is not scoped, and is added to at runtime. It
510# doesn't have scoping problems because the data in it is restricted to
511# official names, which are always invariant, and we only set it and
512# look at it during :full lookups, so it is unaffected by any other
513# scoped options. I put this in to maintain parity with the older
514# version. If desired, a %short_names cache could also be made, as well
515# as one for each script, say in %script_names_cache, with each key
516# being a hash for a script named in a 'use charnames' statement. I
517# decided not to do that for now, just because it's added complication,
518# and because I'm just trying to maintain parity, not extend it.
519
38f4139d
KW
520# Like %full_names_cache, but for use when :loose is in effect. There needs
521# to be two caches because :loose may not be in effect for a scope, and a
522# loose name could inappropriately be returned when only exact matching is
523# called for.
524my %loose_names_cache;
525
281aa49e
KW
# A value is to be tested against the decimal pattern first, and only then
# against the hex one.  A leading zero makes a value non-decimal, as does any
# character outside [0-9].
my $decimal_qr = qr/ ^ [1-9] \d* $ /x;

# Accepts an optional "U+", "u+", "0x" or "0X" prefix; the hex digits
# themselves are returned in $1.
my $hex_qr = qr/ ^ (?: [Uu] \+ | 0 [xX] )? ( [[:xdigit:]]+ ) $ /x;
423cee85 532
8878f897
T
sub croak
{
  # Carp is pulled in only when an error is actually being raised.
  require Carp;

  # Transfer control with the current @_ intact; goto &NAME replaces this
  # call frame, so Carp does not see this wrapper as the caller.
  goto &Carp::croak;
} # croak
537
sub carp
{
  # Carp is pulled in only when a warning is actually being issued.
  require Carp;

  # Transfer control with the current @_ intact; goto &NAME replaces this
  # call frame, so Carp does not see this wrapper as the caller.
  goto &Carp::carp;
} # carp
542
sub alias (@) # Set up one or more aliases
{
  # Takes either a reference to a hash of  name => value  pairs, or the
  # pairs themselves as a flat list.  Each pair is recorded in %^H:
  #   * a value that is (or converts to) a code point goes into
  #     charnames_ord_aliases, with the reverse mapping going into
  #     charnames_inverse_ords;
  #   * any other value is taken to be another name, and goes into
  #     charnames_name_aliases.
  my $definitions = ref $_[0] ? $_[0] : { @_ };

  foreach my $alias_name (keys %$definitions) {
    my $target = $definitions->{$alias_name};
    next if ! defined $target;     # Omit if screwed up.

    # Anything that isn't decimal but looks like hex is converted to its
    # numeric value.  Re-testing for decimal below is slightly slower than
    # remembering that we just converted, but keeps the code easier to
    # maintain, and this is called infrequently, only at compile-time.
    if ($target !~ $decimal_qr && $target =~ $hex_qr) {
      $target = CORE::hex $1;
    }

    if ($target !~ $decimal_qr) {

      # Not a code point, so it is an alias for another name.
      # XXX validate syntax when deprecation cycle complete. ie. start
      # with an alpha only, etc.
      $^H{charnames_name_aliases}{$alias_name} = $target;
      next;
    }

    no warnings qw(non_unicode surrogate nonchar); # Allow any non-malformed
    $^H{charnames_ord_aliases}{$alias_name} = pack("U", $target);

    # The key of the inverse mapping uses a canonical form.
    $^H{charnames_inverse_ords}{sprintf("%05X", $target)} = $alias_name;
  }
} # alias
571
sub not_legal_use_bytes_msg {
  # Returns the text of the message saying that $name cannot be used while
  # 'use bytes' is in effect.  $utf8 is the string that $name stands for:
  # either a single character or a sequence of several.
  my ($name, $utf8) = @_;

  my $tail = " above 0xFF with 'use bytes' in effect";

  # A lone character is reported by its ordinal.
  return sprintf("Character 0x%04x with name '%s' is", ord $utf8, $name) . $tail
    if length($utf8) == 1;

  # Otherwise list the ordinal of every character in the string.
  my $ordinals = join(" ", map { sprintf "0x%04X", ord $_ } split(//, $utf8));
  return sprintf("String with name '%s' (and ordinals %s) contains character(s)", $name, $ordinals) . $tail;
}
583
sub alias_file ($) # Reads a file containing alias definitions
{
  # $arg is either the absolute path of an existing file, or a word made
  # only of identifier characters, in which case the file used is
  # unicore/${arg}_alias.pl.  Anything else croaks.
  #
  # The file is run with do(), and the list it returns is handed to
  # alias().  Returns 1 if aliases were set up, and 0 if the file returned
  # an empty list.  Croaks if the file returned a single undef, or a list
  # with an odd number of elements.
  my ($arg) = @_;

  my $path;
  if (-f $arg && File::Spec->file_name_is_absolute($arg)) {
    $path = $arg;
  }
  elsif ($arg =~ m/^\w+$/) {
    $path = "unicore/${arg}_alias.pl";
  }
  else {
    croak "Charnames alias files can only have identifier characters";
  }

  my @pairs = do $path;
  return 0 unless @pairs;

  croak "$path cannot be used as alias file for charnames"
    if @pairs == 1 && !defined $pairs[0];
  croak "$path did not return a (valid) list of alias pairs"
    if @pairs % 2;

  alias(@pairs);
  return 1;
} # alias_file
606
03f95285
KW
# Stand-in for the hints hash, used by lookup_name() at run time when the
# caller's scope imported nothing (as with 'use charnames ()').  It must be
# kept in sync with the structure that import() fills in.
my %dummy_H = (

    # Serialized alias tables and script list: all empty
    charnames_stringified_names => "",
    charnames_stringified_ords  => "",
    charnames_scripts           => "",

    # Matching modes: behave as if only ':full' had been given
    charnames_full              => 1,
    charnames_loose             => 0,
    charnames_short             => 0,
);
617
63098191 618
fb121860
KW
sub lookup_name ($$$) {
  my ($name, $wants_ord, $runtime) = @_;

  # Lookup the name or sequence $name in the tables.
  #
  #   $name       the character name, named sequence, or alias to look up
  #   $wants_ord  if false, the string equivalent of $name is returned; if
  #               true, the ordinal value is returned instead, but in this
  #               case $name must not be a sequence; otherwise undef is
  #               returned and a warning raised.
  #   $runtime    0 if compiletime, otherwise gives the number of stack
  #               frames to go back to get the application caller info.
  #
  # If $name is not found, returns undef in runtime with no warning; and in
  # compiletime, the Unicode replacement character, with a warning.
  #
  # If the result needs a character above 0xFF while 'use bytes' is in
  # effect, this warns and returns nothing at runtime, and croaks at
  # compiletime.
  #
  # It looks first in the aliases, then in the large table of official Unicode
  # names.
  #
  # N.B. The caller(), carp(), croak() and warnings::warnif() calls below all
  # depend on the call-stack depth, so they must stay directly in this
  # subroutine rather than being moved into helpers.

  my $utf8;       # The string result
  my $save_input; # Name to use in error messages, if it differs from $_[0]

  if ($runtime) {

    my $hints_ref = (caller($runtime))[10];

    # If we didn't import anything (which happens with 'use charnames ()'),
    # substitute a dummy structure.
    $hints_ref = \%dummy_H if ! defined $hints_ref
                              || (! defined $hints_ref->{charnames_full}
                                  && ! defined $hints_ref->{charnames_loose});

    # At runtime, but currently not at compile time, $^H gets
    # stringified, so un-stringify back to the original data structures.
    # These get thrown away by perl before the next invocation
    # Also fill in the hash with the non-stringified data.
    # N.B.  New fields must be also added to %dummy_H

    %{$^H{charnames_name_aliases}} = split ',',
                                      $hints_ref->{charnames_stringified_names};
    %{$^H{charnames_ord_aliases}} = split ',',
                                      $hints_ref->{charnames_stringified_ords};
    $^H{charnames_scripts} = $hints_ref->{charnames_scripts};
    $^H{charnames_full} = $hints_ref->{charnames_full};
    $^H{charnames_loose} = $hints_ref->{charnames_loose};
    $^H{charnames_short} = $hints_ref->{charnames_short};
  }

  my $loose = $^H{charnames_loose};
  my $lookup_name;  # Input name suitably modified for grepping for in the
                    # table

  # User alias should be checked first or else can't override ours, and if we
  # were to add any, could conflict with theirs.
  if (exists $^H{charnames_ord_aliases}{$name}) {
    $utf8 = $^H{charnames_ord_aliases}{$name};
  }
  elsif (exists $^H{charnames_name_aliases}{$name}) {
    $name = $^H{charnames_name_aliases}{$name};
    $save_input = $lookup_name = $name;  # Cache the result for any error
                                         # message
    # The aliases are documented to not match loosely, so change loose match
    # into full.
    if ($loose) {
      $loose = 0;
      $^H{charnames_full} = 1;
    }
  }
  else {

    # Here, not a user alias.  That means that loose matching may be in
    # effect; will have to modify the input name.
    $lookup_name = $name;
    if ($loose) {
      $lookup_name = uc $lookup_name;

      # Squeeze out all underscores
      $lookup_name =~ s/_//g;

      # Remove all medial hyphens
      $lookup_name =~ s/ (?<= \S ) - (?= \S )//gx;

      # Squeeze out all spaces
      $lookup_name =~ s/\s//g;
    }

    # Here, $lookup_name has been modified as necessary for looking in the
    # hashes.  Check the system alias files next.  Most of these aliases are
    # the same for both strict and loose matching.  To save space, the ones
    # which differ are in their own separate hash, which is checked if loose
    # matching is selected and the regular match fails.  To save time, the
    # loose hashes could be expanded to include all aliases, and there would
    # only have to be one check.  But if someone specifies :loose, they are
    # interested in convenience over speed, and the time for this second check
    # is minuscule compared to the rest of the routine.
    if (exists $system_aliases{$lookup_name}) {
      $utf8 = $system_aliases{$lookup_name};
    }
    elsif ($loose && exists $loose_system_aliases{$lookup_name}) {
      $utf8 = $loose_system_aliases{$lookup_name};
    }
    else {

      # Deprecated aliases still work, but warn.  The strict table is
      # consulted first, and the loose one only under :loose.
      my $deprecated_ref
            = (exists $deprecated_aliases{$lookup_name})
              ? \%deprecated_aliases
            : ($loose && exists $loose_deprecated_aliases{$lookup_name})
              ? \%loose_deprecated_aliases
            : undef;

      if (defined $deprecated_ref) {
        $utf8 = $deprecated_ref->{$lookup_name};
        require warnings;
        warnings::warnif('deprecated',
                         "Unicode character name \"$name\" is deprecated, use \""
                         . viacode(ord $utf8)
                         . "\" instead");
      }
    }
  }

  my @off;  # Offsets into table of pattern match begin and end

  # If haven't found it yet...
  if (! defined $utf8) {

    # See if has looked this input up earlier.
    if (! $loose && $^H{charnames_full} && exists $full_names_cache{$name}) {
      $utf8 = $full_names_cache{$name};
    }
    elsif ($loose && exists $loose_names_cache{$name}) {
      $utf8 = $loose_names_cache{$name};
    }
    else { # Here, must do a look-up

      # If full or loose matching succeeded, points to where to cache the
      # result
      my $cache_ref;

      ## Suck in the code/name list as a big string.
      ## Lines look like:
      ##     "00052\tLATIN CAPITAL LETTER R\n"
      # or
      #      "0052 0303\tLATIN CAPITAL LETTER R WITH TILDE\n"
      $txt = do "unicore/Name.pl" unless $txt;

      ## @off will hold the index into the code/name string of the start and
      ## end of the name as we find it.

      ## If :loose, look for a loose match; if :full, look for the name
      ## exactly
      # First, see if the name is one which is algorithmically determinable.
      # The subroutine comes from unicore::Name (Name.pm).  The table
      # contained in $txt doesn't contain these.  Experiments show that
      # checking for these before checking for the regular names has no
      # noticeable impact on performance for the regular names, but
      # the other way around slows down finding these immensely.
      # Algorithmically determinables are not placed in the cache because
      # that uses up memory, and finding these again is fast.
      if (($loose || $^H{charnames_full})
          && (defined (my $ord = name_to_code_point_special($lookup_name, $loose))))
      {
        $utf8 = pack("U", $ord);
      }
      else {

        # Not algorithmically determinable; look up in the table.  The name
        # will be turned into a regex, so quote any meta characters.
        $lookup_name = quotemeta $lookup_name;

        if ($loose) {

          # For loose matches, $lookup_name has already squeezed out the
          # non-essential characters.  We have to add in code to make the
          # squeezed version match the non-squeezed equivalent in the table.
          # The only remaining hyphens are ones that start or end a word in
          # the original.  They have been quoted in $lookup_name so they look
          # like "\-".  Change all other characters except the backslash
          # quotes for any metacharacters, and the final character, so that
          # e.g., COLON gets transformed into: /C[- ]?O[- ]?L[- ]?O[- ]?N/
          $lookup_name =~ s/ (?! \\ -)    # Don't do this to the \- sequence
                             ( [^-\\] )   # Nor the "-" within that sequence,
                                          # nor the "\" that quotes metachars,
                                          # but otherwise put the char into $1
                             (?=.)        # And don't do it for the final char
                           /$1\[- \]?/gx; # And add an optional blank or
                                          # '-' after each $1 char

          # Those remaining hyphens were originally at the beginning or end of
          # a word, so they can match either a blank before or after, but not
          # both.  (Keep in mind that they have been quoted, so are a '\-'
          # sequence)
          $lookup_name =~ s/\\ -/(?:- | -)/xg;
        }

        # Do the lookup in the full table if asked for, and if succeeds
        # save the offsets and set where to cache the result.
        if (($loose || $^H{charnames_full}) && $txt =~ /\t$lookup_name$/m) {
          @off = ($-[0] + 1, $+[0]);    # The 1 is for the tab
          $cache_ref = ($loose) ? \%loose_names_cache : \%full_names_cache;
        }
        else {

          # Here, didn't look for, or didn't find the name.
          # If :short is allowed, see if input is like "greek:Sigma".
          # Keep in mind that $lookup_name has had the metas quoted.
          my $scripts_trie = "";
          my $name_has_uppercase;
          if (($^H{charnames_short})
              && $lookup_name =~ /^ (?: \\ \s)*   # Quoted space
                                    (.+?)         # $1 = the script
                                    (?: \\ \s)*
                                    \\ :          # Quoted colon
                                    (?: \\ \s)*
                                    (.+?)         # $2 = the name
                                    (?: \\ \s)* $
                                  /xs)
          {
              # Even in non-loose matching, the script traditionally has been
              # case insensitive
              $scripts_trie = "\U$1";
              $lookup_name = $2;

              # Use original name to find its input casing, but ignore the
              # script part of that to make the determination.
              $save_input //= $name;
              $name =~ s/.*?://;
              $name_has_uppercase = $name =~ /[[:upper:]]/;
          }
          else { # Otherwise look in allowed scripts
              $scripts_trie = $^H{charnames_scripts};

              # Use original name to find its input casing
              $name_has_uppercase = $name =~ /[[:upper:]]/;
          }

          my $case = $name_has_uppercase ? "CAPITAL" : "SMALL";
          if (! $scripts_trie
              || $txt !~
              /\t (?: $scripts_trie ) \ (?:$case\ )? LETTER \ \U$lookup_name $/xm)
          {
            # Here we still don't have it, give up.
            return if $runtime;

            # May have zapped input name, get it again.
            $name = $save_input // $_[0];
            carp "Unknown charname '$name'";
            return ($wants_ord) ? 0xFFFD : pack("U", 0xFFFD);
          }

          # Here have found the input name in the table.
          @off = ($-[0] + 1, $+[0]);  # The 1 is for the tab
        }

        # Here, the input name has been found; we haven't set up the output,
        # but we know where in the string
        # the name starts.  The string is set up so that for single characters
        # (and not named sequences), the name is preceded immediately by a
        # tab and 5 hex digits for its code, with a \n before those.  Named
        # sequences won't have the 7th preceding character be a \n.
        # (Actually, for the very first entry in the table this isn't strictly
        # true: subtracting 7 will yield -1, and the substr below will
        # therefore yield the very last character in the table, which should
        # also be a \n, so the statement works anyway.)
        if (substr($txt, $off[0] - 7, 1) eq "\n") {
          $utf8 = pack("U", CORE::hex substr($txt, $off[0] - 6, 5));

          # Handle the single loose matching special case, in which two names
          # differ only by a single medial hyphen.  If the original had a
          # hyphen (or more) in the right place, then it is that one.
          $utf8 = $HANGUL_JUNGSEONG_O_E_utf8
                  if $loose
                     && $utf8 eq $HANGUL_JUNGSEONG_OE_utf8
                     && $name =~ m/O \s* - [-\s]* E/ix;
                     # Note that this wouldn't work if there were a 2nd
                     # OE in the name
        }
        else {

          # Here, is a named sequence.  Need to go looking for the beginning,
          # which is just after the \n from the previous entry in the table.
          # The +1 skips past that newline, or, if the rindex() fails, to put
          # us to an offset of zero.
          my $charstart = rindex($txt, "\n", $off[0] - 7) + 1;
          $utf8 = pack("U*", map { CORE::hex }
              split " ", substr($txt, $charstart, $off[0] - $charstart - 1));
        }
      }

      # Cache the input so as to not have to search the large table
      # again, but only if it came from the one search that we cache.
      # (Haven't bothered with the pain of sorting out scoping issues for the
      # scripts searches.)
      $cache_ref->{$name} = $utf8 if defined $cache_ref;
    }
  }


  # Here, have the utf8.  If the return is to be an ord, must be any single
  # character.
  if ($wants_ord) {
    return ord($utf8) if length($utf8) == 1;
  }
  else {

    # Here, wants string output.  If utf8 is acceptable, just return what
    # we've got; otherwise attempt to convert it to non-utf8 and return that.
    my $in_bytes = ($runtime)
                   ? (caller $runtime)[8] & $bytes::hint_bits
                   : $^H & $bytes::hint_bits;

    # The 1 arg to downgrade() means don't die on failure
    return $utf8 if ! $in_bytes || utf8::downgrade($utf8, 1);
  }

  # Here, there is an error:  either there are too many characters, or the
  # result string needs to be non-utf8, and at least one character requires
  # utf8.  Prefer any official name over the input one for the error message.
  if (@off) {
    $name = substr($txt, $off[0], $off[1] - $off[0]);
  }
  else {
    $name = $save_input // $_[0];
  }

  if ($wants_ord) {
    # Only way to get here in this case is if result too long.  Message
    # assumes that our only caller that requires single char result is
    # vianame.
    carp "charnames::vianame() doesn't handle named sequences ($name). Use charnames::string_vianame() instead";
    return;
  }

  # Only other possible failure here is from use bytes.
  if ($runtime) {
    carp not_legal_use_bytes_msg($name, $utf8);
    return;
  } else {
    croak not_legal_use_bytes_msg($name, $utf8);
  }

} # lookup_name
954
sub charnames {

  # Handler for \N{...}: resolves the character name given as the first
  # argument and returns its string representation.

  my ($charname) = @_;

  my $wants_ord = 0;  # we want a string back, not an ordinal
  my $runtime   = 0;  # we are being called at compile time

  return lookup_name($charname, $wants_ord, $runtime);
}
423cee85 964
b177ca84
JF
sub import
{
  # Implements 'use charnames LIST'.  Records in %^H the \N{} handler, the
  # requested matching modes (:full, :loose, :short), any aliases given via
  # ':alias', and the list of script names, so that lookup_name() can find
  # them later.  Carps if the import list is empty; croaks on a malformed
  # ':alias'; warns about any other unknown ':tag'.
  #
  # N.B. carp/croak and the warnings:: calls are sensitive to call-stack
  # depth, so they are kept directly in this subroutine.

  shift;  ## ignore class name

  carp("'use charnames' needs explicit imports list") unless @_;

  $^H{charnames} = \&charnames ;
  $^H{charnames_ord_aliases} = {};
  $^H{charnames_name_aliases} = {};
  $^H{charnames_inverse_ords} = {};
  # New fields must be added to %dummy_H, and the code in lookup_name()
  # that copies fields from the runtime structure

  ##
  ## fill %h keys with our @_ args.
  ##
  my $promote = 0;  # Set when a ':NAME'-style alias file was loaded
  my (%h, @args);

  # Note that the loop ends at the first false argument.
  while (my $arg = shift) {

    if ($arg eq ":alias") {
      croak ":alias needs an argument in charnames" unless @_;

      my $alias = shift;
      if (ref $alias) {
        croak "Only HASH reference supported as argument to :alias"
          unless ref $alias eq "HASH";
        alias ($alias);
      }
      elsif ($alias =~ m{:(\w+)$}) {
        my $tag = $1;
        croak ":alias cannot use existing pragma :$tag (reversed order?)"
          if $tag eq "full" || $tag eq "loose" || $tag eq "short";
        $promote = 1 if alias_file ($tag);
      }
      else {
        alias_file ($alias);
      }
      next;
    }

    # Any ':tag' other than the three known modes is ignored with a warning
    my $is_known_mode = $arg eq ":full" || $arg eq ":short" || $arg eq ":loose";
    if (substr($arg, 0, 1) eq ':' and ! $is_known_mode) {
      warn "unsupported special '$arg' in charnames";
      next;
    }

    push @args, $arg;
  }

  # A successfully loaded ':NAME' alias file implies :full when nothing else
  # was asked for.
  @args = (":full") if $promote && @args == 0;
  $h{$_} = 1 for @args;

  # Don't leave these undefined as are tested for in lookup_names
  for my $mode (qw(full loose short)) {
    $^H{"charnames_$mode"} = delete $h{":$mode"} || 0;
  }

  # Whatever is left over is taken to be a script name.
  my @scripts = map { uc quotemeta } keys %h;

  ##
  ## If utf8? warnings are enabled, and some scripts were given,
  ## see if at least we can find one letter from each script.
  ##
  if (warnings::enabled('utf8') && @scripts) {
    $txt = do "unicore/Name.pl" unless $txt;

    for my $script (@scripts) {
      next if $txt =~ m/\t$script (?:CAPITAL |SMALL )?LETTER /;

      warnings::warn('utf8',  "No such script: '$script'");
      $script = quotemeta $script;  # Escape it, for use in the re.
    }
  }

  # %^H gets stringified, so serialize it ourselves so can extract the
  # real data back later.
  for my $table (qw(ords names inverse_ords)) {
    my $source = ($table eq 'ords')  ? 'charnames_ord_aliases'
               : ($table eq 'names') ? 'charnames_name_aliases'
               :                       'charnames_inverse_ords';
    $^H{"charnames_stringified_$table"} = join ",", %{$^H{$source}};
  }

  # Modify the input script names for loose name matching if that is also
  # specified, similar to the way the base character name is prepared.  They
  # don't (currently, and hopefully never will) have dashes.  These go into a
  # regex, and have already been uppercased and quotemeta'd.  Squeeze out all
  # input underscores, blanks, and dashes.  Then convert so will match a blank
  # between any characters.
  if ($^H{charnames_loose}) {
    for my $script (@scripts) {
      $script =~ s/[_ -]//g;
      $script =~ s/ ( [^\\] ) (?= . ) /$1\\ ?/gx;
    }
  }

  $^H{charnames_scripts} = join "|", @scripts;  # Stringify them as a trie
} # import
423cee85 1056
84374e30
KW
# Memo of code point => official name for look-ups already done, keyed by
# the canonical (at least 5 digit, upper case) hex form.  Only official
# values are stored, and user aliases can't override them, so scoping is
# not an issue.
my %viacode;

sub viacode {

  # Returns the name of the code point argument, which may be given either
  # as a decimal number or in one of the hex forms that $hex_qr accepts.
  # Official names take precedence; failing that, an alias defined in the
  # caller's scope is returned.  Returns nothing (after carping where
  # appropriate) for a wrong argument count, an unparseable argument, or a
  # code point with no name.

  unless (@_ == 1) {
    carp "charnames::viacode() expects one argument";
    return;
  }

  my ($arg) = @_;

  # This is derived from Unicode::UCD, where it is nearly the same as the
  # function _getcode(), but here it makes sure that even a hex argument
  # has the proper number of leading zeros, which is critical in
  # matching against $txt below
  # Must check if decimal first; see comments at that definition
  my $hex;
  if ($arg =~ $decimal_qr) {
    $hex = sprintf "%05X", $arg;
  }
  elsif ($arg =~ $hex_qr) {
    # Below is the line that differs from the _getcode() source
    $hex = sprintf "%05X", hex $1;
  }
  else {
    carp("unexpected arg \"$arg\" to charnames::viacode()");
    return;
  }

  return $viacode{$hex} if exists $viacode{$hex};

  # If the code point is above the max in the table, there's no point
  # looking through it.  Checking the length first is slightly faster
  if (length($hex) <= 5 || CORE::hex($hex) <= 0x10FFFF) {
    $txt = do "unicore/Name.pl" unless $txt;

    # See if the name is algorithmically determinable.
    my $algorithmic = code_point_to_name_special(CORE::hex $hex);
    return $viacode{$hex} = $algorithmic if defined $algorithmic;

    # Return the official name, if exists.  It's unclear to me (khw) at
    # this juncture if it is better to return a user-defined override, so
    # leaving it as is for now.
    if ($txt =~ m/^$hex\t/m) {

      # The name starts with the next character and goes up to the
      # next new-line.  Using capturing parentheses above instead of
      # @+ more than doubles the execution time in Perl 5.13
      my $name_start = $+[0];
      my $name_end   = index($txt, "\n", $name_start);
      return $viacode{$hex} = substr($txt, $name_start,
                                     $name_end - $name_start);
    }
  }

  # See if there is a user name for it, before giving up completely.
  # First get the scoped aliases, give up if have none.
  my $H_ref = (caller(0))[10];
  return unless defined $H_ref
                && exists $H_ref->{charnames_stringified_inverse_ords};

  my %code_point_aliases = split ',',
                            $H_ref->{charnames_stringified_inverse_ords};
  return $code_point_aliases{$hex} if exists $code_point_aliases{$hex};

  # No name at all.  Explain why if the request was beyond Unicode.
  if (CORE::hex($hex) > 0x10FFFF) {
    carp "Unicode characters only allocated up to U+10FFFF (you asked for U+$hex)";
  }
  return;
} # viacode
daf0d493
JH
1133
sub vianame
{
  # Looks up the character name and returns its ordinal if
  # found, undef otherwise.
  #
  # As a special case, an argument of the form "U+XXXX" returns the
  # *character* with that code point rather than an ordinal (or warns and
  # returns nothing if that character is above 0xFF and the caller is under
  # 'use bytes').

  unless (@_ == 1) {
    carp "charnames::vianame() expects one name argument";
    return ()
  }

  my ($arg) = @_;

  my ($hex_digits) = $arg =~ /^U\+([0-9a-fA-F]+)$/;

  # The first 1 arg means wants an ord returned; the second that we are in
  # runtime, and this is the first level routine called from the user
  return lookup_name($arg, 1, 1) unless defined $hex_digits;

  # khw claims that this is poor interface design.  The function should
  # return either an ord or a chr for all inputs; not be bipolar.  But
  # can't change it because of backward compatibility.  New code can use
  # string_vianame() instead.
  my $ord = CORE::hex $hex_digits;
  return chr $ord if $ord <= 255 || ! ((caller 0)[8] & $bytes::hint_bits);

  carp not_legal_use_bytes_msg($arg, chr $ord);
  return;
} # vianame
b177ca84 1162
fb121860
KW
sub string_vianame {

  # Looks up the character name and returns its string representation if
  # found, undef otherwise.  An argument of the form "U+XXXX" yields the
  # character with that code point (or warns and returns nothing if that
  # character is above 0xFF and the caller is under 'use bytes').

  unless (@_ == 1) {
    carp "charnames::string_vianame() expects one name argument";
    return;
  }

  my ($arg) = @_;

  my ($hex_digits) = $arg =~ /^U\+([0-9a-fA-F]+)$/;

  # The 0 arg means wants a string returned; the 1 arg means that we are in
  # runtime, and this is the first level routine called from the user
  return lookup_name($arg, 0, 1) unless defined $hex_digits;

  my $ord = CORE::hex $hex_digits;
  return chr $ord if $ord <= 255 || ! ((caller 0)[8] & $bytes::hint_bits);

  carp not_legal_use_bytes_msg($arg, chr $ord);
  return;
} # string_vianame
1188
1189
423cee85
JH
1190
11911;
1192__END__
1193
1194=head1 NAME
1195
fb121860 1196charnames - access to Unicode character names and named character sequences; also define character names
423cee85
JH
1197
1198=head1 SYNOPSIS
1199
bcc08981
KW
1200 use charnames ':full';
1201 print "\N{GREEK SMALL LETTER SIGMA} is called sigma.\n";
1202 print "\N{LATIN CAPITAL LETTER E WITH VERTICAL LINE BELOW}",
1203 " is an officially named sequence of two Unicode characters\n";
1204
38f4139d
KW
1205 use charnames ':loose';
1206 print "\N{Greek small-letter sigma}",
1207 " can be used to ignore case, underscores, most blanks,",
1208 " and when you aren't sure if the official name has hyphens\n";
1209
bcc08981
KW
1210 use charnames ':short';
1211 print "\N{greek:Sigma} is an upper-case sigma.\n";
1212
1213 use charnames qw(cyrillic greek);
1214 print "\N{sigma} is Greek sigma, and \N{be} is Cyrillic b.\n";
1215
1216 use charnames ":full", ":alias" => {
1217 e_ACUTE => "LATIN SMALL LETTER E WITH ACUTE",
1218 mychar => 0xE8000, # Private use area
1219 };
1220 print "\N{e_ACUTE} is a small letter e with an acute.\n";
14aeae98 1221 print "\N{mychar} allows me to name private use characters.\n";
bcc08981
KW
1222
1223 use charnames ();
1224 print charnames::viacode(0x1234); # prints "ETHIOPIC SYLLABLE SEE"
1225 printf "%04X", charnames::vianame("GOTHIC LETTER AHSA"); # prints
1226 # "10330"
1227 print charnames::vianame("LATIN CAPITAL LETTER A"); # prints 65 on
1228 # ASCII platforms;
1229 # 193 on EBCDIC
1230 print charnames::string_vianame("LATIN CAPITAL LETTER A"); # prints "A"
b177ca84 1231
423cee85
JH
1232=head1 DESCRIPTION
1233
da9dec57 1234Pragma C<use charnames> is used to gain access to the names of the
fb121860
KW
1235Unicode characters and named character sequences, and to allow you to define
1236your own character and character sequence names.
1237
1238All forms of the pragma enable use of the following 3 functions:
1239
1240=over
1241
1242=item *
1243
1244L</charnames::string_vianame(I<name>)> for run-time lookup of
1245either a character name or a named character sequence, returning its string
1246representation
1247
1248=item *
1249
1250L</charnames::vianame(I<name>)> for run-time lookup of a
1251character name (but not a named character sequence) to get its ordinal value
1252(code point)
da9dec57 1253
fb121860 1254=item *
da9dec57 1255
fb121860
KW
1256L</charnames::viacode(I<code>)> for run-time lookup of a code point to get its
1257Unicode name.
1258
1259=back
1260
1261All forms other than C<S<"use charnames ();">> also enable the use of
da9dec57 1262C<\N{I<CHARNAME>}> sequences to compile a Unicode character into a
8ebef31d 1263string, based on its name.
da9dec57
KW
1264
1265Note that C<\N{U+I<...>}>, where the I<...> is a hexadecimal number,
1266also inserts a character into a string, but doesn't require the use of
1267this pragma. The character it inserts is the one whose code point
1268(ordinal value) is equal to the number. For example, C<"\N{U+263a}"> is
1269the Unicode (white background, black foreground) smiley face; it doesn't
1270require this pragma, whereas the equivalent, C<"\N{WHITE SMILING FACE}">
1271does.
d9f23c72 1272Also note, C<\N{I<...>}> can mean a regex quantifier instead of a character
8ebef31d
KW
1273name, when the I<...> is a number (or comma separated pair of numbers
1274(see L<perlreref/QUANTIFIERS>)), and is not related to this pragma.
da9dec57 1275
38f4139d
KW
1276The C<charnames> pragma supports arguments C<:full>, C<:loose>, C<:short>,
1277script names and L<customized aliases|/CUSTOM ALIASES>.
1278
1279If C<:full> is present, for expansion of
da9dec57 1280C<\N{I<CHARNAME>}>, the string I<CHARNAME> is first looked up in the list of
38f4139d
KW
1281standard Unicode character names.
1282
1283C<:loose> is a variant of C<:full> which allows I<CHARNAME> to be less
1284precisely specified. Details are in L</LOOSE MATCHES>.
1285
1286If C<:short> is present, and
da9dec57 1287I<CHARNAME> has the form C<I<SCRIPT>:I<CNAME>>, then I<CNAME> is looked up
14aeae98
KW
1288as a letter in script I<SCRIPT>, as described in the next paragraph.
1289Or, if C<use charnames> is used
da9dec57
KW
1290with script name arguments, then for C<\N{I<CHARNAME>}> the name
1291I<CHARNAME> is looked up as a letter in the given scripts (in the
16036bcd
KW
1292specified order). Customized aliases can override these, and are explained in
1293L</CUSTOM ALIASES>.
423cee85 1294
da9dec57 1295For lookup of I<CHARNAME> inside a given script I<SCRIPTNAME>
14aeae98 1296this pragma looks in the table of standard Unicode names for the names
423cee85
JH
1297
1298 SCRIPTNAME CAPITAL LETTER CHARNAME
1299 SCRIPTNAME SMALL LETTER CHARNAME
1300 SCRIPTNAME LETTER CHARNAME
1301
14aeae98 1302If I<CHARNAME> is all lowercase,
daf0d493 1303then the C<CAPITAL> variant is ignored, otherwise the C<SMALL> variant
14aeae98 1304is ignored, and both I<CHARNAME> and I<SCRIPTNAME> are converted to all
38f4139d
KW
1305uppercase for look-up. Other than that, both of them follow L<loose|/LOOSE
1306MATCHES> rules if C<:loose> is also specified; strict otherwise.
daf0d493 1307
da9dec57
KW
1308Note that C<\N{...}> is compile-time; it's a special form of string
1309constant used inside double-quotish strings; this means that you cannot
4e2cda5d 1310use variables inside the C<\N{...}>. If you want similar run-time
fb121860
KW
1311functionality, use
1312L<charnames::string_vianame()|/charnames::string_vianame(I<name>)>.
423cee85 1313
301a3cda 1314For the C0 and C1 control characters (U+0000..U+001F, U+0080..U+009F)
da9dec57
KW
1315there are no official Unicode names but you can use instead the ISO 6429
1316names (LINE FEED, ESCAPE, and so forth, and their abbreviations, LF,
1f31fcd4 1317ESC, ...). In Unicode 3.2 (as of Perl 5.8) some naming changes took
b59ae8bb 1318place, and ISO 6429 was updated, see L</ALIASES>. Since Unicode 6.0, it
d9f23c72
KW
1319is deprecated to use C<BELL>. Instead use C<ALERT> (but C<BEL> will continue
1320to work).
301a3cda 1321
e5432b89
KW
1322If the input name is unknown, C<\N{NAME}> raises a warning and
1323substitutes the Unicode REPLACEMENT CHARACTER (U+FFFD).
1324
8ebef31d
KW
1325For C<\N{NAME}>, it is a fatal error if C<use bytes> is in effect and the
1326input name is that of a character that won't fit into a byte (i.e., whose
1327ordinal is above 255).
e5432b89 1328
da9dec57
KW
1329Otherwise, any string that includes a C<\N{I<charname>}> or
1330C<S<\N{U+I<code point>}>> will automatically have Unicode semantics (see
1331L<perlunicode/Byte and Character Semantics>).
1332
38f4139d
KW
1333=head1 LOOSE MATCHES
1334
1335By specifying C<:loose>, Unicode's L<loose character name
5ef88e32 1336matching|http://www.unicode.org/reports/tr44#Matching_Rules> rules are
38f4139d
KW
1337selected instead of the strict exact match used otherwise.
1338That means that I<CHARNAME> doesn't have to be so precisely specified.
1339Upper/lower case doesn't matter (except with scripts as mentioned above), nor
1340do any underscores, and the only hyphens that matter are those at the
1341beginning or end of a word in the name (with one exception: the hyphen in
1342U+1180 C<HANGUL JUNGSEONG O-E> does matter).
1343Also, blanks not adjacent to hyphens don't matter.
1344The official Unicode names are quite variable as to where they use hyphens
1345versus spaces to separate word-like units, and this option allows you to not
1346have to care as much.
1347The reason non-medial hyphens matter is because of cases like
1348U+0F60 C<TIBETAN LETTER -A> versus U+0F68 C<TIBETAN LETTER A>.
1349The hyphen here is significant, as is the space before it, and so both must be
1350included.
1351
1352C<:loose> slows down look-ups by a factor of 2 to 3 versus
1353C<:full>, but the trade-off may be worth it to you. Each individual look-up
1354takes very little time, and the results are cached, so the speed difference
1355would become a factor only in programs that do look-ups of many different
1356spellings, and probably only when those look-ups are through vianame() and
1357string_vianame(), since C<\N{...}> look-ups are done at compile time.
1358
5ffe0e96 1359=head1 ALIASES
423cee85 1360
14aeae98
KW
1361A few aliases have been defined for convenience; instead of having
1362to use the official names,
423cee85 1363
5ffe0e96
MB
1364 LINE FEED (LF)
1365 FORM FEED (FF)
1366 CARRIAGE RETURN (CR)
1367 NEXT LINE (NEL)
423cee85 1368
e5432b89 1369(yes, with parentheses), one can use
d5448623 1370
5ffe0e96
MB
1371 LINE FEED
1372 FORM FEED
1373 CARRIAGE RETURN
1374 NEXT LINE
1375 LF
1376 FF
1377 CR
1378 NEL
1379
16036bcd
KW
1380All the other standard abbreviations for the controls, such as C<ACK> for
1381C<ACKNOWLEDGE> also can be used.
1382
5ffe0e96
MB
1383One can also use
1384
1385 BYTE ORDER MARK
1386 BOM
1387
16036bcd
KW
1388and these abbreviations
1389
1390 Abbreviation Full Name
1391
1392 CGJ COMBINING GRAPHEME JOINER
1393 FVS1 MONGOLIAN FREE VARIATION SELECTOR ONE
1394 FVS2 MONGOLIAN FREE VARIATION SELECTOR TWO
1395 FVS3 MONGOLIAN FREE VARIATION SELECTOR THREE
1396 LRE LEFT-TO-RIGHT EMBEDDING
1397 LRM LEFT-TO-RIGHT MARK
1398 LRO LEFT-TO-RIGHT OVERRIDE
1399 MMSP MEDIUM MATHEMATICAL SPACE
1400 MVS MONGOLIAN VOWEL SEPARATOR
1401 NBSP NO-BREAK SPACE
1402 NNBSP NARROW NO-BREAK SPACE
1403 PDF POP DIRECTIONAL FORMATTING
1404 RLE RIGHT-TO-LEFT EMBEDDING
1405 RLM RIGHT-TO-LEFT MARK
1406 RLO RIGHT-TO-LEFT OVERRIDE
1407 SHY SOFT HYPHEN
1408 VS1 VARIATION SELECTOR-1
1409 .
1410 .
1411 .
1412 VS256 VARIATION SELECTOR-256
1413 WJ WORD JOINER
1414 ZWJ ZERO WIDTH JOINER
1415 ZWNJ ZERO WIDTH NON-JOINER
1416 ZWSP ZERO WIDTH SPACE
5ffe0e96
MB
1417
1418For backward compatibility one can use the old names for
1419certain C0 and C1 controls
1420
1421 old new
1422
5ffe0e96
MB
1423 FILE SEPARATOR INFORMATION SEPARATOR FOUR
1424 GROUP SEPARATOR INFORMATION SEPARATOR THREE
16036bcd
KW
1425 HORIZONTAL TABULATION CHARACTER TABULATION
1426 HORIZONTAL TABULATION SET CHARACTER TABULATION SET
1427 HORIZONTAL TABULATION WITH JUSTIFICATION CHARACTER TABULATION
1428 WITH JUSTIFICATION
5ffe0e96
MB
1429 PARTIAL LINE DOWN PARTIAL LINE FORWARD
1430 PARTIAL LINE UP PARTIAL LINE BACKWARD
16036bcd
KW
1431 RECORD SEPARATOR INFORMATION SEPARATOR TWO
1432 REVERSE INDEX REVERSE LINE FEED
1433 UNIT SEPARATOR INFORMATION SEPARATOR ONE
1434 VERTICAL TABULATION LINE TABULATION
1435 VERTICAL TABULATION SET LINE TABULATION SET
5ffe0e96
MB
1436
1437but the old names, in addition to giving the character,
1438will also give a warning about being deprecated.
423cee85 1439
16036bcd
KW
1440And finally, certain published variants are usable, including some for
1441controls that have no Unicode names:
1442
1f31fcd4
KW
1443 name character
1444
52fb7278 1445 END OF PROTECTED AREA END OF GUARDED AREA, U+0097
1f31fcd4
KW
1446 HIGH OCTET PRESET U+0081
1447 HOP U+0081
1448 IND U+0084
1449 INDEX U+0084
1450 PAD U+0080
1451 PADDING CHARACTER U+0080
1452 PRIVATE USE 1 PRIVATE USE ONE, U+0091
1453 PRIVATE USE 2 PRIVATE USE TWO, U+0092
1454 SGC U+0099
1455 SINGLE GRAPHIC CHARACTER INTRODUCER U+0099
1456 SINGLE-SHIFT 2 SINGLE SHIFT TWO, U+008E
1457 SINGLE-SHIFT 3 SINGLE SHIFT THREE, U+008F
1458 START OF PROTECTED AREA START OF GUARDED AREA, U+0096
16036bcd 1459
35c0985d
MB
1460=head1 CUSTOM ALIASES
1461
1f31fcd4
KW
1462You can add customized aliases to standard (C<:full>) Unicode naming
1463conventions. The aliases override any standard definitions, so, if
da9dec57
KW
1464you're twisted enough, you can change C<"\N{LATIN CAPITAL LETTER A}"> to
1465mean C<"B">, etc.
55bc7d3c
KW
1466
1467Note that an alias should not be something that is a legal curly
1468brace-enclosed quantifier (see L<perlreref/QUANTIFIERS>). For example
e5432b89
KW
1469C<\N{123}> means to match 123 non-newline characters, and is not treated as a
1470charnames alias. Aliases are discouraged from beginning with anything
1471other than an alphabetic character and from containing anything other
1472than alphanumerics, spaces, dashes, parentheses, and underscores.
1473Currently they must be ASCII.
1474
38f4139d
KW
1475An alias can map to either an official Unicode character name (not a loose
1476matched name) or to a
e5432b89
KW
1477numeric code point (ordinal). The latter is useful for assigning names
1478to code points in Unicode private use areas such as U+E800 through
f12d74c0
KW
1479U+F8FF.
1480A numeric code point must be a non-negative integer or a string beginning
1481with C<"U+"> or C<"0x"> with the remainder considered to be a
1482hexadecimal integer. A literal numeric constant must be unsigned; it
1483will be interpreted as hex if it has a leading zero or contains
1484non-decimal hex digits; otherwise it will be interpreted as decimal.
232cbbee 1485
da9dec57 1486Aliases are added either by the use of anonymous hashes:
35c0985d 1487
da9dec57 1488 use charnames ":alias" => {
35c0985d 1489 e_ACUTE => "LATIN SMALL LETTER E WITH ACUTE",
232cbbee 1490 mychar1 => 0xE8000,
35c0985d
MB
1491 };
1492 my $str = "\N{e_ACUTE}";
1493
da9dec57 1494or by using a file containing aliases:
35c0985d 1495
da9dec57 1496 use charnames ":alias" => "pro";
35c0985d 1497
8ebef31d 1498This will try to read C<"unicore/pro_alias.pl"> from the C<@INC> path. This
da9dec57 1499file should return a list in plain perl:
35c0985d
MB
1500
1501 (
1502 A_GRAVE => "LATIN CAPITAL LETTER A WITH GRAVE",
1503 A_CIRCUM => "LATIN CAPITAL LETTER A WITH CIRCUMFLEX",
1504 A_DIAERES => "LATIN CAPITAL LETTER A WITH DIAERESIS",
1505 A_TILDE => "LATIN CAPITAL LETTER A WITH TILDE",
1506 A_BREVE => "LATIN CAPITAL LETTER A WITH BREVE",
1507 A_RING => "LATIN CAPITAL LETTER A WITH RING ABOVE",
1508 A_MACRON => "LATIN CAPITAL LETTER A WITH MACRON",
f12d74c0 1509 mychar2 => "U+E8001",
35c0985d
MB
1510 );
1511
da9dec57
KW
1512Both these methods insert C<":full"> automatically as the first argument (if no
1513other argument is given), and you can give the C<":full"> explicitly as
1514well, like
35c0985d 1515
da9dec57 1516 use charnames ":full", ":alias" => "pro";
35c0985d 1517
38f4139d
KW
1518C<":loose"> has no effect with these. Input names must match exactly, using
1519C<":full"> rules.
1520
14aeae98 1521Also, both these methods currently allow only single characters to be named.
8ebef31d
KW
1522To name a sequence of characters, use a
1523L<custom translator|/CUSTOM TRANSLATORS> (described below).
1524
da9dec57 1525=head1 charnames::viacode(I<code>)
b177ca84
JF
1526
1527Returns the full name of the character indicated by the numeric code.
da9dec57 1528For example,
b177ca84
JF
1529
1530 print charnames::viacode(0x2722);
1531
1532prints "FOUR TEARDROP-SPOKED ASTERISK".
1533
232cbbee 1534The name returned is the official name for the code point, if
8ebef31d 1535available; otherwise your custom alias for it. This means that your
232cbbee 1536alias will only be returned for code points that don't have an official
14aeae98 1537Unicode name (nor a Unicode version 1 name), such as private use code
232cbbee 1538points, and the 4 control characters U+0080, U+0081, U+0084, and U+0099.
da9dec57
KW
1539If you define more than one name for the code point, it is indeterminate
1540which one will be returned.
1541
1542The function returns C<undef> if no name is known for the code point.
1543In Unicode the proper name of these is the empty string, which
1544C<undef> stringifies to. (If you ask for a code point past the legal
1545Unicode maximum of U+10FFFF that you haven't assigned an alias to, you
f12d74c0
KW
1546get C<undef> plus a warning.)
1547
1548The input number must be a non-negative integer or a string beginning
1549with C<"U+"> or C<"0x"> with the remainder considered to be a
1550hexadecimal integer. A literal numeric constant must be unsigned; it
1551will be interpreted as hex if it has a leading zero or contains
1552non-decimal hex digits; otherwise it will be interpreted as decimal.
daf0d493 1553
d9f23c72 1554Notice that the name returned for U+FEFF is "ZERO WIDTH NO-BREAK
274085e3
PN
1555SPACE", not "BYTE ORDER MARK".
1556
fb121860 1557=head1 charnames::string_vianame(I<name>)
daf0d493 1558
fb121860
KW
1559This is a runtime equivalent to C<\N{...}>. I<name> can be any expression
1560that evaluates to a name accepted by C<\N{...}> under the L<C<:full>
1561option|/DESCRIPTION> to C<charnames>. In addition, any other options for the
38f4139d
KW
1562controlling C<"use charnames"> in the same scope apply, like C<:loose> or any
1563L<script list, C<:short> option|/DESCRIPTION>, or L<custom aliases|/CUSTOM
1564ALIASES> you may have defined.
daf0d493 1565
fb121860
KW
1566The only difference is that if the input name is unknown, C<string_vianame>
1567returns C<undef> instead of the REPLACEMENT CHARACTER and does not raise a
1568warning message.
daf0d493 1569
fb121860
KW
1570=head1 charnames::vianame(I<name>)
1571
1572This is similar to C<string_vianame>. The main difference is that under most
5ef88e32 1573circumstances, C<vianame> returns an ordinal code
fb121860 1574point, whereas C<string_vianame> returns a string. For example,
daf0d493 1575
fb121860 1576 printf "U+%04X", charnames::vianame("FOUR TEARDROP-SPOKED ASTERISK");
b177ca84 1577
fb121860 1578prints "U+2722".
1f31fcd4 1579
fb121860
KW
1580This leads to the other two differences. Since a single code point is
1581returned, the function can't handle named character sequences, as these are
14aeae98
KW
1582composed of multiple characters (it returns C<undef> for these). And, the code
1583point can be that of any
fb121860 1584character, even ones that aren't legal under the C<S<use bytes>> pragma.
b177ca84 1585
5ef88e32
KW
1586See L</BUGS> for the circumstances in which the behavior differs
1587from that described above.
1588
5ffe0e96 1589=head1 CUSTOM TRANSLATORS
52ea3e69 1590
5ffe0e96 1591The mechanism of translation of C<\N{...}> escapes is general and not
5ef88e32 1592hardwired into F<charnames.pm>. A module can install custom
5ffe0e96
MB
1593translations (inside the scope which C<use>s the module) with the
1594following magic incantation:
52ea3e69 1595
5ffe0e96 1596 sub import {
52fb7278
KW
1597 shift;
1598 $^H{charnames} = \&translator;
5ffe0e96 1599 }
52ea3e69 1600
da9dec57 1601Here translator() is a subroutine which takes I<CHARNAME> as an
5ffe0e96 1602argument, and returns text to insert into the string instead of the
5ef88e32
KW
1603C<\N{I<CHARNAME>}> escape.
1604
1605This is the only way you can create a custom named sequence of code points.
1606
1607Since the text to insert should be different
5ffe0e96
MB
1608in C<bytes> mode and out of it, the function should check the current
1609state of the C<bytes> flag, as in:
52ea3e69 1610
52fb7278 1611 use bytes (); # for $bytes::hint_bits
5ffe0e96 1612 sub translator {
52fb7278
KW
1613 if ($^H & $bytes::hint_bits) {
1614 return bytes_translator(@_);
1615 }
1616 else {
1617 return utf8_translator(@_);
1618 }
5ffe0e96 1619 }
52ea3e69 1620
da9dec57 1621See L</CUSTOM ALIASES> above for restrictions on I<CHARNAME>.
f0175764 1622
9e808deb
KW
1623Of course, C<vianame>, C<viacode>, and C<string_vianame> would need to be
1624overridden as well.
1f31fcd4 1625
423cee85
JH
1626=head1 BUGS
1627
14aeae98 1628vianame() normally returns an ordinal code point, but when the input name is of
8ebef31d
KW
1629the form C<U+...>, it returns a chr instead. In this case, if C<use bytes> is
1630in effect and the character won't fit into a byte, it returns C<undef> and
1631raises a warning.
55bc7d3c 1632
16036bcd
KW
1633Names must be ASCII characters only, which means that you are out of luck if
1634you want to create aliases in a language where some or all the characters of
1635the desired aliases are non-ASCII.
bee80e93 1636
f12d74c0
KW
1637Since evaluation of the translation function (see L</CUSTOM
1638TRANSLATORS>) happens in the middle of compilation (of a string
1639literal), the translation function should not do any C<eval>s or
1640C<require>s. This restriction should be lifted (but is low priority) in
1641a future version of Perl.
423cee85
JH
1642
1643=cut
0eacc33e 1644
52fb7278 1645# ex: set ts=8 sts=2 sw=2 et: