This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
pod nits
[perl5.git] / lib / charnames.pm
CommitLineData
423cee85 1package charnames;
b177ca84
JF
2use strict;
3use warnings;
51cf30b6 4use File::Spec;
889a6fe0 5our $VERSION = '1.27';
a03f0b9f 6use unicore::Name; # mktables-generated algorithmically-defined names
b75c8c73 7
52fb7278 8use bytes (); # for $bytes::hint_bits
123148a1 9use re "/aa"; # Everything in here should be ASCII
423cee85 10
38f4139d
KW
11# Translate between Unicode character names and their code points.
12#
13# The official names with their code points are stored in a table in
14# lib/unicore/Name.pl which is read in as a large string (almost 3/4 Mb in
15# Unicode 6.0). Each code point/name combination is separated by a \n in the
16# string. (Some of the CJK and the Hangul syllable names are determined
a03f0b9f
KW
17# algorithmically via subroutines stored instead in
18# lib/unicore/Name.pm). Because of the large size of this table, it isn't
19# converted into hashes for faster lookup.
38f4139d
KW
20#
21# But, user defined aliases are stored in their own hashes, as are Perl
22# extensions to the official names. These are checked first before looking at
23# the official table.
24#
25# Basically, the table is grepped for the input code point (viacode()) or
26# name (the other functions), and the corresponding value on the same line is
27# returned. The grepping is done by turning the input into a regular
28# expression. Thus, the same table does double duty, used by both name and
29# code point lookup. (If we were to have hashes, we would need two, one for
30# each lookup direction.)
31#
32# For loose name matching, the logical thing would be to have a table
33# with all the ignorable characters squeezed out, and then grep it with the
34# similarly-squeezed input name. (And this is in fact how the lookups are
35# done with the small Perl extension hashes.) But since we need to be able to
36# go from code point to official name, the original table would still need to
37# exist. Due to the large size of the table, it was decided to not read
38# another very large string into memory for a second table. Instead, the
39# regular expression of the input name is modified to have optional spaces and
40# dashes between characters. For example, in strict matching, the regular
41# expression would be:
42# qr/\tDIGIT ONE$/m
43# Under loose matching, the blank would be squeezed out, and the re would be:
44# qr/\tD[- ]?I[- ]?G[- ]?I[- ]?T[- ]?O[- ]?N[- ]?E$/m
45# which matches a blank or dash between any characters in the official table.
46#
47# This is also how script lookup is done. Basically the re looks like
48# qr/ (?:LATIN|GREEK|CYRILLIC) (?:SMALL )?LETTER $name/
49# where $name is the loose or strict regex for the remainder of the name.
50
fb121860
KW
51# The hashes are stored as utf8 strings. This makes it easier to deal with
52# sequences. I (khw) also tried making Name.pl utf8, but it slowed things
53# down by a factor of 7. I then tried making Name.pl store the utf8
54# equivalents but not calling them utf8. That led to similar speed as leaving
55# it alone, but since that is harder for a human to parse, I left it as-is.
56
# Perl's own extensions to the official Unicode names.  Each key is an alias
# and each value is the corresponding character, stored as a utf8 string
# built with pack("U", ...).  The trailing comment on an entry, where present,
# gives the character's official name.
my %system_aliases = (

    # Synonyms for the icky 3.2 names that have parentheses.
    'LINE FEED'             => pack("U", 0x0A), # LINE FEED (LF)
    'FORM FEED'             => pack("U", 0x0C), # FORM FEED (FF)
    'CARRIAGE RETURN'       => pack("U", 0x0D), # CARRIAGE RETURN (CR)
    'NEXT LINE'             => pack("U", 0x85), # NEXT LINE (NEL)

    # Some variant names from Wikipedia
    'SINGLE-SHIFT 2'                => pack("U", 0x8E),
    'SINGLE-SHIFT 3'                => pack("U", 0x8F),
    'PRIVATE USE 1'                 => pack("U", 0x91),
    'PRIVATE USE 2'                 => pack("U", 0x92),
    'START OF PROTECTED AREA'       => pack("U", 0x96),
    'END OF PROTECTED AREA'         => pack("U", 0x97),

    # Convenience.  Standard abbreviations for the controls
    'NUL'           => pack("U", 0x00), # NULL
    'SOH'           => pack("U", 0x01), # START OF HEADING
    'STX'           => pack("U", 0x02), # START OF TEXT
    'ETX'           => pack("U", 0x03), # END OF TEXT
    'EOT'           => pack("U", 0x04), # END OF TRANSMISSION
    'ENQ'           => pack("U", 0x05), # ENQUIRY
    'ACK'           => pack("U", 0x06), # ACKNOWLEDGE
    'BEL'           => pack("U", 0x07), # ALERT; formerly BELL
    'BS'            => pack("U", 0x08), # BACKSPACE
    'HT'            => pack("U", 0x09), # HORIZONTAL TABULATION
    'LF'            => pack("U", 0x0A), # LINE FEED (LF)
    'VT'            => pack("U", 0x0B), # VERTICAL TABULATION
    'FF'            => pack("U", 0x0C), # FORM FEED (FF)
    'CR'            => pack("U", 0x0D), # CARRIAGE RETURN (CR)
    'SO'            => pack("U", 0x0E), # SHIFT OUT
    'SI'            => pack("U", 0x0F), # SHIFT IN
    'DLE'           => pack("U", 0x10), # DATA LINK ESCAPE
    'DC1'           => pack("U", 0x11), # DEVICE CONTROL ONE
    'DC2'           => pack("U", 0x12), # DEVICE CONTROL TWO
    'DC3'           => pack("U", 0x13), # DEVICE CONTROL THREE
    'DC4'           => pack("U", 0x14), # DEVICE CONTROL FOUR
    'NAK'           => pack("U", 0x15), # NEGATIVE ACKNOWLEDGE
    'SYN'           => pack("U", 0x16), # SYNCHRONOUS IDLE
    'ETB'           => pack("U", 0x17), # END OF TRANSMISSION BLOCK
    'CAN'           => pack("U", 0x18), # CANCEL
    'EOM'           => pack("U", 0x19), # END OF MEDIUM
    'SUB'           => pack("U", 0x1A), # SUBSTITUTE
    'ESC'           => pack("U", 0x1B), # ESCAPE
    'FS'            => pack("U", 0x1C), # FILE SEPARATOR
    'GS'            => pack("U", 0x1D), # GROUP SEPARATOR
    'RS'            => pack("U", 0x1E), # RECORD SEPARATOR
    'US'            => pack("U", 0x1F), # UNIT SEPARATOR
    'DEL'           => pack("U", 0x7F), # DELETE
    'BPH'           => pack("U", 0x82), # BREAK PERMITTED HERE
    'NBH'           => pack("U", 0x83), # NO BREAK HERE
    'NEL'           => pack("U", 0x85), # NEXT LINE (NEL)
    'SSA'           => pack("U", 0x86), # START OF SELECTED AREA
    'ESA'           => pack("U", 0x87), # END OF SELECTED AREA
    'HTS'           => pack("U", 0x88), # CHARACTER TABULATION SET
    'HTJ'           => pack("U", 0x89), # CHARACTER TABULATION WITH JUSTIFICATION
    'VTS'           => pack("U", 0x8A), # LINE TABULATION SET
    'PLD'           => pack("U", 0x8B), # PARTIAL LINE FORWARD
    'PLU'           => pack("U", 0x8C), # PARTIAL LINE BACKWARD
    'RI'            => pack("U", 0x8D), # REVERSE LINE FEED
    'SS2'           => pack("U", 0x8E), # SINGLE SHIFT TWO
    'SS3'           => pack("U", 0x8F), # SINGLE SHIFT THREE
    'DCS'           => pack("U", 0x90), # DEVICE CONTROL STRING
    'PU1'           => pack("U", 0x91), # PRIVATE USE ONE
    'PU2'           => pack("U", 0x92), # PRIVATE USE TWO
    'STS'           => pack("U", 0x93), # SET TRANSMIT STATE
    'CCH'           => pack("U", 0x94), # CANCEL CHARACTER
    'MW'            => pack("U", 0x95), # MESSAGE WAITING
    'SPA'           => pack("U", 0x96), # START OF GUARDED AREA
    'EPA'           => pack("U", 0x97), # END OF GUARDED AREA
    'SOS'           => pack("U", 0x98), # START OF STRING
    'SCI'           => pack("U", 0x9A), # SINGLE CHARACTER INTRODUCER
    'CSI'           => pack("U", 0x9B), # CONTROL SEQUENCE INTRODUCER
    'ST'            => pack("U", 0x9C), # STRING TERMINATOR
    'OSC'           => pack("U", 0x9D), # OPERATING SYSTEM COMMAND
    'PM'            => pack("U", 0x9E), # PRIVACY MESSAGE
    'APC'           => pack("U", 0x9F), # APPLICATION PROGRAM COMMAND

    # There are no names for these in the Unicode standard; perhaps should be
    # deprecated, but then again there are no alternative names, so am not
    # deprecating.  And if did, the code would have to change to not recommend
    # an alternative for these.
    'PADDING CHARACTER'                     => pack("U", 0x80),
    'PAD'                                   => pack("U", 0x80),
    'HIGH OCTET PRESET'                     => pack("U", 0x81),
    'HOP'                                   => pack("U", 0x81),
    'INDEX'                                 => pack("U", 0x84),
    'IND'                                   => pack("U", 0x84),
    'SINGLE GRAPHIC CHARACTER INTRODUCER'   => pack("U", 0x99),
    'SGC'                                   => pack("U", 0x99),

    # More convenience.  For further convenience, it is suggested some way of
    # using the NamesList aliases be implemented, but there are ambiguities in
    # NamesList.txt
    'BOM'               => pack("U", 0xFEFF), # BYTE ORDER MARK
    'BYTE ORDER MARK'   => pack("U", 0xFEFF),
    'CGJ'               => pack("U", 0x034F), # COMBINING GRAPHEME JOINER
    'FVS1'              => pack("U", 0x180B), # MONGOLIAN FREE VARIATION SELECTOR ONE
    'FVS2'              => pack("U", 0x180C), # MONGOLIAN FREE VARIATION SELECTOR TWO
    'FVS3'              => pack("U", 0x180D), # MONGOLIAN FREE VARIATION SELECTOR THREE
    'LRE'               => pack("U", 0x202A), # LEFT-TO-RIGHT EMBEDDING
    'LRM'               => pack("U", 0x200E), # LEFT-TO-RIGHT MARK
    'LRO'               => pack("U", 0x202D), # LEFT-TO-RIGHT OVERRIDE
    'MMSP'              => pack("U", 0x205F), # MEDIUM MATHEMATICAL SPACE
    'MVS'               => pack("U", 0x180E), # MONGOLIAN VOWEL SEPARATOR
    'NBSP'              => pack("U", 0x00A0), # NO-BREAK SPACE
    'NNBSP'             => pack("U", 0x202F), # NARROW NO-BREAK SPACE
    'PDF'               => pack("U", 0x202C), # POP DIRECTIONAL FORMATTING
    'RLE'               => pack("U", 0x202B), # RIGHT-TO-LEFT EMBEDDING
    'RLM'               => pack("U", 0x200F), # RIGHT-TO-LEFT MARK
    'RLO'               => pack("U", 0x202E), # RIGHT-TO-LEFT OVERRIDE
    'SHY'               => pack("U", 0x00AD), # SOFT HYPHEN

    # VSn is VARIATION SELECTOR-n.  The selectors live in two contiguous
    # ranges, so the 256 entries are generated rather than spelled out:
    #   VS1  .. VS16    U+FE00  .. U+FE0F
    #   VS17 .. VS256   U+E0100 .. U+E01EF
    # (The parentheses inside each map block make perl parse it as a block
    # rather than as an anonymous hash constructor.)
    (map { ("VS$_" => pack("U", 0xFE00  + $_ - 1 )) }  1 .. 16),
    (map { ("VS$_" => pack("U", 0xE0100 + $_ - 17)) } 17 .. 256),

    'WJ'                => pack("U", 0x2060), # WORD JOINER
    'ZWJ'               => pack("U", 0x200D), # ZERO WIDTH JOINER
    'ZWNJ'              => pack("U", 0x200C), # ZERO WIDTH NON-JOINER
    'ZWSP'              => pack("U", 0x200B), # ZERO WIDTH SPACE
);
52ea3e69 430
38f4139d
KW
# The system aliases whose :full spelling contains blanks or hyphens, keyed
# here by the squeezed spelling that is looked up under :loose matching.
# Values are utf8 strings, as in the other alias hashes.
my %loose_system_aliases = map { ( $_->[0] => pack("U", $_->[1]) ) } (
    [ 'LINEFEED'                            => 0x0A   ],
    [ 'FORMFEED'                            => 0x0C   ],
    [ 'CARRIAGERETURN'                      => 0x0D   ],
    [ 'NEXTLINE'                            => 0x85   ],
    [ 'SINGLESHIFT2'                        => 0x8E   ],
    [ 'SINGLESHIFT3'                        => 0x8F   ],
    [ 'PRIVATEUSE1'                         => 0x91   ],
    [ 'PRIVATEUSE2'                         => 0x92   ],
    [ 'STARTOFPROTECTEDAREA'                => 0x96   ],
    [ 'ENDOFPROTECTEDAREA'                  => 0x97   ],
    [ 'PADDINGCHARACTER'                    => 0x80   ],
    [ 'HIGHOCTETPRESET'                     => 0x81   ],
    [ 'SINGLEGRAPHICCHARACTERINTRODUCER'    => 0x99   ],
    [ 'BYTEORDERMARK'                       => 0xFEFF ],
);
449
# Pre-3.2 compatibility names (only for the first 256 characters).  Use of
# these gives a deprecated message.  The trailing comment on each entry is
# the name that replaced it.
my %deprecated_aliases = map { ( $_->[0] => pack("U", $_->[1]) ) } (
    [ 'HORIZONTAL TABULATION'                       => 0x09 ], # CHARACTER TABULATION
    [ 'VERTICAL TABULATION'                         => 0x0B ], # LINE TABULATION
    [ 'FILE SEPARATOR'                              => 0x1C ], # INFORMATION SEPARATOR FOUR
    [ 'GROUP SEPARATOR'                             => 0x1D ], # INFORMATION SEPARATOR THREE
    [ 'RECORD SEPARATOR'                            => 0x1E ], # INFORMATION SEPARATOR TWO
    [ 'UNIT SEPARATOR'                              => 0x1F ], # INFORMATION SEPARATOR ONE
    [ 'HORIZONTAL TABULATION SET'                   => 0x88 ], # CHARACTER TABULATION SET
    [ 'HORIZONTAL TABULATION WITH JUSTIFICATION'    => 0x89 ], # CHARACTER TABULATION WITH JUSTIFICATION
    [ 'PARTIAL LINE DOWN'                           => 0x8B ], # PARTIAL LINE FORWARD
    [ 'PARTIAL LINE UP'                             => 0x8C ], # PARTIAL LINE BACKWARD
    [ 'VERTICAL TABULATION SET'                     => 0x8A ], # LINE TABULATION SET
    [ 'REVERSE INDEX'                               => 0x8D ], # REVERSE LINE FEED

    # Unicode 6.0 co-opted this for U+1F514, so deprecate it for now.
    [ 'BELL'                                        => 0x07 ],
);
52ea3e69 469
38f4139d
KW
# The multi-word deprecated aliases with their blanks squeezed out, for use
# under :loose matching.  Values are utf8 strings.
my %loose_deprecated_aliases = map { ( $_->[0] => pack("U", $_->[1]) ) } (
    [ 'HORIZONTALTABULATION'                    => 0x09 ],
    [ 'VERTICALTABULATION'                      => 0x0B ],
    [ 'FILESEPARATOR'                           => 0x1C ],
    [ 'GROUPSEPARATOR'                          => 0x1D ],
    [ 'RECORDSEPARATOR'                         => 0x1E ],
    [ 'UNITSEPARATOR'                           => 0x1F ],
    [ 'HORIZONTALTABULATIONSET'                 => 0x88 ],
    [ 'HORIZONTALTABULATIONWITHJUSTIFICATION'   => 0x89 ],
    [ 'PARTIALLINEDOWN'                         => 0x8B ],
    [ 'PARTIALLINEUP'                           => 0x8C ],
    [ 'VERTICALTABULATIONSET'                   => 0x8A ],
    [ 'REVERSEINDEX'                            => 0x8D ],
);
484
# HANGUL JUNGSEONG O-E (U+1180) and HANGUL JUNGSEONG OE (U+116C) differ only
# in a medial hyphen, so :loose matching has to special-case them.  Both are
# held as utf8 strings.
my ($HANGUL_JUNGSEONG_O_E_utf8, $HANGUL_JUNGSEONG_OE_utf8)
    = map { pack("U", $_) } (0x1180, 0x116C);


my $txt;    # The table of official character names
281aa49e 492
84374e30
KW
493my %full_names_cache; # Holds already-looked-up names, so don't have to
494# re-look them up again. The previous versions of charnames had scoping
495# bugs. For example if we use script A in one scope and find and cache
496# what Z resolves to, we can't use that cache in a different scope that
497# uses script B instead of A, as Z might be an entirely different letter
498# there; or there might be different aliases in effect in different
499# scopes, or :short may be in effect or not in effect in different scopes,
500# or various combinations thereof. This was solved in this version
501# mostly by moving things to %^H. But some things couldn't be moved
502# there. One of them was the cache of runtime looked-up names, in part
503# because %^H is read-only at runtime. I (khw) don't know why the cache
504# was run-time only in the previous versions: perhaps oversight; perhaps
505# that compile time looking doesn't happen in a loop so didn't think it
506# was worthwhile; perhaps not wanting to make the cache too large. But
507# I decided to make it compile time as well; this could easily be
508# changed.
509# Anyway, this hash is not scoped, and is added to at runtime. It
510# doesn't have scoping problems because the data in it is restricted to
511# official names, which are always invariant, and we only set it and
512# look at it during :full lookups, so is unaffected by any other
513# scoped options. I put this in to maintain parity with the older
514# version. If desired, a %short_names cache could also be made, as well
515# as one for each script, say in %script_names_cache, with each key
516# being a hash for a script named in a 'use charnames' statement. I
517# decided not to do that for now, just because it's added complication,
518# and because I'm just trying to maintain parity, not extend it.
519
38f4139d
KW
520# Like %full_names_cache, but for use when :loose is in effect. There needs
521# to be two caches because :loose may not be in effect for a scope, and a
522# loose name could inappropriately be returned when only exact matching is
523# called for.
524my %loose_names_cache;
525
281aa49e
KW
# Recognizers for the two numeric spellings of a code point.  They are meant
# to be tried in order: decimal first, then hex.  A leading zero, or any
# character outside [0-9], means the input is not decimal.
my $decimal_qr = qr/^[1-9]\d*$/;

# Hex digits with an optional "U+", "u+", "0x" or "0X" prefix.  On a match,
# $1 holds just the digits, without the prefix.
my $hex_qr = qr/^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/;
423cee85 532
8878f897
T
# Die with a message reported from the caller's perspective.  Carp is only
# loaded when an error is actually raised.
sub croak
{
    require Carp;

    # Hand over our own @_ and stack frame, so Carp sees whoever called us.
    goto &Carp::croak;
} # croak
537
# Warn with a message reported from the caller's perspective.  Carp is only
# loaded when a warning is actually raised.
sub carp
{
    require Carp;

    # Hand over our own @_ and stack frame, so Carp sees whoever called us.
    goto &Carp::carp;
} # carp
542
# Install aliases into %^H.
#
# Arguments: either one hash reference, or a flat list of NAME => VALUE
# pairs.  Every pair is processed, not just the first.
#
# For each pair:
#   * an undefined VALUE is silently skipped;
#   * a VALUE that is a code point, written in decimal or in hex (see
#     $decimal_qr and $hex_qr), is stored as a utf8 string in
#     $^H{charnames_ord_aliases}{NAME}, and NAME is recorded in
#     $^H{charnames_inverse_ords} under the 5-digit upper-case hex form of
#     the code point;
#   * any other VALUE is stored unchanged in $^H{charnames_name_aliases}{NAME}.
sub alias (@) # Set up aliases, one per NAME => VALUE pair
{
    my $alias = ref $_[0] ? $_[0] : { @_ };
    foreach my $name (keys %$alias) {
        my $value = $alias->{$name};
        next unless defined $value;          # Omit if screwed up.

        # Is slightly slower to just after this statement see if it is
        # decimal, since we already know it is after having converted from
        # hex, but makes the code easier to maintain, and is called
        # infrequently, only at compile-time
        if ($value !~ $decimal_qr && $value =~ $hex_qr) {
            $value = CORE::hex $1;
        }

        # $decimal_qr rejects a leading zero, and therefore rejects 0 itself.
        # A value of 0 can only get here via the hex conversion just above
        # (from "0", "0x0", "U+0000", ...), so it is a code point too; without
        # this check it would be wrongly filed as a name alias.
        if ($value =~ $decimal_qr || $value eq '0') {
            no warnings qw(non_unicode surrogate nonchar); # Allow any non-malformed
            $^H{charnames_ord_aliases}{$name} = pack("U", $value);

            # Use a canonical form.
            $^H{charnames_inverse_ords}{sprintf("%05X", $value)} = $name;
        }
        else {
            # XXX validate syntax when deprecation cycle complete. ie. start
            # with an alpha only, etc.
            $^H{charnames_name_aliases}{$name} = $value;
        }
    }
} # alias
571
# Build the message saying that $name cannot be used under 'use bytes'.
#
#   $name  the character or sequence name that was looked up
#   $utf8  the string that the name resolved to
#
# A one-character string is described by its single ordinal; anything else
# is described as a string, with the ordinals of all its characters listed.
# Returns the message text; it does not itself warn or die.
sub not_legal_use_bytes_msg {
    my ($name, $utf8) = @_;

    my $subject;
    if (length($utf8) != 1) {
        my @ordinals = map { sprintf "0x%04X", ord $_ } split(//, $utf8);
        $subject = sprintf("String with name '%s' (and ordinals %s) contains character(s)",
                           $name, join(" ", @ordinals));
    }
    else {
        $subject = sprintf("Character 0x%04x with name '%s' is", ord $utf8, $name);
    }

    return $subject . " above 0xFF with 'use bytes' in effect";
}
583
# Load alias definitions from a file and install them with alias().
#
# $arg is either the absolute path of an existing plain file, which is used
# as-is, or a bare word made only of \w characters, which is expanded to
# "unicore/${arg}_alias.pl".  Anything else croaks.
#
# The file is evaluated with `do` and must return a list of alias pairs.
# Returns 1 once the aliases are installed, and 0 if the `do` produced an
# empty list.  Croaks if the file returned a lone undef or an odd number of
# elements.
sub alias_file ($)  # Reads a file containing alias definitions
{
    my ($arg) = @_;

    my $is_existing_abs_path
        = -f $arg && File::Spec->file_name_is_absolute ($arg);
    my $file = $is_existing_abs_path ? $arg
             : $arg =~ m/^\w+$/      ? "unicore/${arg}_alias.pl"
             :                         undef;
    croak "Charnames alias files can only have identifier characters"
        unless defined $file;

    my @alias = do $file;
    return 0 unless @alias;

    croak "$file cannot be used as alias file for charnames"
        if @alias == 1 && !defined $alias[0];
    croak "$file did not return a (valid) list of alias pairs"
        if @alias % 2;

    alias (@alias);
    return 1;
} # alias_file
606
03f95285
KW
# Stand-in hints used when nothing was imported.  It must carry the same
# fields that import() fills in, so keep the two in sync.
my %dummy_H = (
    # Only full-name lookup is switched on.
    charnames_full              => 1,
    charnames_loose             => 0,
    charnames_short             => 0,

    # No scripts and no stringified alias tables.
    charnames_scripts           => "",
    charnames_stringified_names => "",
    charnames_stringified_ords  => "",
);
617
63098191 618
fb121860
KW
sub lookup_name ($$$) {
    my ($name, $wants_ord, $runtime) = @_;

    # Resolve the character name or named sequence $name.
    #
    #   $name      - the name to look up
    #   $wants_ord - false: return the string the name stands for;
    #                true: return its ordinal, which is only possible when
    #                the name resolves to exactly one character
    #   $runtime   - 0 when called at compile time; otherwise the number of
    #                stack frames to go back to reach the application caller
    #
    # An unknown name gives an empty return and no warning at run time; at
    # compile time it gives the Unicode replacement character (U+FFFD) plus a
    # warning.  Asking for the ordinal of a named sequence warns and returns
    # empty.  A result that cannot be expressed under 'use bytes' warns and
    # returns empty at run time, and croaks at compile time.
    #
    # User aliases are consulted first, then the system aliases, then the
    # caches, and finally the large table of official Unicode names.

    my $utf8;           # The string result
    my $save_input;     # Spelling to report in messages once $name is changed

    if ($runtime) {
        my $hints_ref = (caller($runtime))[10];

        # If nothing was imported (which happens with 'use charnames ()'),
        # substitute a dummy structure.
        if (! defined $hints_ref
            || (! defined $hints_ref->{charnames_full}
                && ! defined $hints_ref->{charnames_loose}))
        {
            $hints_ref = \%dummy_H;
        }

        # At run time the hints arrive stringified, so rebuild the real data
        # structures from the serialized forms and copy the plain fields
        # across.  Any field added here must also be added to %dummy_H.
        %{$^H{charnames_name_aliases}}
            = split(',', $hints_ref->{charnames_stringified_names});
        %{$^H{charnames_ord_aliases}}
            = split(',', $hints_ref->{charnames_stringified_ords});
        for my $field (qw(charnames_scripts charnames_full
                          charnames_loose charnames_short))
        {
            $^H{$field} = $hints_ref->{$field};
        }
    }

    my $loose = $^H{charnames_loose};
    my $lookup_name;    # The input name, adjusted for searching the table

    # User aliases go first; otherwise they could not override the built-in
    # ones, and any built-in added later could clash with them.
    if (exists $^H{charnames_ord_aliases}{$name}) {
        $utf8 = $^H{charnames_ord_aliases}{$name};
    }
    elsif (exists $^H{charnames_name_aliases}{$name}) {
        $name = $^H{charnames_name_aliases}{$name};

        # Remember the result for use in any error message
        $save_input = $lookup_name = $name;

        # Aliases are documented not to match loosely, so switch a loose
        # lookup over to a full one.
        if ($loose) {
            $loose = 0;
            $^H{charnames_full} = 1;
        }
    }
    else {

        # Not a user alias, so loose matching may apply, in which case the
        # name is reduced to its squeezed form.
        $lookup_name = $name;
        if ($loose) {
            $lookup_name = uc $lookup_name;

            # Squeeze out all underscores
            $lookup_name =~ s/_//g;

            # Remove all medial hyphens
            $lookup_name =~ s/ (?<= \S ) - (?= \S )//gx;

            # Squeeze out all spaces
            $lookup_name =~ s/\s//g;
        }

        # Try the system aliases next.  Most are shared between strict and
        # loose matching; the few that differ live in separate hashes that
        # are consulted only for loose matching, after the shared hash has
        # failed to match.
        if (exists $system_aliases{$lookup_name}) {
            $utf8 = $system_aliases{$lookup_name};
        }
        elsif ($loose && exists $loose_system_aliases{$lookup_name}) {
            $utf8 = $loose_system_aliases{$lookup_name};
        }
        elsif (exists $deprecated_aliases{$lookup_name}) {
            $utf8 = $deprecated_aliases{$lookup_name};
            require warnings;
            warnings::warnif('deprecated',
                             "Unicode character name \"$name\" is deprecated, use \""
                             . viacode(ord $utf8)
                             . "\" instead");
        }
        elsif ($loose && exists $loose_deprecated_aliases{$lookup_name}) {
            $utf8 = $loose_deprecated_aliases{$lookup_name};
            require warnings;
            warnings::warnif('deprecated',
                             "Unicode character name \"$name\" is deprecated, use \""
                             . viacode(ord $utf8)
                             . "\" instead");
        }
    }

    my @off;    # Offsets into the table of the start and end of the match

    # If no alias supplied the answer...
    if (! defined $utf8) {

        # ... an earlier look-up of the same input may have
        if (! $loose && $^H{charnames_full} && exists $full_names_cache{$name}) {
            $utf8 = $full_names_cache{$name};
        }
        elsif ($loose && exists $loose_names_cache{$name}) {
            $utf8 = $loose_names_cache{$name};
        }
        else {  # A real look-up is needed

            # Set to the cache to store into, when a full or loose match of
            # the table succeeds
            my $cache_ref;

            # Pull in the code/name list as one big string, with lines like
            #     "00052\tLATIN CAPITAL LETTER R\n"
            # or, for a named sequence,
            #     "0052 0303\tLATIN CAPITAL LETTER R WITH TILDE\n"
            $txt = do "unicore/Name.pl" unless $txt;

            # Names that are determined algorithmically are not in $txt, so
            # they are tried first.  They are not cached either: finding them
            # again is fast, and caching them would cost memory.
            if (($loose || $^H{charnames_full})
                && (defined (my $ord = name_to_code_point_special($lookup_name, $loose))))
            {
                $utf8 = pack("U", $ord);
            }
            else {

                # The name becomes a regex, so quote any metacharacters.
                $lookup_name = quotemeta $lookup_name;

                if ($loose) {

                    # The squeezed name must be able to match its unsqueezed
                    # form in the table, so allow an optional blank or hyphen
                    # after each character except the last, e.g. COLON turns
                    # into /C[- ]?O[- ]?L[- ]?O[- ]?N/.  The hyphens still
                    # present began or ended a word in the input, and by now
                    # look like "\-".
                    $lookup_name =~ s/ (?! \\ -)    # Don't do this to the \- sequence
                                       ( [^-\\] )   # Nor the "-" within that sequence,
                                                    # nor the "\" that quotes metachars,
                                                    # but otherwise put the char into $1
                                       (?=.)        # And don't do it for the final char
                                     /$1\[- \]?/gx; # And add an optional blank or
                                                    # '-' after each $1 char

                    # A surviving hyphen can have a blank before it or after
                    # it, but not both.
                    $lookup_name =~ s/\\ -/(?:- | -)/xg;
                }

                # Search the full table if that was asked for.  On success
                # note where the name was found and which cache to fill.
                if (($loose || $^H{charnames_full}) && $txt =~ /\t$lookup_name$/m) {
                    @off = ($-[0] + 1, $+[0]);    # The 1 is for the tab
                    $cache_ref = ($loose) ? \%loose_names_cache : \%full_names_cache;
                }
                else {

                    # The name was not looked for, or was not found.  If
                    # :short is allowed, see if the input looks like
                    # "greek:Sigma", remembering that $lookup_name has had
                    # its metacharacters quoted.
                    my $scripts_trie = "";
                    if (($^H{charnames_short})
                        && $lookup_name =~ /^ (?: \\ \s)*   # Quoted space
                                              (.+?)         # $1 = the script
                                              (?: \\ \s)*
                                              \\ :          # Quoted colon
                                              (?: \\ \s)*
                                              (.+?)         # $2 = the name
                                              (?: \\ \s)* $
                                            /xs)
                    {
                        # The script has traditionally been case insensitive,
                        # even for non-loose matching
                        $scripts_trie = "\U$1";
                        $lookup_name = $2;

                        # The casing decision below is made on the original
                        # name with its script part removed.
                        $save_input = $name if ! defined $save_input;
                        $name =~ s/.*?://;
                    }
                    else {  # Otherwise look in the allowed scripts
                        $scripts_trie = $^H{charnames_scripts};
                    }

                    # The casing of the input selects which variant to find
                    my $name_has_uppercase = $name =~ /[[:upper:]]/;
                    my $case = $name_has_uppercase ? "CAPITAL" : "SMALL";

                    if (! $scripts_trie
                        || $txt !~
                        /\t (?: $scripts_trie ) \ (?:$case\ )? LETTER \ \U$lookup_name $/xm)
                    {
                        # Still not found; give up.
                        return if $runtime;

                        # $name may have been altered, so recover the input.
                        $name = (defined $save_input) ? $save_input : $_[0];
                        carp("Unknown charname '$name'");
                        return ($wants_ord) ? 0xFFFD : pack("U", 0xFFFD);
                    }

                    # Found in the table.
                    @off = ($-[0] + 1, $+[0]);    # The 1 is for the tab
                }

                # The name's position in the table is now known.  For a single
                # character the name is immediately preceded by a tab and five
                # hex digits, with a \n before those; a named sequence will
                # not have a \n as the 7th preceding character.  (For the
                # very first entry the offset is -1, which picks up the last
                # character of the table; that should also be a \n, so the
                # test still works.)
                if (substr($txt, $off[0] - 7, 1) eq "\n") {
                    $utf8 = pack("U", CORE::hex(substr($txt, $off[0] - 6, 5)));

                    # The one loose-matching special case: two names that
                    # differ only by a medial hyphen.  If the input had a
                    # hyphen (or more) in the right place, it means the
                    # hyphenated one.  This would not work if the name had a
                    # second OE.
                    $utf8 = $HANGUL_JUNGSEONG_O_E_utf8
                        if $loose
                           && $utf8 eq $HANGUL_JUNGSEONG_OE_utf8
                           && $name =~ m/O \s* - [-\s]* E/ix;
                }
                else {

                    # A named sequence.  Its code points start just after the
                    # \n that ends the previous entry; the +1 steps past that
                    # newline, or yields offset zero if rindex() fails.
                    my $charstart = rindex($txt, "\n", $off[0] - 7) + 1;
                    $utf8 = pack("U*", map { CORE::hex($_) }
                                       split " ", substr($txt, $charstart,
                                                         $off[0] - $charstart - 1));
                }
            }

            # Remember the answer so the table need not be searched again,
            # but only for the searches that are cached.  (Script searches
            # are not, because of the scoping issues involved.)
            $cache_ref->{$name} = $utf8 if defined $cache_ref;
        }
    }

$Carp::Internal{ (__PACKAGE__) } = 1;

    # The string is now known.  An ordinal can be returned only for a single
    # character.
    if ($wants_ord) {
        return ord($utf8) if length($utf8) == 1;
    }
    else {

        # A string is wanted.  Return it as is when utf8 is acceptable;
        # otherwise try to convert it to non-utf8 and return that.
        my $hints = ($runtime) ? (caller $runtime)[8] : $^H;
        my $in_bytes = $hints & $bytes::hint_bits;

        # The 1 argument to downgrade() means don't die on failure
        return $utf8 if ! $in_bytes || utf8::downgrade($utf8, 1);
    }

    # Falling through to here is an error: either there are too many
    # characters, or the result has to be non-utf8 and some character needs
    # utf8.  The message prefers the official name over the input one.
    $name = @off                 ? substr($txt, $off[0], $off[1] - $off[0])
          : defined $save_input  ? $save_input
          :                        $_[0];

    if ($wants_ord) {
        # The only way to get here is a result that is too long.  The
        # message assumes that vianame is the only caller needing a
        # single-character result.
        carp("charnames::vianame() doesn't handle named sequences ($name).  Use charnames::string_vianame() instead");
        return;
    }

    # The only other possible failure is from 'use bytes'.
    if ($runtime) {
        carp(not_legal_use_bytes_msg($name, $utf8));
        return;
    }
    croak(not_legal_use_bytes_msg($name, $utf8));

} # lookup_name
955
sub charnames {

    # Handler for \N{...}: looks the name up at compile time and returns the
    # string it stands for.

    my $wants_ord = 0;    # a string is wanted, not an ordinal
    my $runtime   = 0;    # this is compile time

    return lookup_name($_[0], $wants_ord, $runtime);
}
423cee85 965
b177ca84
JF
# Implements 'use charnames LIST'.  Installs the \N{} handler and records in
# %^H the options given in LIST:
#
#   :full, :loose, :short  - matching modes
#   :alias => HASHREF      - aliases defined inline
#   :alias => NAME         - aliases loaded through alias_file()
#   anything else          - a script name to search for short names
#
# An empty LIST draws a warning, as does an unrecognized ':xxx' option.
sub import
{
    shift @_;    # discard the class name

    carp("'use charnames' needs explicit imports list") unless @_;

    # Any field added here must also be added to %dummy_H, and to the code
    # in lookup_name() that copies fields from the run-time structure.
    $^H{charnames} = \&charnames;
    $^H{charnames_ord_aliases} = {};
    $^H{charnames_name_aliases} = {};
    $^H{charnames_inverse_ords} = {};

    # Sort the arguments: aliases are handled on the spot, everything else
    # is collected in @args.
    my $promote = 0;
    my (%h, @args);
    while (my $arg = shift @_) {
        if ($arg eq ":alias") {
            croak(":alias needs an argument in charnames") unless @_;
            my $alias = shift @_;

            if (ref $alias) {
                croak("Only HASH reference supported as argument to :alias")
                    unless ref $alias eq "HASH";
                alias($alias);
            }
            elsif ($alias =~ m{:(\w+)$}) {
                my $suffix = $1;
                croak(":alias cannot use existing pragma :$suffix (reversed order?)")
                    if $suffix eq "full" || $suffix eq "loose" || $suffix eq "short";
                $promote = 1 if alias_file($suffix);
            }
            else {
                alias_file($alias);
            }
            next;
        }

        if (substr($arg, 0, 1) eq ':'
            && $arg ne ":full" && $arg ne ":short" && $arg ne ":loose")
        {
            warn "unsupported special '$arg' in charnames";
            next;
        }

        push @args, $arg;
    }

    # An alias file given in the ':name' form implies :full when nothing
    # else was asked for.
    @args = (":full") if @args == 0 && $promote;
    @h{@args} = (1) x @args;

    # These must not be left undefined, as lookup_name() tests for them
    $^H{charnames_full}  = delete($h{':full'})  || 0;
    $^H{charnames_loose} = delete($h{':loose'}) || 0;
    $^H{charnames_short} = delete($h{':short'}) || 0;

    # Whatever is left over is a script name
    my @scripts = map { uc(quotemeta($_)) } keys %h;

    # If utf8 warnings are enabled and scripts were given, check that at
    # least one letter can be found for each of them.
    if (warnings::enabled('utf8') && @scripts) {
        $txt = do "unicore/Name.pl" unless $txt;

        for my $script (@scripts) {
            if ($txt !~ m/\t$script (?:CAPITAL |SMALL )?LETTER /) {
                warnings::warn('utf8', "No such script: '$script'");
                $script = quotemeta $script;    # Escape it, for use in the re.
            }
        }
    }

    # %^H gets stringified, so serialize the hashes here so that the real
    # data can be extracted again later.
    $^H{charnames_stringified_ords} = join(",", %{$^H{charnames_ord_aliases}});
    $^H{charnames_stringified_names} = join(",", %{$^H{charnames_name_aliases}});
    $^H{charnames_stringified_inverse_ords} = join(",", %{$^H{charnames_inverse_ords}});

    # For loose matching, prepare the script names much as a character name
    # is prepared.  They have already been uppercased and quoted for use in
    # a regex.  Squeeze out underscores, blanks and dashes, then allow an
    # optional blank between any two characters.
    if ($^H{charnames_loose}) {
        for my $script (@scripts) {
            $script =~ s/[_ -]//g;
            $script =~ s/ ( [^\\] ) (?= . ) /$1\\ ?/gx;
        }
    }

    # Store the scripts as a single regex alternation
    $^H{charnames_scripts} = join("|", @scripts);
} # import
423cee85 1057
84374e30
KW
# Names already found by viacode(), keyed by the canonical hex form of the
# code point.  Only official names are stored, and user aliases cannot
# override those, so scoping is not a concern.
my %viacode;

# Returns the name of the code point given as the single argument, which may
# be decimal or hex.  Official names come first; a user-defined alias for the
# code point is used only when there is no official name.  Returns empty
# when no name is found, warning as well if the argument is malformed or the
# code point is above U+10FFFF.
sub viacode {

    unless (@_ == 1) {
        carp("charnames::viacode() expects one argument");
        return;
    }

    my $arg = shift;

    # Put the argument in canonical form: upper case hex, zero-padded to at
    # least five digits, which is what the entries in $txt look like.  The
    # decimal test has to come before the hex one.
    my $hex;
    if ($arg =~ $decimal_qr) {
        $hex = sprintf("%05X", $arg);
    }
    elsif ($arg =~ $hex_qr) {
        $hex = sprintf("%05X", hex($1));
    }
    else {
        carp("unexpected arg \"$arg\" to charnames::viacode()");
        return;
    }

    return $viacode{$hex} if exists $viacode{$hex};

    # There is no point searching the table for a code point above its
    # maximum.  The length test comes first because it is slightly faster.
    if (length($hex) <= 5 || CORE::hex($hex) <= 0x10FFFF) {
        $txt = do "unicore/Name.pl" unless $txt;

        # Names determined algorithmically are not in the table
        my $algorithmic = code_point_to_name_special(CORE::hex($hex));
        if (defined $algorithmic) {
            $viacode{$hex} = $algorithmic;
            return $algorithmic;
        }

        # Otherwise the official name, if any, runs from just past the
        # matched tab up to the next newline.  @+ is used rather than
        # capturing parentheses because the latter are much slower.
        if ($txt =~ m/^$hex\t/m) {
            my $name_start = $+[0];
            $viacode{$hex} = substr($txt, $name_start,
                                    index($txt, "\n", $name_start) - $name_start);
            return $viacode{$hex};
        }
    }

    # No official name, so see if the caller's scope defines one.  Give up
    # if it has no aliases at all.
    my $H_ref = (caller(0))[10];
    return unless defined $H_ref
               && exists $H_ref->{charnames_stringified_inverse_ords};

    my %code_point_aliases
        = split(',', $H_ref->{charnames_stringified_inverse_ords});
    return $code_point_aliases{$hex} if exists $code_point_aliases{$hex};

    carp("Unicode characters only allocated up to U+10FFFF (you asked for U+$hex)")
        if CORE::hex($hex) > 0x10FFFF;
    return;
} # viacode
daf0d493
JH
1134
# Looks up a character name at run time and returns its ordinal, or empty if
# it is not found.
#
# For historical reasons an argument of the form "U+XXXX" is treated
# differently: the character itself is returned rather than its ordinal.
# That cannot be changed without breaking existing callers; new code can use
# string_vianame() instead.
sub vianame
{
    unless (@_ == 1) {
        carp("charnames::vianame() expects one name argument");
        return;
    }

    my $arg = shift;

    my ($hex_digits) = $arg =~ /^U\+([0-9a-fA-F]+)$/;

    # An ordinary name: the first 1 asks for an ordinal; the second says this
    # is run time and that the user called this routine directly.
    return lookup_name($arg, 1, 1) unless defined $hex_digits;

    # The "U+XXXX" form.  The only way it can fail is a character that does
    # not fit in a byte when the caller has 'use bytes' in effect.
    my $ord = CORE::hex($hex_digits);
    if ($ord > 255 && ((caller 0)[8] & $bytes::hint_bits)) {
        carp(not_legal_use_bytes_msg($arg, chr $ord));
        return;
    }
    return chr $ord;
} # vianame
b177ca84 1163
fb121860
KW
# Looks up a character name or named sequence at run time and returns the
# string it stands for, or empty if it is not found.  An argument of the
# form "U+XXXX" yields the character with that code point.
sub string_vianame {

    unless (@_ == 1) {
        carp("charnames::string_vianame() expects one name argument");
        return;
    }

    my $arg = shift;

    if ($arg =~ /^U\+([0-9a-fA-F]+)$/) {
        my $ord = CORE::hex($1);

        # The character is unusable only if it does not fit in a byte and
        # the caller has 'use bytes' in effect.
        my $fits = $ord <= 255 || ! ((caller 0)[8] & $bytes::hint_bits);
        return chr $ord if $fits;

        carp(not_legal_use_bytes_msg($arg, chr $ord));
        return;
    }

    # The 0 asks for a string; the 1 says this is run time and that the user
    # called this routine directly.
    return lookup_name($arg, 0, 1);
} # string_vianame
1189
1190
423cee85
JH
1191
11921;
1193__END__
1194
1195=head1 NAME
1196
fb121860 1197charnames - access to Unicode character names and named character sequences; also define character names
423cee85
JH
1198
1199=head1 SYNOPSIS
1200
bcc08981
KW
1201 use charnames ':full';
1202 print "\N{GREEK SMALL LETTER SIGMA} is called sigma.\n";
1203 print "\N{LATIN CAPITAL LETTER E WITH VERTICAL LINE BELOW}",
1204 " is an officially named sequence of two Unicode characters\n";
1205
38f4139d
KW
1206 use charnames ':loose';
1207 print "\N{Greek small-letter sigma}",
1208 "can be used to ignore case, underscores, most blanks,",
1209 "and when you aren't sure if the official name has hyphens\n";
1210
bcc08981
KW
1211 use charnames ':short';
1212 print "\N{greek:Sigma} is an upper-case sigma.\n";
1213
1214 use charnames qw(cyrillic greek);
1215 print "\N{sigma} is Greek sigma, and \N{be} is Cyrillic b.\n";
1216
1217 use charnames ":full", ":alias" => {
1218 e_ACUTE => "LATIN SMALL LETTER E WITH ACUTE",
1219 mychar => 0xE8000, # Private use area
1220 };
1221 print "\N{e_ACUTE} is a small letter e with an acute.\n";
14aeae98 1222 print "\N{mychar} allows me to name private use characters.\n";
bcc08981
KW
1223
1224 use charnames ();
1225 print charnames::viacode(0x1234); # prints "ETHIOPIC SYLLABLE SEE"
1226 printf "%04X", charnames::vianame("GOTHIC LETTER AHSA"); # prints
1227 # "10330"
1228 print charnames::vianame("LATIN CAPITAL LETTER A"); # prints 65 on
1229 # ASCII platforms;
1230 # 193 on EBCDIC
1231 print charnames::string_vianame("LATIN CAPITAL LETTER A"); # prints "A"
b177ca84 1232
423cee85
JH
1233=head1 DESCRIPTION
1234
da9dec57 1235Pragma C<use charnames> is used to gain access to the names of the
fb121860
KW
1236Unicode characters and named character sequences, and to allow you to define
1237your own character and character sequence names.
1238
1239All forms of the pragma enable use of the following 3 functions:
1240
1241=over
1242
1243=item *
1244
1245L</charnames::string_vianame(I<name>)> for run-time lookup of
1246either a character name or a named character sequence, returning its string
1247representation
1248
1249=item *
1250
1251L</charnames::vianame(I<name>)> for run-time lookup of a
1252character name (but not a named character sequence) to get its ordinal value
1253(code point)
da9dec57 1254
fb121860 1255=item *
da9dec57 1256
fb121860
KW
1257L</charnames::viacode(I<code>)> for run-time lookup of a code point to get its
1258Unicode name.
1259
1260=back
1261
1262All forms other than C<S<"use charnames ();">> also enable the use of
da9dec57 1263C<\N{I<CHARNAME>}> sequences to compile a Unicode character into a
8ebef31d 1264string, based on its name.
da9dec57
KW
1265
1266Note that C<\N{U+I<...>}>, where the I<...> is a hexadecimal number,
1267also inserts a character into a string, but doesn't require the use of
1268this pragma. The character it inserts is the one whose code point
1269(ordinal value) is equal to the number. For example, C<"\N{U+263a}"> is
1270the Unicode (white background, black foreground) smiley face; it doesn't
1271require this pragma, whereas the equivalent, C<"\N{WHITE SMILING FACE}">
1272does.
d9f23c72 1273Also note, C<\N{I<...>}> can mean a regex quantifier instead of a character
8ebef31d
KW
1274name, when the I<...> is a number (or comma separated pair of numbers)
1275(see L<perlreref/QUANTIFIERS>), and is not related to this pragma.
da9dec57 1276
38f4139d
KW
1277The C<charnames> pragma supports arguments C<:full>, C<:loose>, C<:short>,
1278script names and L<customized aliases|/CUSTOM ALIASES>.
1279
1280If C<:full> is present, for expansion of
da9dec57 1281C<\N{I<CHARNAME>}>, the string I<CHARNAME> is first looked up in the list of
38f4139d
KW
1282standard Unicode character names.
1283
1284C<:loose> is a variant of C<:full> which allows I<CHARNAME> to be less
1285precisely specified. Details are in L</LOOSE MATCHES>.
1286
1287If C<:short> is present, and
da9dec57 1288I<CHARNAME> has the form C<I<SCRIPT>:I<CNAME>>, then I<CNAME> is looked up
14aeae98
KW
1289as a letter in script I<SCRIPT>, as described in the next paragraph.
1290Or, if C<use charnames> is used
da9dec57
KW
1291with script name arguments, then for C<\N{I<CHARNAME>}> the name
1292I<CHARNAME> is looked up as a letter in the given scripts (in the
16036bcd
KW
1293specified order). Customized aliases can override these, and are explained in
1294L</CUSTOM ALIASES>.
423cee85 1295
da9dec57 1296For lookup of I<CHARNAME> inside a given script I<SCRIPTNAME>
14aeae98 1297this pragma looks in the table of standard Unicode names for the names
423cee85
JH
1298
1299 SCRIPTNAME CAPITAL LETTER CHARNAME
1300 SCRIPTNAME SMALL LETTER CHARNAME
1301 SCRIPTNAME LETTER CHARNAME
1302
14aeae98 1303If I<CHARNAME> is all lowercase,
daf0d493 1304then the C<CAPITAL> variant is ignored, otherwise the C<SMALL> variant
14aeae98 1305is ignored, and both I<CHARNAME> and I<SCRIPTNAME> are converted to all
38f4139d
KW
1306uppercase for look-up. Other than that, both of them follow L<loose|/LOOSE
1307MATCHES> rules if C<:loose> is also specified; strict otherwise.
daf0d493 1308
da9dec57
KW
1309Note that C<\N{...}> is compile-time; it's a special form of string
1310constant used inside double-quotish strings; this means that you cannot
4e2cda5d 1311use variables inside the C<\N{...}>. If you want similar run-time
fb121860
KW
1312functionality, use
1313L<charnames::string_vianame()|/charnames::string_vianame(I<name>)>.
423cee85 1314
301a3cda 1315For the C0 and C1 control characters (U+0000..U+001F, U+0080..U+009F)
da9dec57
KW
1316there are no official Unicode names but you can use instead the ISO 6429
1317names (LINE FEED, ESCAPE, and so forth, and their abbreviations, LF,
1f31fcd4 1318ESC, ...). In Unicode 3.2 (as of Perl 5.8) some naming changes took
b59ae8bb 1319place, and ISO 6429 was updated, see L</ALIASES>. Since Unicode 6.0, it
d9f23c72
KW
1320is deprecated to use C<BELL>. Instead use C<ALERT> (but C<BEL> will continue
1321to work).
301a3cda 1322
e5432b89
KW
1323If the input name is unknown, C<\N{NAME}> raises a warning and
1324substitutes the Unicode REPLACEMENT CHARACTER (U+FFFD).
1325
8ebef31d
KW
1326For C<\N{NAME}>, it is a fatal error if C<use bytes> is in effect and the
1327input name is that of a character that won't fit into a byte (i.e., whose
1328ordinal is above 255).
e5432b89 1329
da9dec57
KW
1330Otherwise, any string that includes a C<\N{I<charname>}> or
1331C<S<\N{U+I<code point>}>> will automatically have Unicode semantics (see
1332L<perlunicode/Byte and Character Semantics>).
1333
38f4139d
KW
1334=head1 LOOSE MATCHES
1335
1336By specifying C<:loose>, Unicode's L<loose character name
5ef88e32 1337matching|http://www.unicode.org/reports/tr44#Matching_Rules> rules are
38f4139d
KW
1338selected instead of the strict exact match used otherwise.
1339That means that I<CHARNAME> doesn't have to be so precisely specified.
1340Upper/lower case doesn't matter (except with scripts as mentioned above), nor
1341do any underscores, and the only hyphens that matter are those at the
1342beginning or end of a word in the name (with one exception: the hyphen in
1343U+1180 C<HANGUL JUNGSEONG O-E> does matter).
1344Also, blanks not adjacent to hyphens don't matter.
1345The official Unicode names are quite variable as to where they use hyphens
1346versus spaces to separate word-like units, and this option allows you to not
1347have to care as much.
1348The reason non-medial hyphens matter is because of cases like
1349U+0F60 C<TIBETAN LETTER -A> versus U+0F68 C<TIBETAN LETTER A>.
1350The hyphen here is significant, as is the space before it, and so both must be
1351included.
1352
1353C<:loose> slows down look-ups by a factor of 2 to 3 versus
1354C<:full>, but the trade-off may be worth it to you. Each individual look-up
1355takes very little time, and the results are cached, so the speed difference
1356would become a factor only in programs that do look-ups of many different
1357spellings, and probably only when those look-ups are through vianame() and
1358string_vianame(), since C<\N{...}> look-ups are done at compile time.
1359
5ffe0e96 1360=head1 ALIASES
423cee85 1361
14aeae98
KW
1362A few aliases have been defined for convenience; instead of having
1363to use the official names,
423cee85 1364
5ffe0e96
MB
1365 LINE FEED (LF)
1366 FORM FEED (FF)
1367 CARRIAGE RETURN (CR)
1368 NEXT LINE (NEL)
423cee85 1369
e5432b89 1370(yes, with parentheses), one can use
d5448623 1371
5ffe0e96
MB
1372 LINE FEED
1373 FORM FEED
1374 CARRIAGE RETURN
1375 NEXT LINE
1376 LF
1377 FF
1378 CR
1379 NEL
1380
16036bcd
KW
1381All the other standard abbreviations for the controls, such as C<ACK> for
1382C<ACKNOWLEDGE> also can be used.
1383
5ffe0e96
MB
1384One can also use
1385
1386 BYTE ORDER MARK
1387 BOM
1388
16036bcd
KW
1389and these abbreviations
1390
1391 Abbreviation Full Name
1392
1393 CGJ COMBINING GRAPHEME JOINER
1394 FVS1 MONGOLIAN FREE VARIATION SELECTOR ONE
1395 FVS2 MONGOLIAN FREE VARIATION SELECTOR TWO
1396 FVS3 MONGOLIAN FREE VARIATION SELECTOR THREE
1397 LRE LEFT-TO-RIGHT EMBEDDING
1398 LRM LEFT-TO-RIGHT MARK
1399 LRO LEFT-TO-RIGHT OVERRIDE
1400 MMSP MEDIUM MATHEMATICAL SPACE
1401 MVS MONGOLIAN VOWEL SEPARATOR
1402 NBSP NO-BREAK SPACE
1403 NNBSP NARROW NO-BREAK SPACE
1404 PDF POP DIRECTIONAL FORMATTING
1405 RLE RIGHT-TO-LEFT EMBEDDING
1406 RLM RIGHT-TO-LEFT MARK
1407 RLO RIGHT-TO-LEFT OVERRIDE
1408 SHY SOFT HYPHEN
1409 VS1 VARIATION SELECTOR-1
1410 .
1411 .
1412 .
1413 VS256 VARIATION SELECTOR-256
1414 WJ WORD JOINER
1415 ZWJ ZERO WIDTH JOINER
1416 ZWNJ ZERO WIDTH NON-JOINER
1417 ZWSP ZERO WIDTH SPACE
5ffe0e96
MB
1418
1419For backward compatibility one can use the old names for
1420certain C0 and C1 controls
1421
1422 old new
1423
5ffe0e96
MB
1424 FILE SEPARATOR INFORMATION SEPARATOR FOUR
1425 GROUP SEPARATOR INFORMATION SEPARATOR THREE
16036bcd
KW
1426 HORIZONTAL TABULATION CHARACTER TABULATION
1427 HORIZONTAL TABULATION SET CHARACTER TABULATION SET
1428 HORIZONTAL TABULATION WITH JUSTIFICATION CHARACTER TABULATION
1429 WITH JUSTIFICATION
5ffe0e96
MB
1430 PARTIAL LINE DOWN PARTIAL LINE FORWARD
1431 PARTIAL LINE UP PARTIAL LINE BACKWARD
16036bcd
KW
1432 RECORD SEPARATOR INFORMATION SEPARATOR TWO
1433 REVERSE INDEX REVERSE LINE FEED
1434 UNIT SEPARATOR INFORMATION SEPARATOR ONE
1435 VERTICAL TABULATION LINE TABULATION
1436 VERTICAL TABULATION SET LINE TABULATION SET
5ffe0e96
MB
1437
1438but the old names in addition to giving the character
1439will also give a warning about being deprecated.
423cee85 1440
16036bcd
KW
1441And finally, certain published variants are usable, including some for
1442controls that have no Unicode names:
1443
1f31fcd4
KW
1444 name character
1445
52fb7278 1446 END OF PROTECTED AREA END OF GUARDED AREA, U+0097
1f31fcd4
KW
1447 HIGH OCTET PRESET U+0081
1448 HOP U+0081
1449 IND U+0084
1450 INDEX U+0084
1451 PAD U+0080
1452 PADDING CHARACTER U+0080
1453 PRIVATE USE 1 PRIVATE USE ONE, U+0091
1454 PRIVATE USE 2 PRIVATE USE TWO, U+0092
1455 SGC U+0099
1456 SINGLE GRAPHIC CHARACTER INTRODUCER U+0099
1457 SINGLE-SHIFT 2 SINGLE SHIFT TWO, U+008E
1458 SINGLE-SHIFT 3 SINGLE SHIFT THREE, U+008F
1459 START OF PROTECTED AREA START OF GUARDED AREA, U+0096
16036bcd 1460
35c0985d
MB
1461=head1 CUSTOM ALIASES
1462
1f31fcd4
KW
1463You can add customized aliases to standard (C<:full>) Unicode naming
1464conventions. The aliases override any standard definitions, so, if
da9dec57
KW
1465you're twisted enough, you can change C<"\N{LATIN CAPITAL LETTER A}"> to
1466mean C<"B">, etc.
55bc7d3c
KW
1467
1468Note that an alias should not be something that is a legal curly
1469brace-enclosed quantifier (see L<perlreref/QUANTIFIERS>). For example
e5432b89
KW
1470C<\N{123}> means to match 123 non-newline characters, and is not treated as a
1471charnames alias. Aliases are discouraged from beginning with anything
1472other than an alphabetic character and from containing anything other
1473than alphanumerics, spaces, dashes, parentheses, and underscores.
1474Currently they must be ASCII.
1475
38f4139d
KW
1476An alias can map to either an official Unicode character name (not a loose
1477matched name) or to a
e5432b89
KW
1478numeric code point (ordinal). The latter is useful for assigning names
1479to code points in Unicode private use areas such as U+E800 through
f12d74c0
KW
1480U+F8FF.
1481A numeric code point must be a non-negative integer or a string beginning
1482with C<"U+"> or C<"0x"> with the remainder considered to be a
1483hexadecimal integer. A literal numeric constant must be unsigned; it
1484will be interpreted as hex if it has a leading zero or contains
1485non-decimal hex digits; otherwise it will be interpreted as decimal.
232cbbee 1486
da9dec57 1487Aliases are added either by the use of anonymous hashes:
35c0985d 1488
da9dec57 1489 use charnames ":alias" => {
35c0985d 1490 e_ACUTE => "LATIN SMALL LETTER E WITH ACUTE",
232cbbee 1491 mychar1 => 0xE8000,
35c0985d
MB
1492 };
1493 my $str = "\N{e_ACUTE}";
1494
da9dec57 1495or by using a file containing aliases:
35c0985d 1496
da9dec57 1497 use charnames ":alias" => "pro";
35c0985d 1498
8ebef31d 1499This will try to read C<"unicore/pro_alias.pl"> from the C<@INC> path. This
da9dec57 1500file should return a list in plain perl:
35c0985d
MB
1501
1502 (
1503 A_GRAVE => "LATIN CAPITAL LETTER A WITH GRAVE",
1504 A_CIRCUM => "LATIN CAPITAL LETTER A WITH CIRCUMFLEX",
1505 A_DIAERES => "LATIN CAPITAL LETTER A WITH DIAERESIS",
1506 A_TILDE => "LATIN CAPITAL LETTER A WITH TILDE",
1507 A_BREVE => "LATIN CAPITAL LETTER A WITH BREVE",
1508 A_RING => "LATIN CAPITAL LETTER A WITH RING ABOVE",
1509 A_MACRON => "LATIN CAPITAL LETTER A WITH MACRON",
f12d74c0 1510 mychar2 => "U+E8001",
35c0985d
MB
1511 );
1512
da9dec57
KW
1513Both these methods insert C<":full"> automatically as the first argument (if no
1514other argument is given), and you can give the C<":full"> explicitly as
1515well, like
35c0985d 1516
da9dec57 1517 use charnames ":full", ":alias" => "pro";
35c0985d 1518
38f4139d
KW
1519C<":loose"> has no effect with these. Input names must match exactly, using
1520C<":full"> rules.
1521
14aeae98 1522Also, both these methods currently allow only single characters to be named.
8ebef31d
KW
1523To name a sequence of characters, use a
1524L<custom translator|/CUSTOM TRANSLATORS> (described below).
1525
da9dec57 1526=head1 charnames::viacode(I<code>)
b177ca84
JF
1527
1528Returns the full name of the character indicated by the numeric code.
da9dec57 1529For example,
b177ca84
JF
1530
1531 print charnames::viacode(0x2722);
1532
1533prints "FOUR TEARDROP-SPOKED ASTERISK".
1534
232cbbee 1535The name returned is the official name for the code point, if
8ebef31d 1536available; otherwise your custom alias for it. This means that your
232cbbee 1537alias will only be returned for code points that don't have an official
14aeae98 1538Unicode name (nor a Unicode version 1 name), such as private use code
232cbbee 1539points, and the 4 control characters U+0080, U+0081, U+0084, and U+0099.
da9dec57
KW
1540If you define more than one name for the code point, it is indeterminate
1541which one will be returned.
1542
1543The function returns C<undef> if no name is known for the code point.
1544In Unicode the proper name of these is the empty string, which
1545C<undef> stringifies to. (If you ask for a code point past the legal
1546Unicode maximum of U+10FFFF that you haven't assigned an alias to, you
f12d74c0
KW
1547get C<undef> plus a warning.)
1548
1549The input number must be a non-negative integer or a string beginning
1550with C<"U+"> or C<"0x"> with the remainder considered to be a
1551hexadecimal integer. A literal numeric constant must be unsigned; it
1552will be interpreted as hex if it has a leading zero or contains
1553non-decimal hex digits; otherwise it will be interpreted as decimal.
daf0d493 1554
d9f23c72 1555Notice that the name returned for U+FEFF is "ZERO WIDTH NO-BREAK
274085e3
PN
1556SPACE", not "BYTE ORDER MARK".
1557
fb121860 1558=head1 charnames::string_vianame(I<name>)
daf0d493 1559
fb121860
KW
1560This is a runtime equivalent to C<\N{...}>. I<name> can be any expression
1561that evaluates to a name accepted by C<\N{...}> under the L<C<:full>
1562option|/DESCRIPTION> to C<charnames>. In addition, any other options for the
38f4139d
KW
1563controlling C<"use charnames"> in the same scope apply, like C<:loose> or any
1564L<script list, C<:short> option|/DESCRIPTION>, or L<custom aliases|/CUSTOM
1565ALIASES> you may have defined.
daf0d493 1566
fb121860
KW
1567The only difference is that if the input name is unknown, C<string_vianame>
1568returns C<undef> instead of the REPLACEMENT CHARACTER and does not raise a
1569warning message.
daf0d493 1570
fb121860
KW
1571=head1 charnames::vianame(I<name>)
1572
1573This is similar to C<string_vianame>. The main difference is that under most
5ef88e32 1574circumstances, vianame returns an ordinal code
fb121860 1575point, whereas C<string_vianame> returns a string. For example,
daf0d493 1576
fb121860 1577 printf "U+%04X", charnames::vianame("FOUR TEARDROP-SPOKED ASTERISK");
b177ca84 1578
fb121860 1579prints "U+2722".
1f31fcd4 1580
fb121860
KW
1581This leads to the other two differences. Since a single code point is
1582returned, the function can't handle named character sequences, as these are
14aeae98
KW
1583composed of multiple characters (it returns C<undef> for these). And, the code
1584point can be that of any
fb121860 1585character, even ones that aren't legal under the C<S<use bytes>> pragma.
b177ca84 1586
5ef88e32
KW
1587See L</BUGS> for the circumstances in which the behavior differs
1588from that described above.
1589
5ffe0e96 1590=head1 CUSTOM TRANSLATORS
52ea3e69 1591
5ffe0e96 1592The mechanism of translation of C<\N{...}> escapes is general and not
5ef88e32 1593hardwired into F<charnames.pm>. A module can install custom
5ffe0e96
MB
1594translations (inside the scope which C<use>s the module) with the
1595following magic incantation:
52ea3e69 1596
5ffe0e96 1597 sub import {
52fb7278
KW
1598 shift;
1599 $^H{charnames} = \&translator;
5ffe0e96 1600 }
52ea3e69 1601
da9dec57 1602Here translator() is a subroutine which takes I<CHARNAME> as an
5ffe0e96 1603argument, and returns text to insert into the string instead of the
5ef88e32
KW
1604C<\N{I<CHARNAME>}> escape.
1605
1606This is the only way you can create a custom named sequence of code points.
1607
1608Since the text to insert should be different
5ffe0e96
MB
1609in C<bytes> mode and out of it, the function should check the current
1610state of C<bytes>-flag as in:
52ea3e69 1611
52fb7278 1612 use bytes (); # for $bytes::hint_bits
5ffe0e96 1613 sub translator {
52fb7278
KW
1614 if ($^H & $bytes::hint_bits) {
1615 return bytes_translator(@_);
1616 }
1617 else {
1618 return utf8_translator(@_);
1619 }
5ffe0e96 1620 }
52ea3e69 1621
da9dec57 1622See L</CUSTOM ALIASES> above for restrictions on I<CHARNAME>.
f0175764 1623
9e808deb
KW
1624Of course, C<vianame>, C<viacode>, and C<string_vianame> would need to be
1625overridden as well.
1f31fcd4 1626
423cee85
JH
1627=head1 BUGS
1628
14aeae98 1629vianame() normally returns an ordinal code point, but when the input name is of
8ebef31d
KW
1630the form C<U+...>, it returns a chr instead. In this case, if C<use bytes> is
1631in effect and the character won't fit into a byte, it returns C<undef> and
1632raises a warning.
55bc7d3c 1633
16036bcd
KW
1634Names must be ASCII characters only, which means that you are out of luck if
1635you want to create aliases in a language where some or all the characters of
1636the desired aliases are non-ASCII.
bee80e93 1637
f12d74c0
KW
1638Since evaluation of the translation function (see L</CUSTOM
1639TRANSLATORS>) happens in the middle of compilation (of a string
1640literal), the translation function should not do any C<eval>s or
1641C<require>s. This restriction should be lifted (but is low priority) in
1642a future version of Perl.
423cee85
JH
1643
1644=cut
0eacc33e 1645
52fb7278 1646# ex: set ts=8 sts=2 sw=2 et: