| 1 | package charnames; |
| 2 | use strict; |
| 3 | use warnings; |
| 4 | use File::Spec; |
| 5 | our $VERSION = '1.09'; |
| 6 | |
| 7 | use bytes (); # for $bytes::hint_bits |
| 8 | |
# Names that are always recognized, whatever scripts or pragmas were asked
# for.  Each key is the exact, case-sensitive text accepted inside \N{...};
# each value is the code point that text stands for.  Lookups are plain hash
# lookups, so a key must not carry any padding.
my %system_aliases = (
    # Icky 3.2 names with parentheses.
    'LINE FEED'       => 0x0A, # LINE FEED (LF)
    'FORM FEED'       => 0x0C, # FORM FEED (FF)
    'CARRIAGE RETURN' => 0x0D, # CARRIAGE RETURN (CR)
    'NEXT LINE'       => 0x85, # NEXT LINE (NEL)

    # Some variant names from Wikipedia
    'SINGLE-SHIFT 2'          => 0x8E,
    'SINGLE-SHIFT 3'          => 0x8F,
    'PRIVATE USE 1'           => 0x91,
    'PRIVATE USE 2'           => 0x92,
    'START OF PROTECTED AREA' => 0x96,
    'END OF PROTECTED AREA'   => 0x97,

    # Convenience.  Standard abbreviations for the controls
    'NUL' => 0x00, # NULL
    'SOH' => 0x01, # START OF HEADING
    'STX' => 0x02, # START OF TEXT
    'ETX' => 0x03, # END OF TEXT
    'EOT' => 0x04, # END OF TRANSMISSION
    'ENQ' => 0x05, # ENQUIRY
    'ACK' => 0x06, # ACKNOWLEDGE
    'BEL' => 0x07, # BELL
    'BS'  => 0x08, # BACKSPACE
    'HT'  => 0x09, # HORIZONTAL TABULATION
    'LF'  => 0x0A, # LINE FEED (LF)
    'VT'  => 0x0B, # VERTICAL TABULATION
    'FF'  => 0x0C, # FORM FEED (FF)
    'CR'  => 0x0D, # CARRIAGE RETURN (CR)
    'SO'  => 0x0E, # SHIFT OUT
    'SI'  => 0x0F, # SHIFT IN
    'DLE' => 0x10, # DATA LINK ESCAPE
    'DC1' => 0x11, # DEVICE CONTROL ONE
    'DC2' => 0x12, # DEVICE CONTROL TWO
    'DC3' => 0x13, # DEVICE CONTROL THREE
    'DC4' => 0x14, # DEVICE CONTROL FOUR
    'NAK' => 0x15, # NEGATIVE ACKNOWLEDGE
    'SYN' => 0x16, # SYNCHRONOUS IDLE
    'ETB' => 0x17, # END OF TRANSMISSION BLOCK
    'CAN' => 0x18, # CANCEL
    'EOM' => 0x19, # END OF MEDIUM
    'SUB' => 0x1A, # SUBSTITUTE
    'ESC' => 0x1B, # ESCAPE
    'FS'  => 0x1C, # FILE SEPARATOR
    'GS'  => 0x1D, # GROUP SEPARATOR
    'RS'  => 0x1E, # RECORD SEPARATOR
    'US'  => 0x1F, # UNIT SEPARATOR
    'DEL' => 0x7F, # DELETE
    'BPH' => 0x82, # BREAK PERMITTED HERE
    'NBH' => 0x83, # NO BREAK HERE
    'NEL' => 0x85, # NEXT LINE (NEL)
    'SSA' => 0x86, # START OF SELECTED AREA
    'ESA' => 0x87, # END OF SELECTED AREA
    'HTS' => 0x88, # CHARACTER TABULATION SET
    'HTJ' => 0x89, # CHARACTER TABULATION WITH JUSTIFICATION
    'VTS' => 0x8A, # LINE TABULATION SET
    'PLD' => 0x8B, # PARTIAL LINE FORWARD
    'PLU' => 0x8C, # PARTIAL LINE BACKWARD
    'RI'  => 0x8D, # REVERSE LINE FEED
    'SS2' => 0x8E, # SINGLE SHIFT TWO
    'SS3' => 0x8F, # SINGLE SHIFT THREE
    'DCS' => 0x90, # DEVICE CONTROL STRING
    'PU1' => 0x91, # PRIVATE USE ONE
    'PU2' => 0x92, # PRIVATE USE TWO
    'STS' => 0x93, # SET TRANSMIT STATE
    'CCH' => 0x94, # CANCEL CHARACTER
    'MW'  => 0x95, # MESSAGE WAITING
    'SPA' => 0x96, # START OF GUARDED AREA
    'EPA' => 0x97, # END OF GUARDED AREA
    'SOS' => 0x98, # START OF STRING
    'SCI' => 0x9A, # SINGLE CHARACTER INTRODUCER
    'CSI' => 0x9B, # CONTROL SEQUENCE INTRODUCER
    'ST'  => 0x9C, # STRING TERMINATOR
    'OSC' => 0x9D, # OPERATING SYSTEM COMMAND
    'PM'  => 0x9E, # PRIVACY MESSAGE
    'APC' => 0x9F, # APPLICATION PROGRAM COMMAND

    # There are no names for these in the Unicode standard;
    # perhaps should be deprecated, but then again there are
    # no alternative names, so am not deprecating.  And if
    # did, the code would have to change to not recommend an
    # alternative for these.
    'PADDING CHARACTER'                   => 0x80,
    'PAD'                                 => 0x80,
    'HIGH OCTET PRESET'                   => 0x81,
    'HOP'                                 => 0x81,
    'INDEX'                               => 0x84,
    'IND'                                 => 0x84,
    'SINGLE GRAPHIC CHARACTER INTRODUCER' => 0x99,
    'SGC'                                 => 0x99,

    # More convenience.  For further convenience,
    # it is suggested some way of using the NamesList
    # aliases be implemented, but there are ambiguities in
    # NamesList.txt
    'BOM'             => 0xFEFF, # BYTE ORDER MARK
    'BYTE ORDER MARK' => 0xFEFF,
    'CGJ'             => 0x034F, # COMBINING GRAPHEME JOINER
    'FVS1'            => 0x180B, # MONGOLIAN FREE VARIATION SELECTOR ONE
    'FVS2'            => 0x180C, # MONGOLIAN FREE VARIATION SELECTOR TWO
    'FVS3'            => 0x180D, # MONGOLIAN FREE VARIATION SELECTOR THREE
    'LRE'             => 0x202A, # LEFT-TO-RIGHT EMBEDDING
    'LRM'             => 0x200E, # LEFT-TO-RIGHT MARK
    'LRO'             => 0x202D, # LEFT-TO-RIGHT OVERRIDE
    'MMSP'            => 0x205F, # MEDIUM MATHEMATICAL SPACE
    'MVS'             => 0x180E, # MONGOLIAN VOWEL SEPARATOR
    'NBSP'            => 0x00A0, # NO-BREAK SPACE
    'NNBSP'           => 0x202F, # NARROW NO-BREAK SPACE
    'PDF'             => 0x202C, # POP DIRECTIONAL FORMATTING
    'RLE'             => 0x202B, # RIGHT-TO-LEFT EMBEDDING
    'RLM'             => 0x200F, # RIGHT-TO-LEFT MARK
    'RLO'             => 0x202E, # RIGHT-TO-LEFT OVERRIDE
    'SHY'             => 0x00AD, # SOFT HYPHEN

    # VS1 .. VS16 stand for VARIATION SELECTOR-1 .. VARIATION SELECTOR-16,
    # which occupy the contiguous range U+FE00 .. U+FE0F.
    (map { ("VS$_" => 0xFE00 + $_ - 1) } 1 .. 16),

    # VS17 .. VS256 stand for VARIATION SELECTOR-17 .. VARIATION SELECTOR-256,
    # which occupy the contiguous range U+E0100 .. U+E01EF.
    (map { ("VS$_" => 0xE0100 + $_ - 17) } 17 .. 256),

    'WJ'   => 0x2060, # WORD JOINER
    'ZWJ'  => 0x200D, # ZERO WIDTH JOINER
    'ZWNJ' => 0x200C, # ZERO WIDTH NON-JOINER
    'ZWSP' => 0x200B, # ZERO WIDTH SPACE
);
| 384 | |
# Pre-3.2 names, kept for compatibility (only for the first 256 characters).
# charnames() still resolves them, but issues a 'deprecated' warning that
# points at the current name, which is shown in the trailing comment.
# Entries are listed in code point order.
my %deprecated_aliases = (
    'HORIZONTAL TABULATION'                    => 0x09, # CHARACTER TABULATION
    'VERTICAL TABULATION'                      => 0x0B, # LINE TABULATION
    'FILE SEPARATOR'                           => 0x1C, # INFORMATION SEPARATOR FOUR
    'GROUP SEPARATOR'                          => 0x1D, # INFORMATION SEPARATOR THREE
    'RECORD SEPARATOR'                         => 0x1E, # INFORMATION SEPARATOR TWO
    'UNIT SEPARATOR'                           => 0x1F, # INFORMATION SEPARATOR ONE
    'HORIZONTAL TABULATION SET'                => 0x88, # CHARACTER TABULATION SET
    'HORIZONTAL TABULATION WITH JUSTIFICATION' => 0x89, # CHARACTER TABULATION WITH JUSTIFICATION
    'VERTICAL TABULATION SET'                  => 0x8A, # LINE TABULATION SET
    'PARTIAL LINE DOWN'                        => 0x8B, # PARTIAL LINE FORWARD
    'PARTIAL LINE UP'                          => 0x8C, # PARTIAL LINE BACKWARD
    'REVERSE INDEX'                            => 0x8D, # REVERSE LINE FEED
);
| 401 | |
# User defined aliases.  Even more convenient :)
# All three tables start out empty and are filled in by alias().
my %user_name_aliases;      # alias => another character name
my %user_numeric_aliases;   # alias => code point
my %inverse_user_aliases;   # code point as canonical "%04X" hex => alias

# Holds whatever 'do "unicore/Name.pl"' returned, once something has needed it.
my $txt;

# A decimal code point: no sign, no leading zero.
my $decimal_qr = qr/^[1-9]\d*$/;

# A hex code point with an optional U+ or 0x prefix.
# Returns the hex number in $1.
my $hex_qr = qr/^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/;
| 418 | |
# Loads Carp only when it is first needed, then hands the call over to
# Carp::croak.  The goto passes the current @_ along and replaces this
# sub's frame with Carp's.
sub croak
{
    require Carp;
    my $delegate = \&Carp::croak;
    goto &$delegate;
} # croak
| 423 | |
# Loads Carp only when it is first needed, then hands the call over to
# Carp::carp.  The goto passes the current @_ along and replaces this
# sub's frame with Carp's.
sub carp
{
    require Carp;
    my $delegate = \&Carp::carp;
    goto &$delegate;
} # carp
| 428 | |
# Registers user-defined aliases.
#
# Takes either a hash reference or a flat list of alias => target pairs.
# A target that looks like a decimal number, or like a hex number with an
# optional U+ or 0x prefix, is taken to be a code point; it is stored in
# %user_numeric_aliases and the reverse mapping in %inverse_user_aliases.
# Any other target is stored in %user_name_aliases as another name.
sub alias (@)
{
    my $pairs = ref $_[0] ? $_[0] : { @_ };

    for my $alias_name (keys %$pairs) {
        my $target = $pairs->{$alias_name};

        # Work out the code point, if the target is one.  Decimal is tried
        # first, so "10" means ten, not sixteen.
        my $code_point;
        if ($target =~ $decimal_qr) {
            $code_point = $target;
        }
        elsif ($target =~ $hex_qr) {
            $code_point = CORE::hex $1;
        }

        if (! defined $code_point) {
            $user_name_aliases{$alias_name} = $target;
            next;
        }

        $user_numeric_aliases{$alias_name} = $code_point;

        # The inverse table is keyed by the canonical hex form, which is
        # why even a hex target is converted to a number and back.
        $inverse_user_aliases{sprintf("%04X", $code_point)} = $alias_name;
    }
} # alias
| 452 | |
# Loads aliases from a file and registers them through alias().
#
# The argument is either the absolute path of an existing plain file, or a
# bare word made of identifier characters, which selects
# "unicore/<word>_alias.pl".  Anything else croaks.
#
# The file is run with 'do' and must return a list of alias pairs.
# Returns 1 when aliases were registered and 0 when the file returned an
# empty list.  Croaks when the file returned a single undefined value or an
# odd number of elements.
sub alias_file ($)
{
    my ($arg) = @_;

    my $file;
    if (-f $arg && File::Spec->file_name_is_absolute($arg)) {
        $file = $arg;
    }
    elsif ($arg =~ m/^\w+$/) {
        $file = "unicore/${arg}_alias.pl";
    }
    else {
        croak "Charnames alias files can only have identifier characters";
    }

    my @pairs = do $file;
    return 0 unless @pairs;

    croak "$file cannot be used as alias file for charnames"
        if @pairs == 1 && !defined $pairs[0];
    croak "$file did not return a (valid) list of alias pairs"
        if @pairs % 2;

    alias(@pairs);
    return 1;
} # alias_file
| 475 | |
# This is not optimized in any way yet
#
# charnames($name) is the translator that import() installs in
# $^H{charnames}: given a character name, it returns that character.
#
# The name is tried, in order, as:
#   1. a user numeric alias;
#   2. a user name alias, in which case the alias is replaced by its target
#      and the search carries on in the name table below;
#   3. a system alias;
#   4. a deprecated alias, which also issues a 'deprecated' warning.
# If none of those gives a code point, the table loaded from
# unicore/Name.pl is searched: for the exact name when :full is in effect,
# then for "script:name" when :short is in effect, then for the name as a
# letter of each script in $^H{charnames_scripts}.
#
# Returns the character.  An unknown name gives a carp and U+FFFD.  Under
# "use bytes" a code point above 0xFF croaks.
sub charnames
{
    my $name = shift;
    my $ord;
    my $fname;

    # User alias should be checked first or else can't override ours, and if we
    # add any, could conflict with theirs.
    if (exists $user_numeric_aliases{$name}) {
        $ord = $user_numeric_aliases{$name};
        $fname = $name;
    }
    elsif (exists $user_name_aliases{$name}) {
        $name = $user_name_aliases{$name};
    }
    elsif (exists $system_aliases{$name}) {
        $ord = $system_aliases{$name};
        $fname = $name;
    }
    elsif (exists $deprecated_aliases{$name}) {
        require warnings;
        warnings::warnif('deprecated', "Unicode character name \"$name\" is deprecated, use \"" . viacode($deprecated_aliases{$name}) . "\" instead");
        $ord = $deprecated_aliases{$name};
        $fname = $name;
    }

    my @off;

    if (! defined $ord) {
        ## Suck in the code/name list as a big string.
        ## Lines look like:
        ##     "0052\t\tLATIN CAPITAL LETTER R\n"
        $txt = do "unicore/Name.pl" unless $txt;

        ## @off will hold the index into the code/name string of the start and
        ## end of the name as we find it.

        ## If :full, look for the name exactly
        if ($^H{charnames_full} and $txt =~ /\t\t\Q$name\E$/m) {
            @off = ($-[0], $+[0]);
        }

        ## If we didn't get above, and :short allowed, look for the short name.
        ## The short name is like "greek:Sigma"
        unless (@off) {
            if ($^H{charnames_short} and $name =~ /^(.+?):(.+)/s) {
                my ($script, $cname) = ($1, $2);
                my $case = $cname =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL";

                ## The script part comes straight out of the name being
                ## looked up, just like $cname, so it has to be quoted too;
                ## otherwise a stray metacharacter makes the match die with
                ## a pattern compilation error.
                my $script_re = quotemeta uc $script;
                if ($txt =~ m/\t\t$script_re (?:$case )?LETTER \U\Q$cname\E$/m) {
                    @off = ($-[0], $+[0]);
                }
            }
        }

        ## If we still don't have it, check for the name among the loaded
        ## scripts.
        if (not @off) {
            my $case = $name =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL";
            for my $script (@{$^H{charnames_scripts}}) {
                if ($txt =~ m/\t\t$script (?:$case )?LETTER \U\Q$name\E$/m) {
                    @off = ($-[0], $+[0]);
                    last;
                }
            }
        }

        ## If we don't have it by now, give up.
        unless (@off) {
            carp "Unknown charname '$name'";
            return "\x{FFFD}";
        }

        ##
        ## Now know where in the string the name starts.
        ## The code, in hex, is before that.
        ##
        ## The code can be 4-6 characters long, so we've got to sort of
        ## go look for it, just after the newline that comes before $off[0].
        ##
        ## This would be much easier if unicore/Name.pl had info in
        ## a name/code order, instead of code/name order.
        ##
        ## The +1 after the rindex() is to skip past the newline we're finding,
        ## or, if the rindex() fails, to put us to an offset of zero.
        ##
        my $hexstart = rindex($txt, "\n", $off[0]) + 1;

        ## we know where it starts, so turn into number -
        ## the ordinal for the char.
        $ord = CORE::hex substr($txt, $hexstart, $off[0] - $hexstart);
    }

    if ($^H & $bytes::hint_bits) {    # "use bytes" in effect?
        use bytes;
        return chr $ord if $ord <= 255;
        my $hex = sprintf "%04x", $ord;
        if (not defined $fname) {
            ## Not an alias, so the name is the matched text minus the
            ## two leading tabs.
            $fname = substr $txt, $off[0] + 2, $off[1] - $off[0] - 2;
        }
        croak "Character 0x$hex with name '$fname' is above 0xFF";
    }

    no warnings 'utf8'; # allow even illegal characters
    return pack "U", $ord;
} # charnames
| 582 | |
# Handles "use charnames LIST".
#
# Installs charnames() as the translator in $^H{charnames}, then works
# through LIST:
#   ":alias" => HASHREF   registers the aliases in the hash;
#   ":alias" => "x:name"  loads the alias file for "name"; if that succeeds
#                         and nothing else was requested, :full is assumed;
#   ":alias" => OTHER     loads the alias file OTHER;
#   ":full", ":short"     recorded in $^H{charnames_full} / _short;
#   any other ":word"     warned about and skipped;
#   anything else         taken as a script name and stored upper-cased in
#                         $^H{charnames_scripts}.
# When 'utf8' warnings are enabled, every requested script that has no
# letter in the name table is reported.
sub import
{
    shift;    ## the class name is of no interest

    carp("`use charnames' needs explicit imports list") unless @_;
    $^H{charnames} = \&charnames;

    my $promote = 0;
    my @wanted;

  ARG:
    while (my $arg = shift) {
        if ($arg ne ":alias") {
            my $is_special = substr($arg, 0, 1) eq ':';
            if ($is_special and $arg ne ":full" and $arg ne ":short") {
                warn "unsupported special '$arg' in charnames";
                next ARG;
            }
            push @wanted, $arg;
            next ARG;
        }

        ## ":alias" consumes the argument that follows it.
        croak ":alias needs an argument in charnames" unless @_;
        my $alias = shift;

        if (ref $alias) {
            croak "Only HASH reference supported as argument to :alias"
                unless ref $alias eq "HASH";
            alias($alias);
        }
        elsif ($alias =~ m{:(\w+)$}) {
            my $tag = $1;
            croak ":alias cannot use existing pragma :$tag (reversed order?)"
                if $tag eq "full" || $tag eq "short";
            $promote = 1 if alias_file($tag);
        }
        else {
            alias_file($alias);
        }
    }

    @wanted = (":full") if @wanted == 0 && $promote;

    ##
    ## Turn the requests into a set, then peel off the two pragmas;
    ## whatever is left over is a script name.
    ##
    my %requested;
    @requested{@wanted} = (1) x @wanted;

    $^H{charnames_full}    = delete $requested{':full'};
    $^H{charnames_short}   = delete $requested{':short'};
    $^H{charnames_scripts} = [ map { uc } keys %requested ];

    ##
    ## If utf8? warnings are enabled, and some scripts were given,
    ## see if at least we can find one letter of each script.
    ##
    if (warnings::enabled('utf8') && @{ $^H{charnames_scripts} }) {
        $txt ||= do "unicore/Name.pl";

        foreach my $script (@{ $^H{charnames_scripts} }) {
            next if $txt =~ m/\t\t$script (?:CAPITAL |SMALL )?LETTER /;
            warnings::warn('utf8', "No such script: '$script'");
        }
    }
} # import
| 643 | |
# Cache of official names already looked up, keyed by the canonical
# ("%04X") hex form of the code point.  User aliases are deliberately kept
# out of it: alias() can replace them at any time, and a cached copy would
# go stale.
my %viacode;

# viacode($code) returns the name of a code point.
#
# $code is either a decimal number or a hex number with an optional U+ or
# 0x prefix.  The official name from unicore/Name.pl is preferred; failing
# that, the user alias most recently registered for that code point is
# returned.
#
# Returns nothing when the code point has no name.  Also returns nothing,
# after a carp, when called with the wrong number of arguments, with an
# argument that is not a number, or with a code point above U+10FFFF.
sub viacode
{
    if (@_ != 1) {
        carp "charnames::viacode() expects one argument";
        return;
    }

    my $arg = shift;

    # this is derived from Unicode::UCD, where it is nearly the same as the
    # function _getcode(), but it makes sure that even a hex argument has the
    # proper number of leading zeros, which is critical in matching against $txt
    # below
    my $hex;
    if ($arg =~ $decimal_qr) {
        $hex = sprintf "%04X", $arg;
    } elsif ($arg =~ $hex_qr) {
        # Below is the line that differs from the _getcode() source
        $hex = sprintf "%04X", CORE::hex $1;
    } else {
        carp("unexpected arg \"$arg\" to charnames::viacode()");
        return;
    }

    # checking the length first is slightly faster
    if (length($hex) > 5 && CORE::hex($hex) > 0x10FFFF) {
        carp "Unicode characters only allocated up to U+10FFFF (you asked for U+$hex)";
        return;
    }

    return $viacode{$hex} if exists $viacode{$hex};

    $txt = do "unicore/Name.pl" unless $txt;

    # Return the official name, if exists
    if ($txt =~ m/^$hex\t\t(.+)/m) {
        $viacode{$hex} = $1;
        return $1;
    }

    # See if there is a user name for it, before giving up completely.
    # This is looked up afresh on every call so that a later alias() for the
    # same code point is honored.
    return if ! exists $inverse_user_aliases{$hex};

    return $inverse_user_aliases{$hex};
} # viacode
| 692 | |
# Cache of code points already found by vianame(), keyed by name.
my %vianame;

# vianame($name) looks a character up by its name in unicore/Name.pl.
#
# Returns the code point as a number, or nothing when the name is not in
# the table.  Called with anything but exactly one argument, it carps and
# returns an empty list.
#
# NOTE: a name of the form "U+XXXX" is answered with the character itself
# (via chr), not with its code point.  This differs from the name lookup
# path and is left exactly as it is, since callers may rely on it.
sub vianame
{
    unless (@_ == 1) {
        carp "charnames::vianame() expects one name argument";
        return ();
    }

    my ($name) = @_;

    if ($name =~ /^U\+([0-9a-fA-F]+)$/) {
        return chr CORE::hex $1;
    }

    return $vianame{$name} if exists $vianame{$name};

    $txt ||= do "unicore/Name.pl";

    my $found_at = index $txt, "\t\t$name\n";
    return if $found_at < 0;

    # The line holding the name starts right after the previous newline.
    # For a match on the very first line rindex() gives -1, and adding one
    # conveniently lands on offset 0, the start of $txt.
    my $line_start = rindex($txt, "\n", $found_at) + 1;

    # The first six characters of the line may be "0000\t\t", "00A1\t\t",
    # "10300\t", "100000", etc., so removing the TABs leaves just the code.
    my $code = substr $txt, $line_start, 6;
    $code =~ tr/\t//d;

    my $ord = CORE::hex $code;
    $vianame{$name} = $ord;
    return $ord;
} # vianame
| 727 | |
| 728 | |
| 729 | 1; |
| 730 | __END__ |
| 731 | |
| 732 | =head1 NAME |
| 733 | |
| 734 | charnames - define character names for C<\N{named}> string literal escapes |
| 735 | |
| 736 | =head1 SYNOPSIS |
| 737 | |
| 738 | use charnames ':full'; |
| 739 | print "\N{GREEK SMALL LETTER SIGMA} is called sigma.\n"; |
| 740 | |
| 741 | use charnames ':short'; |
| 742 | print "\N{greek:Sigma} is an upper-case sigma.\n"; |
| 743 | |
| 744 | use charnames qw(cyrillic greek); |
| 745 | print "\N{sigma} is Greek sigma, and \N{be} is Cyrillic b.\n"; |
| 746 | |
| 747 | use charnames ":full", ":alias" => { |
| 748 | e_ACUTE => "LATIN SMALL LETTER E WITH ACUTE", |
| 749 | }; |
| 750 | print "\N{e_ACUTE} is a small letter e with an acute.\n"; |
| 751 | |
| 752 | use charnames (); |
| 753 | print charnames::viacode(0x1234); # prints "ETHIOPIC SYLLABLE SEE" |
| 754 | printf "%04X", charnames::vianame("GOTHIC LETTER AHSA"); # prints |
| 755 | # "10330" |
| 756 | |
| 757 | =head1 DESCRIPTION |
| 758 | |
| 759 | Pragma C<use charnames> supports arguments C<:full>, C<:short>, script |
| 760 | names and customized aliases. If C<:full> is present, for expansion of |
| 761 | C<\N{CHARNAME}>, the string C<CHARNAME> is first looked up in the list of |
| 762 | standard Unicode character names. If C<:short> is present, and |
| 763 | C<CHARNAME> has the form C<SCRIPT:CNAME>, then C<CNAME> is looked up |
| 764 | as a letter in script C<SCRIPT>. If pragma C<use charnames> is used |
| 765 | with script name arguments, then for C<\N{CHARNAME}> the name |
| 766 | C<CHARNAME> is looked up as a letter in the given scripts (in the |
| 767 | specified order). Customized aliases can override these, and are explained in |
| 768 | L</CUSTOM ALIASES>. |
| 769 | |
| 770 | For lookup of C<CHARNAME> inside a given script C<SCRIPTNAME> |
| 771 | this pragma looks for the names |
| 772 | |
| 773 | SCRIPTNAME CAPITAL LETTER CHARNAME |
| 774 | SCRIPTNAME SMALL LETTER CHARNAME |
| 775 | SCRIPTNAME LETTER CHARNAME |
| 776 | |
| 777 | in the table of standard Unicode names. If C<CHARNAME> is lowercase, |
| 778 | then the C<CAPITAL> variant is ignored, otherwise the C<SMALL> variant |
| 779 | is ignored. |
| 780 | |
| 781 | Note that C<\N{...}> is compile-time; it's a special form of string |
| 782 | constant used inside double-quoted strings: in other words, you cannot |
| 783 | use variables inside the C<\N{...}>. If you want similar run-time |
| 784 | functionality, use charnames::vianame(). |
| 785 | |
| 786 | For the C0 and C1 control characters (U+0000..U+001F, U+0080..U+009F) |
| 787 | as of Unicode 3.1, there are no official Unicode names but you can use |
| 788 | instead the ISO 6429 names (LINE FEED, ESCAPE, and so forth, and their |
| 789 | abbreviations, LF, ESC, ...). In |
| 790 | Unicode 3.2 (as of Perl 5.8) some naming changes take place, and ISO 6429 |
| 791 | has been updated; see L</ALIASES>. |
| 792 | |
| 793 | Since the Unicode standard uses "U+HHHH", so can you: "\N{U+263a}" |
| 794 | is the Unicode smiley face, or "\N{WHITE SMILING FACE}". |
| 795 | |
| 796 | =head1 ALIASES |
| 797 | |
| 798 | A few aliases have been defined for convenience: instead of having |
| 799 | to use the official names |
| 800 | |
| 801 | LINE FEED (LF) |
| 802 | FORM FEED (FF) |
| 803 | CARRIAGE RETURN (CR) |
| 804 | NEXT LINE (NEL) |
| 805 | |
| 806 | (yes, with parentheses) one can use |
| 807 | |
| 808 | LINE FEED |
| 809 | FORM FEED |
| 810 | CARRIAGE RETURN |
| 811 | NEXT LINE |
| 812 | LF |
| 813 | FF |
| 814 | CR |
| 815 | NEL |
| 816 | |
| 817 | All the other standard abbreviations for the controls, such as C<ACK> for |
| 818 | C<ACKNOWLEDGE>, can also be used. |
| 819 | |
| 820 | One can also use |
| 821 | |
| 822 | BYTE ORDER MARK |
| 823 | BOM |
| 824 | |
| 825 | and these abbreviations |
| 826 | |
| 827 | Abbreviation Full Name |
| 828 | |
| 829 | CGJ COMBINING GRAPHEME JOINER |
| 830 | FVS1 MONGOLIAN FREE VARIATION SELECTOR ONE |
| 831 | FVS2 MONGOLIAN FREE VARIATION SELECTOR TWO |
| 832 | FVS3 MONGOLIAN FREE VARIATION SELECTOR THREE |
| 833 | LRE LEFT-TO-RIGHT EMBEDDING |
| 834 | LRM LEFT-TO-RIGHT MARK |
| 835 | LRO LEFT-TO-RIGHT OVERRIDE |
| 836 | MMSP MEDIUM MATHEMATICAL SPACE |
| 837 | MVS MONGOLIAN VOWEL SEPARATOR |
| 838 | NBSP NO-BREAK SPACE |
| 839 | NNBSP NARROW NO-BREAK SPACE |
| 840 | PDF POP DIRECTIONAL FORMATTING |
| 841 | RLE RIGHT-TO-LEFT EMBEDDING |
| 842 | RLM RIGHT-TO-LEFT MARK |
| 843 | RLO RIGHT-TO-LEFT OVERRIDE |
| 844 | SHY SOFT HYPHEN |
| 845 | VS1 VARIATION SELECTOR-1 |
| 846 | . |
| 847 | . |
| 848 | . |
| 849 | VS256 VARIATION SELECTOR-256 |
| 850 | WJ WORD JOINER |
| 851 | ZWJ ZERO WIDTH JOINER |
| 852 | ZWNJ ZERO WIDTH NON-JOINER |
| 853 | ZWSP ZERO WIDTH SPACE |
| 854 | |
| 855 | For backward compatibility one can use the old names for |
| 856 | certain C0 and C1 controls |
| 857 | |
| 858 | old new |
| 859 | |
| 860 | FILE SEPARATOR INFORMATION SEPARATOR FOUR |
| 861 | GROUP SEPARATOR INFORMATION SEPARATOR THREE |
| 862 | HORIZONTAL TABULATION CHARACTER TABULATION |
| 863 | HORIZONTAL TABULATION SET CHARACTER TABULATION SET |
| 864 | HORIZONTAL TABULATION WITH JUSTIFICATION CHARACTER TABULATION |
| 865 | WITH JUSTIFICATION |
| 866 | PARTIAL LINE DOWN PARTIAL LINE FORWARD |
| 867 | PARTIAL LINE UP PARTIAL LINE BACKWARD |
| 868 | RECORD SEPARATOR INFORMATION SEPARATOR TWO |
| 869 | REVERSE INDEX REVERSE LINE FEED |
| 870 | UNIT SEPARATOR INFORMATION SEPARATOR ONE |
| 871 | VERTICAL TABULATION LINE TABULATION |
| 872 | VERTICAL TABULATION SET LINE TABULATION SET |
| 873 | |
| 874 | but the old names in addition to giving the character |
| 875 | will also give a warning about being deprecated. |
| 876 | |
| 877 | And finally, certain published variants are usable, including some for |
| 878 | controls that have no Unicode names: |
| 879 | |
| 880 | END OF PROTECTED AREA |
| 881 | HIGH OCTET PRESET |
| 882 | HOP |
| 883 | IND |
| 884 | INDEX |
| 885 | PAD |
| 886 | PADDING CHARACTER |
| 887 | PRIVATE USE 1 |
| 888 | PRIVATE USE 2 |
| 889 | SGC |
| 890 | SINGLE GRAPHIC CHARACTER INTRODUCER |
| 891 | SINGLE-SHIFT 2 |
| 892 | SINGLE-SHIFT 3 |
| 893 | START OF PROTECTED AREA |
| 894 | |
| 895 | =head1 CUSTOM ALIASES |
| 896 | |
| 897 | This version of charnames supports three mechanisms of adding local |
| 898 | or customized aliases to standard Unicode naming conventions (:full). |
| 899 | The aliases override any standard definitions, so, if you're twisted enough, |
| 900 | you can change C<"\N{LATIN CAPITAL LETTER A}"> to mean C<"B">, etc. |
| 901 | |
| 902 | Note that an alias should not be something that is a legal curly |
| 903 | brace-enclosed quantifier (see L<perlreref/QUANTIFIERS>). For example |
| 904 | C<\N{123}> means to match 123 non-newline characters, and is not treated as an |
| 905 | alias. Aliases are discouraged from beginning with anything other than an |
| 906 | alphabetic character and from containing anything other than alphanumerics, |
| 907 | spaces, dashes, colons, parentheses, and underscores. Currently they must be |
| 908 | ASCII. |
| 909 | |
| 910 | An alias can map to either an official Unicode character name or numeric |
| 911 | code point (ordinal). The latter is useful for assigning names to code |
| 912 | points in Unicode private use areas such as U+E000 through U+F8FF. The |
| 913 | number must look like an unsigned decimal integer, or a hexadecimal |
| 914 | constant beginning with C<0x>, or C<U+>. |
| 915 | |
| 916 | =head2 Anonymous hashes |
| 917 | |
| 918 | use charnames ":full", ":alias" => { |
| 919 | e_ACUTE => "LATIN SMALL LETTER E WITH ACUTE", |
| 920 | mychar1 => 0xE8000, |
| 921 | }; |
| 922 | my $str = "\N{e_ACUTE}"; |
| 923 | |
| 924 | =head2 Alias file |
| 925 | |
| 926 | use charnames ":full", ":alias" => "pro"; |
| 927 | |
| 928 | will try to read "unicore/pro_alias.pl" from the @INC path. This |
| 929 | file should return a list in plain perl: |
| 930 | |
| 931 | ( |
| 932 | A_GRAVE => "LATIN CAPITAL LETTER A WITH GRAVE", |
| 933 | A_CIRCUM => "LATIN CAPITAL LETTER A WITH CIRCUMFLEX", |
| 934 | A_DIAERES => "LATIN CAPITAL LETTER A WITH DIAERESIS", |
| 935 | A_TILDE => "LATIN CAPITAL LETTER A WITH TILDE", |
| 936 | A_BREVE => "LATIN CAPITAL LETTER A WITH BREVE", |
| 937 | A_RING => "LATIN CAPITAL LETTER A WITH RING ABOVE", |
| 938 | A_MACRON => "LATIN CAPITAL LETTER A WITH MACRON", |
| 939 | mychar2 => "U+E8001", |
| 940 | ); |
| 941 | |
| 942 | =head2 Alias shortcut |
| 943 | |
| 944 | use charnames ":alias" => ":pro"; |
| 945 | |
| 946 | works exactly the same as the alias pairs, only this time, |
| 947 | ":full" is inserted automatically as the first argument (if no |
| 948 | other argument is given). |
| 949 | |
| 950 | =head1 charnames::viacode(code) |
| 951 | |
| 952 | Returns the full name of the character indicated by the numeric code. |
| 953 | The example |
| 954 | |
| 955 | print charnames::viacode(0x2722); |
| 956 | |
| 957 | prints "FOUR TEARDROP-SPOKED ASTERISK". |
| 958 | |
| 959 | Returns undef if no name is known for the code. |
| 960 | |
| 961 | The name returned is the official name for the code point, if |
| 962 | available, otherwise your custom alias for it. This means that your |
| 963 | alias will only be returned for code points that don't have an official |
| 964 | Unicode name (nor Unicode version 1 name), such as private use code |
| 965 | points, and the 4 control characters U+0080, U+0081, U+0084, and U+0099. |
| 966 | |
| 967 | Notice that the name returned for U+FEFF is "ZERO WIDTH NO-BREAK |
| 968 | SPACE", not "BYTE ORDER MARK". |
| 969 | |
| 970 | =head1 charnames::vianame(name) |
| 971 | |
| 972 | Returns the code point indicated by the name. |
| 973 | The example |
| 974 | |
| 975 | printf "%04X", charnames::vianame("FOUR TEARDROP-SPOKED ASTERISK"); |
| 976 | |
| 977 | prints "2722". |
| 978 | |
| 979 | Returns undef if the name is unknown. |
| 980 | |
| 981 | This works only for the standard names, and does not yet apply |
| 982 | to custom translators. |
| 983 | |
| 984 | =head1 CUSTOM TRANSLATORS |
| 985 | |
| 986 | The mechanism of translation of C<\N{...}> escapes is general and not |
| 987 | hardwired into F<charnames.pm>. A module can install custom |
| 988 | translations (inside the scope which C<use>s the module) with the |
| 989 | following magic incantation: |
| 990 | |
| 991 | sub import { |
| 992 | shift; |
| 993 | $^H{charnames} = \&translator; |
| 994 | } |
| 995 | |
| 996 | Here translator() is a subroutine which takes C<CHARNAME> as an |
| 997 | argument, and returns text to insert into the string instead of the |
| 998 | C<\N{CHARNAME}> escape. Since the text to insert should be different |
| 999 | in C<bytes> mode and out of it, the function should check the current |
| 1000 | state of C<bytes>-flag as in: |
| 1001 | |
| 1002 | use bytes (); # for $bytes::hint_bits |
| 1003 | sub translator { |
| 1004 | if ($^H & $bytes::hint_bits) { |
| 1005 | return bytes_translator(@_); |
| 1006 | } |
| 1007 | else { |
| 1008 | return utf8_translator(@_); |
| 1009 | } |
| 1010 | } |
| 1011 | |
| 1012 | See L</CUSTOM ALIASES> above for restrictions on C<CHARNAME>. |
| 1013 | |
| 1014 | =head1 ILLEGAL CHARACTERS |
| 1015 | |
| 1016 | If you ask by name for a character that does not exist, a warning is given and |
| 1017 | the Unicode I<replacement character> "\x{FFFD}" is returned. |
| 1018 | |
| 1019 | If you ask by code (C<charnames::viacode()>) for a character that is |
| 1020 | unassigned, no warning is given and C<undef> is returned. In Unicode |
| 1021 | the proper name of these is the empty string, which C<undef> stringifies |
| 1022 | to. (If you ask for a code point past the legal Unicode maximum of |
| 1023 | U+10FFFF you do get C<undef> and a warning.) |
| 1024 | |
| 1025 | =head1 BUGS |
| 1026 | |
| 1027 | vianame returns a chr if the input name is of the form C<U+...>, and an ord |
| 1028 | otherwise. It is proposed to change this to always return an ord. Send email |
| 1029 | to C<perl5-porters@perl.org> to comment on this proposal. |
| 1030 | |
| 1031 | For nearly all of the Hangul syllable and CJK Unicode characters that have |
| 1032 | their code points as part of their names, none of the functions work. |
| 1033 | |
| 1034 | Names must be ASCII characters only, which means that you are out of luck if |
| 1035 | you want to create aliases in a language where some or all the characters of |
| 1036 | the desired aliases are non-ASCII. |
| 1037 | |
| 1038 | Unicode standard named sequences are not recognized, such as |
| 1039 | C<LATIN CAPITAL LETTER A WITH MACRON AND GRAVE> |
| 1040 | (which should mean C<LATIN CAPITAL LETTER A WITH MACRON> with an additional |
| 1041 | C<COMBINING GRAVE ACCENT>). |
| 1042 | |
| 1043 | Since evaluation of the translation function happens in the middle of |
| 1044 | compilation (of a string literal), the translation function should not |
| 1045 | do any C<eval>s or C<require>s. This restriction should be lifted in |
| 1046 | a future version of Perl. |
| 1047 | |
| 1048 | =cut |