Commit | Line | Data |
---|---|---|
7899b636 KW |
1 | =provides |
2 | ||
3 | __UNDEFINED__ | |
4 | utf8_to_uvchr_buf | |
0570adb7 P |
5 | sv_len_utf8 |
6 | sv_len_utf8_nomg | |
7899b636 KW |
7 | |
8 | =implementation | |
9 | ||
10 | #define D_PPP_MIN(a,b) (((a) <= (b)) ? (a) : (b)) | |
11 | ||
12 | __UNDEFINED__ UNICODE_REPLACEMENT 0xFFFD | |
13 | ||
d4cea7f8 | 14 | #ifdef UTF8_MAXLEN |
7899b636 | 15 | __UNDEFINED__ UTF8_MAXBYTES UTF8_MAXLEN |
d4cea7f8 | 16 | #endif |
7899b636 | 17 | |
a807d978 KW |
18 | __UNDEF_NOT_PROVIDED__ UTF_START_MARK(len) \ |
19 | (((len) > 7) ? 0xFF : (0xFF & (0xFE << (7-(len))))) | |
20 | ||
21 | #if 'A' == 65 | |
22 | __UNDEF_NOT_PROVIDED__ UTF_ACCUMULATION_SHIFT 6 | |
23 | #else | |
24 | __UNDEF_NOT_PROVIDED__ UTF_ACCUMULATION_SHIFT 5 | |
25 | #endif | |
26 | ||
27 | #ifdef NATIVE_TO_UTF | |
28 | __UNDEF_NOT_PROVIDED__ NATIVE_UTF8_TO_I8(c) NATIVE_TO_UTF(c) | |
29 | #else /* System doesn't support EBCDIC */ | |
30 | __UNDEF_NOT_PROVIDED__ NATIVE_UTF8_TO_I8(c) (c) | |
31 | #endif | |
32 | ||
33 | #ifdef UTF_TO_NATIVE | |
34 | __UNDEF_NOT_PROVIDED__ I8_TO_NATIVE_UTF8(c) UTF_TO_NATIVE(c) | |
35 | #else /* System doesn't support EBCDIC */ | |
36 | __UNDEF_NOT_PROVIDED__ I8_TO_NATIVE_UTF8(c) (c) | |
37 | #endif | |
38 | ||
39 | __UNDEF_NOT_PROVIDED__ UTF_START_MASK(len) \ | |
40 | (((len) >= 7) ? 0x00 : (0x1F >> ((len)-2))) | |
41 | __UNDEF_NOT_PROVIDED__ UTF_IS_CONTINUATION_MASK \ | |
42 | ((U8) (0xFF << UTF_ACCUMULATION_SHIFT)) | |
43 | __UNDEF_NOT_PROVIDED__ UTF_CONTINUATION_MARK \ | |
44 | (UTF_IS_CONTINUATION_MASK & 0xB0) | |
45 | __UNDEF_NOT_PROVIDED__ UTF_MIN_START_BYTE \ | |
46 | ((UTF_CONTINUATION_MARK >> UTF_ACCUMULATION_SHIFT) | UTF_START_MARK(2)) | |
47 | ||
48 | __UNDEF_NOT_PROVIDED__ UTF_MIN_ABOVE_LATIN1_BYTE \ | |
49 | ((0x100 >> UTF_ACCUMULATION_SHIFT) | UTF_START_MARK(2)) | |
50 | ||
51 | #if { VERSION < 5.007 } /* Was the complement of what should have been */ | |
52 | # undef UTF8_IS_DOWNGRADEABLE_START | |
53 | #endif | |
54 | __UNDEF_NOT_PROVIDED__ UTF8_IS_DOWNGRADEABLE_START(c) \ | |
55 | inRANGE(NATIVE_UTF8_TO_I8(c), \ | |
56 | UTF_MIN_START_BYTE, UTF_MIN_ABOVE_LATIN1_BYTE - 1) | |
57 | __UNDEF_NOT_PROVIDED__ UTF_CONTINUATION_MASK \ | |
58 | ((U8) ((1U << UTF_ACCUMULATION_SHIFT) - 1)) | |
59 | ||
60 | __UNDEF_NOT_PROVIDED__ UTF8_ACCUMULATE(base, added) \ | |
61 | (((base) << UTF_ACCUMULATION_SHIFT) \ | |
62 | | ((NATIVE_UTF8_TO_I8(added)) \ | |
63 | & UTF_CONTINUATION_MASK)) | |
64 | ||
971bc248 KW |
65 | __UNDEF_NOT_PROVIDED__ UTF8_ALLOW_ANYUV 0 |
66 | __UNDEF_NOT_PROVIDED__ UTF8_ALLOW_EMPTY 0x0001 | |
67 | __UNDEF_NOT_PROVIDED__ UTF8_ALLOW_CONTINUATION 0x0002 | |
68 | __UNDEF_NOT_PROVIDED__ UTF8_ALLOW_NON_CONTINUATION 0x0004 | |
69 | __UNDEF_NOT_PROVIDED__ UTF8_ALLOW_SHORT 0x0008 | |
70 | __UNDEF_NOT_PROVIDED__ UTF8_ALLOW_LONG 0x0010 | |
71 | __UNDEF_NOT_PROVIDED__ UTF8_ALLOW_OVERFLOW 0x0080 | |
72 | __UNDEF_NOT_PROVIDED__ UTF8_ALLOW_ANY ( UTF8_ALLOW_CONTINUATION \ | |
73 | |UTF8_ALLOW_NON_CONTINUATION \ | |
74 | |UTF8_ALLOW_SHORT \ | |
75 | |UTF8_ALLOW_LONG \ | |
76 | |UTF8_ALLOW_OVERFLOW) | |
7899b636 KW |
77 | |
78 | #if defined UTF8SKIP | |
79 | ||
80 | /* Don't use official version because it uses MIN, which may not be available */ | |
81 | #undef UTF8_SAFE_SKIP | |
82 | ||
83 | __UNDEFINED__ UTF8_SAFE_SKIP(s, e) ( \ | |
84 | ((((e) - (s)) <= 0) \ | |
85 | ? 0 \ | |
86 | : D_PPP_MIN(((e) - (s)), UTF8SKIP(s)))) | |
be9a67f5 | 87 | |
7f7c413a KW |
88 | __UNDEFINED__ UTF8_CHK_SKIP(s) \ |
89 | (s[0] == '\0' ? 1 : ((U8) D_PPP_MIN(my_strnlen((char *) (s), UTF8SKIP(s)), \ | |
90 | UTF8SKIP(s)))) | |
be9a67f5 | 91 | __UNDEFINED__ UTF8_SKIP(s) UTF8SKIP(s) |
7899b636 KW |
92 | #endif |
93 | ||
e2da5d9e KW |
94 | #if 'A' == 65 |
95 | __UNDEFINED__ UTF8_IS_INVARIANT(c) isASCII(c) | |
96 | #else | |
97 | __UNDEFINED__ UTF8_IS_INVARIANT(c) (isASCII(c) || isCNTRL_L1(c)) | |
98 | #endif | |
99 | ||
100 | __UNDEFINED__ UVCHR_IS_INVARIANT(c) UTF8_IS_INVARIANT(c) | |
101 | ||
08d12710 KW |
102 | #ifdef UVCHR_IS_INVARIANT |
103 | # if 'A' == 65 | |
104 | # define D_PPP_BYTE_INFO_BITS 6 /* 6 bits meaningful in continuation bytes */ | |
105 | # ifdef QUADKIND | |
106 | # define D_PPP_UVCHR_SKIP_UPPER(c) \ | |
107 | (WIDEST_UTYPE) (c) < \ | |
108 | (((WIDEST_UTYPE) 1) << (6 * D_PPP_BYTE_INFO_BITS)) ? 7 : 13 | |
109 | # else | |
110 | # define D_PPP_UVCHR_SKIP_UPPER(c) 7 /* 32 bit platform */ | |
111 | # endif | |
112 | # else | |
113 | # define D_PPP_BYTE_INFO_BITS 5 /* EBCDIC has only 5 meaningful bits */ | |
114 | ||
115 | /* In the releases this is backported to, UTF-EBCDIC had a max of 2**31-1 */ | |
116 | # define D_PPP_UVCHR_SKIP_UPPER(c) 7 | |
117 | # endif | |
118 | ||
119 | __UNDEFINED__ UVCHR_SKIP(c) \ | |
120 | UVCHR_IS_INVARIANT(c) ? 1 : \ | |
121 | (WIDEST_UTYPE) (c) < (32 * (1U << ( D_PPP_BYTE_INFO_BITS))) ? 2 : \ | |
122 | (WIDEST_UTYPE) (c) < (16 * (1U << (2 * D_PPP_BYTE_INFO_BITS))) ? 3 : \ | |
123 | (WIDEST_UTYPE) (c) < ( 8 * (1U << (3 * D_PPP_BYTE_INFO_BITS))) ? 4 : \ | |
124 | (WIDEST_UTYPE) (c) < ( 4 * (1U << (4 * D_PPP_BYTE_INFO_BITS))) ? 5 : \ | |
125 | (WIDEST_UTYPE) (c) < ( 2 * (1U << (5 * D_PPP_BYTE_INFO_BITS))) ? 6 : \ | |
126 | D_PPP_UVCHR_SKIP_UPPER(c) | |
127 | #endif | |
128 | ||
d2b44e2b KW |
129 | #ifdef is_ascii_string |
130 | __UNDEFINED__ is_invariant_string(s,l) is_ascii_string(s,l) | |
131 | __UNDEFINED__ is_utf8_invariant_string(s,l) is_ascii_string(s,l) | |
132 | ||
1242cf1e KW |
133 | /* Hint: is_ascii_string, is_invariant_string |
134 | is_utf8_invariant_string() does the same thing and is preferred because its | |
135 | name is more accurate as to what it does */ | |
d2b44e2b KW |
136 | #endif |
137 | ||
2ff9e5e8 KW |
138 | #ifdef ibcmp_utf8 |
139 | __UNDEFINED__ foldEQ_utf8(s1,pe1,l1,u1,s2,pe2,l2,u2) \ | |
140 | cBOOL(! ibcmp_utf8(s1,pe1,l1,u1,s2,pe2,l2,u2)) | |
141 | #endif | |
142 | ||
7899b636 KW |
143 | #if defined(is_utf8_string) && defined(UTF8SKIP) |
144 | __UNDEFINED__ isUTF8_CHAR(s0, e) ( \ | |
145 | (e) <= (s0) || ! is_utf8_string(s0, D_PPP_MIN(UTF8SKIP(s0), (e) - (s0))) \ | |
146 | ? 0 \ | |
147 | : UTF8SKIP(s0)) | |
148 | #endif | |
149 | ||
8fc7db65 KW |
150 | #if 'A' == 65 |
151 | __UNDEFINED__ BOM_UTF8 "\xEF\xBB\xBF" | |
152 | __UNDEFINED__ REPLACEMENT_CHARACTER_UTF8 "\xEF\xBF\xBD" | |
153 | #elif '^' == 95 | |
154 | __UNDEFINED__ BOM_UTF8 "\xDD\x73\x66\x73" | |
155 | __UNDEFINED__ REPLACEMENT_CHARACTER_UTF8 "\xDD\x73\x73\x71" | |
156 | #elif '^' == 176 | |
157 | __UNDEFINED__ BOM_UTF8 "\xDD\x72\x65\x72" | |
158 | __UNDEFINED__ REPLACEMENT_CHARACTER_UTF8 "\xDD\x72\x72\x70" | |
159 | #else | |
160 | # error Unknown character set | |
161 | #endif | |
162 | ||
96656f64 | 163 | #if { VERSION < 5.31.4 } |
7899b636 KW |
164 | /* Versions prior to this accepted things that are now considered |
165 | * malformations, and didn't return -1 on error with warnings enabled | |
166 | * */ | |
167 | # undef utf8_to_uvchr_buf | |
168 | #endif | |
169 | ||
170 | /* This implementation brings modern, generally more restricted standards to | |
171 | * utf8_to_uvchr_buf. Some of these are security related, and clearly must | |
172 | * be done. But its arguable that the others need not, and hence should not. | |
173 | * The reason they're here is that a module that intends to play with the | |
174 | * latest perls should be able to work the same in all releases. An example is | |
175 | * that perl no longer accepts any UV for a code point, but limits them to | |
176 | * IV_MAX or below. This is for future internal use of the larger code points. | |
177 | * If it turns out that some of these changes are breaking code that isn't | |
178 | * intended to work with modern perls, the tighter restrictions could be | |
179 | * relaxed. khw thinks this is unlikely, but has been wrong in the past. */ | |
180 | ||
181 | /* 5.6.0 is the first release with UTF-8, and we don't implement this function | |
182 | * there due to its likely lack of still being in use, and the underlying | |
183 | * implementation is very different from later ones, without the later | |
184 | * safeguards, so would require extra work to deal with */ | |
185 | #if { VERSION >= 5.6.1 } && ! defined(utf8_to_uvchr_buf) | |
186 | /* Choose which underlying implementation to use. At least one must be | |
187 | * present or the perl is too early to handle this function */ | |
08811920 | 188 | # if defined(utf8n_to_uvchr) || defined(utf8_to_uvchr) || defined(utf8_to_uv) |
7899b636 KW |
189 | # if defined(utf8n_to_uvchr) /* This is the preferred implementation */ |
190 | # define D_PPP_utf8_to_uvchr_buf_callee utf8n_to_uvchr | |
08811920 KW |
191 | # elif /* Must be at least 5.6.1 from #if above; \ |
192 | If have both regular and _simple, regular has all args */ \ | |
193 | defined(utf8_to_uv) && defined(utf8_to_uv_simple) | |
194 | # define D_PPP_utf8_to_uvchr_buf_callee utf8_to_uv | |
195 | # elif defined(utf8_to_uvchr) /* The below won't work well on error input */ | |
196 | # define D_PPP_utf8_to_uvchr_buf_callee(s, curlen, retlen, flags) \ | |
197 | utf8_to_uvchr((U8 *)(s), (retlen)) | |
198 | # else | |
199 | # define D_PPP_utf8_to_uvchr_buf_callee(s, curlen, retlen, flags) \ | |
200 | utf8_to_uv((U8 *)(s), (retlen)) | |
7899b636 KW |
201 | # endif |
202 | # endif | |
203 | ||
204 | # if { NEED utf8_to_uvchr_buf } | |
205 | ||
206 | UV | |
207 | utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen) | |
208 | { | |
209 | UV ret; | |
210 | STRLEN curlen; | |
211 | bool overflows = 0; | |
212 | const U8 *cur_s = s; | |
213 | const bool do_warnings = ckWARN_d(WARN_UTF8); | |
214 | # if { VERSION < 5.26.0 } && ! defined(EBCDIC) | |
215 | STRLEN overflow_length = 0; | |
216 | # endif | |
217 | ||
218 | if (send > s) { | |
219 | curlen = send - s; | |
220 | } | |
221 | else { | |
222 | assert(0); /* Modern perls die under this circumstance */ | |
223 | curlen = 0; | |
224 | if (! do_warnings) { /* Handle empty here if no warnings needed */ | |
225 | if (retlen) *retlen = 0; | |
226 | return UNICODE_REPLACEMENT; | |
227 | } | |
228 | } | |
229 | ||
230 | # if { VERSION < 5.26.0 } && ! defined(EBCDIC) | |
231 | ||
232 | /* Perl did not properly detect overflow for much of its history on | |
233 | * non-EBCDIC platforms, often returning an overlong value which may or may | |
234 | * not have been tolerated in the call. Also, earlier versions, when they | |
235 | * did detect overflow, may have disallowed it completely. Modern ones can | |
236 | * replace it with the REPLACEMENT CHARACTER, depending on calling | |
237 | * parameters. Therefore detect it ourselves in releases it was | |
238 | * problematic in. */ | |
239 | ||
240 | if (curlen > 0 && UNLIKELY(*s >= 0xFE)) { | |
241 | ||
242 | /* First, on a 32-bit machine the first byte being at least \xFE | |
243 | * automatically is overflow, as it indicates something requiring more | |
244 | * than 31 bits */ | |
245 | if (sizeof(ret) < 8) { | |
246 | overflows = 1; | |
247 | overflow_length = 7; | |
248 | } | |
249 | else { | |
250 | const U8 highest[] = /* 2*63-1 */ | |
251 | "\xFF\x80\x87\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"; | |
252 | const U8 *cur_h = highest; | |
253 | ||
254 | for (cur_s = s; cur_s < send; cur_s++, cur_h++) { | |
255 | if (UNLIKELY(*cur_s == *cur_h)) { | |
256 | continue; | |
257 | } | |
258 | ||
259 | /* If this byte is larger than the corresponding highest UTF-8 | |
260 | * byte, the sequence overflows; otherwise the byte is less | |
261 | * than (as we handled the equality case above), and so the | |
262 | * sequence doesn't overflow */ | |
263 | overflows = *cur_s > *cur_h; | |
264 | break; | |
265 | ||
266 | } | |
267 | ||
268 | /* Here, either we set the bool and broke out of the loop, or got | |
269 | * to the end and all bytes are the same which indicates it doesn't | |
270 | * overflow. If it did overflow, it would be this number of bytes | |
271 | * */ | |
272 | overflow_length = 13; | |
273 | } | |
274 | } | |
275 | ||
276 | if (UNLIKELY(overflows)) { | |
277 | ret = 0; | |
278 | ||
279 | if (! do_warnings && retlen) { | |
280 | *retlen = overflow_length; | |
281 | } | |
282 | } | |
283 | else | |
284 | ||
285 | # endif /* < 5.26 */ | |
286 | ||
287 | /* Here, we are either in a release that properly detects overflow, or | |
288 | * we have checked for overflow and the next statement is executing as | |
289 | * part of the above conditional where we know we don't have overflow. | |
290 | * | |
291 | * The modern versions allow anything that evaluates to a legal UV, but | |
292 | * not overlongs nor an empty input */ | |
293 | ret = D_PPP_utf8_to_uvchr_buf_callee( | |
294 | s, curlen, retlen, (UTF8_ALLOW_ANYUV | |
295 | & ~(UTF8_ALLOW_LONG|UTF8_ALLOW_EMPTY))); | |
296 | ||
297 | # if { VERSION >= 5.26.0 } && { VERSION < 5.28.0 } | |
298 | ||
299 | /* But actually, more modern versions restrict the UV to being no more than | |
f807bbfa KW |
300 | * what an IV can hold, so it could still have gotten it wrong about |
301 | * overflowing. */ | |
7899b636 KW |
302 | if (UNLIKELY(ret > IV_MAX)) { |
303 | overflows = 1; | |
304 | } | |
305 | ||
306 | # endif | |
307 | ||
308 | if (UNLIKELY(overflows)) { | |
309 | if (! do_warnings) { | |
310 | if (retlen) { | |
311 | *retlen = D_PPP_MIN(*retlen, UTF8SKIP(s)); | |
312 | *retlen = D_PPP_MIN(*retlen, curlen); | |
313 | } | |
314 | return UNICODE_REPLACEMENT; | |
315 | } | |
316 | else { | |
317 | ||
318 | /* We use the error message in use from 5.8-5.26 */ | |
319 | Perl_warner(aTHX_ packWARN(WARN_UTF8), | |
320 | "Malformed UTF-8 character (overflow at 0x%" UVxf | |
321 | ", byte 0x%02x, after start byte 0x%02x)", | |
322 | ret, *cur_s, *s); | |
323 | if (retlen) { | |
324 | *retlen = (STRLEN) -1; | |
325 | } | |
326 | return 0; | |
327 | } | |
328 | } | |
329 | ||
330 | /* Here, did not overflow, but if it failed for some other reason, and | |
331 | * warnings are off, to emulate the behavior of the real utf8_to_uvchr(), | |
332 | * try again, allowing anything. (Note a return of 0 is ok if the input | |
333 | * was '\0') */ | |
334 | if (UNLIKELY(ret == 0 && (curlen == 0 || *s != '\0'))) { | |
335 | ||
336 | /* If curlen is 0, we already handled the case where warnings are | |
337 | * disabled, so this 'if' will be true, and so later on, we know that | |
338 | * 's' is dereferencible */ | |
339 | if (do_warnings) { | |
340 | *retlen = (STRLEN) -1; | |
341 | } | |
342 | else { | |
343 | ret = D_PPP_utf8_to_uvchr_buf_callee( | |
344 | s, curlen, retlen, UTF8_ALLOW_ANY); | |
345 | /* Override with the REPLACEMENT character, as that is what the | |
346 | * modern version of this function returns */ | |
347 | ret = UNICODE_REPLACEMENT; | |
348 | ||
349 | # if { VERSION < 5.16.0 } | |
350 | ||
351 | /* Versions earlier than this don't necessarily return the proper | |
352 | * length. It should not extend past the end of string, nor past | |
353 | * what the first byte indicates the length is, nor past the | |
354 | * continuation characters */ | |
355 | if (retlen && *retlen >= 0) { | |
356 | unsigned int i = 1; | |
357 | ||
358 | *retlen = D_PPP_MIN(*retlen, curlen); | |
359 | *retlen = D_PPP_MIN(*retlen, UTF8SKIP(s)); | |
360 | do { | |
d747fab5 KW |
361 | # ifdef UTF8_IS_CONTINUATION |
362 | if (! UTF8_IS_CONTINUATION(s[i])) | |
363 | # else /* Versions without the above don't support EBCDIC anyway */ | |
364 | if (s[i] < 0x80 || s[i] > 0xBF) | |
365 | # endif | |
366 | { | |
7899b636 KW |
367 | *retlen = i; |
368 | break; | |
369 | } | |
370 | } while (++i < *retlen); | |
371 | } | |
372 | ||
373 | # endif | |
374 | ||
375 | } | |
376 | } | |
377 | ||
378 | return ret; | |
379 | } | |
380 | ||
381 | # endif | |
382 | #endif | |
383 | ||
384 | #if defined(UTF8SKIP) && defined(utf8_to_uvchr_buf) | |
385 | #undef utf8_to_uvchr /* Always redefine this unsafe function so that it refuses | |
386 | to read past a NUL, making it much less likely to read | |
387 | off the end of the buffer. A NUL indicates the start | |
388 | of the next character anyway. If the input isn't | |
389 | NUL-terminated, the function remains unsafe, as it | |
390 | always has been. */ | |
391 | ||
392 | __UNDEFINED__ utf8_to_uvchr(s, lp) \ | |
393 | ((*(s) == '\0') \ | |
394 | ? utf8_to_uvchr_buf(s,((s)+1), lp) /* Handle single NUL specially */ \ | |
7f7c413a | 395 | : utf8_to_uvchr_buf(s, (s) + UTF8_CHK_SKIP(s), (lp))) |
7899b636 KW |
396 | |
397 | #endif | |
398 | ||
7252d154 KW |
399 | /* Hint: utf8_to_uvchr |
400 | Use utf8_to_uvchr_buf() instead. But ONLY if you KNOW the upper bound | |
401 | of the input string (not resorting to using UTF8SKIP, etc., to infer it). | |
402 | The backported utf8_to_uvchr() will do a better job to prevent most cases | |
403 | of trying to read beyond the end of the buffer */ | |
404 | ||
405 | /* Replace utf8_to_uvchr with utf8_to_uvchr_buf */ | |
406 | ||
0570adb7 P |
407 | #ifdef SV_NOSTEAL |
408 | /* Older Perl versions have broken sv_len_utf8() when passed sv does not have SVf_UTF8 flag set */ | |
409 | /* Also note that SvGETMAGIC() may change presence of SVf_UTF8 flag */ | |
410 | # if { VERSION < 5.17.5 } | |
411 | # undef sv_len_utf8 | |
412 | # if defined(__GNUC__) && !defined(PERL_GCC_BRACE_GROUPS_FORBIDDEN) | |
413 | # define sv_len_utf8_nomg(sv) ({ SV *_sv2 = (sv); (SvUTF8(_sv2) ? Perl_sv_len_utf8(aTHX_ (!SvGMAGICAL(_sv2) ? _sv2 : sv_mortalcopy_flags(_sv2, SV_NOSTEAL))) : ({ STRLEN _len; SvPV_nomg(_sv2, _len); _len; })); }) | |
414 | # define sv_len_utf8(sv) ({ SV *_sv1 = (sv); SvGETMAGIC(_sv1); sv_len_utf8_nomg(_sv1); }) | |
415 | # else | |
416 | # define sv_len_utf8_nomg(sv) (PL_Sv = (sv), (SvUTF8(PL_Sv) ? Perl_sv_len_utf8(aTHX_ (!SvGMAGICAL(PL_Sv) ? PL_Sv : sv_mortalcopy_flags(PL_Sv, SV_NOSTEAL))) : (SvPV_nomg(PL_Sv, PL_na), PL_na))) | |
417 | # define sv_len_utf8(sv) (PL_Sv = (sv), SvGETMAGIC(PL_Sv), sv_len_utf8_nomg(PL_Sv)) | |
418 | # endif | |
419 | # endif | |
420 | # if defined(__GNUC__) && !defined(PERL_GCC_BRACE_GROUPS_FORBIDDEN) | |
421 | __UNDEFINED__ sv_len_utf8_nomg(sv) ({ SV *_sv = (sv); sv_len_utf8(!SvGMAGICAL(_sv) ? _sv : sv_mortalcopy_flags(_sv, SV_NOSTEAL)); }) | |
422 | # else | |
423 | __UNDEFINED__ sv_len_utf8_nomg(sv) ((PL_Sv = (sv)), sv_len_utf8(!SvGMAGICAL(PL_Sv) ? PL_Sv : sv_mortalcopy_flags(PL_Sv, SV_NOSTEAL))) | |
424 | # endif | |
425 | #endif | |
426 | ||
7899b636 KW |
427 | =xsinit |
428 | ||
429 | #define NEED_utf8_to_uvchr_buf | |
430 | ||
431 | =xsubs | |
432 | ||
a807d978 KW |
433 | #if { VERSION >= 5.006 } /* This is just a helper fcn, not publicized */ \ |
434 | /* as being available and params not what the */ \ | |
435 | /* API function has; works on EBCDIC too */ | |
436 | ||
437 | SV * | |
438 | uvoffuni_to_utf8(uni) | |
439 | ||
440 | UV uni | |
441 | PREINIT: | |
442 | int len; | |
443 | U8 string[UTF8_MAXBYTES+1]; | |
444 | int i; | |
445 | UV native; | |
446 | CODE: | |
447 | native = UNI_TO_NATIVE(uni); | |
448 | ||
449 | len = UVCHR_SKIP(native); | |
450 | ||
451 | for (i = 0; i < len; i++) { | |
452 | string[i] = '\0'; | |
453 | } | |
454 | ||
455 | if (len <= 1) { | |
456 | string[0] = native; | |
457 | } | |
458 | else { | |
459 | i = len; | |
460 | while (i-- > 1) { | |
461 | string[i] = I8_TO_NATIVE_UTF8((uni & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK); | |
462 | uni >>= UTF_ACCUMULATION_SHIFT; | |
463 | } | |
464 | string[0] = I8_TO_NATIVE_UTF8((uni & UTF_START_MASK(len)) | UTF_START_MARK(len)); | |
465 | } | |
466 | ||
467 | RETVAL = newSVpvn((char *) string, len); | |
468 | SvUTF8_on(RETVAL); | |
469 | OUTPUT: | |
470 | RETVAL | |
471 | ||
472 | #endif | |
7899b636 KW |
473 | #if defined(UTF8_SAFE_SKIP) && defined(UTF8SKIP) |
474 | ||
475 | STRLEN | |
476 | UTF8_SAFE_SKIP(s, adjustment) | |
477 | char * s | |
478 | int adjustment | |
479 | PREINIT: | |
480 | const char *const_s; | |
481 | CODE: | |
482 | const_s = s; | |
483 | /* Instead of passing in an 'e' ptr, use the real end, adjusted */ | |
484 | RETVAL = UTF8_SAFE_SKIP(const_s, s + UTF8SKIP(s) + adjustment); | |
485 | OUTPUT: | |
486 | RETVAL | |
487 | ||
488 | #endif | |
489 | ||
490 | #ifdef isUTF8_CHAR | |
491 | ||
492 | STRLEN | |
493 | isUTF8_CHAR(s, adjustment) | |
494 | unsigned char * s | |
495 | int adjustment | |
496 | PREINIT: | |
497 | const unsigned char *const_s; | |
498 | const unsigned char *const_e; | |
499 | CODE: | |
500 | const_s = s; | |
501 | /* Instead of passing in an 'e' ptr, use the real end, adjusted */ | |
502 | const_e = const_s + UTF8SKIP(const_s) + adjustment; | |
503 | RETVAL = isUTF8_CHAR(const_s, const_e); | |
504 | OUTPUT: | |
505 | RETVAL | |
506 | ||
507 | #endif | |
508 | ||
2ff9e5e8 KW |
509 | |
510 | #ifdef foldEQ_utf8 | |
511 | ||
512 | STRLEN | |
513 | foldEQ_utf8(s1, l1, u1, s2, l2, u2) | |
514 | char *s1 | |
515 | UV l1 | |
516 | bool u1 | |
517 | char *s2 | |
518 | UV l2 | |
519 | bool u2 | |
520 | PREINIT: | |
521 | const char *const_s1; | |
522 | const char *const_s2; | |
523 | CODE: | |
524 | const_s1 = s1; | |
525 | const_s2 = s2; | |
526 | RETVAL = foldEQ_utf8(const_s1, NULL, l1, u1, const_s2, NULL, l2, u2); | |
527 | OUTPUT: | |
528 | RETVAL | |
529 | ||
530 | #endif | |
531 | ||
7899b636 KW |
532 | #ifdef utf8_to_uvchr_buf |
533 | ||
534 | AV * | |
535 | utf8_to_uvchr_buf(s, adjustment) | |
536 | unsigned char *s | |
537 | int adjustment | |
538 | PREINIT: | |
539 | AV *av; | |
540 | STRLEN len; | |
541 | const unsigned char *const_s; | |
542 | CODE: | |
543 | av = newAV(); | |
544 | const_s = s; | |
545 | av_push(av, newSVuv(utf8_to_uvchr_buf(const_s, | |
546 | s + UTF8SKIP(s) + adjustment, | |
547 | &len))); | |
548 | if (len == (STRLEN) -1) { | |
549 | av_push(av, newSViv(-1)); | |
550 | } | |
551 | else { | |
552 | av_push(av, newSVuv(len)); | |
553 | } | |
554 | RETVAL = av; | |
555 | OUTPUT: | |
556 | RETVAL | |
557 | ||
558 | #endif | |
559 | ||
560 | #ifdef utf8_to_uvchr | |
561 | ||
562 | AV * | |
563 | utf8_to_uvchr(s) | |
564 | unsigned char *s | |
565 | PREINIT: | |
566 | AV *av; | |
567 | STRLEN len; | |
568 | const unsigned char *const_s; | |
569 | CODE: | |
570 | av = newAV(); | |
571 | const_s = s; | |
572 | av_push(av, newSVuv(utf8_to_uvchr(const_s, &len))); | |
573 | if (len == (STRLEN) -1) { | |
574 | av_push(av, newSViv(-1)); | |
575 | } | |
576 | else { | |
577 | av_push(av, newSVuv(len)); | |
578 | } | |
579 | RETVAL = av; | |
580 | OUTPUT: | |
581 | RETVAL | |
582 | ||
583 | #endif | |
584 | ||
0570adb7 P |
585 | #ifdef SV_NOSTEAL |
586 | ||
587 | STRLEN | |
588 | sv_len_utf8(sv) | |
589 | SV *sv | |
590 | CODE: | |
591 | RETVAL = sv_len_utf8(sv); | |
592 | OUTPUT: | |
593 | RETVAL | |
594 | ||
595 | STRLEN | |
596 | sv_len_utf8_nomg(sv) | |
597 | SV *sv | |
598 | CODE: | |
599 | RETVAL = sv_len_utf8_nomg(sv); | |
600 | OUTPUT: | |
601 | RETVAL | |
602 | ||
603 | #endif | |
604 | ||
e2da5d9e KW |
605 | #ifdef UVCHR_IS_INVARIANT |
606 | ||
607 | bool | |
608 | UVCHR_IS_INVARIANT(c) | |
609 | unsigned c | |
610 | PREINIT: | |
611 | CODE: | |
612 | RETVAL = UVCHR_IS_INVARIANT(c); | |
613 | OUTPUT: | |
614 | RETVAL | |
615 | ||
616 | #endif | |
617 | ||
08d12710 KW |
618 | #ifdef UVCHR_SKIP |
619 | ||
620 | STRLEN | |
621 | UVCHR_SKIP(c) | |
622 | UV c | |
623 | PREINIT: | |
624 | CODE: | |
625 | RETVAL = UVCHR_SKIP(c); | |
626 | OUTPUT: | |
627 | RETVAL | |
628 | ||
629 | #endif | |
630 | ||
631 | =tests plan => 93 | |
7899b636 | 632 | |
c45043a0 | 633 | BEGIN { require warnings if "$]" > '5.006' } |
7899b636 | 634 | |
08811920 KW |
635 | # skip tests on 5.6.0 and earlier, plus 7.0 |
636 | if ("$]" <= '5.006' || "$]" == '5.007' ) { | |
08d12710 | 637 | for (1..93) { |
08811920 KW |
638 | skip 'skip: broken utf8 support', 0; |
639 | } | |
7899b636 KW |
640 | exit; |
641 | } | |
642 | ||
643 | ok(&Devel::PPPort::UTF8_SAFE_SKIP("A", 0), 1); | |
644 | ok(&Devel::PPPort::UTF8_SAFE_SKIP("A", -1), 0); | |
645 | ||
646 | ok(&Devel::PPPort::isUTF8_CHAR("A", -1), 0); | |
647 | ok(&Devel::PPPort::isUTF8_CHAR("A", 0), 1); | |
648 | ok(&Devel::PPPort::isUTF8_CHAR("\x{100}", -1), 0); | |
649 | ok(&Devel::PPPort::isUTF8_CHAR("\x{100}", 0), 2); | |
650 | ||
e2da5d9e KW |
651 | ok(&Devel::PPPort::UVCHR_IS_INVARIANT(ord("A")), 1); |
652 | ok(! &Devel::PPPort::UVCHR_IS_INVARIANT(0xb6)); | |
653 | ok(! &Devel::PPPort::UVCHR_IS_INVARIANT(0x100)); | |
654 | ||
08d12710 KW |
655 | if ("$]" < '5.006') { |
656 | for (1 ..9) { | |
657 | ok(1, 1) | |
658 | } | |
659 | } | |
660 | else { | |
661 | ok(&Devel::PPPort::UVCHR_SKIP(ord("A")), 1); | |
662 | ok(&Devel::PPPort::UVCHR_SKIP(0xb6), 2, "This is a test"); | |
663 | ok(&Devel::PPPort::UVCHR_SKIP(0x3FF), 2); | |
664 | ok(&Devel::PPPort::UVCHR_SKIP(0x3FFF), 3); | |
665 | ok(&Devel::PPPort::UVCHR_SKIP(0x3FFFF), 4); | |
666 | ok(&Devel::PPPort::UVCHR_SKIP(0x3FFFFF), 5); | |
667 | ok(&Devel::PPPort::UVCHR_SKIP(0x3FFFFFF), ord("A") == 65 ? 5 : 6); | |
668 | ok(&Devel::PPPort::UVCHR_SKIP(0x4000000), ord("A") == 65 ? 6 : 7); | |
669 | if (ord("A") != 65) { | |
670 | ok(1, 1) | |
671 | } | |
672 | else { | |
673 | ok(&Devel::PPPort::UVCHR_SKIP(0xFFFFFFFF), 7); | |
674 | } | |
675 | } | |
676 | ||
c45043a0 | 677 | if ("$]" < '5.008') { |
66da169a KW |
678 | for (1 ..3) { |
679 | ok(1, 1) | |
680 | } | |
2ff9e5e8 KW |
681 | } |
682 | else { | |
683 | ok(&Devel::PPPort::foldEQ_utf8("A\x{100}", 3, 1, "a\x{101}", 3, 1), 1); | |
684 | ok(&Devel::PPPort::foldEQ_utf8("A\x{100}", 3, 1, "a\x{102}", 3, 1), 0); | |
685 | ok(&Devel::PPPort::foldEQ_utf8("A\x{100}", 3, 1, "b\x{101}", 3, 1), 0); | |
686 | } | |
687 | ||
7899b636 KW |
688 | my $ret = &Devel::PPPort::utf8_to_uvchr("A"); |
689 | ok($ret->[0], ord("A")); | |
690 | ok($ret->[1], 1); | |
691 | ||
692 | $ret = &Devel::PPPort::utf8_to_uvchr("\0"); | |
693 | ok($ret->[0], 0); | |
694 | ok($ret->[1], 1); | |
695 | ||
696 | $ret = &Devel::PPPort::utf8_to_uvchr_buf("A", 0); | |
697 | ok($ret->[0], ord("A")); | |
698 | ok($ret->[1], 1); | |
699 | ||
700 | $ret = &Devel::PPPort::utf8_to_uvchr_buf("\0", 0); | |
701 | ok($ret->[0], 0); | |
702 | ok($ret->[1], 1); | |
703 | ||
704 | if (ord("A") != 65) { # tests not valid for EBCDIC | |
66da169a KW |
705 | for (1 .. (2 + 4 + (7 * 5))) { |
706 | ok(1, 1); | |
707 | } | |
7899b636 KW |
708 | } |
709 | else { | |
710 | $ret = &Devel::PPPort::utf8_to_uvchr_buf("\xc4\x80", 0); | |
711 | ok($ret->[0], 0x100); | |
712 | ok($ret->[1], 2); | |
713 | ||
714 | my @warnings; | |
715 | local $SIG{__WARN__} = sub { push @warnings, @_; }; | |
716 | ||
717 | { | |
c45043a0 | 718 | BEGIN { 'warnings'->import('utf8') if "$]" > '5.006' } |
7899b636 KW |
719 | $ret = &Devel::PPPort::utf8_to_uvchr("\xe0\0\x80"); |
720 | ok($ret->[0], 0); | |
721 | ok($ret->[1], -1); | |
722 | ||
c45043a0 | 723 | BEGIN { 'warnings'->unimport() if "$]" > '5.006' } |
7899b636 KW |
724 | $ret = &Devel::PPPort::utf8_to_uvchr("\xe0\0\x80"); |
725 | ok($ret->[0], 0xFFFD); | |
726 | ok($ret->[1], 1); | |
727 | } | |
728 | ||
729 | my @buf_tests = ( | |
730 | { | |
731 | input => "A", | |
732 | adjustment => -1, | |
21e22f08 | 733 | warning => eval "qr/empty/", |
7899b636 KW |
734 | no_warnings_returned_length => 0, |
735 | }, | |
736 | { | |
737 | input => "\xc4\xc5", | |
738 | adjustment => 0, | |
21e22f08 | 739 | warning => eval "qr/non-continuation/", |
7899b636 KW |
740 | no_warnings_returned_length => 1, |
741 | }, | |
742 | { | |
743 | input => "\xc4\x80", | |
744 | adjustment => -1, | |
21e22f08 | 745 | warning => eval "qr/short|1 byte, need 2/", |
7899b636 KW |
746 | no_warnings_returned_length => 1, |
747 | }, | |
748 | { | |
749 | input => "\xc0\x81", | |
750 | adjustment => 0, | |
21e22f08 | 751 | warning => eval "qr/overlong|2 bytes, need 1/", |
7899b636 KW |
752 | no_warnings_returned_length => 2, |
753 | }, | |
754 | { | |
755 | input => "\xe0\x80\x81", | |
756 | adjustment => 0, | |
21e22f08 | 757 | warning => eval "qr/overlong|3 bytes, need 1/", |
7899b636 KW |
758 | no_warnings_returned_length => 3, |
759 | }, | |
760 | { | |
761 | input => "\xf0\x80\x80\x81", | |
762 | adjustment => 0, | |
21e22f08 | 763 | warning => eval "qr/overlong|4 bytes, need 1/", |
7899b636 KW |
764 | no_warnings_returned_length => 4, |
765 | }, | |
766 | { # Old algorithm failed to detect this | |
767 | input => "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf", | |
768 | adjustment => 0, | |
21e22f08 | 769 | warning => eval "qr/overflow/", |
7899b636 KW |
770 | no_warnings_returned_length => 13, |
771 | }, | |
772 | ); | |
773 | ||
774 | # An empty input is an assertion failure on debugging builds. It is | |
775 | # deliberately the first test. | |
776 | require Config; import Config; | |
777 | use vars '%Config'; | |
778 | if ($Config{ccflags} =~ /-DDEBUGGING/) { | |
779 | shift @buf_tests; | |
66da169a KW |
780 | for (1..5) { |
781 | ok(1, 1); | |
782 | } | |
7899b636 KW |
783 | } |
784 | ||
66da169a KW |
785 | my $test; |
786 | for $test (@buf_tests) { | |
7899b636 KW |
787 | my $input = $test->{'input'}; |
788 | my $adjustment = $test->{'adjustment'}; | |
789 | my $display = 'utf8_to_uvchr_buf("'; | |
66da169a KW |
790 | my $i; |
791 | for ($i = 0; $i < length($input) + $adjustment; $i++) { | |
7899b636 KW |
792 | $display .= sprintf "\\x%02x", ord substr($input, $i, 1); |
793 | } | |
794 | ||
795 | $display .= '")'; | |
796 | my $warning = $test->{'warning'}; | |
797 | ||
798 | undef @warnings; | |
c45043a0 | 799 | BEGIN { 'warnings'->import('utf8') if "$]" > '5.006' } |
7899b636 KW |
800 | $ret = &Devel::PPPort::utf8_to_uvchr_buf($input, $adjustment); |
801 | ok($ret->[0], 0, "returned value $display; warnings enabled"); | |
802 | ok($ret->[1], -1, "returned length $display; warnings enabled"); | |
803 | my $all_warnings = join "; ", @warnings; | |
804 | my $contains = grep { $_ =~ $warning } $all_warnings; | |
805 | ok($contains, 1, $display | |
806 | . "; Got: '$all_warnings', which should contain '$warning'"); | |
807 | ||
808 | undef @warnings; | |
c45043a0 | 809 | BEGIN { 'warnings'->unimport('utf8') if "$]" > '5.006' } |
7899b636 KW |
810 | $ret = &Devel::PPPort::utf8_to_uvchr_buf($input, $adjustment); |
811 | ok($ret->[0], 0xFFFD, "returned value $display; warnings disabled"); | |
812 | ok($ret->[1], $test->{'no_warnings_returned_length'}, | |
813 | "returned length $display; warnings disabled"); | |
814 | } | |
815 | } | |
0570adb7 P |
816 | |
817 | if ("$]" ge '5.008') { | |
818 | BEGIN { if ("$]" ge '5.008') { require utf8; "utf8"->import() } } | |
819 | ||
820 | ok(Devel::PPPort::sv_len_utf8("aščť"), 4); | |
821 | ok(Devel::PPPort::sv_len_utf8_nomg("aščť"), 4); | |
822 | ||
823 | my $str = "áíé"; | |
824 | utf8::downgrade($str); | |
825 | ok(Devel::PPPort::sv_len_utf8($str), 3); | |
826 | utf8::downgrade($str); | |
827 | ok(Devel::PPPort::sv_len_utf8_nomg($str), 3); | |
828 | utf8::upgrade($str); | |
829 | ok(Devel::PPPort::sv_len_utf8($str), 3); | |
830 | utf8::upgrade($str); | |
831 | ok(Devel::PPPort::sv_len_utf8_nomg($str), 3); | |
832 | ||
833 | tie my $scalar, 'TieScalarCounter', "é"; | |
834 | ||
835 | ok(tied($scalar)->{fetch}, 0); | |
836 | ok(tied($scalar)->{store}, 0); | |
837 | ok(Devel::PPPort::sv_len_utf8($scalar), 2); | |
838 | ok(tied($scalar)->{fetch}, 1); | |
839 | ok(tied($scalar)->{store}, 0); | |
840 | ok(Devel::PPPort::sv_len_utf8($scalar), 3); | |
841 | ok(tied($scalar)->{fetch}, 2); | |
842 | ok(tied($scalar)->{store}, 0); | |
843 | ok(Devel::PPPort::sv_len_utf8($scalar), 4); | |
844 | ok(tied($scalar)->{fetch}, 3); | |
845 | ok(tied($scalar)->{store}, 0); | |
846 | ok(Devel::PPPort::sv_len_utf8_nomg($scalar), 4); | |
847 | ok(tied($scalar)->{fetch}, 3); | |
848 | ok(tied($scalar)->{store}, 0); | |
849 | ok(Devel::PPPort::sv_len_utf8_nomg($scalar), 4); | |
850 | ok(tied($scalar)->{fetch}, 3); | |
851 | ok(tied($scalar)->{store}, 0); | |
852 | } else { | |
8583f9b2 KW |
853 | for (1..23) { |
854 | skip 'skip: no SV_NOSTEAL support', 0; | |
855 | } | |
0570adb7 P |
856 | } |
857 | ||
858 | package TieScalarCounter; | |
859 | ||
860 | sub TIESCALAR { | |
861 | my ($class, $value) = @_; | |
862 | return bless { fetch => 0, store => 0, value => $value }, $class; | |
863 | } | |
864 | ||
865 | sub FETCH { | |
866 | BEGIN { if ("$]" ge '5.008') { require utf8; "utf8"->import() } } | |
867 | my ($self) = @_; | |
868 | $self->{fetch}++; | |
869 | return $self->{value} .= "é"; | |
870 | } | |
871 | ||
872 | sub STORE { | |
873 | my ($self, $value) = @_; | |
874 | $self->{store}++; | |
875 | $self->{value} = $value; | |
876 | } |