This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
embed.fnc: Mark several do_dump fcns as Core only
[perl5.git] / pp_pack.c
... / ...
CommitLineData
1/* pp_pack.c
2 *
3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
4 * 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 by Larry Wall and others
5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11/*
12 * He still hopefully carried some of his gear in his pack: a small tinder-box,
13 * two small shallow pans, the smaller fitting into the larger; inside them a
14 * wooden spoon, a short two-pronged fork and some skewers were stowed; and
15 * hidden at the bottom of the pack in a flat wooden box a dwindling treasure,
16 * some salt.
17 *
18 * [p.653 of _The Lord of the Rings_, IV/iv: "Of Herbs and Stewed Rabbit"]
19 */
20
21/* This file contains pp ("push/pop") functions that
22 * execute the opcodes that make up a perl program. A typical pp function
23 * expects to find its arguments on the stack, and usually pushes its
24 * results onto the stack, hence the 'pp' terminology. Each OP structure
25 * contains a pointer to the relevant pp_foo() function.
26 *
27 * This particular file just contains pp_pack() and pp_unpack(). See the
28 * other pp*.c files for the rest of the pp_ functions.
29 */
30
31#include "EXTERN.h"
32#define PERL_IN_PP_PACK_C
33#include "perl.h"
34
35/* Types used by pack/unpack */
36typedef enum {
37 e_no_len, /* no length */
38 e_number, /* number, [] */
39 e_star /* asterisk */
40} howlen_t;
41
42typedef struct tempsym {
43 const char* patptr; /* current template char */
44 const char* patend; /* one after last char */
45 const char* grpbeg; /* 1st char of ()-group */
46 const char* grpend; /* end of ()-group */
47 I32 code; /* template code (!<>) */
48 U32 flags; /* /=4, comma=2, pack=1 */
49 /* and group modifiers */
50 SSize_t length; /* length/repeat count */
51 howlen_t howlen; /* how length is given */
52 int level; /* () nesting level */
53 STRLEN strbeg; /* offset of group start */
54 struct tempsym *previous; /* previous group */
55} tempsym_t;
56
57#define TEMPSYM_INIT(symptr, p, e, f) \
58 STMT_START { \
59 (symptr)->patptr = (p); \
60 (symptr)->patend = (e); \
61 (symptr)->grpbeg = NULL; \
62 (symptr)->grpend = NULL; \
63 (symptr)->grpend = NULL; \
64 (symptr)->code = 0; \
65 (symptr)->length = 0; \
66 (symptr)->howlen = e_no_len; \
67 (symptr)->level = 0; \
68 (symptr)->flags = (f); \
69 (symptr)->strbeg = 0; \
70 (symptr)->previous = NULL; \
71 } STMT_END
72
73typedef union {
74 NV nv;
75 U8 bytes[sizeof(NV)];
76} NV_bytes;
77
78#if defined(HAS_LONG_DOUBLE) && defined(USE_LONG_DOUBLE)
79typedef union {
80 long double ld;
81 U8 bytes[sizeof(long double)];
82} ld_bytes;
83#endif
84
85#ifndef CHAR_BIT
86# define CHAR_BIT 8
87#endif
88/* Maximum number of bytes to which a byte can grow due to upgrade */
89#define UTF8_EXPAND 2
90
91/*
92 * Offset for integer pack/unpack.
93 *
94 * On architectures where I16 and I32 aren't really 16 and 32 bits,
95 * which for now are all Crays, pack and unpack have to play games.
96 */
97
98/*
99 * These values are required for portability of pack() output.
100 * If they're not right on your machine, then pack() and unpack()
101 * wouldn't work right anyway; you'll need to apply the Cray hack.
102 * (I'd like to check them with #if, but you can't use sizeof() in
103 * the preprocessor.) --???
104 */
105/*
106 The appropriate SHORTSIZE, INTSIZE, LONGSIZE, and LONGLONGSIZE
107 defines are now in config.h. --Andy Dougherty April 1998
108 */
109#define SIZE16 2
110#define SIZE32 4
111
112/* CROSSCOMPILE and MULTIARCH are going to affect pp_pack() and pp_unpack().
113 --jhi Feb 1999 */
114
115#if U16SIZE <= SIZE16 && U32SIZE <= SIZE32
116# define OFF16(p) ((char *) (p))
117# define OFF32(p) ((char *) (p))
118#elif BYTEORDER == 0x1234 || BYTEORDER == 0x12345678 /* little-endian */
119# define OFF16(p) ((char*)(p))
120# define OFF32(p) ((char*)(p))
121#elif BYTEORDER == 0x4321 || BYTEORDER == 0x87654321 /* big-endian */
122# define OFF16(p) ((char*)(p) + (sizeof(U16) - SIZE16))
123# define OFF32(p) ((char*)(p) + (sizeof(U32) - SIZE32))
124#else
125# error "bad cray byte order"
126#endif
127
128#define PUSH16(utf8, cur, p, needs_swap) \
129 PUSH_BYTES(utf8, cur, OFF16(p), SIZE16, needs_swap)
130#define PUSH32(utf8, cur, p, needs_swap) \
131 PUSH_BYTES(utf8, cur, OFF32(p), SIZE32, needs_swap)
132
133#if BYTEORDER == 0x4321 || BYTEORDER == 0x87654321 /* big-endian */
134# define NEEDS_SWAP(d) (TYPE_ENDIANNESS(d) == TYPE_IS_LITTLE_ENDIAN)
135#elif BYTEORDER == 0x1234 || BYTEORDER == 0x12345678 /* little-endian */
136# define NEEDS_SWAP(d) (TYPE_ENDIANNESS(d) == TYPE_IS_BIG_ENDIAN)
137#else
138# error "Unsupported byteorder"
139 /* Need to add code here to re-instate mixed endian support.
140 NEEDS_SWAP would need to hold a flag indicating which action to
141 take, and S_reverse_copy and the code in S_utf8_to_bytes would need
142 logic adding to deal with any mixed-endian transformations needed.
143 */
144#endif
145
146/* Only to be used inside a loop (see the break) */
147#define SHIFT_BYTES(utf8, s, strend, buf, len, datumtype, needs_swap) \
148STMT_START { \
149 if (UNLIKELY(utf8)) { \
150 if (!S_utf8_to_bytes(aTHX_ &s, strend, \
151 (char *) (buf), len, datumtype)) break; \
152 } else { \
153 if (UNLIKELY(needs_swap)) \
154 S_reverse_copy(s, (char *) (buf), len); \
155 else \
156 Copy(s, (char *) (buf), len, char); \
157 s += len; \
158 } \
159} STMT_END
160
161#define SHIFT16(utf8, s, strend, p, datumtype, needs_swap) \
162 SHIFT_BYTES(utf8, s, strend, OFF16(p), SIZE16, datumtype, needs_swap)
163
164#define SHIFT32(utf8, s, strend, p, datumtype, needs_swap) \
165 SHIFT_BYTES(utf8, s, strend, OFF32(p), SIZE32, datumtype, needs_swap)
166
167#define SHIFT_VAR(utf8, s, strend, var, datumtype, needs_swap) \
168 SHIFT_BYTES(utf8, s, strend, &(var), sizeof(var), datumtype, needs_swap)
169
170#define PUSH_VAR(utf8, aptr, var, needs_swap) \
171 PUSH_BYTES(utf8, aptr, &(var), sizeof(var), needs_swap)
172
173/* Avoid stack overflow due to pathological templates. 100 should be plenty. */
174#define MAX_SUB_TEMPLATE_LEVEL 100
175
176/* flags (note that type modifiers can also be used as flags!) */
177#define FLAG_WAS_UTF8 0x40
178#define FLAG_PARSE_UTF8 0x20 /* Parse as utf8 */
179#define FLAG_UNPACK_ONLY_ONE 0x10
180#define FLAG_DO_UTF8 0x08 /* The underlying string is utf8 */
181#define FLAG_SLASH 0x04
182#define FLAG_COMMA 0x02
183#define FLAG_PACK 0x01
184
185STATIC SV *
186S_mul128(pTHX_ SV *sv, U8 m)
187{
188 STRLEN len;
189 char *s = SvPV(sv, len);
190 char *t;
191
192 PERL_ARGS_ASSERT_MUL128;
193
194 if (! memBEGINs(s, len, "0000")) { /* need to grow sv */
195 SV * const tmpNew = newSVpvs("0000000000");
196
197 sv_catsv(tmpNew, sv);
198 SvREFCNT_dec(sv); /* free old sv */
199 sv = tmpNew;
200 s = SvPV(sv, len);
201 }
202 t = s + len - 1;
203 while (!*t) /* trailing '\0'? */
204 t--;
205 while (t > s) {
206 const U32 i = ((*t - '0') << 7) + m;
207 *(t--) = '0' + (char)(i % 10);
208 m = (char)(i / 10);
209 }
210 return (sv);
211}
212
213/* Explosives and implosives. */
214
215#define ISUUCHAR(ch) inRANGE(NATIVE_TO_LATIN1(ch), \
216 NATIVE_TO_LATIN1(' '), \
217 NATIVE_TO_LATIN1('a') - 1)
218
219/* type modifiers */
220#define TYPE_IS_SHRIEKING 0x100
221#define TYPE_IS_BIG_ENDIAN 0x200
222#define TYPE_IS_LITTLE_ENDIAN 0x400
223#define TYPE_IS_PACK 0x800
224#define TYPE_ENDIANNESS_MASK (TYPE_IS_BIG_ENDIAN|TYPE_IS_LITTLE_ENDIAN)
225#define TYPE_MODIFIERS(t) ((t) & ~0xFF)
226#define TYPE_NO_MODIFIERS(t) ((t) & 0xFF)
227
228# define TYPE_ENDIANNESS(t) ((t) & TYPE_ENDIANNESS_MASK)
229# define TYPE_NO_ENDIANNESS(t) ((t) & ~TYPE_ENDIANNESS_MASK)
230
231# define ENDIANNESS_ALLOWED_TYPES "sSiIlLqQjJfFdDpP("
232
233#define PACK_SIZE_CANNOT_CSUM 0x80
234#define PACK_SIZE_UNPREDICTABLE 0x40 /* Not a fixed size element */
235#define PACK_SIZE_MASK 0x3F
236
237#include "packsizetables.inc"
238
239static void
240S_reverse_copy(const char *src, char *dest, STRLEN len)
241{
242 dest += len;
243 while (len--)
244 *--dest = *src++;
245}
246
247STATIC U8
248utf8_to_byte(pTHX_ const char **s, const char *end, I32 datumtype)
249{
250 STRLEN retlen;
251 UV val;
252
253 if (*s >= end) {
254 goto croak;
255 }
256 val = utf8n_to_uvchr((U8 *) *s, end-*s, &retlen,
257 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
258 if (retlen == (STRLEN) -1)
259 croak:
260 Perl_croak(aTHX_ "Malformed UTF-8 string in '%c' format in unpack",
261 (int) TYPE_NO_MODIFIERS(datumtype));
262 if (val >= 0x100) {
263 Perl_ck_warner(aTHX_ packWARN(WARN_UNPACK),
264 "Character in '%c' format wrapped in unpack",
265 (int) TYPE_NO_MODIFIERS(datumtype));
266 val &= 0xff;
267 }
268 *s += retlen;
269 return (U8)val;
270}
271
272#define SHIFT_BYTE(utf8, s, strend, datumtype) ((utf8) ? \
273 utf8_to_byte(aTHX_ &(s), (strend), (datumtype)) : \
274 *(U8 *)(s)++)
275
276STATIC bool
277S_utf8_to_bytes(pTHX_ const char **s, const char *end, const char *buf, SSize_t buf_len, I32 datumtype)
278{
279 UV val;
280 STRLEN retlen;
281 const char *from = *s;
282 int bad = 0;
283 const U32 flags = ckWARN(WARN_UTF8) ?
284 UTF8_CHECK_ONLY : (UTF8_CHECK_ONLY | UTF8_ALLOW_ANY);
285 const bool needs_swap = NEEDS_SWAP(datumtype);
286
287 if (UNLIKELY(needs_swap))
288 buf += buf_len;
289
290 for (;buf_len > 0; buf_len--) {
291 if (from >= end) return FALSE;
292 val = utf8n_to_uvchr((U8 *) from, end-from, &retlen, flags);
293 if (retlen == (STRLEN) -1) {
294 from += UTF8_SAFE_SKIP(from, end);
295 bad |= 1;
296 } else from += retlen;
297 if (val >= 0x100) {
298 bad |= 2;
299 val &= 0xff;
300 }
301 if (UNLIKELY(needs_swap))
302 *(U8 *)--buf = (U8)val;
303 else
304 *(U8 *)buf++ = (U8)val;
305 }
306 /* We have enough characters for the buffer. Did we have problems ? */
307 if (bad) {
308 if (bad & 1) {
309 /* Rewalk the string fragment while warning */
310 const char *ptr;
311 const U32 flags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
312 for (ptr = *s; ptr < from; ptr += UTF8SKIP(ptr)) {
313 if (ptr >= end) break;
314 utf8n_to_uvchr((U8 *) ptr, end-ptr, &retlen, flags);
315 }
316 if (from > end) from = end;
317 }
318 if ((bad & 2))
319 Perl_ck_warner(aTHX_ packWARN(datumtype & TYPE_IS_PACK ?
320 WARN_PACK : WARN_UNPACK),
321 "Character(s) in '%c' format wrapped in %s",
322 (int) TYPE_NO_MODIFIERS(datumtype),
323 datumtype & TYPE_IS_PACK ? "pack" : "unpack");
324 }
325 *s = from;
326 return TRUE;
327}
328
329STATIC char *
330S_my_bytes_to_utf8(const U8 *start, STRLEN len, char *dest, const bool needs_swap) {
331 PERL_ARGS_ASSERT_MY_BYTES_TO_UTF8;
332
333 if (UNLIKELY(needs_swap)) {
334 const U8 *p = start + len;
335 while (p-- > start) {
336 append_utf8_from_native_byte(*p, (U8 **) & dest);
337 }
338 } else {
339 const U8 * const end = start + len;
340 while (start < end) {
341 append_utf8_from_native_byte(*start, (U8 **) & dest);
342 start++;
343 }
344 }
345 return dest;
346}
347
348#define PUSH_BYTES(utf8, cur, buf, len, needs_swap) \
349STMT_START { \
350 if (UNLIKELY(utf8)) \
351 (cur) = my_bytes_to_utf8((U8 *) buf, len, (cur), needs_swap); \
352 else { \
353 if (UNLIKELY(needs_swap)) \
354 S_reverse_copy((char *)(buf), cur, len); \
355 else \
356 Copy(buf, cur, len, char); \
357 (cur) += (len); \
358 } \
359} STMT_END
360
361#define SAFE_UTF8_EXPAND(var) \
362STMT_START { \
363 if ((var) > SSize_t_MAX / UTF8_EXPAND) \
364 Perl_croak(aTHX_ "%s", "Out of memory during pack()"); \
365 (var) = (var) * UTF8_EXPAND; \
366} STMT_END
367
368#define GROWING2(utf8, cat, start, cur, item_size, item_count) \
369STMT_START { \
370 if (SSize_t_MAX / (item_size) < (item_count)) \
371 Perl_croak(aTHX_ "%s", "Out of memory during pack()"); \
372 GROWING((utf8), (cat), (start), (cur), (item_size) * (item_count)); \
373} STMT_END
374
375#define GROWING(utf8, cat, start, cur, in_len) \
376STMT_START { \
377 STRLEN glen = (in_len); \
378 STRLEN catcur = (STRLEN)((cur) - (start)); \
379 if (utf8) SAFE_UTF8_EXPAND(glen); \
380 if (SSize_t_MAX - glen < catcur) \
381 Perl_croak(aTHX_ "%s", "Out of memory during pack()"); \
382 if (catcur + glen >= SvLEN(cat)) { \
383 (start) = sv_exp_grow(cat, glen); \
384 (cur) = (start) + SvCUR(cat); \
385 } \
386} STMT_END
387
388#define PUSH_GROWING_BYTES(utf8, cat, start, cur, buf, in_len) \
389STMT_START { \
390 const STRLEN glen = (in_len); \
391 STRLEN gl = glen; \
392 if (utf8) SAFE_UTF8_EXPAND(gl); \
393 if ((cur) + gl >= (start) + SvLEN(cat)) { \
394 *cur = '\0'; \
395 SvCUR_set((cat), (cur) - (start)); \
396 (start) = sv_exp_grow(cat, gl); \
397 (cur) = (start) + SvCUR(cat); \
398 } \
399 PUSH_BYTES(utf8, cur, buf, glen, 0); \
400} STMT_END
401
402#define PUSH_BYTE(utf8, s, byte) \
403STMT_START { \
404 if (utf8) { \
405 const U8 au8 = (byte); \
406 (s) = my_bytes_to_utf8(&au8, 1, (s), 0);\
407 } else *(U8 *)(s)++ = (byte); \
408} STMT_END
409
410/* Only to be used inside a loop (see the break) */
411#define NEXT_UNI_VAL(val, cur, str, end, utf8_flags) \
412STMT_START { \
413 STRLEN retlen; \
414 if (str >= end) break; \
415 val = utf8n_to_uvchr((U8 *) str, end-str, &retlen, utf8_flags); \
416 if (retlen == (STRLEN) -1) { \
417 *cur = '\0'; \
418 Perl_croak(aTHX_ "Malformed UTF-8 string in pack"); \
419 } \
420 str += retlen; \
421} STMT_END
422
423static const char *_action( const tempsym_t* symptr )
424{
425 return (const char *)(( symptr->flags & FLAG_PACK ) ? "pack" : "unpack");
426}
427
428/* Returns the sizeof() struct described by pat */
429STATIC SSize_t
430S_measure_struct(pTHX_ tempsym_t* symptr)
431{
432 SSize_t total = 0;
433
434 PERL_ARGS_ASSERT_MEASURE_STRUCT;
435
436 while (next_symbol(symptr)) {
437 SSize_t len, size;
438
439 switch (symptr->howlen) {
440 case e_star:
441 Perl_croak(aTHX_ "Within []-length '*' not allowed in %s",
442 _action( symptr ) );
443
444 default:
445 /* e_no_len and e_number */
446 len = symptr->length;
447 break;
448 }
449
450 size = packprops[TYPE_NO_ENDIANNESS(symptr->code)] & PACK_SIZE_MASK;
451 if (!size) {
452 SSize_t star;
453 /* endianness doesn't influence the size of a type */
454 switch(TYPE_NO_ENDIANNESS(symptr->code)) {
455 default:
456 Perl_croak(aTHX_ "Invalid type '%c' in %s",
457 (int)TYPE_NO_MODIFIERS(symptr->code),
458 _action( symptr ) );
459 case '.' | TYPE_IS_SHRIEKING:
460 case '@' | TYPE_IS_SHRIEKING:
461 case '@':
462 case '.':
463 case '/':
464 case 'U': /* XXXX Is it correct? */
465 case 'w':
466 case 'u':
467 Perl_croak(aTHX_ "Within []-length '%c' not allowed in %s",
468 (int) TYPE_NO_MODIFIERS(symptr->code),
469 _action( symptr ) );
470 case '%':
471 size = 0;
472 break;
473 case '(':
474 {
475 tempsym_t savsym = *symptr;
476 symptr->patptr = savsym.grpbeg;
477 symptr->patend = savsym.grpend;
478 /* XXXX Theoretically, we need to measure many times at
479 different positions, since the subexpression may contain
480 alignment commands, but be not of aligned length.
481 Need to detect this and croak(). */
482 size = measure_struct(symptr);
483 *symptr = savsym;
484 break;
485 }
486 case 'X' | TYPE_IS_SHRIEKING:
487 /* XXXX Is this useful? Then need to treat MEASURE_BACKWARDS.
488 */
489 if (!len) /* Avoid division by 0 */
490 len = 1;
491 len = total % len; /* Assumed: the start is aligned. */
492 /* FALLTHROUGH */
493 case 'X':
494 size = -1;
495 if (total < len)
496 Perl_croak(aTHX_ "'X' outside of string in %s", _action( symptr ) );
497 break;
498 case 'x' | TYPE_IS_SHRIEKING:
499 if (!len) /* Avoid division by 0 */
500 len = 1;
501 star = total % len; /* Assumed: the start is aligned. */
502 if (star) /* Other portable ways? */
503 len = len - star;
504 else
505 len = 0;
506 /* FALLTHROUGH */
507 case 'x':
508 case 'A':
509 case 'Z':
510 case 'a':
511 size = 1;
512 break;
513 case 'B':
514 case 'b':
515 len = (len + 7)/8;
516 size = 1;
517 break;
518 case 'H':
519 case 'h':
520 len = (len + 1)/2;
521 size = 1;
522 break;
523
524 case 'P':
525 len = 1;
526 size = sizeof(char*);
527 break;
528 }
529 }
530 total += len * size;
531 }
532 return total;
533}
534
535
536/* locate matching closing parenthesis or bracket
537 * returns char pointer to char after match, or NULL
538 */
539STATIC const char *
540S_group_end(pTHX_ const char *patptr, const char *patend, char ender)
541{
542 PERL_ARGS_ASSERT_GROUP_END;
543
544 while (patptr < patend) {
545 const char c = *patptr++;
546
547 if (isSPACE(c))
548 continue;
549 else if (c == ender)
550 return patptr-1;
551 else if (c == '#') {
552 while (patptr < patend && *patptr != '\n')
553 patptr++;
554 continue;
555 } else if (c == '(')
556 patptr = group_end(patptr, patend, ')') + 1;
557 else if (c == '[')
558 patptr = group_end(patptr, patend, ']') + 1;
559 }
560 Perl_croak(aTHX_ "No group ending character '%c' found in template",
561 ender);
562 NOT_REACHED; /* NOTREACHED */
563}
564
565
566/* Convert unsigned decimal number to binary.
567 * Expects a pointer to the first digit and address of length variable
568 * Advances char pointer to 1st non-digit char and returns number
569 */
570STATIC const char *
571S_get_num(pTHX_ const char *patptr, SSize_t *lenptr )
572{
573 SSize_t len = *patptr++ - '0';
574
575 PERL_ARGS_ASSERT_GET_NUM;
576
577 while (isDIGIT(*patptr)) {
578 SSize_t nlen = (len * 10) + (*patptr++ - '0');
579 if (nlen < 0 || nlen/10 != len)
580 Perl_croak(aTHX_ "pack/unpack repeat count overflow");
581 len = nlen;
582 }
583 *lenptr = len;
584 return patptr;
585}
586
587/* The marvellous template parsing routine: Using state stored in *symptr,
588 * locates next template code and count
589 */
590STATIC bool
591S_next_symbol(pTHX_ tempsym_t* symptr )
592{
593 const char* patptr = symptr->patptr;
594 const char* const patend = symptr->patend;
595
596 PERL_ARGS_ASSERT_NEXT_SYMBOL;
597
598 symptr->flags &= ~FLAG_SLASH;
599
600 while (patptr < patend) {
601 if (isSPACE(*patptr))
602 patptr++;
603 else if (*patptr == '#') {
604 patptr++;
605 while (patptr < patend && *patptr != '\n')
606 patptr++;
607 if (patptr < patend)
608 patptr++;
609 } else {
610 /* We should have found a template code */
611 I32 code = *patptr++ & 0xFF;
612 U32 inherited_modifiers = 0;
613
614 if (code == ','){ /* grandfather in commas but with a warning */
615 if (((symptr->flags & FLAG_COMMA) == 0) && ckWARN(WARN_UNPACK)){
616 symptr->flags |= FLAG_COMMA;
617 Perl_warner(aTHX_ packWARN(WARN_UNPACK),
618 "Invalid type ',' in %s", _action( symptr ) );
619 }
620 continue;
621 }
622
623 /* for '(', skip to ')' */
624 if (code == '(') {
625 if( isDIGIT(*patptr) || *patptr == '*' || *patptr == '[' )
626 Perl_croak(aTHX_ "()-group starts with a count in %s",
627 _action( symptr ) );
628 symptr->grpbeg = patptr;
629 patptr = 1 + ( symptr->grpend = group_end(patptr, patend, ')') );
630 if( symptr->level >= MAX_SUB_TEMPLATE_LEVEL )
631 Perl_croak(aTHX_ "Too deeply nested ()-groups in %s",
632 _action( symptr ) );
633 }
634
635 /* look for group modifiers to inherit */
636 if (TYPE_ENDIANNESS(symptr->flags)) {
637 if (strchr(ENDIANNESS_ALLOWED_TYPES, TYPE_NO_MODIFIERS(code)))
638 inherited_modifiers |= TYPE_ENDIANNESS(symptr->flags);
639 }
640
641 /* look for modifiers */
642 while (patptr < patend) {
643 const char *allowed;
644 I32 modifier;
645 switch (*patptr) {
646 case '!':
647 modifier = TYPE_IS_SHRIEKING;
648 allowed = "sSiIlLxXnNvV@.";
649 break;
650 case '>':
651 modifier = TYPE_IS_BIG_ENDIAN;
652 allowed = ENDIANNESS_ALLOWED_TYPES;
653 break;
654 case '<':
655 modifier = TYPE_IS_LITTLE_ENDIAN;
656 allowed = ENDIANNESS_ALLOWED_TYPES;
657 break;
658 default:
659 allowed = "";
660 modifier = 0;
661 break;
662 }
663
664 if (modifier == 0)
665 break;
666
667 if (!strchr(allowed, TYPE_NO_MODIFIERS(code)))
668 Perl_croak(aTHX_ "'%c' allowed only after types %s in %s", *patptr,
669 allowed, _action( symptr ) );
670
671 if (TYPE_ENDIANNESS(code | modifier) == TYPE_ENDIANNESS_MASK)
672 Perl_croak(aTHX_ "Can't use both '<' and '>' after type '%c' in %s",
673 (int) TYPE_NO_MODIFIERS(code), _action( symptr ) );
674 else if (TYPE_ENDIANNESS(code | modifier | inherited_modifiers) ==
675 TYPE_ENDIANNESS_MASK)
676 Perl_croak(aTHX_ "Can't use '%c' in a group with different byte-order in %s",
677 *patptr, _action( symptr ) );
678
679 if ((code & modifier)) {
680 Perl_ck_warner(aTHX_ packWARN(WARN_UNPACK),
681 "Duplicate modifier '%c' after '%c' in %s",
682 *patptr, (int) TYPE_NO_MODIFIERS(code),
683 _action( symptr ) );
684 }
685
686 code |= modifier;
687 patptr++;
688 }
689
690 /* inherit modifiers */
691 code |= inherited_modifiers;
692
693 /* look for count and/or / */
694 if (patptr < patend) {
695 if (isDIGIT(*patptr)) {
696 patptr = get_num( patptr, &symptr->length );
697 symptr->howlen = e_number;
698
699 } else if (*patptr == '*') {
700 patptr++;
701 symptr->howlen = e_star;
702
703 } else if (*patptr == '[') {
704 const char* lenptr = ++patptr;
705 symptr->howlen = e_number;
706 patptr = group_end( patptr, patend, ']' ) + 1;
707 /* what kind of [] is it? */
708 if (isDIGIT(*lenptr)) {
709 lenptr = get_num( lenptr, &symptr->length );
710 if( *lenptr != ']' )
711 Perl_croak(aTHX_ "Malformed integer in [] in %s",
712 _action( symptr ) );
713 } else {
714 tempsym_t savsym = *symptr;
715 symptr->patend = patptr-1;
716 symptr->patptr = lenptr;
717 savsym.length = measure_struct(symptr);
718 *symptr = savsym;
719 }
720 } else {
721 symptr->howlen = e_no_len;
722 symptr->length = 1;
723 }
724
725 /* try to find / */
726 while (patptr < patend) {
727 if (isSPACE(*patptr))
728 patptr++;
729 else if (*patptr == '#') {
730 patptr++;
731 while (patptr < patend && *patptr != '\n')
732 patptr++;
733 if (patptr < patend)
734 patptr++;
735 } else {
736 if (*patptr == '/') {
737 symptr->flags |= FLAG_SLASH;
738 patptr++;
739 if (patptr < patend &&
740 (isDIGIT(*patptr) || *patptr == '*' || *patptr == '['))
741 Perl_croak(aTHX_ "'/' does not take a repeat count in %s",
742 _action( symptr ) );
743 }
744 break;
745 }
746 }
747 } else {
748 /* at end - no count, no / */
749 symptr->howlen = e_no_len;
750 symptr->length = 1;
751 }
752
753 symptr->code = code;
754 symptr->patptr = patptr;
755 return TRUE;
756 }
757 }
758 symptr->patptr = patptr;
759 return FALSE;
760}
761
762/*
763 There is no way to cleanly handle the case where we should process the
764 string per byte in its upgraded form while it's really in downgraded form
765 (e.g. estimates like strend-s as an upper bound for the number of
766 characters left wouldn't work). So if we foresee the need of this
767 (pattern starts with U or contains U0), we want to work on the encoded
768 version of the string. Users are advised to upgrade their pack string
769 themselves if they need to do a lot of unpacks like this on it
770*/
771STATIC bool
772need_utf8(const char *pat, const char *patend)
773{
774 bool first = TRUE;
775
776 PERL_ARGS_ASSERT_NEED_UTF8;
777
778 while (pat < patend) {
779 if (pat[0] == '#') {
780 pat++;
781 pat = (const char *) memchr(pat, '\n', patend-pat);
782 if (!pat) return FALSE;
783 } else if (pat[0] == 'U') {
784 if (first || pat[1] == '0') return TRUE;
785 } else first = FALSE;
786 pat++;
787 }
788 return FALSE;
789}
790
791STATIC char
792first_symbol(const char *pat, const char *patend) {
793 PERL_ARGS_ASSERT_FIRST_SYMBOL;
794
795 while (pat < patend) {
796 if (pat[0] != '#') return pat[0];
797 pat++;
798 pat = (const char *) memchr(pat, '\n', patend-pat);
799 if (!pat) return 0;
800 pat++;
801 }
802 return 0;
803}
804
805/*
806
807=for apidoc unpackstring
808
809The engine implementing the C<unpack()> Perl function.
810
811Using the template C<pat..patend>, this function unpacks the string
812C<s..strend> into a number of mortal SVs, which it pushes onto the perl
813argument (C<@_>) stack (so you will need to issue a C<PUTBACK> before and
814C<SPAGAIN> after the call to this function). It returns the number of
815pushed elements.
816
817The C<strend> and C<patend> pointers should point to the byte following the
818last character of each string.
819
820Although this function returns its values on the perl argument stack, it
821doesn't take any parameters from that stack (and thus in particular
822there's no need to do a C<PUSHMARK> before calling it, unlike L</call_pv> for
823example).
824
825=cut */
826
827SSize_t
828Perl_unpackstring(pTHX_ const char *pat, const char *patend, const char *s, const char *strend, U32 flags)
829{
830 tempsym_t sym;
831
832 PERL_ARGS_ASSERT_UNPACKSTRING;
833
834 if (flags & FLAG_DO_UTF8) flags |= FLAG_WAS_UTF8;
835 else if (need_utf8(pat, patend)) {
836 /* We probably should try to avoid this in case a scalar context call
837 wouldn't get to the "U0" */
838 STRLEN len = strend - s;
839 s = (char *) bytes_to_utf8((U8 *) s, &len);
840 SAVEFREEPV(s);
841 strend = s + len;
842 flags |= FLAG_DO_UTF8;
843 }
844
845 if (first_symbol(pat, patend) != 'U' && (flags & FLAG_DO_UTF8))
846 flags |= FLAG_PARSE_UTF8;
847
848 TEMPSYM_INIT(&sym, pat, patend, flags);
849
850 return unpack_rec(&sym, s, s, strend, NULL );
851}
852
853STATIC SSize_t
854S_unpack_rec(pTHX_ tempsym_t* symptr, const char *s, const char *strbeg, const char *strend, const char **new_s )
855{
856 dSP;
857 SV *sv = NULL;
858 const SSize_t start_sp_offset = SP - PL_stack_base;
859 howlen_t howlen;
860 SSize_t checksum = 0;
861 UV cuv = 0;
862 NV cdouble = 0.0;
863 const SSize_t bits_in_uv = CHAR_BIT * sizeof(cuv);
864 bool beyond = FALSE;
865 bool explicit_length;
866 const bool unpack_only_one = (symptr->flags & FLAG_UNPACK_ONLY_ONE) != 0;
867 bool utf8 = (symptr->flags & FLAG_PARSE_UTF8) ? 1 : 0;
868
869 PERL_ARGS_ASSERT_UNPACK_REC;
870
871 symptr->strbeg = s - strbeg;
872
873 while (next_symbol(symptr)) {
874 packprops_t props;
875 SSize_t len;
876 I32 datumtype = symptr->code;
877 bool needs_swap;
878 /* do first one only unless in list context
879 / is implemented by unpacking the count, then popping it from the
880 stack, so must check that we're not in the middle of a / */
881 if ( unpack_only_one
882 && (SP - PL_stack_base == start_sp_offset + 1)
883 && (datumtype != '/') ) /* XXX can this be omitted */
884 break;
885
886 switch (howlen = symptr->howlen) {
887 case e_star:
888 len = strend - strbeg; /* long enough */
889 break;
890 default:
891 /* e_no_len and e_number */
892 len = symptr->length;
893 break;
894 }
895
896 explicit_length = TRUE;
897 redo_switch:
898 beyond = s >= strend;
899
900 props = packprops[TYPE_NO_ENDIANNESS(datumtype)];
901 if (props) {
902 /* props nonzero means we can process this letter. */
903 const SSize_t size = props & PACK_SIZE_MASK;
904 const SSize_t howmany = (strend - s) / size;
905 if (len > howmany)
906 len = howmany;
907
908 if (!checksum || (props & PACK_SIZE_CANNOT_CSUM)) {
909 if (len && unpack_only_one) len = 1;
910 EXTEND(SP, len);
911 EXTEND_MORTAL(len);
912 }
913 }
914
915 needs_swap = NEEDS_SWAP(datumtype);
916
917 switch(TYPE_NO_ENDIANNESS(datumtype)) {
918 default:
919 Perl_croak(aTHX_ "Invalid type '%c' in unpack", (int)TYPE_NO_MODIFIERS(datumtype) );
920
921 case '%':
922 if (howlen == e_no_len)
923 len = 16; /* len is not specified */
924 checksum = len;
925 cuv = 0;
926 cdouble = 0;
927 continue;
928
929 case '(':
930 {
931 tempsym_t savsym = *symptr;
932 const U32 group_modifiers = TYPE_MODIFIERS(datumtype & ~symptr->flags);
933 symptr->flags |= group_modifiers;
934 symptr->patend = savsym.grpend;
935 symptr->previous = &savsym;
936 symptr->level++;
937 PUTBACK;
938 if (len && unpack_only_one) len = 1;
939 while (len--) {
940 symptr->patptr = savsym.grpbeg;
941 if (utf8) symptr->flags |= FLAG_PARSE_UTF8;
942 else symptr->flags &= ~FLAG_PARSE_UTF8;
943 unpack_rec(symptr, s, strbeg, strend, &s);
944 if (s == strend && savsym.howlen == e_star)
945 break; /* No way to continue */
946 }
947 SPAGAIN;
948 savsym.flags = symptr->flags & ~group_modifiers;
949 *symptr = savsym;
950 break;
951 }
952 case '.' | TYPE_IS_SHRIEKING:
953 case '.': {
954 const char *from;
955 SV *sv;
956 const bool u8 = utf8 && !(datumtype & TYPE_IS_SHRIEKING);
957 if (howlen == e_star) from = strbeg;
958 else if (len <= 0) from = s;
959 else {
960 tempsym_t *group = symptr;
961
962 while (--len && group) group = group->previous;
963 from = group ? strbeg + group->strbeg : strbeg;
964 }
965 sv = from <= s ?
966 newSVuv( u8 ? (UV) utf8_length((const U8*)from, (const U8*)s) : (UV) (s-from)) :
967 newSViv(-(u8 ? (IV) utf8_length((const U8*)s, (const U8*)from) : (IV) (from-s)));
968 mXPUSHs(sv);
969 break;
970 }
971 case '@' | TYPE_IS_SHRIEKING:
972 case '@':
973 s = strbeg + symptr->strbeg;
974 if (utf8 && !(datumtype & TYPE_IS_SHRIEKING))
975 {
976 while (len > 0) {
977 if (s >= strend)
978 Perl_croak(aTHX_ "'@' outside of string in unpack");
979 s += UTF8SKIP(s);
980 len--;
981 }
982 if (s > strend)
983 Perl_croak(aTHX_ "'@' outside of string with malformed UTF-8 in unpack");
984 } else {
985 if (strend-s < len)
986 Perl_croak(aTHX_ "'@' outside of string in unpack");
987 s += len;
988 }
989 break;
990 case 'X' | TYPE_IS_SHRIEKING:
991 if (!len) /* Avoid division by 0 */
992 len = 1;
993 if (utf8) {
994 const char *hop, *last;
995 SSize_t l = len;
996 hop = last = strbeg;
997 while (hop < s) {
998 hop += UTF8SKIP(hop);
999 if (--l == 0) {
1000 last = hop;
1001 l = len;
1002 }
1003 }
1004 if (last > s)
1005 Perl_croak(aTHX_ "Malformed UTF-8 string in unpack");
1006 s = last;
1007 break;
1008 }
1009 len = (s - strbeg) % len;
1010 /* FALLTHROUGH */
1011 case 'X':
1012 if (utf8) {
1013 while (len > 0) {
1014 if (s <= strbeg)
1015 Perl_croak(aTHX_ "'X' outside of string in unpack");
1016 while (--s, UTF8_IS_CONTINUATION(*s)) {
1017 if (s <= strbeg)
1018 Perl_croak(aTHX_ "'X' outside of string in unpack");
1019 }
1020 len--;
1021 }
1022 } else {
1023 if (len > s - strbeg)
1024 Perl_croak(aTHX_ "'X' outside of string in unpack" );
1025 s -= len;
1026 }
1027 break;
1028 case 'x' | TYPE_IS_SHRIEKING: {
1029 SSize_t ai32;
1030 if (!len) /* Avoid division by 0 */
1031 len = 1;
1032 if (utf8) ai32 = utf8_length((U8 *) strbeg, (U8 *) s) % len;
1033 else ai32 = (s - strbeg) % len;
1034 if (ai32 == 0) break;
1035 len -= ai32;
1036 }
1037 /* FALLTHROUGH */
1038 case 'x':
1039 if (utf8) {
1040 while (len>0) {
1041 if (s >= strend)
1042 Perl_croak(aTHX_ "'x' outside of string in unpack");
1043 s += UTF8SKIP(s);
1044 len--;
1045 }
1046 } else {
1047 if (len > strend - s)
1048 Perl_croak(aTHX_ "'x' outside of string in unpack");
1049 s += len;
1050 }
1051 break;
1052 case '/':
1053 Perl_croak(aTHX_ "'/' must follow a numeric type in unpack");
1054
1055 case 'A':
1056 case 'Z':
1057 case 'a':
1058 if (checksum) {
1059 /* Preliminary length estimate is assumed done in 'W' */
1060 if (len > strend - s) len = strend - s;
1061 goto W_checksum;
1062 }
1063 if (utf8) {
1064 SSize_t l;
1065 const char *hop;
1066 for (l=len, hop=s; l>0; l--, hop += UTF8SKIP(hop)) {
1067 if (hop >= strend) {
1068 if (hop > strend)
1069 Perl_croak(aTHX_ "Malformed UTF-8 string in unpack");
1070 break;
1071 }
1072 }
1073 if (hop > strend)
1074 Perl_croak(aTHX_ "Malformed UTF-8 string in unpack");
1075 len = hop - s;
1076 } else if (len > strend - s)
1077 len = strend - s;
1078
1079 if (datumtype == 'Z') {
1080 /* 'Z' strips stuff after first null */
1081 const char *ptr, *end;
1082 end = s + len;
1083 for (ptr = s; ptr < end; ptr++) if (*ptr == 0) break;
1084 sv = newSVpvn(s, ptr-s);
1085 if (howlen == e_star) /* exact for 'Z*' */
1086 len = ptr-s + (ptr != strend ? 1 : 0);
1087 } else if (datumtype == 'A') {
1088 /* 'A' strips both nulls and spaces */
1089 const char *ptr;
1090 if (utf8 && (symptr->flags & FLAG_WAS_UTF8)) {
1091 for (ptr = s+len-1; ptr >= s; ptr--) {
1092 if ( *ptr != 0
1093 && !UTF8_IS_CONTINUATION(*ptr)
1094 && !isSPACE_utf8_safe(ptr, strend))
1095 {
1096 break;
1097 }
1098 }
1099 if (ptr >= s) ptr += UTF8SKIP(ptr);
1100 else ptr++;
1101 if (ptr > s+len)
1102 Perl_croak(aTHX_ "Malformed UTF-8 string in unpack");
1103 } else {
1104 for (ptr = s+len-1; ptr >= s; ptr--)
1105 if (*ptr != 0 && !isSPACE(*ptr)) break;
1106 ptr++;
1107 }
1108 sv = newSVpvn(s, ptr-s);
1109 } else sv = newSVpvn(s, len);
1110
1111 if (utf8) {
1112 SvUTF8_on(sv);
1113 /* Undo any upgrade done due to need_utf8() */
1114 if (!(symptr->flags & FLAG_WAS_UTF8))
1115 sv_utf8_downgrade(sv, 0);
1116 }
1117 mXPUSHs(sv);
1118 s += len;
1119 break;
1120 case 'B':
1121 case 'b': {
1122 char *str;
1123 if (howlen == e_star || len > (strend - s) * 8)
1124 len = (strend - s) * 8;
1125 if (checksum) {
1126 if (utf8)
1127 while (len >= 8 && s < strend) {
1128 cuv += PL_bitcount[utf8_to_byte(aTHX_ &s, strend, datumtype)];
1129 len -= 8;
1130 }
1131 else
1132 while (len >= 8) {
1133 cuv += PL_bitcount[*(U8 *)s++];
1134 len -= 8;
1135 }
1136 if (len && s < strend) {
1137 U8 bits;
1138 bits = SHIFT_BYTE(utf8, s, strend, datumtype);
1139 if (datumtype == 'b')
1140 while (len-- > 0) {
1141 if (bits & 1) cuv++;
1142 bits >>= 1;
1143 }
1144 else
1145 while (len-- > 0) {
1146 if (bits & 0x80) cuv++;
1147 bits <<= 1;
1148 }
1149 }
1150 break;
1151 }
1152
1153 sv = sv_2mortal(newSV(len ? len : 1));
1154 SvPOK_on(sv);
1155 str = SvPVX(sv);
1156 if (datumtype == 'b') {
1157 U8 bits = 0;
1158 const SSize_t ai32 = len;
1159 for (len = 0; len < ai32; len++) {
1160 if (len & 7) bits >>= 1;
1161 else if (utf8) {
1162 if (s >= strend) break;
1163 bits = utf8_to_byte(aTHX_ &s, strend, datumtype);
1164 } else bits = *(U8 *) s++;
1165 *str++ = bits & 1 ? '1' : '0';
1166 }
1167 } else {
1168 U8 bits = 0;
1169 const SSize_t ai32 = len;
1170 for (len = 0; len < ai32; len++) {
1171 if (len & 7) bits <<= 1;
1172 else if (utf8) {
1173 if (s >= strend) break;
1174 bits = utf8_to_byte(aTHX_ &s, strend, datumtype);
1175 } else bits = *(U8 *) s++;
1176 *str++ = bits & 0x80 ? '1' : '0';
1177 }
1178 }
1179 *str = '\0';
1180 SvCUR_set(sv, str - SvPVX_const(sv));
1181 XPUSHs(sv);
1182 break;
1183 }
1184 case 'H':
1185 case 'h': {
1186 char *str = NULL;
1187 /* Preliminary length estimate, acceptable for utf8 too */
1188 if (howlen == e_star || len > (strend - s) * 2)
1189 len = (strend - s) * 2;
1190 if (!checksum) {
1191 sv = sv_2mortal(newSV(len ? len : 1));
1192 SvPOK_on(sv);
1193 str = SvPVX(sv);
1194 }
1195 if (datumtype == 'h') {
1196 U8 bits = 0;
1197 SSize_t ai32 = len;
1198 for (len = 0; len < ai32; len++) {
1199 if (len & 1) bits >>= 4;
1200 else if (utf8) {
1201 if (s >= strend) break;
1202 bits = utf8_to_byte(aTHX_ &s, strend, datumtype);
1203 } else bits = * (U8 *) s++;
1204 if (!checksum)
1205 *str++ = PL_hexdigit[bits & 15];
1206 }
1207 } else {
1208 U8 bits = 0;
1209 const SSize_t ai32 = len;
1210 for (len = 0; len < ai32; len++) {
1211 if (len & 1) bits <<= 4;
1212 else if (utf8) {
1213 if (s >= strend) break;
1214 bits = utf8_to_byte(aTHX_ &s, strend, datumtype);
1215 } else bits = *(U8 *) s++;
1216 if (!checksum)
1217 *str++ = PL_hexdigit[(bits >> 4) & 15];
1218 }
1219 }
1220 if (!checksum) {
1221 *str = '\0';
1222 SvCUR_set(sv, str - SvPVX_const(sv));
1223 XPUSHs(sv);
1224 }
1225 break;
1226 }
1227 case 'C':
1228 if (len == 0) {
1229 if (explicit_length)
1230 /* Switch to "character" mode */
1231 utf8 = (symptr->flags & FLAG_DO_UTF8) ? 1 : 0;
1232 break;
1233 }
1234 /* FALLTHROUGH */
1235 case 'c':
1236 while (len-- > 0 && s < strend) {
1237 int aint;
1238 if (utf8)
1239 {
1240 STRLEN retlen;
1241 aint = utf8n_to_uvchr((U8 *) s, strend-s, &retlen,
1242 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
1243 if (retlen == (STRLEN) -1)
1244 Perl_croak(aTHX_ "Malformed UTF-8 string in unpack");
1245 s += retlen;
1246 }
1247 else
1248 aint = *(U8 *)(s)++;
1249 if (aint >= 128 && datumtype != 'C') /* fake up signed chars */
1250 aint -= 256;
1251 if (!checksum)
1252 mPUSHi(aint);
1253 else if (checksum > bits_in_uv)
1254 cdouble += (NV)aint;
1255 else
1256 cuv += aint;
1257 }
1258 break;
1259 case 'W':
1260 W_checksum:
1261 if (utf8) {
1262 while (len-- > 0 && s < strend) {
1263 STRLEN retlen;
1264 const UV val = utf8n_to_uvchr((U8 *) s, strend-s, &retlen,
1265 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
1266 if (retlen == (STRLEN) -1)
1267 Perl_croak(aTHX_ "Malformed UTF-8 string in unpack");
1268 s += retlen;
1269 if (!checksum)
1270 mPUSHu(val);
1271 else if (checksum > bits_in_uv)
1272 cdouble += (NV) val;
1273 else
1274 cuv += val;
1275 }
1276 } else if (!checksum)
1277 while (len-- > 0) {
1278 const U8 ch = *(U8 *) s++;
1279 mPUSHu(ch);
1280 }
1281 else if (checksum > bits_in_uv)
1282 while (len-- > 0) cdouble += (NV) *(U8 *) s++;
1283 else
1284 while (len-- > 0) cuv += *(U8 *) s++;
1285 break;
1286 case 'U':
1287 if (len == 0) {
1288 if (explicit_length && howlen != e_star) {
1289 /* Switch to "bytes in UTF-8" mode */
1290 if (symptr->flags & FLAG_DO_UTF8) utf8 = 0;
1291 else
1292 /* Should be impossible due to the need_utf8() test */
1293 Perl_croak(aTHX_ "U0 mode on a byte string");
1294 }
1295 break;
1296 }
1297 if (len > strend - s) len = strend - s;
1298 if (!checksum) {
1299 if (len && unpack_only_one) len = 1;
1300 EXTEND(SP, len);
1301 EXTEND_MORTAL(len);
1302 }
1303 while (len-- > 0 && s < strend) {
1304 STRLEN retlen;
1305 UV auv;
1306 if (utf8) {
1307 U8 result[UTF8_MAXLEN+1];
1308 const char *ptr = s;
1309 STRLEN len;
1310 /* Bug: warns about bad utf8 even if we are short on bytes
1311 and will break out of the loop */
1312 if (!S_utf8_to_bytes(aTHX_ &ptr, strend, (char *) result, 1,
1313 'U'))
1314 break;
1315 len = UTF8SKIP(result);
1316 if (!S_utf8_to_bytes(aTHX_ &ptr, strend,
1317 (char *) &result[1], len-1, 'U')) break;
1318 auv = NATIVE_TO_UNI(utf8n_to_uvchr(result,
1319 len,
1320 &retlen,
1321 UTF8_ALLOW_DEFAULT));
1322 s = ptr;
1323 } else {
1324 auv = NATIVE_TO_UNI(utf8n_to_uvchr((U8*)s,
1325 strend - s,
1326 &retlen,
1327 UTF8_ALLOW_DEFAULT));
1328 if (retlen == (STRLEN) -1)
1329 Perl_croak(aTHX_ "Malformed UTF-8 string in unpack");
1330 s += retlen;
1331 }
1332 if (!checksum)
1333 mPUSHu(auv);
1334 else if (checksum > bits_in_uv)
1335 cdouble += (NV) auv;
1336 else
1337 cuv += auv;
1338 }
1339 break;
1340 case 's' | TYPE_IS_SHRIEKING:
1341#if SHORTSIZE != SIZE16
1342 while (len-- > 0) {
1343 short ashort;
1344 SHIFT_VAR(utf8, s, strend, ashort, datumtype, needs_swap);
1345 if (!checksum)
1346 mPUSHi(ashort);
1347 else if (checksum > bits_in_uv)
1348 cdouble += (NV)ashort;
1349 else
1350 cuv += ashort;
1351 }
1352 break;
1353#else
1354 /* FALLTHROUGH */
1355#endif
1356 case 's':
1357 while (len-- > 0) {
1358 I16 ai16;
1359
1360#if U16SIZE > SIZE16
1361 ai16 = 0;
1362#endif
1363 SHIFT16(utf8, s, strend, &ai16, datumtype, needs_swap);
1364#if U16SIZE > SIZE16
1365 if (ai16 > 32767)
1366 ai16 -= 65536;
1367#endif
1368 if (!checksum)
1369 mPUSHi(ai16);
1370 else if (checksum > bits_in_uv)
1371 cdouble += (NV)ai16;
1372 else
1373 cuv += ai16;
1374 }
1375 break;
1376 case 'S' | TYPE_IS_SHRIEKING:
1377#if SHORTSIZE != SIZE16
1378 while (len-- > 0) {
1379 unsigned short aushort;
1380 SHIFT_VAR(utf8, s, strend, aushort, datumtype, needs_swap);
1381 if (!checksum)
1382 mPUSHu(aushort);
1383 else if (checksum > bits_in_uv)
1384 cdouble += (NV)aushort;
1385 else
1386 cuv += aushort;
1387 }
1388 break;
1389#else
1390 /* FALLTHROUGH */
1391#endif
1392 case 'v':
1393 case 'n':
1394 case 'S':
1395 while (len-- > 0) {
1396 U16 au16;
1397#if U16SIZE > SIZE16
1398 au16 = 0;
1399#endif
1400 SHIFT16(utf8, s, strend, &au16, datumtype, needs_swap);
1401 if (datumtype == 'n')
1402 au16 = PerlSock_ntohs(au16);
1403 if (datumtype == 'v')
1404 au16 = vtohs(au16);
1405 if (!checksum)
1406 mPUSHu(au16);
1407 else if (checksum > bits_in_uv)
1408 cdouble += (NV) au16;
1409 else
1410 cuv += au16;
1411 }
1412 break;
1413 case 'v' | TYPE_IS_SHRIEKING:
1414 case 'n' | TYPE_IS_SHRIEKING:
1415 while (len-- > 0) {
1416 I16 ai16;
1417# if U16SIZE > SIZE16
1418 ai16 = 0;
1419# endif
1420 SHIFT16(utf8, s, strend, &ai16, datumtype, needs_swap);
1421 /* There should never be any byte-swapping here. */
1422 assert(!TYPE_ENDIANNESS(datumtype));
1423 if (datumtype == ('n' | TYPE_IS_SHRIEKING))
1424 ai16 = (I16) PerlSock_ntohs((U16) ai16);
1425 if (datumtype == ('v' | TYPE_IS_SHRIEKING))
1426 ai16 = (I16) vtohs((U16) ai16);
1427 if (!checksum)
1428 mPUSHi(ai16);
1429 else if (checksum > bits_in_uv)
1430 cdouble += (NV) ai16;
1431 else
1432 cuv += ai16;
1433 }
1434 break;
1435 case 'i':
1436 case 'i' | TYPE_IS_SHRIEKING:
1437 while (len-- > 0) {
1438 int aint;
1439 SHIFT_VAR(utf8, s, strend, aint, datumtype, needs_swap);
1440 if (!checksum)
1441 mPUSHi(aint);
1442 else if (checksum > bits_in_uv)
1443 cdouble += (NV)aint;
1444 else
1445 cuv += aint;
1446 }
1447 break;
1448 case 'I':
1449 case 'I' | TYPE_IS_SHRIEKING:
1450 while (len-- > 0) {
1451 unsigned int auint;
1452 SHIFT_VAR(utf8, s, strend, auint, datumtype, needs_swap);
1453 if (!checksum)
1454 mPUSHu(auint);
1455 else if (checksum > bits_in_uv)
1456 cdouble += (NV)auint;
1457 else
1458 cuv += auint;
1459 }
1460 break;
1461 case 'j':
1462 while (len-- > 0) {
1463 IV aiv;
1464 SHIFT_VAR(utf8, s, strend, aiv, datumtype, needs_swap);
1465 if (!checksum)
1466 mPUSHi(aiv);
1467 else if (checksum > bits_in_uv)
1468 cdouble += (NV)aiv;
1469 else
1470 cuv += aiv;
1471 }
1472 break;
1473 case 'J':
1474 while (len-- > 0) {
1475 UV auv;
1476 SHIFT_VAR(utf8, s, strend, auv, datumtype, needs_swap);
1477 if (!checksum)
1478 mPUSHu(auv);
1479 else if (checksum > bits_in_uv)
1480 cdouble += (NV)auv;
1481 else
1482 cuv += auv;
1483 }
1484 break;
1485 case 'l' | TYPE_IS_SHRIEKING:
1486#if LONGSIZE != SIZE32
1487 while (len-- > 0) {
1488 long along;
1489 SHIFT_VAR(utf8, s, strend, along, datumtype, needs_swap);
1490 if (!checksum)
1491 mPUSHi(along);
1492 else if (checksum > bits_in_uv)
1493 cdouble += (NV)along;
1494 else
1495 cuv += along;
1496 }
1497 break;
1498#else
1499 /* FALLTHROUGH */
1500#endif
1501 case 'l':
1502 while (len-- > 0) {
1503 I32 ai32;
1504#if U32SIZE > SIZE32
1505 ai32 = 0;
1506#endif
1507 SHIFT32(utf8, s, strend, &ai32, datumtype, needs_swap);
1508#if U32SIZE > SIZE32
1509 if (ai32 > 2147483647) ai32 -= 4294967296;
1510#endif
1511 if (!checksum)
1512 mPUSHi(ai32);
1513 else if (checksum > bits_in_uv)
1514 cdouble += (NV)ai32;
1515 else
1516 cuv += ai32;
1517 }
1518 break;
1519 case 'L' | TYPE_IS_SHRIEKING:
1520#if LONGSIZE != SIZE32
1521 while (len-- > 0) {
1522 unsigned long aulong;
1523 SHIFT_VAR(utf8, s, strend, aulong, datumtype, needs_swap);
1524 if (!checksum)
1525 mPUSHu(aulong);
1526 else if (checksum > bits_in_uv)
1527 cdouble += (NV)aulong;
1528 else
1529 cuv += aulong;
1530 }
1531 break;
1532#else
1533 /* FALLTHROUGH */
1534#endif
1535 case 'V':
1536 case 'N':
1537 case 'L':
1538 while (len-- > 0) {
1539 U32 au32;
1540#if U32SIZE > SIZE32
1541 au32 = 0;
1542#endif
1543 SHIFT32(utf8, s, strend, &au32, datumtype, needs_swap);
1544 if (datumtype == 'N')
1545 au32 = PerlSock_ntohl(au32);
1546 if (datumtype == 'V')
1547 au32 = vtohl(au32);
1548 if (!checksum)
1549 mPUSHu(au32);
1550 else if (checksum > bits_in_uv)
1551 cdouble += (NV)au32;
1552 else
1553 cuv += au32;
1554 }
1555 break;
1556 case 'V' | TYPE_IS_SHRIEKING:
1557 case 'N' | TYPE_IS_SHRIEKING:
1558 while (len-- > 0) {
1559 I32 ai32;
1560#if U32SIZE > SIZE32
1561 ai32 = 0;
1562#endif
1563 SHIFT32(utf8, s, strend, &ai32, datumtype, needs_swap);
1564 /* There should never be any byte swapping here. */
1565 assert(!TYPE_ENDIANNESS(datumtype));
1566 if (datumtype == ('N' | TYPE_IS_SHRIEKING))
1567 ai32 = (I32)PerlSock_ntohl((U32)ai32);
1568 if (datumtype == ('V' | TYPE_IS_SHRIEKING))
1569 ai32 = (I32)vtohl((U32)ai32);
1570 if (!checksum)
1571 mPUSHi(ai32);
1572 else if (checksum > bits_in_uv)
1573 cdouble += (NV)ai32;
1574 else
1575 cuv += ai32;
1576 }
1577 break;
1578 case 'p':
1579 while (len-- > 0) {
1580 const char *aptr;
1581 SHIFT_VAR(utf8, s, strend, aptr, datumtype, needs_swap);
1582 /* newSVpv generates undef if aptr is NULL */
1583 mPUSHs(newSVpv(aptr, 0));
1584 }
1585 break;
1586 case 'w':
1587 {
1588 UV auv = 0;
1589 size_t bytes = 0;
1590
1591 while (len > 0 && s < strend) {
1592 U8 ch;
1593 ch = SHIFT_BYTE(utf8, s, strend, datumtype);
1594 auv = (auv << 7) | (ch & 0x7f);
1595 /* UTF8_IS_XXXXX not right here because this is a BER, not
1596 * UTF-8 format - using constant 0x80 */
1597 if (ch < 0x80) {
1598 bytes = 0;
1599 mPUSHu(auv);
1600 len--;
1601 auv = 0;
1602 continue;
1603 }
1604 if (++bytes >= sizeof(UV)) { /* promote to string */
1605 const char *t;
1606
1607 sv = Perl_newSVpvf(aTHX_ "%.*" UVuf,
1608 (int)TYPE_DIGITS(UV), auv);
1609 while (s < strend) {
1610 ch = SHIFT_BYTE(utf8, s, strend, datumtype);
1611 sv = mul128(sv, (U8)(ch & 0x7f));
1612 if (!(ch & 0x80)) {
1613 bytes = 0;
1614 break;
1615 }
1616 }
1617 t = SvPV_nolen_const(sv);
1618 while (*t == '0')
1619 t++;
1620 sv_chop(sv, t);
1621 mPUSHs(sv);
1622 len--;
1623 auv = 0;
1624 }
1625 }
1626 if ((s >= strend) && bytes)
1627 Perl_croak(aTHX_ "Unterminated compressed integer in unpack");
1628 }
1629 break;
1630 case 'P':
1631 if (symptr->howlen == e_star)
1632 Perl_croak(aTHX_ "'P' must have an explicit size in unpack");
1633 EXTEND(SP, 1);
1634 if (s + sizeof(char*) <= strend) {
1635 char *aptr;
1636 SHIFT_VAR(utf8, s, strend, aptr, datumtype, needs_swap);
1637 /* newSVpvn generates undef if aptr is NULL */
1638 PUSHs(newSVpvn_flags(aptr, len, SVs_TEMP));
1639 }
1640 break;
1641#if defined(HAS_QUAD) && IVSIZE >= 8
1642 case 'q':
1643 while (len-- > 0) {
1644 Quad_t aquad;
1645 SHIFT_VAR(utf8, s, strend, aquad, datumtype, needs_swap);
1646 if (!checksum)
1647 mPUSHs(newSViv((IV)aquad));
1648 else if (checksum > bits_in_uv)
1649 cdouble += (NV)aquad;
1650 else
1651 cuv += aquad;
1652 }
1653 break;
1654 case 'Q':
1655 while (len-- > 0) {
1656 Uquad_t auquad;
1657 SHIFT_VAR(utf8, s, strend, auquad, datumtype, needs_swap);
1658 if (!checksum)
1659 mPUSHs(newSVuv((UV)auquad));
1660 else if (checksum > bits_in_uv)
1661 cdouble += (NV)auquad;
1662 else
1663 cuv += auquad;
1664 }
1665 break;
1666#endif
1667 /* float and double added gnb@melba.bby.oz.au 22/11/89 */
1668 case 'f':
1669 while (len-- > 0) {
1670 float afloat;
1671 SHIFT_VAR(utf8, s, strend, afloat, datumtype, needs_swap);
1672 if (!checksum)
1673 mPUSHn(afloat);
1674 else
1675 cdouble += afloat;
1676 }
1677 break;
1678 case 'd':
1679 while (len-- > 0) {
1680 double adouble;
1681 SHIFT_VAR(utf8, s, strend, adouble, datumtype, needs_swap);
1682 if (!checksum)
1683 mPUSHn(adouble);
1684 else
1685 cdouble += adouble;
1686 }
1687 break;
1688 case 'F':
1689 while (len-- > 0) {
1690 NV_bytes anv;
1691 SHIFT_BYTES(utf8, s, strend, anv.bytes, sizeof(anv.bytes),
1692 datumtype, needs_swap);
1693 if (!checksum)
1694 mPUSHn(anv.nv);
1695 else
1696 cdouble += anv.nv;
1697 }
1698 break;
1699#if defined(HAS_LONG_DOUBLE) && defined(USE_LONG_DOUBLE)
1700 case 'D':
1701 while (len-- > 0) {
1702 ld_bytes aldouble;
1703 SHIFT_BYTES(utf8, s, strend, aldouble.bytes,
1704 sizeof(aldouble.bytes), datumtype, needs_swap);
1705 /* The most common long double format, the x86 80-bit
1706 * extended precision, has either 2 or 6 unused bytes,
1707 * which may contain garbage, which may contain
1708 * unintentional data. While we do zero the bytes of
1709 * the long double data in pack(), here in unpack() we
1710 * don't, because it's really hard to envision that
1711 * reading the long double off aldouble would be
1712 * affected by the unused bytes.
1713 *
1714 * Note that trying to unpack 'long doubles' of 'long
1715 * doubles' packed in another system is in the general
1716 * case doomed without having more detail. */
1717 if (!checksum)
1718 mPUSHn(aldouble.ld);
1719 else
1720 cdouble += aldouble.ld;
1721 }
1722 break;
1723#endif
1724 case 'u':
1725 if (!checksum) {
1726 const STRLEN l = (STRLEN) (strend - s) * 3 / 4;
1727 sv = sv_2mortal(newSV(l));
1728 if (l) {
1729 SvPOK_on(sv);
1730 *SvEND(sv) = '\0';
1731 }
1732 }
1733
1734 /* Note that all legal uuencoded strings are ASCII printables, so
1735 * have the same representation under UTF-8 vs not. This means we
1736 * can ignore UTF8ness on legal input. For illegal we stop at the
1737 * first failure, and don't report where/what that is, so again we
1738 * can ignore UTF8ness */
1739
1740 while (s < strend && *s != ' ' && ISUUCHAR(*s)) {
1741 I32 a, b, c, d;
1742 char hunk[3];
1743
1744 len = PL_uudmap[*(U8*)s++] & 077;
1745 while (len > 0) {
1746 if (s < strend && ISUUCHAR(*s))
1747 a = PL_uudmap[*(U8*)s++] & 077;
1748 else
1749 a = 0;
1750 if (s < strend && ISUUCHAR(*s))
1751 b = PL_uudmap[*(U8*)s++] & 077;
1752 else
1753 b = 0;
1754 if (s < strend && ISUUCHAR(*s))
1755 c = PL_uudmap[*(U8*)s++] & 077;
1756 else
1757 c = 0;
1758 if (s < strend && ISUUCHAR(*s))
1759 d = PL_uudmap[*(U8*)s++] & 077;
1760 else
1761 d = 0;
1762 hunk[0] = (char)((a << 2) | (b >> 4));
1763 hunk[1] = (char)((b << 4) | (c >> 2));
1764 hunk[2] = (char)((c << 6) | d);
1765 if (!checksum)
1766 sv_catpvn(sv, hunk, (len > 3) ? 3 : len);
1767 len -= 3;
1768 }
1769 if (*s == '\n')
1770 s++;
1771 else /* possible checksum byte */
1772 if (s + 1 < strend && s[1] == '\n')
1773 s += 2;
1774 }
1775 if (!checksum)
1776 XPUSHs(sv);
1777 break;
1778 } /* End of switch */
1779
1780 if (checksum) {
1781 if (memCHRs("fFdD", TYPE_NO_MODIFIERS(datumtype)) ||
1782 (checksum > bits_in_uv &&
1783 memCHRs("cCsSiIlLnNUWvVqQjJ", TYPE_NO_MODIFIERS(datumtype))) ) {
1784 NV trouble, anv;
1785
1786 anv = (NV) (1 << (checksum & 15));
1787 while (checksum >= 16) {
1788 checksum -= 16;
1789 anv *= 65536.0;
1790 }
1791 while (cdouble < 0.0)
1792 cdouble += anv;
1793 cdouble = Perl_modf(cdouble / anv, &trouble);
1794#ifdef LONGDOUBLE_DOUBLEDOUBLE
1795 /* Workaround for powerpc doubledouble modfl bug:
1796 * close to 1.0L and -1.0L cdouble is 0, and trouble
1797 * is cdouble / anv. */
1798 if (trouble != Perl_ceil(trouble)) {
1799 cdouble = trouble;
1800 if (cdouble > 1.0L) cdouble -= 1.0L;
1801 if (cdouble < -1.0L) cdouble += 1.0L;
1802 }
1803#endif
1804 cdouble *= anv;
1805 sv = newSVnv(cdouble);
1806 }
1807 else {
1808 if (checksum < bits_in_uv) {
1809 UV mask = nBIT_MASK(checksum);
1810 cuv &= mask;
1811 }
1812 sv = newSVuv(cuv);
1813 }
1814 mXPUSHs(sv);
1815 checksum = 0;
1816 }
1817
1818 if (symptr->flags & FLAG_SLASH){
1819 if (SP - PL_stack_base - start_sp_offset <= 0)
1820 break;
1821 if( next_symbol(symptr) ){
1822 if( symptr->howlen == e_number )
1823 Perl_croak(aTHX_ "Count after length/code in unpack" );
1824 if( beyond ){
1825 /* ...end of char buffer then no decent length available */
1826 Perl_croak(aTHX_ "length/code after end of string in unpack" );
1827 } else {
1828 /* take top of stack (hope it's numeric) */
1829 len = POPi;
1830 if( len < 0 )
1831 Perl_croak(aTHX_ "Negative '/' count in unpack" );
1832 }
1833 } else {
1834 Perl_croak(aTHX_ "Code missing after '/' in unpack" );
1835 }
1836 datumtype = symptr->code;
1837 explicit_length = FALSE;
1838 goto redo_switch;
1839 }
1840 }
1841
1842 if (new_s)
1843 *new_s = s;
1844 PUTBACK;
1845 return SP - PL_stack_base - start_sp_offset;
1846}
1847
1848PP(pp_unpack)
1849{
1850 dSP;
1851 dPOPPOPssrl;
1852 U8 gimme = GIMME_V;
1853 STRLEN llen;
1854 STRLEN rlen;
1855 const char *pat = SvPV_const(left, llen);
1856 const char *s = SvPV_const(right, rlen);
1857 const char *strend = s + rlen;
1858 const char *patend = pat + llen;
1859 SSize_t cnt;
1860
1861 PUTBACK;
1862 cnt = unpackstring(pat, patend, s, strend,
1863 ((gimme == G_SCALAR) ? FLAG_UNPACK_ONLY_ONE : 0)
1864 | (DO_UTF8(right) ? FLAG_DO_UTF8 : 0));
1865
1866 SPAGAIN;
1867 if ( !cnt && gimme == G_SCALAR )
1868 PUSHs(&PL_sv_undef);
1869 RETURN;
1870}
1871
1872STATIC U8 *
1873doencodes(U8 *h, const U8 *s, SSize_t len)
1874{
1875 *h++ = PL_uuemap[len];
1876 while (len > 2) {
1877 *h++ = PL_uuemap[(077 & (s[0] >> 2))];
1878 *h++ = PL_uuemap[(077 & (((s[0] << 4) & 060) | ((s[1] >> 4) & 017)))];
1879 *h++ = PL_uuemap[(077 & (((s[1] << 2) & 074) | ((s[2] >> 6) & 03)))];
1880 *h++ = PL_uuemap[(077 & (s[2] & 077))];
1881 s += 3;
1882 len -= 3;
1883 }
1884 if (len > 0) {
1885 const U8 r = (len > 1 ? s[1] : '\0');
1886 *h++ = PL_uuemap[(077 & (s[0] >> 2))];
1887 *h++ = PL_uuemap[(077 & (((s[0] << 4) & 060) | ((r >> 4) & 017)))];
1888 *h++ = PL_uuemap[(077 & ((r << 2) & 074))];
1889 *h++ = PL_uuemap[0];
1890 }
1891 *h++ = '\n';
1892 return h;
1893}
1894
1895STATIC SV *
1896S_is_an_int(pTHX_ const char *s, STRLEN l)
1897{
1898 SV *result = newSVpvn(s, l);
1899 char *const result_c = SvPV_nolen(result); /* convenience */
1900 char *out = result_c;
1901 bool skip = 1;
1902 bool ignore = 0;
1903
1904 PERL_ARGS_ASSERT_IS_AN_INT;
1905
1906 while (*s) {
1907 switch (*s) {
1908 case ' ':
1909 break;
1910 case '+':
1911 if (!skip) {
1912 SvREFCNT_dec(result);
1913 return (NULL);
1914 }
1915 break;
1916 case '0':
1917 case '1':
1918 case '2':
1919 case '3':
1920 case '4':
1921 case '5':
1922 case '6':
1923 case '7':
1924 case '8':
1925 case '9':
1926 skip = 0;
1927 if (!ignore) {
1928 *(out++) = *s;
1929 }
1930 break;
1931 case '.':
1932 ignore = 1;
1933 break;
1934 default:
1935 SvREFCNT_dec(result);
1936 return (NULL);
1937 }
1938 s++;
1939 }
1940 *(out++) = '\0';
1941 SvCUR_set(result, out - result_c);
1942 return (result);
1943}
1944
1945/* pnum must be '\0' terminated */
1946STATIC int
1947S_div128(pTHX_ SV *pnum, bool *done)
1948{
1949 STRLEN len;
1950 char * const s = SvPV(pnum, len);
1951 char *t = s;
1952 int m = 0;
1953
1954 PERL_ARGS_ASSERT_DIV128;
1955
1956 *done = 1;
1957 while (*t) {
1958 const int i = m * 10 + (*t - '0');
1959 const int r = (i >> 7); /* r < 10 */
1960 m = i & 0x7F;
1961 if (r) {
1962 *done = 0;
1963 }
1964 *(t++) = '0' + r;
1965 }
1966 *(t++) = '\0';
1967 SvCUR_set(pnum, (STRLEN) (t - s));
1968 return (m);
1969}
1970
1971/*
1972=for apidoc packlist
1973
1974The engine implementing C<pack()> Perl function.
1975
1976=cut
1977*/
1978
1979void
1980Perl_packlist(pTHX_ SV *cat, const char *pat, const char *patend, SV **beglist, SV **endlist )
1981{
1982 tempsym_t sym;
1983
1984 PERL_ARGS_ASSERT_PACKLIST;
1985
1986 TEMPSYM_INIT(&sym, pat, patend, FLAG_PACK);
1987
1988 /* We're going to do changes through SvPVX(cat). Make sure it's valid.
1989 Also make sure any UTF8 flag is loaded */
1990 SvPV_force_nolen(cat);
1991 if (DO_UTF8(cat))
1992 sym.flags |= FLAG_PARSE_UTF8 | FLAG_DO_UTF8;
1993
1994 (void)pack_rec( cat, &sym, beglist, endlist );
1995}
1996
1997/* like sv_utf8_upgrade, but also repoint the group start markers */
1998STATIC void
1999marked_upgrade(pTHX_ SV *sv, tempsym_t *sym_ptr) {
2000 STRLEN len;
2001 tempsym_t *group;
2002 const char *from_ptr, *from_start, *from_end, **marks, **m;
2003 char *to_start, *to_ptr;
2004
2005 if (SvUTF8(sv)) return;
2006
2007 from_start = SvPVX_const(sv);
2008 from_end = from_start + SvCUR(sv);
2009 for (from_ptr = from_start; from_ptr < from_end; from_ptr++)
2010 if (!NATIVE_BYTE_IS_INVARIANT(*from_ptr)) break;
2011 if (from_ptr == from_end) {
2012 /* Simple case: no character needs to be changed */
2013 SvUTF8_on(sv);
2014 return;
2015 }
2016
2017 len = (from_end-from_ptr)*UTF8_EXPAND+(from_ptr-from_start)+1;
2018 Newx(to_start, len, char);
2019 Copy(from_start, to_start, from_ptr-from_start, char);
2020 to_ptr = to_start + (from_ptr-from_start);
2021
2022 Newx(marks, sym_ptr->level+2, const char *);
2023 for (group=sym_ptr; group; group = group->previous)
2024 marks[group->level] = from_start + group->strbeg;
2025 marks[sym_ptr->level+1] = from_end+1;
2026 for (m = marks; *m < from_ptr; m++)
2027 *m = to_start + (*m-from_start);
2028
2029 for (;from_ptr < from_end; from_ptr++) {
2030 while (*m == from_ptr) *m++ = to_ptr;
2031 to_ptr = (char *) uvchr_to_utf8((U8 *) to_ptr, *(U8 *) from_ptr);
2032 }
2033 *to_ptr = 0;
2034
2035 while (*m == from_ptr) *m++ = to_ptr;
2036 if (m != marks + sym_ptr->level+1) {
2037 Safefree(marks);
2038 Safefree(to_start);
2039 Perl_croak(aTHX_ "panic: marks beyond string end, m=%p, marks=%p, "
2040 "level=%d", m, marks, sym_ptr->level);
2041 }
2042 for (group=sym_ptr; group; group = group->previous)
2043 group->strbeg = marks[group->level] - to_start;
2044 Safefree(marks);
2045
2046 if (SvOOK(sv)) {
2047 if (SvIVX(sv)) {
2048 SvLEN_set(sv, SvLEN(sv) + SvIVX(sv));
2049 from_start -= SvIVX(sv);
2050 SvIV_set(sv, 0);
2051 }
2052 SvFLAGS(sv) &= ~SVf_OOK;
2053 }
2054 if (SvLEN(sv) != 0)
2055 Safefree(from_start);
2056 SvPV_set(sv, to_start);
2057 SvCUR_set(sv, to_ptr - to_start);
2058 SvLEN_set(sv, len);
2059 SvUTF8_on(sv);
2060}
2061
2062/* Exponential string grower. Makes string extension effectively O(n)
2063 needed says how many extra bytes we need (not counting the final '\0')
2064 Only grows the string if there is an actual lack of space
2065*/
2066STATIC char *
2067S_sv_exp_grow(pTHX_ SV *sv, STRLEN needed) {
2068 const STRLEN cur = SvCUR(sv);
2069 const STRLEN len = SvLEN(sv);
2070 STRLEN extend;
2071
2072 PERL_ARGS_ASSERT_SV_EXP_GROW;
2073
2074 if (len - cur > needed) return SvPVX(sv);
2075 extend = needed > len ? needed : len;
2076 return SvGROW(sv, len+extend+1);
2077}
2078
2079static SV *
2080S_sv_check_infnan(pTHX_ SV *sv, I32 datumtype)
2081{
2082 SvGETMAGIC(sv);
2083 if (UNLIKELY(SvAMAGIC(sv)))
2084 sv = sv_2num(sv);
2085 if (UNLIKELY(isinfnansv(sv))) {
2086 const I32 c = TYPE_NO_MODIFIERS(datumtype);
2087 const NV nv = SvNV_nomg(sv);
2088 if (c == 'w')
2089 Perl_croak(aTHX_ "Cannot compress %" NVgf " in pack", nv);
2090 else
2091 Perl_croak(aTHX_ "Cannot pack %" NVgf " with '%c'", nv, (int) c);
2092 }
2093 return sv;
2094}
2095
2096#define SvIV_no_inf(sv,d) \
2097 ((sv) = S_sv_check_infnan(aTHX_ sv,d), SvIV_nomg(sv))
2098#define SvUV_no_inf(sv,d) \
2099 ((sv) = S_sv_check_infnan(aTHX_ sv,d), SvUV_nomg(sv))
2100
2101STATIC
2102SV **
2103S_pack_rec(pTHX_ SV *cat, tempsym_t* symptr, SV **beglist, SV **endlist )
2104{
2105 tempsym_t lookahead;
2106 SSize_t items = endlist - beglist;
2107 bool found = next_symbol(symptr);
2108 bool utf8 = (symptr->flags & FLAG_PARSE_UTF8) ? 1 : 0;
2109 bool warn_utf8 = ckWARN(WARN_UTF8);
2110 char* from;
2111
2112 PERL_ARGS_ASSERT_PACK_REC;
2113
2114 if (symptr->level == 0 && found && symptr->code == 'U') {
2115 marked_upgrade(aTHX_ cat, symptr);
2116 symptr->flags |= FLAG_DO_UTF8;
2117 utf8 = 0;
2118 }
2119 symptr->strbeg = SvCUR(cat);
2120
2121 while (found) {
2122 SV *fromstr;
2123 STRLEN fromlen;
2124 SSize_t len;
2125 SV *lengthcode = NULL;
2126 I32 datumtype = symptr->code;
2127 howlen_t howlen = symptr->howlen;
2128 char *start = SvPVX(cat);
2129 char *cur = start + SvCUR(cat);
2130 bool needs_swap;
2131
2132#define NEXTFROM (lengthcode ? lengthcode : items > 0 ? (--items, *beglist++) : &PL_sv_no)
2133#define PEEKFROM (lengthcode ? lengthcode : items > 0 ? *beglist : &PL_sv_no)
2134
2135 switch (howlen) {
2136 case e_star:
2137 len = memCHRs("@Xxu", TYPE_NO_MODIFIERS(datumtype)) ?
2138 0 : items;
2139 break;
2140 default:
2141 /* e_no_len and e_number */
2142 len = symptr->length;
2143 break;
2144 }
2145
2146 if (len) {
2147 packprops_t props = packprops[TYPE_NO_ENDIANNESS(datumtype)];
2148
2149 if (props && !(props & PACK_SIZE_UNPREDICTABLE)) {
2150 /* We can process this letter. */
2151 STRLEN size = props & PACK_SIZE_MASK;
2152 GROWING2(utf8, cat, start, cur, size, (STRLEN)len);
2153 }
2154 }
2155
2156 /* Look ahead for next symbol. Do we have code/code? */
2157 lookahead = *symptr;
2158 found = next_symbol(&lookahead);
2159 if (symptr->flags & FLAG_SLASH) {
2160 IV count;
2161 if (!found) Perl_croak(aTHX_ "Code missing after '/' in pack");
2162 if (memCHRs("aAZ", lookahead.code)) {
2163 if (lookahead.howlen == e_number) count = lookahead.length;
2164 else {
2165 if (items > 0) {
2166 count = sv_len_utf8(*beglist);
2167 }
2168 else count = 0;
2169 if (lookahead.code == 'Z') count++;
2170 }
2171 } else {
2172 if (lookahead.howlen == e_number && lookahead.length < items)
2173 count = lookahead.length;
2174 else count = items;
2175 }
2176 lookahead.howlen = e_number;
2177 lookahead.length = count;
2178 lengthcode = sv_2mortal(newSViv(count));
2179 }
2180
2181 needs_swap = NEEDS_SWAP(datumtype);
2182
2183 /* Code inside the switch must take care to properly update
2184 cat (CUR length and '\0' termination) if it updated *cur and
2185 doesn't simply leave using break */
2186 switch (TYPE_NO_ENDIANNESS(datumtype)) {
2187 default:
2188 Perl_croak(aTHX_ "Invalid type '%c' in pack",
2189 (int) TYPE_NO_MODIFIERS(datumtype));
2190 case '%':
2191 Perl_croak(aTHX_ "'%%' may not be used in pack");
2192
2193 case '.' | TYPE_IS_SHRIEKING:
2194 case '.':
2195 if (howlen == e_star) from = start;
2196 else if (len == 0) from = cur;
2197 else {
2198 tempsym_t *group = symptr;
2199
2200 while (--len && group) group = group->previous;
2201 from = group ? start + group->strbeg : start;
2202 }
2203 fromstr = NEXTFROM;
2204 len = SvIV_no_inf(fromstr, datumtype);
2205 goto resize;
2206 case '@' | TYPE_IS_SHRIEKING:
2207 case '@':
2208 from = start + symptr->strbeg;
2209 resize:
2210 if (utf8 && !(datumtype & TYPE_IS_SHRIEKING))
2211 if (len >= 0) {
2212 while (len && from < cur) {
2213 from += UTF8SKIP(from);
2214 len--;
2215 }
2216 if (from > cur)
2217 Perl_croak(aTHX_ "Malformed UTF-8 string in pack");
2218 if (len) {
2219 /* Here we know from == cur */
2220 grow:
2221 GROWING(0, cat, start, cur, len);
2222 Zero(cur, len, char);
2223 cur += len;
2224 } else if (from < cur) {
2225 len = cur - from;
2226 goto shrink;
2227 } else goto no_change;
2228 } else {
2229 cur = from;
2230 len = -len;
2231 goto utf8_shrink;
2232 }
2233 else {
2234 len -= cur - from;
2235 if (len > 0) goto grow;
2236 if (len == 0) goto no_change;
2237 len = -len;
2238 goto shrink;
2239 }
2240 break;
2241
2242 case '(': {
2243 tempsym_t savsym = *symptr;
2244 U32 group_modifiers = TYPE_MODIFIERS(datumtype & ~symptr->flags);
2245 symptr->flags |= group_modifiers;
2246 symptr->patend = savsym.grpend;
2247 symptr->level++;
2248 symptr->previous = &lookahead;
2249 while (len--) {
2250 U32 was_utf8;
2251 if (utf8) symptr->flags |= FLAG_PARSE_UTF8;
2252 else symptr->flags &= ~FLAG_PARSE_UTF8;
2253 was_utf8 = SvUTF8(cat);
2254 symptr->patptr = savsym.grpbeg;
2255 beglist = pack_rec(cat, symptr, beglist, endlist);
2256 if (SvUTF8(cat) != was_utf8)
2257 /* This had better be an upgrade while in utf8==0 mode */
2258 utf8 = 1;
2259
2260 if (savsym.howlen == e_star && beglist == endlist)
2261 break; /* No way to continue */
2262 }
2263 items = endlist - beglist;
2264 lookahead.flags = symptr->flags & ~group_modifiers;
2265 goto no_change;
2266 }
2267 case 'X' | TYPE_IS_SHRIEKING:
2268 if (!len) /* Avoid division by 0 */
2269 len = 1;
2270 if (utf8) {
2271 char *hop, *last;
2272 SSize_t l = len;
2273 hop = last = start;
2274 while (hop < cur) {
2275 hop += UTF8SKIP(hop);
2276 if (--l == 0) {
2277 last = hop;
2278 l = len;
2279 }
2280 }
2281 if (last > cur)
2282 Perl_croak(aTHX_ "Malformed UTF-8 string in pack");
2283 cur = last;
2284 break;
2285 }
2286 len = (cur-start) % len;
2287 /* FALLTHROUGH */
2288 case 'X':
2289 if (utf8) {
2290 if (len < 1) goto no_change;
2291 utf8_shrink:
2292 while (len > 0) {
2293 if (cur <= start)
2294 Perl_croak(aTHX_ "'%c' outside of string in pack",
2295 (int) TYPE_NO_MODIFIERS(datumtype));
2296 while (--cur, UTF8_IS_CONTINUATION(*cur)) {
2297 if (cur <= start)
2298 Perl_croak(aTHX_ "'%c' outside of string in pack",
2299 (int) TYPE_NO_MODIFIERS(datumtype));
2300 }
2301 len--;
2302 }
2303 } else {
2304 shrink:
2305 if (cur - start < len)
2306 Perl_croak(aTHX_ "'%c' outside of string in pack",
2307 (int) TYPE_NO_MODIFIERS(datumtype));
2308 cur -= len;
2309 }
2310 if (cur < start+symptr->strbeg) {
2311 /* Make sure group starts don't point into the void */
2312 tempsym_t *group;
2313 const STRLEN length = cur-start;
2314 for (group = symptr;
2315 group && length < group->strbeg;
2316 group = group->previous) group->strbeg = length;
2317 lookahead.strbeg = length;
2318 }
2319 break;
2320 case 'x' | TYPE_IS_SHRIEKING: {
2321 SSize_t ai32;
2322 if (!len) /* Avoid division by 0 */
2323 len = 1;
2324 if (utf8) ai32 = utf8_length((U8 *) start, (U8 *) cur) % len;
2325 else ai32 = (cur - start) % len;
2326 if (ai32 == 0) goto no_change;
2327 len -= ai32;
2328 }
2329 /* FALLTHROUGH */
2330 case 'x':
2331 goto grow;
2332 case 'A':
2333 case 'Z':
2334 case 'a': {
2335 const char *aptr;
2336
2337 fromstr = NEXTFROM;
2338 aptr = SvPV_const(fromstr, fromlen);
2339 if (DO_UTF8(fromstr)) {
2340 const char *end, *s;
2341
2342 if (!utf8 && !SvUTF8(cat)) {
2343 marked_upgrade(aTHX_ cat, symptr);
2344 lookahead.flags |= FLAG_DO_UTF8;
2345 lookahead.strbeg = symptr->strbeg;
2346 utf8 = 1;
2347 start = SvPVX(cat);
2348 cur = start + SvCUR(cat);
2349 }
2350 if (howlen == e_star) {
2351 if (utf8) goto string_copy;
2352 len = fromlen+1;
2353 }
2354 s = aptr;
2355 end = aptr + fromlen;
2356 fromlen = datumtype == 'Z' ? len-1 : len;
2357 while ((SSize_t) fromlen > 0 && s < end) {
2358 s += UTF8SKIP(s);
2359 fromlen--;
2360 }
2361 if (s > end)
2362 Perl_croak(aTHX_ "Malformed UTF-8 string in pack");
2363 if (utf8) {
2364 len = fromlen;
2365 if (datumtype == 'Z') len++;
2366 fromlen = s-aptr;
2367 len += fromlen;
2368
2369 goto string_copy;
2370 }
2371 fromlen = len - fromlen;
2372 if (datumtype == 'Z') fromlen--;
2373 if (howlen == e_star) {
2374 len = fromlen;
2375 if (datumtype == 'Z') len++;
2376 }
2377 GROWING(0, cat, start, cur, len);
2378 if (!S_utf8_to_bytes(aTHX_ &aptr, end, cur, fromlen,
2379 datumtype | TYPE_IS_PACK))
2380 Perl_croak(aTHX_ "panic: predicted utf8 length not available, "
2381 "for '%c', aptr=%p end=%p cur=%p, fromlen=%zu",
2382 (int)datumtype, aptr, end, cur, fromlen);
2383 cur += fromlen;
2384 len -= fromlen;
2385 } else if (utf8) {
2386 if (howlen == e_star) {
2387 len = fromlen;
2388 if (datumtype == 'Z') len++;
2389 }
2390 if (len <= (SSize_t) fromlen) {
2391 fromlen = len;
2392 if (datumtype == 'Z' && fromlen > 0) fromlen--;
2393 }
2394 /* assumes a byte expands to at most UTF8_EXPAND bytes on
2395 upgrade, so:
2396 expected_length <= from_len*UTF8_EXPAND + (len-from_len) */
2397 GROWING(0, cat, start, cur, fromlen*(UTF8_EXPAND-1)+len);
2398 len -= fromlen;
2399 while (fromlen > 0) {
2400 cur = (char *) uvchr_to_utf8((U8 *) cur, * (U8 *) aptr);
2401 aptr++;
2402 fromlen--;
2403 }
2404 } else {
2405 string_copy:
2406 if (howlen == e_star) {
2407 len = fromlen;
2408 if (datumtype == 'Z') len++;
2409 }
2410 if (len <= (SSize_t) fromlen) {
2411 fromlen = len;
2412 if (datumtype == 'Z' && fromlen > 0) fromlen--;
2413 }
2414 GROWING(0, cat, start, cur, len);
2415 Copy(aptr, cur, fromlen, char);
2416 cur += fromlen;
2417 len -= fromlen;
2418 }
2419 memset(cur, datumtype == 'A' ? ' ' : '\0', len);
2420 cur += len;
2421 SvTAINT(cat);
2422 break;
2423 }
2424 case 'B':
2425 case 'b': {
2426 const char *str, *end;
2427 SSize_t l, field_len;
2428 U8 bits;
2429 bool utf8_source;
2430 U32 utf8_flags;
2431
2432 fromstr = NEXTFROM;
2433 str = SvPV_const(fromstr, fromlen);
2434 end = str + fromlen;
2435 if (DO_UTF8(fromstr)) {
2436 utf8_source = TRUE;
2437 utf8_flags = warn_utf8 ? 0 : UTF8_ALLOW_ANY;
2438 } else {
2439 utf8_source = FALSE;
2440 utf8_flags = 0; /* Unused, but keep compilers happy */
2441 }
2442 if (howlen == e_star) len = fromlen;
2443 field_len = (len+7)/8;
2444 GROWING(utf8, cat, start, cur, field_len);
2445 if (len > (SSize_t)fromlen) len = fromlen;
2446 bits = 0;
2447 l = 0;
2448 if (datumtype == 'B')
2449 while (l++ < len) {
2450 if (utf8_source) {
2451 UV val = 0;
2452 NEXT_UNI_VAL(val, cur, str, end, utf8_flags);
2453 bits |= val & 1;
2454 } else bits |= *str++ & 1;
2455 if (l & 7) bits <<= 1;
2456 else {
2457 PUSH_BYTE(utf8, cur, bits);
2458 bits = 0;
2459 }
2460 }
2461 else
2462 /* datumtype == 'b' */
2463 while (l++ < len) {
2464 if (utf8_source) {
2465 UV val = 0;
2466 NEXT_UNI_VAL(val, cur, str, end, utf8_flags);
2467 if (val & 1) bits |= 0x80;
2468 } else if (*str++ & 1)
2469 bits |= 0x80;
2470 if (l & 7) bits >>= 1;
2471 else {
2472 PUSH_BYTE(utf8, cur, bits);
2473 bits = 0;
2474 }
2475 }
2476 l--;
2477 if (l & 7) {
2478 if (datumtype == 'B')
2479 bits <<= 7 - (l & 7);
2480 else
2481 bits >>= 7 - (l & 7);
2482 PUSH_BYTE(utf8, cur, bits);
2483 l += 7;
2484 }
2485 /* Determine how many chars are left in the requested field */
2486 l /= 8;
2487 if (howlen == e_star) field_len = 0;
2488 else field_len -= l;
2489 Zero(cur, field_len, char);
2490 cur += field_len;
2491 break;
2492 }
2493 case 'H':
2494 case 'h': {
2495 const char *str, *end;
2496 SSize_t l, field_len;
2497 U8 bits;
2498 bool utf8_source;
2499 U32 utf8_flags;
2500
2501 fromstr = NEXTFROM;
2502 str = SvPV_const(fromstr, fromlen);
2503 end = str + fromlen;
2504 if (DO_UTF8(fromstr)) {
2505 utf8_source = TRUE;
2506 utf8_flags = warn_utf8 ? 0 : UTF8_ALLOW_ANY;
2507 } else {
2508 utf8_source = FALSE;
2509 utf8_flags = 0; /* Unused, but keep compilers happy */
2510 }
2511 if (howlen == e_star) len = fromlen;
2512 field_len = (len+1)/2;
2513 GROWING(utf8, cat, start, cur, field_len);
2514 if (!utf8_source && len > (SSize_t)fromlen) len = fromlen;
2515 bits = 0;
2516 l = 0;
2517 if (datumtype == 'H')
2518 while (l++ < len) {
2519 if (utf8_source) {
2520 UV val = 0;
2521 NEXT_UNI_VAL(val, cur, str, end, utf8_flags);
2522 if (val < 256 && isALPHA(val))
2523 bits |= (val + 9) & 0xf;
2524 else
2525 bits |= val & 0xf;
2526 } else if (isALPHA(*str))
2527 bits |= (*str++ + 9) & 0xf;
2528 else
2529 bits |= *str++ & 0xf;
2530 if (l & 1) bits <<= 4;
2531 else {
2532 PUSH_BYTE(utf8, cur, bits);
2533 bits = 0;
2534 }
2535 }
2536 else
2537 while (l++ < len) {
2538 if (utf8_source) {
2539 UV val = 0;
2540 NEXT_UNI_VAL(val, cur, str, end, utf8_flags);
2541 if (val < 256 && isALPHA(val))
2542 bits |= ((val + 9) & 0xf) << 4;
2543 else
2544 bits |= (val & 0xf) << 4;
2545 } else if (isALPHA(*str))
2546 bits |= ((*str++ + 9) & 0xf) << 4;
2547 else
2548 bits |= (*str++ & 0xf) << 4;
2549 if (l & 1) bits >>= 4;
2550 else {
2551 PUSH_BYTE(utf8, cur, bits);
2552 bits = 0;
2553 }
2554 }
2555 l--;
2556 if (l & 1) {
2557 PUSH_BYTE(utf8, cur, bits);
2558 l++;
2559 }
2560 /* Determine how many chars are left in the requested field */
2561 l /= 2;
2562 if (howlen == e_star) field_len = 0;
2563 else field_len -= l;
2564 Zero(cur, field_len, char);
2565 cur += field_len;
2566 break;
2567 }
2568 case 'c':
2569 while (len-- > 0) {
2570 IV aiv;
2571 fromstr = NEXTFROM;
2572 aiv = SvIV_no_inf(fromstr, datumtype);
2573 if ((-128 > aiv || aiv > 127))
2574 Perl_ck_warner(aTHX_ packWARN(WARN_PACK),
2575 "Character in 'c' format wrapped in pack");
2576 PUSH_BYTE(utf8, cur, (U8)(aiv & 0xff));
2577 }
2578 break;
2579 case 'C':
2580 if (len == 0) {
2581 utf8 = (symptr->flags & FLAG_DO_UTF8) ? 1 : 0;
2582 break;
2583 }
2584 while (len-- > 0) {
2585 IV aiv;
2586 fromstr = NEXTFROM;
2587 aiv = SvIV_no_inf(fromstr, datumtype);
2588 if ((0 > aiv || aiv > 0xff))
2589 Perl_ck_warner(aTHX_ packWARN(WARN_PACK),
2590 "Character in 'C' format wrapped in pack");
2591 PUSH_BYTE(utf8, cur, (U8)(aiv & 0xff));
2592 }
2593 break;
2594 case 'W': {
2595 char *end;
2596 U8 in_bytes = (U8)IN_BYTES;
2597
2598 end = start+SvLEN(cat)-1;
2599 if (utf8) end -= UTF8_MAXLEN-1;
2600 while (len-- > 0) {
2601 UV auv;
2602 fromstr = NEXTFROM;
2603 auv = SvUV_no_inf(fromstr, datumtype);
2604 if (in_bytes) auv = auv % 0x100;
2605 if (utf8) {
2606 W_utf8:
2607 if (cur >= end) {
2608 *cur = '\0';
2609 SvCUR_set(cat, cur - start);
2610
2611 GROWING(0, cat, start, cur, len+UTF8_MAXLEN);
2612 end = start+SvLEN(cat)-UTF8_MAXLEN;
2613 }
2614 cur = (char *) uvchr_to_utf8_flags((U8 *) cur, auv, 0);
2615 } else {
2616 if (auv >= 0x100) {
2617 if (!SvUTF8(cat)) {
2618 *cur = '\0';
2619 SvCUR_set(cat, cur - start);
2620 marked_upgrade(aTHX_ cat, symptr);
2621 lookahead.flags |= FLAG_DO_UTF8;
2622 lookahead.strbeg = symptr->strbeg;
2623 utf8 = 1;
2624 start = SvPVX(cat);
2625 cur = start + SvCUR(cat);
2626 end = start+SvLEN(cat)-UTF8_MAXLEN;
2627 goto W_utf8;
2628 }
2629 Perl_ck_warner(aTHX_ packWARN(WARN_PACK),
2630 "Character in 'W' format wrapped in pack");
2631 auv &= 0xff;
2632 }
2633 if (cur >= end) {
2634 *cur = '\0';
2635 SvCUR_set(cat, cur - start);
2636 GROWING(0, cat, start, cur, len+1);
2637 end = start+SvLEN(cat)-1;
2638 }
2639 *(U8 *) cur++ = (U8)auv;
2640 }
2641 }
2642 break;
2643 }
2644 case 'U': {
2645 char *end;
2646
2647 if (len == 0) {
2648 if (!(symptr->flags & FLAG_DO_UTF8)) {
2649 marked_upgrade(aTHX_ cat, symptr);
2650 lookahead.flags |= FLAG_DO_UTF8;
2651 lookahead.strbeg = symptr->strbeg;
2652 }
2653 utf8 = 0;
2654 goto no_change;
2655 }
2656
2657 end = start+SvLEN(cat);
2658 if (!utf8) end -= UTF8_MAXLEN;
2659 while (len-- > 0) {
2660 UV auv;
2661 fromstr = NEXTFROM;
2662 auv = SvUV_no_inf(fromstr, datumtype);
2663 if (utf8) {
2664 U8 buffer[UTF8_MAXLEN+1], *endb;
2665 endb = uvchr_to_utf8_flags(buffer, UNI_TO_NATIVE(auv), 0);
2666 if (cur+(endb-buffer)*UTF8_EXPAND >= end) {
2667 *cur = '\0';
2668 SvCUR_set(cat, cur - start);
2669 GROWING(0, cat, start, cur,
2670 len+(endb-buffer)*UTF8_EXPAND);
2671 end = start+SvLEN(cat);
2672 }
2673 cur = my_bytes_to_utf8(buffer, endb-buffer, cur, 0);
2674 } else {
2675 if (cur >= end) {
2676 *cur = '\0';
2677 SvCUR_set(cat, cur - start);
2678 GROWING(0, cat, start, cur, len+UTF8_MAXLEN);
2679 end = start+SvLEN(cat)-UTF8_MAXLEN;
2680 }
2681 cur = (char *) uvchr_to_utf8_flags((U8 *) cur,
2682 UNI_TO_NATIVE(auv),
2683 0);
2684 }
2685 }
2686 break;
2687 }
2688 /* Float and double added by gnb@melba.bby.oz.au 22/11/89 */
2689 case 'f':
2690 while (len-- > 0) {
2691 float afloat;
2692 NV anv;
2693 fromstr = NEXTFROM;
2694 anv = SvNV(fromstr);
2695# if (defined(VMS) && !defined(_IEEE_FP)) || defined(DOUBLE_IS_VAX_FLOAT)
2696 /* IEEE fp overflow shenanigans are unavailable on VAX and optional
2697 * on Alpha; fake it if we don't have them.
2698 */
2699 if (anv > FLT_MAX)
2700 afloat = FLT_MAX;
2701 else if (anv < -FLT_MAX)
2702 afloat = -FLT_MAX;
2703 else afloat = (float)anv;
2704# else
2705# if defined(NAN_COMPARE_BROKEN) && defined(Perl_isnan)
2706 if(Perl_isnan(anv))
2707 afloat = (float)NV_NAN;
2708 else
2709# endif
2710# ifdef NV_INF
2711 /* a simple cast to float is undefined if outside
2712 * the range of values that can be represented */
2713 afloat = (float)(anv > FLT_MAX ? NV_INF :
2714 anv < -FLT_MAX ? -NV_INF : anv);
2715# endif
2716# endif
2717 PUSH_VAR(utf8, cur, afloat, needs_swap);
2718 }
2719 break;
2720 case 'd':
2721 while (len-- > 0) {
2722 double adouble;
2723 NV anv;
2724 fromstr = NEXTFROM;
2725 anv = SvNV(fromstr);
2726# if (defined(VMS) && !defined(_IEEE_FP)) || defined(DOUBLE_IS_VAX_FLOAT)
2727 /* IEEE fp overflow shenanigans are unavailable on VAX and optional
2728 * on Alpha; fake it if we don't have them.
2729 */
2730 if (anv > DBL_MAX)
2731 adouble = DBL_MAX;
2732 else if (anv < -DBL_MAX)
2733 adouble = -DBL_MAX;
2734 else adouble = (double)anv;
2735# else
2736 adouble = (double)anv;
2737# endif
2738 PUSH_VAR(utf8, cur, adouble, needs_swap);
2739 }
2740 break;
2741 case 'F': {
2742 NV_bytes anv;
2743 Zero(&anv, 1, NV); /* can be long double with unused bits */
2744 while (len-- > 0) {
2745 fromstr = NEXTFROM;
2746#ifdef __GNUC__
2747 /* to work round a gcc/x86 bug; don't use SvNV */
2748 anv.nv = sv_2nv(fromstr);
2749# if defined(LONGDOUBLE_X86_80_BIT) && defined(USE_LONG_DOUBLE) \
2750 && LONG_DOUBLESIZE > 10
2751 /* GCC sometimes overwrites the padding in the
2752 assignment above */
2753 Zero(anv.bytes+10, sizeof(anv.bytes) - 10, U8);
2754# endif
2755#else
2756 anv.nv = SvNV(fromstr);
2757#endif
2758 PUSH_BYTES(utf8, cur, anv.bytes, sizeof(anv.bytes), needs_swap);
2759 }
2760 break;
2761 }
2762#if defined(HAS_LONG_DOUBLE) && defined(USE_LONG_DOUBLE)
2763 case 'D': {
2764 ld_bytes aldouble;
2765 /* long doubles can have unused bits, which may be nonzero */
2766 Zero(&aldouble, 1, long double);
2767 while (len-- > 0) {
2768 fromstr = NEXTFROM;
2769# ifdef __GNUC__
2770 /* to work round a gcc/x86 bug; don't use SvNV */
2771 aldouble.ld = (long double)sv_2nv(fromstr);
2772# if defined(LONGDOUBLE_X86_80_BIT) && LONG_DOUBLESIZE > 10
2773 /* GCC sometimes overwrites the padding in the
2774 assignment above */
2775 Zero(aldouble.bytes+10, sizeof(aldouble.bytes) - 10, U8);
2776# endif
2777# else
2778 aldouble.ld = (long double)SvNV(fromstr);
2779# endif
2780 PUSH_BYTES(utf8, cur, aldouble.bytes, sizeof(aldouble.bytes),
2781 needs_swap);
2782 }
2783 break;
2784 }
2785#endif
2786 case 'n' | TYPE_IS_SHRIEKING:
2787 case 'n':
2788 while (len-- > 0) {
2789 I16 ai16;
2790 fromstr = NEXTFROM;
2791 ai16 = (I16)SvIV_no_inf(fromstr, datumtype);
2792 ai16 = PerlSock_htons(ai16);
2793 PUSH16(utf8, cur, &ai16, FALSE);
2794 }
2795 break;
2796 case 'v' | TYPE_IS_SHRIEKING:
2797 case 'v':
2798 while (len-- > 0) {
2799 I16 ai16;
2800 fromstr = NEXTFROM;
2801 ai16 = (I16)SvIV_no_inf(fromstr, datumtype);
2802 ai16 = htovs(ai16);
2803 PUSH16(utf8, cur, &ai16, FALSE);
2804 }
2805 break;
2806 case 'S' | TYPE_IS_SHRIEKING:
2807#if SHORTSIZE != SIZE16
2808 while (len-- > 0) {
2809 unsigned short aushort;
2810 fromstr = NEXTFROM;
2811 aushort = SvUV_no_inf(fromstr, datumtype);
2812 PUSH_VAR(utf8, cur, aushort, needs_swap);
2813 }
2814 break;
2815#else
2816 /* FALLTHROUGH */
2817#endif
2818 case 'S':
2819 while (len-- > 0) {
2820 U16 au16;
2821 fromstr = NEXTFROM;
2822 au16 = (U16)SvUV_no_inf(fromstr, datumtype);
2823 PUSH16(utf8, cur, &au16, needs_swap);
2824 }
2825 break;
2826 case 's' | TYPE_IS_SHRIEKING:
2827#if SHORTSIZE != SIZE16
2828 while (len-- > 0) {
2829 short ashort;
2830 fromstr = NEXTFROM;
2831 ashort = SvIV_no_inf(fromstr, datumtype);
2832 PUSH_VAR(utf8, cur, ashort, needs_swap);
2833 }
2834 break;
2835#else
2836 /* FALLTHROUGH */
2837#endif
2838 case 's':
2839 while (len-- > 0) {
2840 I16 ai16;
2841 fromstr = NEXTFROM;
2842 ai16 = (I16)SvIV_no_inf(fromstr, datumtype);
2843 PUSH16(utf8, cur, &ai16, needs_swap);
2844 }
2845 break;
2846 case 'I':
2847 case 'I' | TYPE_IS_SHRIEKING:
2848 while (len-- > 0) {
2849 unsigned int auint;
2850 fromstr = NEXTFROM;
2851 auint = SvUV_no_inf(fromstr, datumtype);
2852 PUSH_VAR(utf8, cur, auint, needs_swap);
2853 }
2854 break;
2855 case 'j':
2856 while (len-- > 0) {
2857 IV aiv;
2858 fromstr = NEXTFROM;
2859 aiv = SvIV_no_inf(fromstr, datumtype);
2860 PUSH_VAR(utf8, cur, aiv, needs_swap);
2861 }
2862 break;
2863 case 'J':
2864 while (len-- > 0) {
2865 UV auv;
2866 fromstr = NEXTFROM;
2867 auv = SvUV_no_inf(fromstr, datumtype);
2868 PUSH_VAR(utf8, cur, auv, needs_swap);
2869 }
2870 break;
2871 case 'w':
2872 while (len-- > 0) {
2873 NV anv;
2874 fromstr = NEXTFROM;
2875 S_sv_check_infnan(aTHX_ fromstr, datumtype);
2876 anv = SvNV_nomg(fromstr);
2877
2878 if (anv < 0) {
2879 *cur = '\0';
2880 SvCUR_set(cat, cur - start);
2881 Perl_croak(aTHX_ "Cannot compress negative numbers in pack");
2882 }
2883
2884 /* 0xFFFFFFFFFFFFFFFF may cast to 18446744073709551616.0,
2885 which is == UV_MAX_P1. IOK is fine (instead of UV_only), as
2886 any negative IVs will have already been got by the croak()
2887 above. IOK is untrue for fractions, so we test them
2888 against UV_MAX_P1. */
2889 if (SvIOK(fromstr) || anv < UV_MAX_P1) {
2890 char buf[(sizeof(UV)*CHAR_BIT)/7+1];
2891 char *in = buf + sizeof(buf);
2892 UV auv = SvUV_nomg(fromstr);
2893
2894 do {
2895 *--in = (char)((auv & 0x7f) | 0x80);
2896 auv >>= 7;
2897 } while (auv);
2898 buf[sizeof(buf) - 1] &= 0x7f; /* clear continue bit */
2899 PUSH_GROWING_BYTES(utf8, cat, start, cur,
2900 in, (buf + sizeof(buf)) - in);
2901 } else if (SvPOKp(fromstr))
2902 goto w_string;
2903 else if (SvNOKp(fromstr)) {
2904 /* 10**NV_MAX_10_EXP is the largest power of 10
2905 so 10**(NV_MAX_10_EXP+1) is definitely unrepresentable
2906 given 10**(NV_MAX_10_EXP+1) == 128 ** x solve for x:
2907 x = (NV_MAX_10_EXP+1) * log (10) / log (128)
2908 And with that many bytes only Inf can overflow.
2909 Some C compilers are strict about integral constant
2910 expressions so we conservatively divide by a slightly
2911 smaller integer instead of multiplying by the exact
2912 floating-point value.
2913 */
2914#ifdef NV_MAX_10_EXP
2915 /* char buf[1 + (int)((NV_MAX_10_EXP + 1) * 0.47456)]; -- invalid C */
2916 char buf[1 + (int)((NV_MAX_10_EXP + 1) / 2)]; /* valid C */
2917#else
2918 /* char buf[1 + (int)((308 + 1) * 0.47456)]; -- invalid C */
2919 char buf[1 + (int)((308 + 1) / 2)]; /* valid C */
2920#endif
2921 char *in = buf + sizeof(buf);
2922
2923 anv = Perl_floor(anv);
2924 do {
2925 const NV next = Perl_floor(anv / 128);
2926 if (in <= buf) /* this cannot happen ;-) */
2927 Perl_croak(aTHX_ "Cannot compress integer in pack");
2928 *--in = (unsigned char)(anv - (next * 128)) | 0x80;
2929 anv = next;
2930 } while (anv > 0);
2931 buf[sizeof(buf) - 1] &= 0x7f; /* clear continue bit */
2932 PUSH_GROWING_BYTES(utf8, cat, start, cur,
2933 in, (buf + sizeof(buf)) - in);
2934 } else {
2935 const char *from;
2936 char *result, *in;
2937 SV *norm;
2938 STRLEN len;
2939 bool done;
2940
2941 w_string:
2942 /* Copy string and check for compliance */
2943 from = SvPV_nomg_const(fromstr, len);
2944 if ((norm = is_an_int(from, len)) == NULL)
2945 Perl_croak(aTHX_ "Can only compress unsigned integers in pack");
2946
2947 Newx(result, len, char);
2948 in = result + len;
2949 done = FALSE;
2950 while (!done) *--in = div128(norm, &done) | 0x80;
2951 result[len - 1] &= 0x7F; /* clear continue bit */
2952 PUSH_GROWING_BYTES(utf8, cat, start, cur,
2953 in, (result + len) - in);
2954 Safefree(result);
2955 SvREFCNT_dec(norm); /* free norm */
2956 }
2957 }
2958 break;
2959 case 'i':
2960 case 'i' | TYPE_IS_SHRIEKING:
2961 while (len-- > 0) {
2962 int aint;
2963 fromstr = NEXTFROM;
2964 aint = SvIV_no_inf(fromstr, datumtype);
2965 PUSH_VAR(utf8, cur, aint, needs_swap);
2966 }
2967 break;
2968 case 'N' | TYPE_IS_SHRIEKING:
2969 case 'N':
2970 while (len-- > 0) {
2971 U32 au32;
2972 fromstr = NEXTFROM;
2973 au32 = SvUV_no_inf(fromstr, datumtype);
2974 au32 = PerlSock_htonl(au32);
2975 PUSH32(utf8, cur, &au32, FALSE);
2976 }
2977 break;
2978 case 'V' | TYPE_IS_SHRIEKING:
2979 case 'V':
2980 while (len-- > 0) {
2981 U32 au32;
2982 fromstr = NEXTFROM;
2983 au32 = SvUV_no_inf(fromstr, datumtype);
2984 au32 = htovl(au32);
2985 PUSH32(utf8, cur, &au32, FALSE);
2986 }
2987 break;
2988 case 'L' | TYPE_IS_SHRIEKING:
2989#if LONGSIZE != SIZE32
2990 while (len-- > 0) {
2991 unsigned long aulong;
2992 fromstr = NEXTFROM;
2993 aulong = SvUV_no_inf(fromstr, datumtype);
2994 PUSH_VAR(utf8, cur, aulong, needs_swap);
2995 }
2996 break;
2997#else
2998 /* Fall though! */
2999#endif
3000 case 'L':
3001 while (len-- > 0) {
3002 U32 au32;
3003 fromstr = NEXTFROM;
3004 au32 = SvUV_no_inf(fromstr, datumtype);
3005 PUSH32(utf8, cur, &au32, needs_swap);
3006 }
3007 break;
3008 case 'l' | TYPE_IS_SHRIEKING:
3009#if LONGSIZE != SIZE32
3010 while (len-- > 0) {
3011 long along;
3012 fromstr = NEXTFROM;
3013 along = SvIV_no_inf(fromstr, datumtype);
3014 PUSH_VAR(utf8, cur, along, needs_swap);
3015 }
3016 break;
3017#else
3018 /* Fall though! */
3019#endif
3020 case 'l':
3021 while (len-- > 0) {
3022 I32 ai32;
3023 fromstr = NEXTFROM;
3024 ai32 = SvIV_no_inf(fromstr, datumtype);
3025 PUSH32(utf8, cur, &ai32, needs_swap);
3026 }
3027 break;
3028#if defined(HAS_QUAD) && IVSIZE >= 8
3029 case 'Q':
3030 while (len-- > 0) {
3031 Uquad_t auquad;
3032 fromstr = NEXTFROM;
3033 auquad = (Uquad_t) SvUV_no_inf(fromstr, datumtype);
3034 PUSH_VAR(utf8, cur, auquad, needs_swap);
3035 }
3036 break;
3037 case 'q':
3038 while (len-- > 0) {
3039 Quad_t aquad;
3040 fromstr = NEXTFROM;
3041 aquad = (Quad_t)SvIV_no_inf(fromstr, datumtype);
3042 PUSH_VAR(utf8, cur, aquad, needs_swap);
3043 }
3044 break;
3045#endif
3046 case 'P':
3047 len = 1; /* assume SV is correct length */
3048 GROWING(utf8, cat, start, cur, sizeof(char *));
3049 /* FALLTHROUGH */
3050 case 'p':
3051 while (len-- > 0) {
3052 const char *aptr;
3053
3054 fromstr = NEXTFROM;
3055 SvGETMAGIC(fromstr);
3056 if (!SvOK(fromstr)) aptr = NULL;
3057 else {
3058 /* XXX better yet, could spirit away the string to
3059 * a safe spot and hang on to it until the result
3060 * of pack() (and all copies of the result) are
3061 * gone.
3062 */
3063 if (((SvTEMP(fromstr) && SvREFCNT(fromstr) == 1)
3064 || (SvPADTMP(fromstr) &&
3065 !SvREADONLY(fromstr)))) {
3066 Perl_ck_warner(aTHX_ packWARN(WARN_PACK),
3067 "Attempt to pack pointer to temporary value");
3068 }
3069 if (SvPOK(fromstr) || SvNIOK(fromstr))
3070 aptr = SvPV_nomg_const_nolen(fromstr);
3071 else
3072 aptr = SvPV_force_flags_nolen(fromstr, 0);
3073 }
3074 PUSH_VAR(utf8, cur, aptr, needs_swap);
3075 }
3076 break;
3077 case 'u': {
3078 const char *aptr, *aend;
3079 bool from_utf8;
3080
3081 fromstr = NEXTFROM;
3082 if (len <= 2) len = 45;
3083 else len = len / 3 * 3;
3084 if (len >= 64) {
3085 Perl_ck_warner(aTHX_ packWARN(WARN_PACK),
3086 "Field too wide in 'u' format in pack");
3087 len = 63;
3088 }
3089 aptr = SvPV_const(fromstr, fromlen);
3090 from_utf8 = DO_UTF8(fromstr);
3091 if (from_utf8) {
3092 aend = aptr + fromlen;
3093 fromlen = sv_len_utf8_nomg(fromstr);
3094 } else aend = NULL; /* Unused, but keep compilers happy */
3095 GROWING(utf8, cat, start, cur, (fromlen+2) / 3 * 4 + (fromlen+len-1)/len * 2);
3096 while (fromlen > 0) {
3097 U8 *end;
3098 SSize_t todo;
3099 U8 hunk[1+63/3*4+1];
3100
3101 if ((SSize_t)fromlen > len)
3102 todo = len;
3103 else
3104 todo = fromlen;
3105 if (from_utf8) {
3106 char buffer[64];
3107 if (!S_utf8_to_bytes(aTHX_ &aptr, aend, buffer, todo,
3108 'u' | TYPE_IS_PACK)) {
3109 *cur = '\0';
3110 SvCUR_set(cat, cur - start);
3111 Perl_croak(aTHX_ "panic: string is shorter than advertised, "
3112 "aptr=%p, aend=%p, buffer=%p, todo=%zd",
3113 aptr, aend, buffer, todo);
3114 }
3115 end = doencodes(hunk, (const U8 *)buffer, todo);
3116 } else {
3117 end = doencodes(hunk, (const U8 *)aptr, todo);
3118 aptr += todo;
3119 }
3120 PUSH_BYTES(utf8, cur, hunk, end-hunk, 0);
3121 fromlen -= todo;
3122 }
3123 break;
3124 }
3125 }
3126 *cur = '\0';
3127 SvCUR_set(cat, cur - start);
3128 no_change:
3129 *symptr = lookahead;
3130 }
3131 return beglist;
3132}
3133#undef NEXTFROM
3134
3135
3136PP(pp_pack)
3137{
3138 dSP; dMARK; dORIGMARK; dTARGET;
3139 SV *cat = TARG;
3140 STRLEN fromlen;
3141 SV *pat_sv = *++MARK;
3142 const char *pat = SvPV_const(pat_sv, fromlen);
3143 const char *patend = pat + fromlen;
3144
3145 MARK++;
3146 SvPVCLEAR(cat);
3147 SvUTF8_off(cat);
3148
3149 packlist(cat, pat, patend, MARK, SP + 1);
3150
3151 if (SvUTF8(cat)) {
3152 STRLEN result_len;
3153 const char * result = SvPV_nomg(cat, result_len);
3154 const U8 * error_pos;
3155
3156 if (! is_utf8_string_loc((U8 *) result, result_len, &error_pos)) {
3157 _force_out_malformed_utf8_message(error_pos,
3158 (U8 *) result + result_len,
3159 0, /* no flags */
3160 1 /* Die */
3161 );
3162 NOT_REACHED; /* NOTREACHED */
3163 }
3164 }
3165
3166 SvSETMAGIC(cat);
3167 SP = ORIGMARK;
3168 PUSHs(cat);
3169 RETURN;
3170}
3171
3172/*
3173 * ex: set ts=8 sts=4 sw=4 et:
3174 */