This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
(perl #131844) fix various space calculation issues in pp_pack.c
[perl5.git] / pp_pack.c
... / ...
CommitLineData
1/* pp_pack.c
2 *
3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
4 * 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 by Larry Wall and others
5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11/*
12 * He still hopefully carried some of his gear in his pack: a small tinder-box,
13 * two small shallow pans, the smaller fitting into the larger; inside them a
14 * wooden spoon, a short two-pronged fork and some skewers were stowed; and
15 * hidden at the bottom of the pack in a flat wooden box a dwindling treasure,
16 * some salt.
17 *
18 * [p.653 of _The Lord of the Rings_, IV/iv: "Of Herbs and Stewed Rabbit"]
19 */
20
21/* This file contains pp ("push/pop") functions that
22 * execute the opcodes that make up a perl program. A typical pp function
23 * expects to find its arguments on the stack, and usually pushes its
24 * results onto the stack, hence the 'pp' terminology. Each OP structure
25 * contains a pointer to the relevant pp_foo() function.
26 *
27 * This particular file just contains pp_pack() and pp_unpack(). See the
28 * other pp*.c files for the rest of the pp_ functions.
29 */
30
31#include "EXTERN.h"
32#define PERL_IN_PP_PACK_C
33#include "perl.h"
34
35/* Types used by pack/unpack */
36typedef enum {
37 e_no_len, /* no length */
38 e_number, /* number, [] */
39 e_star /* asterisk */
40} howlen_t;
41
42typedef struct tempsym {
43 const char* patptr; /* current template char */
44 const char* patend; /* one after last char */
45 const char* grpbeg; /* 1st char of ()-group */
46 const char* grpend; /* end of ()-group */
47 I32 code; /* template code (!<>) */
48 U32 flags; /* /=4, comma=2, pack=1 */
49 /* and group modifiers */
50 SSize_t length; /* length/repeat count */
51 howlen_t howlen; /* how length is given */
52 int level; /* () nesting level */
53 STRLEN strbeg; /* offset of group start */
54 struct tempsym *previous; /* previous group */
55} tempsym_t;
56
57#define TEMPSYM_INIT(symptr, p, e, f) \
58 STMT_START { \
59 (symptr)->patptr = (p); \
60 (symptr)->patend = (e); \
61 (symptr)->grpbeg = NULL; \
62 (symptr)->grpend = NULL; \
63 (symptr)->grpend = NULL; \
64 (symptr)->code = 0; \
65 (symptr)->length = 0; \
66 (symptr)->howlen = e_no_len; \
67 (symptr)->level = 0; \
68 (symptr)->flags = (f); \
69 (symptr)->strbeg = 0; \
70 (symptr)->previous = NULL; \
71 } STMT_END
72
73typedef union {
74 NV nv;
75 U8 bytes[sizeof(NV)];
76} NV_bytes;
77
78#if defined(HAS_LONG_DOUBLE) && defined(USE_LONG_DOUBLE)
79typedef union {
80 long double ld;
81 U8 bytes[sizeof(long double)];
82} ld_bytes;
83#endif
84
85#ifndef CHAR_BIT
86# define CHAR_BIT 8
87#endif
88/* Maximum number of bytes to which a byte can grow due to upgrade */
89#define UTF8_EXPAND 2
90
91/*
92 * Offset for integer pack/unpack.
93 *
94 * On architectures where I16 and I32 aren't really 16 and 32 bits,
95 * which for now are all Crays, pack and unpack have to play games.
96 */
97
98/*
99 * These values are required for portability of pack() output.
100 * If they're not right on your machine, then pack() and unpack()
101 * wouldn't work right anyway; you'll need to apply the Cray hack.
102 * (I'd like to check them with #if, but you can't use sizeof() in
103 * the preprocessor.) --???
104 */
105/*
106 The appropriate SHORTSIZE, INTSIZE, LONGSIZE, and LONGLONGSIZE
107 defines are now in config.h. --Andy Dougherty April 1998
108 */
109#define SIZE16 2
110#define SIZE32 4
111
112/* CROSSCOMPILE and MULTIARCH are going to affect pp_pack() and pp_unpack().
113 --jhi Feb 1999 */
114
115#if U16SIZE <= SIZE16 && U32SIZE <= SIZE32
116# define OFF16(p) ((char *) (p))
117# define OFF32(p) ((char *) (p))
118#elif BYTEORDER == 0x1234 || BYTEORDER == 0x12345678 /* little-endian */
119# define OFF16(p) ((char*)(p))
120# define OFF32(p) ((char*)(p))
121#elif BYTEORDER == 0x4321 || BYTEORDER == 0x87654321 /* big-endian */
122# define OFF16(p) ((char*)(p) + (sizeof(U16) - SIZE16))
123# define OFF32(p) ((char*)(p) + (sizeof(U32) - SIZE32))
124#else
125# error "bad cray byte order"
126#endif
127
128#define PUSH16(utf8, cur, p, needs_swap) \
129 PUSH_BYTES(utf8, cur, OFF16(p), SIZE16, needs_swap)
130#define PUSH32(utf8, cur, p, needs_swap) \
131 PUSH_BYTES(utf8, cur, OFF32(p), SIZE32, needs_swap)
132
133#if BYTEORDER == 0x4321 || BYTEORDER == 0x87654321 /* big-endian */
134# define NEEDS_SWAP(d) (TYPE_ENDIANNESS(d) == TYPE_IS_LITTLE_ENDIAN)
135#elif BYTEORDER == 0x1234 || BYTEORDER == 0x12345678 /* little-endian */
136# define NEEDS_SWAP(d) (TYPE_ENDIANNESS(d) == TYPE_IS_BIG_ENDIAN)
137#else
138# error "Unsupported byteorder"
139 /* Need to add code here to re-instate mixed endian support.
140 NEEDS_SWAP would need to hold a flag indicating which action to
141 take, and S_reverse_copy and the code in S_utf8_to_bytes would need
142 logic adding to deal with any mixed-endian transformations needed.
143 */
144#endif
145
146/* Only to be used inside a loop (see the break) */
147#define SHIFT_BYTES(utf8, s, strend, buf, len, datumtype, needs_swap) \
148STMT_START { \
149 if (UNLIKELY(utf8)) { \
150 if (!S_utf8_to_bytes(aTHX_ &s, strend, \
151 (char *) (buf), len, datumtype)) break; \
152 } else { \
153 if (UNLIKELY(needs_swap)) \
154 S_reverse_copy(s, (char *) (buf), len); \
155 else \
156 Copy(s, (char *) (buf), len, char); \
157 s += len; \
158 } \
159} STMT_END
160
161#define SHIFT16(utf8, s, strend, p, datumtype, needs_swap) \
162 SHIFT_BYTES(utf8, s, strend, OFF16(p), SIZE16, datumtype, needs_swap)
163
164#define SHIFT32(utf8, s, strend, p, datumtype, needs_swap) \
165 SHIFT_BYTES(utf8, s, strend, OFF32(p), SIZE32, datumtype, needs_swap)
166
167#define SHIFT_VAR(utf8, s, strend, var, datumtype, needs_swap) \
168 SHIFT_BYTES(utf8, s, strend, &(var), sizeof(var), datumtype, needs_swap)
169
170#define PUSH_VAR(utf8, aptr, var, needs_swap) \
171 PUSH_BYTES(utf8, aptr, &(var), sizeof(var), needs_swap)
172
173/* Avoid stack overflow due to pathological templates. 100 should be plenty. */
174#define MAX_SUB_TEMPLATE_LEVEL 100
175
176/* flags (note that type modifiers can also be used as flags!) */
177#define FLAG_WAS_UTF8 0x40
178#define FLAG_PARSE_UTF8 0x20 /* Parse as utf8 */
179#define FLAG_UNPACK_ONLY_ONE 0x10
180#define FLAG_DO_UTF8 0x08 /* The underlying string is utf8 */
181#define FLAG_SLASH 0x04
182#define FLAG_COMMA 0x02
183#define FLAG_PACK 0x01
184
185STATIC SV *
186S_mul128(pTHX_ SV *sv, U8 m)
187{
188 STRLEN len;
189 char *s = SvPV(sv, len);
190 char *t;
191
192 PERL_ARGS_ASSERT_MUL128;
193
194 if (! memBEGINs(s, len, "0000")) { /* need to grow sv */
195 SV * const tmpNew = newSVpvs("0000000000");
196
197 sv_catsv(tmpNew, sv);
198 SvREFCNT_dec(sv); /* free old sv */
199 sv = tmpNew;
200 s = SvPV(sv, len);
201 }
202 t = s + len - 1;
203 while (!*t) /* trailing '\0'? */
204 t--;
205 while (t > s) {
206 const U32 i = ((*t - '0') << 7) + m;
207 *(t--) = '0' + (char)(i % 10);
208 m = (char)(i / 10);
209 }
210 return (sv);
211}
212
213/* Explosives and implosives. */
214
215#define ISUUCHAR(ch) (NATIVE_TO_LATIN1(ch) >= NATIVE_TO_LATIN1(' ') \
216 && NATIVE_TO_LATIN1(ch) < NATIVE_TO_LATIN1('a'))
217
218/* type modifiers */
219#define TYPE_IS_SHRIEKING 0x100
220#define TYPE_IS_BIG_ENDIAN 0x200
221#define TYPE_IS_LITTLE_ENDIAN 0x400
222#define TYPE_IS_PACK 0x800
223#define TYPE_ENDIANNESS_MASK (TYPE_IS_BIG_ENDIAN|TYPE_IS_LITTLE_ENDIAN)
224#define TYPE_MODIFIERS(t) ((t) & ~0xFF)
225#define TYPE_NO_MODIFIERS(t) ((t) & 0xFF)
226
227# define TYPE_ENDIANNESS(t) ((t) & TYPE_ENDIANNESS_MASK)
228# define TYPE_NO_ENDIANNESS(t) ((t) & ~TYPE_ENDIANNESS_MASK)
229
230# define ENDIANNESS_ALLOWED_TYPES "sSiIlLqQjJfFdDpP("
231
232#define PACK_SIZE_CANNOT_CSUM 0x80
233#define PACK_SIZE_UNPREDICTABLE 0x40 /* Not a fixed size element */
234#define PACK_SIZE_MASK 0x3F
235
236#include "packsizetables.inc"
237
238static void
239S_reverse_copy(const char *src, char *dest, STRLEN len)
240{
241 dest += len;
242 while (len--)
243 *--dest = *src++;
244}
245
246STATIC U8
247utf8_to_byte(pTHX_ const char **s, const char *end, I32 datumtype)
248{
249 STRLEN retlen;
250 UV val;
251
252 if (*s >= end) {
253 goto croak;
254 }
255 val = utf8n_to_uvchr((U8 *) *s, end-*s, &retlen,
256 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
257 if (retlen == (STRLEN) -1)
258 croak:
259 Perl_croak(aTHX_ "Malformed UTF-8 string in '%c' format in unpack",
260 (int) TYPE_NO_MODIFIERS(datumtype));
261 if (val >= 0x100) {
262 Perl_ck_warner(aTHX_ packWARN(WARN_UNPACK),
263 "Character in '%c' format wrapped in unpack",
264 (int) TYPE_NO_MODIFIERS(datumtype));
265 val &= 0xff;
266 }
267 *s += retlen;
268 return (U8)val;
269}
270
271#define SHIFT_BYTE(utf8, s, strend, datumtype) ((utf8) ? \
272 utf8_to_byte(aTHX_ &(s), (strend), (datumtype)) : \
273 *(U8 *)(s)++)
274
275STATIC bool
276S_utf8_to_bytes(pTHX_ const char **s, const char *end, const char *buf, SSize_t buf_len, I32 datumtype)
277{
278 UV val;
279 STRLEN retlen;
280 const char *from = *s;
281 int bad = 0;
282 const U32 flags = ckWARN(WARN_UTF8) ?
283 UTF8_CHECK_ONLY : (UTF8_CHECK_ONLY | UTF8_ALLOW_ANY);
284 const bool needs_swap = NEEDS_SWAP(datumtype);
285
286 if (UNLIKELY(needs_swap))
287 buf += buf_len;
288
289 for (;buf_len > 0; buf_len--) {
290 if (from >= end) return FALSE;
291 val = utf8n_to_uvchr((U8 *) from, end-from, &retlen, flags);
292 if (retlen == (STRLEN) -1) {
293 from += UTF8SKIP(from);
294 bad |= 1;
295 } else from += retlen;
296 if (val >= 0x100) {
297 bad |= 2;
298 val &= 0xff;
299 }
300 if (UNLIKELY(needs_swap))
301 *(U8 *)--buf = (U8)val;
302 else
303 *(U8 *)buf++ = (U8)val;
304 }
305 /* We have enough characters for the buffer. Did we have problems ? */
306 if (bad) {
307 if (bad & 1) {
308 /* Rewalk the string fragment while warning */
309 const char *ptr;
310 const U32 flags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
311 for (ptr = *s; ptr < from; ptr += UTF8SKIP(ptr)) {
312 if (ptr >= end) break;
313 utf8n_to_uvchr((U8 *) ptr, end-ptr, &retlen, flags);
314 }
315 if (from > end) from = end;
316 }
317 if ((bad & 2))
318 Perl_ck_warner(aTHX_ packWARN(datumtype & TYPE_IS_PACK ?
319 WARN_PACK : WARN_UNPACK),
320 "Character(s) in '%c' format wrapped in %s",
321 (int) TYPE_NO_MODIFIERS(datumtype),
322 datumtype & TYPE_IS_PACK ? "pack" : "unpack");
323 }
324 *s = from;
325 return TRUE;
326}
327
328STATIC char *
329S_my_bytes_to_utf8(const U8 *start, STRLEN len, char *dest, const bool needs_swap) {
330 PERL_ARGS_ASSERT_MY_BYTES_TO_UTF8;
331
332 if (UNLIKELY(needs_swap)) {
333 const U8 *p = start + len;
334 while (p-- > start) {
335 append_utf8_from_native_byte(*p, (U8 **) & dest);
336 }
337 } else {
338 const U8 * const end = start + len;
339 while (start < end) {
340 append_utf8_from_native_byte(*start, (U8 **) & dest);
341 start++;
342 }
343 }
344 return dest;
345}
346
347#define PUSH_BYTES(utf8, cur, buf, len, needs_swap) \
348STMT_START { \
349 if (UNLIKELY(utf8)) \
350 (cur) = my_bytes_to_utf8((U8 *) buf, len, (cur), needs_swap); \
351 else { \
352 if (UNLIKELY(needs_swap)) \
353 S_reverse_copy((char *)(buf), cur, len); \
354 else \
355 Copy(buf, cur, len, char); \
356 (cur) += (len); \
357 } \
358} STMT_END
359
360#define SAFE_UTF8_EXPAND(var) \
361STMT_START { \
362 if ((var) > SSize_t_MAX / UTF8_EXPAND) \
363 Perl_croak(aTHX_ "%s", "Out of memory during pack()"); \
364 (var) = (var) * UTF8_EXPAND; \
365} STMT_END
366
367#define GROWING2(utf8, cat, start, cur, item_size, item_count) \
368STMT_START { \
369 if (SSize_t_MAX / (item_size) < (item_count)) \
370 Perl_croak(aTHX_ "%s", "Out of memory during pack()"); \
371 GROWING((utf8), (cat), (start), (cur), (item_size) * (item_count)); \
372} STMT_END
373
374#define GROWING(utf8, cat, start, cur, in_len) \
375STMT_START { \
376 STRLEN glen = (in_len); \
377 STRLEN catcur = (STRLEN)((cur) - (start)); \
378 if (utf8) SAFE_UTF8_EXPAND(glen); \
379 if (SSize_t_MAX - glen < catcur) \
380 Perl_croak(aTHX_ "%s", "Out of memory during pack()"); \
381 if (catcur + glen >= SvLEN(cat)) { \
382 (start) = sv_exp_grow(cat, glen); \
383 (cur) = (start) + SvCUR(cat); \
384 } \
385} STMT_END
386
387#define PUSH_GROWING_BYTES(utf8, cat, start, cur, buf, in_len) \
388STMT_START { \
389 const STRLEN glen = (in_len); \
390 STRLEN gl = glen; \
391 if (utf8) SAFE_UTF8_EXPAND(gl); \
392 if ((cur) + gl >= (start) + SvLEN(cat)) { \
393 *cur = '\0'; \
394 SvCUR_set((cat), (cur) - (start)); \
395 (start) = sv_exp_grow(cat, gl); \
396 (cur) = (start) + SvCUR(cat); \
397 } \
398 PUSH_BYTES(utf8, cur, buf, glen, 0); \
399} STMT_END
400
401#define PUSH_BYTE(utf8, s, byte) \
402STMT_START { \
403 if (utf8) { \
404 const U8 au8 = (byte); \
405 (s) = my_bytes_to_utf8(&au8, 1, (s), 0);\
406 } else *(U8 *)(s)++ = (byte); \
407} STMT_END
408
409/* Only to be used inside a loop (see the break) */
410#define NEXT_UNI_VAL(val, cur, str, end, utf8_flags) \
411STMT_START { \
412 STRLEN retlen; \
413 if (str >= end) break; \
414 val = utf8n_to_uvchr((U8 *) str, end-str, &retlen, utf8_flags); \
415 if (retlen == (STRLEN) -1) { \
416 *cur = '\0'; \
417 Perl_croak(aTHX_ "Malformed UTF-8 string in pack"); \
418 } \
419 str += retlen; \
420} STMT_END
421
422static const char *_action( const tempsym_t* symptr )
423{
424 return (const char *)(( symptr->flags & FLAG_PACK ) ? "pack" : "unpack");
425}
426
427/* Returns the sizeof() struct described by pat */
428STATIC SSize_t
429S_measure_struct(pTHX_ tempsym_t* symptr)
430{
431 SSize_t total = 0;
432
433 PERL_ARGS_ASSERT_MEASURE_STRUCT;
434
435 while (next_symbol(symptr)) {
436 SSize_t len, size;
437
438 switch (symptr->howlen) {
439 case e_star:
440 Perl_croak(aTHX_ "Within []-length '*' not allowed in %s",
441 _action( symptr ) );
442
443 default:
444 /* e_no_len and e_number */
445 len = symptr->length;
446 break;
447 }
448
449 size = packprops[TYPE_NO_ENDIANNESS(symptr->code)] & PACK_SIZE_MASK;
450 if (!size) {
451 SSize_t star;
452 /* endianness doesn't influence the size of a type */
453 switch(TYPE_NO_ENDIANNESS(symptr->code)) {
454 default:
455 Perl_croak(aTHX_ "Invalid type '%c' in %s",
456 (int)TYPE_NO_MODIFIERS(symptr->code),
457 _action( symptr ) );
458 case '.' | TYPE_IS_SHRIEKING:
459 case '@' | TYPE_IS_SHRIEKING:
460 case '@':
461 case '.':
462 case '/':
463 case 'U': /* XXXX Is it correct? */
464 case 'w':
465 case 'u':
466 Perl_croak(aTHX_ "Within []-length '%c' not allowed in %s",
467 (int) TYPE_NO_MODIFIERS(symptr->code),
468 _action( symptr ) );
469 case '%':
470 size = 0;
471 break;
472 case '(':
473 {
474 tempsym_t savsym = *symptr;
475 symptr->patptr = savsym.grpbeg;
476 symptr->patend = savsym.grpend;
477 /* XXXX Theoretically, we need to measure many times at
478 different positions, since the subexpression may contain
479 alignment commands, but be not of aligned length.
480 Need to detect this and croak(). */
481 size = measure_struct(symptr);
482 *symptr = savsym;
483 break;
484 }
485 case 'X' | TYPE_IS_SHRIEKING:
486 /* XXXX Is this useful? Then need to treat MEASURE_BACKWARDS.
487 */
488 if (!len) /* Avoid division by 0 */
489 len = 1;
490 len = total % len; /* Assumed: the start is aligned. */
491 /* FALLTHROUGH */
492 case 'X':
493 size = -1;
494 if (total < len)
495 Perl_croak(aTHX_ "'X' outside of string in %s", _action( symptr ) );
496 break;
497 case 'x' | TYPE_IS_SHRIEKING:
498 if (!len) /* Avoid division by 0 */
499 len = 1;
500 star = total % len; /* Assumed: the start is aligned. */
501 if (star) /* Other portable ways? */
502 len = len - star;
503 else
504 len = 0;
505 /* FALLTHROUGH */
506 case 'x':
507 case 'A':
508 case 'Z':
509 case 'a':
510 size = 1;
511 break;
512 case 'B':
513 case 'b':
514 len = (len + 7)/8;
515 size = 1;
516 break;
517 case 'H':
518 case 'h':
519 len = (len + 1)/2;
520 size = 1;
521 break;
522
523 case 'P':
524 len = 1;
525 size = sizeof(char*);
526 break;
527 }
528 }
529 total += len * size;
530 }
531 return total;
532}
533
534
535/* locate matching closing parenthesis or bracket
536 * returns char pointer to char after match, or NULL
537 */
538STATIC const char *
539S_group_end(pTHX_ const char *patptr, const char *patend, char ender)
540{
541 PERL_ARGS_ASSERT_GROUP_END;
542
543 while (patptr < patend) {
544 const char c = *patptr++;
545
546 if (isSPACE(c))
547 continue;
548 else if (c == ender)
549 return patptr-1;
550 else if (c == '#') {
551 while (patptr < patend && *patptr != '\n')
552 patptr++;
553 continue;
554 } else if (c == '(')
555 patptr = group_end(patptr, patend, ')') + 1;
556 else if (c == '[')
557 patptr = group_end(patptr, patend, ']') + 1;
558 }
559 Perl_croak(aTHX_ "No group ending character '%c' found in template",
560 ender);
561 NOT_REACHED; /* NOTREACHED */
562}
563
564
565/* Convert unsigned decimal number to binary.
566 * Expects a pointer to the first digit and address of length variable
567 * Advances char pointer to 1st non-digit char and returns number
568 */
569STATIC const char *
570S_get_num(pTHX_ const char *patptr, SSize_t *lenptr )
571{
572 SSize_t len = *patptr++ - '0';
573
574 PERL_ARGS_ASSERT_GET_NUM;
575
576 while (isDIGIT(*patptr)) {
577 SSize_t nlen = (len * 10) + (*patptr++ - '0');
578 if (nlen < 0 || nlen/10 != len)
579 Perl_croak(aTHX_ "pack/unpack repeat count overflow");
580 len = nlen;
581 }
582 *lenptr = len;
583 return patptr;
584}
585
586/* The marvellous template parsing routine: Using state stored in *symptr,
587 * locates next template code and count
588 */
589STATIC bool
590S_next_symbol(pTHX_ tempsym_t* symptr )
591{
592 const char* patptr = symptr->patptr;
593 const char* const patend = symptr->patend;
594
595 PERL_ARGS_ASSERT_NEXT_SYMBOL;
596
597 symptr->flags &= ~FLAG_SLASH;
598
599 while (patptr < patend) {
600 if (isSPACE(*patptr))
601 patptr++;
602 else if (*patptr == '#') {
603 patptr++;
604 while (patptr < patend && *patptr != '\n')
605 patptr++;
606 if (patptr < patend)
607 patptr++;
608 } else {
609 /* We should have found a template code */
610 I32 code = *patptr++ & 0xFF;
611 U32 inherited_modifiers = 0;
612
613 if (code == ','){ /* grandfather in commas but with a warning */
614 if (((symptr->flags & FLAG_COMMA) == 0) && ckWARN(WARN_UNPACK)){
615 symptr->flags |= FLAG_COMMA;
616 Perl_warner(aTHX_ packWARN(WARN_UNPACK),
617 "Invalid type ',' in %s", _action( symptr ) );
618 }
619 continue;
620 }
621
622 /* for '(', skip to ')' */
623 if (code == '(') {
624 if( isDIGIT(*patptr) || *patptr == '*' || *patptr == '[' )
625 Perl_croak(aTHX_ "()-group starts with a count in %s",
626 _action( symptr ) );
627 symptr->grpbeg = patptr;
628 patptr = 1 + ( symptr->grpend = group_end(patptr, patend, ')') );
629 if( symptr->level >= MAX_SUB_TEMPLATE_LEVEL )
630 Perl_croak(aTHX_ "Too deeply nested ()-groups in %s",
631 _action( symptr ) );
632 }
633
634 /* look for group modifiers to inherit */
635 if (TYPE_ENDIANNESS(symptr->flags)) {
636 if (strchr(ENDIANNESS_ALLOWED_TYPES, TYPE_NO_MODIFIERS(code)))
637 inherited_modifiers |= TYPE_ENDIANNESS(symptr->flags);
638 }
639
640 /* look for modifiers */
641 while (patptr < patend) {
642 const char *allowed;
643 I32 modifier;
644 switch (*patptr) {
645 case '!':
646 modifier = TYPE_IS_SHRIEKING;
647 allowed = "sSiIlLxXnNvV@.";
648 break;
649 case '>':
650 modifier = TYPE_IS_BIG_ENDIAN;
651 allowed = ENDIANNESS_ALLOWED_TYPES;
652 break;
653 case '<':
654 modifier = TYPE_IS_LITTLE_ENDIAN;
655 allowed = ENDIANNESS_ALLOWED_TYPES;
656 break;
657 default:
658 allowed = "";
659 modifier = 0;
660 break;
661 }
662
663 if (modifier == 0)
664 break;
665
666 if (!strchr(allowed, TYPE_NO_MODIFIERS(code)))
667 Perl_croak(aTHX_ "'%c' allowed only after types %s in %s", *patptr,
668 allowed, _action( symptr ) );
669
670 if (TYPE_ENDIANNESS(code | modifier) == TYPE_ENDIANNESS_MASK)
671 Perl_croak(aTHX_ "Can't use both '<' and '>' after type '%c' in %s",
672 (int) TYPE_NO_MODIFIERS(code), _action( symptr ) );
673 else if (TYPE_ENDIANNESS(code | modifier | inherited_modifiers) ==
674 TYPE_ENDIANNESS_MASK)
675 Perl_croak(aTHX_ "Can't use '%c' in a group with different byte-order in %s",
676 *patptr, _action( symptr ) );
677
678 if ((code & modifier)) {
679 Perl_ck_warner(aTHX_ packWARN(WARN_UNPACK),
680 "Duplicate modifier '%c' after '%c' in %s",
681 *patptr, (int) TYPE_NO_MODIFIERS(code),
682 _action( symptr ) );
683 }
684
685 code |= modifier;
686 patptr++;
687 }
688
689 /* inherit modifiers */
690 code |= inherited_modifiers;
691
692 /* look for count and/or / */
693 if (patptr < patend) {
694 if (isDIGIT(*patptr)) {
695 patptr = get_num( patptr, &symptr->length );
696 symptr->howlen = e_number;
697
698 } else if (*patptr == '*') {
699 patptr++;
700 symptr->howlen = e_star;
701
702 } else if (*patptr == '[') {
703 const char* lenptr = ++patptr;
704 symptr->howlen = e_number;
705 patptr = group_end( patptr, patend, ']' ) + 1;
706 /* what kind of [] is it? */
707 if (isDIGIT(*lenptr)) {
708 lenptr = get_num( lenptr, &symptr->length );
709 if( *lenptr != ']' )
710 Perl_croak(aTHX_ "Malformed integer in [] in %s",
711 _action( symptr ) );
712 } else {
713 tempsym_t savsym = *symptr;
714 symptr->patend = patptr-1;
715 symptr->patptr = lenptr;
716 savsym.length = measure_struct(symptr);
717 *symptr = savsym;
718 }
719 } else {
720 symptr->howlen = e_no_len;
721 symptr->length = 1;
722 }
723
724 /* try to find / */
725 while (patptr < patend) {
726 if (isSPACE(*patptr))
727 patptr++;
728 else if (*patptr == '#') {
729 patptr++;
730 while (patptr < patend && *patptr != '\n')
731 patptr++;
732 if (patptr < patend)
733 patptr++;
734 } else {
735 if (*patptr == '/') {
736 symptr->flags |= FLAG_SLASH;
737 patptr++;
738 if (patptr < patend &&
739 (isDIGIT(*patptr) || *patptr == '*' || *patptr == '['))
740 Perl_croak(aTHX_ "'/' does not take a repeat count in %s",
741 _action( symptr ) );
742 }
743 break;
744 }
745 }
746 } else {
747 /* at end - no count, no / */
748 symptr->howlen = e_no_len;
749 symptr->length = 1;
750 }
751
752 symptr->code = code;
753 symptr->patptr = patptr;
754 return TRUE;
755 }
756 }
757 symptr->patptr = patptr;
758 return FALSE;
759}
760
761/*
762 There is no way to cleanly handle the case where we should process the
763 string per byte in its upgraded form while it's really in downgraded form
764 (e.g. estimates like strend-s as an upper bound for the number of
765 characters left wouldn't work). So if we foresee the need of this
766 (pattern starts with U or contains U0), we want to work on the encoded
767 version of the string. Users are advised to upgrade their pack string
768 themselves if they need to do a lot of unpacks like this on it
769*/
770STATIC bool
771need_utf8(const char *pat, const char *patend)
772{
773 bool first = TRUE;
774
775 PERL_ARGS_ASSERT_NEED_UTF8;
776
777 while (pat < patend) {
778 if (pat[0] == '#') {
779 pat++;
780 pat = (const char *) memchr(pat, '\n', patend-pat);
781 if (!pat) return FALSE;
782 } else if (pat[0] == 'U') {
783 if (first || pat[1] == '0') return TRUE;
784 } else first = FALSE;
785 pat++;
786 }
787 return FALSE;
788}
789
790STATIC char
791first_symbol(const char *pat, const char *patend) {
792 PERL_ARGS_ASSERT_FIRST_SYMBOL;
793
794 while (pat < patend) {
795 if (pat[0] != '#') return pat[0];
796 pat++;
797 pat = (const char *) memchr(pat, '\n', patend-pat);
798 if (!pat) return 0;
799 pat++;
800 }
801 return 0;
802}
803
804/*
805
806=head1 Pack and Unpack
807
808=for apidoc unpackstring
809
810The engine implementing the C<unpack()> Perl function.
811
812Using the template C<pat..patend>, this function unpacks the string
813C<s..strend> into a number of mortal SVs, which it pushes onto the perl
814argument (C<@_>) stack (so you will need to issue a C<PUTBACK> before and
815C<SPAGAIN> after the call to this function). It returns the number of
816pushed elements.
817
818The C<strend> and C<patend> pointers should point to the byte following the
819last character of each string.
820
821Although this function returns its values on the perl argument stack, it
822doesn't take any parameters from that stack (and thus in particular
823there's no need to do a C<PUSHMARK> before calling it, unlike L</call_pv> for
824example).
825
826=cut */
827
828SSize_t
829Perl_unpackstring(pTHX_ const char *pat, const char *patend, const char *s, const char *strend, U32 flags)
830{
831 tempsym_t sym;
832
833 PERL_ARGS_ASSERT_UNPACKSTRING;
834
835 if (flags & FLAG_DO_UTF8) flags |= FLAG_WAS_UTF8;
836 else if (need_utf8(pat, patend)) {
837 /* We probably should try to avoid this in case a scalar context call
838 wouldn't get to the "U0" */
839 STRLEN len = strend - s;
840 s = (char *) bytes_to_utf8((U8 *) s, &len);
841 SAVEFREEPV(s);
842 strend = s + len;
843 flags |= FLAG_DO_UTF8;
844 }
845
846 if (first_symbol(pat, patend) != 'U' && (flags & FLAG_DO_UTF8))
847 flags |= FLAG_PARSE_UTF8;
848
849 TEMPSYM_INIT(&sym, pat, patend, flags);
850
851 return unpack_rec(&sym, s, s, strend, NULL );
852}
853
854STATIC SSize_t
855S_unpack_rec(pTHX_ tempsym_t* symptr, const char *s, const char *strbeg, const char *strend, const char **new_s )
856{
857 dSP;
858 SV *sv = NULL;
859 const SSize_t start_sp_offset = SP - PL_stack_base;
860 howlen_t howlen;
861 SSize_t checksum = 0;
862 UV cuv = 0;
863 NV cdouble = 0.0;
864 const SSize_t bits_in_uv = CHAR_BIT * sizeof(cuv);
865 bool beyond = FALSE;
866 bool explicit_length;
867 const bool unpack_only_one = (symptr->flags & FLAG_UNPACK_ONLY_ONE) != 0;
868 bool utf8 = (symptr->flags & FLAG_PARSE_UTF8) ? 1 : 0;
869
870 PERL_ARGS_ASSERT_UNPACK_REC;
871
872 symptr->strbeg = s - strbeg;
873
874 while (next_symbol(symptr)) {
875 packprops_t props;
876 SSize_t len;
877 I32 datumtype = symptr->code;
878 bool needs_swap;
879 /* do first one only unless in list context
880 / is implemented by unpacking the count, then popping it from the
881 stack, so must check that we're not in the middle of a / */
882 if ( unpack_only_one
883 && (SP - PL_stack_base == start_sp_offset + 1)
884 && (datumtype != '/') ) /* XXX can this be omitted */
885 break;
886
887 switch (howlen = symptr->howlen) {
888 case e_star:
889 len = strend - strbeg; /* long enough */
890 break;
891 default:
892 /* e_no_len and e_number */
893 len = symptr->length;
894 break;
895 }
896
897 explicit_length = TRUE;
898 redo_switch:
899 beyond = s >= strend;
900
901 props = packprops[TYPE_NO_ENDIANNESS(datumtype)];
902 if (props) {
903 /* props nonzero means we can process this letter. */
904 const SSize_t size = props & PACK_SIZE_MASK;
905 const SSize_t howmany = (strend - s) / size;
906 if (len > howmany)
907 len = howmany;
908
909 if (!checksum || (props & PACK_SIZE_CANNOT_CSUM)) {
910 if (len && unpack_only_one) len = 1;
911 EXTEND(SP, len);
912 EXTEND_MORTAL(len);
913 }
914 }
915
916 needs_swap = NEEDS_SWAP(datumtype);
917
918 switch(TYPE_NO_ENDIANNESS(datumtype)) {
919 default:
920 Perl_croak(aTHX_ "Invalid type '%c' in unpack", (int)TYPE_NO_MODIFIERS(datumtype) );
921
922 case '%':
923 if (howlen == e_no_len)
924 len = 16; /* len is not specified */
925 checksum = len;
926 cuv = 0;
927 cdouble = 0;
928 continue;
929
930 case '(':
931 {
932 tempsym_t savsym = *symptr;
933 const U32 group_modifiers = TYPE_MODIFIERS(datumtype & ~symptr->flags);
934 symptr->flags |= group_modifiers;
935 symptr->patend = savsym.grpend;
936 symptr->previous = &savsym;
937 symptr->level++;
938 PUTBACK;
939 if (len && unpack_only_one) len = 1;
940 while (len--) {
941 symptr->patptr = savsym.grpbeg;
942 if (utf8) symptr->flags |= FLAG_PARSE_UTF8;
943 else symptr->flags &= ~FLAG_PARSE_UTF8;
944 unpack_rec(symptr, s, strbeg, strend, &s);
945 if (s == strend && savsym.howlen == e_star)
946 break; /* No way to continue */
947 }
948 SPAGAIN;
949 savsym.flags = symptr->flags & ~group_modifiers;
950 *symptr = savsym;
951 break;
952 }
953 case '.' | TYPE_IS_SHRIEKING:
954 case '.': {
955 const char *from;
956 SV *sv;
957 const bool u8 = utf8 && !(datumtype & TYPE_IS_SHRIEKING);
958 if (howlen == e_star) from = strbeg;
959 else if (len <= 0) from = s;
960 else {
961 tempsym_t *group = symptr;
962
963 while (--len && group) group = group->previous;
964 from = group ? strbeg + group->strbeg : strbeg;
965 }
966 sv = from <= s ?
967 newSVuv( u8 ? (UV) utf8_length((const U8*)from, (const U8*)s) : (UV) (s-from)) :
968 newSViv(-(u8 ? (IV) utf8_length((const U8*)s, (const U8*)from) : (IV) (from-s)));
969 mXPUSHs(sv);
970 break;
971 }
972 case '@' | TYPE_IS_SHRIEKING:
973 case '@':
974 s = strbeg + symptr->strbeg;
975 if (utf8 && !(datumtype & TYPE_IS_SHRIEKING))
976 {
977 while (len > 0) {
978 if (s >= strend)
979 Perl_croak(aTHX_ "'@' outside of string in unpack");
980 s += UTF8SKIP(s);
981 len--;
982 }
983 if (s > strend)
984 Perl_croak(aTHX_ "'@' outside of string with malformed UTF-8 in unpack");
985 } else {
986 if (strend-s < len)
987 Perl_croak(aTHX_ "'@' outside of string in unpack");
988 s += len;
989 }
990 break;
991 case 'X' | TYPE_IS_SHRIEKING:
992 if (!len) /* Avoid division by 0 */
993 len = 1;
994 if (utf8) {
995 const char *hop, *last;
996 SSize_t l = len;
997 hop = last = strbeg;
998 while (hop < s) {
999 hop += UTF8SKIP(hop);
1000 if (--l == 0) {
1001 last = hop;
1002 l = len;
1003 }
1004 }
1005 if (last > s)
1006 Perl_croak(aTHX_ "Malformed UTF-8 string in unpack");
1007 s = last;
1008 break;
1009 }
1010 len = (s - strbeg) % len;
1011 /* FALLTHROUGH */
1012 case 'X':
1013 if (utf8) {
1014 while (len > 0) {
1015 if (s <= strbeg)
1016 Perl_croak(aTHX_ "'X' outside of string in unpack");
1017 while (--s, UTF8_IS_CONTINUATION(*s)) {
1018 if (s <= strbeg)
1019 Perl_croak(aTHX_ "'X' outside of string in unpack");
1020 }
1021 len--;
1022 }
1023 } else {
1024 if (len > s - strbeg)
1025 Perl_croak(aTHX_ "'X' outside of string in unpack" );
1026 s -= len;
1027 }
1028 break;
1029 case 'x' | TYPE_IS_SHRIEKING: {
1030 SSize_t ai32;
1031 if (!len) /* Avoid division by 0 */
1032 len = 1;
1033 if (utf8) ai32 = utf8_length((U8 *) strbeg, (U8 *) s) % len;
1034 else ai32 = (s - strbeg) % len;
1035 if (ai32 == 0) break;
1036 len -= ai32;
1037 }
1038 /* FALLTHROUGH */
1039 case 'x':
1040 if (utf8) {
1041 while (len>0) {
1042 if (s >= strend)
1043 Perl_croak(aTHX_ "'x' outside of string in unpack");
1044 s += UTF8SKIP(s);
1045 len--;
1046 }
1047 } else {
1048 if (len > strend - s)
1049 Perl_croak(aTHX_ "'x' outside of string in unpack");
1050 s += len;
1051 }
1052 break;
1053 case '/':
1054 Perl_croak(aTHX_ "'/' must follow a numeric type in unpack");
1055
1056 case 'A':
1057 case 'Z':
1058 case 'a':
1059 if (checksum) {
1060 /* Preliminary length estimate is assumed done in 'W' */
1061 if (len > strend - s) len = strend - s;
1062 goto W_checksum;
1063 }
1064 if (utf8) {
1065 SSize_t l;
1066 const char *hop;
1067 for (l=len, hop=s; l>0; l--, hop += UTF8SKIP(hop)) {
1068 if (hop >= strend) {
1069 if (hop > strend)
1070 Perl_croak(aTHX_ "Malformed UTF-8 string in unpack");
1071 break;
1072 }
1073 }
1074 if (hop > strend)
1075 Perl_croak(aTHX_ "Malformed UTF-8 string in unpack");
1076 len = hop - s;
1077 } else if (len > strend - s)
1078 len = strend - s;
1079
1080 if (datumtype == 'Z') {
1081 /* 'Z' strips stuff after first null */
1082 const char *ptr, *end;
1083 end = s + len;
1084 for (ptr = s; ptr < end; ptr++) if (*ptr == 0) break;
1085 sv = newSVpvn(s, ptr-s);
1086 if (howlen == e_star) /* exact for 'Z*' */
1087 len = ptr-s + (ptr != strend ? 1 : 0);
1088 } else if (datumtype == 'A') {
1089 /* 'A' strips both nulls and spaces */
1090 const char *ptr;
1091 if (utf8 && (symptr->flags & FLAG_WAS_UTF8)) {
1092 for (ptr = s+len-1; ptr >= s; ptr--) {
1093 if ( *ptr != 0
1094 && !UTF8_IS_CONTINUATION(*ptr)
1095 && !isSPACE_utf8_safe(ptr, strend))
1096 {
1097 break;
1098 }
1099 }
1100 if (ptr >= s) ptr += UTF8SKIP(ptr);
1101 else ptr++;
1102 if (ptr > s+len)
1103 Perl_croak(aTHX_ "Malformed UTF-8 string in unpack");
1104 } else {
1105 for (ptr = s+len-1; ptr >= s; ptr--)
1106 if (*ptr != 0 && !isSPACE(*ptr)) break;
1107 ptr++;
1108 }
1109 sv = newSVpvn(s, ptr-s);
1110 } else sv = newSVpvn(s, len);
1111
1112 if (utf8) {
1113 SvUTF8_on(sv);
1114 /* Undo any upgrade done due to need_utf8() */
1115 if (!(symptr->flags & FLAG_WAS_UTF8))
1116 sv_utf8_downgrade(sv, 0);
1117 }
1118 mXPUSHs(sv);
1119 s += len;
1120 break;
1121 case 'B':
1122 case 'b': {
1123 char *str;
1124 if (howlen == e_star || len > (strend - s) * 8)
1125 len = (strend - s) * 8;
1126 if (checksum) {
1127 if (utf8)
1128 while (len >= 8 && s < strend) {
1129 cuv += PL_bitcount[utf8_to_byte(aTHX_ &s, strend, datumtype)];
1130 len -= 8;
1131 }
1132 else
1133 while (len >= 8) {
1134 cuv += PL_bitcount[*(U8 *)s++];
1135 len -= 8;
1136 }
1137 if (len && s < strend) {
1138 U8 bits;
1139 bits = SHIFT_BYTE(utf8, s, strend, datumtype);
1140 if (datumtype == 'b')
1141 while (len-- > 0) {
1142 if (bits & 1) cuv++;
1143 bits >>= 1;
1144 }
1145 else
1146 while (len-- > 0) {
1147 if (bits & 0x80) cuv++;
1148 bits <<= 1;
1149 }
1150 }
1151 break;
1152 }
1153
1154 sv = sv_2mortal(newSV(len ? len : 1));
1155 SvPOK_on(sv);
1156 str = SvPVX(sv);
1157 if (datumtype == 'b') {
1158 U8 bits = 0;
1159 const SSize_t ai32 = len;
1160 for (len = 0; len < ai32; len++) {
1161 if (len & 7) bits >>= 1;
1162 else if (utf8) {
1163 if (s >= strend) break;
1164 bits = utf8_to_byte(aTHX_ &s, strend, datumtype);
1165 } else bits = *(U8 *) s++;
1166 *str++ = bits & 1 ? '1' : '0';
1167 }
1168 } else {
1169 U8 bits = 0;
1170 const SSize_t ai32 = len;
1171 for (len = 0; len < ai32; len++) {
1172 if (len & 7) bits <<= 1;
1173 else if (utf8) {
1174 if (s >= strend) break;
1175 bits = utf8_to_byte(aTHX_ &s, strend, datumtype);
1176 } else bits = *(U8 *) s++;
1177 *str++ = bits & 0x80 ? '1' : '0';
1178 }
1179 }
1180 *str = '\0';
1181 SvCUR_set(sv, str - SvPVX_const(sv));
1182 XPUSHs(sv);
1183 break;
1184 }
1185 case 'H':
1186 case 'h': {
1187 char *str = NULL;
1188 /* Preliminary length estimate, acceptable for utf8 too */
1189 if (howlen == e_star || len > (strend - s) * 2)
1190 len = (strend - s) * 2;
1191 if (!checksum) {
1192 sv = sv_2mortal(newSV(len ? len : 1));
1193 SvPOK_on(sv);
1194 str = SvPVX(sv);
1195 }
1196 if (datumtype == 'h') {
1197 U8 bits = 0;
1198 SSize_t ai32 = len;
1199 for (len = 0; len < ai32; len++) {
1200 if (len & 1) bits >>= 4;
1201 else if (utf8) {
1202 if (s >= strend) break;
1203 bits = utf8_to_byte(aTHX_ &s, strend, datumtype);
1204 } else bits = * (U8 *) s++;
1205 if (!checksum)
1206 *str++ = PL_hexdigit[bits & 15];
1207 }
1208 } else {
1209 U8 bits = 0;
1210 const SSize_t ai32 = len;
1211 for (len = 0; len < ai32; len++) {
1212 if (len & 1) bits <<= 4;
1213 else if (utf8) {
1214 if (s >= strend) break;
1215 bits = utf8_to_byte(aTHX_ &s, strend, datumtype);
1216 } else bits = *(U8 *) s++;
1217 if (!checksum)
1218 *str++ = PL_hexdigit[(bits >> 4) & 15];
1219 }
1220 }
1221 if (!checksum) {
1222 *str = '\0';
1223 SvCUR_set(sv, str - SvPVX_const(sv));
1224 XPUSHs(sv);
1225 }
1226 break;
1227 }
1228 case 'C':
1229 if (len == 0) {
1230 if (explicit_length)
1231 /* Switch to "character" mode */
1232 utf8 = (symptr->flags & FLAG_DO_UTF8) ? 1 : 0;
1233 break;
1234 }
1235 /* FALLTHROUGH */
1236 case 'c':
1237 while (len-- > 0 && s < strend) {
1238 int aint;
1239 if (utf8)
1240 {
1241 STRLEN retlen;
1242 aint = utf8n_to_uvchr((U8 *) s, strend-s, &retlen,
1243 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
1244 if (retlen == (STRLEN) -1)
1245 Perl_croak(aTHX_ "Malformed UTF-8 string in unpack");
1246 s += retlen;
1247 }
1248 else
1249 aint = *(U8 *)(s)++;
1250 if (aint >= 128 && datumtype != 'C') /* fake up signed chars */
1251 aint -= 256;
1252 if (!checksum)
1253 mPUSHi(aint);
1254 else if (checksum > bits_in_uv)
1255 cdouble += (NV)aint;
1256 else
1257 cuv += aint;
1258 }
1259 break;
1260 case 'W':
1261 W_checksum:
1262 if (utf8) {
1263 while (len-- > 0 && s < strend) {
1264 STRLEN retlen;
1265 const UV val = utf8n_to_uvchr((U8 *) s, strend-s, &retlen,
1266 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
1267 if (retlen == (STRLEN) -1)
1268 Perl_croak(aTHX_ "Malformed UTF-8 string in unpack");
1269 s += retlen;
1270 if (!checksum)
1271 mPUSHu(val);
1272 else if (checksum > bits_in_uv)
1273 cdouble += (NV) val;
1274 else
1275 cuv += val;
1276 }
1277 } else if (!checksum)
1278 while (len-- > 0) {
1279 const U8 ch = *(U8 *) s++;
1280 mPUSHu(ch);
1281 }
1282 else if (checksum > bits_in_uv)
1283 while (len-- > 0) cdouble += (NV) *(U8 *) s++;
1284 else
1285 while (len-- > 0) cuv += *(U8 *) s++;
1286 break;
1287 case 'U':
1288 if (len == 0) {
1289 if (explicit_length && howlen != e_star) {
1290 /* Switch to "bytes in UTF-8" mode */
1291 if (symptr->flags & FLAG_DO_UTF8) utf8 = 0;
1292 else
1293 /* Should be impossible due to the need_utf8() test */
1294 Perl_croak(aTHX_ "U0 mode on a byte string");
1295 }
1296 break;
1297 }
1298 if (len > strend - s) len = strend - s;
1299 if (!checksum) {
1300 if (len && unpack_only_one) len = 1;
1301 EXTEND(SP, len);
1302 EXTEND_MORTAL(len);
1303 }
1304 while (len-- > 0 && s < strend) {
1305 STRLEN retlen;
1306 UV auv;
1307 if (utf8) {
1308 U8 result[UTF8_MAXLEN+1];
1309 const char *ptr = s;
1310 STRLEN len;
1311 /* Bug: warns about bad utf8 even if we are short on bytes
1312 and will break out of the loop */
1313 if (!S_utf8_to_bytes(aTHX_ &ptr, strend, (char *) result, 1,
1314 'U'))
1315 break;
1316 len = UTF8SKIP(result);
1317 if (!S_utf8_to_bytes(aTHX_ &ptr, strend,
1318 (char *) &result[1], len-1, 'U')) break;
1319 auv = NATIVE_TO_UNI(utf8n_to_uvchr(result,
1320 len,
1321 &retlen,
1322 UTF8_ALLOW_DEFAULT));
1323 s = ptr;
1324 } else {
1325 auv = NATIVE_TO_UNI(utf8n_to_uvchr((U8*)s,
1326 strend - s,
1327 &retlen,
1328 UTF8_ALLOW_DEFAULT));
1329 if (retlen == (STRLEN) -1)
1330 Perl_croak(aTHX_ "Malformed UTF-8 string in unpack");
1331 s += retlen;
1332 }
1333 if (!checksum)
1334 mPUSHu(auv);
1335 else if (checksum > bits_in_uv)
1336 cdouble += (NV) auv;
1337 else
1338 cuv += auv;
1339 }
1340 break;
1341 case 's' | TYPE_IS_SHRIEKING:
1342#if SHORTSIZE != SIZE16
1343 while (len-- > 0) {
1344 short ashort;
1345 SHIFT_VAR(utf8, s, strend, ashort, datumtype, needs_swap);
1346 if (!checksum)
1347 mPUSHi(ashort);
1348 else if (checksum > bits_in_uv)
1349 cdouble += (NV)ashort;
1350 else
1351 cuv += ashort;
1352 }
1353 break;
1354#else
1355 /* FALLTHROUGH */
1356#endif
1357 case 's':
1358 while (len-- > 0) {
1359 I16 ai16;
1360
1361#if U16SIZE > SIZE16
1362 ai16 = 0;
1363#endif
1364 SHIFT16(utf8, s, strend, &ai16, datumtype, needs_swap);
1365#if U16SIZE > SIZE16
1366 if (ai16 > 32767)
1367 ai16 -= 65536;
1368#endif
1369 if (!checksum)
1370 mPUSHi(ai16);
1371 else if (checksum > bits_in_uv)
1372 cdouble += (NV)ai16;
1373 else
1374 cuv += ai16;
1375 }
1376 break;
1377 case 'S' | TYPE_IS_SHRIEKING:
1378#if SHORTSIZE != SIZE16
1379 while (len-- > 0) {
1380 unsigned short aushort;
1381 SHIFT_VAR(utf8, s, strend, aushort, datumtype, needs_swap,
1382 needs_swap);
1383 if (!checksum)
1384 mPUSHu(aushort);
1385 else if (checksum > bits_in_uv)
1386 cdouble += (NV)aushort;
1387 else
1388 cuv += aushort;
1389 }
1390 break;
1391#else
1392 /* FALLTHROUGH */
1393#endif
1394 case 'v':
1395 case 'n':
1396 case 'S':
1397 while (len-- > 0) {
1398 U16 au16;
1399#if U16SIZE > SIZE16
1400 au16 = 0;
1401#endif
1402 SHIFT16(utf8, s, strend, &au16, datumtype, needs_swap);
1403 if (datumtype == 'n')
1404 au16 = PerlSock_ntohs(au16);
1405 if (datumtype == 'v')
1406 au16 = vtohs(au16);
1407 if (!checksum)
1408 mPUSHu(au16);
1409 else if (checksum > bits_in_uv)
1410 cdouble += (NV) au16;
1411 else
1412 cuv += au16;
1413 }
1414 break;
1415 case 'v' | TYPE_IS_SHRIEKING:
1416 case 'n' | TYPE_IS_SHRIEKING:
1417 while (len-- > 0) {
1418 I16 ai16;
1419# if U16SIZE > SIZE16
1420 ai16 = 0;
1421# endif
1422 SHIFT16(utf8, s, strend, &ai16, datumtype, needs_swap);
1423 /* There should never be any byte-swapping here. */
1424 assert(!TYPE_ENDIANNESS(datumtype));
1425 if (datumtype == ('n' | TYPE_IS_SHRIEKING))
1426 ai16 = (I16) PerlSock_ntohs((U16) ai16);
1427 if (datumtype == ('v' | TYPE_IS_SHRIEKING))
1428 ai16 = (I16) vtohs((U16) ai16);
1429 if (!checksum)
1430 mPUSHi(ai16);
1431 else if (checksum > bits_in_uv)
1432 cdouble += (NV) ai16;
1433 else
1434 cuv += ai16;
1435 }
1436 break;
1437 case 'i':
1438 case 'i' | TYPE_IS_SHRIEKING:
1439 while (len-- > 0) {
1440 int aint;
1441 SHIFT_VAR(utf8, s, strend, aint, datumtype, needs_swap);
1442 if (!checksum)
1443 mPUSHi(aint);
1444 else if (checksum > bits_in_uv)
1445 cdouble += (NV)aint;
1446 else
1447 cuv += aint;
1448 }
1449 break;
1450 case 'I':
1451 case 'I' | TYPE_IS_SHRIEKING:
1452 while (len-- > 0) {
1453 unsigned int auint;
1454 SHIFT_VAR(utf8, s, strend, auint, datumtype, needs_swap);
1455 if (!checksum)
1456 mPUSHu(auint);
1457 else if (checksum > bits_in_uv)
1458 cdouble += (NV)auint;
1459 else
1460 cuv += auint;
1461 }
1462 break;
1463 case 'j':
1464 while (len-- > 0) {
1465 IV aiv;
1466 SHIFT_VAR(utf8, s, strend, aiv, datumtype, needs_swap);
1467 if (!checksum)
1468 mPUSHi(aiv);
1469 else if (checksum > bits_in_uv)
1470 cdouble += (NV)aiv;
1471 else
1472 cuv += aiv;
1473 }
1474 break;
1475 case 'J':
1476 while (len-- > 0) {
1477 UV auv;
1478 SHIFT_VAR(utf8, s, strend, auv, datumtype, needs_swap);
1479 if (!checksum)
1480 mPUSHu(auv);
1481 else if (checksum > bits_in_uv)
1482 cdouble += (NV)auv;
1483 else
1484 cuv += auv;
1485 }
1486 break;
1487 case 'l' | TYPE_IS_SHRIEKING:
1488#if LONGSIZE != SIZE32
1489 while (len-- > 0) {
1490 long along;
1491 SHIFT_VAR(utf8, s, strend, along, datumtype, needs_swap);
1492 if (!checksum)
1493 mPUSHi(along);
1494 else if (checksum > bits_in_uv)
1495 cdouble += (NV)along;
1496 else
1497 cuv += along;
1498 }
1499 break;
1500#else
1501 /* FALLTHROUGH */
1502#endif
1503 case 'l':
1504 while (len-- > 0) {
1505 I32 ai32;
1506#if U32SIZE > SIZE32
1507 ai32 = 0;
1508#endif
1509 SHIFT32(utf8, s, strend, &ai32, datumtype, needs_swap);
1510#if U32SIZE > SIZE32
1511 if (ai32 > 2147483647) ai32 -= 4294967296;
1512#endif
1513 if (!checksum)
1514 mPUSHi(ai32);
1515 else if (checksum > bits_in_uv)
1516 cdouble += (NV)ai32;
1517 else
1518 cuv += ai32;
1519 }
1520 break;
1521 case 'L' | TYPE_IS_SHRIEKING:
1522#if LONGSIZE != SIZE32
1523 while (len-- > 0) {
1524 unsigned long aulong;
1525 SHIFT_VAR(utf8, s, strend, aulong, datumtype, needs_swap);
1526 if (!checksum)
1527 mPUSHu(aulong);
1528 else if (checksum > bits_in_uv)
1529 cdouble += (NV)aulong;
1530 else
1531 cuv += aulong;
1532 }
1533 break;
1534#else
1535 /* FALLTHROUGH */
1536#endif
1537 case 'V':
1538 case 'N':
1539 case 'L':
1540 while (len-- > 0) {
1541 U32 au32;
1542#if U32SIZE > SIZE32
1543 au32 = 0;
1544#endif
1545 SHIFT32(utf8, s, strend, &au32, datumtype, needs_swap);
1546 if (datumtype == 'N')
1547 au32 = PerlSock_ntohl(au32);
1548 if (datumtype == 'V')
1549 au32 = vtohl(au32);
1550 if (!checksum)
1551 mPUSHu(au32);
1552 else if (checksum > bits_in_uv)
1553 cdouble += (NV)au32;
1554 else
1555 cuv += au32;
1556 }
1557 break;
1558 case 'V' | TYPE_IS_SHRIEKING:
1559 case 'N' | TYPE_IS_SHRIEKING:
1560 while (len-- > 0) {
1561 I32 ai32;
1562#if U32SIZE > SIZE32
1563 ai32 = 0;
1564#endif
1565 SHIFT32(utf8, s, strend, &ai32, datumtype, needs_swap);
1566 /* There should never be any byte swapping here. */
1567 assert(!TYPE_ENDIANNESS(datumtype));
1568 if (datumtype == ('N' | TYPE_IS_SHRIEKING))
1569 ai32 = (I32)PerlSock_ntohl((U32)ai32);
1570 if (datumtype == ('V' | TYPE_IS_SHRIEKING))
1571 ai32 = (I32)vtohl((U32)ai32);
1572 if (!checksum)
1573 mPUSHi(ai32);
1574 else if (checksum > bits_in_uv)
1575 cdouble += (NV)ai32;
1576 else
1577 cuv += ai32;
1578 }
1579 break;
1580 case 'p':
1581 while (len-- > 0) {
1582 const char *aptr;
1583 SHIFT_VAR(utf8, s, strend, aptr, datumtype, needs_swap);
1584 /* newSVpv generates undef if aptr is NULL */
1585 mPUSHs(newSVpv(aptr, 0));
1586 }
1587 break;
1588 case 'w':
1589 {
1590 UV auv = 0;
1591 size_t bytes = 0;
1592
1593 while (len > 0 && s < strend) {
1594 U8 ch;
1595 ch = SHIFT_BYTE(utf8, s, strend, datumtype);
1596 auv = (auv << 7) | (ch & 0x7f);
1597 /* UTF8_IS_XXXXX not right here because this is a BER, not
1598 * UTF-8 format - using constant 0x80 */
1599 if (ch < 0x80) {
1600 bytes = 0;
1601 mPUSHu(auv);
1602 len--;
1603 auv = 0;
1604 continue;
1605 }
1606 if (++bytes >= sizeof(UV)) { /* promote to string */
1607 const char *t;
1608
1609 sv = Perl_newSVpvf(aTHX_ "%.*" UVuf,
1610 (int)TYPE_DIGITS(UV), auv);
1611 while (s < strend) {
1612 ch = SHIFT_BYTE(utf8, s, strend, datumtype);
1613 sv = mul128(sv, (U8)(ch & 0x7f));
1614 if (!(ch & 0x80)) {
1615 bytes = 0;
1616 break;
1617 }
1618 }
1619 t = SvPV_nolen_const(sv);
1620 while (*t == '0')
1621 t++;
1622 sv_chop(sv, t);
1623 mPUSHs(sv);
1624 len--;
1625 auv = 0;
1626 }
1627 }
1628 if ((s >= strend) && bytes)
1629 Perl_croak(aTHX_ "Unterminated compressed integer in unpack");
1630 }
1631 break;
1632 case 'P':
1633 if (symptr->howlen == e_star)
1634 Perl_croak(aTHX_ "'P' must have an explicit size in unpack");
1635 EXTEND(SP, 1);
1636 if (s + sizeof(char*) <= strend) {
1637 char *aptr;
1638 SHIFT_VAR(utf8, s, strend, aptr, datumtype, needs_swap);
1639 /* newSVpvn generates undef if aptr is NULL */
1640 PUSHs(newSVpvn_flags(aptr, len, SVs_TEMP));
1641 }
1642 break;
1643#if defined(HAS_QUAD) && IVSIZE >= 8
1644 case 'q':
1645 while (len-- > 0) {
1646 Quad_t aquad;
1647 SHIFT_VAR(utf8, s, strend, aquad, datumtype, needs_swap);
1648 if (!checksum)
1649 mPUSHs(newSViv((IV)aquad));
1650 else if (checksum > bits_in_uv)
1651 cdouble += (NV)aquad;
1652 else
1653 cuv += aquad;
1654 }
1655 break;
1656 case 'Q':
1657 while (len-- > 0) {
1658 Uquad_t auquad;
1659 SHIFT_VAR(utf8, s, strend, auquad, datumtype, needs_swap);
1660 if (!checksum)
1661 mPUSHs(newSVuv((UV)auquad));
1662 else if (checksum > bits_in_uv)
1663 cdouble += (NV)auquad;
1664 else
1665 cuv += auquad;
1666 }
1667 break;
1668#endif
1669 /* float and double added gnb@melba.bby.oz.au 22/11/89 */
1670 case 'f':
1671 while (len-- > 0) {
1672 float afloat;
1673 SHIFT_VAR(utf8, s, strend, afloat, datumtype, needs_swap);
1674 if (!checksum)
1675 mPUSHn(afloat);
1676 else
1677 cdouble += afloat;
1678 }
1679 break;
1680 case 'd':
1681 while (len-- > 0) {
1682 double adouble;
1683 SHIFT_VAR(utf8, s, strend, adouble, datumtype, needs_swap);
1684 if (!checksum)
1685 mPUSHn(adouble);
1686 else
1687 cdouble += adouble;
1688 }
1689 break;
1690 case 'F':
1691 while (len-- > 0) {
1692 NV_bytes anv;
1693 SHIFT_BYTES(utf8, s, strend, anv.bytes, sizeof(anv.bytes),
1694 datumtype, needs_swap);
1695 if (!checksum)
1696 mPUSHn(anv.nv);
1697 else
1698 cdouble += anv.nv;
1699 }
1700 break;
1701#if defined(HAS_LONG_DOUBLE) && defined(USE_LONG_DOUBLE)
1702 case 'D':
1703 while (len-- > 0) {
1704 ld_bytes aldouble;
1705 SHIFT_BYTES(utf8, s, strend, aldouble.bytes,
1706 sizeof(aldouble.bytes), datumtype, needs_swap);
1707 /* The most common long double format, the x86 80-bit
1708 * extended precision, has either 2 or 6 unused bytes,
1709 * which may contain garbage, which may contain
1710 * unintentional data. While we do zero the bytes of
1711 * the long double data in pack(), here in unpack() we
1712 * don't, because it's really hard to envision that
1713 * reading the long double off aldouble would be
1714 * affected by the unused bytes.
1715 *
1716 * Note that trying to unpack 'long doubles' of 'long
1717 * doubles' packed in another system is in the general
1718 * case doomed without having more detail. */
1719 if (!checksum)
1720 mPUSHn(aldouble.ld);
1721 else
1722 cdouble += aldouble.ld;
1723 }
1724 break;
1725#endif
1726 case 'u':
1727 if (!checksum) {
1728 const STRLEN l = (STRLEN) (strend - s) * 3 / 4;
1729 sv = sv_2mortal(newSV(l));
1730 if (l) SvPOK_on(sv);
1731 }
1732
1733 /* Note that all legal uuencoded strings are ASCII printables, so
1734 * have the same representation under UTF-8 vs not. This means we
1735 * can ignore UTF8ness on legal input. For illegal we stop at the
1736 * first failure, and don't report where/what that is, so again we
1737 * can ignore UTF8ness */
1738
1739 while (s < strend && *s != ' ' && ISUUCHAR(*s)) {
1740 I32 a, b, c, d;
1741 char hunk[3];
1742
1743 len = PL_uudmap[*(U8*)s++] & 077;
1744 while (len > 0) {
1745 if (s < strend && ISUUCHAR(*s))
1746 a = PL_uudmap[*(U8*)s++] & 077;
1747 else
1748 a = 0;
1749 if (s < strend && ISUUCHAR(*s))
1750 b = PL_uudmap[*(U8*)s++] & 077;
1751 else
1752 b = 0;
1753 if (s < strend && ISUUCHAR(*s))
1754 c = PL_uudmap[*(U8*)s++] & 077;
1755 else
1756 c = 0;
1757 if (s < strend && ISUUCHAR(*s))
1758 d = PL_uudmap[*(U8*)s++] & 077;
1759 else
1760 d = 0;
1761 hunk[0] = (char)((a << 2) | (b >> 4));
1762 hunk[1] = (char)((b << 4) | (c >> 2));
1763 hunk[2] = (char)((c << 6) | d);
1764 if (!checksum)
1765 sv_catpvn(sv, hunk, (len > 3) ? 3 : len);
1766 len -= 3;
1767 }
1768 if (*s == '\n')
1769 s++;
1770 else /* possible checksum byte */
1771 if (s + 1 < strend && s[1] == '\n')
1772 s += 2;
1773 }
1774 if (!checksum)
1775 XPUSHs(sv);
1776 break;
1777 } /* End of switch */
1778
1779 if (checksum) {
1780 if (strchr("fFdD", TYPE_NO_MODIFIERS(datumtype)) ||
1781 (checksum > bits_in_uv &&
1782 strchr("cCsSiIlLnNUWvVqQjJ", TYPE_NO_MODIFIERS(datumtype))) ) {
1783 NV trouble, anv;
1784
1785 anv = (NV) (1 << (checksum & 15));
1786 while (checksum >= 16) {
1787 checksum -= 16;
1788 anv *= 65536.0;
1789 }
1790 while (cdouble < 0.0)
1791 cdouble += anv;
1792 cdouble = Perl_modf(cdouble / anv, &trouble);
1793#ifdef LONGDOUBLE_DOUBLEDOUBLE
1794 /* Workaround for powerpc doubledouble modfl bug:
1795 * close to 1.0L and -1.0L cdouble is 0, and trouble
1796 * is cdouble / anv. */
1797 if (trouble != Perl_ceil(trouble)) {
1798 cdouble = trouble;
1799 if (cdouble > 1.0L) cdouble -= 1.0L;
1800 if (cdouble < -1.0L) cdouble += 1.0L;
1801 }
1802#endif
1803 cdouble *= anv;
1804 sv = newSVnv(cdouble);
1805 }
1806 else {
1807 if (checksum < bits_in_uv) {
1808 UV mask = ((UV)1 << checksum) - 1;
1809 cuv &= mask;
1810 }
1811 sv = newSVuv(cuv);
1812 }
1813 mXPUSHs(sv);
1814 checksum = 0;
1815 }
1816
1817 if (symptr->flags & FLAG_SLASH){
1818 if (SP - PL_stack_base - start_sp_offset <= 0)
1819 break;
1820 if( next_symbol(symptr) ){
1821 if( symptr->howlen == e_number )
1822 Perl_croak(aTHX_ "Count after length/code in unpack" );
1823 if( beyond ){
1824 /* ...end of char buffer then no decent length available */
1825 Perl_croak(aTHX_ "length/code after end of string in unpack" );
1826 } else {
1827 /* take top of stack (hope it's numeric) */
1828 len = POPi;
1829 if( len < 0 )
1830 Perl_croak(aTHX_ "Negative '/' count in unpack" );
1831 }
1832 } else {
1833 Perl_croak(aTHX_ "Code missing after '/' in unpack" );
1834 }
1835 datumtype = symptr->code;
1836 explicit_length = FALSE;
1837 goto redo_switch;
1838 }
1839 }
1840
1841 if (new_s)
1842 *new_s = s;
1843 PUTBACK;
1844 return SP - PL_stack_base - start_sp_offset;
1845}
1846
1847PP(pp_unpack)
1848{
1849 dSP;
1850 dPOPPOPssrl;
1851 U8 gimme = GIMME_V;
1852 STRLEN llen;
1853 STRLEN rlen;
1854 const char *pat = SvPV_const(left, llen);
1855 const char *s = SvPV_const(right, rlen);
1856 const char *strend = s + rlen;
1857 const char *patend = pat + llen;
1858 SSize_t cnt;
1859
1860 PUTBACK;
1861 cnt = unpackstring(pat, patend, s, strend,
1862 ((gimme == G_SCALAR) ? FLAG_UNPACK_ONLY_ONE : 0)
1863 | (DO_UTF8(right) ? FLAG_DO_UTF8 : 0));
1864
1865 SPAGAIN;
1866 if ( !cnt && gimme == G_SCALAR )
1867 PUSHs(&PL_sv_undef);
1868 RETURN;
1869}
1870
1871STATIC U8 *
1872doencodes(U8 *h, const U8 *s, SSize_t len)
1873{
1874 *h++ = PL_uuemap[len];
1875 while (len > 2) {
1876 *h++ = PL_uuemap[(077 & (s[0] >> 2))];
1877 *h++ = PL_uuemap[(077 & (((s[0] << 4) & 060) | ((s[1] >> 4) & 017)))];
1878 *h++ = PL_uuemap[(077 & (((s[1] << 2) & 074) | ((s[2] >> 6) & 03)))];
1879 *h++ = PL_uuemap[(077 & (s[2] & 077))];
1880 s += 3;
1881 len -= 3;
1882 }
1883 if (len > 0) {
1884 const U8 r = (len > 1 ? s[1] : '\0');
1885 *h++ = PL_uuemap[(077 & (s[0] >> 2))];
1886 *h++ = PL_uuemap[(077 & (((s[0] << 4) & 060) | ((r >> 4) & 017)))];
1887 *h++ = PL_uuemap[(077 & ((r << 2) & 074))];
1888 *h++ = PL_uuemap[0];
1889 }
1890 *h++ = '\n';
1891 return h;
1892}
1893
1894STATIC SV *
1895S_is_an_int(pTHX_ const char *s, STRLEN l)
1896{
1897 SV *result = newSVpvn(s, l);
1898 char *const result_c = SvPV_nolen(result); /* convenience */
1899 char *out = result_c;
1900 bool skip = 1;
1901 bool ignore = 0;
1902
1903 PERL_ARGS_ASSERT_IS_AN_INT;
1904
1905 while (*s) {
1906 switch (*s) {
1907 case ' ':
1908 break;
1909 case '+':
1910 if (!skip) {
1911 SvREFCNT_dec(result);
1912 return (NULL);
1913 }
1914 break;
1915 case '0':
1916 case '1':
1917 case '2':
1918 case '3':
1919 case '4':
1920 case '5':
1921 case '6':
1922 case '7':
1923 case '8':
1924 case '9':
1925 skip = 0;
1926 if (!ignore) {
1927 *(out++) = *s;
1928 }
1929 break;
1930 case '.':
1931 ignore = 1;
1932 break;
1933 default:
1934 SvREFCNT_dec(result);
1935 return (NULL);
1936 }
1937 s++;
1938 }
1939 *(out++) = '\0';
1940 SvCUR_set(result, out - result_c);
1941 return (result);
1942}
1943
1944/* pnum must be '\0' terminated */
1945STATIC int
1946S_div128(pTHX_ SV *pnum, bool *done)
1947{
1948 STRLEN len;
1949 char * const s = SvPV(pnum, len);
1950 char *t = s;
1951 int m = 0;
1952
1953 PERL_ARGS_ASSERT_DIV128;
1954
1955 *done = 1;
1956 while (*t) {
1957 const int i = m * 10 + (*t - '0');
1958 const int r = (i >> 7); /* r < 10 */
1959 m = i & 0x7F;
1960 if (r) {
1961 *done = 0;
1962 }
1963 *(t++) = '0' + r;
1964 }
1965 *(t++) = '\0';
1966 SvCUR_set(pnum, (STRLEN) (t - s));
1967 return (m);
1968}
1969
1970/*
1971=for apidoc packlist
1972
1973The engine implementing C<pack()> Perl function.
1974
1975=cut
1976*/
1977
1978void
1979Perl_packlist(pTHX_ SV *cat, const char *pat, const char *patend, SV **beglist, SV **endlist )
1980{
1981 tempsym_t sym;
1982
1983 PERL_ARGS_ASSERT_PACKLIST;
1984
1985 TEMPSYM_INIT(&sym, pat, patend, FLAG_PACK);
1986
1987 /* We're going to do changes through SvPVX(cat). Make sure it's valid.
1988 Also make sure any UTF8 flag is loaded */
1989 SvPV_force_nolen(cat);
1990 if (DO_UTF8(cat))
1991 sym.flags |= FLAG_PARSE_UTF8 | FLAG_DO_UTF8;
1992
1993 (void)pack_rec( cat, &sym, beglist, endlist );
1994}
1995
1996/* like sv_utf8_upgrade, but also repoint the group start markers */
1997STATIC void
1998marked_upgrade(pTHX_ SV *sv, tempsym_t *sym_ptr) {
1999 STRLEN len;
2000 tempsym_t *group;
2001 const char *from_ptr, *from_start, *from_end, **marks, **m;
2002 char *to_start, *to_ptr;
2003
2004 if (SvUTF8(sv)) return;
2005
2006 from_start = SvPVX_const(sv);
2007 from_end = from_start + SvCUR(sv);
2008 for (from_ptr = from_start; from_ptr < from_end; from_ptr++)
2009 if (!NATIVE_BYTE_IS_INVARIANT(*from_ptr)) break;
2010 if (from_ptr == from_end) {
2011 /* Simple case: no character needs to be changed */
2012 SvUTF8_on(sv);
2013 return;
2014 }
2015
2016 len = (from_end-from_ptr)*UTF8_EXPAND+(from_ptr-from_start)+1;
2017 Newx(to_start, len, char);
2018 Copy(from_start, to_start, from_ptr-from_start, char);
2019 to_ptr = to_start + (from_ptr-from_start);
2020
2021 Newx(marks, sym_ptr->level+2, const char *);
2022 for (group=sym_ptr; group; group = group->previous)
2023 marks[group->level] = from_start + group->strbeg;
2024 marks[sym_ptr->level+1] = from_end+1;
2025 for (m = marks; *m < from_ptr; m++)
2026 *m = to_start + (*m-from_start);
2027
2028 for (;from_ptr < from_end; from_ptr++) {
2029 while (*m == from_ptr) *m++ = to_ptr;
2030 to_ptr = (char *) uvchr_to_utf8((U8 *) to_ptr, *(U8 *) from_ptr);
2031 }
2032 *to_ptr = 0;
2033
2034 while (*m == from_ptr) *m++ = to_ptr;
2035 if (m != marks + sym_ptr->level+1) {
2036 Safefree(marks);
2037 Safefree(to_start);
2038 Perl_croak(aTHX_ "panic: marks beyond string end, m=%p, marks=%p, "
2039 "level=%d", m, marks, sym_ptr->level);
2040 }
2041 for (group=sym_ptr; group; group = group->previous)
2042 group->strbeg = marks[group->level] - to_start;
2043 Safefree(marks);
2044
2045 if (SvOOK(sv)) {
2046 if (SvIVX(sv)) {
2047 SvLEN_set(sv, SvLEN(sv) + SvIVX(sv));
2048 from_start -= SvIVX(sv);
2049 SvIV_set(sv, 0);
2050 }
2051 SvFLAGS(sv) &= ~SVf_OOK;
2052 }
2053 if (SvLEN(sv) != 0)
2054 Safefree(from_start);
2055 SvPV_set(sv, to_start);
2056 SvCUR_set(sv, to_ptr - to_start);
2057 SvLEN_set(sv, len);
2058 SvUTF8_on(sv);
2059}
2060
2061/* Exponential string grower. Makes string extension effectively O(n)
2062 needed says how many extra bytes we need (not counting the final '\0')
2063 Only grows the string if there is an actual lack of space
2064*/
2065STATIC char *
2066S_sv_exp_grow(pTHX_ SV *sv, STRLEN needed) {
2067 const STRLEN cur = SvCUR(sv);
2068 const STRLEN len = SvLEN(sv);
2069 STRLEN extend;
2070
2071 PERL_ARGS_ASSERT_SV_EXP_GROW;
2072
2073 if (len - cur > needed) return SvPVX(sv);
2074 extend = needed > len ? needed : len;
2075 return SvGROW(sv, len+extend+1);
2076}
2077
2078static SV *
2079S_sv_check_infnan(pTHX_ SV *sv, I32 datumtype)
2080{
2081 SvGETMAGIC(sv);
2082 if (UNLIKELY(SvAMAGIC(sv)))
2083 sv = sv_2num(sv);
2084 if (UNLIKELY(isinfnansv(sv))) {
2085 const I32 c = TYPE_NO_MODIFIERS(datumtype);
2086 const NV nv = SvNV_nomg(sv);
2087 if (c == 'w')
2088 Perl_croak(aTHX_ "Cannot compress %" NVgf " in pack", nv);
2089 else
2090 Perl_croak(aTHX_ "Cannot pack %" NVgf " with '%c'", nv, (int) c);
2091 }
2092 return sv;
2093}
2094
2095#define SvIV_no_inf(sv,d) \
2096 ((sv) = S_sv_check_infnan(aTHX_ sv,d), SvIV_nomg(sv))
2097#define SvUV_no_inf(sv,d) \
2098 ((sv) = S_sv_check_infnan(aTHX_ sv,d), SvUV_nomg(sv))
2099
2100STATIC
2101SV **
2102S_pack_rec(pTHX_ SV *cat, tempsym_t* symptr, SV **beglist, SV **endlist )
2103{
2104 tempsym_t lookahead;
2105 SSize_t items = endlist - beglist;
2106 bool found = next_symbol(symptr);
2107 bool utf8 = (symptr->flags & FLAG_PARSE_UTF8) ? 1 : 0;
2108 bool warn_utf8 = ckWARN(WARN_UTF8);
2109 char* from;
2110
2111 PERL_ARGS_ASSERT_PACK_REC;
2112
2113 if (symptr->level == 0 && found && symptr->code == 'U') {
2114 marked_upgrade(aTHX_ cat, symptr);
2115 symptr->flags |= FLAG_DO_UTF8;
2116 utf8 = 0;
2117 }
2118 symptr->strbeg = SvCUR(cat);
2119
2120 while (found) {
2121 SV *fromstr;
2122 STRLEN fromlen;
2123 SSize_t len;
2124 SV *lengthcode = NULL;
2125 I32 datumtype = symptr->code;
2126 howlen_t howlen = symptr->howlen;
2127 char *start = SvPVX(cat);
2128 char *cur = start + SvCUR(cat);
2129 bool needs_swap;
2130
2131#define NEXTFROM (lengthcode ? lengthcode : items > 0 ? (--items, *beglist++) : &PL_sv_no)
2132#define PEEKFROM (lengthcode ? lengthcode : items > 0 ? *beglist : &PL_sv_no)
2133
2134 switch (howlen) {
2135 case e_star:
2136 len = strchr("@Xxu", TYPE_NO_MODIFIERS(datumtype)) ?
2137 0 : items;
2138 break;
2139 default:
2140 /* e_no_len and e_number */
2141 len = symptr->length;
2142 break;
2143 }
2144
2145 if (len) {
2146 packprops_t props = packprops[TYPE_NO_ENDIANNESS(datumtype)];
2147
2148 if (props && !(props & PACK_SIZE_UNPREDICTABLE)) {
2149 /* We can process this letter. */
2150 STRLEN size = props & PACK_SIZE_MASK;
2151 GROWING2(utf8, cat, start, cur, size, (STRLEN)len);
2152 }
2153 }
2154
2155 /* Look ahead for next symbol. Do we have code/code? */
2156 lookahead = *symptr;
2157 found = next_symbol(&lookahead);
2158 if (symptr->flags & FLAG_SLASH) {
2159 IV count;
2160 if (!found) Perl_croak(aTHX_ "Code missing after '/' in pack");
2161 if (strchr("aAZ", lookahead.code)) {
2162 if (lookahead.howlen == e_number) count = lookahead.length;
2163 else {
2164 if (items > 0) {
2165 count = sv_len_utf8(*beglist);
2166 }
2167 else count = 0;
2168 if (lookahead.code == 'Z') count++;
2169 }
2170 } else {
2171 if (lookahead.howlen == e_number && lookahead.length < items)
2172 count = lookahead.length;
2173 else count = items;
2174 }
2175 lookahead.howlen = e_number;
2176 lookahead.length = count;
2177 lengthcode = sv_2mortal(newSViv(count));
2178 }
2179
2180 needs_swap = NEEDS_SWAP(datumtype);
2181
2182 /* Code inside the switch must take care to properly update
2183 cat (CUR length and '\0' termination) if it updated *cur and
2184 doesn't simply leave using break */
2185 switch (TYPE_NO_ENDIANNESS(datumtype)) {
2186 default:
2187 Perl_croak(aTHX_ "Invalid type '%c' in pack",
2188 (int) TYPE_NO_MODIFIERS(datumtype));
2189 case '%':
2190 Perl_croak(aTHX_ "'%%' may not be used in pack");
2191
2192 case '.' | TYPE_IS_SHRIEKING:
2193 case '.':
2194 if (howlen == e_star) from = start;
2195 else if (len == 0) from = cur;
2196 else {
2197 tempsym_t *group = symptr;
2198
2199 while (--len && group) group = group->previous;
2200 from = group ? start + group->strbeg : start;
2201 }
2202 fromstr = NEXTFROM;
2203 len = SvIV_no_inf(fromstr, datumtype);
2204 goto resize;
2205 case '@' | TYPE_IS_SHRIEKING:
2206 case '@':
2207 from = start + symptr->strbeg;
2208 resize:
2209 if (utf8 && !(datumtype & TYPE_IS_SHRIEKING))
2210 if (len >= 0) {
2211 while (len && from < cur) {
2212 from += UTF8SKIP(from);
2213 len--;
2214 }
2215 if (from > cur)
2216 Perl_croak(aTHX_ "Malformed UTF-8 string in pack");
2217 if (len) {
2218 /* Here we know from == cur */
2219 grow:
2220 GROWING(0, cat, start, cur, len);
2221 Zero(cur, len, char);
2222 cur += len;
2223 } else if (from < cur) {
2224 len = cur - from;
2225 goto shrink;
2226 } else goto no_change;
2227 } else {
2228 cur = from;
2229 len = -len;
2230 goto utf8_shrink;
2231 }
2232 else {
2233 len -= cur - from;
2234 if (len > 0) goto grow;
2235 if (len == 0) goto no_change;
2236 len = -len;
2237 goto shrink;
2238 }
2239 break;
2240
2241 case '(': {
2242 tempsym_t savsym = *symptr;
2243 U32 group_modifiers = TYPE_MODIFIERS(datumtype & ~symptr->flags);
2244 symptr->flags |= group_modifiers;
2245 symptr->patend = savsym.grpend;
2246 symptr->level++;
2247 symptr->previous = &lookahead;
2248 while (len--) {
2249 U32 was_utf8;
2250 if (utf8) symptr->flags |= FLAG_PARSE_UTF8;
2251 else symptr->flags &= ~FLAG_PARSE_UTF8;
2252 was_utf8 = SvUTF8(cat);
2253 symptr->patptr = savsym.grpbeg;
2254 beglist = pack_rec(cat, symptr, beglist, endlist);
2255 if (SvUTF8(cat) != was_utf8)
2256 /* This had better be an upgrade while in utf8==0 mode */
2257 utf8 = 1;
2258
2259 if (savsym.howlen == e_star && beglist == endlist)
2260 break; /* No way to continue */
2261 }
2262 items = endlist - beglist;
2263 lookahead.flags = symptr->flags & ~group_modifiers;
2264 goto no_change;
2265 }
2266 case 'X' | TYPE_IS_SHRIEKING:
2267 if (!len) /* Avoid division by 0 */
2268 len = 1;
2269 if (utf8) {
2270 char *hop, *last;
2271 SSize_t l = len;
2272 hop = last = start;
2273 while (hop < cur) {
2274 hop += UTF8SKIP(hop);
2275 if (--l == 0) {
2276 last = hop;
2277 l = len;
2278 }
2279 }
2280 if (last > cur)
2281 Perl_croak(aTHX_ "Malformed UTF-8 string in pack");
2282 cur = last;
2283 break;
2284 }
2285 len = (cur-start) % len;
2286 /* FALLTHROUGH */
2287 case 'X':
2288 if (utf8) {
2289 if (len < 1) goto no_change;
2290 utf8_shrink:
2291 while (len > 0) {
2292 if (cur <= start)
2293 Perl_croak(aTHX_ "'%c' outside of string in pack",
2294 (int) TYPE_NO_MODIFIERS(datumtype));
2295 while (--cur, UTF8_IS_CONTINUATION(*cur)) {
2296 if (cur <= start)
2297 Perl_croak(aTHX_ "'%c' outside of string in pack",
2298 (int) TYPE_NO_MODIFIERS(datumtype));
2299 }
2300 len--;
2301 }
2302 } else {
2303 shrink:
2304 if (cur - start < len)
2305 Perl_croak(aTHX_ "'%c' outside of string in pack",
2306 (int) TYPE_NO_MODIFIERS(datumtype));
2307 cur -= len;
2308 }
2309 if (cur < start+symptr->strbeg) {
2310 /* Make sure group starts don't point into the void */
2311 tempsym_t *group;
2312 const STRLEN length = cur-start;
2313 for (group = symptr;
2314 group && length < group->strbeg;
2315 group = group->previous) group->strbeg = length;
2316 lookahead.strbeg = length;
2317 }
2318 break;
2319 case 'x' | TYPE_IS_SHRIEKING: {
2320 SSize_t ai32;
2321 if (!len) /* Avoid division by 0 */
2322 len = 1;
2323 if (utf8) ai32 = utf8_length((U8 *) start, (U8 *) cur) % len;
2324 else ai32 = (cur - start) % len;
2325 if (ai32 == 0) goto no_change;
2326 len -= ai32;
2327 }
2328 /* FALLTHROUGH */
2329 case 'x':
2330 goto grow;
2331 case 'A':
2332 case 'Z':
2333 case 'a': {
2334 const char *aptr;
2335
2336 fromstr = NEXTFROM;
2337 aptr = SvPV_const(fromstr, fromlen);
2338 if (DO_UTF8(fromstr)) {
2339 const char *end, *s;
2340
2341 if (!utf8 && !SvUTF8(cat)) {
2342 marked_upgrade(aTHX_ cat, symptr);
2343 lookahead.flags |= FLAG_DO_UTF8;
2344 lookahead.strbeg = symptr->strbeg;
2345 utf8 = 1;
2346 start = SvPVX(cat);
2347 cur = start + SvCUR(cat);
2348 }
2349 if (howlen == e_star) {
2350 if (utf8) goto string_copy;
2351 len = fromlen+1;
2352 }
2353 s = aptr;
2354 end = aptr + fromlen;
2355 fromlen = datumtype == 'Z' ? len-1 : len;
2356 while ((SSize_t) fromlen > 0 && s < end) {
2357 s += UTF8SKIP(s);
2358 fromlen--;
2359 }
2360 if (s > end)
2361 Perl_croak(aTHX_ "Malformed UTF-8 string in pack");
2362 if (utf8) {
2363 len = fromlen;
2364 if (datumtype == 'Z') len++;
2365 fromlen = s-aptr;
2366 len += fromlen;
2367
2368 goto string_copy;
2369 }
2370 fromlen = len - fromlen;
2371 if (datumtype == 'Z') fromlen--;
2372 if (howlen == e_star) {
2373 len = fromlen;
2374 if (datumtype == 'Z') len++;
2375 }
2376 GROWING(0, cat, start, cur, len);
2377 if (!S_utf8_to_bytes(aTHX_ &aptr, end, cur, fromlen,
2378 datumtype | TYPE_IS_PACK))
2379 Perl_croak(aTHX_ "panic: predicted utf8 length not available, "
2380 "for '%c', aptr=%p end=%p cur=%p, fromlen=%zu",
2381 (int)datumtype, aptr, end, cur, fromlen);
2382 cur += fromlen;
2383 len -= fromlen;
2384 } else if (utf8) {
2385 if (howlen == e_star) {
2386 len = fromlen;
2387 if (datumtype == 'Z') len++;
2388 }
2389 if (len <= (SSize_t) fromlen) {
2390 fromlen = len;
2391 if (datumtype == 'Z' && fromlen > 0) fromlen--;
2392 }
2393 /* assumes a byte expands to at most UTF8_EXPAND bytes on
2394 upgrade, so:
2395 expected_length <= from_len*UTF8_EXPAND + (len-from_len) */
2396 GROWING(0, cat, start, cur, fromlen*(UTF8_EXPAND-1)+len);
2397 len -= fromlen;
2398 while (fromlen > 0) {
2399 cur = (char *) uvchr_to_utf8((U8 *) cur, * (U8 *) aptr);
2400 aptr++;
2401 fromlen--;
2402 }
2403 } else {
2404 string_copy:
2405 if (howlen == e_star) {
2406 len = fromlen;
2407 if (datumtype == 'Z') len++;
2408 }
2409 if (len <= (SSize_t) fromlen) {
2410 fromlen = len;
2411 if (datumtype == 'Z' && fromlen > 0) fromlen--;
2412 }
2413 GROWING(0, cat, start, cur, len);
2414 Copy(aptr, cur, fromlen, char);
2415 cur += fromlen;
2416 len -= fromlen;
2417 }
2418 memset(cur, datumtype == 'A' ? ' ' : '\0', len);
2419 cur += len;
2420 SvTAINT(cat);
2421 break;
2422 }
2423 case 'B':
2424 case 'b': {
2425 const char *str, *end;
2426 SSize_t l, field_len;
2427 U8 bits;
2428 bool utf8_source;
2429 U32 utf8_flags;
2430
2431 fromstr = NEXTFROM;
2432 str = SvPV_const(fromstr, fromlen);
2433 end = str + fromlen;
2434 if (DO_UTF8(fromstr)) {
2435 utf8_source = TRUE;
2436 utf8_flags = warn_utf8 ? 0 : UTF8_ALLOW_ANY;
2437 } else {
2438 utf8_source = FALSE;
2439 utf8_flags = 0; /* Unused, but keep compilers happy */
2440 }
2441 if (howlen == e_star) len = fromlen;
2442 field_len = (len+7)/8;
2443 GROWING(utf8, cat, start, cur, field_len);
2444 if (len > (SSize_t)fromlen) len = fromlen;
2445 bits = 0;
2446 l = 0;
2447 if (datumtype == 'B')
2448 while (l++ < len) {
2449 if (utf8_source) {
2450 UV val = 0;
2451 NEXT_UNI_VAL(val, cur, str, end, utf8_flags);
2452 bits |= val & 1;
2453 } else bits |= *str++ & 1;
2454 if (l & 7) bits <<= 1;
2455 else {
2456 PUSH_BYTE(utf8, cur, bits);
2457 bits = 0;
2458 }
2459 }
2460 else
2461 /* datumtype == 'b' */
2462 while (l++ < len) {
2463 if (utf8_source) {
2464 UV val = 0;
2465 NEXT_UNI_VAL(val, cur, str, end, utf8_flags);
2466 if (val & 1) bits |= 0x80;
2467 } else if (*str++ & 1)
2468 bits |= 0x80;
2469 if (l & 7) bits >>= 1;
2470 else {
2471 PUSH_BYTE(utf8, cur, bits);
2472 bits = 0;
2473 }
2474 }
2475 l--;
2476 if (l & 7) {
2477 if (datumtype == 'B')
2478 bits <<= 7 - (l & 7);
2479 else
2480 bits >>= 7 - (l & 7);
2481 PUSH_BYTE(utf8, cur, bits);
2482 l += 7;
2483 }
2484 /* Determine how many chars are left in the requested field */
2485 l /= 8;
2486 if (howlen == e_star) field_len = 0;
2487 else field_len -= l;
2488 Zero(cur, field_len, char);
2489 cur += field_len;
2490 break;
2491 }
2492 case 'H':
2493 case 'h': {
2494 const char *str, *end;
2495 SSize_t l, field_len;
2496 U8 bits;
2497 bool utf8_source;
2498 U32 utf8_flags;
2499
2500 fromstr = NEXTFROM;
2501 str = SvPV_const(fromstr, fromlen);
2502 end = str + fromlen;
2503 if (DO_UTF8(fromstr)) {
2504 utf8_source = TRUE;
2505 utf8_flags = warn_utf8 ? 0 : UTF8_ALLOW_ANY;
2506 } else {
2507 utf8_source = FALSE;
2508 utf8_flags = 0; /* Unused, but keep compilers happy */
2509 }
2510 if (howlen == e_star) len = fromlen;
2511 field_len = (len+1)/2;
2512 GROWING(utf8, cat, start, cur, field_len);
2513 if (!utf8_source && len > (SSize_t)fromlen) len = fromlen;
2514 bits = 0;
2515 l = 0;
2516 if (datumtype == 'H')
2517 while (l++ < len) {
2518 if (utf8_source) {
2519 UV val = 0;
2520 NEXT_UNI_VAL(val, cur, str, end, utf8_flags);
2521 if (val < 256 && isALPHA(val))
2522 bits |= (val + 9) & 0xf;
2523 else
2524 bits |= val & 0xf;
2525 } else if (isALPHA(*str))
2526 bits |= (*str++ + 9) & 0xf;
2527 else
2528 bits |= *str++ & 0xf;
2529 if (l & 1) bits <<= 4;
2530 else {
2531 PUSH_BYTE(utf8, cur, bits);
2532 bits = 0;
2533 }
2534 }
2535 else
2536 while (l++ < len) {
2537 if (utf8_source) {
2538 UV val = 0;
2539 NEXT_UNI_VAL(val, cur, str, end, utf8_flags);
2540 if (val < 256 && isALPHA(val))
2541 bits |= ((val + 9) & 0xf) << 4;
2542 else
2543 bits |= (val & 0xf) << 4;
2544 } else if (isALPHA(*str))
2545 bits |= ((*str++ + 9) & 0xf) << 4;
2546 else
2547 bits |= (*str++ & 0xf) << 4;
2548 if (l & 1) bits >>= 4;
2549 else {
2550 PUSH_BYTE(utf8, cur, bits);
2551 bits = 0;
2552 }
2553 }
2554 l--;
2555 if (l & 1) {
2556 PUSH_BYTE(utf8, cur, bits);
2557 l++;
2558 }
2559 /* Determine how many chars are left in the requested field */
2560 l /= 2;
2561 if (howlen == e_star) field_len = 0;
2562 else field_len -= l;
2563 Zero(cur, field_len, char);
2564 cur += field_len;
2565 break;
2566 }
2567 case 'c':
2568 while (len-- > 0) {
2569 IV aiv;
2570 fromstr = NEXTFROM;
2571 aiv = SvIV_no_inf(fromstr, datumtype);
2572 if ((-128 > aiv || aiv > 127))
2573 Perl_ck_warner(aTHX_ packWARN(WARN_PACK),
2574 "Character in 'c' format wrapped in pack");
2575 PUSH_BYTE(utf8, cur, (U8)(aiv & 0xff));
2576 }
2577 break;
2578 case 'C':
2579 if (len == 0) {
2580 utf8 = (symptr->flags & FLAG_DO_UTF8) ? 1 : 0;
2581 break;
2582 }
2583 while (len-- > 0) {
2584 IV aiv;
2585 fromstr = NEXTFROM;
2586 aiv = SvIV_no_inf(fromstr, datumtype);
2587 if ((0 > aiv || aiv > 0xff))
2588 Perl_ck_warner(aTHX_ packWARN(WARN_PACK),
2589 "Character in 'C' format wrapped in pack");
2590 PUSH_BYTE(utf8, cur, (U8)(aiv & 0xff));
2591 }
2592 break;
2593 case 'W': {
2594 char *end;
2595 U8 in_bytes = (U8)IN_BYTES;
2596
2597 end = start+SvLEN(cat)-1;
2598 if (utf8) end -= UTF8_MAXLEN-1;
2599 while (len-- > 0) {
2600 UV auv;
2601 fromstr = NEXTFROM;
2602 auv = SvUV_no_inf(fromstr, datumtype);
2603 if (in_bytes) auv = auv % 0x100;
2604 if (utf8) {
2605 W_utf8:
2606 if (cur >= end) {
2607 *cur = '\0';
2608 SvCUR_set(cat, cur - start);
2609
2610 GROWING(0, cat, start, cur, len+UTF8_MAXLEN);
2611 end = start+SvLEN(cat)-UTF8_MAXLEN;
2612 }
2613 cur = (char *) uvchr_to_utf8_flags((U8 *) cur, auv, 0);
2614 } else {
2615 if (auv >= 0x100) {
2616 if (!SvUTF8(cat)) {
2617 *cur = '\0';
2618 SvCUR_set(cat, cur - start);
2619 marked_upgrade(aTHX_ cat, symptr);
2620 lookahead.flags |= FLAG_DO_UTF8;
2621 lookahead.strbeg = symptr->strbeg;
2622 utf8 = 1;
2623 start = SvPVX(cat);
2624 cur = start + SvCUR(cat);
2625 end = start+SvLEN(cat)-UTF8_MAXLEN;
2626 goto W_utf8;
2627 }
2628 Perl_ck_warner(aTHX_ packWARN(WARN_PACK),
2629 "Character in 'W' format wrapped in pack");
2630 auv &= 0xff;
2631 }
2632 if (cur >= end) {
2633 *cur = '\0';
2634 SvCUR_set(cat, cur - start);
2635 GROWING(0, cat, start, cur, len+1);
2636 end = start+SvLEN(cat)-1;
2637 }
2638 *(U8 *) cur++ = (U8)auv;
2639 }
2640 }
2641 break;
2642 }
2643 case 'U': {
2644 char *end;
2645
2646 if (len == 0) {
2647 if (!(symptr->flags & FLAG_DO_UTF8)) {
2648 marked_upgrade(aTHX_ cat, symptr);
2649 lookahead.flags |= FLAG_DO_UTF8;
2650 lookahead.strbeg = symptr->strbeg;
2651 }
2652 utf8 = 0;
2653 goto no_change;
2654 }
2655
2656 end = start+SvLEN(cat);
2657 if (!utf8) end -= UTF8_MAXLEN;
2658 while (len-- > 0) {
2659 UV auv;
2660 fromstr = NEXTFROM;
2661 auv = SvUV_no_inf(fromstr, datumtype);
2662 if (utf8) {
2663 U8 buffer[UTF8_MAXLEN+1], *endb;
2664 endb = uvchr_to_utf8_flags(buffer, UNI_TO_NATIVE(auv), 0);
2665 if (cur+(endb-buffer)*UTF8_EXPAND >= end) {
2666 *cur = '\0';
2667 SvCUR_set(cat, cur - start);
2668 GROWING(0, cat, start, cur,
2669 len+(endb-buffer)*UTF8_EXPAND);
2670 end = start+SvLEN(cat);
2671 }
2672 cur = my_bytes_to_utf8(buffer, endb-buffer, cur, 0);
2673 } else {
2674 if (cur >= end) {
2675 *cur = '\0';
2676 SvCUR_set(cat, cur - start);
2677 GROWING(0, cat, start, cur, len+UTF8_MAXLEN);
2678 end = start+SvLEN(cat)-UTF8_MAXLEN;
2679 }
2680 cur = (char *) uvchr_to_utf8_flags((U8 *) cur,
2681 UNI_TO_NATIVE(auv),
2682 0);
2683 }
2684 }
2685 break;
2686 }
2687 /* Float and double added by gnb@melba.bby.oz.au 22/11/89 */
2688 case 'f':
2689 while (len-- > 0) {
2690 float afloat;
2691 NV anv;
2692 fromstr = NEXTFROM;
2693 anv = SvNV(fromstr);
2694# if (defined(VMS) && !defined(_IEEE_FP)) || defined(DOUBLE_IS_VAX_FLOAT)
2695 /* IEEE fp overflow shenanigans are unavailable on VAX and optional
2696 * on Alpha; fake it if we don't have them.
2697 */
2698 if (anv > FLT_MAX)
2699 afloat = FLT_MAX;
2700 else if (anv < -FLT_MAX)
2701 afloat = -FLT_MAX;
2702 else afloat = (float)anv;
2703# else
2704# if defined(NAN_COMPARE_BROKEN) && defined(Perl_isnan)
2705 if(Perl_isnan(anv))
2706 afloat = (float)NV_NAN;
2707 else
2708# endif
2709# ifdef NV_INF
2710 /* a simple cast to float is undefined if outside
2711 * the range of values that can be represented */
2712 afloat = (float)(anv > FLT_MAX ? NV_INF :
2713 anv < -FLT_MAX ? -NV_INF : anv);
2714# endif
2715# endif
2716 PUSH_VAR(utf8, cur, afloat, needs_swap);
2717 }
2718 break;
2719 case 'd':
2720 while (len-- > 0) {
2721 double adouble;
2722 NV anv;
2723 fromstr = NEXTFROM;
2724 anv = SvNV(fromstr);
2725# if (defined(VMS) && !defined(_IEEE_FP)) || defined(DOUBLE_IS_VAX_FLOAT)
2726 /* IEEE fp overflow shenanigans are unavailable on VAX and optional
2727 * on Alpha; fake it if we don't have them.
2728 */
2729 if (anv > DBL_MAX)
2730 adouble = DBL_MAX;
2731 else if (anv < -DBL_MAX)
2732 adouble = -DBL_MAX;
2733 else adouble = (double)anv;
2734# else
2735 adouble = (double)anv;
2736# endif
2737 PUSH_VAR(utf8, cur, adouble, needs_swap);
2738 }
2739 break;
2740 case 'F': {
2741 NV_bytes anv;
2742 Zero(&anv, 1, NV); /* can be long double with unused bits */
2743 while (len-- > 0) {
2744 fromstr = NEXTFROM;
2745#ifdef __GNUC__
2746 /* to work round a gcc/x86 bug; don't use SvNV */
2747 anv.nv = sv_2nv(fromstr);
2748# if defined(LONGDOUBLE_X86_80_BIT) && defined(USE_LONG_DOUBLE) \
2749 && LONG_DOUBLESIZE > 10
2750 /* GCC sometimes overwrites the padding in the
2751 assignment above */
2752 Zero(anv.bytes+10, sizeof(anv.bytes) - 10, U8);
2753# endif
2754#else
2755 anv.nv = SvNV(fromstr);
2756#endif
2757 PUSH_BYTES(utf8, cur, anv.bytes, sizeof(anv.bytes), needs_swap);
2758 }
2759 break;
2760 }
2761#if defined(HAS_LONG_DOUBLE) && defined(USE_LONG_DOUBLE)
2762 case 'D': {
2763 ld_bytes aldouble;
2764 /* long doubles can have unused bits, which may be nonzero */
2765 Zero(&aldouble, 1, long double);
2766 while (len-- > 0) {
2767 fromstr = NEXTFROM;
2768# ifdef __GNUC__
2769 /* to work round a gcc/x86 bug; don't use SvNV */
2770 aldouble.ld = (long double)sv_2nv(fromstr);
2771# if defined(LONGDOUBLE_X86_80_BIT) && LONG_DOUBLESIZE > 10
2772 /* GCC sometimes overwrites the padding in the
2773 assignment above */
2774 Zero(aldouble.bytes+10, sizeof(aldouble.bytes) - 10, U8);
2775# endif
2776# else
2777 aldouble.ld = (long double)SvNV(fromstr);
2778# endif
2779 PUSH_BYTES(utf8, cur, aldouble.bytes, sizeof(aldouble.bytes),
2780 needs_swap);
2781 }
2782 break;
2783 }
2784#endif
2785 case 'n' | TYPE_IS_SHRIEKING:
2786 case 'n':
2787 while (len-- > 0) {
2788 I16 ai16;
2789 fromstr = NEXTFROM;
2790 ai16 = (I16)SvIV_no_inf(fromstr, datumtype);
2791 ai16 = PerlSock_htons(ai16);
2792 PUSH16(utf8, cur, &ai16, FALSE);
2793 }
2794 break;
2795 case 'v' | TYPE_IS_SHRIEKING:
2796 case 'v':
2797 while (len-- > 0) {
2798 I16 ai16;
2799 fromstr = NEXTFROM;
2800 ai16 = (I16)SvIV_no_inf(fromstr, datumtype);
2801 ai16 = htovs(ai16);
2802 PUSH16(utf8, cur, &ai16, FALSE);
2803 }
2804 break;
2805 case 'S' | TYPE_IS_SHRIEKING:
2806#if SHORTSIZE != SIZE16
2807 while (len-- > 0) {
2808 unsigned short aushort;
2809 fromstr = NEXTFROM;
2810 aushort = SvUV_no_inf(fromstr, datumtype);
2811 PUSH_VAR(utf8, cur, aushort, needs_swap);
2812 }
2813 break;
2814#else
2815 /* FALLTHROUGH */
2816#endif
2817 case 'S':
2818 while (len-- > 0) {
2819 U16 au16;
2820 fromstr = NEXTFROM;
2821 au16 = (U16)SvUV_no_inf(fromstr, datumtype);
2822 PUSH16(utf8, cur, &au16, needs_swap);
2823 }
2824 break;
2825 case 's' | TYPE_IS_SHRIEKING:
2826#if SHORTSIZE != SIZE16
2827 while (len-- > 0) {
2828 short ashort;
2829 fromstr = NEXTFROM;
2830 ashort = SvIV_no_inf(fromstr, datumtype);
2831 PUSH_VAR(utf8, cur, ashort, needs_swap);
2832 }
2833 break;
2834#else
2835 /* FALLTHROUGH */
2836#endif
2837 case 's':
2838 while (len-- > 0) {
2839 I16 ai16;
2840 fromstr = NEXTFROM;
2841 ai16 = (I16)SvIV_no_inf(fromstr, datumtype);
2842 PUSH16(utf8, cur, &ai16, needs_swap);
2843 }
2844 break;
2845 case 'I':
2846 case 'I' | TYPE_IS_SHRIEKING:
2847 while (len-- > 0) {
2848 unsigned int auint;
2849 fromstr = NEXTFROM;
2850 auint = SvUV_no_inf(fromstr, datumtype);
2851 PUSH_VAR(utf8, cur, auint, needs_swap);
2852 }
2853 break;
2854 case 'j':
2855 while (len-- > 0) {
2856 IV aiv;
2857 fromstr = NEXTFROM;
2858 aiv = SvIV_no_inf(fromstr, datumtype);
2859 PUSH_VAR(utf8, cur, aiv, needs_swap);
2860 }
2861 break;
2862 case 'J':
2863 while (len-- > 0) {
2864 UV auv;
2865 fromstr = NEXTFROM;
2866 auv = SvUV_no_inf(fromstr, datumtype);
2867 PUSH_VAR(utf8, cur, auv, needs_swap);
2868 }
2869 break;
2870 case 'w':
2871 while (len-- > 0) {
2872 NV anv;
2873 fromstr = NEXTFROM;
2874 S_sv_check_infnan(aTHX_ fromstr, datumtype);
2875 anv = SvNV_nomg(fromstr);
2876
2877 if (anv < 0) {
2878 *cur = '\0';
2879 SvCUR_set(cat, cur - start);
2880 Perl_croak(aTHX_ "Cannot compress negative numbers in pack");
2881 }
2882
2883 /* 0xFFFFFFFFFFFFFFFF may cast to 18446744073709551616.0,
2884 which is == UV_MAX_P1. IOK is fine (instead of UV_only), as
2885 any negative IVs will have already been got by the croak()
2886 above. IOK is untrue for fractions, so we test them
2887 against UV_MAX_P1. */
2888 if (SvIOK(fromstr) || anv < UV_MAX_P1) {
2889 char buf[(sizeof(UV)*CHAR_BIT)/7+1];
2890 char *in = buf + sizeof(buf);
2891 UV auv = SvUV_nomg(fromstr);
2892
2893 do {
2894 *--in = (char)((auv & 0x7f) | 0x80);
2895 auv >>= 7;
2896 } while (auv);
2897 buf[sizeof(buf) - 1] &= 0x7f; /* clear continue bit */
2898 PUSH_GROWING_BYTES(utf8, cat, start, cur,
2899 in, (buf + sizeof(buf)) - in);
2900 } else if (SvPOKp(fromstr))
2901 goto w_string;
2902 else if (SvNOKp(fromstr)) {
2903 /* 10**NV_MAX_10_EXP is the largest power of 10
2904 so 10**(NV_MAX_10_EXP+1) is definitely unrepresentable
2905 given 10**(NV_MAX_10_EXP+1) == 128 ** x solve for x:
2906 x = (NV_MAX_10_EXP+1) * log (10) / log (128)
2907 And with that many bytes only Inf can overflow.
2908 Some C compilers are strict about integral constant
2909 expressions so we conservatively divide by a slightly
2910 smaller integer instead of multiplying by the exact
2911 floating-point value.
2912 */
2913#ifdef NV_MAX_10_EXP
2914 /* char buf[1 + (int)((NV_MAX_10_EXP + 1) * 0.47456)]; -- invalid C */
2915 char buf[1 + (int)((NV_MAX_10_EXP + 1) / 2)]; /* valid C */
2916#else
2917 /* char buf[1 + (int)((308 + 1) * 0.47456)]; -- invalid C */
2918 char buf[1 + (int)((308 + 1) / 2)]; /* valid C */
2919#endif
2920 char *in = buf + sizeof(buf);
2921
2922 anv = Perl_floor(anv);
2923 do {
2924 const NV next = Perl_floor(anv / 128);
2925 if (in <= buf) /* this cannot happen ;-) */
2926 Perl_croak(aTHX_ "Cannot compress integer in pack");
2927 *--in = (unsigned char)(anv - (next * 128)) | 0x80;
2928 anv = next;
2929 } while (anv > 0);
2930 buf[sizeof(buf) - 1] &= 0x7f; /* clear continue bit */
2931 PUSH_GROWING_BYTES(utf8, cat, start, cur,
2932 in, (buf + sizeof(buf)) - in);
2933 } else {
2934 const char *from;
2935 char *result, *in;
2936 SV *norm;
2937 STRLEN len;
2938 bool done;
2939
2940 w_string:
2941 /* Copy string and check for compliance */
2942 from = SvPV_nomg_const(fromstr, len);
2943 if ((norm = is_an_int(from, len)) == NULL)
2944 Perl_croak(aTHX_ "Can only compress unsigned integers in pack");
2945
2946 Newx(result, len, char);
2947 in = result + len;
2948 done = FALSE;
2949 while (!done) *--in = div128(norm, &done) | 0x80;
2950 result[len - 1] &= 0x7F; /* clear continue bit */
2951 PUSH_GROWING_BYTES(utf8, cat, start, cur,
2952 in, (result + len) - in);
2953 Safefree(result);
2954 SvREFCNT_dec(norm); /* free norm */
2955 }
2956 }
2957 break;
2958 case 'i':
2959 case 'i' | TYPE_IS_SHRIEKING:
2960 while (len-- > 0) {
2961 int aint;
2962 fromstr = NEXTFROM;
2963 aint = SvIV_no_inf(fromstr, datumtype);
2964 PUSH_VAR(utf8, cur, aint, needs_swap);
2965 }
2966 break;
2967 case 'N' | TYPE_IS_SHRIEKING:
2968 case 'N':
2969 while (len-- > 0) {
2970 U32 au32;
2971 fromstr = NEXTFROM;
2972 au32 = SvUV_no_inf(fromstr, datumtype);
2973 au32 = PerlSock_htonl(au32);
2974 PUSH32(utf8, cur, &au32, FALSE);
2975 }
2976 break;
2977 case 'V' | TYPE_IS_SHRIEKING:
2978 case 'V':
2979 while (len-- > 0) {
2980 U32 au32;
2981 fromstr = NEXTFROM;
2982 au32 = SvUV_no_inf(fromstr, datumtype);
2983 au32 = htovl(au32);
2984 PUSH32(utf8, cur, &au32, FALSE);
2985 }
2986 break;
2987 case 'L' | TYPE_IS_SHRIEKING:
2988#if LONGSIZE != SIZE32
2989 while (len-- > 0) {
2990 unsigned long aulong;
2991 fromstr = NEXTFROM;
2992 aulong = SvUV_no_inf(fromstr, datumtype);
2993 PUSH_VAR(utf8, cur, aulong, needs_swap);
2994 }
2995 break;
2996#else
2997 /* Fall though! */
2998#endif
2999 case 'L':
3000 while (len-- > 0) {
3001 U32 au32;
3002 fromstr = NEXTFROM;
3003 au32 = SvUV_no_inf(fromstr, datumtype);
3004 PUSH32(utf8, cur, &au32, needs_swap);
3005 }
3006 break;
3007 case 'l' | TYPE_IS_SHRIEKING:
3008#if LONGSIZE != SIZE32
3009 while (len-- > 0) {
3010 long along;
3011 fromstr = NEXTFROM;
3012 along = SvIV_no_inf(fromstr, datumtype);
3013 PUSH_VAR(utf8, cur, along, needs_swap);
3014 }
3015 break;
3016#else
3017 /* Fall though! */
3018#endif
3019 case 'l':
3020 while (len-- > 0) {
3021 I32 ai32;
3022 fromstr = NEXTFROM;
3023 ai32 = SvIV_no_inf(fromstr, datumtype);
3024 PUSH32(utf8, cur, &ai32, needs_swap);
3025 }
3026 break;
3027#if defined(HAS_QUAD) && IVSIZE >= 8
3028 case 'Q':
3029 while (len-- > 0) {
3030 Uquad_t auquad;
3031 fromstr = NEXTFROM;
3032 auquad = (Uquad_t) SvUV_no_inf(fromstr, datumtype);
3033 PUSH_VAR(utf8, cur, auquad, needs_swap);
3034 }
3035 break;
3036 case 'q':
3037 while (len-- > 0) {
3038 Quad_t aquad;
3039 fromstr = NEXTFROM;
3040 aquad = (Quad_t)SvIV_no_inf(fromstr, datumtype);
3041 PUSH_VAR(utf8, cur, aquad, needs_swap);
3042 }
3043 break;
3044#endif
3045 case 'P':
3046 len = 1; /* assume SV is correct length */
3047 GROWING(utf8, cat, start, cur, sizeof(char *));
3048 /* FALLTHROUGH */
3049 case 'p':
3050 while (len-- > 0) {
3051 const char *aptr;
3052
3053 fromstr = NEXTFROM;
3054 SvGETMAGIC(fromstr);
3055 if (!SvOK(fromstr)) aptr = NULL;
3056 else {
3057 /* XXX better yet, could spirit away the string to
3058 * a safe spot and hang on to it until the result
3059 * of pack() (and all copies of the result) are
3060 * gone.
3061 */
3062 if (((SvTEMP(fromstr) && SvREFCNT(fromstr) == 1)
3063 || (SvPADTMP(fromstr) &&
3064 !SvREADONLY(fromstr)))) {
3065 Perl_ck_warner(aTHX_ packWARN(WARN_PACK),
3066 "Attempt to pack pointer to temporary value");
3067 }
3068 if (SvPOK(fromstr) || SvNIOK(fromstr))
3069 aptr = SvPV_nomg_const_nolen(fromstr);
3070 else
3071 aptr = SvPV_force_flags_nolen(fromstr, 0);
3072 }
3073 PUSH_VAR(utf8, cur, aptr, needs_swap);
3074 }
3075 break;
3076 case 'u': {
3077 const char *aptr, *aend;
3078 bool from_utf8;
3079
3080 fromstr = NEXTFROM;
3081 if (len <= 2) len = 45;
3082 else len = len / 3 * 3;
3083 if (len >= 64) {
3084 Perl_ck_warner(aTHX_ packWARN(WARN_PACK),
3085 "Field too wide in 'u' format in pack");
3086 len = 63;
3087 }
3088 aptr = SvPV_const(fromstr, fromlen);
3089 from_utf8 = DO_UTF8(fromstr);
3090 if (from_utf8) {
3091 aend = aptr + fromlen;
3092 fromlen = sv_len_utf8_nomg(fromstr);
3093 } else aend = NULL; /* Unused, but keep compilers happy */
3094 GROWING(utf8, cat, start, cur, (fromlen+2) / 3 * 4 + (fromlen+len-1)/len * 2);
3095 while (fromlen > 0) {
3096 U8 *end;
3097 SSize_t todo;
3098 U8 hunk[1+63/3*4+1];
3099
3100 if ((SSize_t)fromlen > len)
3101 todo = len;
3102 else
3103 todo = fromlen;
3104 if (from_utf8) {
3105 char buffer[64];
3106 if (!S_utf8_to_bytes(aTHX_ &aptr, aend, buffer, todo,
3107 'u' | TYPE_IS_PACK)) {
3108 *cur = '\0';
3109 SvCUR_set(cat, cur - start);
3110 Perl_croak(aTHX_ "panic: string is shorter than advertised, "
3111 "aptr=%p, aend=%p, buffer=%p, todo=%zd",
3112 aptr, aend, buffer, todo);
3113 }
3114 end = doencodes(hunk, (const U8 *)buffer, todo);
3115 } else {
3116 end = doencodes(hunk, (const U8 *)aptr, todo);
3117 aptr += todo;
3118 }
3119 PUSH_BYTES(utf8, cur, hunk, end-hunk, 0);
3120 fromlen -= todo;
3121 }
3122 break;
3123 }
3124 }
3125 *cur = '\0';
3126 SvCUR_set(cat, cur - start);
3127 no_change:
3128 *symptr = lookahead;
3129 }
3130 return beglist;
3131}
3132#undef NEXTFROM
3133
3134
3135PP(pp_pack)
3136{
3137 dSP; dMARK; dORIGMARK; dTARGET;
3138 SV *cat = TARG;
3139 STRLEN fromlen;
3140 SV *pat_sv = *++MARK;
3141 const char *pat = SvPV_const(pat_sv, fromlen);
3142 const char *patend = pat + fromlen;
3143
3144 MARK++;
3145 SvPVCLEAR(cat);
3146 SvUTF8_off(cat);
3147
3148 packlist(cat, pat, patend, MARK, SP + 1);
3149
3150 SvSETMAGIC(cat);
3151 SP = ORIGMARK;
3152 PUSHs(cat);
3153 RETURN;
3154}
3155
3156/*
3157 * ex: set ts=8 sts=4 sw=4 et:
3158 */