package Data::Dumper;
BEGIN {
- $VERSION = '2.157'; # Don't forget to set version and release
+ $VERSION = '2.158'; # Don't forget to set version and release
} # date in POD below!
#$| = 1;
or $Useperl = 1;
}
+my $IS_ASCII = ord 'A' == 65;
+
# module vars and their defaults
$Indent = 2 unless defined $Indent;
$Purity = 0 unless defined $Purity;
sub Dump {
return &Dumpxs
- unless $Data::Dumper::Useperl || (ref($_[0]) && $_[0]->{useperl}) ||
- $Data::Dumper::Deparse || (ref($_[0]) && $_[0]->{deparse});
+ unless $Data::Dumper::Useperl || (ref($_[0]) && $_[0]->{useperl})
+ || $Data::Dumper::Deparse || (ref($_[0]) && $_[0]->{deparse})
+
+ # Use pure perl version on earlier releases on EBCDIC platforms
+ || (! $IS_ASCII && $] lt 5.021_010);
return &Dumpperl;
}
"\e" => "\\e",
);
+my $low_controls = ($IS_ASCII)
+
+ # This includes \177, because traditionally it has been
+ # output as octal, even though it isn't really a "low"
+ # control
+ ? qr/[\0-\x1f\177]/
+
+ # EBCDIC low controls.
+ : qr/[\0-\x3f]/;
+
# put a string value in double quotes
sub qquote {
local($_) = shift;
s/([\\\"\@\$])/\\$1/g;
+
+ # This efficiently changes the high ordinal characters to \x{} if the utf8
+ # flag is on. On ASCII platforms, the high ordinals are all the
+ # non-ASCII's. On EBCDIC platforms, we don't include in these the non-ASCII
+ # controls whose ordinals are less than SPACE, excluded below by the range
+ # \0-\x3f. On ASCII platforms this range just compiles as part of :ascii:.
+ # On EBCDIC platforms, there is just one outlier high ordinal control, and
+ # it gets output as \x{}.
my $bytes; { use bytes; $bytes = length }
- s/([[:^ascii:]])/'\x{'.sprintf("%x",ord($1)).'}'/ge if $bytes > length;
- return qq("$_") unless
- /[^ !"\#\$%&'()*+,\-.\/0-9:;<=>?\@A-Z[\\\]^_`a-z{|}~]/; # fast exit
+ s/([^[:ascii:]\0-\x3f])/sprintf("\\x{%x}",ord($1))/ge
+ if $bytes > length
- my $high = shift || "";
+ # The above doesn't get the EBCDIC outlier high ordinal control when
+ # the string is UTF-8 but there are no UTF-8 variant characters in it.
+ # We want that to come out as \x{} anyway. We need is_utf8() to do
+ # this.
+ || (! $IS_ASCII && $] ge 5.008_001 && utf8::is_utf8($_));
+
+ return qq("$_") if / ^ [[:print:]]* $ /x; # fast exit
+
+ # Here, there is at least one non-printable to output. First, translate the
+ # escapes.
s/([\a\b\t\n\f\r\e])/$esc{$1}/g;
- if (ord('^')==94) { # ascii
- # no need for 3 digits in escape for these
- s/([\0-\037])(?!\d)/'\\'.sprintf('%o',ord($1))/eg;
- s/([\0-\037\177])/'\\'.sprintf('%03o',ord($1))/eg;
+ # no need for 3 digits in escape for octals not followed by a digit.
+ s/($low_controls)(?!\d)/'\\'.sprintf('%o',ord($1))/eg;
+
+ # But otherwise use 3 digits
+ s/($low_controls)/'\\'.sprintf('%03o',ord($1))/eg;
+
# all but last branch below not supported --BEHAVIOR SUBJECT TO CHANGE--
- if ($high eq "iso8859") {
- s/([\200-\240])/'\\'.sprintf('%o',ord($1))/eg;
+ my $high = shift || "";
+ if ($high eq "iso8859") { # Doesn't escape the Latin1 printables
+ if ($IS_ASCII) {
+ s/([\200-\240])/'\\'.sprintf('%o',ord($1))/eg;
+ }
+ elsif ($] ge 5.007_003) {
+ my $high_control = utf8::unicode_to_native(0x9F);
+ s/$high_control/sprintf('\\%o',ord($1))/eg;
+ }
} elsif ($high eq "utf8") {
+# Some discussion of what to do here is in
+# https://rt.perl.org/Ticket/Display.html?id=113088
# use utf8;
# $str =~ s/([^\040-\176])/sprintf "\\x{%04x}", ord($1)/ge;
} elsif ($high eq "8bit") {
# leave it as it is
} else {
- s/([\200-\377])/'\\'.sprintf('%03o',ord($1))/eg;
- s/([^\040-\176])/sprintf "\\x{%04x}", ord($1)/ge;
+ s/([[:^ascii:]])/'\\'.sprintf('%03o',ord($1))/eg;
+ #s/([^\040-\176])/sprintf "\\x{%04x}", ord($1)/ge;
}
- }
- else { # ebcdic
- s{([^ !"\#\$%&'()*+,\-.\/0-9:;<=>?\@A-Z[\\\]^_`a-z{|}~])(?!\d)}
- {my $v = ord($1); '\\'.sprintf(($v <= 037 ? '%o' : '%03o'), $v)}eg;
- s{([^ !"\#\$%&'()*+,\-.\/0-9:;<=>?\@A-Z[\\\]^_`a-z{|}~])}
- {'\\'.sprintf('%03o',ord($1))}eg;
- }
return qq("$_");
}
=head1 VERSION
-Version 2.157 (January 29 2014)
+Version 2.158 (March 13 2015)
=head1 SEE ALSO
# define DD_USE_OLD_ID_FORMAT
#endif
+/* These definitions are ASCII only. But the pure-perl .pm avoids
+ * calling this .xs file for releases where they aren't defined */
+
+#ifndef isASCII
+# define isASCII(c) (((UV) (c)) < 128)
+#endif
+
+#ifndef ESC_NATIVE /* \e */
+# define ESC_NATIVE 27
+#endif
+
+#ifndef isPRINT
+# define isPRINT(c) (((UV) (c)) >= ' ' && ((UV) (c)) < 127)
+#endif
+
+#ifndef isALPHA
+# define isALPHA(c) ( (((UV) (c)) >= 'a' && ((UV) (c)) <= 'z') \
+ || (((UV) (c)) <= 'Z' && ((UV) (c)) >= 'A'))
+#endif
+
+#ifndef isIDFIRST
+# define isIDFIRST(c) (isALPHA(c) || (c) == '_')
+#endif
+
#ifndef isWORDCHAR
-# define isWORDCHAR(c) isALNUM(c)
+# define isWORDCHAR(c) (isIDFIRST(c) \
+ || (((UV) (c)) >= '0' && ((UV) (c)) <= '9'))
#endif
static I32 num_q (const char *s, STRLEN slen);
#if PERL_VERSION <= 6 /* Perl 5.6 and earlier */
-# ifdef EBCDIC
-# define UNI_TO_NATIVE(ch) (((ch) > 255) ? (ch) : ASCII_TO_NATIVE(ch))
-# else
-# define UNI_TO_NATIVE(ch) (ch)
-# endif
-
UV
Perl_utf8_to_uvchr_buf(pTHX_ U8 *s, U8 *send, STRLEN *retlen)
{
* end of the buffer if there is a malformation that indicates the
* character is longer than the space available */
- const UV uv = utf8_to_uvchr(s, retlen);
- return UNI_TO_NATIVE(uv);
+ return utf8_to_uvchr(s, retlen);
}
# if !defined(PERL_IMPLICIT_CONTEXT)
STRLEN qq_escapables = 0; /* " $ @ will need a \ in "" strings. */
STRLEN normal = 0;
int increment;
- UV next;
-
- /* this will need EBCDICification */
- for (s = src; s < send; do_utf8 ? s += increment : s++) {
- const UV k = do_utf8 ? utf8_to_uvchr_buf((U8*)s, (U8*) send, NULL) : *(U8*)s;
- /* check for invalid utf8 */
- increment = (k == 0 && *s != '\0') ? 1 : UTF8SKIP(s);
+ for (s = src; s < send; s += increment) { /* Sizing pass */
+ UV k = *(U8*)s;
- /* this is only used to check if the next character is an
- * ASCII digit, which are invariant, so if the following collects
- * a UTF-8 start byte it does no harm
- */
- next = (s + increment >= send ) ? 0 : *(U8*)(s+increment);
+ increment = 1; /* Will override if necessary for utf-8 */
-#ifdef EBCDIC
- if (!isprint(k) || k > 256) {
-#else
- if (k > 127) {
-#endif
- /* 4: \x{} then count the number of hex digits. */
- grow += 4 + (k <= 0xFF ? 2 : k <= 0xFFF ? 3 : k <= 0xFFFF ? 4 :
+ if (isPRINT(k)) {
+ if (k == '\\') {
+ backslashes++;
+ } else if (k == '\'') {
+ single_quotes++;
+ } else if (k == '"' || k == '$' || k == '@') {
+ qq_escapables++;
+ } else {
+ normal++;
+ }
+ }
+ else if (! isASCII(k) && k > ' ') {
+ /* High ordinal non-printable code point. (The test that k is
+ * above SPACE should be optimized out by the compiler on
+ * non-EBCDIC platforms; otherwise we could put an #ifdef around
+ * it, but it's better to have just a single code path when
+ * possible. All but one of the non-ASCII EBCDIC controls are low
+ * ordinal; that one is the only one above SPACE.)
+ *
+ * If UTF-8, output as hex, regardless of useqq. This means there
+ * is an overhead of 4 chars '\x{}'. Then count the number of hex
+ * digits. */
+ if (do_utf8) {
+ k = utf8_to_uvchr_buf((U8*)s, (U8*) send, NULL);
+
+ /* treat invalid utf8 byte by byte. This loop iteration gets the
+ * first byte */
+ increment = (k == 0 && *s != '\0') ? 1 : UTF8SKIP(s);
+
+ grow += 4 + (k <= 0xFF ? 2 : k <= 0xFFF ? 3 : k <= 0xFFFF ? 4 :
#if UVSIZE == 4
- 8 /* We may allocate a bit more than the minimum here. */
+ 8 /* We may allocate a bit more than the minimum here. */
#else
- k <= 0xFFFFFFFF ? 8 : UVSIZE * 4
-#endif
- );
-#ifndef EBCDIC
- } else if (useqq &&
- /* we can't use the short form like '\0' if followed by a digit */
- (((k >= 7 && k <= 10) || k == 12 || k == 13 || k == 27)
- || (k < 8 && (next < '0' || next > '9')))) {
- grow += 2;
- } else if (useqq && k <= 31 && (next < '0' || next > '9')) {
- grow += 3;
- } else if (useqq && (k <= 31 || k >= 127)) {
- grow += 4;
+ k <= 0xFFFFFFFF ? 8 : UVSIZE * 4
#endif
- } else if (k == '\\') {
- backslashes++;
- } else if (k == '\'') {
- single_quotes++;
- } else if (k == '"' || k == '$' || k == '@') {
- qq_escapables++;
- } else {
+ );
+ }
+ else if (useqq) { /* Not utf8, must be <= 0xFF, hence 2 hex
+ * digits. */
+ grow += 4 + 2;
+ }
+ else { /* Non-qq generates 3 octal digits plus backslash */
+ grow += 4;
+ }
+ } /* End of high-ordinal non-printable */
+ else if (! useqq) { /* Low ordinal, non-printable, non-qq just
+ * outputs the raw char */
normal++;
}
- }
+ else { /* Is qq, low ordinal, non-printable. Output escape
+ * sequences */
+ if ( k == '\a' || k == '\b' || k == '\t' || k == '\n' || k == '\r'
+ || k == '\f' || k == ESC_NATIVE)
+ {
+ grow += 2; /* 1 char plus backslash */
+ }
+ else /* The other low ordinals are output as an octal escape
+ * sequence */
+ if (s + 1 >= send || ( *(U8*)(s+1) >= '0'
+ && *(U8*)(s+1) <= '9'))
+ {
+ /* When the following character is a digit, use 3 octal digits
+ * plus backslash, as using fewer digits would concatenate the
+ * following char into this one */
+ grow += 4;
+ }
+ else if (k <= 7) {
+ grow += 2; /* 1 octal digit, plus backslash */
+ }
+ else if (k <= 077) {
+ grow += 3; /* 2 octal digits plus backslash */
+ }
+ else {
+ grow += 4; /* 3 octal digits plus backslash */
+ }
+ }
+ } /* End of size-calculating loop */
+
if (grow || useqq) {
/* We have something needing hex. 3 is ""\0 */
sv_grow(sv, cur + 3 + grow + 2*backslashes + single_quotes
*r++ = '"';
- for (s = src; s < send; do_utf8 ? s += UTF8SKIP(s) : s++) {
- const UV k = do_utf8 ? utf8_to_uvchr_buf((U8*)s, (U8*) send, NULL) : *(U8*)s;
+ for (s = src; s < send; s += increment) {
+ UV k;
+
+ if (do_utf8
+ && ! isASCII(*s)
+ /* Exclude non-ASCII low ordinal controls. This should be
+ * optimized out by the compiler on ASCII platforms; if not
+ * could wrap it in a #ifdef EBCDIC, but better to avoid
+ * #if's if possible */
+ && *(U8*)s > ' '
+ ) {
+
+ /* When in UTF-8, we output all non-ascii chars as \x{}
+ * reqardless of useqq, except for the low ordinal controls on
+ * EBCDIC platforms */
+ k = utf8_to_uvchr_buf((U8*)s, (U8*) send, NULL);
+
+ /* treat invalid utf8 byte by byte. This loop iteration gets the
+ * first byte */
+ increment = (k == 0 && *s != '\0') ? 1 : UTF8SKIP(s);
+
+#if PERL_VERSION < 10
+ sprintf(r, "\\x{%"UVxf"}", k);
+ r += strlen(r);
+ /* my_sprintf is not supported by ppport.h */
+#else
+ r = r + my_sprintf(r, "\\x{%"UVxf"}", k);
+#endif
+ continue;
+ }
+
+ /* Here 1) isn't UTF-8; or
+ * 2) the current character is ASCII; or
+ * 3) it is an EBCDIC platform and is a low ordinal
+ * non-ASCII control.
+ * In each case the character occupies just one byte */
+ k = *(U8*)s;
+ increment = 1;
+
+ if (isPRINT(k)) {
+ /* These need a backslash escape */
+ if (k == '"' || k == '\\' || k == '$' || k == '@') {
+ *r++ = '\\';
+ }
- if (k == '"' || k == '\\' || k == '$' || k == '@') {
- *r++ = '\\';
*r++ = (char)k;
}
- else
-#ifdef EBCDIC
- if (isprint(k) && k < 256)
-#else
- if (useqq && (k <= 31 || k == 127 || (!do_utf8 && k > 127))) {
+ else if (! useqq) { /* non-qq, non-printable, low-ordinal is
+ * output raw */
+ *r++ = (char)k;
+ }
+ else { /* Is qq means use escape sequences */
bool next_is_digit;
*r++ = '\\';
switch (k) {
- case 7: *r++ = 'a'; break;
- case 8: *r++ = 'b'; break;
- case 9: *r++ = 't'; break;
- case 10: *r++ = 'n'; break;
- case 12: *r++ = 'f'; break;
- case 13: *r++ = 'r'; break;
- case 27: *r++ = 'e'; break;
+ case '\a': *r++ = 'a'; break;
+ case '\b': *r++ = 'b'; break;
+ case '\t': *r++ = 't'; break;
+ case '\n': *r++ = 'n'; break;
+ case '\f': *r++ = 'f'; break;
+ case '\r': *r++ = 'r'; break;
+ case ESC_NATIVE: *r++ = 'e'; break;
default:
- increment = (k == 0 && *s != '\0') ? 1 : UTF8SKIP(s);
/* only ASCII digits matter here, which are invariant,
* since we only encode characters \377 and under, or
* \x177 and under for a unicode string
*/
- next = (s+increment < send) ? *(U8*)(s+increment) : 0;
- next_is_digit = next >= '0' && next <= '9';
+ next_is_digit = (s + 1 >= send )
+ ? FALSE
+ : (*(U8*)(s+1) >= '0' && *(U8*)(s+1) <= '9');
/* faster than
* r = r + my_sprintf(r, "%o", k);
}
}
}
- else if (k < 0x80)
-#endif
- *r++ = (char)k;
- else {
-#if PERL_VERSION < 10
- sprintf(r, "\\x{%"UVxf"}", k);
- r += strlen(r);
- /* my_sprintf is not supported by ppport.h */
-#else
- r = r + my_sprintf(r, "\\x{%"UVxf"}", k);
-#endif
- }
}
*r++ = '"';
} else {
len = my_snprintf(tmpbuf, sizeof(tmpbuf), "%"IVdf, SvIV(val));
if (SvPOK(val)) {
/* Need to check to see if this is a string such as " 0".
- I'm assuming from sprintf isn't going to clash with utf8.
- Is this valid on EBCDIC? */
+ I'm assuming from sprintf isn't going to clash with utf8. */
STRLEN pvlen;
const char * const pv = SvPV(val, pvlen);
if (pvlen != len || memNE(pv, tmpbuf, len))
#
# This is the exact equivalent of Dump. Well, almost. The things that are
# different as of now (due to Laziness):
-# * doesn't deparse yet.'
+# * doesn't do deparse yet.'
#
void