#define PL_max_intro_pending (vTHX->Imax_intro_pending)
#define PL_maxsysfd (vTHX->Imaxsysfd)
#define PL_mbrlen_ps (vTHX->Imbrlen_ps)
+#define PL_mbrtowc_ps (vTHX->Imbrtowc_ps)
#define PL_memory_debug_header (vTHX->Imemory_debug_header)
#define PL_mess_sv (vTHX->Imess_sv)
#define PL_min_intro_pending (vTHX->Imin_intro_pending)
#define PL_warnhook (vTHX->Iwarnhook)
#define PL_watchaddr (vTHX->Iwatchaddr)
#define PL_watchok (vTHX->Iwatchok)
+#define PL_wcrtomb_ps (vTHX->Iwcrtomb_ps)
#define PL_xsubfilename (vTHX->Ixsubfilename)
#endif /* MULTIPLICITY */
#if ! defined(HAS_MBLEN) && ! defined(HAS_MBRLEN)
#define mblen(a,b) not_here("mblen")
#endif
-#ifndef HAS_MBTOWC
+#if ! defined(HAS_MBTOWC) && ! defined(HAS_MBRTOWC)
#define mbtowc(pwc, s, n) not_here("mbtowc")
#endif
#ifndef HAS_WCTOMB
OUTPUT:
RETVAL
+#if defined(HAS_MBRTOWC) && (defined(USE_ITHREADS) || ! defined(HAS_MBTOWC))
+# define USE_MBRTOWC
+#else
+# undef USE_MBRTOWC
+#endif
+
int
-mbtowc(pwc, s, n)
- wchar_t * pwc
- char * s
+mbtowc(pwc, s, n = ~0)
+ SV * pwc
+ SV * s
size_t n
- PREINIT:
-#if defined(USE_ITHREADS) && defined(HAS_MBRTOWC)
- mbstate_t ps;
-#endif
CODE:
-#if defined(USE_ITHREADS) && defined(HAS_MBRTOWC)
- memset(&ps, 0, sizeof(ps));;
- PERL_UNUSED_RESULT(mbrtowc(pwc, NULL, 0, &ps));/* Reset any shift state */
errno = 0;
- RETVAL = mbrtowc(pwc, s, n, &ps); /* Prefer reentrant version */
+ SvGETMAGIC(s);
+ if (! SvOK(s)) { /* Initialize state */
+#ifdef USE_MBRTOWC
+ /* Initialize the shift state to all zeros in PL_mbrtowc_ps. */
+ memzero(&PL_mbrtowc_ps, sizeof(PL_mbrtowc_ps));
+ RETVAL = 0;
+#else
+ LOCALE_LOCK;
+ RETVAL = mbtowc(NULL, NULL, 0);
+ LOCALE_UNLOCK;
+#endif
+ }
+ else { /* Not resetting state */
+ wchar_t wc;
+ SV * byte_s = sv_2mortal(newSVsv_nomg(s));
+ if (! sv_utf8_downgrade_nomg(byte_s, TRUE)) {
+ SETERRNO(EINVAL, LIB_INVARG);
+ RETVAL = -1;
+ }
+ else {
+ size_t len;
+ char * string = SvPV(byte_s, len);
+ if (n < len) len = n;
+#ifdef USE_MBRTOWC
+ RETVAL = (SSize_t) mbrtowc(&wc, string, len, &PL_mbrtowc_ps);
#else
- RETVAL = mbtowc(pwc, s, n);
+ /* Locking prevents races, but locales can be switched out
+ * without locking, so this isn't a cure all */
+ LOCALE_LOCK;
+ RETVAL = mbtowc(&wc, string, len);
+ LOCALE_UNLOCK;
#endif
+ if (RETVAL >= 0) {
+ sv_setiv_mg(pwc, wc);
+ }
+ else { /* Use mbtowc() ret code for transparency */
+ RETVAL = -1;
+ }
+ }
+ }
OUTPUT:
RETVAL
+#if defined(HAS_WCRTOMB) && (defined(USE_ITHREADS) || ! defined(HAS_WCTOMB))
+# define USE_WCRTOMB
+#else
+# undef USE_WCRTOMB
+#endif
+
int
wctomb(s, wchar)
- char * s
+ SV * s
wchar_t wchar
+ CODE:
+ errno = 0;
+ SvGETMAGIC(s);
+ if (s == &PL_sv_undef) {
+#ifdef USE_WCRTOMB
+ /* The man pages khw looked at are in agreement that this works.
+ * But probably memzero would too */
+ RETVAL = wcrtomb(NULL, L'\0', &PL_wcrtomb_ps);
+#else
+ LOCALE_LOCK;
+ RETVAL = wctomb(NULL, L'\0');
+ LOCALE_UNLOCK;
+#endif
+ }
+ else { /* Not resetting state */
+ char buffer[MB_LEN_MAX];
+#ifdef USE_WCRTOMB
+ RETVAL = wcrtomb(buffer, wchar, &PL_wcrtomb_ps);
+#else
+ /* Locking prevents races, but locales can be switched out without
+ * locking, so this isn't a cure all */
+ LOCALE_LOCK;
+ RETVAL = wctomb(buffer, wchar);
+ LOCALE_UNLOCK;
+#endif
+ if (RETVAL >= 0) {
+ sv_setpvn_mg(s, buffer, RETVAL);
+ }
+ }
+ OUTPUT:
+ RETVAL
int
strcoll(s1, s2)
=item C<mbtowc>
-This is identical to the C function C<mbtowc()>.
+This is the same as the C function C<mbtowc()> on unthreaded perls. On
+threaded perls, it transparently (almost) substitutes the more
+thread-safe L<C<mbrtowc>(3)>, if available, instead of C<mbtowc>.
-See L</mblen>.
+Core Perl does not have any support for wide and multibyte locales,
+except Unicode UTF-8 locales. This function, in conjunction with
+L</mblen> and L</wctomb> may be used to roll your own decoding/encoding
+of other types of multi-byte locales.
+
+The first parameter is a scalar into which, upon success, the wide
+character represented by the multi-byte string contained in the second
+parameter is stored. The optional third parameter is ignored if it is
+larger than the actual length of the second parameter string.
+
+Use C<undef> as the second parameter to this function to get the effect
+of passing NULL as the second parameter to C<mbtowc>. This resets any
+shift state to its initial value. The return value is undefined if
+C<mbrtowc> was substituted, so you should never rely on it.
+
+When the second parameter is a scalar containing a value that either is
+a PV string or can be forced into one, the return value is the number of
+bytes occupied by the first character of that string; or 0 if that first
+character is the wide NUL character; or negative if there is an error.
+This is based on the locale that currently underlies the program,
+regardless of whether or not the function is called from Perl code that
+is within the scope of S<C<use locale>>. Perl makes no attempt at
+hiding from your code any differences in the C<errno> setting between
+C<mbtowc> and C<mbrtowc>. It does set C<errno> to 0 before calling
+them.
=item C<memchr>
=item C<wctomb>
-This is identical to the C function C<wctomb()>.
+This is the same as the C function C<wctomb()> on unthreaded perls. On
+threaded perls, it transparently (almost) substitutes the more
+thread-safe L<C<wcrtomb>(3)>, if available, instead of C<wctomb>.
-See L</mblen>.
+Core Perl does not have any support for wide and multibyte locales,
+except Unicode UTF-8 locales. This function, in conjunction with
+L</mblen> and L</mbtowc> may be used to roll your own decoding/encoding
+of other types of multi-byte locales.
+
+Use C<undef> as the first parameter to this function to get the effect
+of passing NULL as the first parameter to C<wctomb>. This resets any
+shift state to its initial value. The return value is undefined if
+C<wcrtomb> was substituted, so you should never rely on it.
+
+When the first parameter is a scalar, the code point contained in the
+scalar second parameter is converted into a multi-byte string and stored
+into the first parameter scalar. This is based on the locale that
+currently underlies the program, regardless of whether or not the
+function is called from Perl code that is within the scope of S<C<use
+locale>>. The return value is the number of bytes stored; or negative
+if the code point isn't representable in the current locale. Perl makes
+no attempt at hiding from your code any differences in the C<errno>
+setting between C<wctomb> and C<wcrtomb>. It does set C<errno> to 0
+before calling them.
=item C<write>
require 'test.pl';
}
-plan tests => 6;
+my $utf8_locale = find_utf8_ctype_locale();
+plan tests => 13;
use POSIX qw();
skip("LC_CTYPE locale support not available", 4)
unless locales_enabled('LC_CTYPE');
- my $utf8_locale = find_utf8_ctype_locale();
skip("no utf8 locale available", 4) unless $utf8_locale;
local $ENV{LC_CTYPE} = $utf8_locale;
-1, {}, 'mblen() returns -1 when input length is too short');
}
}
+
+SKIP: {
+ skip("mbtowc() not present", 5) unless $Config{d_mbtowc};
+
+ my $wide;
+
+ is(&POSIX::mbtowc($wide, "a"), 1, 'mbtowc() returns correct length on ASCII input');
+ is($wide , ord "a", 'mbtowc() returns correct ordinal on ASCII input');
+
+ skip("LC_CTYPE locale support not available", 3)
+ unless locales_enabled('LC_CTYPE');
+
+ skip("no utf8 locale available", 3) unless $utf8_locale;
+
+ local $ENV{LC_CTYPE} = $utf8_locale;
+ local $ENV{LC_ALL};
+ delete $ENV{LC_ALL};
+ local $ENV{PERL_UNICODE};
+ delete $ENV{PERL_UNICODE};
+
+ SKIP: {
+ my ($major, $minor, $rest) = $Config{osvers} =~ / (\d+) \. (\d+) .* /x;
+ skip("mbtowc() broken (at least for c.utf8) on early HP-UX", 3)
+ if $Config{osname} eq 'hpux'
+ && $major < 11 || ($major == 11 && $minor < 31);
+
+ fresh_perl_is(
+ 'use POSIX; &POSIX::mbtowc(undef, undef,0); my $wide; print &POSIX::mbtowc($wide, "'
+ . I8_to_native("\x{c3}\x{28}")
+ . '", 2)',
+ -1, {}, 'mbtowc() recognizes invalid multibyte characters');
+
+ fresh_perl_is(
+ 'use POSIX; &POSIX::mbtowc(undef,undef,0);
+ my $sigma = "\N{GREEK SMALL LETTER SIGMA}";
+ utf8::encode($sigma);
+ my $wide; my $len = &POSIX::mbtowc($wide, $sigma, 2);
+ print "$len:$wide"',
+ "2:963", {}, 'mbtowc() works on UTF-8 characters');
+
+ fresh_perl_is(
+ 'use POSIX; &POSIX::mbtowc(undef,undef,0);
+ my $wide; print &POSIX::mbtowc($wide, "\N{GREEK SMALL LETTER SIGMA}", 1);',
+ -1, {}, 'mbtowc() returns -1 when input length is too short');
+ }
+}
+
+SKIP: {
+ skip("mbtowc or wctomb() not present", 2) unless $Config{d_mbtowc} && $Config{d_wctomb};
+
+ fresh_perl_is('use POSIX; &POSIX::wctomb(undef,0); my $string; my $len = &POSIX::wctomb($string, ord "A"); print "$len:$string"',
+ "1:A", {}, 'wctomb() works on ASCII input');
+
+ skip("LC_CTYPE locale support not available", 1)
+ unless locales_enabled('LC_CTYPE');
+
+ skip("no utf8 locale available", 1) unless $utf8_locale;
+
+ local $ENV{LC_CTYPE} = $utf8_locale;
+ local $ENV{LC_ALL};
+ delete $ENV{LC_ALL};
+ local $ENV{PERL_UNICODE};
+ delete $ENV{PERL_UNICODE};
+
+ SKIP: {
+ my ($major, $minor, $rest) = $Config{osvers} =~ / (\d+) \. (\d+) .* /x;
+ skip("wctomb() broken (at least for c.utf8) on early HP-UX", 1)
+ if $Config{osname} eq 'hpux'
+ && $major < 11 || ($major == 11 && $minor < 31);
+
+ fresh_perl_is('use POSIX; &POSIX::wctomb(undef,0); my $string; my $len = &POSIX::wctomb($string, 0x100); print "$len:$string"',
+ "2:" . I8_to_native("\x{c4}\x{80}"),
+ {}, 'wctomb() works on UTF-8 characters');
+
+ }
+}
#ifdef HAS_MBRLEN
PERLVAR(I, mbrlen_ps, mbstate_t)
#endif
+#ifdef HAS_MBRTOWC
+PERLVAR(I, mbrtowc_ps, mbstate_t)
+#endif
+#ifdef HAS_WCRTOMB
+PERLVAR(I, wcrtomb_ps, mbstate_t)
+#endif
/* If you are adding a U8 or U16, check to see if there are 'Space' comments
* above on where there are gaps which currently will be structure padding. */
# endif
# endif /* DEBUGGING */
- /* Initialize the per-thread mbrFOO() state variable. See POSIX.xs for
- * why this particular incantation is used. */
+ /* Initialize the per-thread mbrFOO() state variables. See POSIX.xs for
+ * why these particular incantations are used. */
#ifdef HAS_MBRLEN
memzero(&PL_mbrlen_ps, sizeof(PL_mbrlen_ps));
#endif
+#ifdef HAS_MBRTOWC
+ memzero(&PL_mbrtowc_ps, sizeof(PL_mbrtowc_ps));
+#endif
+#ifdef HAS_WCTOMBR
+ wcrtomb(NULL, L'\0', &PL_wcrtomb_ps);
+#endif
/* Initialize the cache of the program's UTF-8ness for the always known
* locales C and POSIX */
A comparison of the two methods is given in
L<perlunicode/Comparison of \N{...} and \p{name=...}>.
-=head2 The C<POSIX::mblen()> function now works on shift state locales
-and is thread-safe on C99 and above compilers; the length parameter is
-now optional
+=head2 The C<POSIX::mblen()>, C<mbtowc>, and C<wctomb> functions now
+work on shift state locales and are thread-safe on C99 and above
+compilers when executed on a platform that has locale thread-safety; the
+length parameters are now optional.
-This function is always executed under the current C language locale.
+These functions are always executed under the current C language locale.
(See L<perllocale>.) Most locales are stateless, but a few, notably the
-very rarely encountered ISO 2022, maintain a state between calls to this
-function. Previously the state was cleared on every call to this
-function, but now the state is not reset unless the first parameter is
-C<undef>.
-
-On threaded perls, the C99 function L<mbrlen(3)>,
-when available, is substituted for plain
-C<mblen>.
-This makes this function thread-safe when executing on a locale
+very rarely encountered ISO 2022, maintain a state between calls to
+these functions. Previously the state was cleared on every call, but
+now the state is not reset unless the appropriate parameter is C<undef>.
+
+On threaded perls, the C99 functions L<mbrlen(3)>, L<mbrtowc(3)>, and
+L<wcrtomb(3)>, when available, are substituted for the plain functions.
+This makes these functions thread-safe when executing on a locale
thread-safe platform.
-The string length parameter is now optional; useful only if you wish to
-restrict the length parsed in the source string to less than the actual
-length.
+The string length parameters in C<mblen> and C<mbtowc> are now optional;
+useful only if you wish to restrict the length parsed in the source
+string to less than the actual length.
=head1 Security
F<t/run/switches.t> no longer uses (and re-uses) the F<tmpinplace/>
directory under F<t/>. This may prevent spurious failures. [GH #17424]
+=item *
+
+Various bugs in C<POSIX::mbtowc> were fixed. Potential races with
+other threads are now avoided, and previously the returned wide
+character could well be garbage.
+
+=item *
+
+Various bugs in C<POSIX::wctomb> were fixed. Potential races with other
+threads are now avoided, and previously it would segfault if the string
+parameter was shared or hadn't been pre-allocated with a string of
+sufficient length to hold the result.
+
=back
=head1 Platform Support
#ifdef HAS_MBRLEN
PL_mbrlen_ps = proto_perl->Imbrlen_ps;
#endif
+#ifdef HAS_MBRTOWC
+ PL_mbrtowc_ps = proto_perl->Imbrtowc_ps;
+#endif
+#ifdef HAS_WCRTOMB
+ PL_wcrtomb_ps = proto_perl->Iwcrtomb_ps;
+#endif
PL_langinfo_buf = NULL;
PL_langinfo_bufsize = 0;
Math::Random::Secure
Math::TrulyRandom
mbrlen(3)
+mbrtowc(3)
md5sum(1)
Method::Signatures
mmap(2)
waitpid(2)
waitpid(3)
Want
+wcrtomb(3)
wget(1)
Win32::Locale
write(2)