From: Karl Williamson <public@khwilliamson.com>
Date: Mon, 19 Mar 2012 21:03:01 +0000 (-0600)
Subject: utf8.c: Add utf8_to_uvchr_buf() and utf8_to_uvuni_buf()
X-Git-Tag: v5.15.9~27
X-Git-Url: https://perl5.git.perl.org/perl5.git/commitdiff_plain/ec5f19d09949aac9034bb62ade44ffba8d4d2bb1

utf8.c: Add utf8_to_uvchr_buf() and utf8_to_uvuni_buf()

The existing functions (utf8_to_uvchr and utf8_to_uvuni) have a
deficiency in that they could read beyond the end of the input string if
given malformed input.  This commit creates two new functions which
behave as the old ones did, but have an extra parameter each, which
gives the upper limit to the string, so no read beyond it is done.
---

diff --git a/embed.fnc b/embed.fnc
index c549dc9..5a49690 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -1450,6 +1450,8 @@ ApMd	|U8*	|bytes_from_utf8|NN const U8 *s|NN STRLEN *len|NULLOK bool *is_utf8
 ApMd	|U8*	|bytes_to_utf8	|NN const U8 *s|NN STRLEN *len
 Apd	|UV	|utf8_to_uvchr	|NN const U8 *s|NULLOK STRLEN *retlen
 Apd	|UV	|utf8_to_uvuni	|NN const U8 *s|NULLOK STRLEN *retlen
+Apd	|UV	|utf8_to_uvchr_buf	|NN const U8 *s|NN const U8 *send|NULLOK STRLEN *retlen
+Apd	|UV	|utf8_to_uvuni_buf	|NN const U8 *s|NN const U8 *send|NULLOK STRLEN *retlen
 pM	|bool	|check_utf8_print	|NN const U8 *s|const STRLEN len
 
 #ifdef EBCDIC
diff --git a/embed.h b/embed.h
index 1d1e598..8a39047 100644
--- a/embed.h
+++ b/embed.h
@@ -672,7 +672,9 @@
 #define utf8_length(a,b)	Perl_utf8_length(aTHX_ a,b)
 #define utf8_to_bytes(a,b)	Perl_utf8_to_bytes(aTHX_ a,b)
 #define utf8_to_uvchr(a,b)	Perl_utf8_to_uvchr(aTHX_ a,b)
+#define utf8_to_uvchr_buf(a,b,c)	Perl_utf8_to_uvchr_buf(aTHX_ a,b,c)
 #define utf8_to_uvuni(a,b)	Perl_utf8_to_uvuni(aTHX_ a,b)
+#define utf8_to_uvuni_buf(a,b,c)	Perl_utf8_to_uvuni_buf(aTHX_ a,b,c)
 #define utf8n_to_uvuni(a,b,c,d)	Perl_utf8n_to_uvuni(aTHX_ a,b,c,d)
 #define uvchr_to_utf8_flags(a,b,c)	Perl_uvchr_to_utf8_flags(aTHX_ a,b,c)
 #define uvuni_to_utf8_flags(a,b,c)	Perl_uvuni_to_utf8_flags(aTHX_ a,b,c)
diff --git a/pod/perldelta.pod b/pod/perldelta.pod
index c6fe15f..8b04237 100644
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -104,7 +104,11 @@ The code has been refactored to reduce duplication.
 
 =item *
 
-XXX
+Two new functions C<utf8_to_uvchr_buf()> and C<utf8_to_uvuni_buf()> have
+been added.  These are the same as C<utf8_to_uvchr> and
+C<utf8_to_uvuni>, but take an extra parameter that is used to guard
+against reading beyond the end of the input string.
+See L<perlapi/utf8_to_uvchr_buf> and L<perlapi/utf8_to_uvuni_buf>.
 
 =back
 
diff --git a/proto.h b/proto.h
index b811e6b..9c91855 100644
--- a/proto.h
+++ b/proto.h
@@ -4568,11 +4568,23 @@ PERL_CALLCONV UV	Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
 #define PERL_ARGS_ASSERT_UTF8_TO_UVCHR	\
 	assert(s)
 
+PERL_CALLCONV UV	Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
+			__attribute__nonnull__(pTHX_1)
+			__attribute__nonnull__(pTHX_2);
+#define PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF	\
+	assert(s); assert(send)
+
 PERL_CALLCONV UV	Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
 			__attribute__nonnull__(pTHX_1);
 #define PERL_ARGS_ASSERT_UTF8_TO_UVUNI	\
 	assert(s)
 
+PERL_CALLCONV UV	Perl_utf8_to_uvuni_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
+			__attribute__nonnull__(pTHX_1)
+			__attribute__nonnull__(pTHX_2);
+#define PERL_ARGS_ASSERT_UTF8_TO_UVUNI_BUF	\
+	assert(s); assert(send)
+
 PERL_CALLCONV UV	Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
 			__attribute__nonnull__(pTHX_1);
 #define PERL_ARGS_ASSERT_UTF8N_TO_UVUNI	\
diff --git a/utf8.c b/utf8.c
index 0aede4c..1faa96d 100644
--- a/utf8.c
+++ b/utf8.c
@@ -563,7 +563,7 @@ All other code points corresponding to Unicode characters, including private
 use and those yet to be assigned, are never considered malformed and never
 warn.
 
-Most code should use L</utf8_to_uvchr>() rather than call this directly.
+Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
 
 =cut
 */
@@ -795,6 +795,31 @@ malformed:
 }
 
 /*
+=for apidoc utf8_to_uvchr_buf
+
+Returns the native code point of the first character in the string C<s> which
+is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
+C<retlen> will be set to the length, in bytes, of that character.
+
+If C<s> does not point to a well-formed UTF-8 character, zero is
+returned and C<retlen> is set, if possible, to -1.
+
+=cut
+*/
+
+
+UV
+Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
+{
+    PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF;
+
+    assert(s < send);
+
+    return utf8n_to_uvchr(s, send - s, retlen,
+			  ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
+}
+
+/*
 =for apidoc utf8_to_uvchr
 
 Returns the native code point of the first character in the string C<s>
@@ -817,6 +842,34 @@ Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
 }
 
 /*
+=for apidoc utf8_to_uvuni_buf
+
+Returns the Unicode code point of the first character in the string C<s> which
+is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
+C<retlen> will be set to the length, in bytes, of that character.
+
+This function should only be used when the returned UV is considered
+an index into the Unicode semantic tables (e.g. swashes).
+
+If C<s> does not point to a well-formed UTF-8 character, zero is
+returned and C<retlen> is set, if possible, to -1.
+
+=cut
+*/
+
+UV
+Perl_utf8_to_uvuni_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
+{
+    PERL_ARGS_ASSERT_UTF8_TO_UVUNI_BUF;
+
+    assert(send > s);
+
+    /* Call the low level routine asking for checks */
+    return Perl_utf8n_to_uvuni(aTHX_ s, send -s, retlen,
+			       ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
+}
+
+/*
 =for apidoc utf8_to_uvuni
 
 Returns the Unicode code point of the first character in the string C<s>