[perl5.git] / dquote.c

/*    dquote.c
 *
 * This file contains functions that are related to
 * parsing double-quotish expressions.
 *
*/

#include "EXTERN.h"
#define PERL_IN_DQUOTE_C
#include "perl.h"
#include "dquote_inline.h"

/* XXX Add documentation after final interface and behavior is decided */
/* May want to show context for error, so would pass S_grok_bslash_c(pTHX_ const char* current, const char* start, const bool output_warning)
    U8 source = *current;
*/

char
Perl_grok_bslash_c(pTHX_ const char source, const bool output_warning)
{
    /* Handles the character that follows "\c" in a double-quotish
     * construct.
     *
     *  source          is the character immediately after the "\c"
     *  output_warning  says whether the "more clearly written" warning may
     *                  be raised
     *
     * Croaks if 'source' is not printable ASCII, and always croaks for
     * "\c{".  Otherwise returns toCTRL(source).  When that result is itself
     * printable ASCII and warnings were requested, a WARN_SYNTAX warning
     * suggests writing the resulting character directly. */

    U8 result;
    U8 clearer[3];      /* optional backslash + the character + NUL */
    U8 *fill = clearer;

    if (! isPRINT_A(source)) {
        Perl_croak(aTHX_ "%s",
                        "Character following \"\\c\" must be printable ASCII");
    }

    if (source == '{') {
        const char control = toCTRL('{');

        if (! isPRINT_A(control)) {
            Perl_croak(aTHX_ "Sequence \"\\c{\" invalid");
        }
        else {
            /* diag_listed_as: Use "%s" instead of "%s" */
            Perl_croak(aTHX_ "Use \"%c\" instead of \"\\c{\"", control);
        }
    }

    result = toCTRL(source);

    /* Nothing more to do unless a warning is both wanted and applicable */
    if (! output_warning || ! isPRINT_A(result)) {
        return result;
    }

    /* Build the suggested spelling; a non-word character is shown with a
     * leading backslash */
    if (! isWORDCHAR(result)) {
        *fill++ = '\\';
    }
    *fill++ = result;
    *fill = '\0';

    Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX),
                    "\"\\c%c\" is more clearly written simply as \"%s\"",
                    source,
                    clearer);

    return result;
}

bool
Perl_grok_bslash_o(pTHX_ char **s, const char * const send, UV *uv,
                      const char** error_msg,
                      const bool output_warning, const bool strict,
                      const bool silence_non_portable,
                      const bool UTF)
{

/*  Parses an octal escape of the form "\o{...}".
 *
 *  This returns FALSE if there is an error which the caller need not recover
 *  from; otherwise TRUE.
 *  It guarantees that the returned codepoint, *uv, when expressed as
 *  utf8 bytes, would fit within the skipped "\o{...}" bytes.
 *  On input:
 *      s   is the address of a pointer to a string.  **s is 'o', and the
 *          previous character was a backslash.  At exit, *s will be advanced
 *          to the byte just after those absorbed by this function.  Hence the
 *          caller can continue parsing from there.  In the case of an error,
 *          this routine has generally positioned *s to point just to the right
 *          of the first bad spot, so that a message that has a "<--" to mark
 *          the spot will be correctly positioned.
 *      send - 1  gives a limit in *s that this function is not permitted to
 *          look beyond.  That is, the function may look at bytes only in the
 *          range *s..send-1
 *      uv  points to a UV that will hold the output value, valid only if the
 *          return from the function is TRUE
 *      error_msg is a pointer that will be set to an internal buffer giving an
 *          error message upon failure (the return is FALSE).  Untouched if
 *          function succeeds
 *      output_warning says whether to output any warning messages, or suppress
 *          them
 *      strict is true if this should fail instead of warn if there are
 *          non-octal digits within the braces
 *      silence_non_portable is true if to suppress warnings about the code
 *          point returned being too large to fit on all platforms.
 *      UTF is true iff the string *s is encoded in UTF-8.
 */
    char* e;
    STRLEN numbers_len;
    I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
                | PERL_SCAN_DISALLOW_PREFIX
                /* XXX Until the message is improved in grok_oct, handle errors
                 * ourselves */
                | PERL_SCAN_SILENT_ILLDIGIT;

    PERL_ARGS_ASSERT_GROK_BSLASH_O;

    assert(*(*s - 1) == '\\');
    assert(* *s       == 'o');
    (*s)++;

    /* The 'o' may have been the final permitted byte; don't look at *send */
    if (*s >= send || **s != '{') {
        *error_msg = "Missing braces on \\o{}";
        return FALSE;
    }

    e = (char *) memchr(*s, '}', send - *s);
    if (!e) {
        (*s)++;  /* Move past the '{' */

        /* Position beyond the legal digits, but never past the permitted
         * range: no '}' was found, so the digits could run right up to
         * 'send' */
        while (*s < send && isOCTAL(**s)) {
            (*s)++;
        }
        *error_msg = "Missing right brace on \\o{";
        return FALSE;
    }

    (*s)++;    /* Point to expected first digit (could be first byte of utf8
                  sequence if not a digit) */
    numbers_len = e - *s;
    if (numbers_len == 0) {
        (*s)++;    /* Move past the } */
        *error_msg = "Number with no digits";
        return FALSE;
    }

    if (silence_non_portable) {
        flags |= PERL_SCAN_SILENT_NON_PORTABLE;
    }

    *uv = grok_oct(*s, &numbers_len, &flags, NULL);
    /* Note that if has non-octal, will ignore everything starting with that up
     * to the '}' */

    /* grok_oct() updates numbers_len; if it no longer spans everything up to
     * the '}', something between the braces was not accepted */
    if (numbers_len != (STRLEN) (e - *s)) {
        if (strict) {
            /* Step over the accepted digits and the offending character so
             * that *s is just to the right of the bad spot */
            *s += numbers_len;
            *s += (UTF) ? UTF8SKIP(*s) : (STRLEN) 1;
            *error_msg = "Non-octal character";
            return FALSE;
        }
        else if (output_warning) {
            Perl_ck_warner(aTHX_ packWARN(WARN_DIGIT),
            /* diag_listed_as: Non-octal character '%c'.  Resolved as "%s" */
                        "Non-octal character '%c'.  Resolved as \"\\o{%.*s}\"",
                        *(*s + numbers_len),
                        (int) numbers_len,
                        *s);
        }
    }

    /* Return past the '}' */
    *s = e + 1;

    return TRUE;
}

bool
Perl_grok_bslash_x(pTHX_ char **s, const char * const send, UV *uv,
                      const char** error_msg,
                      const bool output_warning, const bool strict,
                      const bool silence_non_portable,
                      const bool UTF)
{

/*  Parses a hexadecimal escape, either "\xHH" (no braces) or "\x{...}".
 *
 *  This returns FALSE if there is an error which the caller need not recover
 *  from; otherwise TRUE.
 *  It guarantees that the returned codepoint, *uv, when expressed as
 *  utf8 bytes, would fit within the skipped "\x{...}" bytes.
 *
 *  On input:
 *      s   is the address of a pointer to a string.  **s is 'x', and the
 *          previous character was a backslash.  At exit, *s will be advanced
 *          to the byte just after those absorbed by this function.  Hence the
 *          caller can continue parsing from there.  In the case of an error,
 *          this routine has generally positioned *s to point just to the right
 *          of the first bad spot, so that a message that has a "<--" to mark
 *          the spot will be correctly positioned.
 *      send - 1  gives a limit in *s that this function is not permitted to
 *          look beyond.  That is, the function may look at bytes only in the
 *          range *s..send-1
 *      uv  points to a UV that will hold the output value, valid only if the
 *          return from the function is TRUE
 *      error_msg is a pointer that will be set to an internal buffer giving an
 *          error message upon failure (the return is FALSE).  Untouched if
 *          function succeeds
 *      output_warning says whether to output any warning messages, or suppress
 *          them
 *      strict is true if anything out of the ordinary should cause this to
 *          fail instead of warn or be silent.  For example, it requires
 *          exactly 2 digits following the \x (when there are no braces).
 *          3 digits could be a mistake, so is forbidden in this mode.
 *      silence_non_portable is true if to suppress warnings about the code
 *          point returned being too large to fit on all platforms.
 *      UTF is true iff the string *s is encoded in UTF-8.
 */
    char* e;
    STRLEN numbers_len;
    I32 flags = PERL_SCAN_DISALLOW_PREFIX;


    PERL_ARGS_ASSERT_GROK_BSLASH_X;

    assert(*(*s - 1) == '\\');
    assert(* *s      == 'x');
    (*s)++;

    /* In strict mode bad digits are reported through *error_msg below, so
     * the scanner's own illegal-digit warning is silenced */
    if (strict || ! output_warning) {
        flags |= PERL_SCAN_SILENT_ILLDIGIT;
    }

    /* NOTE(review): this test and the brace-less path below never consult
     * 'send'; presumably they rely on the buffer having a readable
     * terminator at 'send' -- confirm against callers */
    if (**s != '{') {
        /* Ask for one digit more than is legal in strict mode, so that a
         * third hex digit can be detected and rejected */
        STRLEN len = (strict) ? 3 : 2;

        *uv = grok_hex(*s, &len, &flags, NULL);
        *s += len;
        if (strict && len != 2) {
            if (len < 2) {
                *s += (UTF) ? UTF8SKIP(*s) : 1;
                *error_msg = "Non-hex character";
            }
            else {
                *error_msg = "Use \\x{...} for more than two hex characters";
            }
            return FALSE;
        }
        return TRUE;
    }

    e = (char *) memchr(*s, '}', send - *s);
    if (!e) {
        (*s)++;  /* Move past the '{' */

        /* Position beyond the legal digits, but never past the permitted
         * range: no '}' was found, so the digits could run right up to
         * 'send' */
        while (*s < send && isXDIGIT(**s)) {
            (*s)++;
        }
        /* XXX The corresponding message above for \o is just '\\o{'; other
         * messages for other constructs include the '}', so are inconsistent.
         */
        *error_msg = "Missing right brace on \\x{}";
        return FALSE;
    }

    (*s)++;    /* Point to expected first digit (could be first byte of utf8
                  sequence if not a digit) */
    numbers_len = e - *s;
    if (numbers_len == 0) {
        if (strict) {
            (*s)++;    /* Move past the } */
            *error_msg = "Number with no digits";
            return FALSE;
        }

        /* When not strict, empty braces yield code point 0 */
        *s = e + 1;
        *uv = 0;
        return TRUE;
    }

    flags |= PERL_SCAN_ALLOW_UNDERSCORES;
    if (silence_non_portable) {
        flags |= PERL_SCAN_SILENT_NON_PORTABLE;
    }

    *uv = grok_hex(*s, &numbers_len, &flags, NULL);
    /* Note that if has non-hex, will ignore everything starting with that up
     * to the '}' */

    /* grok_hex() updates numbers_len; if it no longer spans everything up to
     * the '}', something between the braces was not accepted */
    if (strict && numbers_len != (STRLEN) (e - *s)) {
        /* Step over the accepted digits and the offending character so that
         * *s is just to the right of the bad spot */
        *s += numbers_len;
        *s += (UTF) ? UTF8SKIP(*s) : 1;
        *error_msg = "Non-hex character";
        return FALSE;
    }

    /* Return past the '}' */
    *s = e + 1;

    return TRUE;
}

/*
 * ex: set ts=8 sts=4 sw=4 et:
 */
Commit	Line	Data
a55c5245	1	/* dquote.c
04e98a4d	2	*
a55c5245 JH	3	* This file contains functions that are related to
a55c5245 JH	4	* parsing double-quotish expressions.
04e98a4d	5	*
04e98a4d AD	6	*/
04e98a4d AD	7
a55c5245 JH	8	#include "EXTERN.h"
	9	#define PERL_IN_DQUOTE_C
	10	#include "perl.h"
ce54a8b9	11	#include "dquote_inline.h"
881ffab6	12
68b355dd	13	/* XXX Add documentation after final interface and behavior is decided */
f7e03a10	14	/* May want to show context for error, so would pass S_grok_bslash_c(pTHX_ const char* current, const char* start, const bool output_warning)
68b355dd	15	U8 source = *current;
68b355dd KW	16	*/
68b355dd KW	17
a55c5245 JH	18	char
a55c5245 JH	19	Perl_grok_bslash_c(pTHX_ const char source, const bool output_warning)
68b355dd KW	20	{
	21
	22	U8 result;
	23
421e43ba	24	if (! isPRINT_A(source)) {
7357bd17 KW	25	Perl_croak(aTHX_ "%s",
7357bd17 KW	26	"Character following \"\\c\" must be printable ASCII");
68b355dd	27	}
421e43ba	28	else if (source == '{') {
a27ed980 KW	29	const char control = toCTRL('{');
	30	if (isPRINT_A(control)) {
	31	/* diag_listed_as: Use "%s" instead of "%s" */
	32	Perl_croak(aTHX_ "Use \"%c\" instead of \"\\c{\"", control);
	33	}
	34	else {
	35	Perl_croak(aTHX_ "Sequence \"\\c{\" invalid");
	36	}
421e43ba	37	}
68b355dd KW	38
68b355dd KW	39	result = toCTRL(source);
5e784d58	40	if (output_warning && isPRINT_A(result)) {
4d8be631 KW	41	U8 clearer[3];
	42	U8 i = 0;
	43	if (! isWORDCHAR(result)) {
	44	clearer[i++] = '\\';
	45	}
	46	clearer[i++] = result;
	47	clearer[i++] = '\0';
68b355dd	48
d4360efa S	49	Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX),
	50	"\"\\c%c\" is more clearly written simply as \"%s\"",
	51	source,
	52	clearer);
68b355dd KW	53	}
	54
	55	return result;
	56	}
	57
a55c5245	58	bool
e8278639 KW	59	Perl_grok_bslash_o(pTHX_ char *s, const char const send, UV *uv,
e8278639 KW	60	const char** error_msg,
80f4111b	61	const bool output_warning, const bool strict,
17896a85	62	const bool silence_non_portable,
80f4111b	63	const bool UTF)
db30362b KW	64	{
	65
	66	/* Documentation to be supplied when interface nailed down finally
	67	* This returns FALSE if there is an error which the caller need not recover
61caa4da DM	68	* from; otherwise TRUE. In either case the caller should look at *len [???].
61caa4da DM	69	* It guarantees that the returned codepoint, *uv, when expressed as
7a4ca5b4	70	* utf8 bytes, would fit within the skipped "\o{...}" bytes.
db30362b	71	* On input:
e8278639 KW	72	* s is the address of a pointer to a string. **s is 'o', and the
	73	* previous character was a backslash. At exit, *s will be advanced
	74	* to the byte just after those absorbed by this function. Hence the
	75	* caller can continue parsing from there. In the case of an error,
	76	* this routine has generally positioned *s to point just to the right
	77	* of the first bad spot, so that a message that has a "<--" to mark
	78	* the spot will be correctly positioned.
	79	* send - 1 gives a limit in *s that this function is not permitted to
	80	* look beyond. That is, the function may look at bytes only in the
	81	* range *s..send-1
db30362b KW	82	* uv points to a UV that will hold the output value, valid only if the
db30362b KW	83	* return from the function is TRUE
db30362b KW	84	* error_msg is a pointer that will be set to an internal buffer giving an
	85	* error message upon failure (the return is FALSE). Untouched if
	86	* function succeeds
	87	* output_warning says whether to output any warning messages, or suppress
	88	* them
80f4111b KW	89	* strict is true if this should fail instead of warn if there are
80f4111b KW	90	* non-octal digits within the braces
17896a85 KW	91	* silence_non_portable is true if to suppress warnings about the code
17896a85 KW	92	* point returned being too large to fit on all platforms.
80f4111b	93	* UTF is true iff the string *s is encoded in UTF-8.
db30362b	94	*/
00ce5563	95	char* e;
db30362b KW	96	STRLEN numbers_len;
	97	I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
	98	\| PERL_SCAN_DISALLOW_PREFIX
	99	/* XXX Until the message is improved in grok_oct, handle errors
	100	* ourselves */
	101	\| PERL_SCAN_SILENT_ILLDIGIT;
	102
	103	PERL_ARGS_ASSERT_GROK_BSLASH_O;
	104
7f4ec488 KW	105	assert((s - 1) == '\\');
7f4ec488 KW	106	assert(* *s == 'o');
00ce5563	107	(*s)++;
db30362b	108
00ce5563	109	if (**s != '{') {
db30362b KW	110	*error_msg = "Missing braces on \\o{}";
	111	return FALSE;
	112	}
	113
e8278639	114	e = (char ) memchr(s, '}', send - *s);
db30362b	115	if (!e) {
00ce5563	116	(s)++; / Move past the '{' */
b8de99ca KW	117	while (isOCTAL(*s)) { / Position beyond the legal digits */
	118	(*s)++;
	119	}
00ce5563	120	*error_msg = "Missing right brace on \\o{";
db30362b KW	121	return FALSE;
	122	}
	123
00ce5563 KW	124	(s)++; / Point to expected first digit (could be first byte of utf8
	125	sequence if not a digit) */
	126	numbers_len = e - *s;
db30362b	127	if (numbers_len == 0) {
00ce5563	128	(s)++; / Move past the } */
db30362b KW	129	*error_msg = "Number with no digits";
	130	return FALSE;
	131	}
	132
17896a85 KW	133	if (silence_non_portable) {
	134	flags \|= PERL_SCAN_SILENT_NON_PORTABLE;
	135	}
	136
00ce5563	137	uv = grok_oct(s, &numbers_len, &flags, NULL);
db30362b KW	138	/* Note that if has non-octal, will ignore everything starting with that up
	139	* to the '}' */
	140
80f4111b KW	141	if (numbers_len != (STRLEN) (e - *s)) {
	142	if (strict) {
	143	*s += numbers_len;
	144	s += (UTF) ? UTF8SKIP(s) : (STRLEN) 1;
	145	*error_msg = "Non-octal character";
	146	return FALSE;
	147	}
	148	else if (output_warning) {
b67d718a KW	149	Perl_ck_warner(aTHX_ packWARN(WARN_DIGIT),
	150	/* diag_listed_as: Non-octal character '%c'. Resolved as "%s" */
	151	"Non-octal character '%c'. Resolved as \"\\o{%.*s}\"",
	152	(s + numbers_len),
	153	(int) numbers_len,
	154	*s);
80f4111b	155	}
db30362b KW	156	}
db30362b KW	157
00ce5563 KW	158	/* Return past the '}' */
	159	*s = e + 1;
	160
db30362b KW	161	return TRUE;
	162	}
	163
ce54a8b9	164	bool
e8278639 KW	165	Perl_grok_bslash_x(pTHX_ char *s, const char const send, UV *uv,
e8278639 KW	166	const char** error_msg,
ce54a8b9 KW	167	const bool output_warning, const bool strict,
	168	const bool silence_non_portable,
	169	const bool UTF)
	170	{
5e0a247b	171
ce54a8b9 KW	172	/* Documentation to be supplied when interface nailed down finally
	173	* This returns FALSE if there is an error which the caller need not recover
	174	* from; otherwise TRUE.
	175	* It guarantees that the returned codepoint, *uv, when expressed as
	176	* utf8 bytes, would fit within the skipped "\x{...}" bytes.
	177	*
	178	* On input:
e8278639 KW	179	* s is the address of a pointer to a string. **s is 'x', and the
	180	* previous character was a backslash. At exit, *s will be advanced
	181	* to the byte just after those absorbed by this function. Hence the
	182	* caller can continue parsing from there. In the case of an error,
	183	* this routine has generally positioned *s to point just to the right
	184	* of the first bad spot, so that a message that has a "<--" to mark
	185	* the spot will be correctly positioned.
	186	* send - 1 gives a limit in *s that this function is not permitted to
	187	* look beyond. That is, the function may look at bytes only in the
	188	* range *s..send-1
ce54a8b9 KW	189	* uv points to a UV that will hold the output value, valid only if the
	190	* return from the function is TRUE
	191	* error_msg is a pointer that will be set to an internal buffer giving an
	192	* error message upon failure (the return is FALSE). Untouched if
	193	* function succeeds
	194	* output_warning says whether to output any warning messages, or suppress
	195	* them
	196	* strict is true if anything out of the ordinary should cause this to
	197	* fail instead of warn or be silent. For example, it requires
	198	* exactly 2 digits following the \x (when there are no braces).
	199	* 3 digits could be a mistake, so is forbidden in this mode.
	200	* silence_non_portable is true if to suppress warnings about the code
	201	* point returned being too large to fit on all platforms.
	202	* UTF is true iff the string *s is encoded in UTF-8.
	203	*/
	204	char* e;
	205	STRLEN numbers_len;
	206	I32 flags = PERL_SCAN_DISALLOW_PREFIX;
7f4ec488	207
5e0a247b	208
ce54a8b9	209	PERL_ARGS_ASSERT_GROK_BSLASH_X;
5e0a247b	210
7f4ec488 KW	211	assert((s - 1) == '\\');
7f4ec488 KW	212	assert(* *s == 'x');
ce54a8b9	213	(*s)++;
5e0a247b	214
ce54a8b9 KW	215	if (strict \|\| ! output_warning) {
ce54a8b9 KW	216	flags \|= PERL_SCAN_SILENT_ILLDIGIT;
5e0a247b KW	217	}
5e0a247b KW	218
ce54a8b9 KW	219	if (**s != '{') {
	220	STRLEN len = (strict) ? 3 : 2;
	221
	222	uv = grok_hex(s, &len, &flags, NULL);
	223	*s += len;
	224	if (strict && len != 2) {
	225	if (len < 2) {
	226	s += (UTF) ? UTF8SKIP(s) : 1;
	227	*error_msg = "Non-hex character";
	228	}
	229	else {
	230	*error_msg = "Use \\x{...} for more than two hex characters";
	231	}
	232	return FALSE;
	233	}
	234	return TRUE;
	235	}
	236
e8278639	237	e = (char ) memchr(s, '}', send - *s);
ce54a8b9 KW	238	if (!e) {
	239	(s)++; / Move past the '{' */
	240	while (isXDIGIT(*s)) { / Position beyond the legal digits */
	241	(*s)++;
	242	}
	243	/* XXX The corresponding message above for \o is just '\\o{'; other
	244	* messages for other constructs include the '}', so are inconsistent.
	245	*/
	246	*error_msg = "Missing right brace on \\x{}";
	247	return FALSE;
	248	}
	249
	250	(s)++; / Point to expected first digit (could be first byte of utf8
	251	sequence if not a digit) */
	252	numbers_len = e - *s;
	253	if (numbers_len == 0) {
	254	if (strict) {
	255	(s)++; / Move past the } */
	256	*error_msg = "Number with no digits";
	257	return FALSE;
	258	}
	259	*s = e + 1;
	260	*uv = 0;
	261	return TRUE;
	262	}
	263
	264	flags \|= PERL_SCAN_ALLOW_UNDERSCORES;
	265	if (silence_non_portable) {
	266	flags \|= PERL_SCAN_SILENT_NON_PORTABLE;
	267	}
	268
	269	uv = grok_hex(s, &numbers_len, &flags, NULL);
	270	/* Note that if has non-hex, will ignore everything starting with that up
	271	* to the '}' */
	272
	273	if (strict && numbers_len != (STRLEN) (e - *s)) {
	274	*s += numbers_len;
	275	s += (UTF) ? UTF8SKIP(s) : 1;
	276	*error_msg = "Non-hex character";
	277	return FALSE;
	278	}
	279
	280	/* Return past the '}' */
	281	*s = e + 1;
	282
	283	return TRUE;
5e0a247b KW	284	}
5e0a247b KW	285
04e98a4d	286	/*
14d04a33	287	* ex: set ts=8 sts=4 sw=4 et:
04e98a4d	288	*/