1/*    dquote.c
2 *
3 * This file contains functions that are related to
4 * parsing double-quotish expressions.
5 *
6*/
7
8#include "EXTERN.h"
9#define PERL_IN_DQUOTE_C
10#include "perl.h"
11
12/* XXX Add documentation after final interface and behavior is decided */
13
14bool
15Perl_grok_bslash_c(pTHX_ const char   source,
16                         U8 *         result,
17                         const char** message,
18                         U32 *        packed_warn)
19{
20    PERL_ARGS_ASSERT_GROK_BSLASH_C;
21
22    /* This returns TRUE if the \c? sequence is valid; FALSE otherwise.  If it
23     * is valid, the sequence evaluates to a single character, which will be
24     * stored into *result.
25     *
26     * source   is the character immediately after a '\c' sequence.
27     * result   points to a char variable into which this function will store
28     *          what the sequence evaluates to, if valid; unchanged otherwise.
29     * message  A pointer to any warning or error message will be stored into
30     *          this pointer; NULL if none.
31     * packed_warn if NULL on input asks that this routine display any warning
32     *          messages.  Otherwise, if the function found a warning, the
33     *          packed warning categories will be stored into *packed_warn (and
34     *          the corresponding message text into *message); 0 if none.
35     */
36
37    *message = NULL;
38    if (packed_warn) *packed_warn = 0;
39
40    if (! isPRINT_A(source)) {
41        *message = "Character following \"\\c\" must be printable ASCII";
42        return FALSE;
43    }
44
45    if (source == '{') {
46        const char control = toCTRL('{');
47        if (isPRINT_A(control)) {
48            /* diag_listed_as: Use "%s" instead of "%s" */
49            *message = Perl_form(aTHX_ PERL_DIAG_DIE_SYNTAX("Use \"%c\" instead of \"\\c{\""), control);
50        }
51        else {
52            *message = "Sequence \"\\c{\" invalid";
53        }
54        return FALSE;
55    }
56
57    *result = toCTRL(source);
58    if (isPRINT_A(*result) && ckWARN(WARN_SYNTAX)) {
59        U8 clearer[3];
60        U8 i = 0;
61        char format[] = PERL_DIAG_WARN_SYNTAX("\"\\c%c\" is more clearly written simply as \"%s\"");
62
63        if (! isWORDCHAR(*result)) {
64            clearer[i++] = '\\';
65        }
66        clearer[i++] = *result;
67        clearer[i++] = '\0';
68
69        if (packed_warn) {
70            *message = Perl_form(aTHX_ format, source, clearer);
71            *packed_warn = packWARN(WARN_SYNTAX);
72        }
73        else {
74            Perl_warner(aTHX_ packWARN(WARN_SYNTAX), format, source, clearer);
75        }
76    }
77
78    return TRUE;
79}
80
81const char *
82Perl_form_alien_digit_msg(pTHX_
83        const U8 which,           /* 8 or 16 */
84        const STRLEN valids_len,  /* length of input before first bad char */
85        const char * const first_bad, /* Ptr to that bad char */
86        const char * const send,      /* End of input string */
87        const bool UTF,               /* Is it in UTF-8? */
88        const bool braced)            /* Is it enclosed in {} */
89{
90    /* Generate a mortal SV containing an appropriate warning message about
91     * alien characters found in an octal or hex constant given by the inputs,
92     * and return a pointer to that SV's string.  The message looks like:
93     *
94     * Non-hex character '?' terminates \x early.  Resolved as "\x{...}"
95     *
96     */
97
98    /* The usual worst case scenario: 2 chars to display per byte, plus \x{}
99     * (leading zeros could take up more space, and the scalar will
100     * automatically grow if necessary).  Space for NUL is added by the newSV()
101     * function */
102    SV * display_char = newSV(2 * UTF8_MAXBYTES + 4);
103    SV * message_sv = sv_newmortal();
104    char symbol;
105
106    PERL_ARGS_ASSERT_FORM_ALIEN_DIGIT_MSG;
107    assert(which == 8 || which == 16);
108
109    /* Calculate the display form of the character */
110    if (    UVCHR_IS_INVARIANT(*first_bad)
111        || (UTF && isUTF8_CHAR((U8 *) first_bad, (U8 *) send)))
112    {
113        pv_uni_display(display_char, (U8 *) first_bad, UTF8SKIP(first_bad),
114                                                    (STRLEN) -1, UNI_DISPLAY_QQ);
115    }
116    else {  /* Is not UTF-8, or is illegal UTF-8.  Show just the one byte */
117
118        /* It also isn't a UTF-8 invariant character, so no display shortcuts
119         * are available.  Use \\x{...} */
120        Perl_sv_setpvf(aTHX_ display_char, "\\x{%02x}", *first_bad);
121    }
122
123    /* Ready to start building the message */
124    sv_setpvs(message_sv, "Non-");
125    if (which == 8) {
126        sv_catpvs(message_sv, "octal");
127        if (braced) {
128            symbol = 'o';
129        }
130        else {
131            symbol = '0';   /* \008, for example */
132        }
133    }
134    else {
135        sv_catpvs(message_sv, "hex");
136        symbol = 'x';
137    }
138    sv_catpvs(message_sv, " character ");
139
140    if (isPRINT(*first_bad)) {
141        sv_catpvs(message_sv, "'");
142    }
143    sv_catsv(message_sv, display_char);
144    if (isPRINT(*first_bad)) {
145        sv_catpvs(message_sv, "'");
146    }
147    Perl_sv_catpvf(aTHX_ message_sv, " terminates \\%c early.  Resolved as "
148                                     "\"\\%c", symbol, symbol);
149    if (braced) {
150        sv_catpvs(message_sv, "{");
151    }
152
153    /* Octal constants have an extra leading 0, but \0 already includes that */
154    if (symbol == 'o' && valids_len < 3) {
155        sv_catpvs(message_sv, "0");
156    }
157    if (valids_len == 0) {  /* No legal digits at all */
158        sv_catpvs(message_sv, "00");
159    }
160    else if (valids_len == 1) { /* Just one is legal */
161        sv_catpvs(message_sv, "0");
162    }
163    sv_catpvn(message_sv, first_bad - valids_len, valids_len);
164
165    if (braced) {
166        sv_catpvs(message_sv, "}");
167    }
168    else {
169        sv_catsv(message_sv, display_char);
170    }
171    sv_catpvs(message_sv, "\"");
172
173    SvREFCNT_dec_NN(display_char);
174
175    return SvPVX_const(message_sv);
176}
177
178const char *
179Perl_form_cp_too_large_msg(pTHX_
180        const U8 which,        /* 8 or 16 */
181        const char * string,   /* NULL, or the text that is supposed to
182                                  represent a code point */
183        const Size_t len,      /* length of 'string' if not NULL; else 0 */
184        const UV cp)           /* 0 if 'string' not NULL; else the too-large
185                                  code point */
186{
187    /* Generate a mortal SV containing an appropriate warning message about
188     * code points that are too large for this system, given by the inputs,
189     * and return a pointer to that SV's string.  Either the text of the string
190     * to be converted to a code point is input, or a code point itself.  The
191     * former is needed to accurately represent something that overflows.
192     *
193     * The message looks like:
194     *
195     * Use of code point %s is not allowed; the permissible max is %s
196     *
197     */
198
199    SV * message_sv = sv_newmortal();
200    const char * format;
201    const char * prefix;
202
203    PERL_ARGS_ASSERT_FORM_CP_TOO_LARGE_MSG;
204    assert(which == 8 || which == 16);
205
206    /* One but not both must be non-zero */
207    assert((string != NULL) ^ (cp != 0));
208    assert((string == NULL) || len);
209
210    if (which == 8) {
211        format = "%" UVof;
212        prefix = "0";
213    }
214    else {
215        format = "%" UVXf;
216        prefix = "0x";
217    }
218
219    Perl_sv_setpvf(aTHX_ message_sv, "Use of code point %s", prefix);
220    if (string) {
221        Perl_sv_catpvf(aTHX_ message_sv, "%.*s", (int) len, string);
222    }
223    else {
224        Perl_sv_catpvf(aTHX_ message_sv, format, cp);
225    }
226    Perl_sv_catpvf(aTHX_ message_sv, " is not allowed; the permissible max is %s", prefix);
227    Perl_sv_catpvf(aTHX_ message_sv, format, MAX_LEGAL_CP);
228
229    return SvPVX_const(message_sv);
230}
231
232bool
233Perl_grok_bslash_o(pTHX_ char **s, const char * const send, UV *uv,
234                      const char** message,
235                      U32 *      packed_warn,
236                      const bool strict,
237                      const bool allow_UV_MAX,
238                      const bool UTF)
239{
240
241/*  Documentation to be supplied when interface nailed down finally
242 *  This returns FALSE if there is an error the caller should probably die
243 *  from; otherwise TRUE.
244 *	s   is the address of a pointer to a string.  **s is 'o', and the
245 *	    previous character was a backslash.  At exit, *s will be advanced
246 *	    to the byte just after those absorbed by this function.  Hence the
247 *	    caller can continue parsing from there.  In the case of an error
248 *	    when this function returns FALSE, continuing to parse is not an
249 *	    option, this routine has generally positioned *s to point just to
250 *	    the right of the first bad spot, so that a message that has a "<--"
251 *	    to mark the spot will be correctly positioned.
252 *	send - 1  gives a limit in *s that this function is not permitted to
253 *	    look beyond.  That is, the function may look at bytes only in the
254 *	    range *s..send-1
255 *	uv  points to a UV that will hold the output value, valid only if the
256 *	    return from the function is TRUE; may be changed from the input
257 *	    value even when FALSE is returned.
258 *      message  A pointer to any warning or error message will be stored into
259 *          this pointer; NULL if none.
260 *      packed_warn if NULL on input asks that this routine display any warning
261 *          messages.  Otherwise, if the function found a warning, the packed
262 *          warning categories will be stored into *packed_warn (and the
263 *          corresponding message text into *message); 0 if none.
264 *	strict is true if this should fail instead of warn if there are
265 *	    non-octal digits within the braces
266 *      allow_UV_MAX is true if this shouldn't fail if the input code point is
267 *          UV_MAX, which is normally illegal, reserved for internal use.
268 *	UTF is true iff the string *s is encoded in UTF-8.
269 */
270    char * e;
271    char * rbrace;
272    STRLEN numbers_len;
273    STRLEN trailing_blanks_len = 0;
274    I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
275              | PERL_SCAN_DISALLOW_PREFIX
276              | PERL_SCAN_SILENT_NON_PORTABLE
277              | PERL_SCAN_SILENT_ILLDIGIT
278              | PERL_SCAN_SILENT_OVERFLOW;
279
280    PERL_ARGS_ASSERT_GROK_BSLASH_O;
281
282    assert(*(*s - 1) == '\\');
283    assert(* *s       == 'o');
284
285    *message = NULL;
286    if (packed_warn) *packed_warn = 0;
287
288    (*s)++;
289
290    if (send <= *s || **s != '{') {
291        *message = "Missing braces on \\o{}";
292        return FALSE;
293    }
294
295    rbrace = (char *) memchr(*s, '}', send - *s);
296    if (!rbrace) {
297        (*s)++;  /* Move past the '{' */
298
299        /* Position beyond the legal digits and blanks */
300        while (*s < send && isBLANK(**s)) {
301            (*s)++;
302        }
303
304        while (*s < send && isOCTAL(**s)) {
305            (*s)++;
306        }
307
308        *message = "Missing right brace on \\o{}";
309        return FALSE;
310    }
311
312    /* Point to expected first digit (could be first byte of utf8 sequence if
313     * not a digit) */
314    (*s)++;
315    while (isBLANK(**s)) {
316        (*s)++;
317    }
318
319    e = rbrace;
320    while (*s < e && isBLANK(*(e - 1))) {
321        e--;
322    }
323
324    numbers_len = e - *s;
325    if (numbers_len == 0) {
326        (*s)++;    /* Move past the '}' */
327        *message = "Empty \\o{}";
328        return FALSE;
329    }
330
331    *uv = grok_oct(*s, &numbers_len, &flags, NULL);
332    if (UNLIKELY(   (flags & PERL_SCAN_GREATER_THAN_UV_MAX)
333                 || (! allow_UV_MAX && *uv == UV_MAX)))
334    {
335        *message = form_cp_too_large_msg(8, *s, numbers_len, 0);
336        *s = rbrace + 1;
337        return FALSE;
338    }
339
340    while (isBLANK(**s)) {
341        trailing_blanks_len++;
342        (*s)++;
343    }
344
345    /* Note that if has non-octal, will ignore everything starting with that up
346     * to the '}' */
347    if (numbers_len + trailing_blanks_len != (STRLEN) (e - *s)) {
348        *s += numbers_len;
349        if (strict) {
350            *s += (UTF) ? UTF8_SAFE_SKIP(*s, send) : 1;
351            *message = "Non-octal character";
352            return FALSE;
353        }
354
355        if (ckWARN(WARN_DIGIT)) {
356            const char * failure = form_alien_digit_msg(8, numbers_len, *s, send,
357                                                                      UTF, TRUE);
358            if (packed_warn) {
359                *message = failure;
360                *packed_warn = packWARN(WARN_DIGIT);
361            }
362            else {
363                Perl_warner(aTHX_ packWARN(WARN_DIGIT), "%s", failure);
364            }
365        }
366    }
367
368    /* Return past the '}' */
369    *s = rbrace + 1;
370
371    return TRUE;
372}
373
374bool
375Perl_grok_bslash_x(pTHX_ char ** s, const char * const send, UV *uv,
376                      const char** message,
377                      U32 *      packed_warn,
378                      const bool strict,
379                      const bool allow_UV_MAX,
380                      const bool UTF)
381{
382
383/*  Documentation to be supplied when interface nailed down finally
384 *  This returns FALSE if there is an error the caller should probably die
385 *  from; otherwise TRUE.
386 *  It guarantees that the returned codepoint, *uv, when expressed as
387 *  utf8 bytes, would fit within the skipped "\x{...}" bytes.
388 *
389 *  On input:
390 *	s   is the address of a pointer to a string.  **s is 'x', and the
391 *	    previous character was a backslash.  At exit, *s will be advanced
392 *	    to the byte just after those absorbed by this function.  Hence the
393 *	    caller can continue parsing from there.  In the case of an error,
394 *	    this routine has generally positioned *s to point just to the right
395 *	    of the first bad spot, so that a message that has a "<--" to mark
396 *	    the spot will be correctly positioned.
397 *	send - 1  gives a limit in *s that this function is not permitted to
398 *	    look beyond.  That is, the function may look at bytes only in the
399 *	    range *s..send-1
400 *	uv  points to a UV that will hold the output value, valid only if the
401 *	    return from the function is TRUE; may be changed from the input
402 *	    value even when FALSE is returned.
403 *      message  A pointer to any warning or error message will be stored into
404 *          this pointer; NULL if none.
405 *      packed_warn if NULL on input asks that this routine display any warning
406 *          messages.  Otherwise, if the function found a warning, the packed
407 *          warning categories will be stored into *packed_warn (and the
408 *          corresponding message text into *message); 0 if none.
409 *	strict is true if anything out of the ordinary should cause this to
410 *	    fail instead of warn or be silent.  For example, it requires
411 *	    exactly 2 digits following the \x (when there are no braces).
412 *	    3 digits could be a mistake, so is forbidden in this mode.
413 *      allow_UV_MAX is true if this shouldn't fail if the input code point is
414 *          UV_MAX, which is normally illegal, reserved for internal use.
415 *	UTF is true iff the string *s is encoded in UTF-8.
416 */
417    char* e;
418    char * rbrace;
419    STRLEN numbers_len;
420    STRLEN trailing_blanks_len = 0;
421    I32 flags = PERL_SCAN_DISALLOW_PREFIX
422              | PERL_SCAN_SILENT_ILLDIGIT
423              | PERL_SCAN_NOTIFY_ILLDIGIT
424              | PERL_SCAN_SILENT_NON_PORTABLE
425              | PERL_SCAN_SILENT_OVERFLOW;
426
427    PERL_ARGS_ASSERT_GROK_BSLASH_X;
428
429    assert(*(*s - 1) == '\\');
430    assert(* *s      == 'x');
431
432    *message = NULL;
433    if (packed_warn) *packed_warn = 0;
434
435    (*s)++;
436
437    if (send <= *s) {
438        if (strict) {
439            *message = "Empty \\x";
440            return FALSE;
441        }
442
443        /* Sadly, to preserve backcompat, an empty \x at the end of string is
444         * interpreted as a NUL */
445        *uv = 0;
446        return TRUE;
447    }
448
449    if (**s != '{') {
450        numbers_len = (strict) ? 3 : 2;
451
452        *uv = grok_hex(*s, &numbers_len, &flags, NULL);
453        *s += numbers_len;
454
455        if (numbers_len != 2 && (strict || (flags & PERL_SCAN_NOTIFY_ILLDIGIT))) {
456            if (numbers_len == 3) { /* numbers_len 3 only happens with strict */
457                *message = "Use \\x{...} for more than two hex characters";
458                return FALSE;
459            }
460            else if (strict) {
461                    *s += (UTF) ? UTF8_SAFE_SKIP(*s, send) : 1;
462                    *message = "Non-hex character";
463                    return FALSE;
464            }
465            else if (ckWARN(WARN_DIGIT)) {
466                const char * failure = form_alien_digit_msg(16, numbers_len, *s,
467                                                              send, UTF, FALSE);
468
469                if (! packed_warn) {
470                    Perl_warner(aTHX_ packWARN(WARN_DIGIT), "%s", failure);
471                }
472                else {
473                    *message = failure;
474                    *packed_warn = packWARN(WARN_DIGIT);
475                }
476            }
477        }
478        return TRUE;
479    }
480
481    rbrace = (char *) memchr(*s, '}', send - *s);
482    if (!rbrace) {
483        (*s)++;  /* Move past the '{' */
484
485        /* Position beyond legal blanks and digits */
486        while (*s < send && isBLANK(**s)) {
487            (*s)++;
488        }
489
490        while (*s < send && isXDIGIT(**s)) {
491            (*s)++;
492        }
493
494        *message = "Missing right brace on \\x{}";
495        return FALSE;
496    }
497
498    (*s)++;    /* Point to expected first digit (could be first byte of utf8
499                  sequence if not a digit) */
500    while (isBLANK(**s)) {
501        (*s)++;
502    }
503
504    e = rbrace;
505    while (*s < e && isBLANK(*(e - 1))) {
506        e--;
507    }
508
509    numbers_len = e - *s;
510    if (numbers_len == 0) {
511        if (strict) {
512            (*s)++;    /* Move past the } */
513            *message = "Empty \\x{}";
514            return FALSE;
515        }
516        *s = rbrace + 1;
517        *uv = 0;
518        return TRUE;
519    }
520
521    flags |= PERL_SCAN_ALLOW_UNDERSCORES;
522
523    *uv = grok_hex(*s, &numbers_len, &flags, NULL);
524    if (UNLIKELY(   (flags & PERL_SCAN_GREATER_THAN_UV_MAX)
525                 || (! allow_UV_MAX && *uv == UV_MAX)))
526    {
527        *message = form_cp_too_large_msg(16, *s, numbers_len, 0);
528        *s = e + 1;
529        return FALSE;
530    }
531
532    while (isBLANK(**s)) {
533        trailing_blanks_len++;
534        (*s)++;
535    }
536
537    if (numbers_len + trailing_blanks_len != (STRLEN) (e - *s)) {
538        *s += numbers_len;
539        if (strict) {
540            *s += (UTF) ? UTF8_SAFE_SKIP(*s, send) : 1;
541            *message = "Non-hex character";
542            return FALSE;
543        }
544
545        if (ckWARN(WARN_DIGIT)) {
546            const char * failure = form_alien_digit_msg(16, numbers_len, *s,
547                                                                send, UTF, TRUE);
548            if (! packed_warn) {
549                Perl_warner(aTHX_ packWARN(WARN_DIGIT), "%s", failure);
550            }
551            else {
552                *message = failure;
553                *packed_warn = packWARN(WARN_DIGIT);
554            }
555        }
556    }
557
558    /* Return past the '}' */
559    *s = rbrace + 1;
560
561    return TRUE;
562}
563
564/*
565 * ex: set ts=8 sts=4 sw=4 et:
566 */
567