1/* dquote.c 2 * 3 * This file contains functions that are related to 4 * parsing double-quotish expressions. 5 * 6*/ 7 8#include "EXTERN.h" 9#define PERL_IN_DQUOTE_C 10#include "perl.h" 11 12/* XXX Add documentation after final interface and behavior is decided */ 13 14bool 15Perl_grok_bslash_c(pTHX_ const char source, 16 U8 * result, 17 const char** message, 18 U32 * packed_warn) 19{ 20 PERL_ARGS_ASSERT_GROK_BSLASH_C; 21 22 /* This returns TRUE if the \c? sequence is valid; FALSE otherwise. If it 23 * is valid, the sequence evaluates to a single character, which will be 24 * stored into *result. 25 * 26 * source is the character immediately after a '\c' sequence. 27 * result points to a char variable into which this function will store 28 * what the sequence evaluates to, if valid; unchanged otherwise. 29 * message A pointer to any warning or error message will be stored into 30 * this pointer; NULL if none. 31 * packed_warn if NULL on input asks that this routine display any warning 32 * messages. Otherwise, if the function found a warning, the 33 * packed warning categories will be stored into *packed_warn (and 34 * the corresponding message text into *message); 0 if none. 35 */ 36 37 *message = NULL; 38 if (packed_warn) *packed_warn = 0; 39 40 if (! isPRINT_A(source)) { 41 *message = "Character following \"\\c\" must be printable ASCII"; 42 return FALSE; 43 } 44 45 if (source == '{') { 46 const char control = toCTRL('{'); 47 if (isPRINT_A(control)) { 48 /* diag_listed_as: Use "%s" instead of "%s" */ 49 *message = Perl_form(aTHX_ PERL_DIAG_DIE_SYNTAX("Use \"%c\" instead of \"\\c{\""), control); 50 } 51 else { 52 *message = "Sequence \"\\c{\" invalid"; 53 } 54 return FALSE; 55 } 56 57 *result = toCTRL(source); 58 if (isPRINT_A(*result) && ckWARN(WARN_SYNTAX)) { 59 U8 clearer[3]; 60 U8 i = 0; 61 char format[] = PERL_DIAG_WARN_SYNTAX("\"\\c%c\" is more clearly written simply as \"%s\""); 62 63 if (! isWORDCHAR(*result)) { 64 clearer[i++] = '\\'; 65 } 66 clearer[i++] = *result; 67 clearer[i++] = '\0'; 68 69 if (packed_warn) { 70 *message = Perl_form(aTHX_ format, source, clearer); 71 *packed_warn = packWARN(WARN_SYNTAX); 72 } 73 else { 74 Perl_warner(aTHX_ packWARN(WARN_SYNTAX), format, source, clearer); 75 } 76 } 77 78 return TRUE; 79} 80 81const char * 82Perl_form_alien_digit_msg(pTHX_ 83 const U8 which, /* 8 or 16 */ 84 const STRLEN valids_len, /* length of input before first bad char */ 85 const char * const first_bad, /* Ptr to that bad char */ 86 const char * const send, /* End of input string */ 87 const bool UTF, /* Is it in UTF-8? */ 88 const bool braced) /* Is it enclosed in {} */ 89{ 90 /* Generate a mortal SV containing an appropriate warning message about 91 * alien characters found in an octal or hex constant given by the inputs, 92 * and return a pointer to that SV's string. The message looks like: 93 * 94 * Non-hex character '?' terminates \x early. Resolved as "\x{...}" 95 * 96 */ 97 98 /* The usual worst case scenario: 2 chars to display per byte, plus \x{} 99 * (leading zeros could take up more space, and the scalar will 100 * automatically grow if necessary). Space for NUL is added by the newSV() 101 * function */ 102 SV * display_char = newSV(2 * UTF8_MAXBYTES + 4); 103 SV * message_sv = sv_newmortal(); 104 char symbol; 105 106 PERL_ARGS_ASSERT_FORM_ALIEN_DIGIT_MSG; 107 assert(which == 8 || which == 16); 108 109 /* Calculate the display form of the character */ 110 if ( UVCHR_IS_INVARIANT(*first_bad) 111 || (UTF && isUTF8_CHAR((U8 *) first_bad, (U8 *) send))) 112 { 113 pv_uni_display(display_char, (U8 *) first_bad, UTF8SKIP(first_bad), 114 (STRLEN) -1, UNI_DISPLAY_QQ); 115 } 116 else { /* Is not UTF-8, or is illegal UTF-8. Show just the one byte */ 117 118 /* It also isn't a UTF-8 invariant character, so no display shortcuts 119 * are available. Use \\x{...} */ 120 Perl_sv_setpvf(aTHX_ display_char, "\\x{%02x}", *first_bad); 121 } 122 123 /* Ready to start building the message */ 124 sv_setpvs(message_sv, "Non-"); 125 if (which == 8) { 126 sv_catpvs(message_sv, "octal"); 127 if (braced) { 128 symbol = 'o'; 129 } 130 else { 131 symbol = '0'; /* \008, for example */ 132 } 133 } 134 else { 135 sv_catpvs(message_sv, "hex"); 136 symbol = 'x'; 137 } 138 sv_catpvs(message_sv, " character "); 139 140 if (isPRINT(*first_bad)) { 141 sv_catpvs(message_sv, "'"); 142 } 143 sv_catsv(message_sv, display_char); 144 if (isPRINT(*first_bad)) { 145 sv_catpvs(message_sv, "'"); 146 } 147 Perl_sv_catpvf(aTHX_ message_sv, " terminates \\%c early. Resolved as " 148 "\"\\%c", symbol, symbol); 149 if (braced) { 150 sv_catpvs(message_sv, "{"); 151 } 152 153 /* Octal constants have an extra leading 0, but \0 already includes that */ 154 if (symbol == 'o' && valids_len < 3) { 155 sv_catpvs(message_sv, "0"); 156 } 157 if (valids_len == 0) { /* No legal digits at all */ 158 sv_catpvs(message_sv, "00"); 159 } 160 else if (valids_len == 1) { /* Just one is legal */ 161 sv_catpvs(message_sv, "0"); 162 } 163 sv_catpvn(message_sv, first_bad - valids_len, valids_len); 164 165 if (braced) { 166 sv_catpvs(message_sv, "}"); 167 } 168 else { 169 sv_catsv(message_sv, display_char); 170 } 171 sv_catpvs(message_sv, "\""); 172 173 SvREFCNT_dec_NN(display_char); 174 175 return SvPVX_const(message_sv); 176} 177 178const char * 179Perl_form_cp_too_large_msg(pTHX_ 180 const U8 which, /* 8 or 16 */ 181 const char * string, /* NULL, or the text that is supposed to 182 represent a code point */ 183 const Size_t len, /* length of 'string' if not NULL; else 0 */ 184 const UV cp) /* 0 if 'string' not NULL; else the too-large 185 code point */ 186{ 187 /* Generate a mortal SV containing an appropriate warning message about 188 * code points that are too large for this system, given by the inputs, 189 * and return a pointer to that SV's string. Either the text of the string 190 * to be converted to a code point is input, or a code point itself. The 191 * former is needed to accurately represent something that overflows. 192 * 193 * The message looks like: 194 * 195 * Use of code point %s is not allowed; the permissible max is %s 196 * 197 */ 198 199 SV * message_sv = sv_newmortal(); 200 const char * format; 201 const char * prefix; 202 203 PERL_ARGS_ASSERT_FORM_CP_TOO_LARGE_MSG; 204 assert(which == 8 || which == 16); 205 206 /* One but not both must be non-zero */ 207 assert((string != NULL) ^ (cp != 0)); 208 assert((string == NULL) || len); 209 210 if (which == 8) { 211 format = "%" UVof; 212 prefix = "0"; 213 } 214 else { 215 format = "%" UVXf; 216 prefix = "0x"; 217 } 218 219 Perl_sv_setpvf(aTHX_ message_sv, "Use of code point %s", prefix); 220 if (string) { 221 Perl_sv_catpvf(aTHX_ message_sv, "%.*s", (int) len, string); 222 } 223 else { 224 Perl_sv_catpvf(aTHX_ message_sv, format, cp); 225 } 226 Perl_sv_catpvf(aTHX_ message_sv, " is not allowed; the permissible max is %s", prefix); 227 Perl_sv_catpvf(aTHX_ message_sv, format, MAX_LEGAL_CP); 228 229 return SvPVX_const(message_sv); 230} 231 232bool 233Perl_grok_bslash_o(pTHX_ char **s, const char * const send, UV *uv, 234 const char** message, 235 U32 * packed_warn, 236 const bool strict, 237 const bool allow_UV_MAX, 238 const bool UTF) 239{ 240 241/* Documentation to be supplied when interface nailed down finally 242 * This returns FALSE if there is an error the caller should probably die 243 * from; otherwise TRUE. 244 * s is the address of a pointer to a string. **s is 'o', and the 245 * previous character was a backslash. At exit, *s will be advanced 246 * to the byte just after those absorbed by this function. Hence the 247 * caller can continue parsing from there. In the case of an error 248 * when this function returns FALSE, continuing to parse is not an 249 * option, this routine has generally positioned *s to point just to 250 * the right of the first bad spot, so that a message that has a "<--" 251 * to mark the spot will be correctly positioned. 252 * send - 1 gives a limit in *s that this function is not permitted to 253 * look beyond. That is, the function may look at bytes only in the 254 * range *s..send-1 255 * uv points to a UV that will hold the output value, valid only if the 256 * return from the function is TRUE; may be changed from the input 257 * value even when FALSE is returned. 258 * message A pointer to any warning or error message will be stored into 259 * this pointer; NULL if none. 260 * packed_warn if NULL on input asks that this routine display any warning 261 * messages. Otherwise, if the function found a warning, the packed 262 * warning categories will be stored into *packed_warn (and the 263 * corresponding message text into *message); 0 if none. 264 * strict is true if this should fail instead of warn if there are 265 * non-octal digits within the braces 266 * allow_UV_MAX is true if this shouldn't fail if the input code point is 267 * UV_MAX, which is normally illegal, reserved for internal use. 268 * UTF is true iff the string *s is encoded in UTF-8. 269 */ 270 char * e; 271 char * rbrace; 272 STRLEN numbers_len; 273 STRLEN trailing_blanks_len = 0; 274 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES 275 | PERL_SCAN_DISALLOW_PREFIX 276 | PERL_SCAN_SILENT_NON_PORTABLE 277 | PERL_SCAN_SILENT_ILLDIGIT 278 | PERL_SCAN_SILENT_OVERFLOW; 279 280 PERL_ARGS_ASSERT_GROK_BSLASH_O; 281 282 assert(*(*s - 1) == '\\'); 283 assert(* *s == 'o'); 284 285 *message = NULL; 286 if (packed_warn) *packed_warn = 0; 287 288 (*s)++; 289 290 if (send <= *s || **s != '{') { 291 *message = "Missing braces on \\o{}"; 292 return FALSE; 293 } 294 295 rbrace = (char *) memchr(*s, '}', send - *s); 296 if (!rbrace) { 297 (*s)++; /* Move past the '{' */ 298 299 /* Position beyond the legal digits and blanks */ 300 while (*s < send && isBLANK(**s)) { 301 (*s)++; 302 } 303 304 while (*s < send && isOCTAL(**s)) { 305 (*s)++; 306 } 307 308 *message = "Missing right brace on \\o{}"; 309 return FALSE; 310 } 311 312 /* Point to expected first digit (could be first byte of utf8 sequence if 313 * not a digit) */ 314 (*s)++; 315 while (isBLANK(**s)) { 316 (*s)++; 317 } 318 319 e = rbrace; 320 while (*s < e && isBLANK(*(e - 1))) { 321 e--; 322 } 323 324 numbers_len = e - *s; 325 if (numbers_len == 0) { 326 (*s)++; /* Move past the '}' */ 327 *message = "Empty \\o{}"; 328 return FALSE; 329 } 330 331 *uv = grok_oct(*s, &numbers_len, &flags, NULL); 332 if (UNLIKELY( (flags & PERL_SCAN_GREATER_THAN_UV_MAX) 333 || (! allow_UV_MAX && *uv == UV_MAX))) 334 { 335 *message = form_cp_too_large_msg(8, *s, numbers_len, 0); 336 *s = rbrace + 1; 337 return FALSE; 338 } 339 340 while (isBLANK(**s)) { 341 trailing_blanks_len++; 342 (*s)++; 343 } 344 345 /* Note that if has non-octal, will ignore everything starting with that up 346 * to the '}' */ 347 if (numbers_len + trailing_blanks_len != (STRLEN) (e - *s)) { 348 *s += numbers_len; 349 if (strict) { 350 *s += (UTF) ? UTF8_SAFE_SKIP(*s, send) : 1; 351 *message = "Non-octal character"; 352 return FALSE; 353 } 354 355 if (ckWARN(WARN_DIGIT)) { 356 const char * failure = form_alien_digit_msg(8, numbers_len, *s, send, 357 UTF, TRUE); 358 if (packed_warn) { 359 *message = failure; 360 *packed_warn = packWARN(WARN_DIGIT); 361 } 362 else { 363 Perl_warner(aTHX_ packWARN(WARN_DIGIT), "%s", failure); 364 } 365 } 366 } 367 368 /* Return past the '}' */ 369 *s = rbrace + 1; 370 371 return TRUE; 372} 373 374bool 375Perl_grok_bslash_x(pTHX_ char ** s, const char * const send, UV *uv, 376 const char** message, 377 U32 * packed_warn, 378 const bool strict, 379 const bool allow_UV_MAX, 380 const bool UTF) 381{ 382 383/* Documentation to be supplied when interface nailed down finally 384 * This returns FALSE if there is an error the caller should probably die 385 * from; otherwise TRUE. 386 * It guarantees that the returned codepoint, *uv, when expressed as 387 * utf8 bytes, would fit within the skipped "\x{...}" bytes. 388 * 389 * On input: 390 * s is the address of a pointer to a string. **s is 'x', and the 391 * previous character was a backslash. At exit, *s will be advanced 392 * to the byte just after those absorbed by this function. Hence the 393 * caller can continue parsing from there. In the case of an error, 394 * this routine has generally positioned *s to point just to the right 395 * of the first bad spot, so that a message that has a "<--" to mark 396 * the spot will be correctly positioned. 397 * send - 1 gives a limit in *s that this function is not permitted to 398 * look beyond. That is, the function may look at bytes only in the 399 * range *s..send-1 400 * uv points to a UV that will hold the output value, valid only if the 401 * return from the function is TRUE; may be changed from the input 402 * value even when FALSE is returned. 403 * message A pointer to any warning or error message will be stored into 404 * this pointer; NULL if none. 405 * packed_warn if NULL on input asks that this routine display any warning 406 * messages. Otherwise, if the function found a warning, the packed 407 * warning categories will be stored into *packed_warn (and the 408 * corresponding message text into *message); 0 if none. 409 * strict is true if anything out of the ordinary should cause this to 410 * fail instead of warn or be silent. For example, it requires 411 * exactly 2 digits following the \x (when there are no braces). 412 * 3 digits could be a mistake, so is forbidden in this mode. 413 * allow_UV_MAX is true if this shouldn't fail if the input code point is 414 * UV_MAX, which is normally illegal, reserved for internal use. 415 * UTF is true iff the string *s is encoded in UTF-8. 416 */ 417 char* e; 418 char * rbrace; 419 STRLEN numbers_len; 420 STRLEN trailing_blanks_len = 0; 421 I32 flags = PERL_SCAN_DISALLOW_PREFIX 422 | PERL_SCAN_SILENT_ILLDIGIT 423 | PERL_SCAN_NOTIFY_ILLDIGIT 424 | PERL_SCAN_SILENT_NON_PORTABLE 425 | PERL_SCAN_SILENT_OVERFLOW; 426 427 PERL_ARGS_ASSERT_GROK_BSLASH_X; 428 429 assert(*(*s - 1) == '\\'); 430 assert(* *s == 'x'); 431 432 *message = NULL; 433 if (packed_warn) *packed_warn = 0; 434 435 (*s)++; 436 437 if (send <= *s) { 438 if (strict) { 439 *message = "Empty \\x"; 440 return FALSE; 441 } 442 443 /* Sadly, to preserve backcompat, an empty \x at the end of string is 444 * interpreted as a NUL */ 445 *uv = 0; 446 return TRUE; 447 } 448 449 if (**s != '{') { 450 numbers_len = (strict) ? 3 : 2; 451 452 *uv = grok_hex(*s, &numbers_len, &flags, NULL); 453 *s += numbers_len; 454 455 if (numbers_len != 2 && (strict || (flags & PERL_SCAN_NOTIFY_ILLDIGIT))) { 456 if (numbers_len == 3) { /* numbers_len 3 only happens with strict */ 457 *message = "Use \\x{...} for more than two hex characters"; 458 return FALSE; 459 } 460 else if (strict) { 461 *s += (UTF) ? UTF8_SAFE_SKIP(*s, send) : 1; 462 *message = "Non-hex character"; 463 return FALSE; 464 } 465 else if (ckWARN(WARN_DIGIT)) { 466 const char * failure = form_alien_digit_msg(16, numbers_len, *s, 467 send, UTF, FALSE); 468 469 if (! packed_warn) { 470 Perl_warner(aTHX_ packWARN(WARN_DIGIT), "%s", failure); 471 } 472 else { 473 *message = failure; 474 *packed_warn = packWARN(WARN_DIGIT); 475 } 476 } 477 } 478 return TRUE; 479 } 480 481 rbrace = (char *) memchr(*s, '}', send - *s); 482 if (!rbrace) { 483 (*s)++; /* Move past the '{' */ 484 485 /* Position beyond legal blanks and digits */ 486 while (*s < send && isBLANK(**s)) { 487 (*s)++; 488 } 489 490 while (*s < send && isXDIGIT(**s)) { 491 (*s)++; 492 } 493 494 *message = "Missing right brace on \\x{}"; 495 return FALSE; 496 } 497 498 (*s)++; /* Point to expected first digit (could be first byte of utf8 499 sequence if not a digit) */ 500 while (isBLANK(**s)) { 501 (*s)++; 502 } 503 504 e = rbrace; 505 while (*s < e && isBLANK(*(e - 1))) { 506 e--; 507 } 508 509 numbers_len = e - *s; 510 if (numbers_len == 0) { 511 if (strict) { 512 (*s)++; /* Move past the } */ 513 *message = "Empty \\x{}"; 514 return FALSE; 515 } 516 *s = rbrace + 1; 517 *uv = 0; 518 return TRUE; 519 } 520 521 flags |= PERL_SCAN_ALLOW_UNDERSCORES; 522 523 *uv = grok_hex(*s, &numbers_len, &flags, NULL); 524 if (UNLIKELY( (flags & PERL_SCAN_GREATER_THAN_UV_MAX) 525 || (! allow_UV_MAX && *uv == UV_MAX))) 526 { 527 *message = form_cp_too_large_msg(16, *s, numbers_len, 0); 528 *s = e + 1; 529 return FALSE; 530 } 531 532 while (isBLANK(**s)) { 533 trailing_blanks_len++; 534 (*s)++; 535 } 536 537 if (numbers_len + trailing_blanks_len != (STRLEN) (e - *s)) { 538 *s += numbers_len; 539 if (strict) { 540 *s += (UTF) ? UTF8_SAFE_SKIP(*s, send) : 1; 541 *message = "Non-hex character"; 542 return FALSE; 543 } 544 545 if (ckWARN(WARN_DIGIT)) { 546 const char * failure = form_alien_digit_msg(16, numbers_len, *s, 547 send, UTF, TRUE); 548 if (! packed_warn) { 549 Perl_warner(aTHX_ packWARN(WARN_DIGIT), "%s", failure); 550 } 551 else { 552 *message = failure; 553 *packed_warn = packWARN(WARN_DIGIT); 554 } 555 } 556 } 557 558 /* Return past the '}' */ 559 *s = rbrace + 1; 560 561 return TRUE; 562} 563 564/* 565 * ex: set ts=8 sts=4 sw=4 et: 566 */ 567