1/* 2 * pattern.c - pattern matching 3 * 4 * This file is part of zsh, the Z shell. 5 * 6 * Copyright (c) 1999 Peter Stephenson 7 * All rights reserved. 8 * 9 * Permission is hereby granted, without written agreement and without 10 * license or royalty fees, to use, copy, modify, and distribute this 11 * software and to distribute modified versions of this software for any 12 * purpose, provided that the above copyright notice and the following 13 * two paragraphs appear in all copies of this software. 14 * 15 * In no event shall Peter Stephenson or the Zsh Development Group be liable 16 * to any party for direct, indirect, special, incidental, or consequential 17 * damages arising out of the use of this software and its documentation, 18 * even if Peter Stephenson and the Zsh Development Group have been advised of 19 * the possibility of such damage. 20 * 21 * Peter Stephenson and the Zsh Development Group specifically disclaim any 22 * warranties, including, but not limited to, the implied warranties of 23 * merchantability and fitness for a particular purpose. The software 24 * provided hereunder is on an "as is" basis, and Peter Stephenson and the 25 * Zsh Development Group have no obligation to provide maintenance, 26 * support, updates, enhancements, or modifications. 27 * 28 * Pattern matching code derived from the regexp library by Henry 29 * Spencer, which has the following copyright. 30 * 31 * Copyright (c) 1986 by University of Toronto. 32 * Written by Henry Spencer. Not derived from licensed software. 33 * 34 * Permission is granted to anyone to use this software for any 35 * purpose on any computer system, and to redistribute it freely, 36 * subject to the following restrictions: 37 * 38 * 1. The author is not responsible for the consequences of use of 39 * this software, no matter how awful, even if they arise 40 * from defects in it. 41 * 42 * 2. The origin of this software must not be misrepresented, either 43 * by explicit claim or by omission. 44 * 45 * 3. Altered versions must be plainly marked as such, and must not 46 * be misrepresented as being the original software. 47 * 48 * Eagle-eyed readers will notice this is an altered version. Incredibly 49 * sharp-eyed readers might even find bits that weren't altered. 50 * 51 * 52 * And I experienced a sense that, like certain regular 53 * expressions, seemed to match the day from beginning to end, so 54 * that I did not need to identify the parenthesised subexpression 55 * that told of dawn, nor the group of characters that indicated 56 * the moment when my grandfather returned home with news of 57 * Swann's departure for Paris; and the whole length of the month 58 * of May, as if matched by a closure, fitted into the buffer of my 59 * life with no sign of overflowing, turning the days, like a 60 * procession of insects that could consist of this or that 61 * species, into a random and unstructured repetition of different 62 * sequences, anchored from the first day of the month to the last 63 * in the same fashion as the weeks when I knew I would not see 64 * Gilberte and would search in vain for any occurrences of the 65 * string in the avenue of hawthorns by Tansonville, without my 66 * having to delimit explicitly the start or finish of the pattern. 67 * 68 * M. Proust, "In Search of Lost Files", 69 * bk I, "The Walk by Bourne's Place". 70 */ 71 72#include "zsh.mdh" 73 74/* 75 * The following union is used mostly for alignment purposes. 76 * Normal nodes are longs, while certain nodes take a char * as an argument; 77 * here we make sure that they both work out to the same length. 78 * The compiled regexp we construct consists of upats stuck together; 79 * anything else to be added (strings, numbers) is stuck after and 80 * then aligned to a whole number of upat units. 81 * 82 * Note also that offsets are in terms of the sizes of these things. 83 */ 84union upat { 85 long l; 86 unsigned char *p; 87}; 88 89typedef union upat *Upat; 90 91#include "pattern.pro" 92 93/* Number of active parenthesized expressions allowed in backreferencing */ 94#define NSUBEXP 9 95 96/* definition number opnd? meaning */ 97#define P_END 0x00 /* no End of program. */ 98#define P_EXCSYNC 0x01 /* no Test if following exclude already failed */ 99#define P_EXCEND 0x02 /* no Test if exclude matched orig branch */ 100#define P_BACK 0x03 /* no Match "", "next" ptr points backward. */ 101#define P_EXACTLY 0x04 /* lstr Match this string. */ 102#define P_NOTHING 0x05 /* no Match empty string. */ 103#define P_ONEHASH 0x06 /* node Match this (simple) thing 0 or more times. */ 104#define P_TWOHASH 0x07 /* node Match this (simple) thing 1 or more times. */ 105#define P_GFLAGS 0x08 /* long Match nothing and set globbing flags */ 106#define P_ISSTART 0x09 /* no Match start of string. */ 107#define P_ISEND 0x0a /* no Match end of string. */ 108#define P_COUNTSTART 0x0b /* no Initialise P_COUNT */ 109#define P_COUNT 0x0c /* 3*long uc* node Match a number of repetitions */ 110/* numbered so we can test bit 5 for a branch */ 111#define P_BRANCH 0x20 /* node Match this alternative, or the next... */ 112#define P_WBRANCH 0x21 /* uc* node P_BRANCH, but match at least 1 char */ 113/* excludes are also branches, but have bit 4 set, too */ 114#define P_EXCLUDE 0x30 /* uc* node Exclude this from previous branch */ 115#define P_EXCLUDP 0x31 /* uc* node Exclude, using full file path so far */ 116/* numbered so we can test bit 6 so as not to match initial '.' */ 117#define P_ANY 0x40 /* no Match any one character. */ 118#define P_ANYOF 0x41 /* str Match any character in this string. */ 119#define P_ANYBUT 0x42 /* str Match any character not in this string. */ 120#define P_STAR 0x43 /* no Match any set of characters. */ 121#define P_NUMRNG 0x44 /* zr, zr Match a numeric range. */ 122#define P_NUMFROM 0x45 /* zr Match a number >= X */ 123#define P_NUMTO 0x46 /* zr Match a number <= X */ 124#define P_NUMANY 0x47 /* no Match any set of decimal digits */ 125/* spaces left for P_OPEN+n,... for backreferences */ 126#define P_OPEN 0x80 /* no Mark this point in input as start of n. */ 127#define P_CLOSE 0x90 /* no Analogous to OPEN. */ 128/* 129 * no no argument 130 * zr the range type zrange_t: may be zlong or unsigned long 131 * char a single char 132 * uc* a pointer to unsigned char, used at run time and initialised 133 * to NULL. 134 * str null-terminated, metafied string 135 * lstr length as long then string, not null-terminated, unmetafied. 136 */ 137 138/* 139 * Notes on usage: 140 * P_WBRANCH: This works like a branch and is used in complex closures, 141 * to ensure we don't succeed on a zero-length match of the pattern, 142 * since that would cause an infinite loop. We do this by recording 143 * the positions where we have already tried to match. See the 144 * P_WBRANCH test in patmatch(). 145 * 146 * P_ANY, P_ANYOF: the operand is a null terminated 147 * string. Normal characters match as expected. Characters 148 * in the range Meta+PP_ALPHA..Meta+PP_UNKNWN do the appropriate 149 * Posix range tests. This relies on imeta returning true for these 150 * characters. We treat unknown POSIX ranges as never matching. 151 * PP_RANGE means the next two (possibly metafied) characters form 152 * the limits of a range to test; it's too much like hard work to 153 * expand the range. 154 * 155 * P_EXCLUDE, P_EXCSYNC, PEXCEND: P_EXCLUDE appears in the pattern like 156 * P_BRANCH, but applies to the immediately preceding branch. The code in 157 * the corresponding branch is followed by a P_EXCSYNC, which simply 158 * acts as a marker that a P_EXCLUDE comes next. The P_EXCLUDE 159 * has a pointer to char embeded in it, which works 160 * like P_WBRANCH: if we get to the P_EXCSYNC, and we already matched 161 * up to the same position, fail. Thus we are forced to backtrack 162 * on closures in the P_BRANCH if the first attempt was excluded. 163 * Corresponding to P_EXCSYNC in the original branch, there is a 164 * P_EXCEND in the exclusion. If we get to this point, and we did 165 * *not* match in the original branch, the exclusion itself fails, 166 * otherwise it succeeds since we know the tail already matches, 167 * so P_EXCEND is the end of the exclusion test. 168 * The whole sorry mess looks like this, where the upper lines 169 * show the linkage of the branches, and the lower shows the linkage 170 * of their pattern arguments. 171 * 172 * --------------------- ---------------------- 173 * ^ v ^ v 174 * ( <BRANCH>:apat-><EXCSYNC> <EXCLUDE>:excpat-><EXCEND> ) tail 175 * ^ 176 * | | 177 * -------------------------------------- 178 * 179 * P_EXCLUDP: this behaves exactly like P_EXCLUDE, with the sole exception 180 * that we prepend the path so far to the exclude pattern. This is 181 * for top level file globs, e.g. ** / *.c~*foo.c 182 * ^ I had to leave this space 183 * P_NUM*: zl is a zlong if that is 64-bit, else an unsigned long. 184 * 185 * P_COUNTSTART, P_COUNT: a P_COUNTSTART flags the start of a quantified 186 * closure (#cN,M) and is used to initialise the count. Executing 187 * the pattern leads back to the P_COUNT, while the next links of the 188 * P_COUNTSTART and P_COUNT lead to the tail of the pattern: 189 * 190 * ---------------- 191 * v ^ 192 * <COUNTSTART><COUNT>pattern<BACK> tail 193 * v v ^ 194 * ------------------------ 195 */ 196 197#define P_OP(p) ((p)->l & 0xff) 198#define P_NEXT(p) ((p)->l >> 8) 199#define P_OPERAND(p) ((p) + 1) 200#define P_ISBRANCH(p) ((p)->l & 0x20) 201#define P_ISEXCLUDE(p) (((p)->l & 0x30) == 0x30) 202#define P_NOTDOT(p) ((p)->l & 0x40) 203 204/* Specific to lstr type, i.e. P_EXACTLY. */ 205#define P_LS_LEN(p) ((p)[1].l) /* can be used as lvalue */ 206#define P_LS_STR(p) ((char *)((p) + 2)) 207 208/* Specific to P_COUNT: arguments as offset in nodes from operator */ 209#define P_CT_CURRENT (1) /* Current count */ 210#define P_CT_MIN (2) /* Minimum count */ 211#define P_CT_MAX (3) /* Maximum count, -1 for none */ 212#define P_CT_PTR (4) /* Pointer to last match start */ 213#define P_CT_OPERAND (5) /* Operand of P_COUNT */ 214 215/* Flags needed when pattern is executed */ 216#define P_SIMPLE 0x01 /* Simple enough to be #/## operand. */ 217#define P_HSTART 0x02 /* Starts with # or ##'d pattern. */ 218#define P_PURESTR 0x04 /* Can be matched with a strcmp */ 219 220#if defined(ZSH_64_BIT_TYPE) || defined(LONG_IS_64_BIT) 221typedef zlong zrange_t; 222#define ZRANGE_T_IS_SIGNED (1) 223#else 224typedef unsigned long zrange_t; 225#endif 226 227/* 228 * Array of characters corresponding to zpc_chars enum, which it must match. 229 */ 230static const char zpc_chars[ZPC_COUNT] = { 231 '/', '\0', Bar, Outpar, Tilde, Inpar, Quest, Star, Inbrack, Inang, 232 Hat, Pound, Bnullkeep, Quest, Star, '+', '!', '@' 233}; 234 235/* 236 * Corresponding strings used in enable/disable -p. 237 * NULL means no way of turning this on or off. 238 */ 239/**/ 240mod_export const char *zpc_strings[ZPC_COUNT] = { 241 NULL, NULL, "|", NULL, "~", "(", "?", "*", "[", "<", 242 "^", "#", NULL, "?(", "*(", "+(", "!(", "@(" 243}; 244 245/* 246 * Corresponding array of pattern disables as set by the user 247 * using "disable -p". 248 */ 249/**/ 250mod_export char zpc_disables[ZPC_COUNT]; 251 252/* 253 * Stack of saved (compressed) zpc_disables for function scope. 254 */ 255 256static struct zpc_disables_save *zpc_disables_stack; 257 258/* 259 * Characters which terminate a simple string (ZPC_COUNT) or 260 * an entire pattern segment (the first ZPC_SEG_COUNT). 261 * Each entry is either the corresponding character in zpc_chars 262 * or Marker which is guaranteed not to match a character in a 263 * pattern we are compiling. 264 * 265 * The complete list indicates characters that are special, so e.g. 266 * (testchar == special[ZPC_TILDE]) succeeds only if testchar is a Tilde 267 * *and* Tilde is currently special. 268 */ 269 270/**/ 271char zpc_special[ZPC_COUNT]; 272 273/* Default size for pattern buffer */ 274#define P_DEF_ALLOC 256 275 276/* Flags used in compilation */ 277static char *patstart, *patparse; /* input pointers */ 278static int patnpar; /* () count */ 279static char *patcode; /* point of code emission */ 280static long patsize; /* size of code */ 281static char *patout; /* start of code emission string */ 282static long patalloc; /* size allocated for same */ 283 284/* Flags used in both compilation and execution */ 285static int patflags; /* flags passed down to patcompile */ 286static int patglobflags; /* globbing flags & approx */ 287 288/* 289 * Increment pointer to metafied multibyte string. 290 */ 291#ifdef MULTIBYTE_SUPPORT 292typedef wint_t patint_t; 293 294#define PEOF WEOF 295 296#define METACHARINC(x) ((void)metacharinc(&x)) 297 298/* 299 * TODO: the shiftstate isn't well handled; we don't guarantee 300 * to maintain it properly between characters. If we don't 301 * need it we should use mbtowc() instead. 302 */ 303static mbstate_t shiftstate; 304 305/* 306 * Multibyte version: it's (almost) as easy to return the 307 * value as not, so do so since we sometimes need it.. 308 */ 309static wchar_t 310metacharinc(char **x) 311{ 312 char *inptr = *x; 313 char inchar; 314 size_t ret = MB_INVALID; 315 wchar_t wc; 316 317 /* 318 * Cheat if the top bit isn't set. This is second-guessing 319 * the library, but we know for sure that if the character 320 * set doesn't have the property that all bytes with the 8th 321 * bit clear are single characters then we are stuffed. 322 */ 323 if (!(patglobflags & GF_MULTIBYTE) || !(STOUC(*inptr) & 0x80)) 324 { 325 if (itok(*inptr)) 326 inchar = ztokens[*inptr++ - Pound]; 327 else if (*inptr == Meta) { 328 inptr++; 329 inchar = *inptr++ ^ 32; 330 } else { 331 inchar = *inptr++; 332 } 333 *x = inptr; 334 return (wchar_t)STOUC(inchar); 335 } 336 337 while (*inptr) { 338 if (itok(*inptr)) 339 inchar = ztokens[*inptr++ - Pound]; 340 else if (*inptr == Meta) { 341 inptr++; 342 inchar = *inptr++ ^ 32; 343 } else { 344 inchar = *inptr++; 345 } 346 ret = mbrtowc(&wc, &inchar, 1, &shiftstate); 347 348 if (ret == MB_INVALID) 349 break; 350 if (ret == MB_INCOMPLETE) 351 continue; 352 *x = inptr; 353 return wc; 354 } 355 356 /* Error. Treat as single byte. */ 357 /* Reset the shift state for next time. */ 358 memset(&shiftstate, 0, sizeof(shiftstate)); 359 return (wchar_t) STOUC(*(*x)++); 360} 361 362#else 363typedef int patint_t; 364 365#define PEOF EOF 366 367#define METACHARINC(x) ((void)((x) += (*(x) == Meta) ? 2 : 1)) 368#endif 369 370/* 371 * Return unmetafied char from string (x is any char *). 372 * Used with MULTIBYTE_SUPPORT if the GF_MULTIBYTE is not 373 * in effect. 374 */ 375#define UNMETA(x) (*(x) == Meta ? (x)[1] ^ 32 : *(x)) 376 377/* Add n more characters, ensuring there is enough space. */ 378 379enum { 380 PA_NOALIGN = 1, 381 PA_UNMETA = 2 382}; 383 384/**/ 385static void 386patadd(char *add, int ch, long n, int paflags) 387{ 388 /* Make sure everything gets aligned unless we get PA_NOALIGN. */ 389 long newpatsize = patsize + n; 390 if (!(paflags & PA_NOALIGN)) 391 newpatsize = (newpatsize + sizeof(union upat) - 1) & 392 ~(sizeof(union upat) - 1); 393 if (patalloc < newpatsize) { 394 long newpatalloc = 395 2*(newpatsize > patalloc ? newpatsize : patalloc); 396 patout = (char *)zrealloc((char *)patout, newpatalloc); 397 patcode = patout + patsize; 398 patalloc = newpatalloc; 399 } 400 patsize = newpatsize; 401 if (add) { 402 if (paflags & PA_UNMETA) { 403 /* 404 * Unmetafy and untokenize the string as we go. 405 * The Meta characters in add aren't counted in n. 406 */ 407 while (n--) { 408 if (itok(*add)) 409 *patcode++ = ztokens[*add++ - Pound]; 410 else if (*add == Meta) { 411 add++; 412 *patcode++ = *add++ ^ 32; 413 } else { 414 *patcode++ = *add++; 415 } 416 } 417 } else { 418 while (n--) 419 *patcode++ = *add++; 420 } 421 } else 422 *patcode++ = ch; 423 patcode = patout + patsize; 424} 425 426static long rn_offs; 427/* operates on pointers to union upat, returns a pointer */ 428#define PATNEXT(p) ((rn_offs = P_NEXT(p)) ? \ 429 (P_OP(p) == P_BACK) ? \ 430 ((p)-rn_offs) : ((p)+rn_offs) : NULL) 431 432/* 433 * Set up zpc_special with characters that end a string segment. 434 * "Marker" cannot occur in the pattern we are compiling so 435 * is used to mark "invalid". 436 */ 437static void 438patcompcharsset(void) 439{ 440 char *spp, *disp; 441 int i; 442 443 /* Initialise enabled special characters */ 444 memcpy(zpc_special, zpc_chars, ZPC_COUNT); 445 /* Apply user disables from disable -p */ 446 for (i = 0, spp = zpc_special, disp = zpc_disables; 447 i < ZPC_COUNT; 448 i++, spp++, disp++) { 449 if (*disp) 450 *spp = Marker; 451 } 452 453 if (!isset(EXTENDEDGLOB)) { 454 /* Extended glob characters are not active */ 455 zpc_special[ZPC_TILDE] = zpc_special[ZPC_HAT] = 456 zpc_special[ZPC_HASH] = Marker; 457 } 458 if (!isset(KSHGLOB)) { 459 /* 460 * Ksh glob characters are not active. 461 * * and ? are shared with normal globbing, but for their 462 * use here we are looking for a following Inpar. 463 */ 464 zpc_special[ZPC_KSH_QUEST] = zpc_special[ZPC_KSH_STAR] = 465 zpc_special[ZPC_KSH_PLUS] = zpc_special[ZPC_KSH_BANG] = 466 zpc_special[ZPC_KSH_AT] = Marker; 467 } 468 /* 469 * Note that if we are using KSHGLOB, then we test for a following 470 * Inpar, not zpc_special[ZPC_INPAR]: the latter makes an Inpar on 471 * its own active. The zpc_special[ZPC_KSH_*] followed by any old Inpar 472 * discriminate ksh globbing. 473 */ 474 if (isset(SHGLOB)) { 475 /* 476 * Grouping and numeric ranges are not valid. 477 * We do allow alternation, however; it's needed for 478 * "case". This may not be entirely consistent. 479 * 480 * Don't disable Outpar: we may need to match the end of KSHGLOB 481 * parentheses and it would be difficult to tell them apart. 482 */ 483 zpc_special[ZPC_INPAR] = zpc_special[ZPC_INANG] = Marker; 484 } 485} 486 487/* Called before parsing a set of file matchs to initialize flags */ 488 489/**/ 490void 491patcompstart(void) 492{ 493 patcompcharsset(); 494 if (isset(CASEGLOB)) 495 patglobflags = 0; 496 else 497 patglobflags = GF_IGNCASE; 498 if (isset(MULTIBYTE)) 499 patglobflags |= GF_MULTIBYTE; 500} 501 502/* 503 * Top level pattern compilation subroutine 504 * exp is a null-terminated, metafied string. 505 * inflags is an or of some PAT_* flags. 506 * endexp, if non-null, is set to a pointer to the end of the 507 * part of exp which was compiled. This is used when 508 * compiling patterns for directories which must be 509 * matched recursively. 510 */ 511 512/**/ 513mod_export Patprog 514patcompile(char *exp, int inflags, char **endexp) 515{ 516 int flags = 0; 517 long len = 0; 518 long startoff; 519 Upat pscan; 520 char *lng, *strp = NULL; 521 Patprog p; 522 523 startoff = sizeof(struct patprog); 524 /* Ensure alignment of start of program string */ 525 startoff = (startoff + sizeof(union upat) - 1) & ~(sizeof(union upat) - 1); 526 527 /* Allocate reasonable sized chunk if none, reduce size if too big */ 528 if (patalloc != P_DEF_ALLOC) 529 patout = (char *)zrealloc(patout, patalloc = P_DEF_ALLOC); 530 patcode = patout + startoff; 531 patsize = patcode - patout; 532 patstart = patparse = exp; 533 /* 534 * Note global patnpar numbers parentheses 1..9, while patnpar 535 * in struct is actual count of parentheses. 536 */ 537 patnpar = 1; 538 patflags = inflags & ~(PAT_PURES|PAT_HAS_EXCLUDP); 539 540 if (!(patflags & PAT_FILE)) { 541 patcompcharsset(); 542 zpc_special[ZPC_SLASH] = Marker; 543 remnulargs(patparse); 544 if (isset(MULTIBYTE)) 545 patglobflags = GF_MULTIBYTE; 546 else 547 patglobflags = 0; 548 } 549 if (patflags & PAT_LCMATCHUC) 550 patglobflags |= GF_LCMATCHUC; 551 /* 552 * Have to be set now, since they get updated during compilation. 553 */ 554 ((Patprog)patout)->globflags = patglobflags; 555 556 if (!(patflags & PAT_ANY)) { 557 /* Look for a really pure string, with no tokens at all. */ 558 if (!(patglobflags & ~GF_MULTIBYTE) 559#ifdef __CYGWIN__ 560 /* 561 * If the OS treats files case-insensitively and we 562 * are looking at files, we don't need to use pattern 563 * matching to find the file. 564 */ 565 || (!(patglobflags & ~GF_IGNCASE) && (patflags & PAT_FILE)) 566#endif 567 ) 568 { 569 /* 570 * Waah! I wish I understood this. 571 * Empty metafied strings have an initial Nularg. 572 * This never corresponds to a real character in 573 * a glob pattern or string, so skip it. 574 */ 575 if (*exp == Nularg) 576 exp++; 577 for (strp = exp; *strp && 578 (!(patflags & PAT_FILE) || *strp != '/') && !itok(*strp); 579 strp++) 580 ; 581 } 582 if (!strp || (*strp && *strp != '/')) { 583 /* No, do normal compilation. */ 584 strp = NULL; 585 if (patcompswitch(0, &flags) == 0) 586 return NULL; 587 } else { 588 /* 589 * Yes, copy the string, and skip compilation altogether. 590 * Null terminate for the benefit of globbing. 591 * Leave metafied both for globbing and for our own 592 * efficiency. 593 */ 594 patparse = strp; 595 len = strp - exp; 596 patadd(exp, 0, len + 1, 0); 597 patout[startoff + len] = '\0'; 598 patflags |= PAT_PURES; 599 } 600 } 601 602 /* end of compilation: safe to use pointers */ 603 p = (Patprog)patout; 604 p->startoff = startoff; 605 p->patstartch = '\0'; 606 p->globend = patglobflags; 607 p->flags = patflags; 608 p->mustoff = 0; 609 p->size = patsize; 610 p->patmlen = len; 611 p->patnpar = patnpar-1; 612 613 if (!strp) { 614 pscan = (Upat)(patout + startoff); 615 616 if (!(patflags & PAT_ANY) && P_OP(PATNEXT(pscan)) == P_END) { 617 /* only one top level choice */ 618 pscan = P_OPERAND(pscan); 619 620 if (flags & P_PURESTR) { 621 /* 622 * The pattern can be matched with a simple strncmp/strcmp. 623 * Careful in case we've overwritten the node for the next ptr. 624 */ 625 char *dst = patout + startoff; 626 Upat next; 627 p->flags |= PAT_PURES; 628 for (; pscan; pscan = next) { 629 next = PATNEXT(pscan); 630 if (P_OP(pscan) == P_EXACTLY) { 631 char *opnd = P_LS_STR(pscan), *mtest; 632 long oplen = P_LS_LEN(pscan), ilen; 633 int nmeta = 0; 634 /* 635 * Unfortunately we unmetafied the string 636 * and we need to put any metacharacters 637 * back now we know it's a pure string. 638 * This shouldn't happen too often, it's 639 * just that there are some cases such 640 * as . and .. in files where we really 641 * need a pure string even if there are 642 * pattern characters flying around. 643 */ 644 for (mtest = opnd, ilen = oplen; ilen; 645 mtest++, ilen--) 646 if (imeta(*mtest)) 647 nmeta++; 648 if (nmeta) { 649 char *oldpatout = patout; 650 patadd(NULL, 0, nmeta, 0); 651 /* 652 * Yuk. 653 */ 654 p = (Patprog)patout; 655 opnd = patout + (opnd - oldpatout); 656 dst = patout + startoff; 657 } 658 659 while (oplen--) { 660 if (imeta(*opnd)) { 661 *dst++ = Meta; 662 *dst++ = *opnd++ ^ 32; 663 } else { 664 *dst++ = *opnd++; 665 } 666 } 667 } 668 } 669 p->size = dst - patout; 670 /* patmlen is really strlen. We don't need a null. */ 671 p->patmlen = p->size - startoff; 672 } else { 673 /* starting point info */ 674 if (P_OP(pscan) == P_EXACTLY && !p->globflags && 675 P_LS_LEN(pscan)) 676 p->patstartch = *P_LS_STR(pscan); 677 /* 678 * Find the longest literal string in something expensive. 679 * This is itself not all that cheap if we have 680 * case-insensitive matching or approximation, so don't. 681 */ 682 if ((flags & P_HSTART) && !p->globflags) { 683 lng = NULL; 684 len = 0; 685 for (; pscan; pscan = PATNEXT(pscan)) 686 if (P_OP(pscan) == P_EXACTLY && 687 P_LS_LEN(pscan) >= len) { 688 lng = P_LS_STR(pscan); 689 len = P_LS_LEN(pscan); 690 } 691 if (lng) { 692 p->mustoff = lng - patout; 693 p->patmlen = len; 694 } 695 } 696 } 697 } 698 } 699 700 /* 701 * The pattern was compiled in a fixed buffer: unless told otherwise, 702 * we stick the compiled pattern on the heap. This is necessary 703 * for files where we will often be compiling multiple segments at once. 704 * But if we get the ZDUP flag we always put it in zalloc()ed memory. 705 */ 706 if (patflags & PAT_ZDUP) { 707 Patprog newp = (Patprog)zalloc(patsize); 708 memcpy((char *)newp, (char *)p, patsize); 709 p = newp; 710 } else if (!(patflags & PAT_STATIC)) { 711 Patprog newp = (Patprog)zhalloc(patsize); 712 memcpy((char *)newp, (char *)p, patsize); 713 p = newp; 714 } 715 716 if (endexp) 717 *endexp = patparse; 718 return p; 719} 720 721/* 722 * Main body or parenthesized subexpression in pattern 723 * Parenthesis (and any ksh_glob gubbins) will have been removed. 724 */ 725 726/**/ 727static long 728patcompswitch(int paren, int *flagp) 729{ 730 long starter, br, ender, excsync = 0; 731 int parno = 0; 732 int flags, gfchanged = 0; 733 long savglobflags = (long)patglobflags; 734 Upat ptr; 735 736 *flagp = 0; 737 738 if (paren && (patglobflags & GF_BACKREF) && patnpar <= NSUBEXP) { 739 /* 740 * parenthesized: make an open node. 741 * We can only refer to the first nine parentheses. 742 * For any others, we just use P_OPEN on its own; there's 743 * no gain in arbitrarily limiting the number of parentheses. 744 */ 745 parno = patnpar++; 746 starter = patnode(P_OPEN + parno); 747 } else 748 starter = 0; 749 750 br = patnode(P_BRANCH); 751 if (!patcompbranch(&flags, paren)) 752 return 0; 753 if (patglobflags != (int)savglobflags) 754 gfchanged++; 755 if (starter) 756 pattail(starter, br); 757 else 758 starter = br; 759 760 *flagp |= flags & (P_HSTART|P_PURESTR); 761 762 while (*patparse == zpc_chars[ZPC_BAR] || 763 (*patparse == zpc_special[ZPC_TILDE] && 764 (patparse[1] == '/' || 765 !memchr(zpc_special, patparse[1], ZPC_SEG_COUNT)))) { 766 int tilde = *patparse++ == zpc_special[ZPC_TILDE]; 767 long gfnode = 0, newbr; 768 769 *flagp &= ~P_PURESTR; 770 771 if (tilde) { 772 union upat up; 773 /* excsync remembers the P_EXCSYNC node before a chain of 774 * exclusions: all point back to this. only the 775 * original (non-excluded) branch gets a trailing P_EXCSYNC. 776 */ 777 if (!excsync) { 778 excsync = patnode(P_EXCSYNC); 779 patoptail(br, excsync); 780 } 781 /* 782 * By default, approximations are turned off in exclusions: 783 * we need to do this here as otherwise the code compiling 784 * the exclusion doesn't know if the flags have really 785 * changed if the error count gets restored. 786 */ 787 patglobflags &= ~0xff; 788 if (!(patflags & PAT_FILET) || paren) { 789 br = patnode(P_EXCLUDE); 790 } else { 791 /* 792 * At top level (paren == 0) in a file glob !(patflags 793 * &PAT_FILET) do the exclusion prepending the file path 794 * so far. We need to flag this to avoid unnecessarily 795 * copying the path. 796 */ 797 br = patnode(P_EXCLUDP); 798 patflags |= PAT_HAS_EXCLUDP; 799 } 800 up.p = NULL; 801 patadd((char *)&up, 0, sizeof(up), 0); 802 /* / is not treated as special if we are at top level */ 803 if (!paren && zpc_special[ZPC_SLASH] == '/') { 804 tilde++; 805 zpc_special[ZPC_SLASH] = Marker; 806 } 807 } else { 808 excsync = 0; 809 br = patnode(P_BRANCH); 810 /* 811 * The position of the following statements means globflags 812 * set in the main branch carry over to the exclusion. 813 */ 814 if (!paren) { 815 patglobflags = 0; 816 if (((Patprog)patout)->globflags) { 817 /* 818 * If at top level, we need to reinitialize flags to zero, 819 * since (#i)foo|bar only applies to foo and we stuck 820 * the #i into the global flags. 821 * We could have done it so that they only got set in the 822 * first branch, but it's quite convenient having any 823 * global flags set in the header and not buried in the 824 * pattern. (Or maybe it isn't and we should 825 * forget this bit and always stick in an explicit GFLAGS 826 * statement instead of using the header.) 827 * Also, this can't happen for file globs where there are 828 * no top-level |'s. 829 * 830 * No gfchanged, as nothing to follow branch at top 831 * level. 832 */ 833 union upat up; 834 gfnode = patnode(P_GFLAGS); 835 up.l = patglobflags; 836 patadd((char *)&up, 0, sizeof(union upat), 0); 837 } 838 } else { 839 patglobflags = (int)savglobflags; 840 } 841 } 842 newbr = patcompbranch(&flags, paren); 843 if (tilde == 2) { 844 /* restore special treatment of / */ 845 zpc_special[ZPC_SLASH] = '/'; 846 } 847 if (!newbr) 848 return 0; 849 if (gfnode) 850 pattail(gfnode, newbr); 851 if (!tilde && patglobflags != (int)savglobflags) 852 gfchanged++; 853 pattail(starter, br); 854 if (excsync) 855 patoptail(br, patnode(P_EXCEND)); 856 *flagp |= flags & P_HSTART; 857 } 858 859 /* 860 * Make a closing node, hooking it to the end. 861 * Note that we can't optimize P_NOTHING out here, since another 862 * branch at that point would indicate the current choices continue, 863 * which they don't. 864 */ 865 ender = patnode(paren ? parno ? P_CLOSE+parno : P_NOTHING : P_END); 866 pattail(starter, ender); 867 868 /* 869 * Hook the tails of the branches to the closing node, 870 * except for exclusions which terminate where they are. 871 */ 872 for (ptr = (Upat)patout + starter; ptr; ptr = PATNEXT(ptr)) 873 if (!P_ISEXCLUDE(ptr)) 874 patoptail(ptr-(Upat)patout, ender); 875 876 /* check for proper termination */ 877 if ((paren && *patparse++ != Outpar) || 878 (!paren && *patparse && 879 !((patflags & PAT_FILE) && *patparse == '/'))) 880 return 0; 881 882 if (paren && gfchanged) { 883 /* 884 * Restore old values of flags when leaving parentheses. 885 * gfchanged detects a change in any branch (except exclusions 886 * which are separate), since we need to emit this even if 887 * a later branch happened to put the flags back. 888 */ 889 pattail(ender, patnode(P_GFLAGS)); 890 patglobflags = (int)savglobflags; 891 patadd((char *)&savglobflags, 0, sizeof(long), 0); 892 } 893 894 return starter; 895} 896 897/* 898 * Compile something ended by Bar, Outpar, Tilde, or end of string. 899 * Note the BRANCH or EXCLUDE tag must already have been omitted: 900 * this returns the position of the operand of that. 901 */ 902 903/**/ 904static long 905patcompbranch(int *flagp, int paren) 906{ 907 long chain, latest = 0, starter; 908 int flags = 0; 909 910 *flagp = P_PURESTR; 911 912 starter = chain = 0; 913 while (!memchr(zpc_special, *patparse, ZPC_SEG_COUNT) || 914 (*patparse == zpc_special[ZPC_TILDE] && patparse[1] != '/' && 915 memchr(zpc_special, patparse[1], ZPC_SEG_COUNT))) { 916 if ((*patparse == zpc_special[ZPC_INPAR] && 917 patparse[1] == zpc_special[ZPC_HASH]) || 918 (*patparse == zpc_special[ZPC_KSH_AT] && patparse[1] == Inpar && 919 patparse[2] == zpc_special[ZPC_HASH])) { 920 /* Globbing flags. */ 921 char *pp1 = patparse; 922 int oldglobflags = patglobflags, ignore; 923 long assert; 924 patparse += (*patparse == '@') ? 3 : 2; 925 if (!patgetglobflags(&patparse, &assert, &ignore)) 926 return 0; 927 if (!ignore) { 928 if (assert) { 929 /* 930 * Start/end assertion looking like flags, but 931 * actually handled as a normal node 932 */ 933 latest = patnode(assert); 934 flags = 0; 935 } else { 936 if (pp1 == patstart) { 937 /* Right at start of pattern, the simplest case. 938 * Put them into the flags and don't emit anything. 939 */ 940 ((Patprog)patout)->globflags = patglobflags; 941 continue; 942 } else if (!*patparse) { 943 /* Right at the end, so just leave the flags for 944 * the next Patprog in the chain to pick up. 945 */ 946 break; 947 } 948 /* 949 * Otherwise, we have to stick them in as a pattern 950 * matching nothing. 951 */ 952 if (oldglobflags != patglobflags) { 953 /* Flags changed */ 954 union upat up; 955 latest = patnode(P_GFLAGS); 956 up.l = patglobflags; 957 patadd((char *)&up, 0, sizeof(union upat), 0); 958 } else { 959 /* No effect. */ 960 continue; 961 } 962 } 963 } else if (!*patparse) 964 break; 965 else 966 continue; 967 } else if (*patparse == zpc_special[ZPC_HAT]) { 968 /* 969 * ^pat: anything but pat. For proper backtracking, 970 * etc., we turn this into (*~pat), except without the 971 * parentheses. 972 */ 973 patparse++; 974 latest = patcompnot(0, &flags); 975 } else 976 latest = patcomppiece(&flags, paren); 977 if (!latest) 978 return 0; 979 if (!starter) 980 starter = latest; 981 if (!(flags & P_PURESTR)) 982 *flagp &= ~P_PURESTR; 983 if (!chain) 984 *flagp |= flags & P_HSTART; 985 else 986 pattail(chain, latest); 987 chain = latest; 988 } 989 /* check if there was nothing in the loop, i.e. () */ 990 if (!chain) 991 starter = patnode(P_NOTHING); 992 993 return starter; 994} 995 996/* get glob flags, return 1 for success, 0 for failure */ 997 998/**/ 999int 1000patgetglobflags(char **strp, long *assertp, int *ignore) 1001{ 1002 char *nptr, *ptr = *strp; 1003 zlong ret; 1004 1005 *assertp = 0; 1006 *ignore = 1; 1007 /* (#X): assumes we are still positioned on the first X */ 1008 for (; *ptr && *ptr != Outpar; ptr++) { 1009 if (*ptr == 'q') { 1010 /* Glob qualifiers, ignored in pattern code */ 1011 while (*ptr && *ptr != Outpar) 1012 ptr++; 1013 break; 1014 } else { 1015 *ignore = 0; 1016 switch (*ptr) { 1017 case 'a': 1018 /* Approximate matching, max no. of errors follows */ 1019 ret = zstrtol(++ptr, &nptr, 10); 1020 /* 1021 * We can't have more than 254, because we need 255 to 1022 * mark 254 errors in wbranch and exclude sync strings 1023 * (hypothetically --- hope no-one tries it). 1024 */ 1025 if (ret < 0 || ret > 254 || ptr == nptr) 1026 return 0; 1027 patglobflags = (patglobflags & ~0xff) | (ret & 0xff); 1028 ptr = nptr-1; 1029 break; 1030 1031 case 'l': 1032 /* Lowercase in pattern matches lower or upper in target */ 1033 patglobflags = (patglobflags & ~GF_IGNCASE) | GF_LCMATCHUC; 1034 break; 1035 1036 case 'i': 1037 /* Fully case insensitive */ 1038 patglobflags = (patglobflags & ~GF_LCMATCHUC) | GF_IGNCASE; 1039 break; 1040 1041 case 'I': 1042 /* Restore case sensitivity */ 1043 patglobflags &= ~(GF_LCMATCHUC|GF_IGNCASE); 1044 break; 1045 1046 case 'b': 1047 /* Make backreferences */ 1048 patglobflags |= GF_BACKREF; 1049 break; 1050 1051 case 'B': 1052 /* Don't make backreferences */ 1053 patglobflags &= ~GF_BACKREF; 1054 break; 1055 1056 case 'm': 1057 /* Make references to complete match */ 1058 patglobflags |= GF_MATCHREF; 1059 break; 1060 1061 case 'M': 1062 /* Don't */ 1063 patglobflags &= ~GF_MATCHREF; 1064 break; 1065 1066 case 's': 1067 *assertp = P_ISSTART; 1068 break; 1069 1070 case 'e': 1071 *assertp = P_ISEND; 1072 break; 1073 1074 case 'u': 1075 patglobflags |= GF_MULTIBYTE; 1076 break; 1077 1078 case 'U': 1079 patglobflags &= ~GF_MULTIBYTE; 1080 break; 1081 1082 default: 1083 return 0; 1084 } 1085 } 1086 } 1087 if (*ptr != Outpar) 1088 return 0; 1089 /* Start/end assertions must appear on their own. */ 1090 if (*assertp && (*strp)[1] != Outpar) 1091 return 0; 1092 *strp = ptr + 1; 1093 return 1; 1094} 1095 1096 1097static const char *colon_stuffs[] = { 1098 "alpha", "alnum", "ascii", "blank", "cntrl", "digit", "graph", 1099 "lower", "print", "punct", "space", "upper", "xdigit", "IDENT", 1100 "IFS", "IFSSPACE", "WORD", NULL 1101}; 1102 1103/* 1104 * Handle the guts of a [:stuff:] character class element. 1105 * start is the beginning of "stuff" and len is its length. 1106 * This code is exported for the benefit of completion matching. 1107 */ 1108 1109/**/ 1110mod_export int 1111range_type(char *start, int len) 1112{ 1113 const char **csp; 1114 1115 for (csp = colon_stuffs; *csp; csp++) { 1116 if (!strncmp(start, *csp, len)) 1117 return (csp - colon_stuffs) + PP_FIRST; 1118 } 1119 1120 return PP_UNKWN; 1121} 1122 1123 1124/* 1125 * Convert the contents of a [...] or [^...] expression (just the 1126 * ... part) back into a string. This is used by compfiles -p/-P 1127 * for some reason. The compiled form (a metafied string) is 1128 * passed in rangestr. 1129 * 1130 * If outstr is non-NULL the compiled form is placed there. It 1131 * must be sufficiently long. A terminating NULL is appended. 1132 * 1133 * Return the length required, not including the terminating NULL. 1134 * 1135 * TODO: this is non-multibyte for now. It will need to be defined 1136 * appropriately with MULTIBYTE_SUPPORT when the completion matching 1137 * code catches up. 1138 */ 1139 1140/**/ 1141mod_export int 1142pattern_range_to_string(char *rangestr, char *outstr) 1143{ 1144 int len = 0; 1145 1146 while (*rangestr) { 1147 if (imeta(STOUC(*rangestr))) { 1148 int swtype = STOUC(*rangestr) - STOUC(Meta); 1149 1150 if (swtype == 0) { 1151 /* Ordindary metafied character */ 1152 if (outstr) 1153 { 1154 *outstr++ = Meta; 1155 *outstr++ = rangestr[1] ^ 32; 1156 } 1157 len += 2; 1158 rangestr += 2; 1159 } else if (swtype == PP_RANGE) { 1160 /* X-Y range */ 1161 int i; 1162 1163 for (i = 0; i < 2; i++) { 1164 if (*rangestr == Meta) { 1165 if (outstr) { 1166 *outstr++ = Meta; 1167 *outstr++ = rangestr[1]; 1168 } 1169 len += 2; 1170 rangestr += 2; 1171 } else { 1172 if (outstr) 1173 *outstr++ = *rangestr; 1174 len++; 1175 rangestr++; 1176 } 1177 1178 if (i == 0) { 1179 if (outstr) 1180 *outstr++ = '-'; 1181 len++; 1182 } 1183 } 1184 } else if (swtype >= PP_FIRST && swtype <= PP_LAST) { 1185 /* [:stuff:]; we need to output [: and :] */ 1186 const char *found = colon_stuffs[swtype - PP_FIRST]; 1187 int newlen = strlen(found); 1188 if (outstr) { 1189 strcpy(outstr, "[:"); 1190 outstr += 2; 1191 memcpy(outstr, found, newlen); 1192 outstr += newlen; 1193 strcpy(outstr, ":]"); 1194 outstr += 2; 1195 } 1196 len += newlen + 4; 1197 rangestr++; 1198 } else { 1199 /* shouldn't happen */ 1200 DPUTS(1, "BUG: unknown PP_ code in pattern range"); 1201 rangestr++; 1202 } 1203 } else { 1204 /* ordinary character, guaranteed no Meta handling needed */ 1205 if (outstr) 1206 *outstr++ = *rangestr; 1207 len++; 1208 rangestr++; 1209 } 1210 } 1211 1212 if (outstr) 1213 *outstr = '\0'; 1214 return len; 1215} 1216 1217/* 1218 * compile a chunk such as a literal string or a [...] followed 1219 * by a possible hash operator 1220 */ 1221 1222/**/ 1223static long 1224patcomppiece(int *flagp, int paren) 1225{ 1226 long starter = 0, next, op, opnd; 1227 int flags, flags2, kshchar, len, ch, patch, nmeta; 1228 int hash, count; 1229 union upat up; 1230 char *nptr, *str0, *ptr, *patprev; 1231 zrange_t from, to; 1232 char *charstart; 1233 1234 flags = 0; 1235 str0 = patprev = patparse; 1236 for (;;) { 1237 /* 1238 * Check if we have a string. First, we need to make sure 1239 * the string doesn't introduce a ksh-like parenthesized expression. 1240 */ 1241 kshchar = '\0'; 1242 if (*patparse && patparse[1] == Inpar) { 1243 if (*patparse == zpc_special[ZPC_KSH_PLUS]) 1244 kshchar = STOUC('+'); 1245 else if (*patparse == zpc_special[ZPC_KSH_BANG]) 1246 kshchar = STOUC('!'); 1247 else if (*patparse == zpc_special[ZPC_KSH_AT]) 1248 kshchar = STOUC('@'); 1249 else if (*patparse == zpc_special[ZPC_KSH_STAR]) 1250 kshchar = STOUC('*'); 1251 else if (*patparse == zpc_special[ZPC_KSH_QUEST]) 1252 kshchar = STOUC('?'); 1253 } 1254 1255 /* 1256 * If '(' is disabled as a pattern char, allow ')' as 1257 * an ordinary string character if there are no parentheses to 1258 * close. Don't allow it otherwise, it changes the syntax. 1259 */ 1260 if (zpc_special[ZPC_INPAR] != Marker || *patparse != Outpar || 1261 paren) { 1262 /* 1263 * End of string (or no string at all) if ksh-type parentheses, 1264 * or special character, unless that character is a tilde and 1265 * the character following is an end-of-segment character. Thus 1266 * tildes are not special if there is nothing following to 1267 * be excluded. 1268 * 1269 * Don't look for X()-style kshglobs at this point; we've 1270 * checked above for the case with parentheses and we don't 1271 * want to match without parentheses. 1272 */ 1273 if (kshchar || 1274 (memchr(zpc_special, *patparse, ZPC_NO_KSH_GLOB) && 1275 (*patparse != zpc_special[ZPC_TILDE] || 1276 patparse[1] == '/' || 1277 !memchr(zpc_special, patparse[1], ZPC_SEG_COUNT)))) { 1278 break; 1279 } 1280 } 1281 1282 /* Remember the previous character for backtracking */ 1283 patprev = patparse; 1284 METACHARINC(patparse); 1285 } 1286 1287 if (patparse > str0) { 1288 long slen = patparse - str0; 1289 int morelen; 1290 1291 /* Ordinary string: cancel kshchar lookahead */ 1292 kshchar = '\0'; 1293 /* 1294 * Assume it matches a simple string until we find otherwise. 1295 */ 1296 flags |= P_PURESTR; 1297 DPUTS(patparse == str0, "BUG: matched nothing in patcomppiece."); 1298 /* more than one character matched? */ 1299 morelen = (patprev > str0); 1300 /* 1301 * If we have more than one character, a following hash 1302 * or (#c...) only applies to the last, so backtrack one character. 1303 */ 1304 if ((*patparse == zpc_special[ZPC_HASH] || 1305 (*patparse == zpc_special[ZPC_INPAR] && 1306 patparse[1] == zpc_special[ZPC_HASH] && 1307 patparse[2] == 'c') || 1308 (*patparse == zpc_special[ZPC_KSH_AT] && 1309 patparse[1] == Inpar && 1310 patparse[2] == zpc_special[ZPC_HASH] && 1311 patparse[3] == 'c')) && morelen) 1312 patparse = patprev; 1313 /* 1314 * If len is 1, we can't have an active # following, so doesn't 1315 * matter that we don't make X in `XX#' simple. 1316 */ 1317 if (!morelen) 1318 flags |= P_SIMPLE; 1319 starter = patnode(P_EXACTLY); 1320 1321 /* Get length of string without metafication. */ 1322 nmeta = 0; 1323 /* inherited from domatch, but why, exactly? */ 1324 if (*str0 == Nularg) 1325 str0++; 1326 for (ptr = str0; ptr < patparse; ptr++) { 1327 if (*ptr == Meta) { 1328 nmeta++; 1329 ptr++; 1330 } 1331 } 1332 slen = (patparse - str0) - nmeta; 1333 /* First add length, which is a long */ 1334 patadd((char *)&slen, 0, sizeof(long), 0); 1335 /* 1336 * Then the string, not null terminated. 1337 * Unmetafy and untokenize; pass the final length, 1338 * which is what we need to allocate, i.e. not including 1339 * a count for each Meta in the string. 1340 */ 1341 patadd(str0, 0, slen, PA_UNMETA); 1342 nptr = P_LS_STR((Upat)patout + starter); 1343 /* 1344 * It's much simpler to turn off pure string mode for 1345 * any case-insensitive or approximate matching; usually, 1346 * that is correct, or they wouldn't have been turned on. 1347 * However, we need to make sure we match a "." or ".." 1348 * in a file name as a pure string. There's a minor bug 1349 * that this will also apply to something like 1350 * ..(#a1).. (i.e. the (#a1) has no effect), but if you're 1351 * going to write funny patterns, you get no sympathy from me. 1352 */ 1353 if (patglobflags & 1354#ifdef __CYGWIN__ 1355 /* 1356 * As above: don't use pattern matching for files 1357 * just because of case insensitivity if file system 1358 * is known to be case insensitive. 1359 * 1360 * This is known to be necessary in at least one case: 1361 * if "mount -c /" is in effect, so that drives appear 1362 * directly under / instead of the usual /cygdrive, they 1363 * aren't shown by readdir(). So it's vital we don't use 1364 * globbing to find "/c", since that'll fail. 1365 */ 1366 ((patflags & PAT_FILE) ? 1367 (0xFF|GF_LCMATCHUC) : 1368 (0xFF|GF_LCMATCHUC|GF_IGNCASE)) 1369#else 1370 (0xFF|GF_LCMATCHUC|GF_IGNCASE) 1371#endif 1372 ) { 1373 if (!(patflags & PAT_FILE)) 1374 flags &= ~P_PURESTR; 1375 else if (!(nptr[0] == '.' && 1376 (slen == 1 || (nptr[1] == '.' && slen == 2)))) 1377 flags &= ~P_PURESTR; 1378 } 1379 } else { 1380 if (kshchar) 1381 patparse++; 1382 1383 patch = *patparse; 1384 METACHARINC(patparse); 1385 switch(patch) { 1386 case Quest: 1387 DPUTS(zpc_special[ZPC_QUEST] == Marker, 1388 "Treating '?' as pattern character although disabled"); 1389 flags |= P_SIMPLE; 1390 starter = patnode(P_ANY); 1391 break; 1392 case Star: 1393 DPUTS(zpc_special[ZPC_STAR] == Marker, 1394 "Treating '*' as pattern character although disabled"); 1395 /* kshchar is used as a sign that we can't have #'s. */ 1396 kshchar = -1; 1397 starter = patnode(P_STAR); 1398 break; 1399 case Inbrack: 1400 DPUTS(zpc_special[ZPC_INBRACK] == Marker, 1401 "Treating '[' as pattern character although disabled"); 1402 flags |= P_SIMPLE; 1403 if (*patparse == Hat || *patparse == '^' || *patparse == '!') { 1404 patparse++; 1405 starter = patnode(P_ANYBUT); 1406 } else 1407 starter = patnode(P_ANYOF); 1408 if (*patparse == Outbrack) { 1409 patparse++; 1410 patadd(NULL, ']', 1, PA_NOALIGN); 1411 } 1412 while (*patparse && *patparse != Outbrack) { 1413 /* Meta is not a token */ 1414 if (*patparse == Inbrack && patparse[1] == ':' && 1415 (nptr = strchr(patparse+2, ':')) && 1416 nptr[1] == Outbrack) { 1417 /* Posix range. */ 1418 patparse += 2; 1419 len = nptr - patparse; 1420 ch = range_type(patparse, len); 1421 patparse = nptr + 2; 1422 if (ch != PP_UNKWN) 1423 patadd(NULL, STOUC(Meta) + ch, 1, PA_NOALIGN); 1424 continue; 1425 } 1426 charstart = patparse; 1427 METACHARINC(patparse); 1428 1429 if (*patparse == '-' && patparse[1] && 1430 patparse[1] != Outbrack) { 1431 patadd(NULL, STOUC(Meta)+PP_RANGE, 1, PA_NOALIGN); 1432 if (itok(*charstart)) { 1433 patadd(0, STOUC(ztokens[*charstart - Pound]), 1, 1434 PA_NOALIGN); 1435 } else { 1436 patadd(charstart, 0, patparse-charstart, PA_NOALIGN); 1437 } 1438 charstart = ++patparse; /* skip ASCII '-' */ 1439 METACHARINC(patparse); 1440 } 1441 if (itok(*charstart)) { 1442 patadd(0, STOUC(ztokens[*charstart - Pound]), 1, 1443 PA_NOALIGN); 1444 } else { 1445 patadd(charstart, 0, patparse-charstart, PA_NOALIGN); 1446 } 1447 } 1448 if (*patparse != Outbrack) 1449 return 0; 1450 patparse++; 1451 /* terminate null string and fix alignment */ 1452 patadd(NULL, 0, 1, 0); 1453 break; 1454 case Inpar: 1455 DPUTS(!kshchar && zpc_special[ZPC_INPAR] == Marker, 1456 "Treating '(' as pattern character although disabled"); 1457 DPUTS(isset(SHGLOB) && !kshchar, 1458 "Treating bare '(' as pattern character with SHGLOB"); 1459 if (kshchar == '!') { 1460 /* This is nasty, we should really either handle all 1461 * kshglobbing below or here. But most of the 1462 * others look like non-ksh patterns, while this one 1463 * doesn't, so we handle it here and leave the rest. 1464 * We treat it like an extendedglob ^, except that 1465 * it goes into parentheses. 1466 * 1467 * If we did do kshglob here, we could support 1468 * the old behaviour that things like !(foo)## 1469 * work, but it makes the code more complicated at 1470 * the expense of allowing the user to do things 1471 * they shouldn't. 1472 */ 1473 if (!(starter = patcompnot(1, &flags2))) 1474 return 0; 1475 } else if (!(starter = patcompswitch(1, &flags2))) 1476 return 0; 1477 flags |= flags2 & P_HSTART; 1478 break; 1479 case Inang: 1480 /* Numeric glob */ 1481 DPUTS(zpc_special[ZPC_INANG] == Marker, 1482 "Treating '<' as pattern character although disabled"); 1483 DPUTS(isset(SHGLOB), "Treating <..> as numeric range with SHGLOB"); 1484 len = 0; /* beginning present 1, end present 2 */ 1485 if (idigit(*patparse)) { 1486 from = (zrange_t) zstrtol((char *)patparse, 1487 (char **)&nptr, 10); 1488 patparse = nptr; 1489 len |= 1; 1490 } 1491 DPUTS(*patparse != '-', "BUG: - missing from numeric glob"); 1492 patparse++; 1493 if (idigit(*patparse)) { 1494 to = (zrange_t) zstrtol((char *)patparse, 1495 (char **)&nptr, 10); 1496 patparse = nptr; 1497 len |= 2; 1498 } 1499 if (*patparse != Outang) 1500 return 0; 1501 patparse++; 1502 switch(len) { 1503 case 3: 1504 starter = patnode(P_NUMRNG); 1505 patadd((char *)&from, 0, sizeof(from), 0); 1506 patadd((char *)&to, 0, sizeof(to), 0); 1507 break; 1508 case 2: 1509 starter = patnode(P_NUMTO); 1510 patadd((char *)&to, 0, sizeof(to), 0); 1511 break; 1512 case 1: 1513 starter = patnode(P_NUMFROM); 1514 patadd((char *)&from, 0, sizeof(from), 0); 1515 break; 1516 case 0: 1517 starter = patnode(P_NUMANY); 1518 break; 1519 } 1520 /* This can't be simple, because it isn't. 1521 * Mention in manual that matching digits with [...] 1522 * is more efficient. 1523 */ 1524 break; 1525 case Pound: 1526 DPUTS(zpc_special[ZPC_HASH] == Marker, 1527 "Treating '#' as pattern character although disabled"); 1528 DPUTS(!isset(EXTENDEDGLOB), "BUG: # not treated as string"); 1529 /* 1530 * A hash here is an error; it should follow something 1531 * repeatable. 1532 */ 1533 return 0; 1534 break; 1535 case Bnullkeep: 1536 /* 1537 * Marker for restoring a backslash in output: 1538 * does not match a character. 1539 */ 1540 next = patcomppiece(flagp, paren); 1541 /* 1542 * Can't match a pure string since we need to do this 1543 * as multiple chunks. 1544 */ 1545 *flagp &= ~P_PURESTR; 1546 return next; 1547 break; 1548#ifdef DEBUG 1549 default: 1550 dputs("BUG: character not handled in patcomppiece"); 1551 return 0; 1552 break; 1553#endif 1554 } 1555 } 1556 1557 count = 0; 1558 if (!(hash = (*patparse == zpc_special[ZPC_HASH])) && 1559 !(count = ((*patparse == zpc_special[ZPC_INPAR] && 1560 patparse[1] == zpc_special[ZPC_HASH] && 1561 patparse[2] == 'c') || 1562 (*patparse == zpc_special[ZPC_KSH_AT] && 1563 patparse[1] == Inpar && 1564 patparse[2] == zpc_special[ZPC_HASH] && 1565 patparse[3] == 'c'))) && 1566 (kshchar <= 0 || kshchar == '@' || kshchar == '!')) { 1567 *flagp = flags; 1568 return starter; 1569 } 1570 1571 /* too much at once doesn't currently work */ 1572 if (kshchar && (hash || count)) 1573 return 0; 1574 1575 if (kshchar == '*') { 1576 op = P_ONEHASH; 1577 *flagp = P_HSTART; 1578 } else if (kshchar == '+') { 1579 op = P_TWOHASH; 1580 *flagp = P_HSTART; 1581 } else if (kshchar == '?') { 1582 op = 0; 1583 *flagp = 0; 1584 } else if (count) { 1585 op = P_COUNT; 1586 patparse += 3; 1587 *flagp = P_HSTART; 1588 } else if (*++patparse == zpc_special[ZPC_HASH]) { 1589 op = P_TWOHASH; 1590 patparse++; 1591 *flagp = P_HSTART; 1592 } else { 1593 op = P_ONEHASH; 1594 *flagp = P_HSTART; 1595 } 1596 1597 /* 1598 * Note optimizations with pointers into P_NOTHING branches: some 1599 * should logically point to next node after current piece. 1600 * 1601 * Backtracking is also encoded in a slightly obscure way: the 1602 * code emitted ensures we test the non-empty branch of complex 1603 * patterns before the empty branch on each repetition. Hence 1604 * each time we fail on a non-empty branch, we try the empty branch, 1605 * which is equivalent to backtracking. 1606 */ 1607 if (op == P_COUNT) { 1608 /* (#cN,M) */ 1609 union upat countargs[P_CT_OPERAND]; 1610 char *opp = patparse; 1611 1612 countargs[0].l = P_COUNT; 1613 countargs[P_CT_CURRENT].l = 0L; 1614 countargs[P_CT_MIN].l = (long)zstrtol(patparse, &patparse, 10); 1615 if (patparse == opp) { 1616 /* missing number treated as zero */ 1617 countargs[P_CT_MIN].l = 0L; 1618 } 1619 if (*patparse != ',' && *patparse != Comma) { 1620 /* either max = min or error */ 1621 if (*patparse != Outpar) 1622 return 0; 1623 countargs[P_CT_MAX].l = countargs[P_CT_MIN].l; 1624 } else { 1625 opp = ++patparse; 1626 countargs[P_CT_MAX].l = (long)zstrtol(patparse, &patparse, 10); 1627 if (*patparse != Outpar) 1628 return 0; 1629 if (patparse == opp) { 1630 /* missing number treated as infinity: record as -1 */ 1631 countargs[P_CT_MAX].l = -1L; 1632 } 1633 } 1634 patparse++; 1635 countargs[P_CT_PTR].p = NULL; 1636 /* Mark this chain as a min/max count... */ 1637 patinsert(P_COUNTSTART, starter, (char *)countargs, sizeof(countargs)); 1638 /* 1639 * The next of the operand is a loop back to the P_COUNT. This is 1640 * how we get recursion for the count. We don't loop back to 1641 * the P_COUNTSTART; that's used for initialising the count 1642 * and saving and restoring the count for any enclosing use 1643 * of the match. 1644 */ 1645 opnd = P_OPERAND(starter) + P_CT_OPERAND; 1646 pattail(opnd, patnode(P_BACK)); 1647 pattail(opnd, P_OPERAND(starter)); 1648 /* 1649 * The next of the counter operators is what follows the 1650 * closure. 1651 * This handles matching of the tail. 1652 */ 1653 next = patnode(P_NOTHING); 1654 pattail(starter, next); 1655 pattail(P_OPERAND(starter), next); 1656 } else if ((flags & P_SIMPLE) && (op == P_ONEHASH || op == P_TWOHASH) && 1657 P_OP((Upat)patout+starter) == P_ANY) { 1658 /* Optimize ?# to *. Silly thing to do, since who would use 1659 * use ?# ? But it makes the later code shorter. 1660 */ 1661 Upat uptr = (Upat)patout + starter; 1662 if (op == P_TWOHASH) { 1663 /* ?## becomes ?* */ 1664 uptr->l = (uptr->l & ~0xff) | P_ANY; 1665 pattail(starter, patnode(P_STAR)); 1666 } else { 1667 uptr->l = (uptr->l & ~0xff) | P_STAR; 1668 } 1669 } else if ((flags & P_SIMPLE) && op && !(patglobflags & 0xff)) { 1670 /* Simplify, but not if we need to look for approximations. */ 1671 patinsert(op, starter, NULL, 0); 1672 } else if (op == P_ONEHASH) { 1673 /* Emit x# as (x&|), where & means "self". */ 1674 up.p = NULL; 1675 patinsert(P_WBRANCH, starter, (char *)&up, sizeof(up)); 1676 /* Either x */ 1677 patoptail(starter, patnode(P_BACK)); /* and loop */ 1678 patoptail(starter, starter); /* back */ 1679 pattail(starter, patnode(P_BRANCH)); /* or */ 1680 pattail(starter, patnode(P_NOTHING)); /* null. */ 1681 } else if (op == P_TWOHASH) { 1682 /* Emit x## as x(&|) where & means "self". */ 1683 next = patnode(P_WBRANCH); /* Either */ 1684 up.p = NULL; 1685 patadd((char *)&up, 0, sizeof(up), 0); 1686 pattail(starter, next); 1687 pattail(patnode(P_BACK), starter); /* loop back */ 1688 pattail(next, patnode(P_BRANCH)); /* or */ 1689 pattail(starter, patnode(P_NOTHING)); /* null. */ 1690 } else if (kshchar == '?') { 1691 /* Emit ?(x) as (x|) */ 1692 patinsert(P_BRANCH, starter, NULL, 0); /* Either x */ 1693 pattail(starter, patnode(P_BRANCH)); /* or */ 1694 next = patnode(P_NOTHING); /* null */ 1695 pattail(starter, next); 1696 patoptail(starter, next); 1697 } 1698 if (*patparse == zpc_special[ZPC_HASH]) 1699 return 0; 1700 1701 return starter; 1702} 1703 1704/* 1705 * Turn a ^foo (paren = 0) or !(foo) (paren = 1) into *~foo with 1706 * parentheses if necessary. As you see, that's really quite easy. 1707 */ 1708 1709/**/ 1710static long 1711patcompnot(int paren, int *flagsp) 1712{ 1713 union upat up; 1714 long excsync, br, excl, n, starter; 1715 int dummy; 1716 1717 /* Here, we're matching a star at the start. */ 1718 *flagsp = P_HSTART; 1719 1720 starter = patnode(P_BRANCH); 1721 br = patnode(P_STAR); 1722 excsync = patnode(P_EXCSYNC); 1723 pattail(br, excsync); 1724 pattail(starter, excl = patnode(P_EXCLUDE)); 1725 up.p = NULL; 1726 patadd((char *)&up, 0, sizeof(up), 0); 1727 if (!(br = (paren ? patcompswitch(1, &dummy) : patcompbranch(&dummy, 0)))) 1728 return 0; 1729 pattail(br, patnode(P_EXCEND)); 1730 n = patnode(P_NOTHING); /* just so much easier */ 1731 pattail(excsync, n); 1732 pattail(excl, n); 1733 1734 return starter; 1735} 1736 1737/* Emit a node */ 1738 1739/**/ 1740static long 1741patnode(long op) 1742{ 1743 long starter = (Upat)patcode - (Upat)patout; 1744 union upat up; 1745 1746 up.l = op; 1747 patadd((char *)&up, 0, sizeof(union upat), 0); 1748 return starter; 1749} 1750 1751/* 1752 * insert an operator in front of an already emitted operand: 1753 * we relocate the operand. there had better be nothing else after. 1754 */ 1755 1756/**/ 1757static void 1758patinsert(long op, int opnd, char *xtra, int sz) 1759{ 1760 char *src, *dst, *opdst; 1761 union upat buf, *lptr; 1762 1763 buf.l = 0; 1764 patadd((char *)&buf, 0, sizeof(buf), 0); 1765 if (sz) 1766 patadd(xtra, 0, sz, 0); 1767 src = patcode - sizeof(union upat) - sz; 1768 dst = patcode; 1769 opdst = patout + opnd * sizeof(union upat); 1770 while (src > opdst) 1771 *--dst = *--src; 1772 1773 /* A cast can't be an lvalue */ 1774 lptr = (Upat)opdst; 1775 lptr->l = op; 1776 opdst += sizeof(union upat); 1777 while (sz--) 1778 *opdst++ = *xtra++; 1779} 1780 1781/* set the 'next' pointer at the end of a node chain */ 1782 1783/**/ 1784static void 1785pattail(long p, long val) 1786{ 1787 Upat scan, temp; 1788 long offset; 1789 1790 scan = (Upat)patout + p; 1791 for (;;) { 1792 if (!(temp = PATNEXT(scan))) 1793 break; 1794 scan = temp; 1795 } 1796 1797 offset = (P_OP(scan) == P_BACK) 1798 ? (scan - (Upat)patout) - val : val - (scan - (Upat)patout); 1799 1800 scan->l |= offset << 8; 1801} 1802 1803/* do pattail, but on operand of first argument; nop if operandless */ 1804 1805/**/ 1806static void patoptail(long p, long val) 1807{ 1808 Upat ptr = (Upat)patout + p; 1809 int op = P_OP(ptr); 1810 if (!p || !P_ISBRANCH(ptr)) 1811 return; 1812 if (op == P_BRANCH) 1813 pattail(P_OPERAND(p), val); 1814 else 1815 pattail(P_OPERAND(p) + 1, val); 1816} 1817 1818 1819/* 1820 * Run a pattern. 1821 */ 1822static char *patinstart; /* Start of input string */ 1823static char *patinend; /* End of input string */ 1824static char *patinput; /* String input pointer */ 1825static char *patinpath; /* Full path for use with ~ exclusions */ 1826static int patinlen; /* Length of last successful match. 1827 * Includes count of Meta characters. 1828 */ 1829 1830static char *patbeginp[NSUBEXP]; /* Pointer to backref beginnings */ 1831static char *patendp[NSUBEXP]; /* Pointer to backref ends */ 1832static int parsfound; /* parentheses (with backrefs) found */ 1833 1834static int globdots; /* Glob initial dots? */ 1835 1836/* 1837 * Character functions operating on unmetafied strings. 1838 */ 1839#ifdef MULTIBYTE_SUPPORT 1840 1841/* Get a character from the start point in a string */ 1842#define CHARREF(x, y) charref((x), (y)) 1843static wchar_t 1844charref(char *x, char *y) 1845{ 1846 wchar_t wc; 1847 size_t ret; 1848 1849 if (!(patglobflags & GF_MULTIBYTE) || !(STOUC(*x) & 0x80)) 1850 return (wchar_t) STOUC(*x); 1851 1852 ret = mbrtowc(&wc, x, y-x, &shiftstate); 1853 1854 if (ret == MB_INVALID || ret == MB_INCOMPLETE) { 1855 /* Error. Treat as single byte. */ 1856 /* Reset the shift state for next time. */ 1857 memset(&shiftstate, 0, sizeof(shiftstate)); 1858 return (wchar_t) STOUC(*x); 1859 } 1860 1861 return wc; 1862} 1863 1864/* Get a pointer to the next character */ 1865#define CHARNEXT(x, y) charnext((x), (y)) 1866static char * 1867charnext(char *x, char *y) 1868{ 1869 wchar_t wc; 1870 size_t ret; 1871 1872 if (!(patglobflags & GF_MULTIBYTE) || !(STOUC(*x) & 0x80)) 1873 return x + 1; 1874 1875 ret = mbrtowc(&wc, x, y-x, &shiftstate); 1876 1877 if (ret == MB_INVALID || ret == MB_INCOMPLETE) { 1878 /* Error. Treat as single byte. */ 1879 /* Reset the shift state for next time. */ 1880 memset(&shiftstate, 0, sizeof(shiftstate)); 1881 return x + 1; 1882 } 1883 1884 /* Nulls here are normal characters */ 1885 return x + (ret ? ret : 1); 1886} 1887 1888/* Increment a pointer past the current character. */ 1889#define CHARINC(x, y) ((x) = charnext((x), (y))) 1890 1891 1892/* Get a character and increment */ 1893#define CHARREFINC(x, y, z) charrefinc(&(x), (y), (z)) 1894static wchar_t 1895charrefinc(char **x, char *y, int *z) 1896{ 1897 wchar_t wc; 1898 size_t ret; 1899 1900 if (!(patglobflags & GF_MULTIBYTE) || !(STOUC(**x) & 0x80)) 1901 return (wchar_t) STOUC(*(*x)++); 1902 1903 ret = mbrtowc(&wc, *x, y-*x, &shiftstate); 1904 1905 if (ret == MB_INVALID || ret == MB_INCOMPLETE) { 1906 /* Error. Treat as single byte, but flag. */ 1907 *z = 1; 1908 /* Reset the shift state for next time. */ 1909 memset(&shiftstate, 0, sizeof(shiftstate)); 1910 return (wchar_t) STOUC(*(*x)++); 1911 } 1912 1913 /* Nulls here are normal characters */ 1914 *x += ret ? ret : 1; 1915 1916 return wc; 1917} 1918 1919 1920/* 1921 * Counter the number of characters between two pointers, smaller first 1922 * 1923 * This is used when setting values in parameters, so we obey 1924 * the MULTIBYTE option (even if it's been overridden locally). 1925 */ 1926#define CHARSUB(x,y) charsub(x, y) 1927static ptrdiff_t 1928charsub(char *x, char *y) 1929{ 1930 ptrdiff_t res = 0; 1931 size_t ret; 1932 wchar_t wc; 1933 1934 if (!isset(MULTIBYTE)) 1935 return y - x; 1936 1937 while (x < y) { 1938 ret = mbrtowc(&wc, x, y-x, &shiftstate); 1939 1940 if (ret == MB_INVALID || ret == MB_INCOMPLETE) { 1941 /* Error. Treat remainder as single characters */ 1942 return res + (y - x); 1943 } 1944 1945 /* Treat nulls as normal characters */ 1946 if (!ret) 1947 ret = 1; 1948 res++; 1949 x += ret; 1950 } 1951 1952 return res; 1953} 1954 1955#else /* no MULTIBYTE_SUPPORT */ 1956 1957/* Get a character from the start point in a string */ 1958#define CHARREF(x, y) (STOUC(*(x))) 1959/* Get a pointer to the next character */ 1960#define CHARNEXT(x, y) ((x)+1) 1961/* Increment a pointer past the current character. */ 1962#define CHARINC(x, y) ((x)++) 1963/* Get a character and increment */ 1964#define CHARREFINC(x, y, z) (STOUC(*(x)++)) 1965/* Counter the number of characters between two pointers, smaller first */ 1966#define CHARSUB(x,y) ((y) - (x)) 1967 1968#endif /* MULTIBYTE_SUPPORT */ 1969 1970/* 1971 * The following need to be accessed in the globbing scanner for 1972 * a multi-component file path. See horror story in glob.c. 1973 */ 1974/**/ 1975int errsfound; /* Total error count so far */ 1976 1977/**/ 1978int forceerrs; /* Forced maximum error count */ 1979 1980/**/ 1981void 1982pattrystart(void) 1983{ 1984 forceerrs = -1; 1985 errsfound = 0; 1986} 1987 1988/* 1989 * Test prog against null-terminated, metafied string. 1990 */ 1991 1992/**/ 1993mod_export int 1994pattry(Patprog prog, char *string) 1995{ 1996 return pattryrefs(prog, string, -1, -1, 0, NULL, NULL, NULL); 1997} 1998 1999/* 2000 * Test prog against string of given length, no null termination 2001 * but still metafied at this point. offset gives an offset 2002 * to include in reported match indices 2003 */ 2004 2005/**/ 2006mod_export int 2007pattrylen(Patprog prog, char *string, int len, int unmetalen, int offset) 2008{ 2009 return pattryrefs(prog, string, len, unmetalen, offset, NULL, NULL, NULL); 2010} 2011 2012/* 2013 * Test prog against string with given lengths. The input 2014 * string is metafied; stringlen is the raw string length, and 2015 * unmetalen the number of characters in the original string (some 2016 * of which may now be metafied). Either value may be -1 2017 * to indicate a null-terminated string which will be counted. Note 2018 * there may be a severe penalty for this if a lot of matching is done 2019 * on one string. 2020 * 2021 * offset is the position in the original string (not seen by 2022 * the pattern module) at which we are trying to match. 2023 * This is added in to the positions recorded in patbeginp and patendp 2024 * when we are looking for substrings. Currently this only happens 2025 * in the parameter substitution code. 2026 * 2027 * Note this is a character offset, i.e. a metafied character 2028 * counts as 1. 2029 * 2030 * The last three arguments are used to report the positions for the 2031 * backreferences. On entry, *nump should contain the maximum number 2032 * of positions to report. In this case the match, mbegin, mend 2033 * arrays are not altered. 2034 * 2035 * If nump is NULL but endp is not NULL, then *endp is set to the 2036 * end position of the match, taking into account patinstart. 2037 */ 2038 2039/**/ 2040mod_export int 2041pattryrefs(Patprog prog, char *string, int stringlen, int unmetalen, 2042 int patoffset, 2043 int *nump, int *begp, int *endp) 2044{ 2045 int i, maxnpos = 0, ret, needfullpath, unmetalenp; 2046 int origlen; 2047 char **sp, **ep, *tryalloced, *ptr; 2048 char *progstr = (char *)prog + prog->startoff; 2049 2050 if (nump) { 2051 maxnpos = *nump; 2052 *nump = 0; 2053 } 2054 /* inherited from domatch, but why, exactly? */ 2055 if (*string == Nularg) { 2056 string++; 2057 unmetalen--; 2058 } 2059 2060 if (stringlen < 0) 2061 stringlen = strlen(string); 2062 origlen = stringlen; 2063 2064 patflags = prog->flags; 2065 /* 2066 * For a top-level ~-exclusion, we will need the full 2067 * path to exclude, so copy the path so far and append the 2068 * current test string. 2069 */ 2070 needfullpath = (patflags & PAT_HAS_EXCLUDP) && pathpos; 2071 2072 /* Get the length of the full string when unmetafied. */ 2073 if (unmetalen < 0) 2074 unmetalen = ztrsub(string + stringlen, string); 2075 if (needfullpath) 2076 unmetalenp = ztrsub(pathbuf + pathpos, pathbuf); 2077 else 2078 unmetalenp = 0; 2079 2080 DPUTS(needfullpath && (patflags & (PAT_PURES|PAT_ANY)), 2081 "rum sort of file exclusion"); 2082 /* 2083 * Partly for efficiency, and partly for the convenience of 2084 * globbing, we don't unmetafy pure string patterns, and 2085 * there's no reason to if the pattern is just a *. 2086 */ 2087 if (!(patflags & (PAT_PURES|PAT_ANY)) 2088 && (needfullpath || unmetalen != stringlen)) { 2089 /* 2090 * We need to copy if we need to prepend the path so far 2091 * (in which case we copy both chunks), or if we have 2092 * Meta characters. 2093 */ 2094 char *dst; 2095 int icopy, ncopy; 2096 2097 dst = tryalloced = zalloc(unmetalen + unmetalenp); 2098 2099 if (needfullpath) { 2100 /* loop twice, copy path buffer first time */ 2101 ptr = pathbuf; 2102 ncopy = unmetalenp; 2103 } else { 2104 /* just loop once, copy string with unmetafication */ 2105 ptr = string; 2106 ncopy = unmetalen; 2107 } 2108 for (icopy = 0; icopy < 2; icopy++) { 2109 for (i = 0; i < ncopy; i++) { 2110 if (*ptr == Meta) { 2111 ptr++; 2112 *dst++ = *ptr++ ^ 32; 2113 } else { 2114 *dst++ = *ptr++; 2115 } 2116 } 2117 if (!needfullpath) 2118 break; 2119 /* next time append test string to path so far */ 2120 ptr = string; 2121 ncopy = unmetalen; 2122 } 2123 2124 if (needfullpath) { 2125 patinstart = tryalloced + unmetalenp; 2126 patinpath = tryalloced; 2127 } else { 2128 patinstart = tryalloced; 2129 patinpath = NULL; 2130 } 2131 stringlen = unmetalen; 2132 } else { 2133 patinstart = string; 2134 tryalloced = patinpath = NULL; 2135 } 2136 2137 patinend = patinstart + stringlen; 2138 /* 2139 * From now on we do not require NULL termination of 2140 * the test string. There should also be no more references 2141 * to the variable string. 2142 */ 2143 2144 if (prog->flags & (PAT_PURES|PAT_ANY)) { 2145 /* 2146 * Either we are testing against a pure string, 2147 * or we can match anything at all. 2148 */ 2149 int ret; 2150 if (prog->flags & PAT_ANY) { 2151 /* 2152 * Optimisation for a single "*": always matches 2153 * (except for no_glob_dots, see below). 2154 */ 2155 ret = 1; 2156 } else { 2157 /* 2158 * Testing a pure string. See if initial 2159 * components match. 2160 */ 2161 int lendiff = stringlen - prog->patmlen; 2162 if (lendiff < 0) { 2163 /* No, the pattern string is too long. */ 2164 ret = 0; 2165 } else if (!memcmp(progstr, patinstart, prog->patmlen)) { 2166 /* 2167 * Initial component matches. Matches either 2168 * if lengths are the same or we are not anchored 2169 * to the end of the string. 2170 */ 2171 ret = !lendiff || (prog->flags & PAT_NOANCH); 2172 } else { 2173 /* No match. */ 2174 ret = 0; 2175 } 2176 } 2177 if (ret) { 2178 /* 2179 * For files, we won't match initial "."s unless 2180 * glob_dots is set. 2181 */ 2182 if ((prog->flags & PAT_NOGLD) && *patinstart == '.') { 2183 ret = 0; 2184 } else { 2185 /* 2186 * Remember the length in case used for ${..#..} etc. 2187 * In this case, we didn't unmetafy the string. 2188 */ 2189 patinlen = (int)prog->patmlen; 2190 /* if matching files, must update globbing flags */ 2191 patglobflags = prog->globend; 2192 2193 if ((patglobflags & GF_MATCHREF) && 2194 !(patflags & PAT_FILE)) { 2195 char *str = ztrduppfx(patinstart, patinlen); 2196 char *ptr = patinstart; 2197 int mlen = 0; 2198 2199 /* 2200 * Count the characters. We're not using CHARSUB() 2201 * because the string is still metafied. We're 2202 * not using mb_metastrlen() because that expects 2203 * the string to be null terminated. 2204 */ 2205 MB_METACHARINIT(); 2206 while (ptr < patinstart + patinlen) { 2207 mlen++; 2208 ptr += MB_METACHARLEN(ptr); 2209 } 2210 2211 setsparam("MATCH", str); 2212 setiparam("MBEGIN", 2213 (zlong)(patoffset + !isset(KSHARRAYS))); 2214 setiparam("MEND", 2215 (zlong)(mlen + patoffset + 2216 !isset(KSHARRAYS) - 1)); 2217 } 2218 } 2219 } 2220 2221 if (tryalloced) 2222 zfree(tryalloced, unmetalen + unmetalenp); 2223 2224 return ret; 2225 } else { 2226 /* 2227 * Test for a `must match' string, unless we're scanning for a match 2228 * in which case we don't need to do this each time. 2229 */ 2230 ret = 1; 2231 if (!(prog->flags & PAT_SCAN) && prog->mustoff) 2232 { 2233 char *testptr; /* start pointer into test string */ 2234 char *teststop; /* last point from which we can match */ 2235 char *patptr = (char *)prog + prog->mustoff; 2236 int patlen = prog->patmlen; 2237 int found = 0; 2238 2239 if (patlen > stringlen) { 2240 /* Too long, can't match. */ 2241 ret = 0; 2242 } else { 2243 teststop = patinend - patlen; 2244 2245 for (testptr = patinstart; testptr <= teststop; testptr++) 2246 { 2247 if (!memcmp(testptr, patptr, patlen)) { 2248 found = 1; 2249 break; 2250 } 2251 } 2252 2253 if (!found) 2254 ret = 0; 2255 } 2256 } 2257 if (!ret) { 2258 if (tryalloced) 2259 zfree(tryalloced, unmetalen + unmetalenp); 2260 return 0; 2261 } 2262 2263 patglobflags = prog->globflags; 2264 if (!(patflags & PAT_FILE)) { 2265 forceerrs = -1; 2266 errsfound = 0; 2267 } 2268 globdots = !(patflags & PAT_NOGLD); 2269 parsfound = 0; 2270 2271 patinput = patinstart; 2272 2273 if (patmatch((Upat)progstr)) { 2274 /* 2275 * we were lazy and didn't save the globflags if an exclusion 2276 * failed, so set it now 2277 */ 2278 patglobflags = prog->globend; 2279 2280 /* 2281 * Record length of successful match, including Meta 2282 * characters. Do it here so that patmatchlen() can return 2283 * it even if we delete the pattern strings. 2284 */ 2285 patinlen = patinput - patinstart; 2286 /* 2287 * Optimization: if we didn't find any Meta characters 2288 * to begin with, we don't need to look for them now. 2289 */ 2290 if (unmetalen != origlen) { 2291 for (ptr = patinstart; ptr < patinput; ptr++) 2292 if (imeta(*ptr)) 2293 patinlen++; 2294 } 2295 2296 /* 2297 * Should we clear backreferences and matches on a failed 2298 * match? 2299 */ 2300 if ((patglobflags & GF_MATCHREF) && !(patflags & PAT_FILE)) { 2301 /* 2302 * m flag: for global match. This carries no overhead 2303 * in the pattern matching part. 2304 * 2305 * Remember the test pattern is already unmetafied. 2306 */ 2307 char *str; 2308 int mlen = CHARSUB(patinstart, patinput); 2309 2310 str = metafy(patinstart, patinput - patinstart, META_DUP); 2311 setsparam("MATCH", str); 2312 setiparam("MBEGIN", (zlong)(patoffset + !isset(KSHARRAYS))); 2313 setiparam("MEND", 2314 (zlong)(mlen + patoffset + 2315 !isset(KSHARRAYS) - 1)); 2316 } 2317 if (prog->patnpar && nump) { 2318 /* 2319 * b flag: for backreferences using parentheses. Reported 2320 * directly. 2321 */ 2322 *nump = prog->patnpar; 2323 2324 sp = patbeginp; 2325 ep = patendp; 2326 2327 for (i = 0; i < prog->patnpar && i < maxnpos; i++) { 2328 if (parsfound & (1 << i)) { 2329 if (begp) 2330 *begp++ = CHARSUB(patinstart, *sp) + patoffset; 2331 if (endp) 2332 *endp++ = CHARSUB(patinstart, *ep) + patoffset 2333 - 1; 2334 } else { 2335 if (begp) 2336 *begp++ = -1; 2337 if (endp) 2338 *endp++ = -1; 2339 } 2340 2341 sp++; 2342 ep++; 2343 } 2344 } else if (prog->patnpar && !(patflags & PAT_FILE)) { 2345 /* 2346 * b flag: for backreferences using parentheses. 2347 */ 2348 int palen = prog->patnpar+1; 2349 char **matcharr, **mbeginarr, **mendarr; 2350 char numbuf[DIGBUFSIZE]; 2351 2352 matcharr = zshcalloc(palen*sizeof(char *)); 2353 mbeginarr = zshcalloc(palen*sizeof(char *)); 2354 mendarr = zshcalloc(palen*sizeof(char *)); 2355 2356 sp = patbeginp; 2357 ep = patendp; 2358 2359 for (i = 0; i < prog->patnpar; i++) { 2360 if (parsfound & (1 << i)) { 2361 matcharr[i] = metafy(*sp, *ep - *sp, META_DUP); 2362 /* 2363 * mbegin and mend give indexes into the string 2364 * in the standard notation, i.e. respecting 2365 * KSHARRAYS, and with the end index giving 2366 * the last character, not one beyond. 2367 * For example, foo=foo; [[ $foo = (f)oo ]] gives 2368 * (without KSHARRAYS) indexes 1 and 1, which 2369 * corresponds to indexing as ${foo[1,1]}. 2370 */ 2371 sprintf(numbuf, "%ld", 2372 (long)(CHARSUB(patinstart, *sp) + 2373 patoffset + 2374 !isset(KSHARRAYS))); 2375 mbeginarr[i] = ztrdup(numbuf); 2376 sprintf(numbuf, "%ld", 2377 (long)(CHARSUB(patinstart, *ep) + 2378 patoffset + 2379 !isset(KSHARRAYS) - 1)); 2380 mendarr[i] = ztrdup(numbuf); 2381 } else { 2382 /* Pattern wasn't set: either it was in an 2383 * unmatched branch, or a hashed parenthesis 2384 * that didn't match at all. 2385 */ 2386 matcharr[i] = ztrdup(""); 2387 mbeginarr[i] = ztrdup("-1"); 2388 mendarr[i] = ztrdup("-1"); 2389 } 2390 sp++; 2391 ep++; 2392 } 2393 setaparam("match", matcharr); 2394 setaparam("mbegin", mbeginarr); 2395 setaparam("mend", mendarr); 2396 } 2397 2398 if (!nump && endp) { 2399 /* 2400 * We just need the overall end position. 2401 */ 2402 *endp = CHARSUB(patinstart, patinput) + patoffset; 2403 } 2404 2405 ret = 1; 2406 } else 2407 ret = 0; 2408 2409 if (tryalloced) 2410 zfree(tryalloced, unmetalen + unmetalenp); 2411 2412 return ret; 2413 } 2414} 2415 2416/* 2417 * Return length of previous succesful match. This is 2418 * in metafied bytes, i.e. includes a count of Meta characters. 2419 * Unusual and futile attempt at modular encapsulation. 2420 */ 2421 2422/**/ 2423int 2424patmatchlen(void) 2425{ 2426 return patinlen; 2427} 2428 2429/* 2430 * Match literal characters with case insensitivity test: the first 2431 * comes from the input string, the second the current pattern. 2432 */ 2433#ifdef MULTIBYTE_SUPPORT 2434#define ISUPPER(x) iswupper(x) 2435#define ISLOWER(x) iswlower(x) 2436#define TOUPPER(x) towupper(x) 2437#define TOLOWER(x) towlower(x) 2438#define ISDIGIT(x) iswdigit(x) 2439#else 2440#define ISUPPER(x) isupper(x) 2441#define ISLOWER(x) islower(x) 2442#define TOUPPER(x) toupper(x) 2443#define TOLOWER(x) tolower(x) 2444#define ISDIGIT(x) idigit(x) 2445#endif 2446#define CHARMATCH(chin, chpa) (chin == chpa || \ 2447 ((patglobflags & GF_IGNCASE) ? \ 2448 ((ISUPPER(chin) ? TOLOWER(chin) : chin) == \ 2449 (ISUPPER(chpa) ? TOLOWER(chpa) : chpa)) : \ 2450 (patglobflags & GF_LCMATCHUC) ? \ 2451 (ISLOWER(chpa) && TOUPPER(chpa) == chin) : 0)) 2452 2453/* 2454 * The same but caching an expression from the first argument, 2455 * Requires local charmatch_cache definition. 2456 */ 2457#define CHARMATCH_EXPR(expr, chpa) \ 2458 (charmatch_cache = (expr), CHARMATCH(charmatch_cache, chpa)) 2459 2460/* 2461 * exactpos is used to remember how far down an exact string we have 2462 * matched, if we are doing approximation and can therefore redo from 2463 * the same point; we never need to otherwise. 2464 * 2465 * exactend is a pointer to the end of the string, which isn't 2466 * null-terminated. 2467 */ 2468static char *exactpos, *exactend; 2469 2470/* 2471 * Main matching routine. 2472 * 2473 * Testing the tail end of a match is usually done by recursion, but 2474 * we try to eliminate that in favour of looping for simple cases. 2475 */ 2476 2477/**/ 2478static int 2479patmatch(Upat prog) 2480{ 2481 /* Current and next nodes */ 2482 Upat scan = prog, next, opnd; 2483 char *start, *save, *chrop, *chrend, *compend; 2484 int savglobflags, op, no, min, fail = 0, saverrsfound; 2485 zrange_t from, to, comp; 2486 patint_t nextch; 2487 2488 while (scan) { 2489 next = PATNEXT(scan); 2490 2491 if (!globdots && P_NOTDOT(scan) && patinput == patinstart && 2492 patinput < patinend && *patinput == '.') 2493 return 0; 2494 2495 switch (P_OP(scan)) { 2496 case P_ANY: 2497 if (patinput == patinend) 2498 fail = 1; 2499 else 2500 CHARINC(patinput, patinend); 2501 break; 2502 case P_EXACTLY: 2503 /* 2504 * acts as nothing if *chrop is null: this is used by 2505 * approx code. 2506 */ 2507 if (exactpos) { 2508 chrop = exactpos; 2509 chrend = exactend; 2510 } else { 2511 chrop = P_LS_STR(scan); 2512 chrend = chrop + P_LS_LEN(scan); 2513 } 2514 exactpos = NULL; 2515 while (chrop < chrend && patinput < patinend) { 2516 char *savpatinput = patinput; 2517 char *savchrop = chrop; 2518 int badin = 0, badpa = 0; 2519 /* 2520 * Care with character matching: 2521 * We do need to convert the character to wide 2522 * representation if possible, because we may need 2523 * to do case transformation. However, we should 2524 * be careful in case one, but not the other, wasn't 2525 * representable in the current locale---in that 2526 * case they don't match even if the returned 2527 * values (one properly converted, one raw) are 2528 * the same. 2529 */ 2530 patint_t chin = CHARREFINC(patinput, patinend, &badin); 2531 patint_t chpa = CHARREFINC(chrop, chrend, &badpa); 2532 if (!CHARMATCH(chin, chpa) || badin != badpa) { 2533 fail = 1; 2534 patinput = savpatinput; 2535 chrop = savchrop; 2536 break; 2537 } 2538 } 2539 if (chrop < chrend) { 2540 exactpos = chrop; 2541 exactend = chrend; 2542 fail = 1; 2543 } 2544 break; 2545 case P_ANYOF: 2546 case P_ANYBUT: 2547 if (patinput == patinend) 2548 fail = 1; 2549 else { 2550#ifdef MULTIBYTE_SUPPORT 2551 wchar_t cr = CHARREF(patinput, patinend); 2552 char *scanop = (char *)P_OPERAND(scan); 2553 if (patglobflags & GF_MULTIBYTE) { 2554 if (mb_patmatchrange(scanop, cr, NULL, NULL) ^ 2555 (P_OP(scan) == P_ANYOF)) 2556 fail = 1; 2557 else 2558 CHARINC(patinput, patinend); 2559 } else if (patmatchrange(scanop, (int)cr, NULL, NULL) ^ 2560 (P_OP(scan) == P_ANYOF)) 2561 fail = 1; 2562 else 2563 CHARINC(patinput, patinend); 2564#else 2565 if (patmatchrange((char *)P_OPERAND(scan), 2566 CHARREF(patinput, patinend), NULL, NULL) ^ 2567 (P_OP(scan) == P_ANYOF)) 2568 fail = 1; 2569 else 2570 CHARINC(patinput, patinend); 2571#endif 2572 } 2573 break; 2574 case P_NUMRNG: 2575 case P_NUMFROM: 2576 case P_NUMTO: 2577 /* 2578 * To do this properly, we really have to treat numbers as 2579 * closures: that's so things like <1-1000>33 will 2580 * match 633 (they didn't up to 3.1.6). To avoid making this 2581 * too inefficient, we see if there's an exact match next: 2582 * if there is, and it's not a digit, we return 1 after 2583 * the first attempt. 2584 */ 2585 op = P_OP(scan); 2586 start = (char *)P_OPERAND(scan); 2587 from = to = 0; 2588 if (op != P_NUMTO) { 2589#ifdef ZSH_64_BIT_TYPE 2590 /* We can't rely on pointer alignment being good enough. */ 2591 memcpy((char *)&from, start, sizeof(zrange_t)); 2592#else 2593 from = *((zrange_t *) start); 2594#endif 2595 start += sizeof(zrange_t); 2596 } 2597 if (op != P_NUMFROM) { 2598#ifdef ZSH_64_BIT_TYPE 2599 memcpy((char *)&to, start, sizeof(zrange_t)); 2600#else 2601 to = *((zrange_t *) start); 2602#endif 2603 } 2604 start = compend = patinput; 2605 comp = 0; 2606 while (patinput < patinend && idigit(*patinput)) { 2607 if (comp) 2608 comp *= 10; 2609 comp += *patinput - '0'; 2610 patinput++; 2611 compend++; 2612 2613 if (comp & ((zrange_t)1 << (sizeof(comp)*8 - 2614#ifdef ZRANGE_T_IS_SIGNED 2615 2 2616#else 2617 1 2618#endif 2619 ))) { 2620 /* 2621 * Out of range (allowing for signedness, which 2622 * we need if we are using zlongs). 2623 * This is as far as we can go. 2624 * If we're doing a range "from", skip all the 2625 * remaining numbers. Otherwise, we can't 2626 * match beyond the previous point anyway. 2627 * Leave the pointer to the last calculated 2628 * position (compend) where it was before. 2629 */ 2630 if (op == P_NUMFROM) { 2631 while (patinput < patinend && idigit(*patinput)) 2632 patinput++; 2633 } 2634 } 2635 } 2636 save = patinput; 2637 no = 0; 2638 while (patinput > start) { 2639 /* if already too small, no power on earth can save it */ 2640 if (comp < from && patinput <= compend) 2641 break; 2642 if ((op == P_NUMFROM || comp <= to) && patmatch(next)) { 2643 return 1; 2644 } 2645 if (!no && P_OP(next) == P_EXACTLY && 2646 (!P_LS_LEN(next) || 2647 !idigit(STOUC(*P_LS_STR(next)))) && 2648 !(patglobflags & 0xff)) 2649 return 0; 2650 patinput = --save; 2651 no++; 2652 /* 2653 * With a range start and an unrepresentable test 2654 * number, we just back down the test string without 2655 * changing the number until we get to a representable 2656 * one. 2657 */ 2658 if (patinput < compend) 2659 comp /= 10; 2660 } 2661 patinput = start; 2662 fail = 1; 2663 break; 2664 case P_NUMANY: 2665 /* This is <->: any old set of digits, don't bother comparing */ 2666 start = patinput; 2667 while (patinput < patinend && idigit(*patinput)) 2668 patinput++; 2669 save = patinput; 2670 no = 0; 2671 while (patinput > start) { 2672 if (patmatch(next)) 2673 return 1; 2674 if (!no && P_OP(next) == P_EXACTLY && 2675 (!P_LS_LEN(next) || 2676 !idigit(*P_LS_STR(next))) && 2677 !(patglobflags & 0xff)) 2678 return 0; 2679 patinput = --save; 2680 no++; 2681 } 2682 patinput = start; 2683 fail = 1; 2684 break; 2685 case P_NOTHING: 2686 break; 2687 case P_BACK: 2688 break; 2689 case P_GFLAGS: 2690 patglobflags = P_OPERAND(scan)->l; 2691 break; 2692 case P_OPEN: 2693 case P_OPEN+1: 2694 case P_OPEN+2: 2695 case P_OPEN+3: 2696 case P_OPEN+4: 2697 case P_OPEN+5: 2698 case P_OPEN+6: 2699 case P_OPEN+7: 2700 case P_OPEN+8: 2701 case P_OPEN+9: 2702 no = P_OP(scan) - P_OPEN; 2703 save = patinput; 2704 2705 if (patmatch(next)) { 2706 /* 2707 * Don't set patbeginp if some later invocation of 2708 * the same parentheses already has. 2709 */ 2710 if (no && !(parsfound & (1 << (no - 1)))) { 2711 patbeginp[no-1] = save; 2712 parsfound |= 1 << (no - 1); 2713 } 2714 return 1; 2715 } else 2716 return 0; 2717 break; 2718 case P_CLOSE: 2719 case P_CLOSE+1: 2720 case P_CLOSE+2: 2721 case P_CLOSE+3: 2722 case P_CLOSE+4: 2723 case P_CLOSE+5: 2724 case P_CLOSE+6: 2725 case P_CLOSE+7: 2726 case P_CLOSE+8: 2727 case P_CLOSE+9: 2728 no = P_OP(scan) - P_CLOSE; 2729 save = patinput; 2730 2731 if (patmatch(next)) { 2732 if (no && !(parsfound & (1 << (no + 15)))) { 2733 patendp[no-1] = save; 2734 parsfound |= 1 << (no + 15); 2735 } 2736 return 1; 2737 } else 2738 return 0; 2739 break; 2740 case P_EXCSYNC: 2741 /* See the P_EXCLUDE code below for where syncptr comes from */ 2742 { 2743 unsigned char *syncptr; 2744 Upat after; 2745 after = P_OPERAND(scan); 2746 DPUTS(!P_ISEXCLUDE(after), 2747 "BUG: EXCSYNC not followed by EXCLUDE."); 2748 DPUTS(!P_OPERAND(after)->p, 2749 "BUG: EXCSYNC not handled by EXCLUDE"); 2750 syncptr = P_OPERAND(after)->p + (patinput - patinstart); 2751 /* 2752 * If we already matched from here, this time we fail. 2753 * See WBRANCH code for story about error count. 2754 */ 2755 if (*syncptr && errsfound + 1 >= *syncptr) 2756 return 0; 2757 /* 2758 * Else record that we (possibly) matched this time. 2759 * No harm if we don't: then the previous test will just 2760 * short cut the attempted match that is bound to fail. 2761 * We never try to exclude something that has already 2762 * failed anyway. 2763 */ 2764 *syncptr = errsfound + 1; 2765 } 2766 break; 2767 case P_EXCEND: 2768 /* 2769 * This is followed by a P_EXCSYNC, but only in the P_EXCLUDE 2770 * branch. Actually, we don't bother following it: all we 2771 * need to know is that we successfully matched so far up 2772 * to the end of the asserted pattern; the endpoint 2773 * in the target string is nulled out. 2774 */ 2775 if (!(fail = (patinput < patinend))) 2776 return 1; 2777 break; 2778 case P_BRANCH: 2779 case P_WBRANCH: 2780 /* P_EXCLUDE shouldn't occur without a P_BRANCH */ 2781 if (!P_ISBRANCH(next)) { 2782 /* no choice, avoid recursion */ 2783 DPUTS(P_OP(scan) == P_WBRANCH, 2784 "BUG: WBRANCH with no alternative."); 2785 next = P_OPERAND(scan); 2786 } else { 2787 do { 2788 save = patinput; 2789 savglobflags = patglobflags; 2790 saverrsfound = errsfound; 2791 if (P_ISEXCLUDE(next)) { 2792 /* 2793 * The strategy is to test the asserted pattern, 2794 * recording via P_EXCSYNC how far the part to 2795 * be excluded matched. We then set the 2796 * length of the test string to that 2797 * point and see if the exclusion as far as 2798 * P_EXCEND also matches that string. 2799 * We need to keep testing the asserted pattern 2800 * by backtracking, since the first attempt 2801 * may be excluded while a later attempt may not. 2802 * For this we keep a pointer just after 2803 * the P_EXCLUDE which is tested by the P_EXCSYNC 2804 * to see if we matched there last time, in which 2805 * case we fail. If there is nothing to backtrack 2806 * over, that doesn't matter: we should fail anyway. 2807 * The pointer also tells us where the asserted 2808 * pattern matched for use by the exclusion. 2809 * 2810 * It's hard to allocate space for this 2811 * beforehand since we may need to do it 2812 * recursively. 2813 * 2814 * P.S. in case you were wondering, this code 2815 * is horrible. 2816 */ 2817 Upat syncstrp; 2818 char *origpatinend; 2819 unsigned char *oldsyncstr; 2820 char *matchpt = NULL; 2821 int ret, savglobdots, matchederrs = 0; 2822 int savparsfound = parsfound; 2823 DPUTS(P_OP(scan) == P_WBRANCH, 2824 "BUG: excluded WBRANCH"); 2825 syncstrp = P_OPERAND(next); 2826 /* 2827 * Unlike WBRANCH, each test at the same exclude 2828 * sync point (due to an external loop) is separate, 2829 * i.e testing (foo~bar)# is no different from 2830 * (foo~bar)(foo~bar)... from the exclusion point 2831 * of view, so we use a different sync string. 2832 */ 2833 oldsyncstr = syncstrp->p; 2834 syncstrp->p = (unsigned char *) 2835 zshcalloc((patinend - patinstart) + 1); 2836 origpatinend = patinend; 2837 while ((ret = patmatch(P_OPERAND(scan)))) { 2838 unsigned char *syncpt; 2839 char *savpatinstart; 2840 int savforce = forceerrs; 2841 int savpatflags = patflags, synclen; 2842 forceerrs = -1; 2843 savglobdots = globdots; 2844 matchederrs = errsfound; 2845 matchpt = patinput; /* may not be end */ 2846 globdots = 1; /* OK to match . first */ 2847 /* Find the point where the scan 2848 * matched the part to be excluded: because 2849 * of backtracking, the one 2850 * most recently matched will be the first. 2851 * (Luckily, backtracking is done after all 2852 * possibilities for approximation have been 2853 * checked.) 2854 */ 2855 for (syncpt = syncstrp->p; !*syncpt; syncpt++) 2856 ; 2857 synclen = syncpt - syncstrp->p; 2858 if (patinstart + synclen != patinend) { 2859 /* 2860 * Temporarily mark the string as 2861 * ending at this point. 2862 */ 2863 DPUTS(patinstart + synclen > matchpt, 2864 "BUG: EXCSYNC failed"); 2865 2866 patinend = patinstart + synclen; 2867 /* 2868 * If this isn't really the end of the string, 2869 * remember this for the (#e) assertion. 2870 */ 2871 patflags |= PAT_NOTEND; 2872 } 2873 savpatinstart = patinstart; 2874 next = PATNEXT(scan); 2875 while (next && P_ISEXCLUDE(next)) { 2876 patinput = save; 2877 /* 2878 * turn off approximations in exclusions: 2879 * note we keep remaining patglobflags 2880 * set by asserted branch (or previous 2881 * excluded branches, for consistency). 2882 */ 2883 patglobflags &= ~0xff; 2884 errsfound = 0; 2885 opnd = P_OPERAND(next) + 1; 2886 if (P_OP(next) == P_EXCLUDP && patinpath) { 2887 /* 2888 * Top level exclusion with a file, 2889 * applies to whole path so add the 2890 * segments already matched. 2891 * We copied these in front of the 2892 * test pattern, so patinend doesn't 2893 * need moving. 2894 */ 2895 DPUTS(patinput != patinstart, 2896 "BUG: not at start excluding path"); 2897 patinput = patinstart = patinpath; 2898 } 2899 if (patmatch(opnd)) { 2900 ret = 0; 2901 /* 2902 * Another subtlety: if we exclude the 2903 * match, any parentheses just found 2904 * become invalidated. 2905 */ 2906 parsfound = savparsfound; 2907 } 2908 if (patinpath) { 2909 patinput = savpatinstart + 2910 (patinput - patinstart); 2911 patinstart = savpatinstart; 2912 } 2913 if (!ret) 2914 break; 2915 next = PATNEXT(next); 2916 } 2917 /* 2918 * Restore original end position. 2919 */ 2920 patinend = origpatinend; 2921 patflags = savpatflags; 2922 globdots = savglobdots; 2923 forceerrs = savforce; 2924 if (ret) 2925 break; 2926 patinput = save; 2927 patglobflags = savglobflags; 2928 errsfound = saverrsfound; 2929 } 2930 zfree((char *)syncstrp->p, 2931 (patinend - patinstart) + 1); 2932 syncstrp->p = oldsyncstr; 2933 if (ret) { 2934 patinput = matchpt; 2935 errsfound = matchederrs; 2936 return 1; 2937 } 2938 while ((scan = PATNEXT(scan)) && 2939 P_ISEXCLUDE(scan)) 2940 ; 2941 } else { 2942 int ret = 1, pfree = 0; 2943 Upat ptrp = NULL; 2944 unsigned char *ptr; 2945 if (P_OP(scan) == P_WBRANCH) { 2946 /* 2947 * This is where we make sure that we are not 2948 * repeatedly matching zero-length strings in 2949 * a closure, which would cause an infinite loop, 2950 * and also remove exponential behaviour in 2951 * backtracking nested closures. 2952 * The P_WBRANCH operator leaves a space for a 2953 * uchar *, initialized to NULL, which is 2954 * turned into a string the same length as the 2955 * target string. Every time we match from a 2956 * particular point in the target string, we 2957 * stick a 1 at the corresponding point here. 2958 * If we come round to the same branch again, and 2959 * there is already a 1, then the test fails. 2960 */ 2961 opnd = P_OPERAND(scan); 2962 ptrp = opnd++; 2963 if (!ptrp->p) { 2964 ptrp->p = (unsigned char *) 2965 zshcalloc((patinend - patinstart) + 1); 2966 pfree = 1; 2967 } 2968 ptr = ptrp->p + (patinput - patinstart); 2969 2970 /* 2971 * Without approximation, this is just a 2972 * single bit test. With approximation, we 2973 * need to know how many errors there were 2974 * last time we made the test. If errsfound 2975 * is now smaller than it was, hence we can 2976 * make more approximations in the remaining 2977 * code, we continue with the test. 2978 * (This is why the max number of errors is 2979 * 254, not 255.) 2980 */ 2981 if (*ptr && errsfound + 1 >= *ptr) 2982 ret = 0; 2983 *ptr = errsfound + 1; 2984 } else 2985 opnd = P_OPERAND(scan); 2986 if (ret) 2987 ret = patmatch(opnd); 2988 if (pfree) { 2989 zfree((char *)ptrp->p, 2990 (patinend - patinstart) + 1); 2991 ptrp->p = NULL; 2992 } 2993 if (ret) 2994 return 1; 2995 scan = PATNEXT(scan); 2996 } 2997 patinput = save; 2998 patglobflags = savglobflags; 2999 errsfound = saverrsfound; 3000 DPUTS(P_OP(scan) == P_WBRANCH, 3001 "BUG: WBRANCH not first choice."); 3002 next = PATNEXT(scan); 3003 } while (scan && P_ISBRANCH(scan)); 3004 return 0; 3005 } 3006 break; 3007 case P_STAR: 3008 /* Handle specially for speed, although really P_ONEHASH+P_ANY */ 3009 case P_ONEHASH: 3010 case P_TWOHASH: 3011 /* 3012 * This is just simple cases, matching one character. 3013 * With approximations, we still handle * this way, since 3014 * no approximation is ever necessary, but other closures 3015 * are handled by the more complicated branching method 3016 */ 3017 op = P_OP(scan); 3018 /* Note that no counts possibly metafied characters */ 3019 start = patinput; 3020 { 3021 char *lastcharstart; 3022 /* 3023 * Array to record the start of characters for 3024 * backtracking. 3025 */ 3026 VARARR(char, charstart, patinend-patinput); 3027 memset(charstart, 0, patinend-patinput); 3028 3029 if (op == P_STAR) { 3030 for (no = 0; patinput < patinend; 3031 CHARINC(patinput, patinend)) 3032 { 3033 charstart[patinput-start] = 1; 3034 no++; 3035 } 3036 /* simple optimization for reasonably common case */ 3037 if (P_OP(next) == P_END) 3038 return 1; 3039 } else { 3040 DPUTS(patglobflags & 0xff, 3041 "BUG: wrong backtracking with approximation."); 3042 if (!globdots && P_NOTDOT(P_OPERAND(scan)) && 3043 patinput == patinstart && patinput < patinend && 3044 CHARREF(patinput, patinend) == ZWC('.')) 3045 return 0; 3046 no = patrepeat(P_OPERAND(scan), charstart); 3047 } 3048 min = (op == P_TWOHASH) ? 1 : 0; 3049 /* 3050 * Lookahead to avoid useless matches. This is not possible 3051 * with approximation. 3052 */ 3053 if (P_OP(next) == P_EXACTLY && P_LS_LEN(next) && 3054 !(patglobflags & 0xff)) { 3055 char *nextop = P_LS_STR(next); 3056#ifdef MULTIBYTE_SUPPORT 3057 /* else second argument of CHARREF isn't used */ 3058 int nextlen = P_LS_LEN(next); 3059#endif 3060 /* 3061 * If that P_EXACTLY is last (common in simple patterns, 3062 * such as *.c), then it can be only be matched at one 3063 * point in the test string, so record that. 3064 */ 3065 if (P_OP(PATNEXT(next)) == P_END && 3066 !(patflags & PAT_NOANCH)) { 3067 int ptlen = patinend - patinput; 3068 int lenmatch = patinend - 3069 (min ? CHARNEXT(start, patinend) : start); 3070 /* Are we in the right range? */ 3071 if (P_LS_LEN(next) > lenmatch || 3072 P_LS_LEN(next) < ptlen) 3073 return 0; 3074 /* Yes, just position appropriately and test. */ 3075 patinput += ptlen - P_LS_LEN(next); 3076 /* 3077 * Here we will need to be careful that patinput is not 3078 * in the middle of a multibyte character. 3079 */ 3080 /* Continue loop with P_EXACTLY test. */ 3081 break; 3082 } 3083 nextch = CHARREF(nextop, nextop + nextlen); 3084 } else 3085 nextch = PEOF; 3086 savglobflags = patglobflags; 3087 saverrsfound = errsfound; 3088 lastcharstart = charstart + (patinput - start); 3089 if (no >= min) { 3090 for (;;) { 3091 patint_t charmatch_cache; 3092 if (nextch == PEOF || 3093 (patinput < patinend && 3094 CHARMATCH_EXPR(CHARREF(patinput, patinend), 3095 nextch))) { 3096 if (patmatch(next)) 3097 return 1; 3098 } 3099 if (--no < min) 3100 break; 3101 /* find start of previous full character */ 3102 while (!*--lastcharstart) 3103 DPUTS(lastcharstart < charstart, 3104 "lastcharstart invalid"); 3105 patinput = start + (lastcharstart-charstart); 3106 patglobflags = savglobflags; 3107 errsfound = saverrsfound; 3108 } 3109 } 3110 } 3111 /* 3112 * As with branches, the patmatch(next) stuff for * 3113 * handles approximation, so we don't need to try 3114 * anything here. 3115 */ 3116 return 0; 3117 case P_ISSTART: 3118 if (patinput != patinstart || (patflags & PAT_NOTSTART)) 3119 fail = 1; 3120 break; 3121 case P_ISEND: 3122 if (patinput < patinend || (patflags & PAT_NOTEND)) 3123 fail = 1; 3124 break; 3125 case P_COUNTSTART: 3126 { 3127 /* 3128 * Save and restore the current count and the 3129 * start pointer in case the pattern has been 3130 * executed by a previous repetition of a 3131 * closure. 3132 */ 3133 long *curptr = &P_OPERAND(scan)[P_CT_CURRENT].l; 3134 long savecount = *curptr; 3135 unsigned char *saveptr = scan[P_CT_PTR].p; 3136 int ret; 3137 3138 *curptr = 0L; 3139 ret = patmatch(P_OPERAND(scan)); 3140 *curptr = savecount; 3141 scan[P_CT_PTR].p = saveptr; 3142 return ret; 3143 } 3144 case P_COUNT: 3145 { 3146 /* (#cN,M): execution is relatively straightforward */ 3147 long cur = scan[P_CT_CURRENT].l; 3148 long min = scan[P_CT_MIN].l; 3149 long max = scan[P_CT_MAX].l; 3150 3151 if (cur && cur >= min && 3152 (unsigned char *)patinput == scan[P_CT_PTR].p) { 3153 /* 3154 * Not at the first attempt to match so 3155 * the previous attempt managed zero length. 3156 * We can do this indefinitely so there's 3157 * no point in going on. Simply try to 3158 * match the remainder of the pattern. 3159 */ 3160 return patmatch(next); 3161 } 3162 scan[P_CT_PTR].p = (unsigned char *)patinput; 3163 3164 if (max < 0 || cur < max) { 3165 char *patinput_thistime = patinput; 3166 scan[P_CT_CURRENT].l = cur + 1; 3167 if (patmatch(scan + P_CT_OPERAND)) 3168 return 1; 3169 patinput = patinput_thistime; 3170 } 3171 if (cur < min) 3172 return 0; 3173 return patmatch(next); 3174 } 3175 case P_END: 3176 if (!(fail = (patinput < patinend && !(patflags & PAT_NOANCH)))) 3177 return 1; 3178 break; 3179#ifdef DEBUG 3180 default: 3181 dputs("BUG: bad operand in patmatch."); 3182 return 0; 3183 break; 3184#endif 3185 } 3186 3187 if (fail) { 3188 if (errsfound < (patglobflags & 0xff) && 3189 (forceerrs == -1 || errsfound < forceerrs)) { 3190 /* 3191 * Approximation code. There are four possibilities 3192 * 3193 * 1. omit character from input string 3194 * 2. transpose characters in input and pattern strings 3195 * 3. omit character in both input and pattern strings 3196 * 4. omit character from pattern string. 3197 * 3198 * which we try in that order. 3199 * 3200 * Of these, 2, 3 and 4 require an exact match string 3201 * (P_EXACTLY) while 1, 2 and 3 require that we not 3202 * have reached the end of the input string. 3203 * 3204 * Note in each case after making the approximation we 3205 * need to retry the *same* pattern; this is what 3206 * requires exactpos, a slightly doleful way of 3207 * communicating with the exact character matcher. 3208 */ 3209 char *savexact = exactpos; 3210 save = patinput; 3211 savglobflags = patglobflags; 3212 saverrsfound = ++errsfound; 3213 fail = 0; 3214 3215 DPUTS(P_OP(scan) != P_EXACTLY && exactpos, 3216 "BUG: non-exact match has set exactpos"); 3217 3218 /* Try omitting a character from the input string */ 3219 if (patinput < patinend) { 3220 CHARINC(patinput, patinend); 3221 /* If we are not on an exact match, then this is 3222 * our last gasp effort, so we can optimize out 3223 * the recursive call. 3224 */ 3225 if (P_OP(scan) != P_EXACTLY) 3226 continue; 3227 if (patmatch(scan)) 3228 return 1; 3229 } 3230 3231 if (P_OP(scan) == P_EXACTLY) { 3232 char *nextexact = savexact; 3233 DPUTS(!savexact, 3234 "BUG: exact match has not set exactpos"); 3235 CHARINC(nextexact, exactend); 3236 3237 if (save < patinend) { 3238 char *nextin = save; 3239 CHARINC(nextin, patinend); 3240 patglobflags = savglobflags; 3241 errsfound = saverrsfound; 3242 exactpos = savexact; 3243 3244 /* 3245 * Try swapping two characters in patinput and 3246 * exactpos 3247 */ 3248 if (save < patinend && nextin < patinend && 3249 nextexact < exactend) { 3250 patint_t cin0 = CHARREF(save, patinend); 3251 patint_t cpa0 = CHARREF(exactpos, exactend); 3252 patint_t cin1 = CHARREF(nextin, patinend); 3253 patint_t cpa1 = CHARREF(nextexact, exactend); 3254 3255 if (CHARMATCH(cin0, cpa1) && 3256 CHARMATCH(cin1, cpa0)) { 3257 patinput = nextin; 3258 CHARINC(patinput, patinend); 3259 exactpos = nextexact; 3260 CHARINC(exactpos, exactend); 3261 if (patmatch(scan)) 3262 return 1; 3263 3264 patglobflags = savglobflags; 3265 errsfound = saverrsfound; 3266 } 3267 } 3268 3269 /* 3270 * Try moving up both strings. 3271 */ 3272 patinput = nextin; 3273 exactpos = nextexact; 3274 if (patmatch(scan)) 3275 return 1; 3276 3277 patinput = save; 3278 patglobflags = savglobflags; 3279 errsfound = saverrsfound; 3280 exactpos = savexact; 3281 } 3282 3283 DPUTS(exactpos == exactend, "approximating too far"); 3284 /* 3285 * Try moving up the exact match pattern. 3286 * This must be the last attempt, so just loop 3287 * instead of calling recursively. 3288 */ 3289 CHARINC(exactpos, exactend); 3290 continue; 3291 } 3292 } 3293 exactpos = NULL; 3294 return 0; 3295 } 3296 3297 scan = next; 3298 } 3299 3300 return 0; 3301} 3302 3303 3304/**/ 3305#ifdef MULTIBYTE_SUPPORT 3306 3307/* 3308 * See if character ch matches a pattern range specification. 3309 * The null-terminated specification is in range; the test 3310 * character is in ch. 3311 * 3312 * indptr is used by completion matching, which is why this 3313 * function is exported. If indptr is not NULL we set *indptr 3314 * to the index of the character in the range string, adjusted 3315 * in the case of "A-B" ranges such that A would count as its 3316 * normal index (say IA), B would count as IA + (B-A), and any 3317 * character within the range as appropriate. We're not strictly 3318 * guaranteed this fits within a wint_t, but if this is Unicode 3319 * in 32 bits we have a fair amount of distance left over. 3320 * 3321 * mtp is used in the same circumstances. *mtp returns the match type: 3322 * 0 for a standard character, else the PP_ index. It's not 3323 * useful if the match failed. 3324 */ 3325 3326/**/ 3327mod_export int 3328mb_patmatchrange(char *range, wchar_t ch, wint_t *indptr, int *mtp) 3329{ 3330 wchar_t r1, r2; 3331 3332 if (indptr) 3333 *indptr = 0; 3334 /* 3335 * Careful here: unlike other strings, range is a NULL-terminated, 3336 * metafied string, because we need to treat the Posix and hyphenated 3337 * ranges specially. 3338 */ 3339 while (*range) { 3340 if (imeta(STOUC(*range))) { 3341 int swtype = STOUC(*range++) - STOUC(Meta); 3342 if (mtp) 3343 *mtp = swtype; 3344 switch (swtype) { 3345 case 0: 3346 /* ordinary metafied character */ 3347 range--; 3348 if (metacharinc(&range) == ch) 3349 return 1; 3350 break; 3351 case PP_ALPHA: 3352 if (iswalpha(ch)) 3353 return 1; 3354 break; 3355 case PP_ALNUM: 3356 if (iswalnum(ch)) 3357 return 1; 3358 break; 3359 case PP_ASCII: 3360 if ((ch & ~0x7f) == 0) 3361 return 1; 3362 break; 3363 case PP_BLANK: 3364 if (ch == L' ' || ch == L'\t') 3365 return 1; 3366 break; 3367 case PP_CNTRL: 3368 if (iswcntrl(ch)) 3369 return 1; 3370 break; 3371 case PP_DIGIT: 3372 if (iswdigit(ch)) 3373 return 1; 3374 break; 3375 case PP_GRAPH: 3376 if (iswgraph(ch)) 3377 return 1; 3378 break; 3379 case PP_LOWER: 3380 if (iswlower(ch)) 3381 return 1; 3382 break; 3383 case PP_PRINT: 3384 if (iswprint(ch)) 3385 return 1; 3386 break; 3387 case PP_PUNCT: 3388 if (iswpunct(ch)) 3389 return 1; 3390 break; 3391 case PP_SPACE: 3392 if (iswspace(ch)) 3393 return 1; 3394 break; 3395 case PP_UPPER: 3396 if (iswupper(ch)) 3397 return 1; 3398 break; 3399 case PP_XDIGIT: 3400 if (iswxdigit(ch)) 3401 return 1; 3402 break; 3403 case PP_IDENT: 3404 if (wcsitype(ch, IIDENT)) 3405 return 1; 3406 break; 3407 case PP_IFS: 3408 if (wcsitype(ch, ISEP)) 3409 return 1; 3410 break; 3411 case PP_IFSSPACE: 3412 /* must be ASCII space character */ 3413 if (ch < 128 && iwsep((int)ch)) 3414 return 1; 3415 break; 3416 case PP_WORD: 3417 if (wcsitype(ch, IWORD)) 3418 return 1; 3419 break; 3420 case PP_RANGE: 3421 r1 = metacharinc(&range); 3422 r2 = metacharinc(&range); 3423 if (r1 <= ch && ch <= r2) { 3424 if (indptr) 3425 *indptr += ch - r1; 3426 return 1; 3427 } 3428 /* Careful not to screw up counting with bogus range */ 3429 if (indptr && r1 < r2) { 3430 /* 3431 * This gets incremented again below to get 3432 * us past the range end. This is correct. 3433 */ 3434 *indptr += r2 - r1; 3435 } 3436 break; 3437 case PP_UNKWN: 3438 DPUTS(1, "BUG: unknown posix range passed through.\n"); 3439 break; 3440 default: 3441 DPUTS(1, "BUG: unknown metacharacter in range."); 3442 break; 3443 } 3444 } else if (metacharinc(&range) == ch) { 3445 if (mtp) 3446 *mtp = 0; 3447 return 1; 3448 } 3449 if (indptr) 3450 (*indptr)++; 3451 } 3452 return 0; 3453} 3454 3455 3456/* 3457 * This is effectively the reverse of mb_patmatchrange(). 3458 * Given a range descriptor of the same form, and an index into it, 3459 * try to determine the character that is matched. If the index 3460 * points to a [:...:] generic style match, set chr to WEOF and 3461 * return the type in mtp instead. Return 1 if successful, 0 if 3462 * there was no corresponding index. Note all pointer arguments 3463 * must be non-null. 3464 */ 3465 3466/**/ 3467mod_export int 3468mb_patmatchindex(char *range, wint_t ind, wint_t *chr, int *mtp) 3469{ 3470 wchar_t r1, r2, rchr; 3471 wint_t rdiff; 3472 3473 *chr = WEOF; 3474 *mtp = 0; 3475 3476 while (*range) { 3477 if (imeta(STOUC(*range))) { 3478 int swtype = STOUC(*range++) - STOUC(Meta); 3479 switch (swtype) { 3480 case 0: 3481 range--; 3482 rchr = metacharinc(&range); 3483 if (!ind) { 3484 *chr = (wint_t) rchr; 3485 return 1; 3486 } 3487 break; 3488 3489 case PP_ALPHA: 3490 case PP_ALNUM: 3491 case PP_ASCII: 3492 case PP_BLANK: 3493 case PP_CNTRL: 3494 case PP_DIGIT: 3495 case PP_GRAPH: 3496 case PP_LOWER: 3497 case PP_PRINT: 3498 case PP_PUNCT: 3499 case PP_SPACE: 3500 case PP_UPPER: 3501 case PP_XDIGIT: 3502 case PP_IDENT: 3503 case PP_IFS: 3504 case PP_IFSSPACE: 3505 case PP_WORD: 3506 if (!ind) { 3507 *mtp = swtype; 3508 return 1; 3509 } 3510 break; 3511 3512 case PP_RANGE: 3513 r1 = metacharinc(&range); 3514 r2 = metacharinc(&range); 3515 rdiff = (wint_t)r2 - (wint_t)r1; 3516 if (rdiff >= ind) { 3517 *chr = (wint_t)r1 + ind; 3518 return 1; 3519 } 3520 /* note the extra decrement to ind below */ 3521 ind -= rdiff; 3522 break; 3523 case PP_UNKWN: 3524 DPUTS(1, "BUG: unknown posix range passed through.\n"); 3525 break; 3526 default: 3527 DPUTS(1, "BUG: unknown metacharacter in range."); 3528 break; 3529 } 3530 } else { 3531 rchr = metacharinc(&range); 3532 if (!ind) { 3533 *chr = (wint_t)rchr; 3534 return 1; 3535 } 3536 } 3537 if (!ind--) 3538 break; 3539 } 3540 3541 /* No corresponding index. */ 3542 return 0; 3543} 3544 3545/**/ 3546#endif /* MULTIBYTE_SUPPORT */ 3547 3548/* 3549 * Identical function to mb_patmatchrange() above for single-byte 3550 * characters. 3551 */ 3552 3553/**/ 3554mod_export int 3555patmatchrange(char *range, int ch, int *indptr, int *mtp) 3556{ 3557 int r1, r2; 3558 3559 if (indptr) 3560 *indptr = 0; 3561 /* 3562 * Careful here: unlike other strings, range is a NULL-terminated, 3563 * metafied string, because we need to treat the Posix and hyphenated 3564 * ranges specially. 3565 */ 3566 for (; *range; range++) { 3567 if (imeta(STOUC(*range))) { 3568 int swtype = STOUC(*range) - STOUC(Meta); 3569 if (mtp) 3570 *mtp = swtype; 3571 switch (swtype) { 3572 case 0: 3573 if (STOUC(*++range ^ 32) == ch) 3574 return 1; 3575 break; 3576 case PP_ALPHA: 3577 if (isalpha(ch)) 3578 return 1; 3579 break; 3580 case PP_ALNUM: 3581 if (isalnum(ch)) 3582 return 1; 3583 break; 3584 case PP_ASCII: 3585 if ((ch & ~0x7f) == 0) 3586 return 1; 3587 break; 3588 case PP_BLANK: 3589 if (ch == ' ' || ch == '\t') 3590 return 1; 3591 break; 3592 case PP_CNTRL: 3593 if (iscntrl(ch)) 3594 return 1; 3595 break; 3596 case PP_DIGIT: 3597 if (isdigit(ch)) 3598 return 1; 3599 break; 3600 case PP_GRAPH: 3601 if (isgraph(ch)) 3602 return 1; 3603 break; 3604 case PP_LOWER: 3605 if (islower(ch)) 3606 return 1; 3607 break; 3608 case PP_PRINT: 3609 if (isprint(ch)) 3610 return 1; 3611 break; 3612 case PP_PUNCT: 3613 if (ispunct(ch)) 3614 return 1; 3615 break; 3616 case PP_SPACE: 3617 if (isspace(ch)) 3618 return 1; 3619 break; 3620 case PP_UPPER: 3621 if (isupper(ch)) 3622 return 1; 3623 break; 3624 case PP_XDIGIT: 3625 if (isxdigit(ch)) 3626 return 1; 3627 break; 3628 case PP_IDENT: 3629 if (iident(ch)) 3630 return 1; 3631 break; 3632 case PP_IFS: 3633 if (isep(ch)) 3634 return 1; 3635 break; 3636 case PP_IFSSPACE: 3637 if (iwsep(ch)) 3638 return 1; 3639 break; 3640 case PP_WORD: 3641 if (iword(ch)) 3642 return 1; 3643 break; 3644 case PP_RANGE: 3645 range++; 3646 r1 = STOUC(UNMETA(range)); 3647 METACHARINC(range); 3648 r2 = STOUC(UNMETA(range)); 3649 if (*range == Meta) 3650 range++; 3651 if (r1 <= ch && ch <= r2) { 3652 if (indptr) 3653 *indptr += ch - r1; 3654 return 1; 3655 } 3656 if (indptr && r1 < r2) 3657 *indptr += r2 - r1; 3658 break; 3659 case PP_UNKWN: 3660 DPUTS(1, "BUG: unknown posix range passed through.\n"); 3661 break; 3662 default: 3663 DPUTS(1, "BUG: unknown metacharacter in range."); 3664 break; 3665 } 3666 } else if (STOUC(*range) == ch) { 3667 if (mtp) 3668 *mtp = 0; 3669 return 1; 3670 } 3671 if (indptr) 3672 (*indptr)++; 3673 } 3674 return 0; 3675} 3676 3677 3678/**/ 3679#ifndef MULTIBYTE_SUPPORT 3680 3681/* 3682 * Identical function to mb_patmatchindex() above for single-byte 3683 * characters. Here -1 represents a character that needs a special type. 3684 * 3685 * Unlike patmatchrange, we only need this in ZLE, which always 3686 * uses MULTIBYTE_SUPPORT if compiled in; hence we don't use 3687 * this function in that case. 3688 */ 3689 3690/**/ 3691mod_export int 3692patmatchindex(char *range, int ind, int *chr, int *mtp) 3693{ 3694 int r1, r2, rdiff, rchr; 3695 3696 *chr = -1; 3697 *mtp = 0; 3698 3699 for (; *range; range++) { 3700 if (imeta(STOUC(*range))) { 3701 int swtype = STOUC(*range) - STOUC(Meta); 3702 switch (swtype) { 3703 case 0: 3704 /* ordinary metafied character */ 3705 rchr = STOUC(*++range) ^ 32; 3706 if (!ind) { 3707 *chr = rchr; 3708 return 1; 3709 } 3710 break; 3711 3712 case PP_ALPHA: 3713 case PP_ALNUM: 3714 case PP_ASCII: 3715 case PP_BLANK: 3716 case PP_CNTRL: 3717 case PP_DIGIT: 3718 case PP_GRAPH: 3719 case PP_LOWER: 3720 case PP_PRINT: 3721 case PP_PUNCT: 3722 case PP_SPACE: 3723 case PP_UPPER: 3724 case PP_XDIGIT: 3725 case PP_IDENT: 3726 case PP_IFS: 3727 case PP_IFSSPACE: 3728 case PP_WORD: 3729 if (!ind) { 3730 *mtp = swtype; 3731 return 1; 3732 } 3733 break; 3734 3735 case PP_RANGE: 3736 range++; 3737 r1 = STOUC(UNMETA(range)); 3738 METACHARINC(range); 3739 r2 = STOUC(UNMETA(range)); 3740 if (*range == Meta) 3741 range++; 3742 rdiff = r2 - r1; 3743 if (rdiff >= ind) { 3744 *chr = r1 + ind; 3745 return 1; 3746 } 3747 /* note the extra decrement to ind below */ 3748 ind -= rdiff; 3749 break; 3750 case PP_UNKWN: 3751 DPUTS(1, "BUG: unknown posix range passed through.\n"); 3752 break; 3753 default: 3754 DPUTS(1, "BUG: unknown metacharacter in range."); 3755 break; 3756 } 3757 } else { 3758 if (!ind) { 3759 *chr = STOUC(*range); 3760 return 1; 3761 } 3762 } 3763 if (!ind--) 3764 break; 3765 } 3766 3767 /* No corresponding index. */ 3768 return 0; 3769} 3770 3771/**/ 3772#endif /* MULTIBYTE_SUPPORT */ 3773 3774/* 3775 * Repeatedly match something simple and say how many times. 3776 * charstart is an array parallel to that starting at patinput 3777 * and records the start of (possibly multibyte) characters 3778 * to aid in later backtracking. 3779 */ 3780 3781/**/ 3782static int patrepeat(Upat p, char *charstart) 3783{ 3784 int count = 0; 3785 patint_t tch, charmatch_cache; 3786 char *scan, *opnd; 3787 3788 scan = patinput; 3789 opnd = (char *)P_OPERAND(p); 3790 3791 switch(P_OP(p)) { 3792#ifdef DEBUG 3793 case P_ANY: 3794 dputs("BUG: ?# did not get optimized to *"); 3795 return 0; 3796 break; 3797#endif 3798 case P_EXACTLY: 3799 DPUTS(P_LS_LEN(p) != 1, "closure following more than one character"); 3800 tch = CHARREF(P_LS_STR(p), P_LS_STR(p) + P_LS_LEN(p)); 3801 while (scan < patinend && 3802 CHARMATCH_EXPR(CHARREF(scan, patinend), tch)) { 3803 charstart[scan-patinput] = 1; 3804 count++; 3805 CHARINC(scan, patinend); 3806 } 3807 break; 3808 case P_ANYOF: 3809 case P_ANYBUT: 3810 while (scan < patinend) { 3811#ifdef MULTIBYTE_SUPPORT 3812 wchar_t cr = CHARREF(scan, patinend); 3813 if (patglobflags & GF_MULTIBYTE) { 3814 if (mb_patmatchrange(opnd, cr, NULL, NULL) ^ 3815 (P_OP(p) == P_ANYOF)) 3816 break; 3817 } else if (patmatchrange(opnd, (int)cr, NULL, NULL) ^ 3818 (P_OP(p) == P_ANYOF)) 3819 break; 3820#else 3821 if (patmatchrange(opnd, CHARREF(scan, patinend), NULL, NULL) ^ 3822 (P_OP(p) == P_ANYOF)) 3823 break; 3824#endif 3825 charstart[scan-patinput] = 1; 3826 count++; 3827 CHARINC(scan, patinend); 3828 } 3829 break; 3830#ifdef DEBUG 3831 default: 3832 dputs("BUG: something very strange is happening in patrepeat"); 3833 return 0; 3834 break; 3835#endif 3836 } 3837 3838 patinput = scan; 3839 return count; 3840} 3841 3842/* Free a patprog. */ 3843 3844/**/ 3845mod_export void 3846freepatprog(Patprog prog) 3847{ 3848 if (prog && prog != dummy_patprog1 && prog != dummy_patprog2) 3849 zfree(prog, prog->size); 3850} 3851 3852/* Disable or reenable a pattern character */ 3853 3854/**/ 3855int 3856pat_enables(const char *cmd, char **patp, int enable) 3857{ 3858 int ret = 0; 3859 const char **stringp; 3860 char *disp; 3861 3862 if (!*patp) { 3863 int done = 0; 3864 for (stringp = zpc_strings, disp = zpc_disables; 3865 stringp < zpc_strings + ZPC_COUNT; 3866 stringp++, disp++) { 3867 if (!*stringp) 3868 continue; 3869 if (enable ? *disp : !*disp) 3870 continue; 3871 if (done) 3872 putc(' ', stdout); 3873 printf("'%s'", *stringp); 3874 done = 1; 3875 } 3876 if (done) 3877 putc('\n', stdout); 3878 return 0; 3879 } 3880 3881 for (; *patp; patp++) { 3882 for (stringp = zpc_strings, disp = zpc_disables; 3883 stringp < zpc_strings + ZPC_COUNT; 3884 stringp++, disp++) { 3885 if (*stringp && !strcmp(*stringp, *patp)) { 3886 *disp = (char)!enable; 3887 break; 3888 } 3889 } 3890 if (stringp == zpc_strings + ZPC_COUNT) { 3891 zerrnam(cmd, "invalid pattern: %s", *patp); 3892 ret = 1; 3893 } 3894 } 3895 3896 return ret; 3897} 3898 3899/* 3900 * Save the current state of pattern disables, returning the saved value. 3901 */ 3902 3903/**/ 3904unsigned int 3905savepatterndisables(void) 3906{ 3907 unsigned int disables, bit; 3908 char *disp; 3909 3910 disables = 0; 3911 for (bit = 1, disp = zpc_disables; 3912 disp < zpc_disables + ZPC_COUNT; 3913 bit <<= 1, disp++) { 3914 if (*disp) 3915 disables |= bit; 3916 } 3917 return disables; 3918} 3919 3920/* 3921 * Function scope saving pattern enables. 3922 */ 3923 3924/**/ 3925void 3926startpatternscope(void) 3927{ 3928 Zpc_disables_save newdis; 3929 3930 newdis = (Zpc_disables_save)zalloc(sizeof(*newdis)); 3931 newdis->next = zpc_disables_stack; 3932 newdis->disables = savepatterndisables(); 3933 3934 zpc_disables_stack = newdis; 3935} 3936 3937/* 3938 * Restore completely the state of pattern disables. 3939 */ 3940 3941/**/ 3942void 3943restorepatterndisables(unsigned int disables) 3944{ 3945 char *disp; 3946 unsigned int bit; 3947 3948 for (bit = 1, disp = zpc_disables; 3949 disp < zpc_disables + ZPC_COUNT; 3950 bit <<= 1, disp++) { 3951 if (disables & bit) 3952 *disp = 1; 3953 else 3954 *disp = 0; 3955 } 3956} 3957 3958/* 3959 * Function scope to restore pattern enables if localpatterns is turned on. 3960 */ 3961 3962/**/ 3963void 3964endpatternscope(void) 3965{ 3966 Zpc_disables_save olddis; 3967 3968 olddis = zpc_disables_stack; 3969 zpc_disables_stack = olddis->next; 3970 3971 if (isset(LOCALPATTERNS)) 3972 restorepatterndisables(olddis->disables); 3973 3974 zfree(olddis, sizeof(*olddis)); 3975} 3976 3977/* Reinitialise pattern disables */ 3978 3979/**/ 3980void 3981clearpatterndisables(void) 3982{ 3983 memset(zpc_disables, 0, ZPC_COUNT); 3984} 3985 3986 3987/* Check to see if str is eligible for filename generation. */ 3988 3989/**/ 3990mod_export int 3991haswilds(char *str) 3992{ 3993 char *start; 3994 3995 /* `[' and `]' are legal even if bad patterns are usually not. */ 3996 if ((*str == Inbrack || *str == Outbrack) && !str[1]) 3997 return 0; 3998 3999 /* If % is immediately followed by ?, then that ? is * 4000 * not treated as a wildcard. This is so you don't have * 4001 * to escape job references such as %?foo. */ 4002 if (str[0] == '%' && str[1] == Quest) 4003 str[1] = '?'; 4004 4005 /* 4006 * Note that at this point zpc_special has not been set up. 4007 */ 4008 start = str; 4009 for (; *str; str++) { 4010 switch (*str) { 4011 case Inpar: 4012 if ((!isset(SHGLOB) && !zpc_disables[ZPC_INPAR]) || 4013 (str > start && isset(KSHGLOB) && 4014 ((str[-1] == Quest && !zpc_disables[ZPC_KSH_QUEST]) || 4015 (str[-1] == Star && !zpc_disables[ZPC_KSH_STAR]) || 4016 (str[-1] == '+' && !zpc_disables[ZPC_KSH_PLUS]) || 4017 (str[-1] == '!' && !zpc_disables[ZPC_KSH_BANG]) || 4018 (str[-1] == '@' && !zpc_disables[ZPC_KSH_AT])))) 4019 return 1; 4020 break; 4021 4022 case Bar: 4023 if (!zpc_disables[ZPC_BAR]) 4024 return 1; 4025 break; 4026 4027 case Star: 4028 if (!zpc_disables[ZPC_STAR]) 4029 return 1; 4030 break; 4031 4032 case Inbrack: 4033 if (!zpc_disables[ZPC_INBRACK]) 4034 return 1; 4035 break; 4036 4037 case Inang: 4038 if (!zpc_disables[ZPC_INANG]) 4039 return 1; 4040 break; 4041 4042 case Quest: 4043 if (!zpc_disables[ZPC_QUEST]) 4044 return 1; 4045 break; 4046 4047 case Pound: 4048 if (isset(EXTENDEDGLOB) && !zpc_disables[ZPC_HASH]) 4049 return 1; 4050 break; 4051 4052 case Hat: 4053 if (isset(EXTENDEDGLOB) && !zpc_disables[ZPC_HAT]) 4054 return 1; 4055 break; 4056 } 4057 } 4058 return 0; 4059} 4060