1249261Sdim/*	$NetBSD: str.c,v 1.103 2024/04/14 15:21:20 rillig Exp $	*/
2249261Sdim
3249261Sdim/*
4249261Sdim * Copyright (c) 1988, 1989, 1990, 1993
5249261Sdim *	The Regents of the University of California.  All rights reserved.
6249261Sdim *
7249261Sdim * This code is derived from software contributed to Berkeley by
8249261Sdim * Adam de Boor.
9249261Sdim *
10249261Sdim * Redistribution and use in source and binary forms, with or without
11249261Sdim * modification, are permitted provided that the following conditions
12249261Sdim * are met:
13249261Sdim * 1. Redistributions of source code must retain the above copyright
14249261Sdim *    notice, this list of conditions and the following disclaimer.
15249261Sdim * 2. Redistributions in binary form must reproduce the above copyright
16249261Sdim *    notice, this list of conditions and the following disclaimer in the
17249261Sdim *    documentation and/or other materials provided with the distribution.
18252723Sdim * 3. Neither the name of the University nor the names of its contributors
19249261Sdim *    may be used to endorse or promote products derived from this software
20249261Sdim *    without specific prior written permission.
21249261Sdim *
22249261Sdim * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23263509Sdim * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24249261Sdim * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25249261Sdim * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26249261Sdim * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27249261Sdim * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28249261Sdim * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29249261Sdim * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30249261Sdim * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31249261Sdim * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32263509Sdim * SUCH DAMAGE.
33249261Sdim */
34263509Sdim
35263509Sdim/*
36263509Sdim * Copyright (c) 1989 by Berkeley Softworks
37249261Sdim * All rights reserved.
38249261Sdim *
39249261Sdim * This code is derived from software contributed to Berkeley by
40249261Sdim * Adam de Boor.
41249261Sdim *
42249261Sdim * Redistribution and use in source and binary forms, with or without
43249261Sdim * modification, are permitted provided that the following conditions
44263509Sdim * are met:
45249261Sdim * 1. Redistributions of source code must retain the above copyright
46249261Sdim *    notice, this list of conditions and the following disclaimer.
47249261Sdim * 2. Redistributions in binary form must reproduce the above copyright
48249261Sdim *    notice, this list of conditions and the following disclaimer in the
49249261Sdim *    documentation and/or other materials provided with the distribution.
50249261Sdim * 3. All advertising materials mentioning features or use of this software
51249261Sdim *    must display the following acknowledgement:
52249261Sdim *	This product includes software developed by the University of
53249261Sdim *	California, Berkeley and its contributors.
54249261Sdim * 4. Neither the name of the University nor the names of its contributors
55263509Sdim *    may be used to endorse or promote products derived from this software
56249261Sdim *    without specific prior written permission.
57263509Sdim *
58263509Sdim * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
59263509Sdim * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
60263509Sdim * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
61263509Sdim * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
62263509Sdim * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
63263509Sdim * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
64263509Sdim * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
65263509Sdim * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
66263509Sdim * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
67263509Sdim * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
68249261Sdim * SUCH DAMAGE.
69249261Sdim */
70249261Sdim
71249261Sdim#include "make.h"
72249261Sdim
73249261Sdim/*	"@(#)str.c	5.8 (Berkeley) 6/1/90"	*/
74249261SdimMAKE_RCSID("$NetBSD: str.c,v 1.103 2024/04/14 15:21:20 rillig Exp $");
75249261Sdim
76249261Sdim
77249261Sdimstatic HashTable interned_strings;
78249261Sdim
79249261Sdim
80249261Sdim/* Return the concatenation of s1 and s2, freshly allocated. */
81249261Sdimchar *
82249261Sdimstr_concat2(const char *s1, const char *s2)
83249261Sdim{
84249261Sdim	size_t len1 = strlen(s1);
85263509Sdim	size_t len2 = strlen(s2);
86249261Sdim	char *result = bmake_malloc(len1 + len2 + 1);
87249261Sdim	memcpy(result, s1, len1);
88249261Sdim	memcpy(result + len1, s2, len2 + 1);
89263509Sdim	return result;
90249261Sdim}
91263509Sdim
92263509Sdim/* Return the concatenation of s1, s2 and s3, freshly allocated. */
93249261Sdimchar *
94249261Sdimstr_concat3(const char *s1, const char *s2, const char *s3)
95249261Sdim{
96249261Sdim	size_t len1 = strlen(s1);
97263509Sdim	size_t len2 = strlen(s2);
98263509Sdim	size_t len3 = strlen(s3);
99263509Sdim	char *result = bmake_malloc(len1 + len2 + len3 + 1);
100263509Sdim	memcpy(result, s1, len1);
101263509Sdim	memcpy(result + len1, s2, len2);
102263509Sdim	memcpy(result + len1 + len2, s3, len3 + 1);
103263509Sdim	return result;
104263509Sdim}
105263509Sdim
106263509Sdim/*
107263509Sdim * Fracture a string into an array of words (as delineated by tabs or spaces)
108249261Sdim * taking quotation marks into account.
109249261Sdim *
110249261Sdim * A string that is empty or only contains whitespace nevertheless results in
111249261Sdim * a single word.  This is unexpected in many places, and the caller needs to
112249261Sdim * correct for this edge case.
113263509Sdim *
114263509Sdim * If expand is true, quotes are removed and escape sequences such as \r, \t,
115263509Sdim * etc... are expanded. In this case, return NULL on parse errors.
116249261Sdim *
117249261Sdim * Returns the fractured words, which must be freed later using Words_Free,
118249261Sdim * unless the returned Words.words was NULL.
119249261Sdim */
120249261SdimSubstringWords
121263509SdimSubstring_Words(const char *str, bool expand)
122263509Sdim{
123263509Sdim	size_t str_len;
124263509Sdim	char *words_buf;
125263509Sdim	size_t words_cap;
126263509Sdim	Substring *words;
127263509Sdim	size_t words_len;
128263509Sdim	char inquote;
129263509Sdim	char *word_start;
130263509Sdim	char *word_end;
131263509Sdim	const char *str_p;
132249261Sdim
133249261Sdim	/* XXX: why only hspace, not whitespace? */
134249261Sdim	cpp_skip_hspace(&str);	/* skip leading space chars. */
135263509Sdim
136263509Sdim	/* words_buf holds the words, separated by '\0'. */
137263509Sdim	str_len = strlen(str);
138263509Sdim	words_buf = bmake_malloc(str_len + 1);
139249261Sdim
140263509Sdim	words_cap = str_len / 5 > 50 ? str_len / 5 : 50;
141263509Sdim	words = bmake_malloc((words_cap + 1) * sizeof(words[0]));
142263509Sdim
143263509Sdim	/*
144263509Sdim	 * copy the string; at the same time, parse backslashes,
145249261Sdim	 * quotes and build the word list.
146249261Sdim	 */
147249261Sdim	words_len = 0;
148249261Sdim	inquote = '\0';
149249261Sdim	word_start = words_buf;
150249261Sdim	word_end = words_buf;
151249261Sdim	for (str_p = str;; str_p++) {
152249261Sdim		char ch = *str_p;
153249261Sdim		switch (ch) {
154249261Sdim		case '"':
155249261Sdim		case '\'':
156263509Sdim			if (inquote != '\0') {
157263509Sdim				if (inquote == ch)
158263509Sdim					inquote = '\0';
159263509Sdim				else
160263509Sdim					break;
161263509Sdim			} else {
162263509Sdim				inquote = ch;
163249261Sdim				/* Don't miss "" or '' */
164249261Sdim				if (word_start == NULL && str_p[1] == inquote) {
165249261Sdim					if (!expand) {
166249261Sdim						word_start = word_end;
167249261Sdim						*word_end++ = ch;
168249261Sdim					} else
169263509Sdim						word_start = word_end + 1;
170263509Sdim					str_p++;
171263509Sdim					inquote = '\0';
172263509Sdim					break;
173249261Sdim				}
174249261Sdim			}
175263509Sdim			if (!expand) {
176263509Sdim				if (word_start == NULL)
177249261Sdim					word_start = word_end;
178249261Sdim				*word_end++ = ch;
179249261Sdim			}
180249261Sdim			continue;
181249261Sdim		case ' ':
182249261Sdim		case '\t':
183249261Sdim		case '\n':
184249261Sdim			if (inquote != '\0')
185263509Sdim				break;
186249261Sdim			if (word_start == NULL)
187249261Sdim				continue;
188263509Sdim			/* FALLTHROUGH */
189263509Sdim		case '\0':
190249261Sdim			/*
191263509Sdim			 * end of a token -- make sure there's enough words
192249261Sdim			 * space and save off a pointer.
193249261Sdim			 */
194252723Sdim			if (word_start == NULL)
195249261Sdim				goto done;
196263509Sdim
197249261Sdim			*word_end++ = '\0';
198249261Sdim			if (words_len == words_cap) {
199263509Sdim				words_cap *= 2;
200249261Sdim				words = bmake_realloc(words,
201249261Sdim				    (words_cap + 1) * sizeof(words[0]));
202249261Sdim			}
203249261Sdim			words[words_len++] =
204263509Sdim			    Substring_Init(word_start, word_end - 1);
205263509Sdim			word_start = NULL;
206263509Sdim			if (ch == '\n' || ch == '\0') {
207263509Sdim				if (expand && inquote != '\0') {
208249261Sdim					SubstringWords res;
209249261Sdim
210249261Sdim					free(words);
211249261Sdim					free(words_buf);
212263509Sdim
213263509Sdim					res.words = NULL;
214249261Sdim					res.len = 0;
215249261Sdim					res.freeIt = NULL;
216249261Sdim					return res;
217249261Sdim				}
218249261Sdim				goto done;
219249261Sdim			}
220249261Sdim			continue;
221249261Sdim		case '\\':
222249261Sdim			if (!expand) {
223249261Sdim				if (word_start == NULL)
224249261Sdim					word_start = word_end;
225249261Sdim				*word_end++ = '\\';
226249261Sdim				/* catch lonely '\' at end of string */
227249261Sdim				if (str_p[1] == '\0')
228249261Sdim					continue;
229249261Sdim				ch = *++str_p;
230249261Sdim				break;
231249261Sdim			}
232249261Sdim
233249261Sdim			switch (ch = *++str_p) {
234249261Sdim			case '\0':
235249261Sdim			case '\n':
236249261Sdim				/* hmmm; fix it up as best we can */
237263509Sdim				ch = '\\';
238263509Sdim				str_p--;
239263509Sdim				break;
240263509Sdim			case 'b':
241263509Sdim				ch = '\b';
242263509Sdim				break;
243249261Sdim			case 'f':
244249261Sdim				ch = '\f';
245249261Sdim				break;
246249261Sdim			case 'n':
247249261Sdim				ch = '\n';
248249261Sdim				break;
249249261Sdim			case 'r':
250249261Sdim				ch = '\r';
251252723Sdim				break;
252263509Sdim			case 't':
253252723Sdim				ch = '\t';
254263509Sdim				break;
255263509Sdim			}
256252723Sdim			break;
257252723Sdim		}
258252723Sdim		if (word_start == NULL)
259252723Sdim			word_start = word_end;
260252723Sdim		*word_end++ = ch;
261252723Sdim	}
262252723Sdimdone:
263252723Sdim	words[words_len] = Substring_Init(NULL, NULL);	/* useful for argv */
264252723Sdim
265252723Sdim	{
266263509Sdim		SubstringWords result;
267263509Sdim
268252723Sdim		result.words = words;
269252723Sdim		result.len = words_len;
270249261Sdim		result.freeIt = words_buf;
271249261Sdim		return result;
272252723Sdim	}
273252723Sdim}
274249261Sdim
275249261SdimWords
276249261SdimStr_Words(const char *str, bool expand)
277263509Sdim{
278263509Sdim	SubstringWords swords;
279249261Sdim	Words words;
280263509Sdim	size_t i;
281263509Sdim
282263509Sdim	swords = Substring_Words(str, expand);
283263509Sdim	if (swords.words == NULL) {
284249261Sdim		words.words = NULL;
285263509Sdim		words.len = 0;
286249261Sdim		words.freeIt = NULL;
287249261Sdim		return words;
288249261Sdim	}
289249261Sdim
290249261Sdim	words.words = bmake_malloc((swords.len + 1) * sizeof(words.words[0]));
291249261Sdim	words.len = swords.len;
292249261Sdim	words.freeIt = swords.freeIt;
293249261Sdim	for (i = 0; i < swords.len + 1; i++)
294249261Sdim		words.words[i] = UNCONST(swords.words[i].start);
295249261Sdim	free(swords.words);
296249261Sdim	return words;
297249261Sdim}
298249261Sdim
299249261Sdim/*
300249261Sdim * Test if a string matches a pattern like "*.[ch]". The pattern matching
301249261Sdim * characters are '*', '?' and '[]', as in fnmatch(3).
302249261Sdim *
303249261Sdim * See varmod-match.mk for examples and edge cases.
304249261Sdim */
305249261SdimStrMatchResult
306249261SdimStr_Match(const char *str, const char *pat)
307249261Sdim{
308263509Sdim	StrMatchResult res = { NULL, false };
309249261Sdim	bool asterisk = false;
310249261Sdim	const char *fixed_str = str;
311249261Sdim	const char *fixed_pat = pat;
312249261Sdim
313249261Sdimmatch_fixed_length:
314249261Sdim	str = fixed_str;
315263509Sdim	pat = fixed_pat;
316249261Sdim	for (; *pat != '\0' && *pat != '*'; str++, pat++) {
317263509Sdim		if (*str == '\0')
318249261Sdim			return res;
319249261Sdim
320263509Sdim		if (*pat == '?')	/* match any single character */
321249261Sdim			continue;
322249261Sdim
323249261Sdim		if (*pat == '[') {	/* match a character from a list */
324263509Sdim			bool neg = pat[1] == '^';
325249261Sdim			pat += neg ? 2 : 1;
326249261Sdim
327263509Sdim		next_char_in_list:
328249261Sdim			if (*pat == '\0')
329263509Sdim				res.error = "Unfinished character list";
330263509Sdim			if (*pat == ']' || *pat == '\0') {
331249261Sdim				if (neg)
332263509Sdim					goto end_of_char_list;
333249261Sdim				goto no_match;
334263509Sdim			}
335263509Sdim			if (*pat == *str)
336263509Sdim				goto end_of_char_list;
337263509Sdim			if (pat[1] == '-' && pat[2] == '\0') {
338263509Sdim				res.error = "Unfinished character range";
339249261Sdim				res.matched = neg;
340263509Sdim				return res;
341249261Sdim			}
342249261Sdim			if (pat[1] == '-') {
343263509Sdim				unsigned char e1 = (unsigned char)pat[0];
344263509Sdim				unsigned char c = (unsigned char)*str;
345263509Sdim				unsigned char e2 = (unsigned char)pat[2];
346263509Sdim				if ((e1 <= c && c <= e2)
347249261Sdim				    || (e2 <= c && c <= e1))
348249261Sdim					goto end_of_char_list;
349249261Sdim				pat += 2;
350249261Sdim			}
351249261Sdim			pat++;
352249261Sdim			goto next_char_in_list;
353249261Sdim
354249261Sdim		end_of_char_list:
355249261Sdim			if (neg && *pat != ']' && *pat != '\0')
356263509Sdim				goto no_match;
357249261Sdim			while (*pat != ']' && *pat != '\0')
358249261Sdim				pat++;
359249261Sdim			if (*pat == '\0')
360249261Sdim				pat--;
361249261Sdim			continue;
362249261Sdim		}
363249261Sdim
364249261Sdim		if (*pat == '\\')	/* match the next character exactly */
365249261Sdim			pat++;
366249261Sdim		if (*pat != *str) {
367249261Sdim			if (asterisk && str == fixed_str) {
368249261Sdim				while (*str != '\0' && *str != *pat)
369263509Sdim					str++;
370263509Sdim				fixed_str = str;
371249261Sdim				goto match_fixed_length;
372249261Sdim			}
373249261Sdim			goto no_match;
374249261Sdim		}
375249261Sdim	}
376249261Sdim
377249261Sdim	if (*pat == '*') {
378249261Sdim		asterisk = true;
379249261Sdim		while (*pat == '*')
380249261Sdim			pat++;
381249261Sdim		if (*pat == '\0') {
382263509Sdim			res.matched = true;
383249261Sdim			return res;
384249261Sdim		}
385249261Sdim		fixed_str = str;
386249261Sdim		fixed_pat = pat;
387249261Sdim		goto match_fixed_length;
388249261Sdim	}
389249261Sdim	if (asterisk && *str != '\0') {
390249261Sdim		fixed_str += strlen(str);
391249261Sdim		goto match_fixed_length;
392249261Sdim	}
393249261Sdim	res.matched = *str == '\0';
394249261Sdim	return res;
395263509Sdim
396249261Sdimno_match:
397249261Sdim	if (!asterisk)
398249261Sdim		return res;
399249261Sdim	fixed_str++;
400249261Sdim	goto match_fixed_length;
401249261Sdim}
402263509Sdim
403263509Sdimvoid
404249261SdimStr_Intern_Init(void)
405249261Sdim{
406249261Sdim	HashTable_Init(&interned_strings);
407263509Sdim}
408263509Sdim
409249261Sdimvoid
410263509SdimStr_Intern_End(void)
411249261Sdim{
412263509Sdim#ifdef CLEANUP
413263509Sdim	HashTable_Done(&interned_strings);
414263509Sdim#endif
415249261Sdim}
416249261Sdim
417249261Sdim/* Return a canonical instance of str, with unlimited lifetime. */
418249261Sdimconst char *
419249261SdimStr_Intern(const char *str)
420249261Sdim{
421249261Sdim	return HashTable_CreateEntry(&interned_strings, str, NULL)->key;
422249261Sdim}
423263509Sdim