1/*	$NetBSD: msort.c,v 1.31 2016/06/01 02:37:55 kre Exp $	*/
2
3/*-
4 * Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Ben Harris and Jaromir Dolecek.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32/*-
33 * Copyright (c) 1993
34 *	The Regents of the University of California.  All rights reserved.
35 *
36 * This code is derived from software contributed to Berkeley by
37 * Peter McIlroy.
38 *
39 * Redistribution and use in source and binary forms, with or without
40 * modification, are permitted provided that the following conditions
41 * are met:
42 * 1. Redistributions of source code must retain the above copyright
43 *    notice, this list of conditions and the following disclaimer.
44 * 2. Redistributions in binary form must reproduce the above copyright
45 *    notice, this list of conditions and the following disclaimer in the
46 *    documentation and/or other materials provided with the distribution.
47 * 3. Neither the name of the University nor the names of its contributors
48 *    may be used to endorse or promote products derived from this software
49 *    without specific prior written permission.
50 *
51 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
52 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
53 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
54 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
55 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
56 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
57 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
58 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
59 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
60 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
61 * SUCH DAMAGE.
62 */
63
64#include "sort.h"
65#include "fsort.h"
66
67__RCSID("$NetBSD: msort.c,v 1.31 2016/06/01 02:37:55 kre Exp $");
68
69#include <stdlib.h>
70#include <string.h>
71#include <unistd.h>
72#include <util.h>
73
74/* Subroutines using comparisons: merge sort and check order */
75#define DELETE (1)
76
77typedef struct mfile {
78	FILE         *fp;
79	get_func_t   get;
80	RECHEADER    *rec;
81	u_char       *end;
82} MFILE;
83
84static int cmp(RECHEADER *, RECHEADER *);
85static int insert(struct mfile **, struct mfile *, int, int);
86static void merge_sort_fstack(FILE *, put_func_t, struct field *);
87
88/*
89 * Number of files merge() can merge in one pass.
90 */
91#define MERGE_FNUM      16
92
93static struct mfile fstack[MERGE_FNUM];
94static struct mfile fstack_1[MERGE_FNUM];
95static struct mfile fstack_2[MERGE_FNUM];
96static int fstack_count, fstack_1_count, fstack_2_count;
97
98void
99save_for_merge(FILE *fp, get_func_t get, struct field *ftbl)
100{
101	FILE *mfp, *mfp1, *mfp2;
102
103	if (fstack_count == MERGE_FNUM) {
104		/* Must reduce the number of temporary files */
105		mfp = ftmp();
106		merge_sort_fstack(mfp, putrec, ftbl);
107		/* Save output in next layer */
108		if (fstack_1_count == MERGE_FNUM) {
109			mfp1 = ftmp();
110			memcpy(fstack, fstack_1, sizeof fstack);
111			merge_sort_fstack(mfp1, putrec, ftbl);
112			if (fstack_2_count == MERGE_FNUM) {
113				/* More than 4096 files! */
114				mfp2 = ftmp();
115				memcpy(fstack, fstack_2, sizeof fstack);
116				merge_sort_fstack(mfp2, putrec, ftbl);
117				fstack_2[0].fp = mfp2;
118				fstack_2_count = 1;
119			}
120			fstack_2[fstack_2_count].fp = mfp1;
121			fstack_2[fstack_2_count].get = geteasy;
122			fstack_2_count++;
123			fstack_1_count = 0;
124		}
125		fstack_1[fstack_1_count].fp = mfp;
126		fstack_1[fstack_1_count].get = geteasy;
127		fstack_1_count++;
128		fstack_count = 0;
129	}
130
131	fstack[fstack_count].fp = fp;
132	fstack[fstack_count++].get = get;
133}
134
135void
136fmerge(struct filelist *filelist, int nfiles, FILE *outfp, struct field *ftbl)
137{
138	get_func_t get = SINGL_FLD ? makeline : makekey;
139	FILE *fp;
140	int i;
141
142	for (i = 0; i < nfiles; i++) {
143		fp = fopen(filelist->names[i], "r");
144		if (fp == NULL)
145			err(2, "%s", filelist->names[i]);
146		save_for_merge(fp, get, ftbl);
147	}
148
149	merge_sort(outfp, putline, ftbl);
150}
151
152void
153merge_sort(FILE *outfp, put_func_t put, struct field *ftbl)
154{
155	int count = fstack_1_count + fstack_2_count;
156	FILE *mfp;
157	int i;
158
159	if (count == 0) {
160		/* All files in initial array */
161		merge_sort_fstack(outfp, put, ftbl);
162		return;
163	}
164
165	count += fstack_count;
166
167	/* Too many files for one merge sort */
168	for (;;) {
169		/* Sort latest 16 files */
170		i = count;
171		if (i > MERGE_FNUM)
172			i = MERGE_FNUM;
173		while (fstack_count > 0)
174			fstack[--i] = fstack[--fstack_count];
175		while (i > 0 && fstack_1_count > 0)
176			fstack[--i] = fstack_1[--fstack_1_count];
177		while (i > 0)
178			fstack[--i] = fstack_2[--fstack_2_count];
179		if (count <= MERGE_FNUM) {
180			/* Got all the data */
181			fstack_count = count;
182			merge_sort_fstack(outfp, put, ftbl);
183			return;
184		}
185		mfp = ftmp();
186		fstack_count = count > MERGE_FNUM ? MERGE_FNUM : count;
187		merge_sort_fstack(mfp, putrec, ftbl);
188		fstack[0].fp = mfp;
189		fstack[0].get = geteasy;
190		fstack_count = 1;
191		count -= MERGE_FNUM - 1;
192	}
193}
194
195static void
196merge_sort_fstack(FILE *outfp, put_func_t put, struct field *ftbl)
197{
198	struct mfile *flistb[MERGE_FNUM], **flist = flistb, *cfile;
199	RECHEADER *new_rec;
200	u_char *new_end;
201	void *tmp;
202	int c, i, nfiles;
203	size_t sz;
204
205	/* Read one record from each file (read again if a duplicate) */
206	for (nfiles = i = 0; i < fstack_count; i++) {
207		cfile = &fstack[i];
208		if (cfile->rec == NULL) {
209			cfile->rec = allocrec(NULL, DEFLLEN);
210			cfile->end = (u_char *)cfile->rec + DEFLLEN;
211		}
212		rewind(cfile->fp);
213
214		for (;;) {
215			c = cfile->get(cfile->fp, cfile->rec, cfile->end, ftbl);
216			if (c == EOF)
217				break;
218
219			if (c == BUFFEND) {
220				/* Double buffer size */
221				sz = (cfile->end - (u_char *)cfile->rec) * 2;
222				cfile->rec = allocrec(cfile->rec, sz);
223				cfile->end = (u_char *)cfile->rec + sz;
224				continue;
225			}
226
227			if (nfiles != 0) {
228				if (insert(flist, cfile, nfiles, !DELETE))
229					/* Duplicate removed */
230					continue;
231			} else
232				flist[0] = cfile;
233			nfiles++;
234			break;
235		}
236	}
237
238	if (nfiles == 0)
239		return;
240
241	/*
242	 * We now loop reading a new record from the file with the
243	 * 'sorted first' existing record.
244	 * As each record is added, the 'first' record is written to the
245	 * output file - maintaining one record from each file in the sorted
246	 * list.
247	 */
248	new_rec = allocrec(NULL, DEFLLEN);
249	new_end = (u_char *)new_rec + DEFLLEN;
250	for (;;) {
251		cfile = flist[0];
252		c = cfile->get(cfile->fp, new_rec, new_end, ftbl);
253		if (c == EOF) {
254			/* Write out last record from now-empty input */
255			put(cfile->rec, outfp);
256			if (--nfiles == 0)
257				break;
258			/* Replace from file with now-first sorted record. */
259			/* (Moving base 'flist' saves copying everything!) */
260			flist++;
261			continue;
262		}
263		if (c == BUFFEND) {
264			/* Buffer not large enough - double in size */
265			sz = (new_end - (u_char *)new_rec) * 2;
266			new_rec = allocrec(new_rec, sz);
267			new_end = (u_char *)new_rec +sz;
268			continue;
269		}
270
271		/* Swap in new buffer, saving old */
272		tmp = cfile->rec;
273		cfile->rec = new_rec;
274		new_rec = tmp;
275		tmp = cfile->end;
276		cfile->end = new_end;
277		new_end = tmp;
278
279		/* Add into sort, removing the original first entry */
280		c = insert(flist, cfile, nfiles, DELETE);
281		if (c != 0 || (UNIQUE && cfile == flist[0]
282			    && cmp(new_rec, cfile->rec) == 0)) {
283			/* Was an unwanted duplicate, restore buffer */
284			tmp = cfile->rec;
285			cfile->rec = new_rec;
286			new_rec = tmp;
287			tmp = cfile->end;
288			cfile->end = new_end;
289			new_end = tmp;
290			continue;
291		}
292
293		/* Write out 'old' record */
294		put(new_rec, outfp);
295	}
296
297	free(new_rec);
298}
299
300/*
301 * if delete: inserts rec in flist, deletes flist[0];
302 * otherwise just inserts *rec in flist.
303 * Returns 1 if record is a duplicate to be ignored.
304 */
305static int
306insert(struct mfile **flist, struct mfile *rec, int ttop, int delete)
307{
308	int mid, top = ttop, bot = 0, cmpv = 1;
309
310	for (mid = top / 2; bot + 1 != top; mid = (bot + top) / 2) {
311		cmpv = cmp(rec->rec, flist[mid]->rec);
312		if (cmpv == 0 ) {
313			if (UNIQUE)
314				/* Duplicate key, read another record */
315				/* NB: This doesn't guarantee to keep any
316				 * particular record. */
317				return 1;
318			/*
319			 * Apply sort by input file order.
320			 * We could truncate the sort is the fileno are
321			 * adjacent - but that is all too hard!
322			 * The fileno cannot be equal, since we only have one
323			 * record from each file (+ flist[0] which never
324			 * comes here).
325			 */
326			cmpv = rec < flist[mid] ? -1 : 1;
327			if (REVERSE)
328				cmpv = -cmpv;
329		}
330		if (cmpv < 0)
331			top = mid;
332		else
333			bot = mid;
334	}
335
336	/* At this point we haven't yet compared against flist[0] */
337
338	if (delete) {
339		/* flist[0] is ourselves, only the caller knows the old data */
340		if (bot != 0) {
341			memmove(flist, flist + 1, bot * sizeof(MFILE *));
342			flist[bot] = rec;
343		}
344		return 0;
345	}
346
347	/* Inserting original set of records */
348
349	if (bot == 0 && cmpv != 0) {
350		/* Doesn't match flist[1], must compare with flist[0] */
351		cmpv = cmp(rec->rec, flist[0]->rec);
352		if (cmpv == 0 && UNIQUE)
353			return 1;
354		/* Add matching keys in file order (ie new is later) */
355		if (cmpv < 0)
356			bot = -1;
357	}
358	bot++;
359	memmove(flist + bot + 1, flist + bot, (ttop - bot) * sizeof(MFILE *));
360	flist[bot] = rec;
361	return 0;
362}
363
364/*
365 * check order on one file
366 */
367void
368order(struct filelist *filelist, struct field *ftbl, int quiet)
369{
370	get_func_t get = SINGL_FLD ? makeline : makekey;
371	RECHEADER *crec, *prec, *trec;
372	u_char *crec_end, *prec_end, *trec_end;
373	FILE *fp;
374	int c;
375
376	fp = fopen(filelist->names[0], "r");
377	if (fp == NULL)
378		err(2, "%s", filelist->names[0]);
379
380	crec = malloc(offsetof(RECHEADER, data[DEFLLEN]));
381	crec_end = crec->data + DEFLLEN;
382	prec = malloc(offsetof(RECHEADER, data[DEFLLEN]));
383	prec_end = prec->data + DEFLLEN;
384
385	/* XXX this does exit(0) for overlong lines */
386	if (get(fp, prec, prec_end, ftbl) != 0)
387		exit(0);
388	while (get(fp, crec, crec_end, ftbl) == 0) {
389		if (0 < (c = cmp(prec, crec))) {
390			if (quiet)
391				exit(1);
392			crec->data[crec->length-1] = 0;
393			errx(1, "found disorder: %s", crec->data+crec->offset);
394		}
395		if (UNIQUE && !c) {
396			if (quiet)
397				exit(1);
398			crec->data[crec->length-1] = 0;
399			errx(1, "found non-uniqueness: %s",
400			    crec->data+crec->offset);
401		}
402		/*
403		 * Swap pointers so that this record is on place pointed
404		 * to by prec and new record is read to place pointed to by
405		 * crec.
406		 */
407		trec = prec;
408		prec = crec;
409		crec = trec;
410		trec_end = prec_end;
411		prec_end = crec_end;
412		crec_end = trec_end;
413	}
414	exit(0);
415}
416
417static int
418cmp(RECHEADER *rec1, RECHEADER *rec2)
419{
420	int len;
421	int r;
422
423	/* key is weights */
424	len = min(rec1->keylen, rec2->keylen);
425	r = memcmp(rec1->data, rec2->data, len);
426	if (r == 0)
427		r = rec1->keylen - rec2->keylen;
428	if (REVERSE)
429		r = -r;
430	return r;
431}
432