1/*	$NetBSD: fields.c,v 1.31 2009/11/06 18:34:22 joerg Exp $	*/
2
3/*-
4 * Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Ben Harris and Jaromir Dolecek.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32/*-
33 * Copyright (c) 1993
34 *	The Regents of the University of California.  All rights reserved.
35 *
36 * This code is derived from software contributed to Berkeley by
37 * Peter McIlroy.
38 *
39 * Redistribution and use in source and binary forms, with or without
40 * modification, are permitted provided that the following conditions
41 * are met:
42 * 1. Redistributions of source code must retain the above copyright
43 *    notice, this list of conditions and the following disclaimer.
44 * 2. Redistributions in binary form must reproduce the above copyright
45 *    notice, this list of conditions and the following disclaimer in the
46 *    documentation and/or other materials provided with the distribution.
47 * 3. Neither the name of the University nor the names of its contributors
48 *    may be used to endorse or promote products derived from this software
49 *    without specific prior written permission.
50 *
51 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
52 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
53 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
54 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
55 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
56 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
57 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
58 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
59 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
60 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
61 * SUCH DAMAGE.
62 */
63
64/* Subroutines to generate sort keys. */
65
66#include "sort.h"
67
68__RCSID("$NetBSD: fields.c,v 1.31 2009/11/06 18:34:22 joerg Exp $");
69
70#define SKIP_BLANKS(ptr) {					\
71	if (BLANK & d_mask[*(ptr)])				\
72		while (BLANK & d_mask[*(++(ptr))]);		\
73}
74
75#define NEXTCOL(pos) {						\
76	if (!SEP_FLAG)						\
77		while (BLANK & l_d_mask[*(++pos)]);		\
78	while ((*(pos+1) != '\0') && !((FLD_D | REC_D_F) & l_d_mask[*++pos]));\
79}
80
81static u_char *enterfield(u_char *, const u_char *, struct field *, int);
82static u_char *number(u_char *, const u_char *, u_char *, u_char *, int);
83static u_char *length(u_char *, const u_char *, u_char *, u_char *, int);
84
85#define DECIMAL_POINT '.'
86
87/*
88 * constructs sort key with leading recheader, followed by the key,
89 * followed by the original line.
90 */
91length_t
92enterkey(RECHEADER *keybuf, const u_char *keybuf_end, u_char *line_data,
93    size_t line_size, struct field fieldtable[])
94	/* keybuf:	 pointer to start of key */
95{
96	int i;
97	u_char *l_d_mask;
98	u_char *lineend, *pos;
99	const u_char *endkey;
100	u_char *keypos;
101	struct coldesc *clpos;
102	int col = 1;
103	struct field *ftpos;
104
105	l_d_mask = d_mask;
106	pos = line_data - 1;
107	lineend = line_data + line_size-1;
108				/* don't include rec_delimiter */
109
110	for (i = 0; i < ncols; i++) {
111		clpos = clist + i;
112		for (; (col < clpos->num) && (pos < lineend); col++) {
113			NEXTCOL(pos);
114		}
115		if (pos >= lineend)
116			break;
117		clpos->start = SEP_FLAG ? pos + 1 : pos;
118		NEXTCOL(pos);
119		clpos->end = pos;
120		col++;
121		if (pos >= lineend) {
122			clpos->end = lineend;
123			i++;
124			break;
125		}
126	}
127	for (; i <= ncols; i++)
128		clist[i].start = clist[i].end = lineend;
129	if (clist[0].start < line_data)
130		clist[0].start++;
131
132	/*
133	 * We write the sort keys (concatenated) followed by the
134	 * original line data (for output) as the 'keybuf' data.
135	 * keybuf->length is the number of key bytes + data bytes.
136	 * keybuf->offset is the number of key bytes.
137	 * We add a record separator weight after the key in case
138	 * (as is usual) we need to preserve the order of equal lines,
139	 * and for 'sort -u'.
140	 * The key itself will have had the correct weight applied.
141	 */
142	keypos = keybuf->data;
143	endkey = keybuf_end - line_size - 1;
144	if (endkey <= keypos)
145		/* No room for any key bytes */
146		return 1;
147
148	for (ftpos = fieldtable + 1; ftpos->icol.num; ftpos++) {
149		if ((keypos = enterfield(keypos, endkey, ftpos,
150		    fieldtable->flags)) == NULL)
151			return (1);
152	}
153
154	keybuf->offset = keypos - keybuf->data;
155	keybuf->length = keybuf->offset + line_size;
156
157	/*
158	 * Posix requires that equal keys be further sorted by the
159	 * entire original record.
160	 * NetBSD has (at least for some time) kept equal keys in
161	 * their original order.
162	 * For 'sort -u' posix_sort is unset.
163	 */
164	keybuf->keylen = posix_sort ? keybuf->length : keybuf->offset;
165
166	memcpy(keypos, line_data, line_size);
167	return (0);
168}
169
170/*
171 * constructs a field (as defined by -k) within a key
172 */
173static u_char *
174enterfield(u_char *tablepos, const u_char *endkey, struct field *cur_fld,
175    int gflags)
176{
177	u_char *start, *end, *lineend, *mask, *lweight;
178	struct column icol, tcol;
179	u_int flags;
180
181	icol = cur_fld->icol;
182	tcol = cur_fld->tcol;
183	flags = cur_fld->flags;
184	start = icol.p->start;
185	lineend = clist[ncols].end;
186	if (flags & BI)
187		SKIP_BLANKS(start);
188	start += icol.indent;
189	start = min(start, lineend);
190
191	if (!tcol.num)
192		end = lineend;
193	else {
194		if (tcol.indent) {
195			end = tcol.p->start;
196			if (flags & BT)
197				SKIP_BLANKS(end);
198			end += tcol.indent;
199			end = min(end, lineend);
200		} else
201			end = tcol.p->end;
202	}
203
204	if (flags & L)
205		return length(tablepos, endkey, start, end, flags);
206	if (flags & N)
207		return number(tablepos, endkey, start, end, flags);
208
209	/* Bound check space - assuming nothing is skipped */
210	if (tablepos + (end - start) + 1 >= endkey)
211		return NULL;
212
213	mask = cur_fld->mask;
214	lweight = cur_fld->weights;
215	for (; start < end; start++) {
216		if (!mask || mask[*start]) {
217			*tablepos++ = lweight[*start];
218		}
219	}
220	/* Add extra byte (absent from lweight) to sort short keys correctly */
221	*tablepos++ = lweight[REC_D];
222	return tablepos;
223}
224
225/*
226 * Numbers are converted to a floating point format (exponent & mantissa)
227 * so that they compare correctly as sequence of unsigned bytes.
228 * Bytes 0x00 and 0xff are used to terminate positive and negative numbers
229 * to ensure that 0.123 sorts after 0.12 and -0.123 sorts before -0.12.
230 *
231 * The first byte contain the overall sign, exponent sign and some of the
232 * exponent. These have to be ordered (-ve value, decreasing exponent),
233 * zero, (+ve value, increasing exponent).
234 *
235 * The first byte is 0x80 for zero, 0xc0 for +ve with exponent 0.
236 * -ve values are the 1's compliments (so 0x7f isn't used!).
237 *
238 * This only leaves 63 byte values for +ve exponents - which isn't enough.
239 * The largest 4 exponent values are used to hold a byte count of the
240 * number of following bytes that contain 8 exponent bits per byte,
241 * This lets us sort exponents from -2^31 to +2^31.
242 *
243 * The mantissa is stored 2 digits per byte offset by 0x40, for negative
244 * numbers the order must be reversed (they are bit inverted).
245 *
246 * Reverse sorts are done by inverting the sign of the number.
247 */
248#define MAX_EXP_ENC  ((int)sizeof(int))
249
250static u_char *
251number(u_char *pos, const u_char *bufend, u_char *line, u_char *lineend,
252    int reverse)
253{
254	int exponent = -1;
255	int had_dp = 0;
256	u_char *tline;
257	char ch;
258	unsigned int val;
259	u_char *last_nz_pos;
260	u_char negate;
261
262	if (reverse & R)
263		negate = 0xff;
264	else
265		negate = 0;
266
267	/* Give ourselves space for the key terminator */
268	bufend--;
269
270	/* Ensure we have enough space for the exponent */
271	if (pos + 1 + MAX_EXP_ENC > bufend)
272		return (NULL);
273
274	SKIP_BLANKS(line);
275	if (*line == '-') {	/* set the sign */
276		negate ^= 0xff;
277		line++;
278	}
279	/* eat initial zeroes */
280	for (; *line == '0' && line < lineend; line++)
281		continue;
282
283	/* calculate exponents */
284	if (*line == DECIMAL_POINT) {
285		/* Decimal fraction */
286		had_dp = 1;
287		while (*++line == '0' && line < lineend)
288			exponent--;
289	} else {
290		/* Large (absolute) value, count digits */
291		for (tline = line; *tline >= '0' &&
292		    *tline <= '9' && tline < lineend; tline++)
293			exponent++;
294	}
295
296	/* If the first/next character isn't a digit, value is zero */
297	if (*line < '1' || *line > '9' || line >= lineend) {
298		/* This may be "0", "0.00", "000" or "fubar" but sorts as 0 */
299		/* XXX what about NaN, NAN, inf and INF */
300		*pos++ = 0x80;
301		return pos;
302	}
303
304	/* Maybe here we should allow for e+12 (etc) */
305
306	if (exponent < 0x40 - MAX_EXP_ENC && -exponent < 0x40 - MAX_EXP_ENC) {
307		/* Value ok for simple encoding */
308		/* exponent 0 is 0xc0 for +ve numbers and 0x40 for -ve ones */
309		exponent += 0xc0;
310		*pos++ = negate ^ exponent;
311	} else {
312		/* Out or range for a single byte */
313		int c, t;
314		t = exponent > 0 ? exponent : -exponent;
315		/* Count how many 8-bit bytes are needed */
316		for (c = 0; ; c++) {
317			t >>= 8;
318			if (t == 0)
319				break;
320		}
321		/* 'c' better be 0..3 here - but probably 0..1 */
322		/* Offset just outside valid range */
323		t = c + 0x40 - MAX_EXP_ENC;
324		if (exponent < 0)
325			t = -t;
326		*pos++ = negate ^ (t + 0xc0);
327		/* now add each byte, most significant first */
328		for (; c >= 0; c--)
329			*pos++ = negate ^ (exponent >> (c * 8));
330	}
331
332	/* Finally add mantissa, 2 digits per byte */
333	for (last_nz_pos = pos; line < lineend; ) {
334		if (pos >= bufend)
335			return NULL;
336		ch = *line++;
337		val = (ch - '0') * 10;
338		if (val > 90) {
339			if (ch == DECIMAL_POINT && !had_dp) {
340				had_dp = 1;
341				continue;
342			}
343			break;
344		}
345		while (line < lineend) {
346			ch = *line++;
347			if (ch == DECIMAL_POINT && !had_dp) {
348				had_dp = 1;
349				continue;
350			}
351			if (ch < '0' || ch > '9')
352				line = lineend;
353			else
354				val += ch - '0';
355			break;
356		}
357		*pos++ = negate ^ (val + 0x40);
358		if (val != 0)
359			last_nz_pos = pos;
360	}
361
362	/* Add key terminator, deleting any trailing "00" */
363	*last_nz_pos++ = negate;
364
365	return (last_nz_pos);
366}
367
368static u_char *
369length(u_char *pos, const u_char *bufend, u_char *line, u_char *lineend,
370    int flag)
371{
372	u_char buf[32];
373	int l;
374	SKIP_BLANKS(line);
375	l = snprintf((char *)buf, sizeof(buf), "%td", lineend - line);
376	return number(pos, bufend, buf, buf + l, flag);
377}
378