1/***********************************************************************
2*                                                                      *
3*               This software is part of the ast package               *
4*          Copyright (c) 1985-2011 AT&T Intellectual Property          *
5*                      and is licensed under the                       *
6*                 Eclipse Public License, Version 1.0                  *
7*                    by AT&T Intellectual Property                     *
8*                                                                      *
9*                A copy of the License is available at                 *
10*          http://www.eclipse.org/org/documents/epl-v10.html           *
11*         (with md5 checksum b35adb5213ca9657e911e9befb180842)         *
12*                                                                      *
13*              Information and Software Systems Research               *
14*                            AT&T Research                             *
15*                           Florham Park NJ                            *
16*                                                                      *
17*                 Glenn Fowler <gsf@research.att.com>                  *
18*                  David Korn <dgk@research.att.com>                   *
19*                   Phong Vo <kpv@research.att.com>                    *
20*                                                                      *
21***********************************************************************/
22#pragma prototyped
23
24/*
25 * determine record format by sampling data in <buf,size>
26 * total is the total file size, <=0 if not available
27 * return r:
28 *	-1				could not determine
29 *	RECTYPE(r)==REC_fixed		fixed length REC_F_SIZE(r)
30 *	RECTYPE(r)==REC_delimited	variable length delimiter=REC_D_DELIMITER(r)
31 *	RECTYPE(r)==REC_variable	variable length
32 */
33
34#include <recfmt.h>
35
/*
 * byte recurrence statistics collected by recfmt() while it
 * looks for fixed length records
 */

typedef struct
{
	/*
	 * rep[d] counts how many times a byte value showed up
	 * d positions after the previous position recorded for
	 * that same value; only distances below the table size
	 * are counted
	 */

	unsigned int	rep[4096];

	/*
	 * hit[c] is the sample offset most recently recorded
	 * for byte value c
	 */

	unsigned int	hit[UCHAR_MAX + 1];
} Sample_t;
41
42Recfmt_t
43recfmt(const void* buf, size_t size, off_t total)
44{
45	register unsigned char*		s;
46	register unsigned char*		t;
47	register Sample_t*		q;
48	register unsigned int*		h;
49	register unsigned int		i;
50	unsigned int			j;
51	unsigned int			k;
52	unsigned int			n;
53	unsigned int			m;
54	unsigned int			x;
55	unsigned long			f;
56	unsigned long			g;
57
58	static unsigned char		terminators[] = { '\n', 0x15, 0x25 };
59
60	/*
61	 * check for V format
62	 */
63
64	s = (unsigned char*)buf;
65	t = s + size;
66	while ((k = (t - s)) >= 4 && !s[2] && !s[3])
67	{
68		if ((i = (s[0]<<8)|s[1]) > k)
69			break;
70		s += i;
71	}
72	if (!k || size > 2 * k)
73		return REC_V_TYPE(4, 0, 2, 0, 1);
74	s = (unsigned char*)buf;
75
76	/*
77	 * check for terminated records
78	 */
79
80	for (i = 0; i < elementsof(terminators); i++)
81		if ((t = (unsigned char*)memchr((void*)s, k = terminators[i], size / 2)) && (n = t - s + 1) > 1 && (total <= 0 || !(total % n)))
82		{
83			for (j = n - 1; j < size; j += n)
84				if (s[j] != k)
85				{
86					n = 0;
87					break;
88				}
89			if (n)
90				return REC_D_TYPE(terminators[i]);
91		}
92
93	/*
94	 * check fixed length record frequencies
95	 */
96
97	if (!(q = newof(0, Sample_t, 1, 0)))
98		return REC_N_TYPE();
99	x = 0;
100	for (i = 0; i < size; i++)
101	{
102		h = q->hit + s[i];
103		m = i - *h;
104		*h = i;
105		if (m < elementsof(q->rep))
106		{
107			if (m > x)
108				x = m;
109			q->rep[m]++;
110		}
111	}
112	n = 0;
113	m = 0;
114	f = ~0;
115	for (i = x; i > 1; i--)
116	{
117		if ((total <= 0 || !(total % i)) && q->rep[i] > q->rep[n])
118		{
119			m++;
120			g = 0;
121			for (j = i; j < size - i; j += i)
122				for (k = 0; k < i; k++)
123					if (s[j + k] != s[j + k - i])
124						g++;
125			g = (((g * 100) / i) * 100) / q->rep[i];
126			if (g <= f)
127			{
128				f = g;
129				n = i;
130			}
131		}
132	}
133	if (m <= 1 && n <= 2 && total > 1 && total < 256)
134	{
135		n = 0;
136		for (i = 0; i < size; i++)
137			for (j = 0; j < elementsof(terminators); j++)
138				if (s[i] == terminators[j])
139					n++;
140		n = n ? 0 : total;
141	}
142	free(q);
143	return n ? REC_F_TYPE(n) : REC_N_TYPE();
144}
145
146#if MAIN
147
148main()
149{
150	void*	s;
151	size_t	size;
152	off_t	total;
153
154	if (!(s = sfreserve(sfstdin, SF_UNBOUND, 0)))
155	{
156		sfprintf(sfstderr, "read error\n");
157		return 1;
158	}
159	size = sfvalue(sfstdin);
160	total = sfsize(sfstdin);
161	sfprintf(sfstdout, "%d\n", recfmt(s, size, total));
162	return 0;
163}
164
165#endif
166