1/***********************************************************************
2*                                                                      *
3*               This software is part of the ast package               *
4*          Copyright (c) 1992-2011 AT&T Intellectual Property          *
5*                      and is licensed under the                       *
6*                 Eclipse Public License, Version 1.0                  *
7*                    by AT&T Intellectual Property                     *
8*                                                                      *
9*                A copy of the License is available at                 *
10*          http://www.eclipse.org/org/documents/epl-v10.html           *
11*         (with md5 checksum b35adb5213ca9657e911e9befb180842)         *
12*                                                                      *
13*              Information and Software Systems Research               *
14*                            AT&T Research                             *
15*                           Florham Park NJ                            *
16*                                                                      *
17*                 Glenn Fowler <gsf@research.att.com>                  *
18*                  David Korn <dgk@research.att.com>                   *
19*                                                                      *
20***********************************************************************/
21#pragma prototyped
22/*
23 * David Korn
24 * AT&T Bell Laboratories
25 *
26 * library interface for word count
27 */
28
29#include <cmd.h>
30#include <wc.h>
31#include <ctype.h>
32
33#if _hdr_wchar && _hdr_wctype && _lib_iswctype
34
35#include <wchar.h>
36#include <wctype.h>
37#include <lc.h>
38
39#else
40
41#ifndef iswspace
42#define iswspace(x)	isspace(x)
43#endif
44
45#endif
46
/*
 * Per-byte classification flags stored in Wc_t.type[] by wc_init().
 * The low three bits are left free: for a UTF-8 lead byte wc_init()
 * adds the number of continuation bytes that follow (1..5), and
 * wc_count() reads that back with (c&7).
 */
#define	WC_SP		0x08	/* white space (new-line included) */
#define	WC_NL		0x10	/* new-line */
#define	WC_MB		0x20	/* byte with the high bit set (UTF-8 mode) */
#define	WC_ERR		0x40	/* byte wc_init() marks as never valid */

/* flag tests on a type[] entry; each yields the flag value or 0 */
#define eol(c)		((c)&WC_NL)
#define mbc(c)		((c)&WC_MB)
#define spc(c)		((c)&WC_SP)

/*
 * convert the multibyte character at p (at most n bytes) into w
 * via the locale conversion hook; arguments are parenthesized so
 * that expression arguments expand safely
 */
#define mb2wc(w,p,n)	((*ast.mb_towc)(&(w),(char*)(p),(n)))
56
57Wc_t* wc_init(int mode)
58{
59	register int	n;
60	register int	w;
61	Wc_t*		wp;
62
63	if (!(wp = (Wc_t*)stakalloc(sizeof(Wc_t))))
64		return 0;
65	if (!mbwide())
66		wp->mb = 0;
67#if _hdr_wchar && _hdr_wctype && _lib_iswctype
68	else if (!(mode & WC_NOUTF8) && (lcinfo(LC_CTYPE)->lc->flags & LC_utf8))
69		wp->mb = 1;
70#endif
71	else
72		wp->mb = -1;
73	w = mode & WC_WORDS;
74	for (n = (1<<CHAR_BIT); --n >= 0;)
75		wp->type[n] = (w && isspace(n)) ? WC_SP : 0;
76	wp->type['\n'] = WC_SP|WC_NL;
77	if ((mode & (WC_MBYTE|WC_WORDS)) && wp->mb > 0)
78	{
79		for (n = 0; n < 64; n++)
80		{
81			wp->type[0x80+n] |= WC_MB;
82			if (n<32)
83				wp->type[0xc0+n] |= WC_MB+1;
84			else if (n<48)
85				wp->type[0xc0+n] |= WC_MB+2;
86			else if (n<56)
87				wp->type[0xc0+n] |= WC_MB+3;
88			else if (n<60)
89				wp->type[0xc0+n] |= WC_MB+4;
90			else if (n<62)
91				wp->type[0xc0+n] |= WC_MB+5;
92		}
93		wp->type[0xc0] = WC_MB|WC_ERR;
94		wp->type[0xc1] = WC_MB|WC_ERR;
95		wp->type[0xfe] = WC_MB|WC_ERR;
96		wp->type[0xff] = WC_MB|WC_ERR;
97	}
98	wp->mode = mode;
99	return wp;
100}
101
102static int invalid(const char *file, int nlines)
103{
104	error_info.file = (char*)file;
105	error_info.line = nlines;
106	error(ERROR_SYSTEM|1, "invalid multibyte character");
107	error_info.file = 0;
108	error_info.line = 0;
109	return nlines;
110}
111
/*
 * handle utf space characters
 *
 * Advance the small state machine that wc_count() uses to recognize
 * multibyte UTF-8 white space one continuation byte at a time.
 *
 * state	current state:
 *		1,2,3	lead byte was 0xE1, 0xE2, 0xE3 (its low two bits);
 *			c is the second byte
 *		4	E1 9A seen  (U+1680 needs third byte 80)
 *		5	E3 80 seen  (U+3000 needs third byte 80)
 *		6	E2 80 seen  (U+2000..U+200B, U+2028, U+2029, U+202F)
 *		7	E2 81 seen  (U+205F needs third byte 9F)
 *		8	lead byte was 0xC2; c is the second byte, which is
 *			also the code point (U+0080..U+00BF)
 * c		the continuation byte to examine
 *
 * Returns 10 when a white space character has been completed, 0 when
 * the sequence cannot be white space, otherwise the next state.
 */

static int chkstate(int state, register unsigned int c)
{
	switch(state)
	{
	case 1:
		state = (c==0x9a?4:0);
		break;
	case 2:
		/* 0x80 -> state 6, 0x81 -> state 7 */
		state = ((c==0x80||c==0x81)?6+(c&1):0);
		break;
	case 3:
		state = (c==0x80?5:0);
		break;
	case 4:
		state = (c==0x80?10:0);
		break;
	case 5:
		state = (c==0x80?10:0);
		break;
	case 6:
		state = 0;
		/*
		 * U+2028 LINE SEPARATOR and U+2029 PARAGRAPH SEPARATOR are
		 * E2 80 A8 and E2 80 A9; A0/A1 would be the daggers
		 * U+2020/U+2021, which are not white space
		 */
		if(c==0xa8 || c==0xa9)
			return(10);
		else if((c&0xf0)== 0x80)
		{
			/* U+2007 is a space only if the locale says so */
			if((c&=0xf)==7)
				return(iswspace(0x2007)?10:0);
			/* U+2000..U+200B */
			if(c<=0xb)
				return(10);
		}
		/* U+202F is a space only if the locale says so */
		else if(c==0xaf && iswspace(0x202f))
			return(10);
		break;
	case 7:
		state = (c==0x9f?10:0);
		break;
	case 8:
		return (iswspace(c)?10:0);
	default:
		/* unknown states are passed through unchanged */
		break;
	}
	return state;
}
157
158/*
159 * compute the line, word, and character count for file <fd>
160 */
161
162int wc_count(Wc_t *wp, Sfio_t *fd, const char* file)
163{
164	register char*		type = wp->type;
165	register unsigned char*	cp;
166	register Sfoff_t	nbytes;
167	register Sfoff_t	nchars;
168	register Sfoff_t	nwords;
169	register Sfoff_t	nlines;
170	register Sfoff_t	eline = -1;
171	register Sfoff_t	longest = 0;
172	register ssize_t	c;
173	register unsigned char*	endbuff;
174	register int		lasttype = WC_SP;
175	unsigned int		lastchar;
176	ssize_t			n;
177	ssize_t			o;
178	unsigned char*		buff;
179	wchar_t			x;
180	unsigned char		side[32];
181
182	sfset(fd,SF_WRITE,1);
183	nlines = nwords = nchars = nbytes = 0;
184	wp->longest = 0;
185	if (wp->mb < 0 && (wp->mode & (WC_MBYTE|WC_WORDS)))
186	{
187		cp = buff = endbuff = 0;
188		for (;;)
189		{
190			if (cp >= endbuff || (n = mb2wc(x, cp, endbuff-cp)) < 0)
191			{
192				if ((o = endbuff-cp) < sizeof(side))
193				{
194					if (buff)
195					{
196						if (o)
197							memcpy(side, cp, o);
198						mbinit();
199					}
200					else
201						o = 0;
202					cp = side + o;
203					if (!(buff = (unsigned char*)sfreserve(fd, SF_UNBOUND, 0)) || (n = sfvalue(fd)) <= 0)
204					{
205						if ((nchars - longest) > wp->longest)
206							wp->longest = nchars - longest;
207						break;
208					}
209					nbytes += n;
210					if ((c = sizeof(side) - o) > n)
211						c = n;
212					if (c)
213						memcpy(cp, buff, c);
214					endbuff = buff + n;
215					cp = side;
216					x = mbchar(cp);
217					if ((cp-side) < o)
218					{
219						cp = buff;
220						nchars += (cp-side) - 1;
221					}
222					else
223						cp = buff + (cp-side) - o;
224				}
225				else
226				{
227					cp++;
228					x = -1;
229				}
230				if (x == -1 && eline != nlines && !(wp->mode & WC_QUIET))
231					eline = invalid(file, nlines);
232			}
233			else
234				cp += n ? n : 1;
235			if (x == '\n')
236			{
237				if ((nchars - longest) > wp->longest)
238					wp->longest = nchars - longest;
239				longest = nchars + 1;
240				nlines++;
241				lasttype = 1;
242			}
243			else if (iswspace(x))
244				lasttype = 1;
245			else if (lasttype)
246			{
247				lasttype = 0;
248				nwords++;
249			}
250			nchars++;
251		}
252		if (!(wp->mode & WC_MBYTE))
253			nchars = nbytes;
254	}
255	else if (!wp->mb && !(wp->mode & WC_LONGEST) || wp->mb > 0 && !(wp->mode & (WC_MBYTE|WC_WORDS|WC_LONGEST)))
256	{
257		if (!(wp->mode & (WC_MBYTE|WC_WORDS|WC_LONGEST)))
258		{
259			while ((cp = (unsigned char*)sfreserve(fd, SF_UNBOUND, 0)) && (c = sfvalue(fd)) > 0)
260			{
261				nchars += c;
262				endbuff = cp + c;
263				if (*--endbuff == '\n')
264					nlines++;
265				else
266					*endbuff = '\n';
267				for (;;)
268					if (*cp++ == '\n')
269					{
270						if (cp > endbuff)
271							break;
272						nlines++;
273					}
274			}
275		}
276		else
277		{
278			while ((cp = buff = (unsigned char*)sfreserve(fd, SF_UNBOUND, 0)) && (c = sfvalue(fd)) > 0)
279			{
280				nchars += c;
281				/* check to see whether first character terminates word */
282				if (c==1)
283				{
284					if (eol(lasttype))
285						nlines++;
286					if ((c = type[*cp]) && !lasttype)
287						nwords++;
288					lasttype = c;
289					continue;
290				}
291				if (!lasttype && type[*cp])
292					nwords++;
293				lastchar = cp[--c];
294				*(endbuff = cp+c) = '\n';
295				c = lasttype;
296				/* process each buffer */
297				for (;;)
298				{
299					/* process spaces and new-lines */
300					do
301					{
302						if (eol(c))
303							for (;;)
304							{
305								/* check for end of buffer */
306								if (cp > endbuff)
307									goto beob;
308								nlines++;
309								if (*cp != '\n')
310									break;
311								cp++;
312							}
313					} while (c = type[*cp++]);
314					/* skip over word characters */
315					while (!(c = type[*cp++]));
316					nwords++;
317				}
318			beob:
319				if ((cp -= 2) >= buff)
320					c = type[*cp];
321				else
322					c = lasttype;
323				lasttype = type[lastchar];
324				/* see if was in word */
325				if (!c && !lasttype)
326					nwords--;
327			}
328			if (eol(lasttype))
329				nlines++;
330			else if (!lasttype)
331				nwords++;
332		}
333	}
334	else
335	{
336		int		lineoff=0;
337		int		skip=0;
338		int		adjust=0;
339		int		state=0;
340		int		oldc;
341		int		xspace;
342		int		wasspace = 1;
343		unsigned char*	start;
344
345		lastchar = 0;
346		start = (endbuff = side) + 1;
347		xspace = iswspace(0xa0) || iswspace(0x85);
348		while ((cp = buff = (unsigned char*)sfreserve(fd, SF_UNBOUND, 0)) && (c = sfvalue(fd)) > 0)
349		{
350			nbytes += c;
351			nchars += c;
352			start = cp-lineoff;
353			/* check to see whether first character terminates word */
354			if(c==1)
355			{
356				if(eol(lasttype))
357					nlines++;
358				if((c = type[*cp]) && !lasttype)
359					nwords++;
360				lasttype = c;
361				endbuff = start;
362				continue;
363			}
364			lastchar = cp[--c];
365			endbuff = cp+c;
366			cp[c] = '\n';
367			if(mbc(lasttype))
368			{
369				c = lasttype;
370				goto mbyte;
371			}
372			if(!lasttype && spc(type[*cp]))
373				nwords++;
374			c = lasttype;
375			/* process each buffer */
376			for (;;)
377			{
378				/* process spaces and new-lines */
379			spaces:
380				do
381				{
382					if (eol(c))
383					{
384						/* check for end of buffer */
385						if (cp > endbuff)
386							goto eob;
387						if(wp->mode&WC_LONGEST)
388						{
389							if((cp-start)-adjust > longest)
390								longest = (cp-start)-adjust-1;
391							start = cp;
392						}
393						nlines++;
394						nchars -= adjust;
395						adjust = 0;
396					}
397				} while (spc(c = type[*cp++]));
398				wasspace=1;
399				if(mbc(c))
400				{
401				mbyte:
402					do
403					{
404						if(c&WC_ERR)
405							goto err;
406						if(skip && (c&7))
407							break;
408						if(!skip)
409						{
410							if(!(c&7))
411							{
412								skip=1;
413								break;
414							}
415							skip = (c&7);
416							adjust += skip;
417							state = 0;
418							if(skip==2 && (cp[-1]&0xc)==0 && (state=(cp[-1]&0x3)))
419								oldc = *cp;
420							else if(xspace && cp[-1]==0xc2)
421							{
422								state = 8;
423								oldc = *cp;
424							}
425						}
426						else
427						{
428							skip--;
429							if(state && (state=chkstate(state,oldc)))
430							{
431								if(state==10)
432								{
433									if(!wasspace)
434										nwords++;
435									wasspace = 1;
436									state=0;
437									goto spaces;
438								}
439								oldc = *cp;
440							}
441						}
442					} while (mbc(c = type[*cp++]));
443					wasspace = 0;
444					if(skip)
445					{
446						if(eol(c) && (cp > endbuff))
447							goto eob;
448				err:
449						skip = 0;
450						state = 0;
451						if(eline!=nlines && !(wp->mode & WC_QUIET))
452							eline = invalid(file, nlines);
453						while(mbc(c) && ((c|WC_ERR) || (c&7)==0))
454							c=type[*cp++];
455						if(eol(c) && (cp > endbuff))
456						{
457							c = WC_MB|WC_ERR;
458							goto eob;
459						}
460						if(mbc(c))
461							goto mbyte;
462						else if(c&WC_SP)
463							goto spaces;
464					}
465					if(spc(c))
466					{
467						nwords++;
468						continue;
469					}
470				}
471				/* skip over word characters */
472				while(!(c = type[*cp++]));
473				if(mbc(c))
474					goto mbyte;
475				nwords++;
476			}
477		eob:
478			lineoff = cp-start;
479			if((cp -= 2) >= buff)
480				c = type[*cp];
481			else
482				c = lasttype;
483			lasttype = type[lastchar];
484			/* see if was in word */
485			if(!c && !lasttype)
486				nwords--;
487		}
488		if ((wp->mode&WC_LONGEST) && ((endbuff + 1 - start) - adjust - (lastchar == '\n')) > longest)
489			longest = (endbuff + 1 - start) - adjust - (lastchar == '\n');
490		wp->longest = longest;
491		if (eol(lasttype))
492			nlines++;
493		else if (!lasttype)
494			nwords++;
495		if (wp->mode & WC_MBYTE)
496			nchars -= adjust;
497		else
498			nchars = nbytes;
499	}
500	wp->chars = nchars;
501	wp->words = nwords;
502	wp->lines = nlines;
503	return 0;
504}
505
506