in_cksum.c revision 50477
1/*-
2 * Copyright (c) 1990 The Regents of the University of California.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 3. All advertising materials mentioning features or use of this software
14 *    must display the following acknowledgement:
15 *	This product includes software developed by the University of
16 *	California, Berkeley and its contributors.
17 * 4. Neither the name of the University nor the names of its contributors
18 *    may be used to endorse or promote products derived from this software
19 *    without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 *
33 *	from tahoe:	in_cksum.c	1.2	86/01/05
34 *	from:		@(#)in_cksum.c	1.3 (Berkeley) 1/19/91
35 * $FreeBSD: head/sys/i386/i386/in_cksum.c 50477 1999-08-28 01:08:13Z peter $
36 */
37
38#include <sys/param.h>
39#include <sys/systm.h>
40#include <sys/mbuf.h>
41
42#include <netinet/in.h>
43#include <netinet/in_systm.h>
44#include <netinet/ip.h>
45
46#include <machine/in_cksum.h>
47
48/*
49 * Checksum routine for Internet Protocol family headers.
50 *
51 * This routine is very heavily used in the network
52 * code and should be modified for each CPU to be as fast as possible.
53 *
54 * This implementation is 386 version.
55 */
56
57#undef	ADDCARRY
58#define ADDCARRY(x)     if ((x) > 0xffff) (x) -= 0xffff
59#define REDUCE          {sum = (sum & 0xffff) + (sum >> 16); ADDCARRY(sum);}
60
61/*
62 * Thanks to gcc we don't have to guess
63 * which registers contain sum & w.
64 */
65#define ADD(n)	__asm __volatile \
66		("addl " #n "(%2), %0" : "=r" (sum) : "0" (sum), "r" (w))
67#define ADDC(n)	__asm __volatile \
68		("adcl " #n "(%2), %0" : "=r" (sum) : "0" (sum), "r" (w))
69#define LOAD(n)	__asm __volatile \
70		("movb " #n "(%1), %0" : "=r" (junk) :           "r" (w))
71#define MOP	__asm __volatile \
72		("adcl         $0, %0" : "=r" (sum) : "0" (sum))
73
74int
75in_cksum(m, len)
76	register struct mbuf *m;
77	register int len;
78{
79	register u_short *w;
80	register unsigned sum = 0;
81	register int mlen = 0;
82	int byte_swapped = 0;
83	union { char	c[2]; u_short	s; } su;
84
85	for (;m && len; m = m->m_next) {
86		if (m->m_len == 0)
87			continue;
88		w = mtod(m, u_short *);
89		if (mlen == -1) {
90			/*
91			 * The first byte of this mbuf is the continuation
92			 * of a word spanning between this mbuf and the
93			 * last mbuf.
94			 */
95
96			/* su.c[0] is already saved when scanning previous
97			 * mbuf.  sum was REDUCEd when we found mlen == -1
98			 */
99			su.c[1] = *(u_char *)w;
100			sum += su.s;
101			w = (u_short *)((char *)w + 1);
102			mlen = m->m_len - 1;
103			len--;
104		} else
105			mlen = m->m_len;
106		if (len < mlen)
107			mlen = len;
108		len -= mlen;
109		/*
110		 * Force to long boundary so we do longword aligned
111		 * memory operations
112		 */
113		if (3 & (int) w) {
114			REDUCE;
115			if ((1 & (int) w) && (mlen > 0)) {
116				sum <<= 8;
117				su.c[0] = *(char *)w;
118				w = (u_short *)((char *)w + 1);
119				mlen--;
120				byte_swapped = 1;
121			}
122			if ((2 & (int) w) && (mlen >= 2)) {
123				sum += *w++;
124				mlen -= 2;
125			}
126		}
127		/*
128		 * Advance to a 486 cache line boundary.
129		 */
130		if (4 & (int) w && mlen >= 4) {
131			ADD(0);
132			MOP;
133			w += 2;
134			mlen -= 4;
135		}
136		if (8 & (int) w && mlen >= 8) {
137			ADD(0);
138			ADDC(4);
139			MOP;
140			w += 4;
141			mlen -= 8;
142		}
143		/*
144		 * Do as much of the checksum as possible 32 bits at at time.
145		 * In fact, this loop is unrolled to make overhead from
146		 * branches &c small.
147		 */
148		mlen -= 1;
149		while ((mlen -= 32) >= 0) {
150			u_char junk;
151			/*
152			 * Add with carry 16 words and fold in the last
153			 * carry by adding a 0 with carry.
154			 *
155			 * The early ADD(16) and the LOAD(32) are to load
156			 * the next 2 cache lines in advance on 486's.  The
157			 * 486 has a penalty of 2 clock cycles for loading
158			 * a cache line, plus whatever time the external
159			 * memory takes to load the first word(s) addressed.
160			 * These penalties are unavoidable.  Subsequent
161			 * accesses to a cache line being loaded (and to
162			 * other external memory?) are delayed until the
163			 * whole load finishes.  These penalties are mostly
164			 * avoided by not accessing external memory for
165			 * 8 cycles after the ADD(16) and 12 cycles after
166			 * the LOAD(32).  The loop terminates when mlen
167			 * is initially 33 (not 32) to guaranteed that
168			 * the LOAD(32) is within bounds.
169			 */
170			ADD(16);
171			ADDC(0);
172			ADDC(4);
173			ADDC(8);
174			ADDC(12);
175			LOAD(32);
176			ADDC(20);
177			ADDC(24);
178			ADDC(28);
179			MOP;
180			w += 16;
181		}
182		mlen += 32 + 1;
183		if (mlen >= 32) {
184			ADD(16);
185			ADDC(0);
186			ADDC(4);
187			ADDC(8);
188			ADDC(12);
189			ADDC(20);
190			ADDC(24);
191			ADDC(28);
192			MOP;
193			w += 16;
194			mlen -= 32;
195		}
196		if (mlen >= 16) {
197			ADD(0);
198			ADDC(4);
199			ADDC(8);
200			ADDC(12);
201			MOP;
202			w += 8;
203			mlen -= 16;
204		}
205		if (mlen >= 8) {
206			ADD(0);
207			ADDC(4);
208			MOP;
209			w += 4;
210			mlen -= 8;
211		}
212		if (mlen == 0 && byte_swapped == 0)
213			continue;       /* worth 1% maybe ?? */
214		REDUCE;
215		while ((mlen -= 2) >= 0) {
216			sum += *w++;
217		}
218		if (byte_swapped) {
219			sum <<= 8;
220			byte_swapped = 0;
221			if (mlen == -1) {
222				su.c[1] = *(char *)w;
223				sum += su.s;
224				mlen = 0;
225			} else
226				mlen = -1;
227		} else if (mlen == -1)
228			/*
229			 * This mbuf has odd number of bytes.
230			 * There could be a word split betwen
231			 * this mbuf and the next mbuf.
232			 * Save the last byte (to prepend to next mbuf).
233			 */
234			su.c[0] = *(char *)w;
235	}
236
237	if (len)
238		printf("cksum: out of data\n");
239	if (mlen == -1) {
240		/* The last mbuf has odd # of bytes. Follow the
241		   standard (the odd byte is shifted left by 8 bits) */
242		su.c[1] = 0;
243		sum += su.s;
244	}
245	REDUCE;
246	return (~sum & 0xffff);
247}
248
249/*
250 * This is the exact same algorithm as above with a few exceptions:
251 * (1) it is designed to operate on buffers, not mbufs
252 * (2) it returns an intermediate form of the sum which has to be
253 *     explicitly finalized (but this can be delayed)
254 * (3) it accepts an intermediate sum
255 *
256 * This is particularly useful when building packets quickly,
257 * since one can compute the checksum of the pseudoheader ahead of
258 * time and then use this function to complete the work.  That way,
259 * the pseudoheader never actually has to exist in the packet buffer,
260 * which avoids needless duplication of work.
261 */
262in_psum_t
263in_cksum_partial(psum, w, len)
264	in_psum_t psum;
265	const u_short *w;
266	int len;
267{
268	register in_psum_t sum = psum;
269	int byte_swapped = 0;
270	union { char	c[2]; u_short	s; } su;
271
272	/*
273	 * Force to long boundary so we do longword aligned
274	 * memory operations
275	 */
276	if (3 & (int) w) {
277		REDUCE;
278		if ((1 & (int) w) && (len > 0)) {
279			sum <<= 8;
280			su.c[0] = *(const char *)w;
281			w = (const u_short *)((const char *)w + 1);
282			len--;
283			byte_swapped = 1;
284		}
285		if ((2 & (int) w) && (len >= 2)) {
286			sum += *w++;
287			len -= 2;
288		}
289	}
290	/*
291	 * Advance to a 486 cache line boundary.
292	 */
293	if (4 & (int) w && len >= 4) {
294		ADD(0);
295		MOP;
296		w += 2;
297		len -= 4;
298	}
299	if (8 & (int) w && len >= 8) {
300		ADD(0);
301		ADDC(4);
302		MOP;
303		w += 4;
304		len -= 8;
305	}
306	/*
307	 * Do as much of the checksum as possible 32 bits at at time.
308	 * In fact, this loop is unrolled to make overhead from
309	 * branches &c small.
310	 */
311	len -= 1;
312	while ((len -= 32) >= 0) {
313		u_char junk;
314		/*
315		 * Add with carry 16 words and fold in the last
316		 * carry by adding a 0 with carry.
317		 *
318		 * The early ADD(16) and the LOAD(32) are to load
319		 * the next 2 cache lines in advance on 486's.  The
320		 * 486 has a penalty of 2 clock cycles for loading
321		 * a cache line, plus whatever time the external
322		 * memory takes to load the first word(s) addressed.
323		 * These penalties are unavoidable.  Subsequent
324		 * accesses to a cache line being loaded (and to
325		 * other external memory?) are delayed until the
326		 * whole load finishes.  These penalties are mostly
327		 * avoided by not accessing external memory for
328		 * 8 cycles after the ADD(16) and 12 cycles after
329		 * the LOAD(32).  The loop terminates when len
330		 * is initially 33 (not 32) to guaranteed that
331		 * the LOAD(32) is within bounds.
332		 */
333		ADD(16);
334		ADDC(0);
335		ADDC(4);
336		ADDC(8);
337		ADDC(12);
338		LOAD(32);
339		ADDC(20);
340		ADDC(24);
341		ADDC(28);
342		MOP;
343		w += 16;
344	}
345	len += 32 + 1;
346	if (len >= 32) {
347		ADD(16);
348		ADDC(0);
349		ADDC(4);
350		ADDC(8);
351		ADDC(12);
352		ADDC(20);
353		ADDC(24);
354		ADDC(28);
355		MOP;
356		w += 16;
357		len -= 32;
358	}
359	if (len >= 16) {
360		ADD(0);
361		ADDC(4);
362		ADDC(8);
363		ADDC(12);
364		MOP;
365		w += 8;
366		len -= 16;
367	}
368	if (len >= 8) {
369		ADD(0);
370		ADDC(4);
371		MOP;
372		w += 4;
373		len -= 8;
374	}
375	if (len == 0 && byte_swapped == 0)
376		goto out;
377	REDUCE;
378	while ((len -= 2) >= 0) {
379		sum += *w++;
380	}
381	if (byte_swapped) {
382		sum <<= 8;
383		byte_swapped = 0;
384		if (len == -1) {
385			su.c[1] = *(const char *)w;
386			sum += su.s;
387			len = 0;
388		} else
389			len = -1;
390	} else if (len == -1) {
391		/*
392		 * This buffer has odd number of bytes.
393		 * There could be a word split betwen
394		 * this buffer and the next.
395		 */
396		su.c[0] = *(const char *)w;
397	}
398out:
399	if (len == -1) {
400		/* The last buffer has odd # of bytes. Follow the
401		   standard (the odd byte is shifted left by 8 bits) */
402		su.c[1] = 0;
403		sum += su.s;
404	}
405	return sum;
406}
407
408int
409in_cksum_finalize(psum)
410	in_psum_t psum;
411{
412	in_psum_t sum = psum;
413	REDUCE;
414	return (sum & 0xffff);
415}
416