1/*
2 * Copyright (c) 2012 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29/*-
30 * Copyright (c) 2008 Joerg Sonnenberger <joerg@NetBSD.org>.
31 * All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 *
37 * 1. Redistributions of source code must retain the above copyright
38 *    notice, this list of conditions and the following disclaimer.
39 * 2. Redistributions in binary form must reproduce the above copyright
40 *    notice, this list of conditions and the following disclaimer in
41 *    the documentation and/or other materials provided with the
42 *    distribution.
43 *
44 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
45 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
46 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
47 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
48 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
49 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
50 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
51 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
52 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
53 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
54 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
55 * SUCH DAMAGE.
56 */
57
58#include <sys/param.h>
59#include <mach/boolean.h>
60#include <machine/endian.h>
61#include <sys/mcache.h>
62#include <sys/mbuf.h>
63#include <kern/debug.h>
64#include <netinet/in.h>
65#include <libkern/libkern.h>
66
/*
 * Portable Internet checksum over an mbuf chain; implemented below in a
 * 32-bit and a 64-bit flavor, selected at compile time.
 */
int cpu_in_cksum(struct mbuf *, int, int, uint32_t);

/* Branch-prediction hint: tell the compiler "_exp" is expected to be false */
#define	PREDICT_FALSE(_exp)	__builtin_expect((_exp), 0)
70
71/*
72 * Checksum routine for Internet Protocol family headers (Portable Version).
73 *
74 * This routine is very heavily used in the network
75 * code and should be modified for each CPU to be as fast as possible.
76 *
77 * A discussion of different implementation techniques can be found in
78 * RFC 1071.
79 *
80 * The default implementation for 32-bit architectures is using
81 * a 32-bit accumulator and operating on 16-bit operands.
82 *
83 * The default implementation for 64-bit architectures is using
84 * a 64-bit accumulator and operating on 32-bit operands.
85 *
86 * Both versions are unrolled to handle 32 Byte / 64 Byte fragments as core
87 * of the inner loop. After each iteration of the inner loop, a partial
88 * reduction is done to avoid carry in long packets.
89 */
90
91#if ULONG_MAX == 0xffffffffUL
92/* 32-bit version */
/*
 * Compute the Internet checksum (RFC 1071) over "len" bytes of the mbuf
 * chain "m", starting "off" bytes into the chain, with "initial_sum"
 * folded into the result.
 *
 * Returns the finished 16-bit one's-complement checksum, or -1 if the
 * chain runs out of data before "len" bytes have been consumed.
 *
 * 32-bit flavor: 32-bit accumulator operating on 16-bit loads, with the
 * inner loop unrolled to 32-byte fragments.
 */
int
cpu_in_cksum(struct mbuf *m, int len, int off, uint32_t initial_sum)
{
	int mlen;		/* bytes left to sum in the current mbuf */
	uint32_t sum, partial;	/* running sum; per-fragment partial sum */
	unsigned int final_acc;
	uint8_t *data;		/* cursor into the current mbuf's data */
	boolean_t needs_swap, started_on_odd;

	VERIFY(len >= 0);
	VERIFY(off >= 0);

	needs_swap = FALSE;
	started_on_odd = FALSE;
	/* Pre-fold the 32-bit seed so the accumulator starts small */
	sum = (initial_sum >> 16) + (initial_sum & 0xffff);

	/* Walk the chain to skip the initial "off" bytes */
	for (;;) {
		if (PREDICT_FALSE(m == NULL)) {
			printf("%s: out of data\n", __func__);
			return (-1);
		}
		mlen = m->m_len;
		if (mlen > off) {
			/*
			 * Offset lands inside this mbuf: jump into the
			 * main loop past its per-mbuf setup, with "data"
			 * and "mlen" already adjusted for the offset.
			 */
			mlen -= off;
			data = mtod(m, uint8_t *) + off;
			goto post_initial_offset;
		}
		off -= mlen;
		if (len == 0)
			break;
		m = m->m_next;
	}

	/* Sum one mbuf per iteration until "len" bytes are consumed */
	for (; len > 0; m = m->m_next) {
		if (PREDICT_FALSE(m == NULL)) {
			printf("%s: out of data\n", __func__);
			return (-1);
		}
		mlen = m->m_len;
		data = mtod(m, uint8_t *);
post_initial_offset:
		if (mlen == 0)
			continue;
		if (mlen > len)
			mlen = len;
		len -= mlen;

		partial = 0;
		if ((uintptr_t)data & 1) {
			/* Align on word boundary */
			started_on_odd = !started_on_odd;
#if BYTE_ORDER == LITTLE_ENDIAN
			partial = *data << 8;
#else
			partial = *data;
#endif
			++data;
			--mlen;
		}
		/*
		 * If this mbuf's data starts at an odd byte position in
		 * the checksum stream, the 16-bit words summed below are
		 * byte-swapped relative to it and "partial" must be
		 * rotated before folding into "sum".
		 */
		needs_swap = started_on_odd;
		while (mlen >= 32) {
			__builtin_prefetch(data + 32);
			partial += *(uint16_t *)(void *)data;
			partial += *(uint16_t *)(void *)(data + 2);
			partial += *(uint16_t *)(void *)(data + 4);
			partial += *(uint16_t *)(void *)(data + 6);
			partial += *(uint16_t *)(void *)(data + 8);
			partial += *(uint16_t *)(void *)(data + 10);
			partial += *(uint16_t *)(void *)(data + 12);
			partial += *(uint16_t *)(void *)(data + 14);
			partial += *(uint16_t *)(void *)(data + 16);
			partial += *(uint16_t *)(void *)(data + 18);
			partial += *(uint16_t *)(void *)(data + 20);
			partial += *(uint16_t *)(void *)(data + 22);
			partial += *(uint16_t *)(void *)(data + 24);
			partial += *(uint16_t *)(void *)(data + 26);
			partial += *(uint16_t *)(void *)(data + 28);
			partial += *(uint16_t *)(void *)(data + 30);
			data += 32;
			mlen -= 32;
			/*
			 * Reduce "partial" into "sum" before its top two
			 * bits fill up, so the byte rotation above cannot
			 * lose carries on long packets.
			 */
			if (PREDICT_FALSE(partial & 0xc0000000)) {
				if (needs_swap)
					partial = (partial << 8) +
					    (partial >> 24);
				sum += (partial >> 16);
				sum += (partial & 0xffff);
				partial = 0;
			}
		}
		if (mlen & 16) {
			partial += *(uint16_t *)(void *)data;
			partial += *(uint16_t *)(void *)(data + 2);
			partial += *(uint16_t *)(void *)(data + 4);
			partial += *(uint16_t *)(void *)(data + 6);
			partial += *(uint16_t *)(void *)(data + 8);
			partial += *(uint16_t *)(void *)(data + 10);
			partial += *(uint16_t *)(void *)(data + 12);
			partial += *(uint16_t *)(void *)(data + 14);
			data += 16;
			mlen -= 16;
		}
		/*
		 * mlen is not updated below as the remaining tests
		 * are using bit masks, which are not affected.
		 */
		if (mlen & 8) {
			partial += *(uint16_t *)(void *)data;
			partial += *(uint16_t *)(void *)(data + 2);
			partial += *(uint16_t *)(void *)(data + 4);
			partial += *(uint16_t *)(void *)(data + 6);
			data += 8;
		}
		if (mlen & 4) {
			partial += *(uint16_t *)(void *)data;
			partial += *(uint16_t *)(void *)(data + 2);
			data += 4;
		}
		if (mlen & 2) {
			partial += *(uint16_t *)(void *)data;
			data += 2;
		}
		if (mlen & 1) {
			/* Trailing odd byte; the next mbuf starts odd */
#if BYTE_ORDER == LITTLE_ENDIAN
			partial += *data;
#else
			partial += *data << 8;
#endif
			started_on_odd = !started_on_odd;
		}

		if (needs_swap)
			partial = (partial << 8) + (partial >> 24);
		sum += (partial >> 16) + (partial & 0xffff);
		/*
		 * Reduce sum to allow potential byte swap
		 * in the next iteration without carry.
		 */
		sum = (sum >> 16) + (sum & 0xffff);
	}
	/* Fold the accumulator to 16 bits and return its complement */
	final_acc = ((sum >> 16) & 0xffff) + (sum & 0xffff);
	final_acc = (final_acc >> 16) + (final_acc & 0xffff);
	return (~final_acc & 0xffff);
}
236
237#else
238/* 64-bit version */
/*
 * Compute the Internet checksum (RFC 1071) over "len" bytes of the mbuf
 * chain "m", starting "off" bytes into the chain, with "initial_sum"
 * folded into the result.
 *
 * Returns the finished 16-bit one's-complement checksum, or -1 if the
 * chain runs out of data before "len" bytes have been consumed.
 *
 * 64-bit flavor: 64-bit accumulator operating on 32-bit loads, with the
 * inner loop unrolled to 64-byte fragments.
 */
int
cpu_in_cksum(struct mbuf *m, int len, int off, uint32_t initial_sum)
{
	int mlen;		/* bytes left to sum in the current mbuf */
	uint64_t sum, partial;	/* running sum; per-fragment partial sum */
	unsigned int final_acc;
	uint8_t *data;		/* cursor into the current mbuf's data */
	boolean_t needs_swap, started_on_odd;

	VERIFY(len >= 0);
	VERIFY(off >= 0);

	needs_swap = FALSE;
	started_on_odd = FALSE;
	/* The 64-bit accumulator has room; no need to pre-fold the seed */
	sum = initial_sum;

	/* Walk the chain to skip the initial "off" bytes */
	for (;;) {
		if (PREDICT_FALSE(m == NULL)) {
			printf("%s: out of data\n", __func__);
			return (-1);
		}
		mlen = m->m_len;
		if (mlen > off) {
			/*
			 * Offset lands inside this mbuf: jump into the
			 * main loop past its per-mbuf setup, with "data"
			 * and "mlen" already adjusted for the offset.
			 */
			mlen -= off;
			data = mtod(m, uint8_t *) + off;
			goto post_initial_offset;
		}
		off -= mlen;
		if (len == 0)
			break;
		m = m->m_next;
	}

	/* Sum one mbuf per iteration until "len" bytes are consumed */
	for (; len > 0; m = m->m_next) {
		if (PREDICT_FALSE(m == NULL)) {
			printf("%s: out of data\n", __func__);
			return (-1);
		}
		mlen = m->m_len;
		data = mtod(m, uint8_t *);
post_initial_offset:
		if (mlen == 0)
			continue;
		if (mlen > len)
			mlen = len;
		len -= mlen;

		partial = 0;
		if ((uintptr_t)data & 1) {
			/* Align on word boundary */
			started_on_odd = !started_on_odd;
#if BYTE_ORDER == LITTLE_ENDIAN
			partial = *data << 8;
#else
			partial = *data;
#endif
			++data;
			--mlen;
		}
		/*
		 * If this mbuf's data starts at an odd byte position in
		 * the checksum stream, the words summed below are
		 * byte-swapped relative to it and "partial" must be
		 * rotated before folding into "sum".
		 */
		needs_swap = started_on_odd;
		if ((uintptr_t)data & 2) {
			/* Align to a 4-byte boundary for 32-bit loads;
			 * with fewer than 2 bytes left only the odd
			 * trailing byte (if any) remains. */
			if (mlen < 2)
				goto trailing_bytes;
			partial += *(uint16_t *)(void *)data;
			data += 2;
			mlen -= 2;
		}
		while (mlen >= 64) {
			__builtin_prefetch(data + 32);
			__builtin_prefetch(data + 64);
			partial += *(uint32_t *)(void *)data;
			partial += *(uint32_t *)(void *)(data + 4);
			partial += *(uint32_t *)(void *)(data + 8);
			partial += *(uint32_t *)(void *)(data + 12);
			partial += *(uint32_t *)(void *)(data + 16);
			partial += *(uint32_t *)(void *)(data + 20);
			partial += *(uint32_t *)(void *)(data + 24);
			partial += *(uint32_t *)(void *)(data + 28);
			partial += *(uint32_t *)(void *)(data + 32);
			partial += *(uint32_t *)(void *)(data + 36);
			partial += *(uint32_t *)(void *)(data + 40);
			partial += *(uint32_t *)(void *)(data + 44);
			partial += *(uint32_t *)(void *)(data + 48);
			partial += *(uint32_t *)(void *)(data + 52);
			partial += *(uint32_t *)(void *)(data + 56);
			partial += *(uint32_t *)(void *)(data + 60);
			data += 64;
			mlen -= 64;
			/*
			 * Reduce "partial" into "sum" before its top two
			 * bits fill up, so the byte rotation above cannot
			 * lose carries on long packets.
			 */
			if (PREDICT_FALSE(partial & (3ULL << 62))) {
				if (needs_swap)
					partial = (partial << 8) +
					    (partial >> 56);
				sum += (partial >> 32);
				sum += (partial & 0xffffffff);
				partial = 0;
			}
		}
		/*
		 * mlen is not updated below as the remaining tests
		 * are using bit masks, which are not affected.
		 */
		if (mlen & 32) {
			partial += *(uint32_t *)(void *)data;
			partial += *(uint32_t *)(void *)(data + 4);
			partial += *(uint32_t *)(void *)(data + 8);
			partial += *(uint32_t *)(void *)(data + 12);
			partial += *(uint32_t *)(void *)(data + 16);
			partial += *(uint32_t *)(void *)(data + 20);
			partial += *(uint32_t *)(void *)(data + 24);
			partial += *(uint32_t *)(void *)(data + 28);
			data += 32;
		}
		if (mlen & 16) {
			partial += *(uint32_t *)(void *)data;
			partial += *(uint32_t *)(void *)(data + 4);
			partial += *(uint32_t *)(void *)(data + 8);
			partial += *(uint32_t *)(void *)(data + 12);
			data += 16;
		}
		if (mlen & 8) {
			partial += *(uint32_t *)(void *)data;
			partial += *(uint32_t *)(void *)(data + 4);
			data += 8;
		}
		if (mlen & 4) {
			partial += *(uint32_t *)(void *)data;
			data += 4;
		}
		if (mlen & 2) {
			partial += *(uint16_t *)(void *)data;
			data += 2;
		}
trailing_bytes:
		if (mlen & 1) {
			/* Trailing odd byte; the next mbuf starts odd */
#if BYTE_ORDER == LITTLE_ENDIAN
			partial += *data;
#else
			partial += *data << 8;
#endif
			started_on_odd = !started_on_odd;
		}

		if (needs_swap)
			partial = (partial << 8) + (partial >> 56);
		sum += (partial >> 32) + (partial & 0xffffffff);
		/*
		 * Reduce sum to allow potential byte swap
		 * in the next iteration without carry.
		 */
		sum = (sum >> 32) + (sum & 0xffffffff);
	}
	/* Fold the four 16-bit fields of the 64-bit accumulator down to
	 * 16 bits and return the one's complement */
	final_acc = (sum >> 48) + ((sum >> 32) & 0xffff) +
	    ((sum >> 16) & 0xffff) + (sum & 0xffff);
	final_acc = (final_acc >> 16) + (final_acc & 0xffff);
	final_acc = (final_acc >> 16) + (final_acc & 0xffff);
	return (~final_acc & 0xffff);
}
396#endif /* ULONG_MAX != 0xffffffffUL */
397