/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IP/TCP/UDP checksumming routines
 *
 * Xtensa version:  Copyright (C) 2001 Tensilica, Inc. by Kevin Chea
 *                  Optimized by Joe Taylor
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <asm/errno.h>
#include <linux/linkage.h>
#include <asm/variant/core.h>

/*
 * computes a partial checksum, e.g. for TCP/UDP fragments
 */

/*
 * unsigned int csum_partial(const unsigned char *buf, int len,
 *                           unsigned int sum);
 *    a2 = buf
 *    a3 = len
 *    a4 = sum
 *
 * This function is optimized for 2- and 4-byte aligned buffers.  Odd
 * alignment is handled by a slow fall-back path below, so avoid it.
 */
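
/*
 * For reference, a plain C model of what this routine computes -- an
 * illustrative sketch, not the kernel's generic implementation.  The
 * assembly accumulates 32 bits at a time, which folds to the same
 * 16-bit result; the 32-bit sum is returned unfolded, and callers fold
 * it (e.g. with csum_fold()) when they need the final checksum.  The
 * names ones_add32() and csum_partial_ref() are ours, for exposition:
 *
 *	static unsigned int ones_add32(unsigned int sum, unsigned int val)
 *	{
 *		sum += val;
 *		return (sum < val) ? sum + 1 : sum;	// end-around carry
 *	}
 *
 *	unsigned int csum_partial_ref(const unsigned char *buf, int len,
 *				      unsigned int sum)
 *	{
 *		while (len > 1) {	// model assumes 2-byte alignment
 *			sum = ones_add32(sum, *(const unsigned short *)buf);
 *			buf += 2;
 *			len -= 2;
 *		}
 *		if (len)	// trailing byte; big-endian shifts it
 *			sum = ones_add32(sum, *buf);	// into bits 8..15
 *		return sum;
 *	}
 */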

/* ONES_ADD converts two's-complement math to one's-complement. */
#define ONES_ADD(sum, val)	  \
	add	sum, sum, val	; \
	bgeu	sum, val, 99f	; \
	addi	sum, sum, 1	; \
99:				;

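/*
 * Example of the end-around carry: adding 0xffff0000 + 0x00010000 in
 * plain two's complement wraps to 0x00000000 and drops the carry; the
 * bgeu detects the wrap (sum < val, unsigned) and the addi folds the
 * carry back in, giving 0x00000001, the correct one's-complement sum.
 * In C terms:
 *
 *	sum += val;
 *	if (sum < val)	// unsigned wrap => carry out of bit 31
 *		sum++;
 */
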
.text
ENTRY(csum_partial)
	  /*
	   * Experiments with Ethernet and SLIP connections show that buf
	   * is aligned on either a 2-byte or 4-byte boundary.
	   */
	entry	sp, 32
	extui	a5, a2, 0, 2
	bnez	a5, 8f		/* branch if not 4-byte aligned */
	/* Fall through on the common case, 4-byte alignment */
1:
	srli	a5, a3, 5	/* 32-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a5, 2f
#else
	beqz	a5, 2f
	slli	a5, a5, 5
	add	a5, a5, a2	/* a5 = end of last 32-byte chunk */
.Loop1:
#endif
	l32i	a6, a2, 0
	l32i	a7, a2, 4
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	l32i	a6, a2, 8
	l32i	a7, a2, 12
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	l32i	a6, a2, 16
	l32i	a7, a2, 20
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	l32i	a6, a2, 24
	l32i	a7, a2, 28
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	addi	a2, a2, 4*8
#if !XCHAL_HAVE_LOOPS
	blt	a2, a5, .Loop1
#endif
2:
	extui	a5, a3, 2, 3	/* remaining 4-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a5, 3f
#else
	beqz	a5, 3f
	slli	a5, a5, 2
	add	a5, a5, a2	/* a5 = end of last 4-byte chunk */
.Loop2:
#endif
	l32i	a6, a2, 0
	ONES_ADD(a4, a6)
	addi	a2, a2, 4
#if !XCHAL_HAVE_LOOPS
	blt	a2, a5, .Loop2
#endif
3:
	_bbci.l	a3, 1, 5f	/* remaining 2-byte chunk */
	l16ui	a6, a2, 0
	ONES_ADD(a4, a6)
	addi	a2, a2, 2
5:
	_bbci.l	a3, 0, 7f	/* remaining 1-byte chunk */
6:	l8ui	a6, a2, 0
#ifdef __XTENSA_EB__
	slli	a6, a6, 8	/* load byte into bits 8..15 */
#endif
	ONES_ADD(a4, a6)
7:
	mov	a2, a4
	retw

	/* uncommon case, buf is 2-byte aligned */
8:
	beqz	a3, 7b		/* branch if len == 0 */
	beqi	a3, 1, 6b	/* branch if len == 1 */

	extui	a5, a2, 0, 1
	bnez	a5, 8f		/* branch if odd address */

	l16ui	a6, a2, 0	/* common case, len >= 2 */
	ONES_ADD(a4, a6)
	addi	a2, a2, 2	/* adjust buf */
	addi	a3, a3, -2	/* adjust len */
	j	1b		/* now buf is 4-byte aligned */

	/* case: odd-byte aligned, len > 1
	 * This case is dog slow, so don't give us an odd address.
	 * (I don't think this ever happens, but just in case.)
	 */
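
	/*
	 * The three loads below assemble one naturally-ordered 32-bit
	 * word from an odd address without any unaligned access.  In C
	 * terms, for little-endian (big-endian mirrors the shifts):
	 *
	 *	w = buf[0] | ((unsigned)load16(buf + 1) << 8)
	 *		   | ((unsigned)buf[3] << 24);
	 *
	 * where load16() stands for the 2-byte load; buf + 1 is even
	 * here because buf itself is odd.  The assembled words are
	 * then summed as in the aligned case.
	 */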
8:
	srli	a5, a3, 2	/* 4-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a5, 2f
#else
	beqz	a5, 2f
	slli	a5, a5, 2
	add	a5, a5, a2	/* a5 = end of last 4-byte chunk */
.Loop3:
#endif
	l8ui	a6, a2, 0	/* bits 24..31 */
	l16ui	a7, a2, 1	/* bits  8..23 */
	l8ui	a8, a2, 3	/* bits  0.. 7 */
#ifdef	__XTENSA_EB__
	slli	a6, a6, 24
#else
	slli	a8, a8, 24
#endif
	slli	a7, a7, 8
	or	a7, a7, a6
	or	a7, a7, a8
	ONES_ADD(a4, a7)
	addi	a2, a2, 4
#if !XCHAL_HAVE_LOOPS
	blt	a2, a5, .Loop3
#endif
2:
	_bbci.l	a3, 1, 3f	/* remaining 2-byte chunk, still odd addr */
	l8ui	a6, a2, 0
	l8ui	a7, a2, 1
#ifdef	__XTENSA_EB__
	slli	a6, a6, 8
#else
	slli	a7, a7, 8
#endif
	or	a7, a7, a6
	ONES_ADD(a4, a7)
	addi	a2, a2, 2
3:
	j	5b		/* branch to handle the remaining byte */


/*
 * Copy from src while checksumming, otherwise like csum_partial.
 *
 * The macros SRC and DST specify the type of access for the instruction;
 * thus we can call a custom exception handler for each access type.
 */

#define SRC(y...)			\
	9999: y;			\
	.section __ex_table, "a";	\
	.long 9999b, 6001f	;	\
	.previous

#define DST(y...)			\
	9999: y;			\
	.section __ex_table, "a";	\
	.long 9999b, 6002f	;	\
	.previous
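
/*
 * How the fixup works: each SRC/DST wrapper tags one load or store
 * with a local label and emits a (fault address, fixup address) pair
 * into the __ex_table section.  When the wrapped access faults, the
 * kernel's fault handler looks the faulting PC up in __ex_table and
 * resumes at the recorded fixup (6001f for source reads, 6002f for
 * destination writes) instead of oopsing.  Each .long pair
 * corresponds to the kernel's:
 *
 *	struct exception_table_entry {
 *		unsigned long insn;	// address of the faulting access
 *		unsigned long fixup;	// address to continue at
 *	};
 */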

/*
unsigned int csum_partial_copy_generic (const char *src, char *dst, int len,
					int sum, int *src_err_ptr, int *dst_err_ptr)
	a2  = src
	a3  = dst
	a4  = len
	a5  = sum
	a6  = src_err_ptr
	a7  = dst_err_ptr
	a8  = temp
	a9  = temp
	a10 = temp
	a11 = original len for exception handling
	a12 = original dst for exception handling

    This function is optimized for 4-byte aligned addresses.  Other
    alignments work, but not nearly as efficiently.
 */
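
/*
 * A rough C model of the contract (illustrative only): copy len bytes
 * from src to dst while accumulating the one's-complement sum.  On a
 * faulting source read, -EFAULT is stored through src_err_ptr and the
 * whole destination is zeroed; on a faulting destination write,
 * -EFAULT is stored through dst_err_ptr.  Fault-free operation is
 * equivalent to:
 *
 *	memcpy(dst, src, len);
 *	sum = csum_partial(dst, len, sum);
 */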

ENTRY(csum_partial_copy_generic)
	entry	sp, 32
	mov	a12, a3
	mov	a11, a4
	or	a10, a2, a3

	/* We optimize the following alignment tests for the 4-byte
	aligned case.  Two bbsi.l instructions might seem more optimal
	(commented out below).  However, both labels 5: and 3: are out
	of the imm8 range, so the assembler relaxes them into
	equivalent bbci.l, j combinations, which is actually
	slower. */
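
	/* For example, a conditional branch whose target is out of
	range, such as
		bbsi.l	a10, 0, 5f
	is relaxed by the assembler into the inverted test plus an
	unconditional jump (the exact expansion here is our sketch):
		bbci.l	a10, 0, 0f
		j	5f
	0:
	which puts a taken branch on the common, 4-byte-aligned path. */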

	extui	a9, a10, 0, 2
	beqz	a9, 1f		/* branch if both are 4-byte aligned */
	bbsi.l	a10, 0, 5f	/* branch if one address is odd */
	j	3f		/* one address is 2-byte aligned */

/*	_bbsi.l	a10, 0, 5f */	/* branch if odd address */
/*	_bbsi.l	a10, 1, 3f */	/* branch if 2-byte-aligned address */

1:
	/* src and dst are both 4-byte aligned */
	srli	a10, a4, 5	/* 32-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 2f
#else
	beqz	a10, 2f
	slli	a10, a10, 5
	add	a10, a10, a2	/* a10 = end of last 32-byte src chunk */
.Loop5:
#endif
SRC(	l32i	a9, a2, 0	)
SRC(	l32i	a8, a2, 4	)
DST(	s32i	a9, a3, 0	)
DST(	s32i	a8, a3, 4	)
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
SRC(	l32i	a9, a2, 8	)
SRC(	l32i	a8, a2, 12	)
DST(	s32i	a9, a3, 8	)
DST(	s32i	a8, a3, 12	)
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
SRC(	l32i	a9, a2, 16	)
SRC(	l32i	a8, a2, 20	)
DST(	s32i	a9, a3, 16	)
DST(	s32i	a8, a3, 20	)
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
SRC(	l32i	a9, a2, 24	)
SRC(	l32i	a8, a2, 28	)
DST(	s32i	a9, a3, 24	)
DST(	s32i	a8, a3, 28	)
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
	addi	a2, a2, 32
	addi	a3, a3, 32
#if !XCHAL_HAVE_LOOPS
	blt	a2, a10, .Loop5
#endif
2:
	extui	a10, a4, 2, 3	/* remaining 4-byte chunks */
	extui	a4, a4, 0, 2	/* reset len for general-case, 2-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 3f
#else
	beqz	a10, 3f
	slli	a10, a10, 2
	add	a10, a10, a2	/* a10 = end of last 4-byte src chunk */
.Loop6:
#endif
SRC(	l32i	a9, a2, 0	)
DST(	s32i	a9, a3, 0	)
	ONES_ADD(a5, a9)
	addi	a2, a2, 4
	addi	a3, a3, 4
#if !XCHAL_HAVE_LOOPS
	blt	a2, a10, .Loop6
#endif
3:
	/*
	Control reaches here in two cases: (1) it falls through from
	the 4-byte-aligned case above to process, at most, one 2-byte
	chunk; (2) it branches here from the alignment tests when
	either src or dst is 2-byte aligned, and all bytes are
	processed here, except for perhaps a trailing odd byte.  It's
	inefficient, so align your addresses to 4-byte boundaries.

	a2 = src
	a3 = dst
	a4 = len
	a5 = sum
	*/
	srli	a10, a4, 1	/* 2-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 4f
#else
	beqz	a10, 4f
	slli	a10, a10, 1
	add	a10, a10, a2	/* a10 = end of last 2-byte src chunk */
.Loop7:
#endif
SRC(	l16ui	a9, a2, 0	)
DST(	s16i	a9, a3, 0	)
	ONES_ADD(a5, a9)
	addi	a2, a2, 2
	addi	a3, a3, 2
#if !XCHAL_HAVE_LOOPS
	blt	a2, a10, .Loop7
#endif
4:
	/* This section processes a possible trailing odd byte. */
	_bbci.l	a4, 0, 8f	/* 1-byte chunk */
SRC(	l8ui	a9, a2, 0	)
DST(	s8i	a9, a3, 0	)
#ifdef __XTENSA_EB__
	slli	a9, a9, 8	/* shift byte to bits 8..15 */
#endif
	ONES_ADD(a5, a9)
8:
	mov	a2, a5
	retw

5:
	/* Control branches here when either src or dst is odd.  We
	process all bytes using 8-bit accesses.  Grossly inefficient,
	so don't feed us an odd address. */

	srli	a10, a4, 1	/* handle in pairs for 16-bit csum */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 6f
#else
	beqz	a10, 6f
	slli	a10, a10, 1
	add	a10, a10, a2	/* a10 = end of last odd-aligned, 2-byte src chunk */
.Loop8:
#endif
SRC(	l8ui	a9, a2, 0	)
SRC(	l8ui	a8, a2, 1	)
DST(	s8i	a9, a3, 0	)
DST(	s8i	a8, a3, 1	)
#ifdef __XTENSA_EB__
	slli	a9, a9, 8	/* combine into a single 16-bit value */
#else				/* for checksum computation */
	slli	a8, a8, 8
#endif
	or	a9, a9, a8
	ONES_ADD(a5, a9)
	addi	a2, a2, 2
	addi	a3, a3, 2
#if !XCHAL_HAVE_LOOPS
	blt	a2, a10, .Loop8
#endif
6:
	j	4b		/* process the possible trailing odd byte */


# Exception handler:
.section .fixup, "ax"
/*
	a6  = src_err_ptr
	a7  = dst_err_ptr
	a11 = original len for exception handling
	a12 = original dst for exception handling
*/

6001:
	movi	a2, -EFAULT
	s32i	a2, a6, 0	/* src_err_ptr */

	# clear the complete destination - computing the rest
	# is too much work
	movi	a2, 0
#if XCHAL_HAVE_LOOPS
	loopgtz	a11, 2f
#else
	beqz	a11, 2f
	add	a11, a11, a12	/* a11 = ending address */
.Leloop:
#endif
	s8i	a2, a12, 0
	addi	a12, a12, 1
#if !XCHAL_HAVE_LOOPS
	blt	a12, a11, .Leloop
#endif
2:
	retw

6002:
	movi	a2, -EFAULT
	s32i	a2, a7, 0	/* dst_err_ptr */
	movi	a2, 0
	retw

.previous