1/*	$NetBSD: in_cksum_arm.S,v 1.2 2003/09/23 10:01:36 scw Exp $	*/
2
3/*-
4 * Copyright 2003 Wasabi Systems, Inc.
5 * All rights reserved.
6 *
7 * Written by Steve C. Woodford for Wasabi Systems, Inc.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 *    notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 *    notice, this list of conditions and the following disclaimer in the
16 *    documentation and/or other materials provided with the distribution.
17 * 3. All advertising materials mentioning features or use of this software
18 *    must display the following acknowledgement:
19 *      This product includes software developed for the NetBSD Project by
20 *      Wasabi Systems, Inc.
21 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
22 *    or promote products derived from this software without specific prior
23 *    written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
27 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
29 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
30 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
31 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
32 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
33 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
34 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
35 * POSSIBILITY OF SUCH DAMAGE.
36 *
37 */
38
39/*
40 * Hand-optimised in_cksum() and do_cksum() implementations for ARM/armv5e
41 */
42
43#include "opt_inet.h"
44
45#include <machine/asm.h>
46#include "assym.s"
47__FBSDID("$FreeBSD$");
48
49	.syntax	unified
50/*
51 * int in_cksum(struct mbuf *m, int len)
52 *
53 * Entry:
54 *	r0	m
55 *	r1	len
56 *
57 * NOTE: Assumes 'm' is *never* NULL.
58 */
59/* LINTSTUB: Func: int in_cksum(struct mbuf *, int) */
ENTRY(in_cksum)
	/*
	 * Register roles for the walk over the mbuf chain:
	 *	ip	next mbuf to visit (loaded from the M_NEXT offset)
	 *	r8	running 32-bit sum over all chunks
	 *	r9	bytes of 'len' not yet summed
	 *	r10	number of bytes summed so far (offset into the data)
	 *	r11	bit 0 = parity of (offset XOR chunk address)
	 * r0-r7 are scratch; L_cksumdata clobbers all of them.
	 */
	push	{r4-r11, lr}
	mov	ip, r0
	mov	r9, r1
	mov	r10, #0x00
	mov	r8, #0x00

.Lin_cksum_loop:
	ldr	r0, [ip, #(M_DATA)]	/* r0 = data pointer of this mbuf */
	ldr	r1, [ip, #(M_LEN)]	/* r1 = length of this mbuf */
	ldr	ip, [ip, #(M_NEXT)]	/* Advance last: ip is the base above */
.Lin_cksum_entry4:
	cmp	r1, r9
	movgt	r1, r9			/* r1 = min(mbuf length, bytes wanted) */
	eor	r11, r10, r0		/* Uses r10 before it is advanced */
	add	r10, r10, r1
	sub	r9, r9, r1
	movs	r2, r1			/* Z set, r2 = 0 for an empty chunk */
	blne	_ASM_LABEL(L_cksumdata)	/* r2 = 32-bit sum of the chunk */
	/*
	 * When the chunk's offset and its address differ in parity,
	 * rotate the partial sum by one byte before accumulating it.
	 */
	tst	r11, #0x01
	movne	r2, r2, ror #8
	adds	r8, r8, r2
	adc	r8, r8, #0x00		/* Feed the carry-out back in */
	cmp	ip, #0x00
	bne	.Lin_cksum_loop		/* Until the chain ends */

	/* Fold the 32-bit sum to 16 bits and return its complement. */
	mov	r1, #0x10000
	sub	r1, r1, #0x01		/* r1 = 0xffff */
	mov	r0, r8, lsr #16		/* High half... */
	and	r2, r8, r1		/* ...plus low half */
	add	r0, r0, r2
	add	r0, r0, r0, lsr #16	/* Add in the carry from that addition */
	mvn	r0, r0
	and	r0, r0, r1		/* r0 = ~sum, low 16 bits only */
	pop	{r4-r11, pc}
END(in_cksum)
95
/*
 * do_cksum: wrapper that runs L_cksumdata over a single flat buffer.
 *
 * Entry:
 *	r0	Pointer to buffer
 *	r1	Buffer length in bytes
 *
 * Returns:
 *	r0	The 32-bit sum produced by L_cksumdata (not folded to 16 bits
 *		and not complemented), or 0 when the length is zero.
 *
 * r4-r7 are saved and restored here because L_cksumdata clobbers r0-r7.
 */
ENTRY(do_cksum)
	stmfd	sp!, {r4-r7, lr}
	/*
	 * L_cksumdata does not handle an empty buffer: with a length of
	 * zero its tail code still loads and sums the byte at [r0].
	 * Skip the call in that case; r2 is then already zero.
	 * in_cksum applies the same guard before its own call.
	 */
	adds	r2, r1, #0x00		/* r2 = len; Z set when len == 0 */
	blne	_ASM_LABEL(L_cksumdata)
	mov	r0, r2			/* Hand the sum back in r0 */
	ldmfd	sp!, {r4-r7, pc}
END(do_cksum)
102
103/*
104 * The main in*_cksum() workhorse...
105 *
106 * Entry parameters:
107 *	r0	Pointer to buffer
108 *	r1	Buffer length
109 *	lr	Return address
110 *
111 * Returns:
112 *	r2	Accumulated 32-bit sum
113 *
114 * Clobbers:
115 *	r0-r7
116 */
117/* LINTSTUB: Ignore */
118ASENTRY_NP(L_cksumdata)
	/*
	 * Adds the r1 bytes starting at r0 into the 32-bit accumulator r2,
	 * feeding each carry-out back into the sum.  The stages are:
	 * byte-wise alignment to a word boundary, a 64-byte unrolled loop,
	 * one optional 32-byte chunk, up to three 8-byte chunks reached via
	 * a computed jump, one optional word, and a 1-3 byte tail.
	 * The stack is not used.
	 *
	 * NOTE(review): a length of zero is not rejected here.  With r1 == 0
	 * the unaligned path (and the aligned _ARM_ARCH_5E path) reaches
	 * .Lcksumdata_endgame, which loads and sums the byte at [r0].
	 * in_cksum only calls this routine with a non-zero length.
	 */
119#ifdef _ARM_ARCH_5E
120	pld	[r0]			/* Pre-fetch the start of the buffer */
121#endif
122	mov	r2, #0
123
124	/* We first have to word-align the buffer.  */
125	ands	r7, r0, #0x03		/* r7 = address mod 4; 0 = aligned */
126	beq	.Lcksumdata_wordaligned
127	rsb	r7, r7, #0x04		/* r7 = bytes up to the boundary (1-3) */
128	cmp	r1, r7			/* Enough bytes left to make it? */
129	blt	.Lcksumdata_endgame	/* No: r1 < r7 <= 3, tail code copes */
	/*
	 * The flags from this compare stay live down to the orr
	 * instructions below; the loads and moves do not alter them.
	 * lt/eq/gt select 1, 2 or 3 bytes.  eq (r7 == 2) also means the
	 * start address is even, ne (r7 == 1 or 3) that it is odd.
	 */
130	cmp	r7, #0x02
131	ldrb	r4, [r0], #0x01		/* Fetch 1st byte */
132	ldrbge	r5, [r0], #0x01		/* Fetch 2nd byte */
133	movlt	r5, #0x00
134	ldrbgt	r6, [r0], #0x01		/* Fetch 3rd byte */
135	movle	r6, #0x00
136	/* Combine the three bytes depending on endianness and alignment */
	/*
	 * Little-endian case: an even start gives
	 * r2 = b1 | b2 << 8 | b3 << 16, an odd start gives
	 * r2 = b2 | b1 << 8 | b3 << 24.  The big-endian variant swaps
	 * the two layouts.
	 */
137#ifdef __ARMEB__
138	orreq	r2, r5, r4, lsl #8
139	orreq	r2, r2, r6, lsl #24
140	orrne	r2, r4, r5, lsl #8
141	orrne	r2, r2, r6, lsl #16
142#else
143	orreq	r2, r4, r5, lsl #8
144	orreq	r2, r2, r6, lsl #16
145	orrne	r2, r5, r4, lsl #8
146	orrne	r2, r2, r6, lsl #24
147#endif
148	subs	r1, r1, r7		/* Update length */
149	RETeq			/* All done? */
150
151	/* Buffer is now word aligned */
152.Lcksumdata_wordaligned:
153#ifdef _ARM_ARCH_5E
154	cmp	r1, #0x04		/* Less than 4 bytes left? */
155	blt	.Lcksumdata_endgame	/* Yup */
156
157	/* Now quad-align, if necessary */
	/*
	 * If bit 2 of the address is set, r7 receives one word of data and
	 * r0/r1 step past it; otherwise r7 keeps the zero produced by the
	 * ands.  Either way r7 is added to the sum later, in the big loop
	 * or at .Lcksumdata_bigloop_end.
	 */
158	ands	r7, r0, #0x04
159	ldrne	r7, [r0], #0x04
160	subne	r1, r1, #0x04
	/* From here r1 is biased by -64; negative = under 64 bytes left. */
161	subs	r1, r1, #0x40
162	blt	.Lcksumdata_bigloop_end	/* Note: C flag clear if branch taken */
163
164	/*
165	 * Buffer is now quad aligned. Sum 64 bytes at a time.
166	 * Note: First ldrd is hoisted above the loop, together with
167	 * setting r6 to zero to avoid stalling for results in the
168	 * loop. (r7 is live, from above).
169	 */
170	ldrd	r4, [r0], #0x08
171	mov	r6, #0x00
	/*
	 * At the loop label r4/r5 hold a loaded but unsummed pair and
	 * r6/r7 hold the pair left pending by the previous pass (first
	 * pass: r6 = 0, r7 = quad-align word or 0).  Each pass performs
	 * seven ldrd here plus the hoisted one, i.e. 64 bytes.  The pair
	 * loaded last into r6/r7 is summed by the next pass or after the
	 * loop.
	 */
172.Lcksumdata_bigloop:
173	pld	[r0, #0x18]
174	adds	r2, r2, r6		/* adds: starts a fresh carry chain */
175	adcs	r2, r2, r7
176	ldrd	r6, [r0], #0x08
177	adcs	r2, r2, r4
178	adcs	r2, r2, r5
179	ldrd	r4, [r0], #0x08
180	adcs	r2, r2, r6
181	adcs	r2, r2, r7
182	ldrd	r6, [r0], #0x08
183	adcs	r2, r2, r4
184	adcs	r2, r2, r5
185	ldrd	r4, [r0], #0x08
186	adcs	r2, r2, r6
187	adcs	r2, r2, r7
188	pld	[r0, #0x18]
189	ldrd	r6, [r0], #0x08
190	adcs	r2, r2, r4
191	adcs	r2, r2, r5
192	ldrd	r4, [r0], #0x08
193	adcs	r2, r2, r6
194	adcs	r2, r2, r7
195	ldrd	r6, [r0], #0x08
196	adcs	r2, r2, r4
197	adcs	r2, r2, r5
198	adc	r2, r2, #0x00		/* Fold the chain's final carry */
199	subs	r1, r1, #0x40
200	ldrdge	r4, [r0], #0x08		/* Hoisted first load of next pass */
201	bge	.Lcksumdata_bigloop
202
203	adds	r2, r2, r6		/* r6/r7 still need summing */
	/*
	 * Reached by fall-through (carry from the adds above) or by the
	 * branch taken before the loop (carry clear, r7 = quad-align word
	 * or 0).
	 */
204.Lcksumdata_bigloop_end:
205	adcs	r2, r2, r7
206	adc	r2, r2, #0x00
207
208#else	/* !_ARM_ARCH_5E */
209
	/* From here r1 is biased by -64; negative = under 64 bytes left. */
210	subs	r1, r1, #0x40
211	blt	.Lcksumdata_bigloop_end
212
	/* Sum 64 bytes per pass: four ldmia of four words each. */
213.Lcksumdata_bigloop:
214	ldmia	r0!, {r3, r4, r5, r6}
215	adds	r2, r2, r3		/* adds: starts a fresh carry chain */
216	adcs	r2, r2, r4
217	adcs	r2, r2, r5
218	ldmia	r0!, {r3, r4, r5, r7}
219	adcs	r2, r2, r6
220	adcs	r2, r2, r3
221	adcs	r2, r2, r4
222	adcs	r2, r2, r5
223	ldmia	r0!, {r3, r4, r5, r6}
224	adcs	r2, r2, r7
225	adcs	r2, r2, r3
226	adcs	r2, r2, r4
227	adcs	r2, r2, r5
228	ldmia	r0!, {r3, r4, r5, r7}
229	adcs	r2, r2, r6
230	adcs	r2, r2, r3
231	adcs	r2, r2, r4
232	adcs	r2, r2, r5
233	adcs	r2, r2, r7
234	adc	r2, r2, #0x00		/* Fold the chain's final carry */
235	subs	r1, r1, #0x40
236	bge	.Lcksumdata_bigloop
237.Lcksumdata_bigloop_end:
238#endif
239
	/* Undo the -64 bias: r1 = bytes still to sum (fewer than 64). */
240	adds	r1, r1, #0x40
241	RETeq
	/*
	 * The flags from this compare are consumed by the conditional
	 * load and/or the blt that follow in either variant.
	 */
242	cmp	r1, #0x20
243
244#ifdef _ARM_ARCH_5E
245	ldrdge	r4, [r0], #0x08		/* Avoid stalling pld and result */
246	blt	.Lcksumdata_less_than_32
	/* At least 32 bytes: sum four pairs (first one loaded above). */
247	pld	[r0, #0x18]
248	ldrd	r6, [r0], #0x08
249	adds	r2, r2, r4
250	adcs	r2, r2, r5
251	ldrd	r4, [r0], #0x08
252	adcs	r2, r2, r6
253	adcs	r2, r2, r7
254	ldrd	r6, [r0], #0x08
255	adcs	r2, r2, r4
256	adcs	r2, r2, r5
257	adcs	r2, r2, r6		/* XXX: Unavoidable result stall */
258	adcs	r2, r2, r7
259#else
260	blt	.Lcksumdata_less_than_32
	/* At least 32 bytes: sum eight words. */
261	ldmia	r0!, {r3, r4, r5, r6}
262	adds	r2, r2, r3
263	adcs	r2, r2, r4
264	adcs	r2, r2, r5
265	ldmia	r0!, {r3, r4, r5, r7}
266	adcs	r2, r2, r6
267	adcs	r2, r2, r3
268	adcs	r2, r2, r4
269	adcs	r2, r2, r5
270	adcs	r2, r2, r7
271#endif
272	adc	r2, r2, #0x00		/* Fold the chain's final carry */
273	subs	r1, r1, #0x20		/* 32 bytes consumed */
274	RETeq
275
276.Lcksumdata_less_than_32:
277	/* There are less than 32 bytes left */
	/*
	 * r3 = bytes held in whole 8-byte chunks (0, 8, 16 or 24) and r1
	 * drops to the 0-7 bytes beyond them.  r4 = 24 - r3 is scaled by
	 * 1.5 to 0, 12, 24 or 36: the byte size of the three-instruction
	 * blocks below that must be skipped.  The jump relies on A32
	 * behaviour: pc reads as this instruction's address plus 8 and
	 * every instruction is 4 bytes, so with the nop as padding
	 * pc + r4 lands on the start of a block.  When r4 is 0 the add is
	 * not executed and control falls through the nop.
	 */
278	and	r3, r1, #0x18
279	rsb	r4, r3, #0x18
280	sub	r1, r1, r3
281	adds	r4, r4, r4, lsr #1	/* Side effect: Clear carry flag */
282	addne	pc, pc, r4
283	nop
284
285/*
286 * Note: We use ldm here, even on armv5e, since the combined issue/result
287 * latencies for ldm and ldrd are the same. Using ldm avoids needless #ifdefs.
288 */
289	/* At least 24 bytes remaining... */
290	ldmia	r0!, {r4, r5}
291	adcs	r2, r2, r4
292	adcs	r2, r2, r5
293
294	/* At least 16 bytes remaining... */
295	ldmia	r0!, {r4, r5}
296	adcs	r2, r2, r4
297	adcs	r2, r2, r5
298
299	/* At least 8 bytes remaining... */
300	ldmia	r0!, {r4, r5}
301	adcs	r2, r2, r4
302	adcs	r2, r2, r5
303
304	/* Less than 8 bytes remaining... */
305	adc	r2, r2, #0x00		/* Fold the chain's final carry */
306	subs	r1, r1, #0x04		/* Bias r1 by -4: room for a word? */
307	blt	.Lcksumdata_lessthan4
308
309	ldr	r4, [r0], #0x04
310	sub	r1, r1, #0x04		/* Keep the bias after using 4 bytes */
311	adds	r2, r2, r4
312	adc	r2, r2, #0x00
313
314	/* Deal with < 4 bytes remaining */
315.Lcksumdata_lessthan4:
316	adds	r1, r1, #0x04		/* Remove the bias: r1 = 0-3 bytes */
317	RETeq
318
319	/* Deal with 1 to 3 remaining bytes, possibly misaligned */
	/*
	 * Also entered directly from the alignment code above when too few
	 * bytes remain to reach a word boundary, and from the
	 * _ARM_ARCH_5E aligned path when fewer than four bytes remain.
	 * r0 is not advanced here.
	 */
320.Lcksumdata_endgame:
321	ldrb	r3, [r0]		/* Fetch first byte */
322	cmp	r1, #0x02
323	ldrbge	r4, [r0, #0x01]		/* Fetch 2nd and 3rd as necessary */
324	movlt	r4, #0x00
325	ldrbgt	r5, [r0, #0x02]
326	movle	r5, #0x00
327	/* Combine the three bytes depending on endianness and alignment */
328	tst	r0, #0x01		/* eq = even address, ne = odd */
329#ifdef __ARMEB__
330	orreq	r3, r4, r3, lsl #8
331	orreq	r3, r3, r5, lsl #24
332	orrne	r3, r3, r4, lsl #8
333	orrne	r3, r3, r5, lsl #16
334#else
335	orreq	r3, r3, r4, lsl #8
336	orreq	r3, r3, r5, lsl #16
337	orrne	r3, r4, r3, lsl #8
338	orrne	r3, r3, r5, lsl #24
339#endif
340	adds	r2, r2, r3
341	adc	r2, r2, #0x00		/* Feed the carry-out back in */
342	RET
343END(L_cksumdata)
344
345