/*
 *  linux/arch/arm26/lib/csumpartialcopygeneric.S
 *
 *  Copyright (C) 1995-2001 Russell King
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * JMA 01/06/03 Commented out some shl0s; probably irrelevant to arm26
 *
 */

/*
 * unsigned int
 * csum_partial_copy_xxx(const char *src, char *dst, int len, int sum)
 *  r0 = src, r1 = dst, r2 = len, r3 = sum
 *  Returns : r0 = checksum
 *
 * Note that 'tst' and 'teq' preserve the carry flag.
 */
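
/*
 * For orientation, a rough C model of what this routine computes; an
 * illustrative sketch only, not the kernel's actual csum interface.
 * Bytes at even offsets weight the low byte of a 16-bit word and bytes
 * at odd offsets the high byte; carries are folded back in at the end.
 * (The odd-destination fixup is handled separately at .done below.)
 *
 *	unsigned int model(const unsigned char *src, unsigned char *dst,
 *			   int len, unsigned int sum)
 *	{
 *		unsigned long long acc = sum;
 *		int i;
 *
 *		for (i = 0; i < len; i++) {
 *			dst[i] = src[i];
 *			acc += (unsigned int)src[i] << (8 * (i & 1));
 *		}
 *		while (acc >> 32)
 *			acc = (acc & 0xffffffffULL) + (acc >> 32);
 *		return (unsigned int)acc;
 *	}
 */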

/* Quick hack */
		.macro	save_regs
		stmfd	sp!, {r1, r4 - r8, fp, ip, lr, pc}
		.endm

/* end Quick Hack */
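
/*
 * This "generic" body relies on its includer for the memory access
 * primitives: FN_ENTRY, load1b, load2b, load1l, load2l, load4l and
 * load_regs are macros supplied by whichever file includes this one
 * (in the kernel tree, the plain and user-space copy variants each
 * define them for their address space).
 */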

src	.req	r0
dst	.req	r1
len	.req	r2
sum	.req	r3

.zero:		mov	r0, sum
		load_regs	ea

		/*
		 * Align an unaligned destination pointer.  We know that
		 * we have >= 8 bytes here, so we don't need to check
		 * the length.  Note that the source pointer hasn't been
		 * aligned yet.
		 */
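		/*
		 * In checksum terms, a byte stored at an odd destination
		 * address is the high byte of its 16-bit word, so it is
		 * accumulated pre-shifted, roughly:
		 *
		 *	sum += (unsigned int)b << 8;	// lsl #byte(1)
		 *
		 * while a byte at an even address is added unshifted
		 * (lsl #byte(0)).
		 */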
.dst_unaligned:	tst	dst, #1
		beq	.dst_16bit

		load1b	ip
		sub	len, len, #1
		adcs	sum, sum, ip, lsl #byte(1)	@ update checksum
		strb	ip, [dst], #1
		tst	dst, #2
		moveq	pc, lr			@ dst is now 32bit aligned

.dst_16bit:	load2b	r8, ip
		sub	len, len, #2
		adcs	sum, sum, r8, lsl #byte(0)
		strb	r8, [dst], #1
		adcs	sum, sum, ip, lsl #byte(1)
		strb	ip, [dst], #1
		mov	pc, lr			@ dst is now 32bit aligned

		/*
		 * Handle 0 to 7 bytes, with any alignment of source and
		 * destination pointers.  Note that when we get here, C = 0
		 */
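		/*
		 * In outline: align dst to 16 bits if needed (one byte),
		 * copy two bytes at a time while len still has bit 1 or
		 * bit 2 set (tst len, #6), then finish with a single odd
		 * byte.
		 */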
.less8:		teq	len, #0			@ check for zero count
		beq	.zero

		/* we must have at least one byte. */
		tst	dst, #1			@ dst 16-bit aligned
		beq	.less8_aligned

		/* Align dst */
		load1b	ip
		sub	len, len, #1
		adcs	sum, sum, ip, lsl #byte(1)	@ update checksum
		strb	ip, [dst], #1
		tst	len, #6
		beq	.less8_byteonly

1:		load2b	r8, ip
		sub	len, len, #2
		adcs	sum, sum, r8, lsl #byte(0)
		strb	r8, [dst], #1
		adcs	sum, sum, ip, lsl #byte(1)
		strb	ip, [dst], #1
.less8_aligned:	tst	len, #6
		bne	1b
.less8_byteonly:
		tst	len, #1
		beq	.done
		load1b	r8
		adcs	sum, sum, r8, lsl #byte(0)	@ update checksum
		strb	r8, [dst], #1
		b	.done

FN_ENTRY
		mov	ip, sp
		save_regs
		sub	fp, ip, #4

		cmp	len, #8			@ Ensure that we have at least
		blo	.less8			@ 8 bytes to copy.

		adds	sum, sum, #0		@ C = 0
		tst	dst, #3			@ Test destination alignment
		blne	.dst_unaligned		@ align destination, return here

		/*
		 * Ok, the dst pointer is now 32bit aligned, and we know
		 * that we must have more than 4 bytes to copy.  Note
		 * that C contains the carry from the dst alignment above.
		 */
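		/*
		 * The main loop below moves 16 bytes per iteration with
		 * ldm/stm and folds each word in with a chain of adcs, so
		 * the carry out of one addition feeds the next; .done adds
		 * the final carry back in (end-around carry).  Roughly:
		 *
		 *	while (blocks--) {
		 *		copy w0..w3;
		 *		sum += w0 + w1 + w2 + w3;	// with carry
		 *	}
		 *	// then 8- and 4-byte tails, then 0-3 odd bytes
		 */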

		tst	src, #3			@ Test source alignment
		bne	.src_not_aligned

		/* Routine for src & dst aligned */

		bics	ip, len, #15
		beq	2f

1:		load4l	r4, r5, r6, r7
		stmia	dst!, {r4, r5, r6, r7}
		adcs	sum, sum, r4
		adcs	sum, sum, r5
		adcs	sum, sum, r6
		adcs	sum, sum, r7
		sub	ip, ip, #16
		teq	ip, #0
		bne	1b

2:		ands	ip, len, #12
		beq	4f
		tst	ip, #8
		beq	3f
		load2l	r4, r5
		stmia	dst!, {r4, r5}
		adcs	sum, sum, r4
		adcs	sum, sum, r5
		tst	ip, #4
		beq	4f

3:		load1l	r4
		str	r4, [dst], #4
		adcs	sum, sum, r4

4:		ands	len, len, #3
		beq	.done
		load1l	r4
		tst	len, #2
		mov	r5, r4			@ r5 = byte 0 of r4; this is
						@ one of the "shl0" movs: the
						@ lsr #byte(0) shift is a
						@ no-op, but the copy into r5
						@ is still needed below
		beq	.exit
		adcs	sum, sum, r4, push #16
		strb	r5, [dst], #1
		mov	r5, r4, lsr #byte(1)
		strb	r5, [dst], #1
		mov	r5, r4, lsr #byte(2)
.exit:		tst	len, #1
		strneb	r5, [dst], #1
		andne	r5, r5, #255
		adcnes	sum, sum, r5, lsl #byte(0)

		/*
		 * If the dst pointer was not 16-bit aligned, we
		 * need to rotate the checksum here to get around
		 * the inefficient byte manipulations in the
		 * architecture independent code.
		 */
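		/*
		 * That is, an 8-bit rotate left, equivalent in C to:
		 *
		 *	if (dst & 1)
		 *		sum = (sum << 8) | (sum >> 24);
		 */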
.done:		adc	r0, sum, #0
		ldr	sum, [sp, #0]		@ dst
		tst	sum, #1
		movne	sum, r0, lsl #8
		orrne	r0, sum, r0, lsr #24
		load_regs	ea

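/*
 * Misaligned source: src is rounded down to a word boundary and each
 * output word is assembled from two neighbouring input words.  arm26
 * is little-endian, so pull is a right shift and push a left shift;
 * with the source 1 byte past alignment the recombination is roughly:
 *
 *	out = (cur >> 8) | (next << 24);	// pull #8 / push #24
 *
 * The 2- and 3-byte cases below shift by 16/16 and 24/8 instead.
 */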
.src_not_aligned:
		adc	sum, sum, #0		@ include C from dst alignment
		and	ip, src, #3
		bic	src, src, #3
		load1l	r5
		cmp	ip, #2
		beq	.src2_aligned
		bhi	.src3_aligned
		mov	r4, r5, pull #8		@ C = 0
		bics	ip, len, #15
		beq	2f
1:		load4l	r5, r6, r7, r8
		orr	r4, r4, r5, push #24
		mov	r5, r5, pull #8
		orr	r5, r5, r6, push #24
		mov	r6, r6, pull #8
		orr	r6, r6, r7, push #24
		mov	r7, r7, pull #8
		orr	r7, r7, r8, push #24
		stmia	dst!, {r4, r5, r6, r7}
		adcs	sum, sum, r4
		adcs	sum, sum, r5
		adcs	sum, sum, r6
		adcs	sum, sum, r7
		mov	r4, r8, pull #8
		sub	ip, ip, #16
		teq	ip, #0
		bne	1b
2:		ands	ip, len, #12
		beq	4f
		tst	ip, #8
		beq	3f
		load2l	r5, r6
		orr	r4, r4, r5, push #24
		mov	r5, r5, pull #8
		orr	r5, r5, r6, push #24
		stmia	dst!, {r4, r5}
		adcs	sum, sum, r4
		adcs	sum, sum, r5
		mov	r4, r6, pull #8
		tst	ip, #4
		beq	4f
3:		load1l	r5
		orr	r4, r4, r5, push #24
		str	r4, [dst], #4
		adcs	sum, sum, r4
		mov	r4, r5, pull #8
4:		ands	len, len, #3
		beq	.done
		mov	r5, r4			@ r5 = byte 0 of r4 (the
						@ "shl0" mov: no-op shift,
						@ needed register copy)
		tst	len, #2
		beq	.exit
		adcs	sum, sum, r4, push #16
		strb	r5, [dst], #1
		mov	r5, r4, lsr #byte(1)
		strb	r5, [dst], #1
		mov	r5, r4, lsr #byte(2)
		b	.exit

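/* Same recombination, source 2 bytes past alignment: out = (cur >> 16) | (next << 16). */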
.src2_aligned:	mov	r4, r5, pull #16
		adds	sum, sum, #0
		bics	ip, len, #15
		beq	2f
1:		load4l	r5, r6, r7, r8
		orr	r4, r4, r5, push #16
		mov	r5, r5, pull #16
		orr	r5, r5, r6, push #16
		mov	r6, r6, pull #16
		orr	r6, r6, r7, push #16
		mov	r7, r7, pull #16
		orr	r7, r7, r8, push #16
		stmia	dst!, {r4, r5, r6, r7}
		adcs	sum, sum, r4
		adcs	sum, sum, r5
		adcs	sum, sum, r6
		adcs	sum, sum, r7
		mov	r4, r8, pull #16
		sub	ip, ip, #16
		teq	ip, #0
		bne	1b
2:		ands	ip, len, #12
		beq	4f
		tst	ip, #8
		beq	3f
		load2l	r5, r6
		orr	r4, r4, r5, push #16
		mov	r5, r5, pull #16
		orr	r5, r5, r6, push #16
		stmia	dst!, {r4, r5}
		adcs	sum, sum, r4
		adcs	sum, sum, r5
		mov	r4, r6, pull #16
		tst	ip, #4
		beq	4f
3:		load1l	r5
		orr	r4, r4, r5, push #16
		str	r4, [dst], #4
		adcs	sum, sum, r4
		mov	r4, r5, pull #16
4:		ands	len, len, #3
		beq	.done
		mov	r5, r4			@ r5 = byte 0 of r4 (the
						@ "shl0" mov: no-op shift,
						@ needed register copy)
		tst	len, #2
		beq	.exit
		adcs	sum, sum, r4
		strb	r5, [dst], #1
		mov	r5, r4, lsr #byte(1)
		strb	r5, [dst], #1
		tst	len, #1
		beq	.done
		load1b	r5
		b	.exit

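/* And source 3 bytes past alignment: out = (cur >> 24) | (next << 8). */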
.src3_aligned:	mov	r4, r5, pull #24
		adds	sum, sum, #0
		bics	ip, len, #15
		beq	2f
1:		load4l	r5, r6, r7, r8
		orr	r4, r4, r5, push #8
		mov	r5, r5, pull #24
		orr	r5, r5, r6, push #8
		mov	r6, r6, pull #24
		orr	r6, r6, r7, push #8
		mov	r7, r7, pull #24
		orr	r7, r7, r8, push #8
		stmia	dst!, {r4, r5, r6, r7}
		adcs	sum, sum, r4
		adcs	sum, sum, r5
		adcs	sum, sum, r6
		adcs	sum, sum, r7
		mov	r4, r8, pull #24
		sub	ip, ip, #16
		teq	ip, #0
		bne	1b
2:		ands	ip, len, #12
		beq	4f
		tst	ip, #8
		beq	3f
		load2l	r5, r6
		orr	r4, r4, r5, push #8
		mov	r5, r5, pull #24
		orr	r5, r5, r6, push #8
		stmia	dst!, {r4, r5}
		adcs	sum, sum, r4
		adcs	sum, sum, r5
		mov	r4, r6, pull #24
		tst	ip, #4
		beq	4f
3:		load1l	r5
		orr	r4, r4, r5, push #8
		str	r4, [dst], #4
		adcs	sum, sum, r4
		mov	r4, r5, pull #24
4:		ands	len, len, #3
		beq	.done
		mov	r5, r4			@ r5 = byte 0 of r4 (a
						@ "shl0" mov: no-op shift,
						@ needed register copy)
		tst	len, #2
		beq	.exit
		strb	r5, [dst], #1
		adcs	sum, sum, r4
		load1l	r4
		mov	r5, r4			@ byte 0 of the new word
						@ (likewise a "shl0" mov)
		strb	r5, [dst], #1
		adcs	sum, sum, r4, push #24
		mov	r5, r4, lsr #byte(1)
		b	.exit