1/*	$NetBSD: blockio.S,v 1.5 2002/08/15 01:38:16 briggs Exp $	*/
2
3/*-
4 * Copyright (c) 2001 Ben Harris.
5 * Copyright (c) 1994 Mark Brinicombe.
6 * Copyright (c) 1994 Brini.
7 * All rights reserved.
8 *
9 * This code is derived from software written for Brini by Mark Brinicombe
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 *    notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 *    notice, this list of conditions and the following disclaimer in the
18 *    documentation and/or other materials provided with the distribution.
19 * 3. All advertising materials mentioning features or use of this software
20 *    must display the following acknowledgement:
21 *	This product includes software developed by Brini.
22 * 4. The name of the company nor the name of the author may be used to
23 *    endorse or promote products derived from this software without specific
24 *    prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY BRINI ``AS IS'' AND ANY EXPRESS OR IMPLIED
27 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
28 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
29 * IN NO EVENT SHALL BRINI OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
30 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
31 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
32 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 * RiscBSD kernel project
39 *
40 * blockio.S
41 *
42 * optimised block read/write from/to IO routines.
43 *
44 * Created      : 08/10/94
45 * Modified	: 22/01/99  -- R.Earnshaw
46 *			       Faster, and small tweaks for StrongARM
47 */
48
49#include <machine/asm.h>
50
51__FBSDID("$FreeBSD$");
52
53	.syntax	unified
54
/*
 * read_multi_1: read bytes from a single I/O address into a block of memory.
 *
 * r0 = address to read from (IO); never advanced, every byte is read
 *      from this same location
 * r1 = address to write to (memory); advanced as bytes are stored
 * r2 = length in bytes
 *
 * Bytes are stored singly until r1 is word aligned, then four byte reads
 * at a time are merged into one word (first byte read in bits 7:0) and
 * stored with a single str; the final 1-3 bytes are stored singly.
 * r3 and r12 are used as scratch.
 */

/* This code will look very familiar if you've read _memcpy(). */
ENTRY(read_multi_1)
	mov	ip, sp
	stmfd	sp!, {fp, ip, lr, pc}	/* save fp, caller's sp, lr, pc */
	sub	fp, ip, #4		/* fp -> saved pc slot */
	subs	r2, r2, #4		/* r2 = length - 4 */
	blt	.Lrm1_l4		/* less than 4 bytes */
	ands	r12, r1, #3
	beq	.Lrm1_main		/* aligned destination */
	rsb	r12, r12, #4		/* r12 = 1..3 bytes up to alignment */
	cmp	r12, #2
	ldrb	r3, [r0]		/* always at least one byte */
	strb	r3, [r1], #1
	ldrbge	r3, [r0]		/* second byte if r12 >= 2 */
	strbge	r3, [r1], #1
	ldrbgt	r3, [r0]		/* third byte if r12 == 3 */
	strbgt	r3, [r1], #1
	subs	r2, r2, r12		/* account for the bytes just copied */
	blt	.Lrm1_l4
.Lrm1_main:
.Lrm1loop:
	/* Gather four byte reads into r3, lowest byte first. */
	ldrb	r3, [r0]
	ldrb	r12, [r0]
	orr	r3, r3, r12, lsl #8
	ldrb	r12, [r0]
	orr	r3, r3, r12, lsl #16
	ldrb	r12, [r0]
	orr	r3, r3, r12, lsl #24
	str	r3, [r1], #4
	subs	r2, r2, #4
	bge	.Lrm1loop
.Lrm1_l4:
	adds	r2, r2, #4		/* r2 = bytes still to copy */
	ldmdbeq	fp, {fp, sp, pc}	/* nothing left: unwind and return */
	cmp	r2, #2
	ldrb	r3, [r0]		/* 1, 2 or 3 trailing bytes */
	strb	r3, [r1], #1
	ldrbge	r3, [r0]
	strbge	r3, [r1], #1
	ldrbgt	r3, [r0]
	strbgt	r3, [r1], #1
	ldmdb	fp, {fp, sp, pc}	/* restore fp and sp, return via saved lr */
END(read_multi_1)
107
/*
 * write_multi_1: write a block of memory, one byte at a time, to a
 * single I/O address.
 *
 * r0 = address to write to (IO); never advanced
 * r1 = address to read from (memory); advanced as bytes are consumed
 * r2 = length in bytes
 *
 * Scratch registers: r3 and ip (r12).
 */

/* Same shape as _memcpy(): align, copy whole words, then mop up. */
ENTRY(write_multi_1)
	mov	ip, sp
	stmdb	sp!, {fp, ip, lr, pc}
	sub	fp, ip, #4
	subs	r2, r2, #4		/* bias the count: r2 = length - 4 */
	blt	.Lwm1_tail		/* fewer than 4 bytes in total */
	ands	ip, r1, #3		/* ip = source misalignment */
	beq	.Lwm1_words		/* source is already word aligned */
	rsb	ip, ip, #4		/* ip = bytes needed to reach alignment */
	cmp	ip, #2
	ldrb	r3, [r1], #1		/* first byte is unconditional */
	strb	r3, [r0]
	ldrbge	r3, [r1], #1		/* second byte when ip >= 2 */
	strbge	r3, [r0]
	ldrbgt	r3, [r1], #1		/* third byte when ip == 3 */
	strbgt	r3, [r0]
	subs	r2, r2, ip
	blt	.Lwm1_tail
.Lwm1_words:
	/* Fetch one word and emit its four bytes, bits 7:0 first. */
	ldr	r3, [r1], #4
	strb	r3, [r0]
	lsr	r3, r3, #8
	strb	r3, [r0]
	lsr	r3, r3, #8
	strb	r3, [r0]
	lsr	r3, r3, #8
	strb	r3, [r0]
	subs	r2, r2, #4
	bge	.Lwm1_words
.Lwm1_tail:
	adds	r2, r2, #4		/* undo the bias: r2 = bytes left */
	ldmdbeq	fp, {fp, sp, pc}	/* none left, return */
	cmp	r2, #2
	ldrb	r3, [r1], #1		/* 1, 2 or 3 trailing bytes */
	strb	r3, [r0]
	ldrbge	r3, [r1], #1
	strbge	r3, [r0]
	ldrbgt	r3, [r1], #1
	strbgt	r3, [r0]
	ldmdb	fp, {fp, sp, pc}
END(write_multi_1)
159
/*
 * insw: read 16-bit values from an I/O address into a block of memory.
 *
 * r0 = address to read from (IO); not advanced
 * r1 = address to write to (memory)
 * r2 = length, counted in 16-bit values
 *
 * Scratch registers: r3 and ip.
 */

ENTRY(insw)
	/* Return immediately unless the count is positive. */
	cmp	r2, #0
	movle	pc, lr

	/*
	 * The paired path needs an even count and a word aligned
	 * destination.
	 */
	tst	r2, #1
	tsteq	r1, #3
	beq	.Linsw_pairs

	/* General case: one 16-bit value per pass, stored as two bytes. */
.Linsw_single:
	ldr	r3, [r0]
	subs	r2, r2, #1		/* loop test, placed after the load */
	strb	r3, [r1], #1		/* bits 7:0 */
	lsr	r3, r3, #8
	strb	r3, [r1], #1		/* bits 15:8 */
	bgt	.Linsw_single

	RET

	/* Aligned case: two reads are packed into one stored word. */
.Linsw_pairs:
	/*
	 * The first load uses a non word aligned address on purpose;
	 * NOTE(review): this appears to rely on the CPU rotating the
	 * loaded word so the value lands in bits 31:16 - confirm for
	 * the target core.
	 */
	ldr	r3, [r0, #2]
	ldr	ip, [r0]
	lsr	r3, r3, #16		/* first value -> bits 15:0 */
	orr	r3, r3, ip, lsl #16	/* second value -> bits 31:16 */
	str	r3, [r1], #4
	subs	r2, r2, #2
	bgt	.Linsw_pairs

	RET
END(insw)
207
/*
 * outsw: write 16-bit values from a block of memory to an I/O address.
 *
 * r0 = address to write to (IO); not advanced
 * r1 = address to read from (memory)
 * r2 = length, counted in 16-bit values
 *
 * Every store to r0 is a full word carrying the same 16-bit value in
 * both halves.  Scratch registers: r3 and ip.
 */

ENTRY(outsw)
	/* Return immediately unless the count is positive. */
	cmp	r2, #0
	movle	pc, lr

	/* The paired path needs an even count and a word aligned source. */
	tst	r2, #1
	tsteq	r1, #3
	beq	.Loutsw_pairs

	/* General case: assemble each value from two byte loads. */
.Loutsw_single:
	ldrb	r3, [r1], #1		/* bits 7:0 */
	ldrb	ip, [r1], #1		/* bits 15:8 */
	subs	r2, r2, #1		/* loop test, placed after the loads */
	orr	r3, r3, ip, lsl #8
	orr	r3, r3, r3, lsl #16	/* copy the value into bits 31:16 */
	str	r3, [r0]
	bgt	.Loutsw_single

	RET

	/* Aligned case: one word load supplies two values. */
.Loutsw_pairs:
	ldr	r3, [r1], #4		/* r3 = (H)(L) */
	subs	r2, r2, #2		/* loop test, placed after the load */

	/* Three exclusive-ors split (H)(L) into (L)(L) and (H)(H). */
	eor	ip, r3, r3, lsr #16	/* ip = (H)(H^L) */
	eor	r3, r3, ip, lsl #16	/* r3 = (H^H^L)(L) = (L)(L) */
	eor	ip, ip, r3, lsr #16	/* ip = (H)(H^L^L) = (H)(H) */

	str	r3, [r0]		/* low value goes out first */
	str	ip, [r0]

	/*
	 * Alternative sequence kept for reference:
	 *
	 *	mov	ip, r3, lsl #16
	 *	orr	ip, ip, ip, lsr #16
	 *	str	ip, [r0]
	 *
	 *	mov	ip, r3, lsr #16
	 *	orr	ip, ip, ip, lsl #16
	 *	str	ip, [r0]
	 */

	bgt	.Loutsw_pairs

	RET
END(outsw)
268
/*
 * insw16: read 16-bit values from an I/O address into a block of memory,
 * 16 bytes (eight values) per loop pass.
 *
 * Intended for a length that is a multiple of 16 bytes and a word
 * aligned destination; if the count is not a multiple of eight or r1 is
 * not word aligned, control is handed to insw instead.
 *
 * r0 = address to read from (IO); not advanced
 * r1 = address to write to (memory)
 * r2 = length, counted in 16-bit values
 */

ENTRY(insw16)
	/* Return immediately unless the count is positive. */
	cmp	r2, #0
	movle	pc, lr

	/* Anything the unrolled loop cannot handle goes to insw. */
	tst	r2, #7
	tsteq	r1, #3

	bne	_C_LABEL(insw)

	push	{r4, r5, lr}

.Linsw16_block:
	/*
	 * Four identical read pairs.  In each, the first load uses a non
	 * word aligned address on purpose and its result is shifted down
	 * to bits 15:0; the second load supplies bits 31:16.
	 */
	ldr	r3, [r0, #2]
	ldr	lr, [r0]
	lsr	r3, r3, #16
	orr	r3, r3, lr, lsl #16	/* word 0 */

	ldr	r4, [r0, #2]
	ldr	lr, [r0]
	lsr	r4, r4, #16
	orr	r4, r4, lr, lsl #16	/* word 1 */

	ldr	r5, [r0, #2]
	ldr	lr, [r0]
	lsr	r5, r5, #16
	orr	r5, r5, lr, lsl #16	/* word 2 */

	ldr	ip, [r0, #2]
	ldr	lr, [r0]
	lsr	ip, ip, #16
	orr	ip, ip, lr, lsl #16	/* word 3 */

	stmia	r1!, {r3-r5, ip}	/* store all four words at once */
	subs	r2, r2, #8		/* eight values consumed */
	bgt	.Linsw16_block

	pop	{r4, r5, pc}		/* restore r4/r5 and return */
END(insw16)
327
/*
 * outsw16: write 16-bit values from a block of memory to an I/O address,
 * 16 bytes (eight values) per loop pass.
 *
 * If the count is not a multiple of eight or the source is not word
 * aligned, control is handed to outsw instead.
 *
 * r0 = address to write to (IO); not advanced
 * r1 = address to read from (memory)
 * r2 = length, counted in 16-bit values
 *
 * Every store to r0 is a full word carrying the same 16-bit value in
 * both halves.
 */

ENTRY(outsw16)
	/* Return immediately unless the count is positive. */
	cmp	r2, #0
	movle	pc, lr

	/* Anything the unrolled loop cannot handle goes to outsw. */
	tst	r2, #7
	tsteq	r1, #3

	bne	_C_LABEL(outsw)

	push	{r4, r5, lr}

.Loutsw16_block:
	ldmia	r1!, {r4, r5, ip, lr}	/* four words = eight values */

	/*
	 * Each word (A)(B) is split by three exclusive-ors into (B)(B),
	 * left in r3, and (A)(A), left in the source register.  The low
	 * value (B) is written first.
	 */
	eor	r3, r4, r4, lsl #16	/* r3 = (A^B)(B) */
	eor	r4, r4, r3, lsr #16	/* r4 = (A)(B^A^B) = (A)(A) */
	eor	r3, r3, r4, lsl #16	/* r3 = (A^B^A)(B) = (B)(B) */
	str	r3, [r0]
	str	r4, [r0]

	/*
	 * Alternative sequence kept for reference:
	 *
	 *	mov	r3, r4, lsl #16
	 *	orr	r3, r3, r3, lsr #16
	 *	str	r3, [r0]
	 *
	 *	mov	r3, r4, lsr #16
	 *	orr	r3, r3, r3, lsl #16
	 *	str	r3, [r0]
	 */

	eor	r3, r5, r5, lsl #16	/* r3 = (A^B)(B) */
	eor	r5, r5, r3, lsr #16	/* r5 = (A)(B^A^B) = (A)(A) */
	eor	r3, r3, r5, lsl #16	/* r3 = (A^B^A)(B) = (B)(B) */
	str	r3, [r0]
	str	r5, [r0]

	eor	r3, ip, ip, lsl #16	/* r3 = (A^B)(B) */
	eor	ip, ip, r3, lsr #16	/* ip = (A)(B^A^B) = (A)(A) */
	eor	r3, r3, ip, lsl #16	/* r3 = (A^B^A)(B) = (B)(B) */
	str	r3, [r0]
	str	ip, [r0]

	eor	r3, lr, lr, lsl #16	/* r3 = (A^B)(B) */
	eor	lr, lr, r3, lsr #16	/* lr = (A)(B^A^B) = (A)(A) */
	eor	r3, r3, lr, lsl #16	/* r3 = (A^B^A)(B) = (B)(B) */
	str	r3, [r0]
	str	lr, [r0]

	subs	r2, r2, #8		/* eight values written */
	bgt	.Loutsw16_block

	pop	{r4, r5, pc}		/* restore r4/r5 and return */
END(outsw16)
394
/*
 * inswm8: read short ints (16 bits) from an I/O address into a block of
 * memory.
 * The I/O address is assumed to be mapped multiple times in a block of
 * 8 words, so up to eight values are fetched with one ldmia.
 * The destination address should be word aligned; if it is not, control
 * is handed to insw instead.
 *
 * r0 = address to read from (IO); not advanced
 * r1 = address to write to (memory)
 * r2 = length, counted in 16-bit values
 *
 * Only bits 15:0 of each word read from the device are kept.
 */

ENTRY(inswm8)
/* Make sure that we have a positive length */
	cmp	r2, #0x00000000
	movle	pc, lr

/* A destination that is not word aligned is handled by insw */

	tst	r1, #0x00000003

	bne	_C_LABEL(insw)

/* Word aligned insw */

	stmfd	sp!, {r4-r9,lr}

	mov	lr, #0xff000000
	orr	lr, lr, #0x00ff0000	/* lr = 0xffff0000, mask for bits 31:16 */

/* Eight values at a time: eight device words in, four packed words out */
.Linswm8_loop8:
	cmp	r2, #8
	bcc	.Linswm8_l8

	ldmia	r0, {r3-r9,ip}

	bic	r3, r3, lr		/* keep bits 15:0 of the 1st word */
	orr	r3, r3, r4, lsl #16	/* 2nd word becomes bits 31:16 */
	bic	r5, r5, lr
	orr	r4, r5, r6, lsl #16
	bic	r7, r7, lr
	orr	r5, r7, r8, lsl #16
	bic	r9, r9, lr
	orr	r6, r9, ip, lsl #16

	stmia	r1!, {r3-r6}

	subs	r2, r2, #0x00000008	/* Next */
	bne	.Linswm8_loop8
	b	.Linswm8_l1		/* count reached zero: done */

/* Fewer than eight left: four values if available */
.Linswm8_l8:
	cmp	r2, #4
	bcc	.Linswm8_l4

	ldmia	r0, {r3-r6}

	bic	r3, r3, lr
	orr	r3, r3, r4, lsl #16
	bic	r5, r5, lr
	orr	r4, r5, r6, lsl #16

	stmia	r1!, {r3-r4}

	subs	r2, r2, #0x00000004
	beq	.Linswm8_l1

/* Fewer than four left: two values if available */
.Linswm8_l4:
	cmp	r2, #2
	bcc	.Linswm8_l2

	ldmia	r0, {r3-r4}

	bic	r3, r3, lr
	orr	r3, r3, r4, lsl #16
	str	r3, [r1], #0x0004

	subs	r2, r2, #0x00000002
	beq	.Linswm8_l1

/* At most one value left: store it as two bytes */
.Linswm8_l2:
	cmp	r2, #1
	bcc	.Linswm8_l1

	ldr	r3, [r0]
	/*
	 * The count is not decremented here: nothing reads r2 or the
	 * flags between this point and the return.
	 */
	strb	r3, [r1], #0x0001	/* bits 7:0 */
	mov	r3, r3, lsr #8
	strb	r3, [r1], #0x0001	/* bits 15:8 */


.Linswm8_l1:
	ldmfd	sp!, {r4-r9,pc}		/* And go home */
END(inswm8)
491
/*
 * outswm8: write short ints (16 bits) to an I/O address from a block of
 * memory.
 * The I/O address is assumed to be mapped multiple times in a block of
 * 8 words, so up to eight values are written with one stmia.
 * The source address should be word aligned; if it is not, control is
 * handed to outsw instead.
 *
 * r0 = address to write to (IO); not advanced
 * r1 = address to read from (memory)
 * r2 = length, counted in 16-bit values
 *
 * Every word written to the device carries the same 16-bit value in
 * both halves.
 */

ENTRY(outswm8)
/* Make sure that we have a positive length */
	cmp	r2, #0x00000000
	movle	pc, lr

/* A source that is not word aligned is handled by outsw */

	tst	r1, #0x00000003

	bne	_C_LABEL(outsw)

/* Word aligned outsw */

	stmfd	sp!, {r4-r8,lr}

/* Eight values at a time: four memory words in, eight device words out */
.Loutswm8_loop8:
	cmp	r2, #8
	bcc	.Loutswm8_l8

	ldmia	r1!, {r3,r5,r7,ip}

	/*
	 * Each source word (A)(B) is split by three exclusive-ors into
	 * (B)(B), left in the source register, and (A)(A), left in the
	 * next register up, so the stmia below emits the low value first.
	 */
	eor	r4, r3, r3, lsr #16	/* r4 = (A)(A^B) */
	eor	r3, r3, r4, lsl #16	/* r3 = (A^A^B)(B) = (B)(B) */
	eor	r4, r4, r3, lsr #16	/* r4 = (A)(A^B^B) = (A)(A) */

	eor	r6, r5, r5, lsr #16	/* r6 = (A)(A^B) */
	eor	r5, r5, r6, lsl #16	/* r5 = (A^A^B)(B) = (B)(B) */
	eor	r6, r6, r5, lsr #16	/* r6 = (A)(A^B^B) = (A)(A) */

	eor	r8, r7, r7, lsr #16	/* r8 = (A)(A^B) */
	eor	r7, r7, r8, lsl #16	/* r7 = (A^A^B)(B) = (B)(B) */
	eor	r8, r8, r7, lsr #16	/* r8 = (A)(A^B^B) = (A)(A) */

	eor	lr, ip, ip, lsr #16	/* lr = (A)(A^B) */
	eor	ip, ip, lr, lsl #16	/* ip = (A^A^B)(B) = (B)(B) */
	eor	lr, lr, ip, lsr #16	/* lr = (A)(A^B^B) = (A)(A) */

	stmia	r0, {r3-r8,ip,lr}

	subs	r2, r2, #0x00000008	/* Next */
	bne	.Loutswm8_loop8
	b	.Loutswm8_l1		/* count reached zero: done */

/* Fewer than eight left: four values if available */
.Loutswm8_l8:
	cmp	r2, #4
	bcc	.Loutswm8_l4

	ldmia	r1!, {r3-r4}

	eor	r6, r3, r3, lsr #16	/* r6 = (A)(A^B) */
	eor	r5, r3, r6, lsl #16	/* r5 = (A^A^B)(B) = (B)(B) */
	eor	r6, r6, r5, lsr #16	/* r6 = (A)(A^B^B) = (A)(A) */

	eor	r8, r4, r4, lsr #16	/* r8 = (A)(A^B) */
	eor	r7, r4, r8, lsl #16	/* r7 = (A^A^B)(B) = (B)(B) */
	eor	r8, r8, r7, lsr #16	/* r8 = (A)(A^B^B) = (A)(A) */

	stmia	r0, {r5-r8}

	subs	r2, r2, #0x00000004
	beq	.Loutswm8_l1

/* Fewer than four left: two values if available */
.Loutswm8_l4:
	cmp	r2, #2
	bcc	.Loutswm8_l2

	ldr	r3, [r1], #0x0004	/* r3 = (A)(B) */
	subs	r2, r2, #0x00000002	/* Done test, used by the beq below */

	eor	r5, r3, r3, lsr #16	/* r5 = (A)(A^B)*/
	eor	r4, r3, r5, lsl #16	/* r4 = (A^A^B)(B) = (B)(B) */
	eor	r5, r5, r4, lsr #16	/* r5 = (A)(A^B^B) = (A)(A) */

	stmia	r0, {r4, r5}

	beq	.Loutswm8_l1

/* At most one value left: build it from two byte loads */
.Loutswm8_l2:
	cmp	r2, #1
	bcc	.Loutswm8_l1

	ldrb	r3, [r1], #0x0001	/* bits 7:0 */
	ldrb	r4, [r1], #0x0001	/* bits 15:8 */
	/*
	 * The count is not decremented here: nothing reads r2 or the
	 * flags between this point and the return.
	 */
	orr	r3, r3, r4, lsl #8
	orr	r3, r3, r3, lsl #16	/* copy the value into bits 31:16 */
	str	r3, [r0]

.Loutswm8_l1:
	ldmfd	sp!, {r4-r8,pc}		/* And go home */
END(outswm8)
596
597