1/*	$OpenBSD: memset.S,v 1.2 2015/08/31 02:53:57 guenther Exp $	*/
2/*	$NetBSD: memset.S,v 1.1 2005/12/20 19:28:50 christos Exp $	*/
3
4/*-
5 * Copyright (c) 2002 SHIMIZU Ryo.  All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 * 3. The name of the author may not be used to endorse or promote products
16 *    derived from this software without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
19 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
20 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
21 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
22 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
23 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 */
29
30#include "SYS.h"
31
/*
 * Register assignments.  REG_DST, REG_LEN and (memset only) REG_C are read
 * by the code without being initialised first, i.e. they are the incoming
 * arguments; the other names are scratch registers.
 */
32#define	REG_PTR				r0	/* end pointer (dst + len) on the large path */
33#define	REG_TMP1			r1	/* scratch: constants, masks, dst + 16 */
34
35#ifdef BZERO
36# define	REG_C			r2	/* fill value; the code loads it with 0 */
37# define	REG_DST			r4	/* destination pointer */
38# define	REG_LEN			r5	/* byte count */
39#else
40# define	REG_DST0		r3	/* copy of the original dst, moved to r0 on return */
41# define	REG_DST			r4	/* destination pointer */
42# define	REG_C			r5	/* fill byte */
43# define	REG_LEN			r6	/* byte count */
44#endif
45
/*
 * Entry point: memset when BZERO is undefined, bzero otherwise.
 *
 * Dispatch on len (unsigned compares):
 *   len >= 28        -> large
 *   12 <= len < 28   -> small
 *   len < 12         -> byte loop below, storing downwards from dst + len
 *
 * bt/s and bf/s are delayed branches: the instruction that follows each of
 * them is executed both when the branch is taken and on fall-through.
 */
46#ifdef BZERO
47ENTRY(bzero)
48#else
49ENTRY(memset)
50	mov	REG_DST,REG_DST0	/* for return value */
51#endif
52	/* small amount to fill ? */
53	mov	#28,REG_TMP1
54	cmp/hs	REG_TMP1,REG_LEN	/* if (len >= 28) goto large; */
55	bt/s	large
56	mov	#12,REG_TMP1		/* if (len >= 12) goto small; */
57	cmp/hs	REG_TMP1,REG_LEN
58	bt/s	small
59#ifdef BZERO
60	mov	#0,REG_C		/* bzero: fill value is 0 (delay slot of the bt/s above) */
61#endif
62	/* very little fill (0 ~ 11 bytes) */
63	tst	REG_LEN,REG_LEN		/* T = (len == 0) */
64	add	REG_DST,REG_LEN		/* REG_LEN = dst + len (end pointer) */
65	bt/s	done			/* len == 0: nothing to store */
66	add	#1,REG_DST		/* REG_DST = dst + 1, equals REG_LEN when one byte is left */
67
68	/* byte loop, unrolled 4 times; T is computed before each store */
69	cmp/eq	REG_DST,REG_LEN		/* T = (exactly one byte left) */
701:	mov.b	REG_C,@-REG_LEN		/* *--end = c; */
71	bt/s	done			/* that was the last byte */
72	cmp/eq	REG_DST,REG_LEN		/* delay slot: T for the next store */
73	mov.b	REG_C,@-REG_LEN
74	bt/s	done
75	cmp/eq	REG_DST,REG_LEN
76	mov.b	REG_C,@-REG_LEN
77	bt/s	done
78	cmp/eq	REG_DST,REG_LEN
79	mov.b	REG_C,@-REG_LEN
80	bf/s	1b			/* more bytes left: go round again */
81	cmp/eq	REG_DST,REG_LEN
82done:
83#ifdef BZERO
84	rts
85	nop
86#else
87	rts
88	mov	REG_DST0,r0		/* delay slot: return the original dst */
89#endif
90
91
/*
 * small: 12 <= len < 28.
 *
 * An even dst is handed over to small_aligned.  For an odd dst, jump into
 * the table of byte stores below so that exactly len of them execute:
 * each instruction occupies 2 bytes, hence the target is 1f - 2 * len.
 * The table stores from r0 with displacements 0..15, so REG_TMP1 = dst + 16
 * is used as the base for the upper half.  Since len < 28, the first few
 * entries are never reached.
 */
92small:
93	mov	REG_DST,r0
94	tst	#1,r0			/* T = (dst is even) */
95	bt/s	small_aligned
96	mov	REG_DST,REG_TMP1	/* delay slot: REG_TMP1 = dst */
97	shll	REG_LEN			/* REG_LEN = len * 2 = bytes of code to execute */
98	mova	1f,r0			/* 1f must be 4bytes aligned! */
99	add	#16,REG_TMP1		/* REG_TMP1 = dst+16; */
100	sub	REG_LEN,r0		/* r0 = 1f - len * 2 */
101	jmp	@r0
102	mov	REG_C,r0		/* delay slot: r0 = fill value */
103
104	.align	2
105	mov.b	r0,@(15,REG_TMP1)	/* dst[31] */
106	mov.b	r0,@(14,REG_TMP1)
107	mov.b	r0,@(13,REG_TMP1)
108	mov.b	r0,@(12,REG_TMP1)
109	mov.b	r0,@(11,REG_TMP1)
110	mov.b	r0,@(10,REG_TMP1)
111	mov.b	r0,@(9,REG_TMP1)
112	mov.b	r0,@(8,REG_TMP1)
113	mov.b	r0,@(7,REG_TMP1)
114	mov.b	r0,@(6,REG_TMP1)
115	mov.b	r0,@(5,REG_TMP1)
116	mov.b	r0,@(4,REG_TMP1)
117	mov.b	r0,@(3,REG_TMP1)
118	mov.b	r0,@(2,REG_TMP1)
119	mov.b	r0,@(1,REG_TMP1)
120	mov.b	r0,@REG_TMP1		/* dst[16] */
121	mov.b	r0,@(15,REG_DST)	/* dst[15] */
122	mov.b	r0,@(14,REG_DST)
123	mov.b	r0,@(13,REG_DST)
124	mov.b	r0,@(12,REG_DST)
125	mov.b	r0,@(11,REG_DST)
126	mov.b	r0,@(10,REG_DST)
127	mov.b	r0,@(9,REG_DST)
128	mov.b	r0,@(8,REG_DST)
129	mov.b	r0,@(7,REG_DST)
130	mov.b	r0,@(6,REG_DST)
131	mov.b	r0,@(5,REG_DST)
132	mov.b	r0,@(4,REG_DST)
133	mov.b	r0,@(3,REG_DST)
134	mov.b	r0,@(2,REG_DST)
135	mov.b	r0,@(1,REG_DST)
136#ifdef BZERO
137	rts				/* occupies one table slot ... */
1381:	mov.b	r0,@REG_DST		/* ... and the dst[0] store runs in its delay slot */
139#else
140	mov.b	r0,@REG_DST		/* dst[0] */
1411:	rts
142	mov	REG_DST0,r0		/* delay slot: return the original dst */
143#endif
144
145
146/*
 * Small fill (reached from "small") with a 2-byte aligned dst.
 *
 * An odd len is made even by storing the last byte separately.  The rest
 * is written by jumping into the table of halfword stores below: each
 * 2-byte instruction stores 2 bytes, so the target is simply 1f - len.
 */
147small_aligned:
148#ifndef BZERO
149	extu.b	REG_C,REG_TMP1		/* REG_C = ??????xx, REG_TMP1 = 000000xx */
150	shll8	REG_C			/* REG_C = ????xx00, REG_TMP1 = 000000xx */
151	or	REG_TMP1,REG_C		/* REG_C = ????xxxx */
152#endif
153
154	mov	REG_LEN,r0
155	tst	#1,r0			/* is len even? */
156	bt/s	1f
157	add	#-1,r0			/* delay slot: r0 = len - 1 (only used when len is odd) */
158	mov.b	REG_C,@(r0,REG_DST)	/* fill the last byte: dst[len - 1] = c */
159	mov	r0,REG_LEN		/* len is now even */
1601:
161
162	mova	1f,r0			/* 1f must be 4bytes aligned! */
163	sub	REG_LEN,r0		/* r0 = 1f - len */
164	jmp	@r0
165	mov	REG_C,r0		/* delay slot: r0 = 16-bit fill pattern */
166
167	.align	2
168	mov.w	r0,@(30,REG_DST)
169	mov.w	r0,@(28,REG_DST)
170	mov.w	r0,@(26,REG_DST)
171	mov.w	r0,@(24,REG_DST)
172	mov.w	r0,@(22,REG_DST)
173	mov.w	r0,@(20,REG_DST)
174	mov.w	r0,@(18,REG_DST)
175	mov.w	r0,@(16,REG_DST)
176	mov.w	r0,@(14,REG_DST)
177	mov.w	r0,@(12,REG_DST)
178	mov.w	r0,@(10,REG_DST)
179	mov.w	r0,@(8,REG_DST)
180	mov.w	r0,@(6,REG_DST)
181	mov.w	r0,@(4,REG_DST)
182	mov.w	r0,@(2,REG_DST)
183#ifdef BZERO
184	rts				/* occupies one table slot ... */
1851:	mov.w	r0,@REG_DST		/* ... and the final store runs in its delay slot */
186#else
187	mov.w	r0,@REG_DST
1881:	rts
189	mov	REG_DST0,r0		/* delay slot: return the original dst */
190#endif
191
192
193
/*
 * large: len >= 28.
 *
 * Replicate the fill byte into all four bytes of REG_C, let unaligned_dst /
 * unaligned_len bring both ends of the region to 4-byte alignment, then
 * fill downwards from REG_PTR = dst + len: 32 bytes per iteration first,
 * followed by the remaining longwords one at a time.
 */
194	.align	2
195large:
196#ifdef BZERO
197	mov	#0,REG_C
198#else
199	extu.b	REG_C,REG_TMP1		/* REG_C = ??????xx, REG_TMP1 = ????00xx */
200	shll8	REG_C			/* REG_C = ????xx00, REG_TMP1 = ????00xx */
201	or	REG_C,REG_TMP1		/* REG_C = ????xx00, REG_TMP1 = ????xxxx */
202	swap.w	REG_TMP1,REG_C		/* REG_C = xxxx????, REG_TMP1 = ????xxxx */
203	xtrct	REG_TMP1,REG_C		/* REG_C = xxxxxxxx */
204#endif
205
206	mov	#3,REG_TMP1
207	tst	REG_TMP1,REG_DST	/* T = (dst is 4-byte aligned) */
208	mov	REG_DST,REG_PTR
209	bf/s	unaligned_dst
210	add	REG_LEN,REG_PTR		/* REG_PTR = dst + len; */
211	tst	REG_TMP1,REG_LEN	/* T = (len is a multiple of 4) */
212	bf/s	unaligned_len		/* its delay slot is the mov #32 below */
213
214aligned:
215	/* fill 32*n bytes */
216	mov	#32,REG_TMP1
217	cmp/hi	REG_LEN,REG_TMP1	/* T = (32 > len) */
218	bt	9f			/* fewer than 32 bytes: skip the block loop */
219	.align	2
2201:	sub	REG_TMP1,REG_PTR	/* ptr -= 32 */
221	mov.l	REG_C,@REG_PTR
222	sub	REG_TMP1,REG_LEN	/* len -= 32 */
223	mov.l	REG_C,@(4,REG_PTR)
224	cmp/hi	REG_LEN,REG_TMP1	/* T = (32 > len), consumed by the bf/s below */
225	mov.l	REG_C,@(8,REG_PTR)
226	mov.l	REG_C,@(12,REG_PTR)
227	mov.l	REG_C,@(16,REG_PTR)
228	mov.l	REG_C,@(20,REG_PTR)
229	mov.l	REG_C,@(24,REG_PTR)
230	bf/s	1b			/* at least 32 bytes left: next block */
231	mov.l	REG_C,@(28,REG_PTR)	/* delay slot: last longword of this block */
2329:
233
234	/* fill the remaining 4*n bytes */
235	cmp/eq	REG_DST,REG_PTR
236	bt	9f			/* ptr == dst: nothing left */
237	add	#4,REG_DST		/* REG_DST = dst + 4, equals REG_PTR when one longword is left */
238	cmp/eq	REG_DST,REG_PTR		/* T = (exactly one longword left) */
2391:	mov.l	REG_C,@-REG_PTR		/* *--(u_int32_t*)ptr = c; */
240	bt/s	9f			/* that was the last longword */
241	cmp/eq	REG_DST,REG_PTR		/* delay slot: T for the next store */
242	mov.l	REG_C,@-REG_PTR
243	bt/s	9f
244	cmp/eq	REG_DST,REG_PTR
245	mov.l	REG_C,@-REG_PTR
246	bt/s	9f
247	cmp/eq	REG_DST,REG_PTR
248	mov.l	REG_C,@-REG_PTR
249	bf/s	1b
250	cmp/eq	REG_DST,REG_PTR
2519:
252#ifdef BZERO
253	rts
254	nop
255#else
256	rts
257	mov	REG_DST0,r0		/* delay slot: return the original dst */
258#endif
259
260
/*
 * unaligned_dst: dst is not 4-byte aligned.  Store a leading byte and/or
 * halfword so that REG_DST becomes 4-byte aligned, then trim the tail as
 * well if the end pointer is not aligned.
 *
 * unaligned_len: also entered directly from "large" when dst is aligned
 * but len is not a multiple of 4.  Stores a trailing byte and/or halfword
 * until REG_PTR is 4-byte aligned.
 *
 * Both paths finish by recomputing REG_LEN = REG_PTR - REG_DST and
 * branching back to "aligned".
 */
261unaligned_dst:
262	mov	#1,REG_TMP1
263	tst	REG_TMP1,REG_DST	/* if (dst & 1) {               */
264	add	#1,REG_TMP1		/* REG_TMP1 = 2 (mask for the next test) */
265	bt/s	2f
266	tst	REG_TMP1,REG_DST	/* delay slot: T = !(dst & 2) */
267	mov.b	REG_C,@REG_DST		/*   *dst++ = c;                */
268	add	#1,REG_DST
269	tst	REG_TMP1,REG_DST	/* redo the (dst & 2) test on the new dst */
2702:					/* }                            */
271					/* if (dst & 2) {               */
272	bt	4f
273	mov.w	REG_C,@REG_DST		/*   *(u_int16_t*)dst++ = c;    */
274	add	#2,REG_DST
2754:					/* }                            */
276
277
278	tst	#3,REG_PTR		/* if (ptr & 3) {               */
279	bt/s	4f			/*                              */
280unaligned_len:
281	tst	#1,REG_PTR		/*   if (ptr & 1) {             */
282	bt/s	2f
283	tst	#2,REG_PTR		/* delay slot: bit 1 is unchanged by decrementing an odd ptr */
284	mov.b	REG_C,@-REG_PTR		/*     *--ptr = c;              */
2852:					/*   }                          */
286					/*   if (ptr & 2) {             */
287	bt	4f
288	mov.w	REG_C,@-REG_PTR		/*     *--(u_int16_t*)ptr = c;  */
2894:					/*   }                          */
290					/* }                            */
291
292	mov	REG_PTR,REG_LEN
293	bra	aligned
294	sub	REG_DST,REG_LEN		/* delay slot: REG_LEN = ptr - dst */
295
	/* close the function opened by ENTRY(); macros presumably come from SYS.h */
296#ifdef BZERO
297END_WEAK(bzero)
298#else
299END_STRONG(memset)
300#endif
301