/* Copyright (c) 2012, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Unaligned accesses
 *
 */

/* By default we assume that the DC ZVA instruction can be used to zero
   data blocks more efficiently.  In some circumstances this might be
   unsafe, for example in an asymmetric multiprocessor environment where
   CPUs report different DC ZVA block lengths (in which case neither the
   larger nor the smaller length is safe to rely on).  The feature can be
   disabled by defining DONT_USE_DC.

   If the code may be run in a virtualized environment, define MAYBE_VIRT.
   The code will then cache the system register value rather than
   re-reading it on each call.  */
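
/* For reference, the run-time decision boils down to reading DCZID_EL0:
   bits [3:0] (BS) give log2 of the zeroing block size in 4-byte words,
   and bit 4 (DZP) indicates that DC ZVA is prohibited.  A minimal C
   sketch of that query, purely illustrative and not part of this file,
   assuming a GCC/Clang-style compiler:

     #include <stdint.h>

     // Return the DC ZVA block size in bytes, or 0 if DC ZVA is prohibited.
     static inline unsigned int zva_block_size (void)
     {
       uint64_t dczid;
       __asm__ ("mrs %0, dczid_el0" : "=r" (dczid));
       if (dczid & (1u << 4))        // DZP set: DC ZVA must not be used.
         return 0;
       return 4u << (dczid & 15);    // 4 bytes per word, shifted by BS.
     }
*/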

#define dstin		x0
#define val		w1
#define count		x2
#define tmp1		x3
#define tmp1w		w3
#define tmp2		x4
#define tmp2w		w4
#define zva_len_x	x5
#define zva_len		w5
#define zva_bits_x	x6

#define A_l		x7
#define A_lw		w7
#define dst		x8
#define tmp3w		w9


	.macro def_fn f p2align=0
	.text
	.p2align \p2align
	.global \f
	.type \f, %function
\f:
	.endm

def_fn memset p2align=6

	mov	dst, dstin		/* Preserve return value.  */
	ands	A_lw, val, #255
#ifndef DONT_USE_DC
	b.eq	.Lzero_mem
#endif
	orr	A_lw, A_lw, A_lw, lsl #8
	orr	A_lw, A_lw, A_lw, lsl #16
	orr	A_l, A_l, A_l, lsl #32
.Ltail_maybe_long:
	cmp	count, #64
	b.ge	.Lnot_short
.Ltail_maybe_tiny:
	cmp	count, #15
	b.le	.Ltail15tiny
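
	/* .Ltail63 below finishes up to 63 remaining bytes on paths where the
	   final 16-byte store is known to stay inside the buffer (either at
	   least 16 bytes remain, or earlier bytes have already been written).
	   Bits 5:4 of count select zero to three 16-byte stores, and .Ltail15
	   then issues one last 16-byte store that may overlap what was just
	   written.  A rough C sketch of the idea, purely illustrative
	   (store16(q) stands for one 16-byte store of the fill pattern at q,
	   rem for the remaining byte count, p for the destination pointer):

	     size_t chunk = rem & 0x30;          // 0, 16, 32 or 48 bytes
	     p += chunk;
	     if (chunk == 48) store16 (p - 48);
	     if (chunk >= 32) store16 (p - 32);
	     if (chunk >= 16) store16 (p - 16);
	     p += rem & 15;
	     store16 (p - 16);                   // repeat some/all of last store
	   */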
.Ltail63:
	ands	tmp1, count, #0x30
	b.eq	.Ltail15
	add	dst, dst, tmp1
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	stp	A_l, A_l, [dst, #-48]
1:
	stp	A_l, A_l, [dst, #-32]
2:
	stp	A_l, A_l, [dst, #-16]

.Ltail15:
	and	count, count, #15
	add	dst, dst, count
	stp	A_l, A_l, [dst, #-16]	/* Repeat some/all of last store. */
	ret

.Ltail15tiny:
	/* Set up to 15 bytes.  Does not assume earlier memory
	   being set.  */
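	/* Equivalent logic, as an illustrative C sketch only (p is the
	   destination pointer; v64 holds the fill byte replicated to 64 bits,
	   so copying its low bytes stores the pattern; assumes <stdint.h>
	   and <string.h>):

	     if (count & 8) { memcpy (p, &v64, 8); p += 8; }
	     if (count & 4) { memcpy (p, &v64, 4); p += 4; }
	     if (count & 2) { memcpy (p, &v64, 2); p += 2; }
	     if (count & 1) { *p = (uint8_t) v64; }
	   */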
	tbz	count, #3, 1f
	str	A_l, [dst], #8
1:
	tbz	count, #2, 1f
	str	A_lw, [dst], #4
1:
	tbz	count, #1, 1f
	strh	A_lw, [dst], #2
1:
	tbz	count, #0, 1f
	strb	A_lw, [dst]
1:
	ret

	/* Critical loop.  Start at a new cache line boundary.  Assuming
	 * 64 bytes per line, this ensures the entire loop is in one line.  */
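	/* Each iteration of the loop at 1: below stores 64 bytes.  dst is
	   pre-biased by -16 so that the writeback store "[dst, #64]!" both
	   writes the last 16 bytes of a block and advances dst to the next
	   one.  Roughly, as an illustrative C sketch (store16(q) stands for
	   one 16-byte store of the fill pattern; count is treated as signed):

	     dst -= 16;
	     count -= 64;
	     do {
	       store16 (dst + 16);
	       store16 (dst + 32);
	       store16 (dst + 48);
	       store16 (dst + 64);
	       dst += 64;
	       count -= 64;
	     } while (count >= 0);
	     dst += 16;                 // undo the bias; 0..63 bytes remain
	   */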
	.p2align 6
.Lnot_short:
	neg	tmp2, dst
	ands	tmp2, tmp2, #15
	b.eq	2f
	/* Bring DST to 128-bit (16-byte) alignment.  We know that there's
	 * more than that to set, so we simply store 16 bytes and advance by
	 * the amount required to reach alignment.  */
	sub	count, count, tmp2
	stp	A_l, A_l, [dst]
	add	dst, dst, tmp2
	/* There may be fewer than 64 bytes to go now.  */
	cmp	count, #63
	b.le	.Ltail63
2:
	sub	dst, dst, #16		/* Pre-bias.  */
	sub	count, count, #64
1:
	stp	A_l, A_l, [dst, #16]
	stp	A_l, A_l, [dst, #32]
	stp	A_l, A_l, [dst, #48]
	stp	A_l, A_l, [dst, #64]!
	subs	count, count, #64
	b.ge	1b
	tst	count, #0x3f
	add	dst, dst, #16
	b.ne	.Ltail63
	ret

#ifndef DONT_USE_DC
	/* For zeroing memory, check to see if we can use the ZVA feature to
	 * zero entire 'cache' lines.  */
.Lzero_mem:
	mov	A_l, #0
	cmp	count, #63
	b.le	.Ltail_maybe_tiny
	neg	tmp2, dst
	ands	tmp2, tmp2, #15
	b.eq	1f
	sub	count, count, tmp2
	stp	A_l, A_l, [dst]
	add	dst, dst, tmp2
	cmp	count, #63
	b.le	.Ltail63
1:
	/* For zeroing small amounts of memory, it's not worth setting up
	 * the line-clear code.  */
	cmp	count, #128
	b.lt	.Lnot_short
#ifdef MAYBE_VIRT
	/* For efficiency when virtualized, we cache the ZVA capability.  */
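	/* .Lcache_clear (in .bss below) caches the result of the DCZID_EL0
	   probe across calls: 0 means not yet probed, a positive value is the
	   ZVA block length in bytes, and a negative value (bit 31 set)
	   records that DC ZVA must not be used, sending us to .Lnot_short.  */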
	adrp	tmp2, .Lcache_clear
	ldr	zva_len, [tmp2, #:lo12:.Lcache_clear]
	tbnz	zva_len, #31, .Lnot_short
	cbnz	zva_len, .Lzero_by_line
	mrs	tmp1, dczid_el0
	tbz	tmp1, #4, 1f
	/* ZVA not available.  Remember this for next time.  */
	mov	zva_len, #~0
	str	zva_len, [tmp2, #:lo12:.Lcache_clear]
	b	.Lnot_short
1:
	mov	tmp3w, #4
	and	zva_len, tmp1w, #15	/* Safety: other bits reserved.  */
	lsl	zva_len, tmp3w, zva_len
	str	zva_len, [tmp2, #:lo12:.Lcache_clear]
#else
	mrs	tmp1, dczid_el0
	tbnz	tmp1, #4, .Lnot_short
	mov	tmp3w, #4
	and	zva_len, tmp1w, #15	/* Safety: other bits reserved.  */
	lsl	zva_len, tmp3w, zva_len
#endif

.Lzero_by_line:
	/* Compute how far we need to go to become suitably aligned.  We're
	 * already at quad-word alignment.  */
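	/* Overall shape of the block-zeroing path, as an illustrative C
	   sketch only (dc_zva(q) stands for one "dc zva" on the block holding
	   q, store64(q) for 64 bytes of zero stores at q, mis/rem for the
	   temporaries held in tmp2/tmp1):

	     if (count < zva_len) goto not_short;   // cannot reach a boundary
	     mis = -dst & (zva_len - 1);            // bytes to the next block
	     if (mis != 0) {
	       rem = count - mis;
	       if (rem < 64 || rem < zva_len) goto not_short;
	       count = rem;
	       do {                                 // 64 bytes at a time; may
	         store64 (dst);                     // overrun the boundary by
	         dst += 64; mis -= 64;              // up to 64 bytes
	       } while (mis >= 0);
	       dst += mis;                          // pull back to the boundary
	     }
	     for (; count >= zva_len; count -= zva_len, dst += zva_len)
	       dc_zva (dst);                        // zero one whole block
	     count &= zva_len - 1;                  // tail handled elsewhere
	   */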
	cmp	count, zva_len_x
	b.lt	.Lnot_short		/* Not enough to reach alignment.  */
	sub	zva_bits_x, zva_len_x, #1
	neg	tmp2, dst
	ands	tmp2, tmp2, zva_bits_x
	b.eq	1f			/* Already aligned.  */
	/* Not aligned; check that there's enough to zero after alignment.  */
	sub	tmp1, count, tmp2
	cmp	tmp1, #64
	ccmp	tmp1, zva_len_x, #8, ge	/* NZCV=0b1000: force "lt" if tmp1 < 64.  */
	b.lt	.Lnot_short
	/* We know that there's at least 64 bytes to zero and that it's safe
	 * to overrun by 64 bytes.  */
	mov	count, tmp1
2:
	stp	A_l, A_l, [dst]
	stp	A_l, A_l, [dst, #16]
	stp	A_l, A_l, [dst, #32]
	subs	tmp2, tmp2, #64
	stp	A_l, A_l, [dst, #48]
	add	dst, dst, #64
	b.ge	2b
	/* We've overrun a bit, so adjust dst downwards.  */
	add	dst, dst, tmp2
1:
	sub	count, count, zva_len_x
3:
	dc	zva, dst
	add	dst, dst, zva_len_x
	subs	count, count, zva_len_x
	b.ge	3b
	ands	count, count, zva_bits_x
	b.ne	.Ltail_maybe_long
	ret
#ifdef MAYBE_VIRT
	.bss
	.p2align 2
.Lcache_clear:
	.space 4
#endif
#endif /* DONT_USE_DC */