/* Copyright (c) 2012, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Unaligned accesses
 *
 */

#include <sys/elf_common.h>

#include <machine/asm.h>

#define dstin		x0
#define val		w1
#define count		x2
#define tmp1		x3
#define tmp1w		w3
#define tmp2		x4
#define tmp2w		w4
#define zva_len_x	x5
#define zva_len		w5
#define zva_bits_x	x6

#define A_l		x7
#define A_lw		w7
#define dst		x8
#define tmp3w		w9

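/*
 * memset(dstin, val, count): replicate the low byte of val across a
 * 64-bit register and store it with paired 16-byte stores.  When the
 * fill value is zero and DC ZVA is usable, large buffers are instead
 * cleared one whole ZVA block at a time.
 */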
ENTRY(memset)

	mov	dst, dstin		/* Preserve return value.  */
	ands	A_lw, val, #255
#ifndef DONT_USE_DC
	b.eq	.Lzero_mem
#endif
	orr	A_lw, A_lw, A_lw, lsl #8
	orr	A_lw, A_lw, A_lw, lsl #16
	orr	A_l, A_l, A_l, lsl #32
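	/* A_l now holds the fill byte replicated across all eight bytes.  */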
.Ltail_maybe_long:
	cmp	count, #64
	b.ge	.Lnot_short
.Ltail_maybe_tiny:
	cmp	count, #15
	b.le	.Ltail15tiny
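	/* Store whole 16-byte blocks for the 0x30 part of count, then let
	 * .Ltail15 cover the last 0-15 bytes with an overlapping store.  */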
.Ltail63:
	ands	tmp1, count, #0x30
	b.eq	.Ltail15
	add	dst, dst, tmp1
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	stp	A_l, A_l, [dst, #-48]
1:
	stp	A_l, A_l, [dst, #-32]
2:
	stp	A_l, A_l, [dst, #-16]

.Ltail15:
	and	count, count, #15
	add	dst, dst, count
	stp	A_l, A_l, [dst, #-16]	/* Repeat some/all of last store. */
	ret

.Ltail15tiny:
	/* Set up to 15 bytes.  Does not assume earlier memory
	   being set.  */
	tbz	count, #3, 1f
	str	A_l, [dst], #8
1:
	tbz	count, #2, 1f
	str	A_lw, [dst], #4
1:
	tbz	count, #1, 1f
	strh	A_lw, [dst], #2
1:
	tbz	count, #0, 1f
	strb	A_lw, [dst]
1:
	ret

	/* Critical loop.  Start at a new cache line boundary.  Assuming
	 * 64 bytes per line, this ensures the entire loop is in one line.  */
	.p2align 6
.Lnot_short:
	neg	tmp2, dst
	ands	tmp2, tmp2, #15
	b.eq	2f
	/* Bring DST to 128-bit (16-byte) alignment.  We know that there's
	 * more than that to set, so we simply store 16 bytes and advance by
	 * the amount required to reach alignment.  */
	sub	count, count, tmp2
	stp	A_l, A_l, [dst]
	add	dst, dst, tmp2
	/* There may be 63 or fewer bytes to go now.  */
	cmp	count, #63
	b.le	.Ltail63
2:
	sub	dst, dst, #16		/* Pre-bias.  */
	sub	count, count, #64
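	/* Main loop: four paired stores write 64 bytes per iteration; the
	 * pre-bias plus the writeback on the final stp give a single
	 * address update per pass.  */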
1:
	stp	A_l, A_l, [dst, #16]
	stp	A_l, A_l, [dst, #32]
	stp	A_l, A_l, [dst, #48]
	stp	A_l, A_l, [dst, #64]!
	subs	count, count, #64
	b.ge	1b
	tst	count, #0x3f
	add	dst, dst, #16
	b.ne	.Ltail63
	ret

	/* For zeroing memory, check to see if we can use the ZVA feature to
	 * zero entire 'cache' lines.  */
.Lzero_mem:
	mov	A_l, #0
	cmp	count, #63
	b.le	.Ltail_maybe_tiny
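	/* Align dst to 16 bytes first, exactly as in .Lnot_short.  */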
	neg	tmp2, dst
	ands	tmp2, tmp2, #15
	b.eq	1f
	sub	count, count, tmp2
	stp	A_l, A_l, [dst]
	add	dst, dst, tmp2
	cmp	count, #63
	b.le	.Ltail63
1:
	/* For zeroing small amounts of memory, it's not worth setting up
	 * the line-clear code.  */
	cmp	count, #128
	b.lt	.Lnot_short

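	/* zva_len is the DC ZVA block size in bytes, read from
	 * dczva_line_size; zero means the DC ZVA path must not be used.  */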
	adrp	tmp2, dczva_line_size
	add	tmp2, tmp2, :lo12:dczva_line_size
	ldr	zva_len, [tmp2]
	cbz	zva_len, .Lnot_short

.Lzero_by_line:
	/* Compute how far we need to go to become suitably aligned.  We're
	 * already at quad-word alignment.  */
	cmp	count, zva_len_x
	b.lt	.Lnot_short		/* Not enough to reach alignment.  */
	sub	zva_bits_x, zva_len_x, #1
	neg	tmp2, dst
	ands	tmp2, tmp2, zva_bits_x
	b.eq	1f			/* Already aligned.  */
	/* Not aligned, check that there's enough to zero after alignment.  */
	sub	tmp1, count, tmp2
	cmp	tmp1, #64
	ccmp	tmp1, zva_len_x, #8, ge	/* NZCV=0b1000 */
	b.lt	.Lnot_short
	/* We know that there's at least 64 bytes to zero and that it's safe
	 * to overrun by 64 bytes.  */
	mov	count, tmp1
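	/* Zero 64 bytes per pass until dst reaches a ZVA block boundary;
	 * tmp2 counts the bytes still needed for alignment and goes
	 * negative on the final pass, so dst is pulled back afterwards.  */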
2:
	stp	A_l, A_l, [dst]
	stp	A_l, A_l, [dst, #16]
	stp	A_l, A_l, [dst, #32]
	subs	tmp2, tmp2, #64
	stp	A_l, A_l, [dst, #48]
	add	dst, dst, #64
	b.ge	2b
	/* We've overrun a bit, so adjust dst downwards.  */
	add	dst, dst, tmp2
1:
	sub	count, count, zva_len_x
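	/* count is pre-biased by one block, so the loop below clears one
	 * whole ZVA block per pass while at least one full block remains.  */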
3:
	dc	zva, dst
	add	dst, dst, zva_len_x
	subs	count, count, zva_len_x
	b.ge	3b
	ands	count, count, zva_bits_x
	b.ne	.Ltail_maybe_long
	ret
END(memset)

GNU_PROPERTY_AARCH64_FEATURE_1_NOTE(GNU_PROPERTY_AARCH64_FEATURE_1_VAL)