/*-
 * Copyright (C) 2016 Cavium Inc.
 * All rights reserved.
 *
 * Developed by Semihalf.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <machine/asm.h>
__FBSDID("$FreeBSD$");


#include "assym.s"

	/*
	 * void bzero(void *p, size_t size)
	 *
	 *  x0 - p
	 *  x1 - size
	 */
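	/*
	 * Rough outline of the strategy below (illustrative C-like
	 * sketch only; "line" stands for dczva_line_size):
	 *
	 *	if (size > 16 && line != 0 && size >= line) {
	 *		zero bytes up to the next cache line boundary;
	 *		issue dc zva for every full cache line;
	 *		zero the remaining tail;
	 *	} else {
	 *		zero bytes up to an 8/16 byte boundary, then
	 *		16 bytes at a time with stp, then the tail.
	 *	}
	 */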
ENTRY(bzero)
	cbz	x1, ending

	/*
	 * x5 is the number of cache lines to zero - it is calculated
	 * later and becomes non-zero if the buffer is long enough to be
	 * zeroed by cache lines (and if that is allowed).
	 * It must be cleared here, before buffers of 16 bytes or less
	 * take the short path - otherwise x5 would never be calculated
	 * and would retain a random value.
	 * "normal" is used both for buffers <= 16 bytes and to align the
	 * buffer to a cache line when the buffer is bigger than a cache
	 * line; a non-zero x5 after "normal" has completed indicates
	 * that it was used to align the buffer and that zeroing by cache
	 * lines will now be performed, with x5 holding the number of
	 * cache lines to loop through.
	 */
	mov	x5, xzr

	/* Do not use cache assisted zeroing for buffers with size <= 16 */
	cmp	x1, #0x10
	b.le	normal

	/*
	 * Load the size of the line that will be zeroed by the dc zva
	 * call. 0 means that the instruction is not allowed
	 */
	ldr	x7, =dczva_line_size
	ldr	x7, [x7]
	cbz	x7, normal

	/*
	 * Buffer must be at least as large as a cache line to use cache
	 * zeroing (and cache line aligned, but that is checked after
	 * the jump)
	 */
	cmp	x1, x7
	b.lt	normal

	/*
	 * Calculate number of bytes to cache aligned address (x4) and
	 * number of full cache lines (x5). x6 is final address to zero.
	 */
	sub	x2, x7, #0x01
	mov	x3, -1
	eor	x3, x3, x2
	add	x4, x0, x2
	and	x4, x4, x3
	subs	x4, x4, x0
	b.eq	normal
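	/*
	 * In C-like terms the computation above is (illustrative only,
	 * "p" being the original pointer and "line" the dc zva size):
	 *	x4 = ((p + (line - 1)) & ~(line - 1)) - p;
	 * i.e. the number of bytes up to the next cache line boundary.
	 * If p is already aligned, the normal path is taken with x5
	 * still zero.
	 */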

	/* Calculate number of "lines" in buffer */
	sub	x5, x1, x4
	rbit	x2, x7
	clz	x2, x2
	lsr	x5, x5, x2
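	/*
	 * rbit + clz above compute log2(x7) - the dc zva block size is
	 * always a power of two - so the shift divides by the line
	 * size: x5 = (size - x4) / line.
	 */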

	/*
	 * If the number of cache lines is 0 we will not be able to zero
	 * by cache lines, so take the normal path instead.
	 */
	cbz	x5, normal
	/* x6 is final address to zero */
	add	x6, x0, x1

	/*
	 * We get here with x5 non-zero, so "normal" will be used to
	 * align the buffer before cache zeroing. x4 holds the number of
	 * bytes needed for alignment.
	 */
	mov	x1, x4

	/* When jumping here: x0 holds pointer, x1 holds size */
normal:
	/*
	 * Get the buffer offset from a 16 byte aligned address; 0 means
	 * the pointer is already aligned.
	 */
	ands	x2, x0, #0x0f
	b.eq	aligned_to_16
	/* Number of one-byte stores needed to reach an 8 byte aligned address */
	ands	x2, x2, #0x07
	mov	x3, #0x08
	sub	x2, x3, x2
	/* x2 is number of bytes missing for alignment, x1 is buffer size */
	cmp	x1, x2
	csel	x2, x1, x2, le
	sub	x1, x1, x2
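	/*
	 * The csel above clamps the alignment count to the buffer size,
	 * roughly: x2 = min(size, x2); size -= x2;
	 */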

	/*
	 * The byte-by-byte loop zeroes at least enough bytes to align
	 * the pointer and at most "size" bytes.
	 */
align:
	strb	wzr, [x0], #0x01
	subs	x2, x2, #0x01
	b.ne	align

	/* Now the pointer is aligned to 8 bytes */
	cmp	x1, #0x10
	b.lt	lead_out
	/*
	 * Check whether another 8 byte store is needed to reach a
	 * 16 byte aligned address and do it if so
	 */
	tbz	x0, #0x03, aligned_to_16
	str	xzr, [x0], #0x08
	sub	x1, x1, #0x08

	/* When jumping here: x0 is 16 byte aligned address, x1 is size */
aligned_to_16:
	/* If size is less than 16 bytes, use lead_out to zero what remains */
	cmp	x1, #0x10
	b.lt	lead_out

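	/*
	 * x2 = size / 16: the number of 16 byte blocks to zero with
	 * paired 64 bit stores (stp xzr, xzr).
	 */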
	lsr	x2, x1, #0x04
zero_by_16:
	stp	xzr, xzr, [x0], #0x10
	subs	x2, x2, #0x01
	b.ne	zero_by_16

	/*
	 * Lead out requires the address to be aligned to 8 bytes. It is
	 * used to zero buffers with sizes < 16 and whatever cannot be
	 * zeroed by the zero_by_16 loop.
	 */
	ands	x1, x1, #0x0f
	b.eq	lead_out_end
lead_out:
	tbz	x1, #0x03, lead_out_dword
	str	xzr, [x0], #0x08
lead_out_dword:
	tbz	x1, #0x02, lead_out_word
	str	wzr, [x0], #0x04
lead_out_word:
	tbz	x1, #0x01, lead_out_byte
	strh	wzr, [x0], #0x02
lead_out_byte:
	tbz	x1, #0x00, lead_out_end
	strb	wzr, [x0], #0x01

lead_out_end:
	/*
	 * If x5 is non-zero, "normal" has been used as a lead-in to
	 * align the buffer address to the cache line size
	 */
	cbz	x5, ending

	/*
	 * Here x5 holds the number of lines to zero; x6 is the final
	 * address of the buffer. x0 is the cache line aligned pointer,
	 * x7 is the cache line size in bytes
	 */
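	/*
	 * Each dc zva zeroes one whole block of x7 bytes starting at
	 * the (block aligned) address in x0.
	 */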
cache_line_zero:
	dc	zva, x0
	add	x0, x0, x7
	subs	x5, x5, #0x01
	b.ne	cache_line_zero

	/* Need to zero remaining bytes? */
	subs	x1, x6, x0
	b.ne	normal
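	/*
	 * If bytes remain (x1 != 0) the branch above re-enters "normal";
	 * x5 is now zero, so that pass zeroes the tail (always smaller
	 * than a cache line) and then exits through lead_out_end to
	 * "ending".
	 */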

ending:
	ret

END(bzero)