/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2011, The Linux Foundation. All rights reserved.
 */


/* HEXAGON assembly optimized memset */
/* Replaces the standard library function memset */

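/*
 * Both versions below follow the same basic plan; what follows is a
 * rough, illustrative C sketch of that plan (not a line-for-line
 * translation -- the assembly aligns with individual byte/half/word
 * stores, and the v3 version additionally handles whole 32-byte cache
 * lines with dczeroa for large counts):
 *
 *	void *memset(void *dst, int c, size_t n)
 *	{
 *		unsigned char *p = dst;
 *		unsigned long long pat = 0x0101010101010101ULL * (unsigned char)c;
 *
 *		if (n > 8) {
 *			while ((unsigned long)p & 7) {	// align to 8 bytes
 *				*p++ = c;
 *				n--;
 *			}
 *			for (; n >= 8; n -= 8, p += 8)	// doubleword stores
 *				*(unsigned long long *)p = pat;
 *		}
 *		while (n--)				// trailing bytes
 *			*p++ = c;
 *		return dst;
 *	}
 */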

	.macro HEXAGON_OPT_FUNC_BEGIN name
	.text
	.p2align 4
	.globl \name
	.type  \name, @function
\name:
	.endm

	.macro HEXAGON_OPT_FUNC_FINISH name
	.size  \name, . - \name
	.endm

/* FUNCTION: memset (v2 version) */
#if __HEXAGON_ARCH__ < 3
HEXAGON_OPT_FUNC_BEGIN memset
	{
		r6 = #8
		r7 = extractu(r0, #3, #0)
		p0 = cmp.eq(r2, #0)
		p1 = cmp.gtu(r2, #7)
	}
	{
		r4 = vsplatb(r1)
		r8 = r0           /* leave r0 intact for return val  */
		r9 = sub(r6, r7)  /* bytes until double alignment  */
		if p0 jumpr r31   /* count == 0, so return  */
	}
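/*
 * r3:2 and r7:6 are used as 64-bit register pairs: with r3 = r7 = 0,
 * r3:2 = sub(r3:2, r7:6) below decrements the remaining count in r2
 * by the store size held in r6.
 */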
	{
		r3 = #0
		r7 = #0
		p0 = tstbit(r9, #0)
		if p1 jump 2f /* skip byte loop */
	}

/* less than 8 bytes to set, so just set a byte at a time and return  */

		loop0(1f, r2) /* byte loop */
	.falign
1: /* byte loop */
	{
		memb(r8++#1) = r4
	}:endloop0
		jumpr r31
	.falign
2: /* skip byte loop */
	{
		r6 = #1
		p0 = tstbit(r9, #1)
		p1 = cmp.eq(r2, #1)
		if !p0 jump 3f /* skip initial byte store */
	}
	{
		memb(r8++#1) = r4
		r3:2 = sub(r3:2, r7:6)
		if p1 jumpr r31
	}
	.falign
3: /* skip initial byte store */
	{
		r6 = #2
		p0 = tstbit(r9, #2)
		p1 = cmp.eq(r2, #2)
		if !p0 jump 4f /* skip initial half store */
	}
	{
		memh(r8++#2) = r4
		r3:2 = sub(r3:2, r7:6)
		if p1 jumpr r31
	}
	.falign
4: /* skip initial half store */
	{
		r6 = #4
		p0 = cmp.gtu(r2, #7)
		p1 = cmp.eq(r2, #4)
		if !p0 jump 5f /* skip initial word store */
	}
	{
		memw(r8++#4) = r4
		r3:2 = sub(r3:2, r7:6)
		p0 = cmp.gtu(r2, #11)
		if p1 jumpr r31
	}
	.falign
5: /* skip initial word store */
	{
		r10 = lsr(r2, #3)
		p1 = cmp.eq(r3, #1)  /* r3 is zero here, so clear p1 */
		if !p0 jump 7f /* skip double loop */
	}
	{
		r5 = r4
		r6 = #8
		loop0(6f, r10) /* double loop */
	}

/* set bytes a double word at a time  */

	.falign
6: /* double loop */
	{
		memd(r8++#8) = r5:4
		r3:2 = sub(r3:2, r7:6)
		p1 = cmp.eq(r2, #8)
	}:endloop0
	.falign
7: /* skip double loop */
	{
		p0 = tstbit(r2, #2)
		if p1 jumpr r31
	}
	{
		r6 = #4
		p0 = tstbit(r2, #1)
		p1 = cmp.eq(r2, #4)
		if !p0 jump 8f /* skip final word store */
	}
	{
		memw(r8++#4) = r4
		r3:2 = sub(r3:2, r7:6)
		if p1 jumpr r31
	}
	.falign
8: /* skip final word store */
	{
		p1 = cmp.eq(r2, #2)
		if !p0 jump 9f /* skip final half store */
	}
	{
		memh(r8++#2) = r4
		if p1 jumpr r31
	}
	.falign
9: /* skip final half store */
	{
		memb(r8++#1) = r4
		jumpr r31
	}
HEXAGON_OPT_FUNC_FINISH memset
#endif


/* FUNCTION: memset (v3 and higher version) */
#if __HEXAGON_ARCH__ >= 3
HEXAGON_OPT_FUNC_BEGIN memset
	{
		r7 = vsplatb(r1)         /* splat the fill byte across the word */
		r6 = r0                  /* leave r0 intact for return val */
		if (r2==#0) jump:nt .L1  /* count == 0, so return */
	}
	{
		r5:4 = combine(r7,r7)    /* 64-bit fill pattern */
		p0 = cmp.gtu(r2,#8)
		if (p0.new) jump:nt .L3
	}
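/* count is 8 or less: store a byte at a time and return */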
	{
		r3 = r0
		loop0(.L47,r2)
	}
	.falign
.L47:
	{
		memb(r3++#1) = r1
	}:endloop0 /* start=.L47 */
		jumpr r31
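/*
 * count > 8: align the destination for wider stores, starting with
 * a single byte if the address is odd
 */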
.L3:
	{
		p0 = tstbit(r0,#0)
		if (!p0.new) jump:nt .L8
		p1 = cmp.eq(r2, #1)
	}
	{
		r6 = add(r0, #1)
		r2 = add(r2,#-1)
		memb(r0) = r1
		if (p1) jump .L1
	}
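/* store a halfword if the destination is not word aligned */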
.L8:
	{
		p0 = tstbit(r6,#1)
		if (!p0.new) jump:nt .L10
	}
	{
		r2 = add(r2,#-2)
		memh(r6++#2) = r7
		p0 = cmp.eq(r2, #2)
		if (p0.new) jump:nt .L1
	}
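/* store a word if the destination is not doubleword aligned */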
.L10:
	{
		p0 = tstbit(r6,#2)
		if (!p0.new) jump:nt .L12
	}
	{
		r2 = add(r2,#-4)
		memw(r6++#4) = r7
		p0 = cmp.eq(r2, #4)
		if (p0.new) jump:nt .L1
	}
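/*
 * more than 127 bytes left: peel doubleword stores (at most three,
 * since the destination is already 8-byte aligned) until a 32-byte
 * boundary is reached, so whole cache lines can be handled below
 */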
.L12:
	{
		p0 = cmp.gtu(r2,#127)
		if (!p0.new) jump:nt .L14
	}
		r3 = and(r6,#31)
		if (r3==#0) jump:nt .L17
	{
		memd(r6++#8) = r5:4
		r2 = add(r2,#-8)
	}
		r3 = and(r6,#31)
		if (r3==#0) jump:nt .L17
	{
		memd(r6++#8) = r5:4
		r2 = add(r2,#-8)
	}
		r3 = and(r6,#31)
		if (r3==#0) jump:nt .L17
	{
		memd(r6++#8) = r5:4
		r2 = add(r2,#-8)
	}
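/*
 * r3 = number of whole 32-byte cache lines; a non-zero fill value
 * takes the .L18/.L45 path, a zero fill falls through to .L46
 */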
.L17:
	{
		r3 = lsr(r2,#5)
		if (r1!=#0) jump:nt .L18
	}
	{
		r8 = r3
		r3 = r6
		loop0(.L46,r3)
	}
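/*
 * zero fill: dczeroa allocates and zeroes a full 32-byte cache line
 * without fetching it from memory
 */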
	.falign
.L46:
	{
		dczeroa(r6)
		r6 = add(r6,#32)
		r2 = add(r2,#-32)
	}:endloop0 /* start=.L46 */
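/* store any remaining doublewords */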
.L14:
	{
		p0 = cmp.gtu(r2,#7)
		if (!p0.new) jump:nt .L28
		r8 = lsr(r2,#3)
	}
		loop0(.L44,r8)
	.falign
.L44:
	{
		memd(r6++#8) = r5:4
		r2 = add(r2,#-8)
	}:endloop0 /* start=.L44 */
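/* trailing word, if any */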
.L28:
	{
		p0 = tstbit(r2,#2)
		if (!p0.new) jump:nt .L33
	}
	{
		r2 = add(r2,#-4)
		memw(r6++#4) = r7
	}
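/* trailing halfword, if any */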
.L33:
	{
		p0 = tstbit(r2,#1)
		if (!p0.new) jump:nt .L35
	}
	{
		r2 = add(r2,#-2)
		memh(r6++#2) = r7
	}
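/* trailing byte, if any */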
.L35:
		p0 = cmp.eq(r2,#1)
		if (p0) memb(r6) = r1
.L1:
		jumpr r31
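/*
 * non-zero fill, 32 bytes per iteration: dczeroa allocates the cache
 * line without fetching it, then four doubleword stores fill it with
 * the pattern
 */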
.L18:
		loop0(.L45,r3)
	.falign
.L45:
		dczeroa(r6)
	{
		memd(r6++#8) = r5:4
		r2 = add(r2,#-32)
	}
		memd(r6++#8) = r5:4
		memd(r6++#8) = r5:4
	{
		memd(r6++#8) = r5:4
	}:endloop0 /* start=.L45 */
		jump .L14
HEXAGON_OPT_FUNC_FINISH memset
#endif