/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2013 Regents of the University of California
 */

#include <linux/linkage.h>
#include <asm/asm.h>

/* void *memcpy(void *, const void *, size_t) */
ENTRY(__memcpy)
WEAK(memcpy)
	/* A copy onto itself is a no-op */
	beq	a0, a1, .copy_end
	/* Save dst for the return value */
	mv	t6, a0

	/*
	 * Register allocation for code below:
	 * a0 - start of uncopied dst
	 * a1 - start of uncopied src
	 * t0 - end of uncopied dst
	 */
	add	t0, a0, a2

	/*
	 * Use bytewise copy if too small.
	 *
	 * This threshold must be at least 2*SZREG to ensure at least one
	 * wordwise copy is performed. It is chosen to be 16 because it will
	 * save at least 7 iterations of bytewise copy, which pays off the
	 * fixed overhead.
	 */
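	/*
	 * Worked example (assuming RV64, where SZREG is 8): with a2 == 16 and
	 * a0 maximally misaligned, at most SZREG-1 == 7 bytes are copied to
	 * align a0, leaving at least 9 bytes and therefore at least one full
	 * wordwise copy.
	 */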
	li	a3, 16
	bltu	a2, a3, .Lbyte_copy_tail

	/*
	 * Bytewise copy first to align a0 to a word boundary.
	 */
	addi	a2, a0, SZREG-1
	andi	a2, a2, ~(SZREG-1)
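	/* a2 = (a0 + SZREG-1) & ~(SZREG-1), i.e. a0 rounded up to the next word boundary */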
	beq	a0, a2, 2f
1:
	lb	a5, 0(a1)
	addi	a1, a1, 1
	sb	a5, 0(a0)
	addi	a0, a0, 1
	bne	a0, a2, 1b
2:

	/*
	 * Now a0 is word-aligned. If a1 is also word-aligned, we can perform
	 * an aligned word-wise copy. Otherwise we need a misaligned word-wise
	 * copy, since a3 (the offset of a1 within a word) is then nonzero.
	 */
	andi	a3, a1, SZREG-1
	bnez	a3, .Lmisaligned_word_copy

	/* Unrolled wordwise copy */
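	/*
	 * Bias t0 down by 16*SZREG-1 so that "a0 < t0" at the bottom of the
	 * loop means at least 16*SZREG uncopied bytes remain, i.e. one more
	 * full unrolled iteration fits.
	 */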
	addi	t0, t0, -(16*SZREG-1)
	bgeu	a0, t0, 2f
1:
	REG_L	a2,        0(a1)
	REG_L	a3,    SZREG(a1)
	REG_L	a4,  2*SZREG(a1)
	REG_L	a5,  3*SZREG(a1)
	REG_L	a6,  4*SZREG(a1)
	REG_L	a7,  5*SZREG(a1)
	REG_L	t1,  6*SZREG(a1)
	REG_L	t2,  7*SZREG(a1)
	REG_L	t3,  8*SZREG(a1)
	REG_L	t4,  9*SZREG(a1)
	REG_L	t5, 10*SZREG(a1)
	REG_S	a2,        0(a0)
	REG_S	a3,    SZREG(a0)
	REG_S	a4,  2*SZREG(a0)
	REG_S	a5,  3*SZREG(a0)
	REG_S	a6,  4*SZREG(a0)
	REG_S	a7,  5*SZREG(a0)
	REG_S	t1,  6*SZREG(a0)
	REG_S	t2,  7*SZREG(a0)
	REG_S	t3,  8*SZREG(a0)
	REG_S	t4,  9*SZREG(a0)
	REG_S	t5, 10*SZREG(a0)
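	/* Only 11 scratch registers are free, so the remaining 5 words reuse a2-a6 */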
	REG_L	a2, 11*SZREG(a1)
	REG_L	a3, 12*SZREG(a1)
	REG_L	a4, 13*SZREG(a1)
	REG_L	a5, 14*SZREG(a1)
	REG_L	a6, 15*SZREG(a1)
	addi	a1, a1, 16*SZREG
	REG_S	a2, 11*SZREG(a0)
	REG_S	a3, 12*SZREG(a0)
	REG_S	a4, 13*SZREG(a0)
	REG_S	a5, 14*SZREG(a0)
	REG_S	a6, 15*SZREG(a0)
	addi	a0, a0, 16*SZREG
	bltu	a0, t0, 1b
2:
	/* Undo the 16*SZREG-1 bias and apply the wordwise loop's SZREG-1 bias: net +15*SZREG */
	addi	t0, t0, 15*SZREG

	/* Wordwise copy */
	bgeu	a0, t0, 2f
1:
	REG_L	a5, 0(a1)
	addi	a1, a1, SZREG
	REG_S	a5, 0(a0)
	addi	a0, a0, SZREG
	bltu	a0, t0, 1b
2:
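	/* Undo the SZREG-1 bias so t0 is once again the true end of dst */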
	addi	t0, t0, SZREG-1

.Lbyte_copy_tail:
	/*
	 * Bytewise copy anything left.
	 */
	beq	a0, t0, 2f
1:
	lb	a5, 0(a1)
	addi	a1, a1, 1
	sb	a5, 0(a0)
	addi	a0, a0, 1
	bne	a0, t0, 1b
2:

	mv	a0, t6
.copy_end:
	ret

.Lmisaligned_word_copy:
	/*
	 * Misaligned word-wise copy.
	 * For a misaligned copy we still copy a word at a time, but each
	 * stored word is assembled from the value fetched in the previous
	 * iteration and the newly fetched one, combined with shifts. This is
	 * safe because we never access more words than necessary.
	 */

	/* Calculate shift amounts from the misalignment of a1 */
	slli	t3, a3, 3
	sub	t4, x0, t3 /* negating is okay: sll/srl only use the low bits of the shift amount */
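	/*
	 * Illustrative sketch of one iteration (assuming little-endian, with
	 * a3 the byte offset of src within a word):
	 *	dst_word = (cur >> (a3 * 8)) | (next << (XLEN - a3 * 8));
	 * t3 holds a3 * 8 and t4 holds its negation, which the shift
	 * instructions reduce modulo XLEN, so t4 behaves as XLEN - a3 * 8.
	 */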

	/* Load the initial value and align a1 */
	andi	a1, a1, ~(SZREG-1)
	REG_L	a5, 0(a1)
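	/* a5 now holds the aligned word containing the first uncopied src bytes */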

	addi	t0, t0, -(SZREG-1)
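	/* t0 is now biased down by SZREG-1: the loop below runs while at least SZREG bytes remain */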
	/* At least one iteration is always executed here, so no entry check is needed */
1:
	srl	a4, a5, t3
	REG_L	a5, SZREG(a1)
	addi	a1, a1, SZREG
	sll	a2, a5, t4
	or	a2, a2, a4
	REG_S	a2, 0(a0)
	addi	a0, a0, SZREG
	bltu	a0, t0, 1b

	/* Undo the bias on t0 and add the misalignment offset back to a1 */
	addi	t0, t0, SZREG-1
	add	a1, a1, a3

	j	.Lbyte_copy_tail
END(__memcpy)