/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2013 Regents of the University of California
 */

#include <linux/linkage.h>
#include <asm/asm.h>

/* void *memcpy(void *, const void *, size_t) */
ENTRY(__memcpy)
WEAK(memcpy)
	beq	a0, a1, .copy_end
	/* Save dst for the return value */
	mv	t6, a0

	/*
	 * Register allocation for code below:
	 * a0 - start of uncopied dst
	 * a1 - start of uncopied src
	 * t0 - end of uncopied dst
	 */
	add	t0, a0, a2

	/*
	 * Use bytewise copy if too small.
	 *
	 * This threshold must be at least 2*SZREG to ensure at least one
	 * wordwise copy is performed. It is chosen to be 16 because it will
	 * save at least 7 iterations of bytewise copy, which pays off the
	 * fixed overhead.
	 */
	li	a3, 16
	bltu	a2, a3, .Lbyte_copy_tail

	/*
	 * Bytewise copy first to align a0 to a word boundary.
	 */
	addi	a2, a0, SZREG-1
	andi	a2, a2, ~(SZREG-1)
	beq	a0, a2, 2f
1:
	lb	a5, 0(a1)
	addi	a1, a1, 1
	sb	a5, 0(a0)
	addi	a0, a0, 1
	bne	a0, a2, 1b
2:

	/*
	 * Now a0 is word-aligned. If a1 is also word-aligned, we can perform
	 * an aligned word-wise copy. Otherwise we must perform a misaligned
	 * word-wise copy.
	 */
	andi	a3, a1, SZREG-1
	bnez	a3, .Lmisaligned_word_copy

	/* Unrolled wordwise copy */
	addi	t0, t0, -(16*SZREG-1)
	bgeu	a0, t0, 2f
1:
	REG_L	a2, 0(a1)
	REG_L	a3, SZREG(a1)
	REG_L	a4, 2*SZREG(a1)
	REG_L	a5, 3*SZREG(a1)
	REG_L	a6, 4*SZREG(a1)
	REG_L	a7, 5*SZREG(a1)
	REG_L	t1, 6*SZREG(a1)
	REG_L	t2, 7*SZREG(a1)
	REG_L	t3, 8*SZREG(a1)
	REG_L	t4, 9*SZREG(a1)
	REG_L	t5, 10*SZREG(a1)
	REG_S	a2, 0(a0)
	REG_S	a3, SZREG(a0)
	REG_S	a4, 2*SZREG(a0)
	REG_S	a5, 3*SZREG(a0)
	REG_S	a6, 4*SZREG(a0)
	REG_S	a7, 5*SZREG(a0)
	REG_S	t1, 6*SZREG(a0)
	REG_S	t2, 7*SZREG(a0)
	REG_S	t3, 8*SZREG(a0)
	REG_S	t4, 9*SZREG(a0)
	REG_S	t5, 10*SZREG(a0)
	REG_L	a2, 11*SZREG(a1)
	REG_L	a3, 12*SZREG(a1)
	REG_L	a4, 13*SZREG(a1)
	REG_L	a5, 14*SZREG(a1)
	REG_L	a6, 15*SZREG(a1)
	addi	a1, a1, 16*SZREG
	REG_S	a2, 11*SZREG(a0)
	REG_S	a3, 12*SZREG(a0)
	REG_S	a4, 13*SZREG(a0)
	REG_S	a5, 14*SZREG(a0)
	REG_S	a6, 15*SZREG(a0)
	addi	a0, a0, 16*SZREG
	bltu	a0, t0, 1b
2:
	/* Undo the 16*SZREG-1 bias and apply the SZREG-1 bias for the loop below */
	addi	t0, t0, 15*SZREG

	/* Wordwise copy */
	bgeu	a0, t0, 2f
1:
	REG_L	a5, 0(a1)
	addi	a1, a1, SZREG
	REG_S	a5, 0(a0)
	addi	a0, a0, SZREG
	bltu	a0, t0, 1b
2:
	addi	t0, t0, SZREG-1

.Lbyte_copy_tail:
	/*
	 * Bytewise copy anything left.
	 */
	beq	a0, t0, 2f
1:
	lb	a5, 0(a1)
	addi	a1, a1, 1
	sb	a5, 0(a0)
	addi	a0, a0, 1
	bne	a0, t0, 1b
2:

	mv	a0, t6
.copy_end:
	ret

.Lmisaligned_word_copy:
	/*
	 * Misaligned word-wise copy.
	 * For misaligned copy we still perform word-wise copy, but we need to
	 * use the value fetched from the previous iteration and do some shifts.
	 * This is safe because we never access more words than necessary.
	 */

	/* Calculate shifts */
	slli	t3, a3, 3
	sub	t4, x0, t3 /* negate is okay as shift will only look at LSBs */
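	/*
	 * Worked example (RV64, a3 == 3): t3 == 24, so each destination
	 * word below is built as (prev >> 24) | (next << 40).  Negating
	 * the shift amount works because RISC-V shift instructions use
	 * only the low log2(XLEN) bits of the shift register, so for t4
	 * the sll sees (-24) & 63 == 40 == XLEN - 24.
	 */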

	/* Load the initial value and align a1 */
	andi	a1, a1, ~(SZREG-1)
	REG_L	a5, 0(a1)

	addi	t0, t0, -(SZREG-1)
	/* At least one iteration will execute here, so no entry check is needed */
1:
	srl	a4, a5, t3
	REG_L	a5, SZREG(a1)
	addi	a1, a1, SZREG
	sll	a2, a5, t4
	or	a2, a2, a4
	REG_S	a2, 0(a0)
	addi	a0, a0, SZREG
	bltu	a0, t0, 1b

	/* Update pointers to their correct values */
	addi	t0, t0, SZREG-1
	add	a1, a1, a3

	j	.Lbyte_copy_tail
END(__memcpy)