/* Copyright (c) 2012, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  */

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Unaligned accesses
 *
 */


/* By default we assume that the DC instruction can be used to zero
   data blocks more efficiently.  In some circumstances this might be
   unsafe, for example in an asymmetric multiprocessor environment with
   different DC clear lengths (neither the upper nor lower lengths are
   safe to use).  The feature can be disabled by defining DONT_USE_DC.

   If code may be run in a virtualized environment, then define
   MAYBE_VIRT.  This will cause the code to cache the system register
   values rather than re-reading them each call.  */

#define dstin		x0
#define val		w1
#define count		x2
#define tmp1		x3
#define tmp1w		w3
#define tmp2		x4
#define tmp2w		w4
#define zva_len_x	x5
#define zva_len		w5
#define zva_bits_x	x6

#define A_l		x7
#define A_lw		w7
#define dst		x8
#define tmp3w		w9


	.macro def_fn f p2align=0
	.text
	.p2align \p2align
	.global \f
	.type \f, %function
\f:
	.endm

def_fn memset p2align=6

	mov	dst, dstin		/* Preserve return value.  */
	ands	A_lw, val, #255
#ifndef DONT_USE_DC
	b.eq	.Lzero_mem
#endif
	orr	A_lw, A_lw, A_lw, lsl #8
	orr	A_lw, A_lw, A_lw, lsl #16
	orr	A_l, A_l, A_l, lsl #32
.Ltail_maybe_long:
	cmp	count, #64
	b.ge	.Lnot_short
.Ltail_maybe_tiny:
	cmp	count, #15
	b.le	.Ltail15tiny
.Ltail63:
	ands	tmp1, count, #0x30
	b.eq	.Ltail15
	add	dst, dst, tmp1
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	stp	A_l, A_l, [dst, #-48]
1:
	stp	A_l, A_l, [dst, #-32]
2:
	stp	A_l, A_l, [dst, #-16]

.Ltail15:
	and	count, count, #15
	add	dst, dst, count
	stp	A_l, A_l, [dst, #-16]	/* Repeat some/all of last store.  */
	ret

.Ltail15tiny:
	/* Set up to 15 bytes.  Does not assume earlier memory
	   being set.  */
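	/* Each set bit of count selects exactly one of the stores below
	   (8, 4, 2, then 1 bytes), so any residue of 0..15 bytes is
	   written without a loop.  */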
	tbz	count, #3, 1f
	str	A_l, [dst], #8
1:
	tbz	count, #2, 1f
	str	A_lw, [dst], #4
1:
	tbz	count, #1, 1f
	strh	A_lw, [dst], #2
1:
	tbz	count, #0, 1f
	strb	A_lw, [dst]
1:
	ret

	/* Critical loop.  Start at a new cache line boundary.  Assuming
	 * 64 bytes per line, this ensures the entire loop is in one line.  */
	.p2align 6
.Lnot_short:
	neg	tmp2, dst
	ands	tmp2, tmp2, #15
	b.eq	2f
	/* Bring DST to 128-bit (16-byte) alignment.  We know that there's
	 * more than that to set, so we simply store 16 bytes and advance by
	 * the amount required to reach alignment.  */
	sub	count, count, tmp2
	stp	A_l, A_l, [dst]
	add	dst, dst, tmp2
	/* There may be fewer than 64 bytes to go now.  */
	cmp	count, #63
	b.le	.Ltail63
2:
	sub	dst, dst, #16		/* Pre-bias.  */
	sub	count, count, #64
1:
	stp	A_l, A_l, [dst, #16]
	stp	A_l, A_l, [dst, #32]
	stp	A_l, A_l, [dst, #48]
	stp	A_l, A_l, [dst, #64]!
	subs	count, count, #64
	b.ge	1b
	tst	count, #0x3f
	add	dst, dst, #16
	b.ne	.Ltail63
	ret

#ifndef DONT_USE_DC
	/* For zeroing memory, check to see if we can use the ZVA feature to
	 * zero entire 'cache' lines.  */
.Lzero_mem:
	mov	A_l, #0
	cmp	count, #63
	b.le	.Ltail_maybe_tiny
	neg	tmp2, dst
	ands	tmp2, tmp2, #15
	b.eq	1f
	sub	count, count, tmp2
	stp	A_l, A_l, [dst]
	add	dst, dst, tmp2
	cmp	count, #63
	b.le	.Ltail63
1:
	/* For zeroing small amounts of memory, it's not worth setting up
	 * the line-clear code.  */
	cmp	count, #128
	b.lt	.Lnot_short
#ifdef MAYBE_VIRT
	/* For efficiency when virtualized, we cache the ZVA capability.  */
	adrp	tmp2, .Lcache_clear
	ldr	zva_len, [tmp2, #:lo12:.Lcache_clear]
	tbnz	zva_len, #31, .Lnot_short
	cbnz	zva_len, .Lzero_by_line
	mrs	tmp1, dczid_el0
	tbz	tmp1, #4, 1f
	/* ZVA not available.  Remember this for next time.  */
	mov	zva_len, #~0
	str	zva_len, [tmp2, #:lo12:.Lcache_clear]
	b	.Lnot_short
1:
	mov	tmp3w, #4
	and	zva_len, tmp1w, #15	/* Safety: other bits reserved.  */
	lsl	zva_len, tmp3w, zva_len
	str	zva_len, [tmp2, #:lo12:.Lcache_clear]
#else
	mrs	tmp1, dczid_el0
	tbnz	tmp1, #4, .Lnot_short
	mov	tmp3w, #4
	and	zva_len, tmp1w, #15	/* Safety: other bits reserved.  */
	lsl	zva_len, tmp3w, zva_len
#endif

.Lzero_by_line:
	/* Compute how far we need to go to become suitably aligned.  We're
	 * already at quad-word alignment.  */
	cmp	count, zva_len_x
	b.lt	.Lnot_short		/* Not enough to reach alignment.  */
	sub	zva_bits_x, zva_len_x, #1
	neg	tmp2, dst
	ands	tmp2, tmp2, zva_bits_x
	b.eq	1f			/* Already aligned.  */
	/* Not aligned, check that there's enough to zero after alignment.  */
	sub	tmp1, count, tmp2
	cmp	tmp1, #64
	ccmp	tmp1, zva_len_x, #8, ge	/* NZCV=0b1000 */
	b.lt	.Lnot_short
	/* We know that there's at least 64 bytes to zero and that it's safe
	 * to overrun by 64 bytes.  */
	mov	count, tmp1
2:
	stp	A_l, A_l, [dst]
	stp	A_l, A_l, [dst, #16]
	stp	A_l, A_l, [dst, #32]
	subs	tmp2, tmp2, #64
	stp	A_l, A_l, [dst, #48]
	add	dst, dst, #64
	b.ge	2b
	/* We've overrun a bit, so adjust dst downwards.  */
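	/* tmp2 went negative in the loop above and now holds minus the
	   overshoot, so adding it brings dst back to the ZVA-aligned
	   boundary.  */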
	add	dst, dst, tmp2
1:
	sub	count, count, zva_len_x
3:
	dc	zva, dst
	add	dst, dst, zva_len_x
	subs	count, count, zva_len_x
	b.ge	3b
	ands	count, count, zva_bits_x
	b.ne	.Ltail_maybe_long
	ret
#ifdef MAYBE_VIRT
	.bss
	.p2align 2
.Lcache_clear:
	.space 4
#endif
#endif /* DONT_USE_DC */
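
/* Illustrative only, not assembled: a minimal C sketch of how the code
   above derives the ZVA block size from DCZID_EL0.  The helper name
   zva_block_bytes and its dczid parameter are hypothetical stand-ins for
   the mrs read; the decode itself (4 << BS bytes; bit 4, DZP, meaning
   DC ZVA is prohibited) mirrors the and/lsl and tbz/tbnz sequence above.

   #include <stddef.h>
   #include <stdint.h>

   static size_t zva_block_bytes (uint64_t dczid)
   {
     if (dczid & (1u << 4))                 // DZP set: DC ZVA must not be used
       return 0;                            // caller falls back to the stp loop
     return (size_t) 4 << (dczid & 15);     // block size in bytes (4 << BS)
   }
*/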