/*
 * Copyright 2010 Tilera Corporation. All Rights Reserved.
 *
 *   This program is free software; you can redistribute it and/or
 *   modify it under the terms of the GNU General Public License
 *   as published by the Free Software Foundation, version 2.
 *
 *   This program is distributed in the hope that it will be useful, but
 *   WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 *   NON INFRINGEMENT.  See the GNU General Public License for
 *   more details.
 */

#include <arch/chip.h>

#include <linux/types.h>
#include <linux/string.h>
#include <linux/module.h>


/*
 * memset() - store the byte value 'c' into the 'n' bytes starting at 's'.
 *
 * Fills shorter than BYTE_CUTOFF are done with a plain byte loop.
 * Longer fills proceed in three steps:
 *   1. byte-store the unaligned head and tail so that what remains
 *      starts on a 4-byte boundary and is a whole number of 32-bit words;
 *   2. store whole cache lines of the replicated 32-bit pattern, either
 *      with explicit prefetching (!CHIP_HAS_WH64()) or after issuing
 *      wh64 on each cache-line-aligned line (CHIP_HAS_WH64());
 *   3. store any leftover words one at a time.
 *
 * Returns 's'.
 */
void *memset(void *s, int c, size_t n)
{
	/* Word-granular output cursor, valid once 'out8' has been aligned. */
	uint32_t *out32;
	/* Number of 32-bit words still to be stored through 'out32'. */
	int n32;
	/* 'c' replicated to 16 and then to 32 bits. */
	uint32_t v16, v32;
	/* Byte-granular output cursor. */
	uint8_t *out8 = s;
#if !CHIP_HAS_WH64()
	/* Prefetch distance, in words, ahead of 'out32'. */
	int ahead32;
#else
	/* Words to store before 'out32' is cache-line aligned. */
	int to_align32;
#endif

	/* Experimentation shows that a trivial tight loop is a win up until
	 * around a size of 20, where writing a word at a time starts to win.
	 */
#define BYTE_CUTOFF 20

#if BYTE_CUTOFF < 3
	/* This must be at least this big, or some code later
	 * on doesn't work.
	 */
#error "BYTE_CUTOFF is too small"
#endif

	if (n < BYTE_CUTOFF) {
		/* Strangely, this turns out to be the tightest way to
		 * write this loop.
		 */
		if (n != 0) {
			do {
				/* Strangely, combining these into one line
				 * performs worse.
				 */
				*out8 = c;
				out8++;
			} while (--n != 0);
		}

		return s;
	}

#if !CHIP_HAS_WH64()
	/* Use a spare issue slot to start prefetching the first cache
	 * line early.  This instruction is free as the store can be buried
	 * in otherwise idle issue slots doing ALU ops.
	 */
	__insn_prefetch(out8);

	/* We prefetch the end so that a short memset that spans two cache
	 * lines gets some prefetching benefit.  Again we believe this is free
	 * to issue.
	 */
	__insn_prefetch(&out8[n - 1]);
#endif /* !CHIP_HAS_WH64() */


	/* Align 'out8'. We know n >= 3 so this won't write past the end. */
	while (((uintptr_t) out8 & 3) != 0) {
		*out8++ = c;
		--n;
	}

	/* Align 'n'.  'out8' is now 4-byte aligned, so the bytes written
	 * here are the trailing ones, filled from the end backwards until
	 * the remaining length is a multiple of four.
	 */
	while (n & 3)
		out8[--n] = c;

	/* Everything left is whole, aligned 32-bit words. */
	out32 = (uint32_t *) out8;
	n32 = n >> 2;

	/* Tile input byte out to 32 bits. */
	v16 = __insn_intlb(c, c);
	v32 = __insn_intlh(v16, v16);

	/* This must be at least 8 or the following loop doesn't work. */
#define CACHE_LINE_SIZE_IN_WORDS (CHIP_L2_LINE_SIZE() / 4)

#if !CHIP_HAS_WH64()

	/* Default prefetch distance: one cache line ahead. */
	ahead32 = CACHE_LINE_SIZE_IN_WORDS;

	/* We already prefetched the first and last cache lines, so
	 * we only need to do more prefetching if we are storing
	 * to more than two cache lines.
	 */
	if (n32 > CACHE_LINE_SIZE_IN_WORDS * 2) {
		int i;

		/* Prefetch the next several cache lines.
		 * This is the setup code for the software-pipelined
		 * loop below.
		 */
#define MAX_PREFETCH 5
		/* Round n32 down to a whole number of cache lines (the mask
		 * relies on CACHE_LINE_SIZE_IN_WORDS being a power of two),
		 * then cap the distance at MAX_PREFETCH lines.
		 */
		ahead32 = n32 & -CACHE_LINE_SIZE_IN_WORDS;
		if (ahead32 > MAX_PREFETCH * CACHE_LINE_SIZE_IN_WORDS)
			ahead32 = MAX_PREFETCH * CACHE_LINE_SIZE_IN_WORDS;

		for (i = CACHE_LINE_SIZE_IN_WORDS;
		     i < ahead32; i += CACHE_LINE_SIZE_IN_WORDS)
			__insn_prefetch(&out32[i]);
	}

	if (n32 > ahead32) {
		while (1) {
			int j;

			/* Prefetch by reading one word several cache lines
			 * ahead.  Since loads are non-blocking this will
			 * cause the full cache line to be read while we are
			 * finishing earlier cache lines.  Using a store
			 * here causes microarchitectural performance
			 * problems where a victimizing store miss goes to
			 * the head of the retry FIFO and locks the pipe for
			 * a few cycles.  So a few subsequent stores in this
			 * loop go into the retry FIFO, and then later
			 * stores see other stores to the same cache line
			 * are already in the retry FIFO and themselves go
			 * into the retry FIFO, filling it up and grinding
			 * to a halt waiting for the original miss to be
			 * satisfied.
			 */
			__insn_prefetch(&out32[ahead32]);

#if CACHE_LINE_SIZE_IN_WORDS % 4 != 0
#error "Unhandled CACHE_LINE_SIZE_IN_WORDS"
#endif

			/* Account for the cache line stored just below. */
			n32 -= CACHE_LINE_SIZE_IN_WORDS;

			/* Save icache space by only partially unrolling
			 * this loop.
			 */
			for (j = CACHE_LINE_SIZE_IN_WORDS / 4; j > 0; j--) {
				*out32++ = v32;
				*out32++ = v32;
				*out32++ = v32;
				*out32++ = v32;
			}

			/* To save compiled code size, reuse this loop even
			 * when we run out of prefetching to do by dropping
			 * ahead32 down.
			 */
			if (n32 <= ahead32) {
				/* Not even a full cache line left,
				 * so stop now.
				 */
				if (n32 < CACHE_LINE_SIZE_IN_WORDS)
					break;

				/* Choose a small enough value that we don't
				 * prefetch past the end.  There's no sense
				 * in touching cache lines we don't have to.
				 */
				ahead32 = CACHE_LINE_SIZE_IN_WORDS - 1;
			}
		}
	}

#else /* CHIP_HAS_WH64() */

	/* Determine how many words we need to emit before the 'out32'
	 * pointer becomes aligned modulo the cache line size.
	 */
	to_align32 =
		(-((uintptr_t)out32 >> 2)) & (CACHE_LINE_SIZE_IN_WORDS - 1);

	/* Only bother aligning and using wh64 if there is at least
	 * one full cache line to process.  This check also prevents
	 * overrunning the end of the buffer with alignment words.
	 */
	if (to_align32 <= n32 - CACHE_LINE_SIZE_IN_WORDS) {
		int lines_left;

		/* Align out32 mod the cache line size so we can use wh64. */
		n32 -= to_align32;
		for (; to_align32 != 0; to_align32--) {
			*out32 = v32;
			out32++;
		}

		/* Use unsigned divide to turn this into a right shift. */
		lines_left = (unsigned)n32 / CACHE_LINE_SIZE_IN_WORDS;

		/* The enclosing check leaves at least one full line in n32,
		 * so lines_left is nonzero on entry and both do/while loops
		 * below run with a nonzero count.
		 */
		do {
			/* Only wh64 a few lines at a time, so we don't
			 * exceed the maximum number of victim lines.
			 */
			int x = ((lines_left < CHIP_MAX_OUTSTANDING_VICTIMS())
				  ? lines_left
				  : CHIP_MAX_OUTSTANDING_VICTIMS());
			uint32_t *wh = out32;
			int i = x;
			int j;

			lines_left -= x;

			/* Issue wh64 on each of the next 'x' lines first... */
			do {
				__insn_wh64(wh);
				wh += CACHE_LINE_SIZE_IN_WORDS;
			} while (--i);

			/* ...then fill those same 'x' lines, four words per
			 * iteration.
			 */
			for (j = x * (CACHE_LINE_SIZE_IN_WORDS / 4);
			     j != 0; j--) {
				*out32++ = v32;
				*out32++ = v32;
				*out32++ = v32;
				*out32++ = v32;
			}
		} while (lines_left != 0);

		/* We processed all full lines above, so only this many
		 * words remain to be processed.
		 */
		n32 &= CACHE_LINE_SIZE_IN_WORDS - 1;
	}

#endif /* CHIP_HAS_WH64() */

	/* Now handle any leftover values. */
	if (n32 != 0) {
		do {
			*out32 = v32;
			out32++;
		} while (--n32 != 0);
	}

	return s;
}
EXPORT_SYMBOL(memset);