/* A memset for CRIS.
   Copyright (C) 1999-2005 Axis Communications.
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

   1. Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.

   2. Neither the name of Axis Communications nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY AXIS COMMUNICATIONS AND ITS CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL AXIS
   COMMUNICATIONS OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
   INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
   (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
   SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
   STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
   IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   POSSIBILITY OF SUCH DAMAGE.  */

/* Note the multiple occurrence of the expression "12*4", including the
   asm.  It is hard to get it into the asm in a good way.  Thus better to
   expose the problem everywhere: no macro.  */

/* Assuming one cycle per dword written or read (ok, not really true; the
   world is not ideal), and one cycle per instruction, then 43+3*(n/48-1)
   <= 24+24*(n/48-1) so n >= 45.7; n >= 0.9; we win on the first full
   48-byte block to set.  */

#define MEMSET_BY_BLOCK_THRESHOLD (1 * 48)

/* No name ambiguities in this file.  */
__asm__ (".syntax no_register_prefix");

/* Standard memset: set PLEN bytes starting at PDST to the byte value
   (unsigned char) C, and return PDST.  Strategy: align the destination
   to 4 bytes, fill 48-byte blocks with a movem-based asm loop when the
   count is large enough, then finish with a 16-byte unrolled word loop
   and a straight-line switch for the final 0..15 bytes.  */
void *memset(void *pdst, int c, unsigned int plen)
{
  /* Now we want the parameters in special registers.  Make sure the
     compiler does something usable with this.  */

  register char *return_dst __asm__ ("r10") = pdst; /* Untouched; returned as-is.  */
  register int n __asm__ ("r12") = plen;	    /* Bytes still to set.  */
  register int lc __asm__ ("r11") = c;		    /* Becomes C replicated in all 4 bytes.  */

  /* Most apps use memset sanely.  Memsetting about 3..4 bytes or less get
     penalized here compared to the generic implementation.  */

  /* Replicate the low byte of LC into all four bytes so whole words can
     be stored: lc = c | c<<8 | c<<16 | c<<24.
     This is fragile performancewise at best.  Check with newer GCC
     releases, if they compile cascaded "x |= x << 8" to sane code.  */
  __asm__("movu.b %0,r13						\n\
	lslq 8,r13							\n\
	move.b %0,r13							\n\
	move.d r13,%0							\n\
	lslq 16,r13							\n\
	or.d r13,%0"
	  : "=r" (lc)		/* Output.  */
	  : "0" (lc)		/* Input, in the same register as the output.  */
	  : "r13");		/* Trash.  */

  {
    register char *dst __asm__ ("r13") = pdst;

    /* Align DST to a 4-byte boundary before any word or block stores.
       The n >= 3 guard keeps the head stores below from running past the
       end of very short buffers (a 1- or 2-byte memset may be entirely
       handled by the tail switch).  */
    if (((unsigned long) pdst & 3) != 0
	/* Oops! n = 0 must be a valid call, regardless of alignment.  */
	&& n >= 3)
      {
	if ((unsigned long) dst & 1)
	  {
	    *dst = (char) lc;
	    n--;
	    dst++;
	  }

	if ((unsigned long) dst & 2)
	  {
	    *(short *) dst = lc;
	    n -= 2;
	    dst += 2;
	  }
      }

    /* Decide which setting method to use.  */
    if (n >= MEMSET_BY_BLOCK_THRESHOLD)
      {
	/* It is not optimal to tell the compiler about clobbering any
	   registers; that will move the saving/restoring of those registers
	   to the function prologue/epilogue, and make non-block sizes
	   suboptimal.

	   The asm below copies the fill word into r0..r10 (r11 already
	   holds it) so that each "movem r11,[r13+]" stores r0..r11 --
	   12 words = 48 bytes -- per iteration.  NOTE(review): the store
	   appears to sit in the delay slot of "bge 0b", so it also runs
	   for the final, failing iteration; that is why n is pre-biased
	   by 12*4 before the loop and re-biased by 12*4 afterwards --
	   confirm against the CRIS delay-slot rules before touching.  */
	__asm__ volatile
	  ("\
	;; GCC does promise correct register allocations, but let's	\n\
	;; make sure it keeps its promises.				\n\
	.ifnc %0-%1-%4,$r13-$r12-$r11					\n\
	.error \"GCC reg alloc bug: %0-%1-%4 != $r13-$r12-$r11\"	\n\
	.endif								\n\
									\n\
	;; Save the registers we'll clobber in the movem process	\n\
	;; on the stack.  Don't mention them to gcc, it will only be	\n\
	;; upset.							\n\
	subq 11*4,sp							\n\
	movem r10,[sp]							\n\
									\n\
	move.d r11,r0							\n\
	move.d r11,r1							\n\
	move.d r11,r2							\n\
	move.d r11,r3							\n\
	move.d r11,r4							\n\
	move.d r11,r5							\n\
	move.d r11,r6							\n\
	move.d r11,r7							\n\
	move.d r11,r8							\n\
	move.d r11,r9							\n\
	move.d r11,r10							\n\
									\n\
	;; Now we've got this:						\n\
	;; r13 - dst							\n\
	;; r12 - n							\n\
									\n\
	;; Update n for the first loop					\n\
	subq 12*4,r12							\n\
0:									\n\
"
#ifdef __arch_common_v10_v32
	   /* Cater to branch offset difference between v32 and v10.  We
	      assume the branch below has an 8-bit offset.  */
"	setf\n"
#endif
"	subq 12*4,r12							\n\
	bge 0b								\n\
	movem r11,[r13+]						\n\
									\n\
	;; Compensate for last loop underflowing n.			\n\
	addq 12*4,r12							\n\
									\n\
	;; Restore registers from stack.				\n\
	movem [sp+],r10"

	   /* Outputs.  */
	   : "=r" (dst), "=r" (n)

	   /* Inputs.  */
	   : "0" (dst), "1" (n), "r" (lc));
      }

    /* An ad-hoc unroll, used for 4*12-1..16 bytes.  (After the block
       loop above, at most 47 bytes remain.)  */
    while (n >= 16)
      {
	*(long *) dst = lc; dst += 4;
	*(long *) dst = lc; dst += 4;
	*(long *) dst = lc; dst += 4;
	*(long *) dst = lc; dst += 4;
	n -= 16;
      }

    /* Straight-line stores for the final 0..15 bytes; each case writes
       the largest units possible for that count.  */
    switch (n)
      {
      case 0:
	break;

      case 1:
	*dst = (char) lc;
	break;

      case 2:
	*(short *) dst = (short) lc;
	break;

      case 3:
	*(short *) dst = (short) lc; dst += 2;
	*dst = (char) lc;
	break;

      case 4:
	*(long *) dst = lc;
	break;

      case 5:
	*(long *) dst = lc; dst += 4;
	*dst = (char) lc;
	break;

      case 6:
	*(long *) dst = lc; dst += 4;
	*(short *) dst = (short) lc;
	break;

      case 7:
	*(long *) dst = lc; dst += 4;
	*(short *) dst = (short) lc; dst += 2;
	*dst = (char) lc;
	break;

      case 8:
	*(long *) dst = lc; dst += 4;
	*(long *) dst = lc;
	break;

      case 9:
	*(long *) dst = lc; dst += 4;
	*(long *) dst = lc; dst += 4;
	*dst = (char) lc;
	break;

      case 10:
	*(long *) dst = lc; dst += 4;
	*(long *) dst = lc; dst += 4;
	*(short *) dst = (short) lc;
	break;

      case 11:
	*(long *) dst = lc; dst += 4;
	*(long *) dst = lc; dst += 4;
	*(short *) dst = (short) lc; dst += 2;
	*dst = (char) lc;
	break;

      case 12:
	*(long *) dst = lc; dst += 4;
	*(long *) dst = lc; dst += 4;
	*(long *) dst = lc;
	break;

      case 13:
	*(long *) dst = lc; dst += 4;
	*(long *) dst = lc; dst += 4;
	*(long *) dst = lc; dst += 4;
	*dst = (char) lc;
	break;

      case 14:
	*(long *) dst = lc; dst += 4;
	*(long *) dst = lc; dst += 4;
	*(long *) dst = lc; dst += 4;
	*(short *) dst = (short) lc;
	break;

      case 15:
	*(long *) dst = lc; dst += 4;
	*(long *) dst = lc; dst += 4;
	*(long *) dst = lc; dst += 4;
	*(short *) dst = (short) lc; dst += 2;
	*dst = (char) lc;
	break;
      }
  }

  return return_dst;
}