dnl  AMD64 mpn_copyi optimised for CPUs with fast SSE copying and SSSE3.

dnl  Copyright 2012, 2013, 2015 Free Software Foundation, Inc.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C            cycles/limb     cycles/limb     cycles/limb      good
C             aligned         unaligned       best seen      for cpu?
C AMD K8,K9     2.0             illop           1.0/1.0         N
C AMD K10       0.85            illop                           Y/N
C AMD bd1       0.70            0.66                            Y
C AMD bd2       0.68            0.66                            Y
C AMD bd3        ?               ?
C AMD bd4        ?               ?
C AMD bt1       1.97            8.16            1.5/1.5         N
C AMD bt2       0.77            0.93            0.65/opt        N/Y
C AMD zn1        ?               ?
C AMD zn2        ?               ?
C Intel P4      2.26            illop                           Y/N
C Intel CNR     0.52            0.64            opt/opt         Y
C Intel NHM     0.52            0.71            0.50/0.67       N
C Intel SBR     0.51            0.54            opt/0.51        Y
C Intel IBR     0.50            0.54            opt/opt         Y
C Intel HWL     0.50            0.51            opt/opt         Y
C Intel BWL     0.55            0.55            opt/opt         Y
C Intel atom    1.16            1.61            opt/opt         Y
C Intel SLM     1.02            1.07            opt/opt         Y
C VIA nano      1.09            1.08            opt/opt         Y

C We use only 16-byte operations, except for unaligned top-most and bottom-most
C limbs.  We use the SSSE3 palignr instruction when rp - up = 8 (mod 16).  That
C instruction is better adapted to mpn_copyd's needs, so we need to contort the
C code to use it here.
C
C For operands of < COPYI_SSE_THRESHOLD limbs, we use a plain 64-bit loop,
C taken from the x86_64 default code.

C INPUT PARAMETERS
define(`rp', `%rdi')
define(`up', `%rsi')
define(`n',  `%rdx')

C There are three instructions for loading an aligned 128-bit quantity.  We use
C movaps, since it has the shortest coding.
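
C The sketch below is an editorial addition, assuming the documented
C SSSE3 semantics of PALIGNR; it is not part of the original code.  It
C shows how the unaligned path merges two aligned 16-byte loads into one
C aligned 16-byte store when rp - up = 8 (mod 16).  With AT&T operand
C order, PALIGNR $8,%xmmA,%xmmB concatenates xmmB:xmmA, shifts the
C 32-byte value right by 8 bytes, and keeps the low 16 bytes, leaving
C the high qword of xmmA below the low qword of xmmB.  The 2-limb tail
C at L(ued0) is the simplest instance:
C
C       movdqa  8(up), %xmm0            C limbs u1,u2; up+8 is 16-aligned
C       movdqa  -8(up), %xmm3           C limbs u-1,u0
C       PALIGNR $8, %xmm3, %xmm0        C xmm0 = u0 (low), u1 (high)
C       movdqa  %xmm0, (rp)             C one aligned store of u0,u1
C
C Both loads are 16-byte aligned, so they cannot fault across a page;
C the out-of-operand bytes they touch share an aligned block with valid
C data, and PALIGNR discards their values.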
dnl define(`movdqa', ``movaps'')

ifdef(`COPYI_SSE_THRESHOLD',`',`define(`COPYI_SSE_THRESHOLD', 7)')

ASM_START()
        TEXT
        ALIGN(64)
PROLOGUE(mpn_copyi)
        FUNC_ENTRY(3)

        cmp     $COPYI_SSE_THRESHOLD, n
        jbe     L(bc)

        test    $8, R8(rp)              C is rp 16-byte aligned?
        jz      L(rp_aligned)           C jump if rp aligned

        movsq                           C copy one limb
        dec     n

L(rp_aligned):
        test    $8, R8(up)
        jnz     L(uent)

ifelse(eval(COPYI_SSE_THRESHOLD >= 8),1,
`       sub     $8, n',
`       jmp     L(am)')

        ALIGN(16)
L(atop):movdqa  0(up), %xmm0
        movdqa  16(up), %xmm1
        movdqa  32(up), %xmm2
        movdqa  48(up), %xmm3
        lea     64(up), up
        movdqa  %xmm0, (rp)
        movdqa  %xmm1, 16(rp)
        movdqa  %xmm2, 32(rp)
        movdqa  %xmm3, 48(rp)
        lea     64(rp), rp
L(am):  sub     $8, n
        jnc     L(atop)

        test    $4, R8(n)
        jz      1f
        movdqa  (up), %xmm0
        movdqa  16(up), %xmm1
        lea     32(up), up
        movdqa  %xmm0, (rp)
        movdqa  %xmm1, 16(rp)
        lea     32(rp), rp

1:      test    $2, R8(n)
        jz      1f
        movdqa  (up), %xmm0
        lea     16(up), up
        movdqa  %xmm0, (rp)
        lea     16(rp), rp

1:      test    $1, R8(n)
        jz      1f
        mov     (up), %r8
        mov     %r8, (rp)

1:      FUNC_EXIT()
        ret

L(uent):
C Code handling up - rp = 8 (mod 16)

        cmp     $16, n
        jc      L(ued0)

IFDOS(` add     $-56, %rsp      ')
IFDOS(` movdqa  %xmm6, (%rsp)   ')
IFDOS(` movdqa  %xmm7, 16(%rsp) ')
IFDOS(` movdqa  %xmm8, 32(%rsp) ')

        movaps  120(up), %xmm7
        movaps  104(up), %xmm6
        movaps  88(up), %xmm5
        movaps  72(up), %xmm4
        movaps  56(up), %xmm3
        movaps  40(up), %xmm2
        lea     128(up), up
        sub     $32, n
        jc      L(ued1)

        ALIGN(16)
L(utop):movaps  -104(up), %xmm1
        sub     $16, n
        movaps  -120(up), %xmm0
        palignr($8, %xmm6, %xmm7)
        movaps  -136(up), %xmm8
        movdqa  %xmm7, 112(rp)
        palignr($8, %xmm5, %xmm6)
        movaps  120(up), %xmm7
        movdqa  %xmm6, 96(rp)
        palignr($8, %xmm4, %xmm5)
        movaps  104(up), %xmm6
        movdqa  %xmm5, 80(rp)
        palignr($8, %xmm3, %xmm4)
        movaps  88(up), %xmm5
        movdqa  %xmm4, 64(rp)
        palignr($8, %xmm2, %xmm3)
        movaps  72(up), %xmm4
        movdqa  %xmm3, 48(rp)
        palignr($8, %xmm1, %xmm2)
        movaps  56(up), %xmm3
        movdqa  %xmm2, 32(rp)
        palignr($8, %xmm0, %xmm1)
        movaps  40(up), %xmm2
        movdqa  %xmm1, 16(rp)
        palignr($8, %xmm8, %xmm0)
        lea     128(up), up
        movdqa  %xmm0, (rp)
        lea     128(rp), rp
        jnc     L(utop)

L(ued1):movaps  -104(up), %xmm1
        movaps  -120(up), %xmm0
        movaps  -136(up), %xmm8
        palignr($8, %xmm6, %xmm7)
        movdqa  %xmm7, 112(rp)
        palignr($8, %xmm5, %xmm6)
        movdqa  %xmm6, 96(rp)
        palignr($8, %xmm4, %xmm5)
        movdqa  %xmm5, 80(rp)
        palignr($8, %xmm3, %xmm4)
        movdqa  %xmm4, 64(rp)
        palignr($8, %xmm2, %xmm3)
        movdqa  %xmm3, 48(rp)
        palignr($8, %xmm1, %xmm2)
        movdqa  %xmm2, 32(rp)
        palignr($8, %xmm0, %xmm1)
        movdqa  %xmm1, 16(rp)
        palignr($8, %xmm8, %xmm0)
        movdqa  %xmm0, (rp)
        lea     128(rp), rp

IFDOS(` movdqa  (%rsp), %xmm6   ')
IFDOS(` movdqa  16(%rsp), %xmm7 ')
IFDOS(` movdqa  32(%rsp), %xmm8 ')
IFDOS(` add     $56, %rsp       ')

L(ued0):test    $8, R8(n)
        jz      1f
        movaps  56(up), %xmm3
        movaps  40(up), %xmm2
        movaps  24(up), %xmm1
        movaps  8(up), %xmm0
        movaps  -8(up), %xmm4
        palignr($8, %xmm2, %xmm3)
        movdqa  %xmm3, 48(rp)
        palignr($8, %xmm1, %xmm2)
        movdqa  %xmm2, 32(rp)
        palignr($8, %xmm0, %xmm1)
        movdqa  %xmm1, 16(rp)
        palignr($8, %xmm4, %xmm0)
        lea     64(up), up
        movdqa  %xmm0, (rp)
        lea     64(rp), rp

1:      test    $4, R8(n)
        jz      1f
        movaps  24(up), %xmm1
        movaps  8(up), %xmm0
        palignr($8, %xmm0, %xmm1)
        movaps  -8(up), %xmm3
        movdqa  %xmm1, 16(rp)
        palignr($8, %xmm3, %xmm0)
        lea     32(up), up
        movdqa  %xmm0, (rp)
        lea     32(rp), rp

1:      test    $2, R8(n)
        jz      1f
        movdqa  8(up), %xmm0
        movdqa  -8(up), %xmm3
        palignr($8, %xmm3, %xmm0)
        lea     16(up), up
        movdqa  %xmm0, (rp)
        lea     16(rp), rp

1:      test    $1, R8(n)
        jz      1f
        mov     (up), %r8
        mov     %r8, (rp)

1:      FUNC_EXIT()
        ret

C Basecase code.  Needed for good speed on small operands, not for
C correctness as the above code is currently written.

L(bc):  lea     -8(rp), rp
        sub     $4, R32(n)
        jc      L(end)

        ALIGN(16)
L(top): mov     (up), %r8
        mov     8(up), %r9
        lea     32(rp), rp
        mov     16(up), %r10
        mov     24(up), %r11
        lea     32(up), up
        mov     %r8, -24(rp)
        mov     %r9, -16(rp)
ifelse(eval(COPYI_SSE_THRESHOLD >= 8),1,
`       sub     $4, R32(n)')
        mov     %r10, -8(rp)
        mov     %r11, (rp)
ifelse(eval(COPYI_SSE_THRESHOLD >= 8),1,
`       jnc     L(top)')

L(end): test    $1, R8(n)
        jz      1f
        mov     (up), %r8
        mov     %r8, 8(rp)
        lea     8(rp), rp
        lea     8(up), up
1:      test    $2, R8(n)
        jz      1f
        mov     (up), %r8
        mov     8(up), %r9
        mov     %r8, 8(rp)
        mov     %r9, 16(rp)
1:      FUNC_EXIT()
        ret
EPILOGUE()
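
C Editorial note: the function implemented above is a plain increasing
C limb copy.  A minimal C sketch of the same contract, assuming the
C mp_limb_t and mp_size_t types from gmp.h (compare refmpn_copyi in the
C GMP test support code; copyi_ref and its parameter names are
C illustrative only):
C
C       void
C       copyi_ref (mp_limb_t *dst, const mp_limb_t *src, mp_size_t cnt)
C       {
C         mp_size_t i;
C         for (i = 0; i < cnt; i++)   /* copy low limb first */
C           dst[i] = src[i];
C       }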