dnl  AMD64 mpn_copyd optimised for CPUs with fast SSE copying and SSSE3.

dnl  Copyright 2012, 2015 Free Software Foundation, Inc.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C                  cycles/limb    cycles/limb    cycles/limb     good
C                   aligned        unaligned      best seen    for cpu?
C AMD K8,K9            2.0           illop         1.0/1.0        N
C AMD K10              0.85          illop                        Y/N
C AMD bull             0.70          0.70                         Y
C AMD pile             0.68          0.68                         Y
C AMD steam
C AMD excavator
C AMD bobcat           1.97          8.24          1.5/1.5        N
C AMD jaguar           0.77          0.89          0.65/opt       N/Y
C Intel P4             2.26          illop                        Y/N
C Intel core           0.52          0.80          opt/opt        Y
C Intel NHM            0.52          0.64          opt/opt        Y
C Intel SBR            0.51          0.51          opt/opt        Y
C Intel IBR            0.50          0.50          opt/opt        Y
C Intel HWL            0.50          0.51          opt/opt        Y
C Intel BWL            0.55          0.55          opt/opt        Y
C Intel atom           1.16          1.66          opt/opt        Y
C Intel SLM            1.02          1.04          opt/opt        Y
C VIA nano             1.08          1.06          opt/opt        Y

C We use only 16-byte operations, except for unaligned top-most and bottom-most
C limbs.  We use the SSSE3 palignr instruction when rp - up = 8 (mod 16).
C
C For operands of <= COPYD_SSE_THRESHOLD limbs, we use a plain 64-bit loop,
C taken from the x86_64 default code.
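C
C An informal sketch of the palignr step (exposition only, not assembled
C code): with rp - up = 8 (mod 16), the aligned 16-byte store at -8(rp)
C needs the source bytes [up-8, up+8), which straddle two aligned 16-byte
C words of up.  With xmm0 holding the aligned word at (up) and xmm1 the
C aligned word at -16(up), palignr($8, %xmm1, %xmm0) shifts the 32-byte
C concatenation xmm0:xmm1 right by 8 bytes, leaving exactly the bytes
C [up-8, up+8) in xmm0, ready for a single aligned store.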

C INPUT PARAMETERS
define(`rp', `%rdi')
define(`up', `%rsi')
define(`n',  `%rdx')

C There are three instructions for loading an aligned 128-bit quantity.  We
C use movaps, since it has the shortest encoding.
define(`movdqa', ``movaps'')

ifdef(`COPYD_SSE_THRESHOLD',`',`define(`COPYD_SSE_THRESHOLD', 7)')

ASM_START()
	TEXT
	ALIGN(64)
PROLOGUE(mpn_copyd)
	FUNC_ENTRY(3)

	lea	-8(up,n,8), up
	lea	-8(rp,n,8), rp

	cmp	$COPYD_SSE_THRESHOLD, n
	jbe	L(bc)

	test	$8, R8(rp)		C is rp 16-byte aligned?
	jnz	L(rp_aligned)		C jump if rp aligned

	mov	(up), %rax		C copy one limb
	mov	%rax, (rp)
	lea	-8(up), up
	lea	-8(rp), rp
	dec	n

L(rp_aligned):
	test	$8, R8(up)
	jz	L(uent)

ifelse(eval(COPYD_SSE_THRESHOLD >= 8),1,
`	sub	$8, n',
`	jmp	L(am)')

	ALIGN(16)
L(atop):movdqa	-8(up), %xmm0
	movdqa	-24(up), %xmm1
	movdqa	-40(up), %xmm2
	movdqa	-56(up), %xmm3
	lea	-64(up), up
	movdqa	%xmm0, -8(rp)
	movdqa	%xmm1, -24(rp)
	movdqa	%xmm2, -40(rp)
	movdqa	%xmm3, -56(rp)
	lea	-64(rp), rp
L(am):	sub	$8, n
	jnc	L(atop)

	test	$4, R8(n)
	jz	1f
	movdqa	-8(up), %xmm0
	movdqa	-24(up), %xmm1
	lea	-32(up), up
	movdqa	%xmm0, -8(rp)
	movdqa	%xmm1, -24(rp)
	lea	-32(rp), rp

1:	test	$2, R8(n)
	jz	1f
	movdqa	-8(up), %xmm0
	lea	-16(up), up
	movdqa	%xmm0, -8(rp)
	lea	-16(rp), rp

1:	test	$1, R8(n)
	jz	1f
	mov	(up), %r8
	mov	%r8, (rp)

1:	FUNC_EXIT()
	ret

L(uent):sub	$16, n
	movdqa	(up), %xmm0
	jc	L(uend)

	ALIGN(16)
L(utop):sub	$16, n
	movdqa	-16(up), %xmm1
	palignr($8, %xmm1, %xmm0)
	movdqa	%xmm0, -8(rp)
	movdqa	-32(up), %xmm2
	palignr($8, %xmm2, %xmm1)
	movdqa	%xmm1, -24(rp)
	movdqa	-48(up), %xmm3
	palignr($8, %xmm3, %xmm2)
	movdqa	%xmm2, -40(rp)
	movdqa	-64(up), %xmm0
	palignr($8, %xmm0, %xmm3)
	movdqa	%xmm3, -56(rp)
	movdqa	-80(up), %xmm1
	palignr($8, %xmm1, %xmm0)
	movdqa	%xmm0, -72(rp)
	movdqa	-96(up), %xmm2
	palignr($8, %xmm2, %xmm1)
	movdqa	%xmm1, -88(rp)
	movdqa	-112(up), %xmm3
	palignr($8, %xmm3, %xmm2)
	movdqa	%xmm2, -104(rp)
	movdqa	-128(up), %xmm0
	palignr($8, %xmm0, %xmm3)
	movdqa	%xmm3, -120(rp)
	lea	-128(up), up
	lea	-128(rp), rp
	jnc	L(utop)

L(uend):test	$8, R8(n)
	jz	1f
	movdqa	-16(up), %xmm1
	palignr($8, %xmm1, %xmm0)
	movdqa	%xmm0, -8(rp)
	movdqa	-32(up), %xmm0
	palignr($8, %xmm0, %xmm1)
	movdqa	%xmm1, -24(rp)
	movdqa	-48(up), %xmm1
	palignr($8, %xmm1, %xmm0)
	movdqa	%xmm0, -40(rp)
	movdqa	-64(up), %xmm0
	palignr($8, %xmm0, %xmm1)
	movdqa	%xmm1, -56(rp)
	lea	-64(up), up
	lea	-64(rp), rp

1:	test	$4, R8(n)
	jz	1f
	movdqa	-16(up), %xmm1
	palignr($8, %xmm1, %xmm0)
	movdqa	%xmm0, -8(rp)
	movdqa	-32(up), %xmm0
	palignr($8, %xmm0, %xmm1)
	movdqa	%xmm1, -24(rp)
	lea	-32(up), up
	lea	-32(rp), rp

1:	test	$2, R8(n)
	jz	1f
	movdqa	-16(up), %xmm1
	palignr($8, %xmm1, %xmm0)
	movdqa	%xmm0, -8(rp)
	lea	-16(up), up
	lea	-16(rp), rp

1:	test	$1, R8(n)
	jz	1f
	mov	(up), %r8
	mov	%r8, (rp)

1:	FUNC_EXIT()
	ret

C Basecase code.  Needed for good small-operand speed, not for correctness,
C as the above code is currently written.

L(bc):	sub	$4, R32(n)
	jc	L(end)

	ALIGN(16)
L(top):	mov	(up), %r8
	mov	-8(up), %r9
	lea	-32(rp), rp
	mov	-16(up), %r10
	mov	-24(up), %r11
	lea	-32(up), up
	mov	%r8, 32(rp)
	mov	%r9, 24(rp)
ifelse(eval(COPYD_SSE_THRESHOLD >= 8),1,
`	sub	$4, R32(n)')
	mov	%r10, 16(rp)
	mov	%r11, 8(rp)
ifelse(eval(COPYD_SSE_THRESHOLD >= 8),1,
`	jnc	L(top)')

L(end):	test	$1, R8(n)
	jz	1f
	mov	(up), %r8
	mov	%r8, (rp)
	lea	-8(rp), rp
	lea	-8(up), up
1:	test	$2, R8(n)
	jz	1f
	mov	(up), %r8
	mov	-8(up), %r9
	mov	%r8, (rp)
	mov	%r9, -8(rp)
1:	FUNC_EXIT()
	ret
EPILOGUE()
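
C For reference, a minimal C sketch of what this routine computes (mirroring
C the generic mpn/generic/copyd.c; exposition only, never assembled).  Limbs
C are copied at decreasing addresses, so under the usual MPN_COPY_DECR
C convention overlapping operands are allowed when rp >= up:
C
C	void
C	mpn_copyd (mp_ptr rp, mp_srcptr up, mp_size_t n)
C	{
C	  mp_size_t i;
C	  for (i = n - 1; i >= 0; i--)
C	    rp[i] = up[i];
C	}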