dnl  AMD64 mpn_com optimised for CPUs with fast SSE copying and SSSE3.

dnl  Copyright 2012, 2013, 2015 Free Software Foundation, Inc.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C                 cycles/limb   cycles/limb   cycles/limb     good
C                   aligned      unaligned     best seen     for cpu?
C AMD K8,K9           2.0          illop        1.0/1.0         N
C AMD K10             0.85         illop                        Y/N
C AMD bd1             1.39         ? 1.45                       Y/N
C AMD bd2             0.8-1.4      0.7-1.4                      Y
C AMD bd3
C AMD bd4
C AMD bobcat          1.97         ? 8.17       1.5/1.5         N
C AMD jaguar          1.02         1.02         0.91/0.91       N
C Intel P4            2.26         illop                        Y/N
C Intel core          0.58         0.87         opt/0.74        Y
C Intel NHM           0.64         1.14         opt/bad         Y
C Intel SBR           0.51         0.65         opt/opt         Y
C Intel IBR           0.50         0.64         opt/0.57        Y
C Intel HWL           0.51         0.58         opt/opt         Y
C Intel BWL           0.52         0.64         opt/opt         Y
C Intel SKL           0.51         0.63         opt/opt         Y
C Intel atom          1.16         1.70         opt/opt         Y
C Intel SLM           1.02         1.52                         N
C VIA nano            1.09         1.10         opt/opt         Y

C We use only 16-byte operations, except for unaligned top-most and bottom-most
C limbs.  We use the SSSE3 palignr instruction when rp - up = 8 (mod 16); that
C instruction is better adapted to mpn_copyd's needs, so we need to contort the
C code to use it here.
C
C For operands of up to COM_SSE_THRESHOLD limbs, we use a plain 64-bit loop,
C taken from the x86_64 default code.

C INPUT PARAMETERS
define(`rp', `%rdi')
define(`up', `%rsi')
define(`n',  `%rdx')

C There are three instructions for loading an aligned 128-bit quantity.  We use
C movaps, since it has the shortest coding.
define(`movdqa', ``movaps'')

ifdef(`COM_SSE_THRESHOLD',`',`define(`COM_SSE_THRESHOLD', 7)')
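
C For reference, a plain C rendition of the operation performed here, using
C the gmp.h limb types (illustrative only; this is not code that gets built):
C
C       void
C       mpn_com (mp_ptr rp, mp_srcptr up, mp_size_t n)
C       {
C         while (n-- != 0)
C           *rp++ = ~ *up++;
C       }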
ASM_START()
        TEXT
        ALIGN(64)
PROLOGUE(mpn_com)
        FUNC_ENTRY(3)

        cmp     $COM_SSE_THRESHOLD, n
        jbe     L(bc)

        pcmpeqb %xmm5, %xmm5            C set to 111...111

        test    $8, R8(rp)              C is rp 16-byte aligned?
        jz      L(rp_aligned)           C jump if rp aligned

        mov     (up), %r8
        lea     8(up), up
        not     %r8
        mov     %r8, (rp)
        lea     8(rp), rp
        dec     n

L(rp_aligned):
        test    $8, R8(up)
        jnz     L(uent)

ifelse(eval(COM_SSE_THRESHOLD >= 8),1,
`       sub     $8, n',
`       jmp     L(am)')

        ALIGN(16)
L(atop):movdqa  0(up), %xmm0
        movdqa  16(up), %xmm1
        movdqa  32(up), %xmm2
        movdqa  48(up), %xmm3
        lea     64(up), up
        pxor    %xmm5, %xmm0
        pxor    %xmm5, %xmm1
        pxor    %xmm5, %xmm2
        pxor    %xmm5, %xmm3
        movdqa  %xmm0, (rp)
        movdqa  %xmm1, 16(rp)
        movdqa  %xmm2, 32(rp)
        movdqa  %xmm3, 48(rp)
        lea     64(rp), rp
L(am):  sub     $8, n
        jnc     L(atop)

        test    $4, R8(n)
        jz      1f
        movdqa  (up), %xmm0
        movdqa  16(up), %xmm1
        lea     32(up), up
        pxor    %xmm5, %xmm0
        pxor    %xmm5, %xmm1
        movdqa  %xmm0, (rp)
        movdqa  %xmm1, 16(rp)
        lea     32(rp), rp

1:      test    $2, R8(n)
        jz      1f
        movdqa  (up), %xmm0
        lea     16(up), up
        pxor    %xmm5, %xmm0
        movdqa  %xmm0, (rp)
        lea     16(rp), rp

1:      test    $1, R8(n)
        jz      1f
        mov     (up), %r8
        not     %r8
        mov     %r8, (rp)

1:      FUNC_EXIT()
        ret
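
C A note on the palignr trick used at L(uent) below: there rp = 0 and
C up = 8 (mod 16), so every movdqa load at offsets -8, 8, 24, ..., 120 from
C up is 16-byte aligned.  For example, movdqa 104(up) and movdqa 120(up)
C fetch the two aligned words around the unaligned address up+112;
C palignr($8, %xmm2, %xmm3) concatenates dst:src, shifts right by 8 bytes,
C and leaves the 16 bytes starting at up+112 in %xmm3, ready to be
C complemented with pxor and stored with an aligned movdqa to 112(rp).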
L(uent):
C Code handling up - rp = 8 (mod 16)

C FIXME: The code below only handles overlap if it is close to complete, or
C quite separate: up-rp < 5 or up-rp > 15 limbs
        lea     -40(up), %rax           C 40 = 5 * GMP_LIMB_BYTES
        sub     rp, %rax
        cmp     $80, %rax               C 80 = (15-5) * GMP_LIMB_BYTES
        jbe     L(bc)                   C deflect to plain loop

        sub     $16, n
        jc      L(uend)

        movdqa  120(up), %xmm3

        sub     $16, n
        jmp     L(um)

        ALIGN(16)
L(utop):movdqa  120(up), %xmm3
        pxor    %xmm5, %xmm0
        movdqa  %xmm0, -128(rp)
        sub     $16, n
L(um):  movdqa  104(up), %xmm2
        palignr($8, %xmm2, %xmm3)
        movdqa  88(up), %xmm1
        pxor    %xmm5, %xmm3
        movdqa  %xmm3, 112(rp)
        palignr($8, %xmm1, %xmm2)
        movdqa  72(up), %xmm0
        pxor    %xmm5, %xmm2
        movdqa  %xmm2, 96(rp)
        palignr($8, %xmm0, %xmm1)
        movdqa  56(up), %xmm3
        pxor    %xmm5, %xmm1
        movdqa  %xmm1, 80(rp)
        palignr($8, %xmm3, %xmm0)
        movdqa  40(up), %xmm2
        pxor    %xmm5, %xmm0
        movdqa  %xmm0, 64(rp)
        palignr($8, %xmm2, %xmm3)
        movdqa  24(up), %xmm1
        pxor    %xmm5, %xmm3
        movdqa  %xmm3, 48(rp)
        palignr($8, %xmm1, %xmm2)
        movdqa  8(up), %xmm0
        pxor    %xmm5, %xmm2
        movdqa  %xmm2, 32(rp)
        palignr($8, %xmm0, %xmm1)
        movdqa  -8(up), %xmm3
        pxor    %xmm5, %xmm1
        movdqa  %xmm1, 16(rp)
        palignr($8, %xmm3, %xmm0)
        lea     128(up), up
        lea     128(rp), rp
        jnc     L(utop)

        pxor    %xmm5, %xmm0
        movdqa  %xmm0, -128(rp)

L(uend):test    $8, R8(n)
        jz      1f
        movdqa  56(up), %xmm3
        movdqa  40(up), %xmm2
        palignr($8, %xmm2, %xmm3)
        movdqa  24(up), %xmm1
        pxor    %xmm5, %xmm3
        movdqa  %xmm3, 48(rp)
        palignr($8, %xmm1, %xmm2)
        movdqa  8(up), %xmm0
        pxor    %xmm5, %xmm2
        movdqa  %xmm2, 32(rp)
        palignr($8, %xmm0, %xmm1)
        movdqa  -8(up), %xmm3
        pxor    %xmm5, %xmm1
        movdqa  %xmm1, 16(rp)
        palignr($8, %xmm3, %xmm0)
        lea     64(up), up
        pxor    %xmm5, %xmm0
        movdqa  %xmm0, (rp)
        lea     64(rp), rp

1:      test    $4, R8(n)
        jz      1f
        movdqa  24(up), %xmm1
        movdqa  8(up), %xmm0
        palignr($8, %xmm0, %xmm1)
        movdqa  -8(up), %xmm3
        pxor    %xmm5, %xmm1
        movdqa  %xmm1, 16(rp)
        palignr($8, %xmm3, %xmm0)
        lea     32(up), up
        pxor    %xmm5, %xmm0
        movdqa  %xmm0, (rp)
        lea     32(rp), rp

1:      test    $2, R8(n)
        jz      1f
        movdqa  8(up), %xmm0
        movdqa  -8(up), %xmm3
        palignr($8, %xmm3, %xmm0)
        lea     16(up), up
        pxor    %xmm5, %xmm0
        movdqa  %xmm0, (rp)
        lea     16(rp), rp

1:      test    $1, R8(n)
        jz      1f
        mov     (up), %r8
        not     %r8
        mov     %r8, (rp)

1:      FUNC_EXIT()
        ret

C Basecase code.  Needed for good speed on small operands, not for correctness
C as the code above is currently written.

L(bc):  lea     -8(rp), rp
        sub     $4, R32(n)
        jc      L(end)

ifelse(eval(1 || COM_SSE_THRESHOLD >= 8),1,
`       ALIGN(16)')
L(top): mov     (up), %r8
        mov     8(up), %r9
        lea     32(rp), rp
        mov     16(up), %r10
        mov     24(up), %r11
        lea     32(up), up
        not     %r8
        not     %r9
        not     %r10
        not     %r11
        mov     %r8, -24(rp)
        mov     %r9, -16(rp)
ifelse(eval(1 || COM_SSE_THRESHOLD >= 8),1,
`       sub     $4, R32(n)')
        mov     %r10, -8(rp)
        mov     %r11, (rp)
ifelse(eval(1 || COM_SSE_THRESHOLD >= 8),1,
`       jnc     L(top)')

L(end): test    $1, R8(n)
        jz      1f
        mov     (up), %r8
        not     %r8
        mov     %r8, 8(rp)
        lea     8(rp), rp
        lea     8(up), up
1:      test    $2, R8(n)
        jz      1f
        mov     (up), %r8
        mov     8(up), %r9
        not     %r8
        not     %r9
        mov     %r8, 8(rp)
        mov     %r9, 16(rp)
1:      FUNC_EXIT()
        ret
EPILOGUE()
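
C Note on the ifelse() guards in L(bc): the "1 ||" makes the eval()s always
C true, so the ALIGN, the in-loop "sub $4, R32(n)" and the "jnc L(top)" are
C emitted whatever COM_SSE_THRESHOLD is, keeping L(bc) a genuine loop.  That
C appears necessary because L(uent) deflects partially overlapping operands
C to L(bc) for any n.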