dnl  AMD64 mpn_copyi optimised for CPUs with fast SSE.

dnl  Copyright 2003, 2005, 2007, 2011, 2012, 2015 Free Software Foundation,
dnl  Inc.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb     cycles/limb      cycles/limb     good
C	      aligned	      unaligned	       best seen     for cpu?
C AMD K8,K9
C AMD K10	0.85		1.64				Y/N
C AMD bull	1.4		1.4				N
C AMD pile	0.77		0.93				N
C AMD steam	 ?		 ?
C AMD excavator	 ?		 ?
C AMD bobcat
C AMD jaguar	0.65		1.02		opt/0.93	Y/N
C Intel P4	2.3		2.3				Y
C Intel core	1.0		1.0		0.52/0.64	N
C Intel NHM	0.5		0.67				Y
C Intel SBR	0.51		0.75		opt/0.54	Y/N
C Intel IBR	0.50		0.57		opt/0.54	Y
C Intel HWL	0.50		0.57		opt/0.51	Y
C Intel BWL	0.55		0.62		opt/0.55	Y
C Intel atom
C Intel SLM	1.02		1.27		opt/1.07	Y/N
C VIA nano	1.16		5.16				Y/N

C We try to do as many 16-byte operations as possible.  The top-most and
C bottom-most writes might need 8-byte operations.  We can always write using
C aligned 16-byte operations, we read with both aligned and unaligned 16-byte
C operations.

C Instead of having separate loops for reading aligned and unaligned, we read
C using MOVDQU.  This seems to work great except for core2; there performance
C doubles when reading using MOVDQA (for aligned source).  It is unclear how to
C best handle the unaligned case there.

C INPUT PARAMETERS
C   rp = destination limb pointer, up = source limb pointer, n = limb count.
C   One limb = 8 bytes, so two limbs fill one xmm register.
define(`rp', `%rdi')
define(`up', `%rsi')
define(`n', `%rdx')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

dnl define(`movdqu', lddqu)

ASM_START()
	TEXT
	ALIGN(64)
PROLOGUE(mpn_copyi)
	FUNC_ENTRY(3)

	cmp	$3, n		C NB: bc code below assumes this limit
	jc	L(bc)

C Make rp 16-byte aligned by copying one leading limb if its low address
C bit 3 is set; the movdqa stores below rely on this alignment.
	test	$8, R8(rp)	C is rp 16-byte aligned?
	jz	L(ali)		C jump if rp aligned
	movsq			C copy single limb
	dec	n

L(ali):	sub	$16, n		C borrow set => fewer than 16 limbs left
	jc	L(sma)

C Windows x64 (DOS64): xmm6/xmm7 are callee-saved under that ABI, so spill
C them to freshly reserved stack space for the duration of the main loop.
IFDOS(`	add	$-56, %rsp	')
IFDOS(`	movdqa	%xmm6, (%rsp)	')
IFDOS(`	movdqa	%xmm7, 16(%rsp)	')

C Main loop: copy 16 limbs (8 x 16 bytes = 128 bytes) per iteration.
C All loads are unaligned (movdqu), all stores aligned (movdqa); see the
C file-top comment for the rationale.
	ALIGN(16)
L(top):	movdqu	(up), %xmm0
	movdqu	16(up), %xmm1
	movdqu	32(up), %xmm2
	movdqu	48(up), %xmm3
	movdqu	64(up), %xmm4
	movdqu	80(up), %xmm5
	movdqu	96(up), %xmm6
	movdqu	112(up), %xmm7
	lea	128(up), up
	movdqa	%xmm0, (rp)
	movdqa	%xmm1, 16(rp)
	movdqa	%xmm2, 32(rp)
	movdqa	%xmm3, 48(rp)
	movdqa	%xmm4, 64(rp)
	movdqa	%xmm5, 80(rp)
	movdqa	%xmm6, 96(rp)
	movdqa	%xmm7, 112(rp)
	lea	128(rp), rp
	sub	$16, n
	jnc	L(top)

IFDOS(`	movdqa	(%rsp), %xmm6	')
IFDOS(`	movdqa	16(%rsp), %xmm7	')
IFDOS(`	add	$56, %rsp	')

C Copy the remaining 0..15 limbs by testing bits 3..1 of n (8-, 4- and
C 2-limb chunks); the final odd limb, bit 0, is handled at L(end) with a
C plain 8-byte move.
L(sma):	test	$8, R8(n)
	jz	1f
	movdqu	(up), %xmm0
	movdqu	16(up), %xmm1
	movdqu	32(up), %xmm2
	movdqu	48(up), %xmm3
	lea	64(up), up
	movdqa	%xmm0, (rp)
	movdqa	%xmm1, 16(rp)
	movdqa	%xmm2, 32(rp)
	movdqa	%xmm3, 48(rp)
	lea	64(rp), rp
1:
	test	$4, R8(n)
	jz	1f
	movdqu	(up), %xmm0
	movdqu	16(up), %xmm1
	lea	32(up), up
	movdqa	%xmm0, (rp)
	movdqa	%xmm1, 16(rp)
	lea	32(rp), rp
1:
	test	$2, R8(n)
	jz	1f
	movdqu	(up), %xmm0
	lea	16(up), up
	movdqa	%xmm0, (rp)
	lea	16(rp), rp
	ALIGN(16)
1:
L(end):	test	$1, R8(n)
	jz	1f
	mov	(up), %r8
	mov	%r8, (rp)
1:
	FUNC_EXIT()
	ret

C Basecase code.  Needed for good small operands speed, not for correctness as
C the above code is currently written.  The commented-out lines need to be
C reinstated if this code is to be used for n > 3, and then the post loop
C offsets need fixing.

C Here n <= 2 (entered with n < 3 from the cmp at function entry).  After
C sub $2: borrow means n was 0 or 1, so skip the 2-limb copy; L(end)'s
C test of bit 0 then copies the single limb for n = 1 (and nothing for
C n = 0, since bit 0 of n-2 = -2 is clear).
L(bc):	sub	$2, n
	jc	L(end)
	ALIGN(16)
1:	mov	(up), %rax
	mov	8(up), %rcx
dnl	lea	16(up), up
	mov	%rax, (rp)
	mov	%rcx, 8(rp)
dnl	lea	16(rp), rp
dnl	sub	$2, n
dnl	jnc	1b

	test	$1, R8(n)
	jz	L(ret)
	mov	16(up), %rax
	mov	%rax, 16(rp)
L(ret):	FUNC_EXIT()
	ret
EPILOGUE()