dnl  AMD64 mpn_copyi optimised for CPUs with fast AVX.

dnl  Copyright 2003, 2005, 2007, 2011-2013, 2015 Free Software Foundation, Inc.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb     cycles/limb	cycles/limb	good
C	      aligned	      unaligned		best seen	for cpu?
C AMD K8,K9	n/a
C AMD K10	n/a
C AMD bull	n/a
C AMD pile	4.87		4.87				N
C AMD steam	?		?
C AMD bobcat	n/a
C AMD jaguar	n/a
C Intel P4	n/a
C Intel core	n/a
C Intel NHM	n/a
C Intel SBR	0.50		0.91				N
C Intel IBR	0.50		0.65				N
C Intel HWL	0.25		0.30				Y
C Intel BWL	0.28		0.37				Y
C Intel atom	n/a
C VIA nano	n/a

C We try to do as many 32-byte operations as possible.  The top-most and
C bottom-most writes might need 8-byte operations.  For the bulk copying, we
C write using aligned 32-byte operations, but we read with both aligned and
C unaligned 32-byte operations.

C mpn_copyi(rp, up, n) -- copy n limbs from up to rp, low limb first
C (both pointers advance upward through the blocks below).
C Register roles, per the defines: rp = rdi (dst), up = rsi (src), n = rdx.

define(`rp', `%rdi')
define(`up', `%rsi')
define(`n', `%rdx')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

dnl define(`vmovdqu', vlddqu)

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_copyi)
	FUNC_ENTRY(3)

	cmp	$7, n
	jbe	L(bc)			C n <= 7: take the scalar basecase

C Align rp to a 32-byte boundary, peeling off at most 1 + 2 limbs.
	test	$8, R8(rp)		C is rp 16-byte aligned?
	jz	L(a2)			C jump if rp aligned
	mov	(up), %rax		C copy 1 limb to reach 16-byte alignment
	lea	8(up), up
	mov	%rax, (rp)
	lea	8(rp), rp
	dec	n
L(a2):	test	$16, R8(rp)		C is rp 32-byte aligned?
	jz	L(a3)			C jump if rp aligned
	vmovdqu	(up), %xmm0		C copy 2 limbs to reach 32-byte alignment
	lea	16(up), up
	vmovdqa	%xmm0, (rp)
	lea	16(rp), rp
	sub	$2, n
L(a3):	sub	$16, n
	jc	L(sma)			C fewer than 16 limbs remain

	ALIGN(16)
C Main loop: 16 limbs (128 bytes) per iteration.  Reads are unaligned
C 32-byte ops; writes are aligned 32-byte ops (rp is 32-byte aligned here).
L(top):	vmovdqu	(up), %ymm0
	vmovdqu	32(up), %ymm1
	vmovdqu	64(up), %ymm2
	vmovdqu	96(up), %ymm3
	lea	128(up), up
	vmovdqa	%ymm0, (rp)
	vmovdqa	%ymm1, 32(rp)
	vmovdqa	%ymm2, 64(rp)
	vmovdqa	%ymm3, 96(rp)
	lea	128(rp), rp
L(ali):	sub	$16, n
	jnc	L(top)

C Here n has had 16 over-subtracted, but that leaves bits 3..0 unchanged,
C so test them directly to copy the remaining 0..15 limbs in 8/4/2/1 steps.
L(sma):	test	$8, R8(n)
	jz	1f
	vmovdqu	(up), %ymm0		C 8 limbs: two 32-byte transfers
	vmovdqu	32(up), %ymm1
	lea	64(up), up
	vmovdqa	%ymm0, (rp)
	vmovdqa	%ymm1, 32(rp)
	lea	64(rp), rp
1:
	test	$4, R8(n)
	jz	1f
	vmovdqu	(up), %ymm0		C 4 limbs: one 32-byte transfer
	lea	32(up), up
	vmovdqa	%ymm0, (rp)
	lea	32(rp), rp
1:
	test	$2, R8(n)
	jz	1f
	vmovdqu	(up), %xmm0		C 2 limbs: one 16-byte transfer
	lea	16(up), up
	vmovdqa	%xmm0, (rp)
	lea	16(rp), rp
1:
L(end):	test	$1, R8(n)
	jz	1f
	mov	(up), %r8		C final odd limb
	mov	%r8, (rp)
1:
C NOTE(review): no vzeroupper before return despite 256-bit AVX use above;
C presumably deliberate for these fastavx targets -- confirm against GMP's
C handling of AVX-SSE transition penalties before changing.
	FUNC_EXIT()
	ret

	ALIGN(16)
C Basecase for n <= 7: scalar 8-byte moves, driven by bits 2..0 of n.
L(bc):	test	$4, R8(n)
	jz	1f
	mov	(up), %rax		C 4 limbs
	mov	8(up), %rcx
	mov	16(up), %r8
	mov	24(up), %r9
	lea	32(up), up
	mov	%rax, (rp)
	mov	%rcx, 8(rp)
	mov	%r8, 16(rp)
	mov	%r9, 24(rp)
	lea	32(rp), rp
1:
	test	$2, R8(n)
	jz	1f
	mov	(up), %rax		C 2 limbs
	mov	8(up), %rcx
	lea	16(up), up
	mov	%rax, (rp)
	mov	%rcx, 8(rp)
	lea	16(rp), rp
1:
	test	$1, R8(n)
	jz	1f
	mov	(up), %rax		C final odd limb
	mov	%rax, (rp)
1:
	FUNC_EXIT()
	ret
EPILOGUE()