dnl AMD64 mpn_copyd optimised for CPUs with fast AVX.

dnl Copyright 2003, 2005, 2007, 2011-2013, 2015 Free Software Foundation, Inc.

dnl Contributed to the GNU project by Torbjörn Granlund.

dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of either:
dnl
dnl   * the GNU Lesser General Public License as published by the Free
dnl     Software Foundation; either version 3 of the License, or (at your
dnl     option) any later version.
dnl
dnl or
dnl
dnl   * the GNU General Public License as published by the Free Software
dnl     Foundation; either version 2 of the License, or (at your option) any
dnl     later version.
dnl
dnl or both in parallel, as here.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl for more details.
dnl
dnl You should have received copies of the GNU General Public License and the
dnl GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl see https://www.gnu.org/licenses/.

include(`../config.m4')

C	    cycles/limb     cycles/limb     cycles/limb	    good
C	      aligned	     unaligned	     best seen	   for cpu?
C AMD K8,K9	n/a
C AMD K10	n/a
C AMD bull	n/a
C AMD pile	4.87		4.87				N
C AMD steam	 ?		 ?
C AMD bobcat	n/a
C AMD jaguar	n/a
C Intel P4	n/a
C Intel core	n/a
C Intel NHM	n/a
C Intel SBR	0.50		0.91				N
C Intel IBR	0.50		0.65				N
C Intel HWL	0.25		0.30				Y
C Intel BWL	0.28		0.37				Y
C Intel atom	n/a
C VIA nano	n/a

C We try to do as many 32-byte operations as possible.  The top-most and
C bottom-most writes might need 8-byte operations.  For the bulk copying, we
C write using aligned 32-byte operations, but we read with both aligned and
C unaligned 32-byte operations.

C Register roles (SysV args via FUNC_ENTRY; DOS64 remapped by the macros):
C   rp (rdi)  destination limb pointer
C   up (rsi)  source limb pointer
C   n  (rdx)  limb count
define(`rp', `%rdi')
define(`up', `%rsi')
define(`n', `%rdx')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

C Disabled alternative: use vlddqu for the unaligned loads instead.
dnl define(`vmovdqu', vlddqu)

ASM_START()
	TEXT
	ALIGN(32)
C mpn_copyd(rp, up, n): copy n limbs, proceeding from the highest address
C downwards (negative-stride leas below).
C NOTE(review): no vzeroupper is issued before either ret, so the ymm upper
C state is left dirty on return — presumably deemed acceptable for GMP's
C fat-binary CPU selection; confirm AVX->SSE transition cost on targets.
PROLOGUE(mpn_copyd)
	FUNC_ENTRY(3)

C Bias both pointers to the highest 32-byte chunk: rp/up += 8n - 32.
	lea	-32(rp,n,8), rp
	lea	-32(up,n,8), up

	cmp	$7, n			C basecase needed for correctness
	jbe	L(bc)

C Peel 1 limb and/or 1 xmm chunk off the top until rp is 32-byte aligned,
C so the bulk writes below can use aligned vmovdqa.
	test	$8, R8(rp)		C is rp 16-byte aligned?
	jz	L(a2)			C jump if rp aligned
	mov	24(up), %rax
	lea	-8(up), up
	mov	%rax, 24(rp)
	lea	-8(rp), rp
	dec	n
L(a2):	test	$16, R8(rp)		C is rp 32-byte aligned?
	jz	L(a3)			C jump if rp aligned
	vmovdqu	16(up), %xmm0
	lea	-16(up), up
	vmovdqa	%xmm0, 16(rp)
	lea	-16(rp), rp
	sub	$2, n
L(a3):	sub	$16, n			C CF set when fewer than 16 limbs left
	jc	L(sma)

	ALIGN(16)
C Main loop: copy 128 bytes (16 limbs) per iteration; unaligned reads,
C 32-byte-aligned writes.
L(top):	vmovdqu	(up), %ymm0
	vmovdqu	-32(up), %ymm1
	vmovdqu	-64(up), %ymm2
	vmovdqu	-96(up), %ymm3
	lea	-128(up), up
	vmovdqa	%ymm0, (rp)
	vmovdqa	%ymm1, -32(rp)
	vmovdqa	%ymm2, -64(rp)
	vmovdqa	%ymm3, -96(rp)
	lea	-128(rp), rp
L(ali):	sub	$16, n
	jnc	L(top)

C Fewer than 16 limbs remain.  n has gone negative, but its low four bits
C still equal the residual count mod 16, so dispatch on bits 8/4/2/1.
L(sma):	test	$8, R8(n)
	jz	1f
	vmovdqu	(up), %ymm0
	vmovdqu	-32(up), %ymm1
	lea	-64(up), up
	vmovdqa	%ymm0, (rp)
	vmovdqa	%ymm1, -32(rp)
	lea	-64(rp), rp
1:
	test	$4, R8(n)
	jz	1f
	vmovdqu	(up), %ymm0
	lea	-32(up), up
	vmovdqa	%ymm0, (rp)
	lea	-32(rp), rp
1:
	test	$2, R8(n)
	jz	1f
	vmovdqu	16(up), %xmm0
	lea	-16(up), up
	vmovdqa	%xmm0, 16(rp)
	lea	-16(rp), rp
1:
	test	$1, R8(n)
	jz	1f
	mov	24(up), %r8
	mov	%r8, 24(rp)
1:
	FUNC_EXIT()
	ret

	ALIGN(16)
C Basecase for n <= 7: plain 64-bit moves, driven by bits 4/2/1 of n.
L(bc):	test	$4, R8(n)
	jz	1f
	mov	24(up), %rax
	mov	16(up), %rcx
	mov	8(up), %r8
	mov	(up), %r9
	lea	-32(up), up
	mov	%rax, 24(rp)
	mov	%rcx, 16(rp)
	mov	%r8, 8(rp)
	mov	%r9, (rp)
	lea	-32(rp), rp
1:
	test	$2, R8(n)
	jz	1f
	mov	24(up), %rax
	mov	16(up), %rcx
	lea	-16(up), up
	mov	%rax, 24(rp)
	mov	%rcx, 16(rp)
	lea	-16(rp), rp
1:
	test	$1, R8(n)
	jz	1f
	mov	24(up), %rax
	mov	%rax, 24(rp)
1:
	FUNC_EXIT()
	ret
EPILOGUE()