dnl  Pentium-4 mpn_copyd -- copy limb vector, decrementing.
dnl

dnl  Copyright 1999, 2000, 2001 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.


dnl  The std/rep/movsl/cld is very slow for small blocks on pentium4.  Its
dnl  startup time seems to be about 165 cycles.  It then needs 2.6 c/l.
dnl  We therefore use an open-coded 2 c/l copying loop.

dnl  Ultimately, we may want to use 64-bit movq or 128-bit movdqu in some
dnl  nifty unrolled arrangement.  Clearly, that could reach much higher
dnl  speeds, at least for large blocks.

include(`../config.m4')


dnl  mpn_copyd (dst, src, size)
dnl
dnl  Copies size 4-byte limbs from src to dst, starting with the limb at
dnl  index size-1 and finishing with the limb at index 0.  A size of zero
dnl  copies nothing.
dnl
dnl  Argument slots, as declared with defframe:
dnl    PARAM_DST   offset  4   destination limb pointer
dnl    PARAM_SRC   offset  8   source limb pointer
dnl    PARAM_SIZE  offset 12   number of limbs
dnl
dnl  Register roles inside the routine:
dnl    eax   source pointer
dnl    edx   destination pointer
dnl    ecx   index of the limb being copied, counts down to 0
dnl    ebx   the limb in transit
dnl
dnl  ebx is preserved for the caller.  Nothing is pushed: once the size has
dnl  been read into ecx, the PARAM_SIZE slot is reused to hold the incoming
dnl  ebx, which is reloaded from there just before returning.

defframe(PARAM_SIZE, 12)
defframe(PARAM_SRC,   8)
defframe(PARAM_DST,   4)

	TEXT
	ALIGN(8)

PROLOGUE(mpn_copyd)
deflit(`FRAME',0)

dnl  Fetch the three arguments.  The size must be read before its slot is
dnl  overwritten with ebx below.
	movl	PARAM_SIZE, %ecx
	movl	PARAM_DST, %edx
	movl	PARAM_SRC, %eax

dnl  Park the incoming ebx in the now-unneeded size slot.
	movl	%ebx, PARAM_SIZE

dnl  ecx = size-1, the index of the highest limb.  A negative result means
dnl  size was zero, so there is nothing to copy.
	addl	$-1, %ecx
	js	L(done)

dnl  One limb per iteration, highest index first.  The sign flag left by
dnl  the index decrement ends the loop once index 0 has been copied.
L(top):
	movl	(%eax,%ecx,4), %ebx
	movl	%ebx, (%edx,%ecx,4)
	addl	$-1, %ecx
	jns	L(top)

dnl  Give ebx back to the caller.
L(done):
	movl	PARAM_SIZE, %ebx
	ret

EPILOGUE()