dnl Intel Pentium mpn_copyd -- copy limb vector, decrementing.

dnl Copyright 1996, 2001, 2002, 2006 Free Software Foundation, Inc.
dnl
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or
dnl modify it under the terms of the GNU Lesser General Public License as
dnl published by the Free Software Foundation; either version 3 of the
dnl License, or (at your option) any later version.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful,
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
dnl Lesser General Public License for more details.
dnl
dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C P5: 1.25 cycles/limb


C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
C
C See comments in copyi.asm.
C -----------------------------------------------------------------------
C mpn_copyd (dst, src, size)
C
C Copies size limbs of 4 bytes each from src to dst, starting at the
C highest-addressed limb and working down to the lowest.
C
C Inputs   : three stack arguments, read through the PARAM_* offsets
C            declared just below.
C Registers: eax, ecx and edx are working registers; esi and edi are
C            pushed on entry and popped before returning.
C Shape    : an 8-limb unrolled loop, followed by fix-up steps that copy
C            4, then 2, then 1 limb for the 0 to 7 limbs left over.
C
C The instruction order is kept exactly as written: the loop branch
C consumes flags set several instructions earlier, so reordering is
C not safe.
C -----------------------------------------------------------------------

defframe(PARAM_SIZE,12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)

	TEXT
	ALIGN(8)
PROLOGUE(mpn_copyd)
deflit(`FRAME',0)

	movl	PARAM_SRC, %eax
	movl	PARAM_SIZE, %ecx

	C Each push is paired with FRAME_pushl.  Presumably that macro keeps
	C the PARAM_* offsets valid after esp moves, since PARAM_DST is read
	C after both pushes -- confirm against config.m4.
	pushl	%esi	FRAME_pushl()
	pushl	%edi	FRAME_pushl()

	leal	-4(%eax,%ecx,4), %eax		C eax = &src[size-1]
	movl	PARAM_DST, %edx

	subl	$7, %ecx			C ecx = size-7
	jle	L(rest)				C 7 limbs or fewer: no unrolled pass

	C Read dst[size-1] only to touch its cache line; the value landing
	C in esi is overwritten before it is ever used.
	movl	28-4(%edx,%ecx,4), %esi
	nop					C purpose not stated in source, presumably padding

L(blk8):
	C Register roles inside the loop:
	C eax	src pointer, moves down 32 bytes per pass
	C ebx	untouched
	C ecx	limb counter, remaining limbs minus 7 on entry to each pass
	C edx	dst base, never modified
	C esi	scratch
	C edi	scratch
	C ebp	untouched

	C Touch the lowest dst limb of this 8-limb group, again purely for
	C the cache; esi is reloaded two instructions later.
	movl	28-32(%edx,%ecx,4), %esi
	subl	$8, %ecx			C sets the flags tested by jg below

	C Limbs are loaded two at a time and stored two at a time.  After
	C the subl above, offsets 56 down to 28 address the top 8 limbs
	C still to be written in dst.
	movl	(%eax), %esi
	movl	-4(%eax), %edi
	movl	%esi, 56(%edx,%ecx,4)
	movl	%edi, 52(%edx,%ecx,4)

	movl	-8(%eax), %esi
	movl	-12(%eax), %edi
	movl	%esi, 48(%edx,%ecx,4)
	movl	%edi, 44(%edx,%ecx,4)

	movl	-16(%eax), %esi
	movl	-20(%eax), %edi
	movl	%esi, 40(%edx,%ecx,4)
	movl	%edi, 36(%edx,%ecx,4)

	movl	-24(%eax), %esi
	movl	-28(%eax), %edi
	movl	%esi, 32(%edx,%ecx,4)
	movl	%edi, 28(%edx,%ecx,4)

	C leal is used for the pointer step because it leaves the flags
	C alone; jg still sees the result of the subl at the top of the pass.
	leal	-32(%eax), %eax
	jg	L(blk8)				C loop while at least 8 limbs remain


L(rest):
	C State on arrival:
	C ecx	-7 to 0, meaning 0 to 7 limbs remain respectively
	C eax	address of the highest source limb not yet copied
	C edx	dst base

	addl	$4, %ecx			C ecx = remaining-3
	jle	L(skip4)			C fewer than 4 limbs remain

	C Copy 4 limbs, again as two load pairs and two store pairs.
	movl	(%eax), %esi
	movl	-4(%eax), %edi
	movl	%esi, 8(%edx,%ecx,4)
	movl	%edi, 4(%edx,%ecx,4)

	movl	-8(%eax), %esi
	movl	-12(%eax), %edi
	movl	%esi, (%edx,%ecx,4)
	movl	%edi, -4(%edx,%ecx,4)

	subl	$16, %eax
	subl	$4, %ecx			C ecx is remaining-3 once more
L(skip4):

	addl	$2, %ecx			C ecx = remaining-1
	jle	L(skip2)			C 0 or 1 limb remains

	C Copy 2 limbs.
	movl	(%eax), %esi
	movl	-4(%eax), %edi
	movl	%esi, (%edx,%ecx,4)
	movl	%edi, -4(%edx,%ecx,4)

	subl	$8, %eax
	subl	$2, %ecx			C ecx is remaining-1 once more
L(skip2):

	C The flags here come from addl $2 when the branch above was taken,
	C or from subl $2 otherwise.  Either way ZF is set exactly when one
	C limb is left.
	jnz	L(exit)

	C Final single limb: it is the lowest one, so it goes to dst[0].
	movl	(%eax), %ecx
	movl	%ecx, (%edx)			C risk of cache bank clash here

L(exit):
	popl	%edi
	popl	%esi

	ret

EPILOGUE()