dnl  AMD K7 mpn_copyd -- copy limb vector, decrementing.

dnl  Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C    alignment dst/src, A=0mod8 N=4mod8
C       A/A   A/N   N/A   N/N
C K7    0.75  1.0   1.0   0.75


C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
C
C The various comments in mpn/x86/k7/copyi.asm apply here too.
defframe(PARAM_SIZE,12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)
deflit(`FRAME',0)

dnl  The incoming argument slots double as spill space: the size slot holds
dnl  the saved ebx once size has been read, and the src slot briefly holds
dnl  esi while it is borrowed as scratch.
define(SAVE_EBX,`PARAM_SIZE')
define(SAVE_ESI,`PARAM_SRC')

dnl  The MMX path needs at least 5 limbs, so anything shorter is sent to the
dnl  one-limb-at-a-time loop.
deflit(UNROLL_THRESHOLD, 5)

	TEXT
	ALIGN(32)
PROLOGUE(mpn_copyd)

	C  Register roles for the whole routine:
	C    eax  src
	C    edx  dst
	C    ecx  limb counter
	C    ebx  scratch (caller value kept in SAVE_EBX)

	movl	PARAM_SIZE, %ecx
	movl	%ebx, SAVE_EBX

	movl	PARAM_SRC, %eax
	movl	PARAM_DST, %edx

	cmpl	$UNROLL_THRESHOLD, %ecx
	jae	L(big)

	C  Fewer than UNROLL_THRESHOLD limbs.  A size of zero copies nothing.
	orl	%ecx, %ecx
	jz	L(small_ret)

L(small_loop):
	C  Plain 32-bit copy of limb ecx-1, walking from the top limb down to
	C  limb 0.
	C
	C  this loop is 2 cycles/limb

	movl	-4(%eax,%ecx,4), %ebx
	movl	%ebx, -4(%edx,%ecx,4)
	decl	%ecx
	jnz	L(small_loop)

L(small_ret):
	movl	SAVE_EBX, %ebx
	ret


L(big):
	C  Form the end address of each operand and AND them together, so one
	C  test of bit 2 tells whether both ends have that bit set.  esi is only
	C  borrowed for the second end address and is restored straight away.
	movl	%esi, SAVE_ESI
	leal	(%eax,%ecx,4), %ebx
	leal	(%edx,%ecx,4), %esi

	andl	%esi, %ebx
	movl	SAVE_ESI, %esi
	subl	$4, %ecx		C ecx = size-4

	testl	$4, %ebx   C testl to pad code closer to 16 bytes for the quad loop
	jz	L(paired)

	C  Bit 2 is set in both end addresses, so src and dst are both
	C  unaligned.  Move the top limb on its own to align them for the
	C  8-byte moves below.
	movl	12(%eax,%ecx,4), %ebx
	movl	%ebx, 12(%edx,%ecx,4)
	decl	%ecx
L(paired):


	ALIGN(16)
L(quad):
	C  eax	src
	C  ebx
	C  ecx	counter, limbs (remaining count minus 4)
	C  edx	dst
	C
	C  Four limbs per pass as two 8-byte moves, highest pair first.  The
	C  stores are issued after the subl, hence the extra 16 in their
	C  displacements.  movq leaves the flags alone, so jns still sees the
	C  sign of the subl result.

	movq	8(%eax,%ecx,4), %mm0
	movq	(%eax,%ecx,4), %mm1
	subl	$4, %ecx
	movq	%mm0, 16+8(%edx,%ecx,4)
	movq	%mm1, 16(%edx,%ecx,4)
	jns	L(quad)


	C  ecx is now -4, -3, -2 or -1, meaning 0, 1, 2 or 3 limbs are still to
	C  be copied at the bottom.  The low two bits of ecx hold that count.

	testb	$2, %cl
	jz	L(tail_odd)

	C  Two (or three) limbs left: move a pair with one more 8-byte copy.
	movq	8(%eax,%ecx,4), %mm0
	movq	%mm0, 8(%edx,%ecx,4)
L(tail_odd):

	testb	$1, %cl
	jz	L(ret_mmx)

	C  Odd count: the last outstanding limb is limb 0.
	movl	(%eax), %ebx
	movl	%ebx, (%edx)

L(ret_mmx):
	C  Restore ebx and clear the MMX state before returning.
	movl	SAVE_EBX, %ebx
	emms
	ret


EPILOGUE()