1dnl Intel Pentium mpn_copyi -- copy limb vector, incrementing. 2 3dnl Copyright 1996, 2001, 2002, 2006 Free Software Foundation, Inc. 4dnl 5dnl This file is part of the GNU MP Library. 6dnl 7dnl The GNU MP Library is free software; you can redistribute it and/or 8dnl modify it under the terms of the GNU Lesser General Public License as 9dnl published by the Free Software Foundation; either version 3 of the 10dnl License, or (at your option) any later version. 11dnl 12dnl The GNU MP Library is distributed in the hope that it will be useful, 13dnl but WITHOUT ANY WARRANTY; without even the implied warranty of 14dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15dnl Lesser General Public License for more details. 16dnl 17dnl You should have received a copy of the GNU Lesser General Public License 18dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 19 20include(`../config.m4') 21 22 23C P5: 1.25 cycles/limb 24 25 26C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size); 27C 28C Destination prefetching is done to avoid repeated write-throughs on lines 29C not already in L1. 30C 31C At least one of the src or dst pointer needs to be incremented rather than 32C using indexing, so that there's somewhere to put the loop control without 33C an AGI. Incrementing one and not two lets us keep loop overhead to 2 34C cycles. Making it the src pointer incremented avoids an AGI on the %ecx 35C subtracts in the finishup code. 36C 37C The block of finishup code is almost as big as the main loop itself, which 38C is unfortunate, but it's faster that way than with say rep movsl, by about 39C 10 cycles for instance on P55. 40C 41C There's nothing to be gained from MMX on P55, since it can do only one 42C movq load (or store) per cycle, so the throughput would be the same as the 43C code here (and even then only if src and dst have the same alignment mod 44C 8). 
C Stack parameters of mpn_copyi (dst, src, size).
C NOTE(review): the numbers are presumably byte offsets from the stack
C pointer at entry, adjusted through FRAME by the defframe macro -- confirm
C against config.m4.
defframe(PARAM_SIZE,12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)

C Copy size limbs from src to dst, working from low addresses upwards.
C
C Structure:
C   1. Setup: edx = &dst[size], ecx = 7-size, esi = src.
C   2. Main loop, entered only when size >= 8: 8 limbs per iteration,
C      touching the destination before storing to it.
C   3. Tail: the remaining 0 to 7 limbs are copied as an optional group of
C      4, an optional group of 2, and an optional single limb.
C
C All destination addressing is &dst[size] plus a negative limb index held
C in ecx, so only the source pointer is incremented.
C
C Registers ebx and esi are pushed on entry and popped before returning.
C eax, ecx and edx are overwritten and not restored.

	TEXT
	ALIGN(8)
PROLOGUE(mpn_copyi)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx
	movl	PARAM_DST, %edx

	C NOTE(review): FRAME_pushl presumably updates FRAME so that the
	C PARAM_SRC reference below still resolves correctly after these two
	C pushes -- confirm against config.m4.
	pushl	%ebx	FRAME_pushl()
	pushl	%esi	FRAME_pushl()

	leal	(%edx,%ecx,4), %edx	C &dst[size], one past the last limb
	xorl	$-1, %ecx		C -size-1

	movl	PARAM_SRC, %esi
	addl	$8, %ecx		C -size+7

	C Flags are from the addl above, the movl in between does not change
	C them.  Non-negative means size <= 7, so skip the main loop.
	jns	L(end)

	C Here ecx = 7-size, so -28(%edx,%ecx,4) is &dst[0].  The loaded value
	C is not used, eax is overwritten in the loop.
	movl	-28(%edx,%ecx,4), %eax	C fetch destination cache line, dst[0]
	C NOTE(review): the nop is presumably there for instruction pairing or
	C loop alignment -- confirm.
	nop

L(top):
	C eax	scratch
	C ebx	scratch
	C ecx	counter, limbs, negative
	C edx	&dst[size]
	C esi	src, incrementing
	C edi
	C ebp
	C
	C On entry to each iteration ecx = 7-remaining with remaining >= 8,
	C where remaining is the number of limbs still to copy.

	C This load addresses the limb 7 places beyond the next store
	C location, that is the last limb of the block of 8 about to be
	C written.  The value is discarded, eax is reloaded just below.
	movl	(%edx,%ecx,4), %eax	C fetch destination cache line
	C Advance the counter now.  The flags set here are the ones tested by
	C the js at the bottom, since movl and leal leave flags unchanged.
	addl	$8, %ecx

	C ecx has already been advanced by 8 limbs (32 bytes), hence the store
	C displacements run from -60 to -32 rather than from -28 to 0.
	movl	(%esi), %eax		C read words pairwise
	movl	4(%esi), %ebx
	movl	%eax, -60(%edx,%ecx,4)	C store words pairwise
	movl	%ebx, -56(%edx,%ecx,4)

	movl	8(%esi), %eax
	movl	12(%esi), %ebx
	movl	%eax, -52(%edx,%ecx,4)
	movl	%ebx, -48(%edx,%ecx,4)

	movl	16(%esi), %eax
	movl	20(%esi), %ebx
	movl	%eax, -44(%edx,%ecx,4)
	movl	%ebx, -40(%edx,%ecx,4)

	movl	24(%esi), %eax
	movl	28(%esi), %ebx
	movl	%eax, -36(%edx,%ecx,4)
	movl	%ebx, -32(%edx,%ecx,4)

	leal	32(%esi), %esi		C src += 8 limbs, leal keeps the flags
	js	L(top)			C loop while at least 8 limbs remain


L(end):
	C ecx	0 to 7, representing respectively 7 to 0 limbs remaining
	C esi	src, next limb to read
	C edx	&dst[size], unchanged; stores are addressed through ecx

	C ecx = 3-remaining after the subtract, negative when remaining >= 4.
	subl	$4, %ecx
	jns	L(no4)

	C Copy 4 limbs.  With ecx = 3-remaining, -12(%edx,%ecx,4) is the next
	C destination limb.
	movl	(%esi), %eax
	movl	4(%esi), %ebx
	movl	%eax, -12(%edx,%ecx,4)
	movl	%ebx, -8(%edx,%ecx,4)

	movl	8(%esi), %eax
	movl	12(%esi), %ebx
	movl	%eax, -4(%edx,%ecx,4)
	movl	%ebx, (%edx,%ecx,4)

	addl	$16, %esi
	addl	$4, %ecx		C again 3-remaining, now 4 limbs fewer remain
L(no4):

	C On both paths ecx = 3-remaining with remaining 0 to 3.  After the
	C subtract ecx = 1-remaining, negative when remaining >= 2.
	subl	$2, %ecx
	jns	L(no2)

	C Copy 2 limbs.  With ecx = 1-remaining, -4(%edx,%ecx,4) is the next
	C destination limb.
	movl	(%esi), %eax
	movl	4(%esi), %ebx
	movl	%eax, -4(%edx,%ecx,4)
	movl	%ebx, (%edx,%ecx,4)

	addl	$8, %esi
	addl	$2, %ecx		C again 1-remaining, now 2 limbs fewer remain
L(no2):

	C Flags come from the subl $2 when the branch to no2 was taken, or
	C from the addl $2 otherwise.  In both cases ecx = 1-remaining with
	C remaining 0 or 1, so zero means exactly one limb is left.
	jnz	L(done)

	C ecx is 0 here, so the store goes to dst[size-1].
	movl	(%esi), %eax
	movl	%eax, -4(%edx,%ecx,4)	C risk of cache bank clash here

L(done):
	C Restore the registers pushed at entry, in reverse order.
	popl	%esi
	popl	%ebx

	ret

EPILOGUE()