dnl  Intel Pentium mpn_lshift -- mpn left shift.

dnl  Copyright 1992, 1994, 1995, 1996, 1999, 2000, 2002 Free Software
dnl  Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C         cycles/limb
C P5,P54:    6.0
C P55:       5.375


C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                       unsigned shift);
C
C Shifts the size-limb number at src left by shift bits and stores the
C size-limb result at dst.  The value returned in eax is the carry limb,
C i.e. the bits shifted out of the highest-addressed source limb.
C
C Two code paths are used:
C   normal  - any shift count, walks from the highest address downwards
C             using shldl.
C   special - shift count 1 only, walks from the lowest address upwards
C             using an add-with-carry chain, chosen only when the pointer
C             tests at entry allow that direction.
C
C Registers edi, esi, ebx and ebp are saved on entry and restored before
C each ret.
C
C The main shift-by-N loop should run at 5.375 c/l and that's what P55 does,
C but P5 and P54 run only at 6.0 c/l, which is 4 cycles lost somewhere.

defframe(PARAM_SHIFT,16)
defframe(PARAM_SIZE, 12)
defframe(PARAM_SRC,  8)
defframe(PARAM_DST,  4)

	TEXT
	ALIGN(8)
PROLOGUE(mpn_lshift)

	pushl	%edi
	pushl	%esi
	pushl	%ebx
	pushl	%ebp
C Four registers pushed above, 16 bytes in total.  NOTE(review): FRAME
C presumably tells the PARAM_ macros about that offset; confirm in config.m4.
deflit(`FRAME',16)

	movl	PARAM_DST,%edi		C edi = dst
	movl	PARAM_SRC,%esi		C esi = src
	movl	PARAM_SIZE,%ebp		C ebp = size in limbs
	movl	PARAM_SHIFT,%ecx	C ecx = shift count, used via cl

C We can use faster code for shift-by-1 under certain conditions.
	cmp	$1,%ecx
	jne	L(normal)
	leal	4(%esi),%eax
	cmpl	%edi,%eax
	jnc	L(special)		C jump if s_ptr + 1 >= res_ptr
	leal	(%esi,%ebp,4),%eax
	cmpl	%eax,%edi
	jnc	L(special)		C jump if res_ptr >= s_ptr + size

C General path.  Works from the highest-addressed limb down to the lowest.
L(normal):
	leal	-4(%edi,%ebp,4),%edi	C edi = address of dst limb size-1
	leal	-4(%esi,%ebp,4),%esi	C esi = address of src limb size-1

	movl	(%esi),%edx		C edx = highest source limb
	subl	$4,%esi
	xorl	%eax,%eax
	shldl(	%cl, %edx, %eax)	C compute carry limb
	pushl	%eax			C push carry limb onto stack

	decl	%ebp			C ebp = size-1, source limbs left to load
	pushl	%ebp			C keep size-1 for the leftover loop
	shrl	$3,%ebp			C ebp = number of 8-limb iterations
	jz	L(end)			C none, go straight to the leftovers

	movl	(%edi),%eax		C fetch destination cache line

C Unrolled loop, 8 limbs per iteration.  On entry to each iteration edx
C holds the source limb just above the ones about to be loaded.  The
C registers ebx, eax and edx rotate through the roles of high limb and
C low limb for successive shldl pairs.
	ALIGN(4)
L(oop):	movl	-28(%edi),%eax		C fetch destination cache line
	movl	%edx,%ebx

	movl	(%esi),%eax
	movl	-4(%esi),%edx
	shldl(	%cl, %eax, %ebx)
	shldl(	%cl, %edx, %eax)
	movl	%ebx,(%edi)
	movl	%eax,-4(%edi)

	movl	-8(%esi),%ebx
	movl	-12(%esi),%eax
	shldl(	%cl, %ebx, %edx)
	shldl(	%cl, %eax, %ebx)
	movl	%edx,-8(%edi)
	movl	%ebx,-12(%edi)

	movl	-16(%esi),%edx
	movl	-20(%esi),%ebx
	shldl(	%cl, %edx, %eax)
	shldl(	%cl, %ebx, %edx)
	movl	%eax,-16(%edi)
	movl	%edx,-20(%edi)

	movl	-24(%esi),%eax
	movl	-28(%esi),%edx
	shldl(	%cl, %eax, %ebx)
	shldl(	%cl, %edx, %eax)
	movl	%ebx,-24(%edi)
	movl	%eax,-28(%edi)

	subl	$32,%esi
	subl	$32,%edi
	decl	%ebp
	jnz	L(oop)

L(end):	popl	%ebp			C ebp = size-1 again
	andl	$7,%ebp			C limbs not covered by the unrolled loop
	jz	L(end2)
C Leftover loop, one limb per iteration, same direction as above.
L(oop2):
	movl	(%esi),%eax
	shldl(	%cl,%eax,%edx)
	movl	%edx,(%edi)
	movl	%eax,%edx		C this limb is the high limb next time
	subl	$4,%esi
	subl	$4,%edi
	decl	%ebp
	jnz	L(oop2)

L(end2):
	shll	%cl,%edx		C compute least significant limb
	movl	%edx,(%edi)		C store it

	popl	%eax			C pop carry limb

	popl	%ebp
	popl	%ebx
	popl	%esi
	popl	%edi
	ret


C Shift-by-1 path.
C
C We loop from the least significant end of the arrays, which is only
C permissible if the source and destination don't overlap, since the
C function is documented to work for overlapping source and destination.
C The two pointer comparisons at function entry decide whether this path
C may be taken.
C
C Each limb is doubled with an add, and the bit shifted out of one limb
C travels to the next limb through the carry flag, so the carry flag must
C stay intact between consecutive adcl instructions.

L(special):
	movl	(%esi),%edx		C edx = lowest source limb
	addl	$4,%esi

	decl	%ebp			C ebp = size-1, source limbs left to load
	pushl	%ebp			C keep size-1 for the leftover loop
	shrl	$3,%ebp			C ebp = number of 8-limb iterations

	addl	%edx,%edx		C shift lowest limb, top bit goes to carry
	incl	%ebp			C inc then dec sets the zero flag from ebp
	decl	%ebp			C without disturbing the carry flag
	jz	L(Lend)

	movl	(%edi),%eax		C fetch destination cache line

C Unrolled loop, 8 limbs per iteration.  On entry to each iteration edx
C holds an already shifted limb that has not been stored yet.
	ALIGN(4)
L(Loop):
	movl	28(%edi),%eax		C fetch destination cache line
	movl	%edx,%ebx

	movl	(%esi),%eax
	movl	4(%esi),%edx
	adcl	%eax,%eax
	movl	%ebx,(%edi)
	adcl	%edx,%edx
	movl	%eax,4(%edi)

	movl	8(%esi),%ebx
	movl	12(%esi),%eax
	adcl	%ebx,%ebx
	movl	%edx,8(%edi)
	adcl	%eax,%eax
	movl	%ebx,12(%edi)

	movl	16(%esi),%edx
	movl	20(%esi),%ebx
	adcl	%edx,%edx
	movl	%eax,16(%edi)
	adcl	%ebx,%ebx
	movl	%edx,20(%edi)

	movl	24(%esi),%eax
	movl	28(%esi),%edx
	adcl	%eax,%eax
	movl	%ebx,24(%edi)
	adcl	%edx,%edx
	movl	%eax,28(%edi)

	leal	32(%esi),%esi		C use leal not to clobber carry
	leal	32(%edi),%edi
	decl	%ebp
	jnz	L(Loop)

L(Lend):
	popl	%ebp			C ebp = size-1 again
	sbbl	%eax,%eax		C save carry in %eax
	andl	$7,%ebp			C leftover limb count, and clobbers carry
	jz	L(Lend2)
	addl	%eax,%eax		C restore carry from eax
C Leftover loop, one limb per iteration.
L(Loop2):
	movl	%edx,%ebx
	movl	(%esi),%edx
	adcl	%edx,%edx
	movl	%ebx,(%edi)

	leal	4(%esi),%esi		C use leal not to clobber carry
	leal	4(%edi),%edi
	decl	%ebp
	jnz	L(Loop2)

	jmp	L(L1)
L(Lend2):
	addl	%eax,%eax		C restore carry from eax
L(L1):	movl	%edx,(%edi)		C store last limb

	sbbl	%eax,%eax		C eax = 0 or -1 from the final carry
	negl	%eax			C return value 0 or 1

	popl	%ebp
	popl	%ebx
	popl	%esi
	popl	%edi
	ret

EPILOGUE()