dnl  AMD K6-2 mpn_lshift -- mpn left shift.

dnl  Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C K6-2: 1.75 cycles/limb


C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                       unsigned shift);
C
C Shift the size limbs at src left by shift bits and store the size result
C limbs at dst.  The return value is the bits shifted out of the most
C significant limb, held in the low bits of the returned limb.
C
C Limbs are 32 bits, and they are processed from the most significant end
C down to the least significant.
C
C NOTE(review): nothing here checks the arguments.  The code looks to rely
C on size >= 1 and 1 <= shift <= 31 -- confirm against the mpn_lshift
C documentation.
C
C Three code paths are used,
C
C     size == 1                   integer shldl and shll, no MMX
C     size <  UNROLL_THRESHOLD    MMX, one limb per iteration
C     size >= UNROLL_THRESHOLD    MMX, four limbs per iteration
C
C The only general register saved and restored is ebx.  The MMX paths use
C mm0, mm1, mm2, mm6 and mm7 and execute femms before returning.

defframe(PARAM_SHIFT,16)
defframe(PARAM_SIZE, 12)
defframe(PARAM_SRC,  8)
defframe(PARAM_DST,  4)
deflit(`FRAME',0)

dnl  used after src has been fetched
dnl  (the return value is parked in the stack slot of the src parameter)
define(VAR_RETVAL,`PARAM_SRC')

dnl  minimum 9, because unrolled loop can't handle less
deflit(UNROLL_THRESHOLD, 9)

	TEXT
	ALIGN(32)

PROLOGUE(mpn_lshift)
deflit(`FRAME',0)

	C The 1 limb case can be done without the push %ebx, but it's then
	C still the same speed.  The push is left as a free helping hand for
	C the two_or_more code.

	movl	PARAM_SIZE, %eax
	pushl	%ebx			FRAME_pushl()

	movl	PARAM_SRC, %ebx
	decl	%eax			C size-1, zero flag set when size is 1

	movl	PARAM_SHIFT, %ecx	C movl leaves the flags from decl alone
	jnz	L(two_or_more)

	C size is 1.  eax is zero at this point, so the shldl leaves in it
	C exactly the bits shifted out of the top of the limb.

	movl	(%ebx), %edx		C src limb
	movl	PARAM_DST, %ebx		C src pointer is no longer needed

	shldl(	%cl, %edx, %eax)	C return value

	shll	%cl, %edx

	movl	%edx, (%ebx)		C dst limb
	popl	%ebx

	ret


C -----------------------------------------------------------------------------
	ALIGN(16)	C avoid offset 0x1f
L(two_or_more):
	C eax	size-1
	C ebx	src
	C ecx	shift
	C edx
	C
	C Form the return value from the high src limb, save it in
	C VAR_RETVAL, and choose between the simple and unrolled loops.

	movl	(%ebx,%eax,4), %edx	C src high limb
	negl	%ecx

	movd	PARAM_SHIFT, %mm6
	addl	$32, %ecx		C 32-shift

	shrl	%cl, %edx		C bits shifted out of the high limb
	cmpl	$UNROLL_THRESHOLD-1, %eax

	movl	%edx, VAR_RETVAL	C movl leaves the flags from cmpl alone
	jae	L(unroll)		C taken when size >= UNROLL_THRESHOLD


	movd	%ecx, %mm7
	movl	%eax, %ecx

	movl	PARAM_DST, %eax

L(simple):
	C eax	dst
	C ebx	src
	C ecx	counter, size-1 to 1
	C edx	retval
	C
	C mm0	scratch
	C mm6	shift
	C mm7	32-shift
	C
	C Each pass loads the qword made of src[counter-1] (low half) and
	C src[counter] (high half).  Shifting that right by 32-shift leaves
	C dst[counter] in the low dword, which movd then stores.
	C
	C NOTE(review): Zdisp is defined outside this file, presumably it
	C forces the zero displacement to be encoded explicitly -- confirm.

	movq	-4(%ebx,%ecx,4), %mm0

	psrlq	%mm7, %mm0

Zdisp(	movd,	%mm0, 0,(%eax,%ecx,4))
	loop	L(simple)


	C Lowest limb, dst[0] is just src[0] shifted left, nothing comes in
	C from below.

	movd	(%ebx), %mm0
	popl	%ebx

	psllq	%mm6, %mm0

	movd	%mm0, (%eax)
	movl	%edx, %eax		C return value

	femms
	ret


C -----------------------------------------------------------------------------
	ALIGN(16)
L(unroll):
	C eax	size-1
	C ebx	src
	C ecx	32-shift
	C edx	retval (but instead VAR_RETVAL is used)
	C
	C mm6	shift
	C
	C Setup for the unrolled loop.  mm7 becomes 64-shift, eax becomes
	C size-8, and bit 2 of the address dst+4*eax is examined.  If that
	C bit is set the highest dst limb is stored on its own with movd and
	C eax is decremented, so that bit 2 of dst+4*eax is clear on reaching
	C dst_aligned.  The qword stores in the loop are at offsets from
	C dst+4*eax which are multiples of 8.

	addl	$32, %ecx		C 64-shift
	movl	PARAM_DST, %edx

	movd	%ecx, %mm7
	subl	$7, %eax			C size-8

	leal	(%edx,%eax,4), %ecx		C alignment of dst

	movq	32-8(%ebx,%eax,4), %mm2		C src high qword
	testb	$4, %cl

	jz	L(dst_aligned)
	psllq	%mm6, %mm2

	psrlq	$32, %mm2			C high dword of the shifted qword
	decl	%eax

	movd	%mm2, 32(%edx,%eax,4)		C dst high limb
	movq	32-8(%ebx,%eax,4), %mm2		C new src high qword
L(dst_aligned):

	movq	32-16(%ebx,%eax,4), %mm0	C src second highest qword


	C This loop is the important bit, the rest is just support for it.
	C Four src limbs are held at the start, and four more will be read.
	C Four dst limbs will be written.  This schedule seems necessary for
	C full speed.
	C
	C The use of size-8 lets the loop stop when %eax goes negative and
	C leaves -4 to -1 which can be tested with test $1 and $2.

L(top):
	C eax	counter, size-8 step by -4 until <0
	C ebx	src
	C ecx
	C edx	dst
	C
	C mm0	src next qword
	C mm1	scratch
	C mm2	src prev qword
	C mm6	shift
	C mm7	64-shift
	C
	C Each dst qword is the upper src qword shifted left by shift, ORed
	C with the next lower src qword shifted right by 64-shift.
	C
	C The jnc at the bottom tests the carry from the subl.  Only MMX
	C instructions come between the two, and none of them write the
	C flags.

	psllq	%mm6, %mm2
	subl	$4, %eax

	movq	%mm0, %mm1
	psrlq	%mm7, %mm0

	por	%mm0, %mm2
	movq	24(%ebx,%eax,4), %mm0

	psllq	%mm6, %mm1
	movq	%mm2, 40(%edx,%eax,4)

	movq	%mm0, %mm2
	psrlq	%mm7, %mm0

	por	%mm0, %mm1
	movq	16(%ebx,%eax,4), %mm0

	movq	%mm1, 32(%edx,%eax,4)
	jnc	L(top)


	C Now have four limbs in mm2 (prev) and mm0 (next), plus eax mod 4.
	C
	C 8(%ebx) is the next source, and 24(%edx) is the next destination.
	C %eax is between -4 and -1, representing respectively 0 to 3 extra
	C limbs that must be read.


	testl	$2, %eax	C testl to avoid bad cache line crossing
	jz	L(finish_nottwo)

	C Two more limbs: lshift mm2, OR it with rshifted mm0, mm0 becomes
	C new mm2 and a new mm0 is loaded.

	psllq	%mm6, %mm2
	movq	%mm0, %mm1

	psrlq	%mm7, %mm0
	subl	$2, %eax

	por	%mm0, %mm2
	movq	16(%ebx,%eax,4), %mm0

	movq	%mm2, 32(%edx,%eax,4)
	movq	%mm1, %mm2
L(finish_nottwo):


	C lshift mm2, OR with rshifted mm0, mm1 becomes lshifted mm0
	C
	C The flags from the testb are still intact at the jz below, the
	C MMX instructions in between do not write them.

	testb	$1, %al
	psllq	%mm6, %mm2

	movq	%mm0, %mm1
	psrlq	%mm7, %mm0

	por	%mm0, %mm2
	psllq	%mm6, %mm1

	movq	%mm2, 24(%edx,%eax,4)
	jz	L(finish_even)


	C Size is odd, so mm1 and one extra limb to process.
	C
	C src[0] is moved to the high dword and shifted right by 64-shift,
	C giving the bits that carry from src[0] into dst[1].  Those are
	C ORed into mm1.  mm2 gets src[0] shifted left, and its low dword is
	C dst[0].
	C
	C NOTE(review): FRAME is reset to 0 after the popl, presumably so
	C that VAR_RETVAL resolves against the restored stack pointer --
	C confirm against the defframe definition.

	movd	(%ebx), %mm0		C src[0]
	popl	%ebx
deflit(`FRAME',0)

	movq	%mm0, %mm2
	psllq	$32, %mm0

	psrlq	%mm7, %mm0

	psllq	%mm6, %mm2
	por	%mm0, %mm1

	movq	%mm1, 4(%edx)		C dst[1,2]
	movd	%mm2, (%edx)		C dst[0]

	movl	VAR_RETVAL, %eax

	femms
	ret


	nop	C avoid bad cache line crossing
L(finish_even):
deflit(`FRAME',4)
	C Size is even, so only mm1 left to process.
	C
	C ebx is still on the stack here, hence FRAME is 4 for the
	C VAR_RETVAL fetch which comes before the popl.

	movq	%mm1, (%edx)		C dst[0,1]
	movl	VAR_RETVAL, %eax

	popl	%ebx
	femms
	ret

EPILOGUE()