dnl  AMD K6-2 mpn_rshift -- mpn right shift.

dnl  Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C K6-2: 1.75 cycles/limb


C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                       unsigned shift);
C
C Shift the size-limb number at src right by shift bits and store the
C size-limb result at dst.  The return value is src[0] << (32-shift), that
C is the bits shifted out at the low end, positioned at the top of the
C returned limb.
C
C Syntax is AT&T, run through m4.  Arguments are fetched through the
C PARAM_* frame macros below.  %ebx is pushed on entry and popped on every
C exit path; %eax, %ecx and %edx are used freely.
C
C Three paths are taken depending on size:
C   size == 1                       integer shrd/shr only, no MMX
C   2 <= size < UNROLL_THRESHOLD    L(simple), one limb per iteration
C   size >= UNROLL_THRESHOLD        L(unroll)/L(top), four limbs per iteration
C The two MMX paths use mm0-mm2, mm6 and mm7, and execute femms before
C returning.
C
C NOTE(review): the code appears to assume size >= 1 and 1 <= shift <= 31.
C With size == 0 the decl below gives -1 and src[0] is read; with
C shift == 0 the "32-shift" count handed to shll would be 32.  Presumably
C callers guarantee both -- confirm against the mpn_rshift documentation.

C Stack frame.  defframe is defined outside this file; presumably the
C numbers are byte offsets from %esp at entry, corrected by FRAME as pushes
C are recorded -- confirm in the m4 definitions.
defframe(PARAM_SHIFT,16)
defframe(PARAM_SIZE, 12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)
deflit(`FRAME',0)

dnl  Minimum 9, because the unrolled loop can't handle less.
dnl
deflit(UNROLL_THRESHOLD, 9)

        TEXT
        ALIGN(32)

PROLOGUE(mpn_rshift)
deflit(`FRAME',0)

        C The 1 limb case can be done without the push %ebx, but it's then
        C still the same speed.  The push is left as a free helping hand for
        C the two_or_more code.

        movl    PARAM_SIZE, %eax
        pushl   %ebx                    FRAME_pushl()

        movl    PARAM_SRC, %ebx
        decl    %eax                    C eax = size-1, sets ZF for the jnz

        movl    PARAM_SHIFT, %ecx       C movl leaves the decl flags intact
        jnz     L(two_or_more)          C taken when size-1 != 0

        C size == 1 from here, so eax is 0.

        movl    (%ebx), %edx            C src limb
        movl    PARAM_DST, %ebx         C src pointer no longer needed

        C shrdl() is an m4 wrapper defined outside this file; presumably it
        C emits a plain shrd.  With eax == 0 the result is the low "shift"
        C bits of the src limb moved to the top of eax.
        shrdl(  %cl, %edx, %eax)        C return value

        shrl    %cl, %edx

        movl    %edx, (%ebx)            C dst limb
        popl    %ebx

        ret                             C no MMX used on this path, no femms


C -----------------------------------------------------------------------------
        ALIGN(16)       C avoid offset 0x1f
L(two_or_more):
        C eax   size-1
        C ebx   src
        C ecx   shift
        C edx

        movl    (%ebx), %edx            C src low limb
        negl    %ecx

        addl    $32, %ecx               C ecx = 32-shift
        movd    PARAM_SHIFT, %mm6       C mm6 = shift, count for psrlq

        shll    %cl, %edx               C edx = src[0] << (32-shift), retval
        cmpl    $UNROLL_THRESHOLD-1, %eax

        jae     L(unroll)               C unsigned: size-1 >= UNROLL_THRESHOLD-1


        C eax   size-1
        C ebx   src
        C ecx   32-shift
        C edx   retval
        C
        C mm6   shift

        movl    PARAM_DST, %ecx
        leal    (%ebx,%eax,4), %ebx     C ebx = &src[size-1]

        leal    -4(%ecx,%eax,4), %ecx   C ecx = &dst[size-2]
        negl    %eax                    C eax = -(size-1)

        C This loop runs at about 3 cycles/limb, which is the amount of
        C decoding, and this is despite every second access being unaligned.
        C
        C Each pass loads the qword src[i],src[i+1], shifts it right and
        C stores only the low dword as dst[i].  The load is indexed with eax
        C before the incl and the store with eax after it, which is why the
        C dst base is one limb lower than the src base.

L(simple):
        C eax   counter, -(size-1) to -1
        C ebx   &src[size-1]
        C ecx   &dst[size-2]
        C edx   retval
        C
        C mm0   scratch
        C mm6   shift

        C Zdisp is defined outside this file; presumably it forces an
        C explicit zero displacement in the encoding -- confirm in the m4
        C definitions.
Zdisp(  movq,   0,(%ebx,%eax,4), %mm0)
        incl    %eax                    C sets ZF for the jnz below

        psrlq   %mm6, %mm0

Zdisp(  movd,   %mm0, 0,(%ecx,%eax,4))
        jnz     L(simple)               C flags still from incl, MMX ops leave them


        C mm0 still holds the shifted qword src[size-2],src[size-1]; storing
        C all of it rewrites dst[size-2] and supplies dst[size-1].
        movq    %mm0, (%ecx)
        movl    %edx, %eax              C retval

        popl    %ebx

        femms                           C leave MMX state before returning
        ret


C -----------------------------------------------------------------------------
        ALIGN(16)
L(unroll):
        C eax   size-1
        C ebx   src
        C ecx   32-shift
        C edx   retval
        C
        C mm6   shift

        addl    $32, %ecx               C ecx = 64-shift
        subl    $7, %eax                C size-8

        movd    %ecx, %mm7              C mm7 = 64-shift, count for psllq
        movl    PARAM_DST, %ecx

        movq    (%ebx), %mm2            C src low qword
        leal    (%ebx,%eax,4), %ebx     C src end - 32

        testb   $4, %cl                 C bit 2 of the dst address
        leal    (%ecx,%eax,4), %ecx     C dst end - 32

        notl    %eax                    C -(size-7)
        jz      L(dst_aligned)          C flags from testb, leal/notl leave them

        C dst has bit 2 set: write one limb on its own first so that the
        C qword stores in the loop start one limb further on.

        psrlq   %mm6, %mm2
        incl    %eax                    C -(size-8), one limb already done

Zdisp(  movd,   %mm2, 0,(%ecx,%eax,4))  C dst low limb
        movq    4(%ebx,%eax,4), %mm2    C new src low qword
L(dst_aligned):

        movq    12(%ebx,%eax,4), %mm0   C src second lowest qword
        nop                             C avoid bad cache line crossing


        C This loop is the important bit, the rest is just support for it.
        C Four src limbs are held at the start, and four more will be read.
        C Four dst limbs will be written.  This schedule seems necessary for
        C full speed.
        C
        C The use of -(size-7) lets the loop stop when %eax becomes >= 0 and
        C leaves 0 to 3 which can be tested with test $1 and $2.

L(top):
        C eax   counter, -(size-7) step by +4 until >=0
        C ebx   src end - 32
        C ecx   dst end - 32
        C edx   retval
        C
        C mm0   src next qword
        C mm1   scratch
        C mm2   src prev qword
        C mm6   shift
        C mm7   64-shift

        psrlq   %mm6, %mm2
        addl    $4, %eax                C sets CF/ZF for the ja at the bottom

        movq    %mm0, %mm1
        psllq   %mm7, %mm0

        por     %mm0, %mm2              C (prev >> shift) | (next << (64-shift))
        movq    4(%ebx,%eax,4), %mm0

        psrlq   %mm6, %mm1
        movq    %mm2, -12(%ecx,%eax,4)

        movq    %mm0, %mm2
        psllq   %mm7, %mm0

        por     %mm0, %mm1
        movq    12(%ebx,%eax,4), %mm0

        movq    %mm1, -4(%ecx,%eax,4)
        ja      L(top)                  C jump if no carry and not zero



        C Now have the four limbs in mm2 (low) and mm0 (high), and %eax is 0
        C to 3 representing respectively 3 to 0 further limbs.

        testl   $2, %eax        C testl to avoid bad cache line crossings
        jnz     L(finish_nottwo)

        C Two or three extra limbs: rshift mm2, OR it with lshifted mm0, mm0
        C becomes new mm2 and a new mm0 is loaded.

        psrlq   %mm6, %mm2
        movq    %mm0, %mm1

        psllq   %mm7, %mm0
        addl    $2, %eax                C eax now 2 or 3

        por     %mm0, %mm2
        movq    12(%ebx,%eax,4), %mm0

        movq    %mm2, -4(%ecx,%eax,4)
        movq    %mm1, %mm2
L(finish_nottwo):

        C eax is 2 or 3 here; bit 0 set means no further limb remains.

        testb   $1, %al                 C sets ZF for the jnz below
        psrlq   %mm6, %mm2

        movq    %mm0, %mm1
        psllq   %mm7, %mm0

        por     %mm0, %mm2
        psrlq   %mm6, %mm1

        movq    %mm2, 4(%ecx,%eax,4)
        jnz     L(finish_even)          C flags still from testb


        C one further extra limb to process

        movd    32-4(%ebx), %mm0        C src[size-1], most significant limb
        popl    %ebx                    C last use of ebx was the load above

        movq    %mm0, %mm2
        psllq   %mm7, %mm0

        por     %mm0, %mm1
        psrlq   %mm6, %mm2

        movq    %mm1, 32-12(%ecx)       C dst[size-3,size-2]
        movd    %mm2, 32-4(%ecx)        C dst[size-1]

        movl    %edx, %eax              C retval

        femms                           C leave MMX state before returning
        ret


        nop                             C avoid bad cache line crossing
L(finish_even):
        C no further extra limbs

        movq    %mm1, 32-8(%ecx)        C dst[size-2,size-1]
        movl    %edx, %eax              C retval

        popl    %ebx

        femms                           C leave MMX state before returning
        ret

EPILOGUE()