dnl  rshift.asm revision 1.1.1.1
dnl  AMD K7 mpn_rshift -- mpn right shift.

dnl  Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C       K7: 1.21 cycles/limb (at 16 limbs/loop).



dnl  Measured cycles/limb for the candidate unroll factors; 16 chosen as the
dnl  knee of the curve (32 gains almost nothing for twice the code size).
dnl
dnl  K7: UNROLL_COUNT  cycles/limb
dnl           4           1.51
dnl           8           1.26
dnl          16           1.21
dnl          32           1.2
dnl  Maximum possible with the current code is 64.

deflit(UNROLL_COUNT, 16)


C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                       unsigned shift);
C
C Shift src,size right by shift many bits and store the result in dst,size.
C Zeros are shifted in at the left.  The bits shifted out at the right are
C the return value.
C
C This code uses 64-bit MMX operations, which makes it possible to handle
C two limbs at a time, for a theoretical 1.0 cycles/limb.  Plain integer
C code, on the other hand, suffers from shrd being a vector path decode and
C running at 3 cycles back-to-back.
C
C Full speed depends on source and destination being aligned, and some hairy
C setups and finish-ups are done to arrange this for the loop.

dnl  Minimum size (in limbs) at which the unrolled loop is used; below this
dnl  the simple one-limb-at-a-time loop wins.  Same value for PIC and
dnl  non-PIC: the PIC computed-jump setup costs little extra here.
ifdef(`PIC',`
deflit(UNROLL_THRESHOLD, 10)
',`
deflit(UNROLL_THRESHOLD, 10)
')

C Incoming stack parameters (cdecl, relative to FRAME).
defframe(PARAM_SHIFT,16)
defframe(PARAM_SIZE, 12)
defframe(PARAM_SRC,  8)
defframe(PARAM_DST,  4)

C Callee-saved registers are stashed below %esp in this SAVE_SIZE area.
defframe(SAVE_EDI, -4)
defframe(SAVE_ESI, -8)
defframe(SAVE_EBX, -12)
deflit(SAVE_SIZE, 12)

	TEXT
	ALIGN(32)

PROLOGUE(mpn_rshift)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %eax
	movl	PARAM_SRC, %edx
	subl	$SAVE_SIZE, %esp
deflit(`FRAME',SAVE_SIZE)

	movl	PARAM_SHIFT, %ecx
	movl	%edi, SAVE_EDI

	movl	PARAM_DST, %edi
	decl	%eax
	jnz	L(more_than_one_limb)

	C Single limb: plain integer code, no MMX needed.
	movl	(%edx), %edx		C src limb

	C eax is zero here, so shrd shifts the low bits of the src limb into
	C the top of eax, producing exactly the return value (bits shifted out).
	shrdl(	%cl, %edx, %eax)	C eax was decremented to zero

	shrl	%cl, %edx

	movl	%edx, (%edi)		C dst limb
	movl	SAVE_EDI, %edi
	addl	$SAVE_SIZE, %esp

	ret


C -----------------------------------------------------------------------------
L(more_than_one_limb):
	C eax	size-1
	C ebx
	C ecx	shift
	C edx	src
	C esi
	C edi	dst
	C ebp

	movd	PARAM_SHIFT, %mm6	C rshift
	movd	(%edx), %mm5		C src low limb
	cmp	$UNROLL_THRESHOLD-1, %eax

	jae	L(unroll)
	leal	(%edx,%eax,4), %edx	C &src[size-1]
	leal	-4(%edi,%eax,4), %edi	C &dst[size-2]

	movd	(%edx), %mm4		C src high limb
	negl	%eax


	C Simple loop for small sizes: each iteration loads a limb pair,
	C shifts the 64-bit quantity right, and stores the low 32 bits,
	C walking upwards until the counter reaches zero.
L(simple_top):
	C eax	loop counter, limbs, negative
	C ebx
	C ecx	shift
	C edx	carry
	C edx	&src[size-1]
	C edi	&dst[size-2]
	C ebp
	C
	C mm0	scratch
	C mm4	src high limb
	C mm5	src low limb
	C mm6	shift

	movq	(%edx,%eax,4), %mm0
	incl	%eax

	psrlq	%mm6, %mm0

	movd	%mm0, (%edi,%eax,4)
	jnz	L(simple_top)


	C Finish up: highest dst limb is just src high limb >> shift, and
	C the return value is the bits shifted out of the low src limb.
	psllq	$32, %mm5
	psrlq	%mm6, %mm4

	psrlq	%mm6, %mm5
	movd	%mm4, 4(%edi)		C dst high limb

	movd	%mm5, %eax		C return value

	movl	SAVE_EDI, %edi
	addl	$SAVE_SIZE, %esp
	emms

	ret


C -----------------------------------------------------------------------------
	ALIGN(16)
L(unroll):
	C eax	size-1
	C ebx
	C ecx	shift
	C edx	src
	C esi
	C edi	dst
	C ebp
	C
	C mm5	src low limb
	C mm6	rshift

	testb	$4, %dl			C is src 8-byte aligned?
	movl	%esi, SAVE_ESI
	movl	%ebx, SAVE_EBX

	psllq	$32, %mm5
	jz	L(start_src_aligned)


	C src isn't aligned, process low limb separately (marked xxx) and
	C step src and dst by one limb, making src aligned.
	C
	C source                  edx
	C --+-------+-------+-------+
	C           |          xxx  |
	C --+-------+-------+-------+
	C         4mod8   0mod8   4mod8
	C
	C         dest            edi
	C         --+-------+-------+
	C           |       |  xxx  |
	C         --+-------+-------+

	movq	(%edx), %mm0		C src low two limbs
	addl	$4, %edx
	movl	%eax, PARAM_SIZE	C size-1

	addl	$4, %edi
	decl	%eax			C size-2 is new size-1

	psrlq	%mm6, %mm0
	movl	%edi, PARAM_DST		C new dst

	movd	%mm0, -4(%edi)
L(start_src_aligned):


	movq	(%edx), %mm1		C src low two limbs
	decl	%eax			C size-2, two last limbs handled at end
	testl	$4, %edi		C is dst 8-byte aligned?

	psrlq	%mm6, %mm5
	jz	L(start_dst_aligned)


	C dst isn't aligned, add 4 to make it so, and pretend the shift is
	C 32 bits extra.  Low limb of dst (marked xxx) handled here separately.
	C
	C source          edx
	C --+-------+-------+
	C           |  mm1  |
	C --+-------+-------+
	C         4mod8   0mod8
	C
	C dest                  edi
	C --+-------+-------+-------+
	C                   |  xxx  |
	C --+-------+-------+-------+
	C         4mod8   0mod8   4mod8

	movq	%mm1, %mm0
	psrlq	%mm6, %mm1
	addl	$32, %ecx		C shift+32

	movd	%mm1, (%edi)
	movq	%mm0, %mm1
	addl	$4, %edi		C new dst

	movd	%ecx, %mm6
L(start_dst_aligned):


	C Compute the loop counter and the computed-jump entry point.  The
	C loop body is 10 code bytes per limb, so the entry offset is
	C (2*eax)*5 = 10*eax, formed by shll then the (%eax,%eax,4) lea.
	movq	%mm1, %mm2		C copy of src low two limbs
	negl	%ecx
	andl	$-2, %eax		C round size down to even

	movl	%eax, %ebx
	negl	%eax
	addl	$64, %ecx		C ecx = 64-shift (or 64-(shift+32))

	andl	$UNROLL_MASK, %eax
	decl	%ebx

	shll	%eax

	movd	%ecx, %mm7		C lshift = 64-rshift

ifdef(`PIC',`
	call	L(pic_calc)
L(here):
',`
	leal	L(entry) (%eax,%eax,4), %esi
	negl	%eax
')
	shrl	$UNROLL_LOG2, %ebx	C loop counter

	leal	ifelse(UNROLL_BYTES,256,128+) 8(%edx,%eax,2), %edx
	leal	ifelse(UNROLL_BYTES,256,128) (%edi,%eax,2), %edi
	movl	PARAM_SIZE, %eax	C for use at end

	jmp	*%esi


ifdef(`PIC',`
L(pic_calc):
	C PIC version of the entry-point computation: the return address on
	C the stack gives the runtime address of L(here).
	C See mpn/x86/README about old gas bugs
	leal	(%eax,%eax,4), %esi
	addl	$L(entry)-L(here), %esi
	addl	(%esp), %esi
	negl	%eax

	ret_internal
')


C -----------------------------------------------------------------------------
	ALIGN(64)
L(top):
	C eax	size, for use at end
	C ebx	loop counter
	C ecx	lshift
	C edx	src
	C esi	was computed jump
	C edi	dst
	C ebp
	C
	C mm0	scratch
	C mm1	\ carry (alternating)
	C mm2	/
	C mm6	rshift
	C mm7	lshift
	C
	C 10 code bytes/limb
	C
	C The two chunks differ in whether mm1 or mm2 hold the carry.
	C The computed jump puts the initial carry in both mm1 and mm2.

L(entry):
deflit(CHUNK_COUNT, 4)
forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
	deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
	deflit(`disp1', eval(disp0 + 8))

Zdisp(	movq,	disp0,(%edx), %mm0)
	psrlq	%mm6, %mm2

	movq	%mm0, %mm1
	psllq	%mm7, %mm0

	por	%mm2, %mm0
Zdisp(	movq,	%mm0, disp0,(%edi))


Zdisp(	movq,	disp1,(%edx), %mm0)
	psrlq	%mm6, %mm1

	movq	%mm0, %mm2
	psllq	%mm7, %mm0

	por	%mm1, %mm0
Zdisp(	movq,	%mm0, disp1,(%edi))
')

	addl	$UNROLL_BYTES, %edx
	addl	$UNROLL_BYTES, %edi
	decl	%ebx

	jns	L(top)


deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
deflit(`disp1', eval(disp0-0 + 8))

	testb	$1, %al			C adjusted size odd or even?
	psrlq	%mm6, %mm2	C wanted rshifted in all cases below
	movl	SAVE_ESI, %esi

	movd	%mm5, %eax	C return value

	movl	SAVE_EBX, %ebx
	jz	L(end_even)


	C Size odd, destination was aligned.
	C
	C source
	C       edx
	C +-------+---------------+--
	C |       |      mm2      |
	C +-------+---------------+--
	C
	C dest                    edi
	C +-------+---------------+---------------+--
	C |       |               |    written    |
	C +-------+---------------+---------------+--
	C
	C mm6 = shift
	C mm7 = ecx = 64-shift


	C Size odd, destination was unaligned.
	C
	C source
	C       edx
	C +-------+---------------+--
	C |       |      mm2      |
	C +-------+---------------+--
	C
	C dest            edi
	C +---------------+---------------+--
	C |               |    written    |
	C +---------------+---------------+--
	C
	C mm6 = shift+32
	C mm7 = ecx = 64-(shift+32)


	C In both cases there's one extra limb of src to fetch and combine
	C with mm2 to make a qword to store, and in the aligned case there's
	C a further extra limb of dst to be formed.


	movd	disp0(%edx), %mm0
	movq	%mm0, %mm1

	psllq	%mm7, %mm0
	C Bit 5 of cl (64-shift vs 32-shift) is set exactly when dst was
	C aligned, i.e. when no +32 adjustment was applied above.
	testb	$32, %cl

	por	%mm2, %mm0
	psrlq	%mm6, %mm1

	movq	%mm0, disp0(%edi)
	jz	L(finish_odd_unaligned)

	movd	%mm1, disp1(%edi)	C aligned: one further dst limb
L(finish_odd_unaligned):

	movl	SAVE_EDI, %edi
	addl	$SAVE_SIZE, %esp
	emms

	ret


L(end_even):

	C Size even, destination was aligned.
	C
	C source
	C +---------------+--
	C |      mm2      |
	C +---------------+--
	C
	C dest            edi
	C +---------------+---------------+--
	C |               |      mm3      |
	C +---------------+---------------+--
	C
	C mm6 = shift
	C mm7 = ecx = 64-shift


	C Size even, destination was unaligned.
	C
	C source
	C +---------------+--
	C |      mm2      |
	C +---------------+--
	C
	C dest    edi
	C +-------+---------------+--
	C |       |      mm3      |
	C +-------+---------------+--
	C
	C mm6 = shift+32
	C mm7 = 64-(shift+32)


	C The movd for the unaligned case is the same data as the movq for
	C the aligned case, it's just a choice between whether one or two
	C limbs should be written.


	testb	$32, %cl		C set iff dst was aligned (see above)
	movd	%mm2, disp0(%edi)

	jz	L(end_even_unaligned)

	movq	%mm2, disp0(%edi)	C aligned: overwrite with both limbs
L(end_even_unaligned):

	movl	SAVE_EDI, %edi
	addl	$SAVE_SIZE, %esp
	emms

	ret

EPILOGUE()