dnl  AMD K6 mpn_rshift -- mpn right shift.

dnl  Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C K6: 3.0 cycles/limb


C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                       unsigned shift);
C
C Shift the size-limb number at src right by shift bits and store the
C result at dst.  The return value is the low src limb shifted left by
C 32-shift, i.e. the bits shifted out at the bottom, placed at the high end
C of the returned limb.
C
C Syntax and target: AT&T operand order, 32-bit x86 with MMX, processed
C through m4.  Arguments are read from the stack through the PARAM_*
C offsets below.
C
C Registers: ebx is saved with pushl and restored with popl on both exit
C paths.  eax, ecx, edx, mm0 and mm6 are used as scratch.  The MMX path
C executes emms before returning.
C
C NOTE(review): the code assumes size >= 1 (size == 0 would enter the loop
C with a bogus count) and a shift for which 32-shift is a usable count for
C shll, presumably 1 <= shift <= 31 -- confirm against the mpn_rshift
C documentation.
C
C The loop runs at 3 cycles/limb, limited by decoding and by having 3 mmx
C instructions.  This is despite every second fetch being unaligned.


defframe(PARAM_SHIFT,16)
defframe(PARAM_SIZE, 12)
defframe(PARAM_SRC,  8)
defframe(PARAM_DST,  4)
deflit(`FRAME',0)

	TEXT
	ALIGN(32)

PROLOGUE(mpn_rshift)
deflit(`FRAME',0)

	C The 1 limb case can be done without the push %ebx, but it's then
	C still the same speed.  The push is left as a free helping hand for
	C the two_or_more code.

	C NOTE(review): FRAME_pushl is defined outside this file; presumably
	C it adjusts FRAME so the PARAM_* offsets stay correct after the
	C push -- confirm in the x86 m4 definitions.

	movl	PARAM_SIZE, %eax
	pushl	%ebx			FRAME_pushl()

	movl	PARAM_SRC, %ebx
	decl	%eax			C eax = size-1, sets ZF when size is 1

	C movl does not change the flags, so the jnz below still tests the
	C result of the decl above.
	movl	PARAM_SHIFT, %ecx
	jnz	L(two_or_more)

	C size == 1 from here: eax is zero and ebx is src.

	movl	(%ebx), %edx		C src limb
	movl	PARAM_DST, %ebx

	C eax is zero, so the double shift leaves just the low shift bits of
	C the src limb at the top of eax.
	shrdl(	%cl, %edx, %eax)	C return value

	shrl	%cl, %edx

	movl	%edx, (%ebx)		C dst limb
	popl	%ebx

	ret


	ALIGN(16)	C avoid offset 0x1f
L(two_or_more):
	C eax	size-1
	C ebx	src
	C ecx	shift
	C edx

	movl	(%ebx), %edx	C src low limb
	negl	%ecx

	addl	$32, %ecx	C 32-shift
	movd	PARAM_SHIFT, %mm6	C shift count for psrlq in the loop

	shll	%cl, %edx	C retval
	movl	PARAM_DST, %ecx

	leal	(%ebx,%eax,4), %ebx	C ebx = &src[size-1]

	leal	-4(%ecx,%eax,4), %ecx	C ecx = &dst[size-2]
	negl	%eax			C eax = -(size-1), counts up to zero


L(simple):
	C eax	counter (negative)
	C ebx	&src[size-1]
	C ecx	&dst[size-2]
	C edx	retval
	C
	C mm0	scratch
	C mm6	shift
	C
	C Each pass loads the two src limbs at ebx+4*eax as one 64-bit value,
	C shifts that right, and stores the low 32 bits as one dst limb.  The
	C store uses the already incremented eax, which is why ecx points one
	C limb below ebx's position.
	C
	C incl sets ZF for the jnz; the psrlq and movd in between are MMX
	C instructions and leave the integer flags unchanged.
	C
	C NOTE(review): Zdisp is defined outside this file; presumably it
	C forces an explicit zero displacement in the encoding -- confirm in
	C the x86 m4 definitions.

Zdisp(	movq,	0,(%ebx,%eax,4), %mm0)
	incl	%eax

	psrlq	%mm6, %mm0

Zdisp(	movd,	%mm0, 0,(%ecx,%eax,4))
	jnz	L(simple)


	C mm0 still holds the shifted pair src[size-2],src[size-1] from the
	C last pass.  Its low half is already in dst[size-2]; the 64-bit store
	C rewrites that limb and also puts the high half, which is
	C src[size-1] >> shift, into dst[size-1].

	movq	%mm0, (%ecx)
	movl	%edx, %eax	C return value

	popl	%ebx

	emms
	ret

EPILOGUE()