dnl  AMD K6 mpn_lshift -- mpn left shift.

dnl  Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C K6: 3.0 cycles/limb


C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                       unsigned shift);
C
C Shift the size-limb number at src left by shift bits and store the low
C size limbs of the result at dst.  The bits shifted out of the most
C significant limb are returned, in the low bits of the return value.
C
C Limbs are 32 bits.  The four arguments are read from the stack at the
C offsets given by the defframe lines below.
C
C Nothing here validates the arguments.  The code relies on:
C   size >= 1         a size of 0 takes the multi-limb path with a counter
C                     of -1 and reads below src
C   1 <= shift <= 31  on the multi-limb path the return value comes from
C                     shrl by 32-shift, and x86 masks a count of 32 to 0,
C                     so shift==0 would return the whole high limb
C
C Limbs are produced from the most significant end downwards, which is
C presumably what allows dst to overlap src when dst is at or above src
C (NOTE(review): confirm against the mpn_lshift documentation).
C
C The loop runs at 3 cycles/limb, limited by decoding and by having 3 mmx
C instructions.  This is despite every second fetch being unaligned.
C
C The blank lines between instruction groups below are part of the
C hand scheduling and the instruction order should not be changed casually.


defframe(PARAM_SHIFT,16)
defframe(PARAM_SIZE, 12)
defframe(PARAM_SRC,  8)
defframe(PARAM_DST,  4)

	TEXT
	ALIGN(32)

PROLOGUE(mpn_lshift)
deflit(`FRAME',0)

	C The 1 limb case can be done without the push %ebx, but it's then
	C still the same speed.  The push is left as a free helping hand for
	C the two_or_more code.

	movl	PARAM_SIZE, %eax
	pushl	%ebx			FRAME_pushl()

	movl	PARAM_SRC, %ebx
	decl	%eax			C eax = size-1, sets ZF when size==1

	movl	PARAM_SHIFT, %ecx	C movl leaves the decl flags intact
	jnz	L(two_or_more)


	C Single limb: eax is 0 here, so the double shift leaves just the
	C bits pushed out of the top of the src limb in eax.

	movl	(%ebx), %edx		C src limb
	movl	PARAM_DST, %ebx		C src pointer no longer needed

	shldl(	%cl, %edx, %eax)	C return value

	shll	%cl, %edx

	movl	%edx, (%ebx)		C dst limb
	popl	%ebx

	ret


	ALIGN(16)	C avoid offset 0x1f
	nop		C avoid bad cache line crossing
L(two_or_more):
	C eax	size-1
	C ebx	src
	C ecx	shift
	C edx

	movl	(%ebx,%eax,4), %edx	C src high limb
	negl	%ecx

	movd	PARAM_SHIFT, %mm6	C left shift count for the low limb
	addl	$32, %ecx		C 32-shift

	shrl	%cl, %edx		C retval = high limb >> (32-shift)

	movd	%ecx, %mm7		C right shift count for the loop
	movl	PARAM_DST, %ecx		C ecx is reused as the dst pointer

L(top):
	C eax	counter, size-1 to 1
	C ebx	src
	C ecx	dst
	C edx	retval
	C
	C mm0	scratch
	C mm6	shift
	C mm7	32-shift
	C
	C Each pass loads the 64-bit pair src[eax-1],src[eax] and shifts it
	C right by 32-shift, leaving in the low dword
	C   (src[eax] << shift) | (src[eax-1] >> (32-shift))
	C which is stored to dst[eax].  The store address uses the already
	C decremented counter, hence the +4 displacement.  The jnz tests the
	C flags from decl, which the mmx instructions in between do not
	C modify.

	movq	-4(%ebx,%eax,4), %mm0
	decl	%eax

	psrlq	%mm7, %mm0

	movd	%mm0, 4(%ecx,%eax,4)
	jnz	L(top)


	C Lowest limb: nothing below it to shift in, so dst[0] is simply
	C src[0] << shift, truncated to 32 bits by the movd store.

	movd	(%ebx), %mm0
	popl	%ebx

	psllq	%mm6, %mm0
	movl	%edx, %eax		C return value

	movd	%mm0, (%ecx)

	emms				C leave mmx state before returning
	ret

EPILOGUE()