dnl  x86-64 mpn_lshift optimized for Pentium 4.

dnl  Copyright 2003, 2005, 2007, 2008 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C		    cycles/limb
C K8,K9:	 2.5
C K10:		 ?
C P4:		 3.29
C P6-15 (Core2): 2.1	(fluctuates, presumably cache related)
C P6-28 (Atom):	14.3

C INPUT PARAMETERS
define(`rp',`%rdi')	C destination limb pointer
define(`up',`%rsi')	C source limb pointer
define(`n', `%rdx')	C limb count
define(`cnt',`%cl')	C shift count, in bits

C mp_limb_t mpn_lshift (mp_ptr rp, mp_srcptr up, mp_size_t n, unsigned cnt)
C
C Shift the n-limb number {up,n} left by cnt bits, store the low 64*n bits
C of the result at {rp,n}, and return the bits shifted out of the top limb
C (up[n-1] >> (64-cnt), computed by the shr below).
C
C NOTE(review): assumes 1 <= cnt <= 63 and n >= 1 (the usual mpn shift
C contract) -- for cnt = 0 the psllq/psrlq/por limb recombination below
C would produce wrong results.  Limbs are processed from the most
C significant downwards; presumably this makes overlap with rp >= up
C safe, as is conventional for mpn_lshift -- confirm against the mpn
C interface documentation.
C
C Register / MMX usage:
C   %rax      return value (out-shifted bits of the top limb)
C   %mm4      left-shift count (cnt)
C   %mm5      right-shift count (64-cnt), used to pull in the bits that
C             spill over from the next lower limb
C   %mm2,%mm3 left-shifted (high) halves, carried across loop iterations
C   %mm0,%mm1 right-shifted (low) halves of the next lower limbs
C   %r8d      (n+1) mod 4, selects the pre-loop alignment path

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_lshift)
	mov	-8(up,n,8), %rax	C rax = top limb up[n-1]
	movd	%ecx, %mm4		C mm4 = cnt (left-shift count)
	neg	%ecx			C put rsh count in cl
	and	$63, %ecx		C ecx = (64-cnt) & 63
	movd	%ecx, %mm5		C mm5 = 64-cnt (right-shift count)

	lea	1(n), %r8d		C r8d = n+1

	shr	%cl, %rax		C function return value

	and	$3, %r8d		C r8d = (n+1) mod 4
	je	L(rol)			C jump for n = 3, 7, 11, ...

C Pre-loop fixups: peel 0, 1, or 2 limbs so that n == 3 (mod 4) on
C arrival at L(rol), where the 4-limb/iteration pipelined loop starts.
	dec	%r8d
	jne	L(1)
C	n = 4, 8, 12, ...
C Peel one limb: rp[n-1] = (up[n-1] << cnt) | (up[n-2] >> (64-cnt)).
	movq	-8(up,n,8), %mm2
	psllq	%mm4, %mm2
	movq	-16(up,n,8), %mm0
	psrlq	%mm5, %mm0
	por	%mm0, %mm2
	movq	%mm2, -8(rp,n,8)
	dec	n
	jmp	L(rol)

L(1):	dec	%r8d
	je	L(1x)			C jump for n = 1, 5, 9, 13, ...
C	n = 2, 6, 10, 14, ...
C Peel one limb (same pattern as above), then fall through to peel two
C more at L(1x); n is odd on arrival there.
	movq	-8(up,n,8), %mm2
	psllq	%mm4, %mm2
	movq	-16(up,n,8), %mm0
	psrlq	%mm5, %mm0
	por	%mm0, %mm2
	movq	%mm2, -8(rp,n,8)
	dec	n
L(1x):
	cmp	$1, n
	je	L(ast)			C n = 1: only the final limb remains
C Peel two limbs: rp[n-1] and rp[n-2], each combined from the limb
C below it (reads down to up[n-3]; n >= 3 holds here since n is odd).
	movq	-8(up,n,8), %mm2
	psllq	%mm4, %mm2
	movq	-16(up,n,8), %mm3
	psllq	%mm4, %mm3
	movq	-16(up,n,8), %mm0
	movq	-24(up,n,8), %mm1
	psrlq	%mm5, %mm0
	por	%mm0, %mm2
	psrlq	%mm5, %mm1
	por	%mm1, %mm3
	movq	%mm2, -8(rp,n,8)
	movq	%mm3, -16(rp,n,8)
	sub	$2, n

C Main loop: 4 limbs/iteration, software pipelined.  The left-shifted
C halves of the two topmost pending limbs are started here (and at the
C bottom of L(top)) in mm2/mm3; each iteration completes them by OR-ing
C in the right-shifted spill bits from the limbs below.
L(rol):	movq	-8(up,n,8), %mm2
	psllq	%mm4, %mm2
	movq	-16(up,n,8), %mm3
	psllq	%mm4, %mm3

	sub	$4, n			C 4
	jb	L(end)			C 2
	ALIGN(32)
L(top):
	C finish stuff from lsh block: combine and store limbs n+3, n+2
	movq	16(up,n,8), %mm0
	movq	8(up,n,8), %mm1
	psrlq	%mm5, %mm0
	por	%mm0, %mm2
	psrlq	%mm5, %mm1
	movq	(up,n,8), %mm0
	por	%mm1, %mm3
	movq	-8(up,n,8), %mm1
	movq	%mm2, 24(rp,n,8)
	movq	%mm3, 16(rp,n,8)
	C start two new rsh: spill bits for limbs n+1, n
	psrlq	%mm5, %mm0
	psrlq	%mm5, %mm1

	C finish stuff from rsh block: combine and store limbs n+1, n
	movq	8(up,n,8), %mm2
	movq	(up,n,8), %mm3
	psllq	%mm4, %mm2
	por	%mm2, %mm0
	psllq	%mm4, %mm3
	movq	-8(up,n,8), %mm2
	por	%mm3, %mm1
	movq	-16(up,n,8), %mm3
	movq	%mm0, 8(rp,n,8)
	movq	%mm1, (rp,n,8)
	C start two new lsh: high halves for the next iteration
	sub	$4, n
	psllq	%mm4, %mm2
	psllq	%mm4, %mm3

	jae	L(top)			C 2
L(end):
C Drain the pipeline: n = -1 here, so these finish rp[2] and rp[1]
C from the in-flight mm2/mm3.
	movq	16(up,n,8), %mm0
	psrlq	%mm5, %mm0
	por	%mm0, %mm2
	movq	8(up,n,8), %mm1
	psrlq	%mm5, %mm1
	por	%mm1, %mm3
	movq	%mm2, 24(rp,n,8)
	movq	%mm3, 16(rp,n,8)

C Bottom limb: rp[0] = up[0] << cnt (nothing shifts in from below).
L(ast):	movq	(up), %mm2
	psllq	%mm4, %mm2
	movq	%mm2, (rp)
	emms				C leave MMX state clean for x87 users
	ret
EPILOGUE()