dnl  x86-64 mpn_rshift optimized for Pentium 4.

dnl  Copyright 2003, 2005, 2007, 2008 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C mp_limb_t mpn_rshift (mp_ptr rp, mp_srcptr up, mp_size_t n, unsigned cnt)
C
C Shift {up,n} right by cnt bits (presumably 1 <= cnt <= 63, as for other
C mpn shift routines -- not checked here), store the result at {rp,n}, and
C return the bits shifted out of the low limb, aligned at the top of the
C return limb.  Each result limb is (up[i] >> cnt) | (up[i+1] << (64-cnt)).
C Uses MMX 64-bit shifts, producing four limbs per iteration of the
C software-pipelined main loop.
C
C		cycles/limb
C K8,K9:	 2.5
C K10:		 ?
C P4:		 3.29
C P6-15 (Core2): 2.1  (fluctuates, presumably cache related)
C P6-28 (Atom):	14.3

C INPUT PARAMETERS
define(`rp',`%rdi')
define(`up',`%rsi')
define(`n',`%rdx')
define(`cnt',`%cl')

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_rshift)
	mov	(up), %rax		C low limb, source of the return value
	movd	%ecx, %mm4		C mm4 = cnt, the right-shift count
	neg	%ecx			C put lsh count in cl
	and	$63, %ecx		C ecx = 64-cnt
	movd	%ecx, %mm5		C mm5 = 64-cnt, complementary left-shift count

	lea	-8(up,n,8), up		C up := &up[n-1], the high limb
	lea	-8(rp,n,8), rp		C rp := &rp[n-1]
	lea	1(n), %r8d		C (n+1) mod 4 picks the pre-loop path below
	neg	n			C negative index counting up toward zero

	shl	%cl, %rax		C function return value

	and	$3, %r8d
	je	L(rol)			C jump for n = 3, 7, 11, ...

	dec	%r8d
	jne	L(1)
C n = 4, 8, 12, ... : do one limb so the remaining count is 3 mod 4
	movq	8(up,n,8), %mm2
	psrlq	%mm4, %mm2		C low part of the result limb
	movq	16(up,n,8), %mm0
	psllq	%mm5, %mm0		C bits arriving from the next-higher limb
	por	%mm0, %mm2
	movq	%mm2, 8(rp,n,8)
	inc	n
	jmp	L(rol)

L(1):	dec	%r8d
	je	L(1x)			C jump for n = 1, 5, 9, 13, ...
C n = 2, 6, 10, 14, ... : do one limb, then fall into the two-limb fixup
	movq	8(up,n,8), %mm2
	psrlq	%mm4, %mm2
	movq	16(up,n,8), %mm0
	psllq	%mm5, %mm0
	por	%mm0, %mm2
	movq	%mm2, 8(rp,n,8)
	inc	n
L(1x):
	cmp	$-1, n			C only the high limb left?
	je	L(ast)
C do two limbs, leaving a remaining count of 3 mod 4
	movq	8(up,n,8), %mm2
	psrlq	%mm4, %mm2
	movq	16(up,n,8), %mm3
	psrlq	%mm4, %mm3
	movq	16(up,n,8), %mm0
	movq	24(up,n,8), %mm1
	psllq	%mm5, %mm0
	por	%mm0, %mm2
	psllq	%mm5, %mm1
	por	%mm1, %mm3
	movq	%mm2, 8(rp,n,8)
	movq	%mm3, 16(rp,n,8)
	add	$2, n

C Prime the pipeline: on loop entry mm2/mm3 hold the right-shifted (low)
C parts of the next two result limbs.
L(rol):	movq	8(up,n,8), %mm2
	psrlq	%mm4, %mm2
	movq	16(up,n,8), %mm3
	psrlq	%mm4, %mm3

	add	$4, n			C 4
	jb	L(end)			C 2
	ALIGN(32)
L(top):
	C finish the two pending limbs: OR in their left-shifted
	C high parts and store
	movq	-16(up,n,8), %mm0
	movq	-8(up,n,8), %mm1
	psllq	%mm5, %mm0
	por	%mm0, %mm2
	psllq	%mm5, %mm1
	movq	(up,n,8), %mm0
	por	%mm1, %mm3
	movq	8(up,n,8), %mm1
	movq	%mm2, -24(rp,n,8)
	movq	%mm3, -16(rp,n,8)
	C start two new limbs: left-shifted (high) parts
	C (psllq here, despite the historical "rsh" label in lshift.asm)
	psllq	%mm5, %mm0
	psllq	%mm5, %mm1

	C finish those two limbs: OR in the right-shifted (low)
	C parts and store
	movq	-8(up,n,8), %mm2
	movq	(up,n,8), %mm3
	psrlq	%mm4, %mm2
	por	%mm2, %mm0
	psrlq	%mm4, %mm3
	movq	8(up,n,8), %mm2
	por	%mm3, %mm1
	movq	16(up,n,8), %mm3
	movq	%mm0, -8(rp,n,8)
	movq	%mm1, (rp,n,8)
	C start the right-shifted parts for the next iteration
	add	$4, n
	psrlq	%mm4, %mm2
	psrlq	%mm4, %mm3

	jae	L(top)			C 2
L(end):
	C drain the pipeline: complete the last two pending limbs
	movq	-16(up,n,8), %mm0
	psllq	%mm5, %mm0
	por	%mm0, %mm2
	movq	-8(up,n,8), %mm1
	psllq	%mm5, %mm1
	por	%mm1, %mm3
	movq	%mm2, -24(rp,n,8)
	movq	%mm3, -16(rp,n,8)

L(ast):	movq	(up), %mm2		C high limb: zeros shift in at the top
	psrlq	%mm4, %mm2
	movq	%mm2, (rp)
	emms				C clear MMX state so the FPU is usable again
	ret
EPILOGUE()