dnl  x86-64 mpn_rshift optimized for "Core 2".

dnl  Copyright 2007, 2009 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb
C K8,K9:	 4.25
C K10:		 4.25
C P4:		14.7
C P6 core2:	 1.27
C P6 corei7:	 1.5


C OVERVIEW
C
C Shifts the n-limb operand at up right by cnt bits and stores the n result
C limbs at rp.  Limbs are read and written from the lowest address upwards.
C
C   In:     rp  = destination pointer
C           up  = source pointer
C           n   = limb count; n = 0 is not handled, it would enter the b00
C                 path and read limbs from up
C           cnt = shift count in cl; NOTE(review): presumably 1..63, confirm
C                 against the mpn_rshift contract
C   Out:    rax = bits shifted out of the lowest source limb, placed in the
C                 most significant end of rax
C   Writes: rax, rdx, rsi, rdi, r8, r9, r10, r11 and the flags.  No stack is
C           used and no other register is written.
C
C Structure: a 4-way unrolled loop with four entry points, L(top), L(10),
C L(01) and L(00).  The feed-in code picks the entry point from n mod 4, loads
C the first limbs, and biases up, rp and n so that the fixed displacements in
C the loop body address the right limbs.  r11, r8, r9 and r10 form a rotating
C window of consecutive source limbs; each step does
C
C	cur = (cur >> cnt) | (next << (64 - cnt))
C
C with a single shrd, loads one new limb and stores cur.  The loop is left with
C three limbs still in r11, r8 and r9; L(end) stores them, using a plain shr
C for the top limb so that zeros are shifted in.


C INPUT PARAMETERS
define(`rp',	`%rdi')
define(`up',	`%rsi')
define(`n',	`%rdx')
define(`cnt',	`%cl')

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_rshift)
	mov	%edx, %eax		C only the low two bits of n are needed
	and	$3, %eax		C eax = n mod 4
	jne	L(nb00)

C n = 4, 8, 12, ...	enters the loop at L(00)
L(b00):
	mov	(up), %r10		C limb 0
	mov	8(up), %r11		C limb 1
	xor	%eax, %eax
	shrd	cnt, %r10, %rax		C return value: bits shifted out of limb 0
	mov	16(up), %r8		C limb 2
	lea	8(up), up		C 16(up) at L(00) then reads limb 3
	lea	-24(rp), rp		C 24(rp) at L(00) then writes rp[0]
	sub	$4, n
	jmp	L(00)

C n is not a multiple of 4; eax = 1, 2 or 3
L(nb00):
	cmp	$2, %eax
	jae	L(nb01)

C n = 1, 5, 9, ...	enters the loop at L(01), or finishes in L(le1) for n = 1
L(b01):
	mov	(up), %r9		C limb 0
	xor	%eax, %eax
	shrd	cnt, %r9, %rax		C return value: bits shifted out of limb 0
	sub	$2, n
	jb	L(le1)			C borrow means n was 1
	mov	8(up), %r10		C limb 1
	mov	16(up), %r11		C limb 2
	lea	16(up), up		C 8(up) at L(01) then reads limb 3
	lea	-16(rp), rp		C 16(rp) at L(01) then writes rp[0]
	jmp	L(01)

C single limb: nothing above it, so zeros are shifted in
L(le1):
	shr	cnt, %r9
	mov	%r9, (rp)
	ret

C eax = 2 or 3; the flags are still those of the cmp against 2 above
L(nb01):
	jne	L(b11)

C n = 2, 6, 10, ...	enters the loop at L(10), or finishes in L(le2) for n = 2
L(b10):
	mov	(up), %r8		C limb 0
	mov	8(up), %r9		C limb 1
	xor	%eax, %eax
	shrd	cnt, %r8, %rax		C return value: bits shifted out of limb 0
	sub	$3, n
	jb	L(le2)			C borrow means n was 2
	mov	16(up), %r10		C limb 2
	lea	24(up), up		C (up) at L(10) then reads limb 3
	lea	-8(rp), rp		C 8(rp) at L(10) then writes rp[0]
	jmp	L(10)

C two limbs: the upper one gets zeros shifted in
L(le2):
	shrd	cnt, %r9, %r8
	mov	%r8, (rp)
	shr	cnt, %r9
	mov	%r9, 8(rp)
	ret

	ALIGN(16)
C n = 3, 7, 11, ...	falls into the loop at L(top), or goes straight to
C L(end) for n = 3
L(b11):
	mov	(up), %r11		C limb 0
	mov	8(up), %r8		C limb 1
	xor	%eax, %eax
	shrd	cnt, %r11, %rax		C return value: bits shifted out of limb 0
	mov	16(up), %r9		C limb 2
	lea	32(up), up		C -8(up) at L(top) then reads limb 3
	sub	$4, n
	jb	L(end)			C borrow means n was 3

	ALIGN(16)
C Main loop, four limbs per pass.  Each group of three instructions combines
C the oldest limb with its successor, fetches one new limb into the register
C that was stored by the previous group, and stores the combined limb.
L(top):
	shrd	cnt, %r8, %r11
	mov	-8(up), %r10
	mov	%r11, (rp)
L(10):
	shrd	cnt, %r9, %r8
	mov	(up), %r11
	mov	%r8, 8(rp)
L(01):
	shrd	cnt, %r10, %r9
	mov	8(up), %r8
	mov	%r9, 16(rp)
L(00):
	shrd	cnt, %r11, %r10
	mov	16(up), %r9
	mov	%r10, 24(rp)
	add	$32, up
	lea	32(rp), rp		C lea leaves the flags alone
	sub	$4, n
	jnc	L(top)			C borrow: only r11, r8, r9 are left to store

C Wind-down: store the three limbs still held in r11, r8 and r9.
L(end):
	shrd	cnt, %r8, %r11
	mov	%r11, (rp)
	shrd	cnt, %r9, %r8
	mov	%r8, 8(rp)
	shr	cnt, %r9		C top limb: zeros are shifted in
	mov	%r9, 16(rp)
	ret
EPILOGUE()