dnl  x86-64 mpn_lshiftc optimized for "Core 2".

dnl  Copyright 2007, 2009 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb
C K8,K9:	 ?
C K10:		 ?
C P4:		 ?
C P6 core2:	 1.5
C P6 corei7:	 1.75


C INPUT PARAMETERS
C   rp   pointer to the lowest limb of the destination
C   up   pointer to the lowest limb of the source
C   n    number of 64-bit limbs; must be at least 1 (n = 0 would take the
C        b00 path and access memory below both operands)
C   cnt  shift count in bits; presumably 1 <= cnt <= 63 as for the other
C        mpn shift routines (not checked here, TODO confirm with callers)
C These are the first four System V AMD64 integer argument registers.
define(`rp',  `%rdi')
define(`up',  `%rsi')
define(`n',   `%rdx')
define(`cnt', `%cl')

C mpn_lshiftc -- shift the n-limb operand at up left by cnt bits and store
C the bitwise complement of the shifted limbs at rp.
C
C Return value (rax): the bits shifted out of the most significant limb,
C right-justified.  These returned bits are NOT complemented.
C
C The least significant result limb is produced with a plain shl followed
C by not, so the cnt vacated low bits are stored as ones.
C
C Method: both pointers are first moved to the most significant limb and
C the operand is processed from high to low addresses.  The main loop is
C unrolled four ways; n mod 4 selects one of four entry points, L(top),
C L(10), L(01) or L(00), and each feed-in block biases up and rp so that
C the fixed offsets used inside the loop address the correct limbs.
C Registers r8-r11 hold a rotating window of consecutive source limbs.
C Every path through the loop leaves exactly three limbs pending, which
C are finished at L(end).
C
C Registers written: rax, rdx, rsi, rdi, r8-r11, flags.  No stack is used.

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_lshiftc)
	lea	-8(rp,n,8), rp		C rp -> most significant result limb
	lea	-8(up,n,8), up		C up -> most significant source limb

	mov	%edx, %eax
	and	$3, %eax		C eax = n mod 4
	jne	L(nb00)
L(b00):	C n = 4, 8, 12, ...
	mov	(up), %r10		C r10 = top limb
	mov	-8(up), %r11
	xor	%eax, %eax
	shld	%cl, %r10, %rax		C rax = bits shifted out (return value)
	mov	-16(up), %r8
	lea	24(rp), rp		C bias rp: -24(rp) in L(00) is the top limb
	sub	$4, n
	jmp	L(00)

L(nb00):C n mod 4 is nonzero; separate remainder 1 from remainders 2 and 3
	cmp	$2, %eax
	jae	L(nb01)
	C n = 1, 5, 9, ...
L(b01):	mov	(up), %r9		C r9 = top limb
	xor	%eax, %eax
	shld	%cl, %r9, %rax		C rax = bits shifted out (return value)
	sub	$2, n
	jb	L(le1)			C borrow means n was 1
	mov	-8(up), %r10
	mov	-16(up), %r11
	lea	-8(up), up		C bias up and rp for entry at L(01)
	lea	16(rp), rp
	jmp	L(01)
L(le1):	shl	%cl, %r9		C n = 1: the single limb is also the lowest
	not	%r9
	mov	%r9, (rp)
	ret

L(nb01):C flags are still those of the cmp above: equal means n mod 4 = 2
	jne	L(b11)
	C n = 2, 6, 10, ...
L(b10):	mov	(up), %r8		C r8 = top limb
	mov	-8(up), %r9
	xor	%eax, %eax
	shld	%cl, %r8, %rax		C rax = bits shifted out (return value)
	sub	$3, n
	jb	L(le2)			C borrow means n was 2
	mov	-16(up), %r10
	lea	-16(up), up		C bias up and rp for entry at L(10)
	lea	8(rp), rp
	jmp	L(10)
L(le2):	shld	%cl, %r9, %r8		C n = 2: high limb takes bits from low limb
	not	%r8
	mov	%r8, (rp)
	shl	%cl, %r9		C low limb: zeros shifted in, ones after not
	not	%r9
	mov	%r9, -8(rp)
	ret

	ALIGN(16)			C performance critical!
L(b11):	C n = 3, 7, 11, ...
	mov	(up), %r11		C r11 = top limb
	mov	-8(up), %r8
	xor	%eax, %eax
	shld	%cl, %r11, %rax		C rax = bits shifted out (return value)
	mov	-16(up), %r9
	lea	-24(up), up
	sub	$4, n
	jb	L(end)			C borrow means n was 3: only the tail remains

	C Main loop, four limbs per iteration.  Each stage shifts one limb
	C while pulling in the high bits of the next lower limb, loads a
	C limb further down, complements the result and stores it.
	ALIGN(16)
L(top):	shld	%cl, %r8, %r11
	mov	(up), %r10
	not	%r11
	mov	%r11, (rp)
L(10):	shld	%cl, %r9, %r8
	mov	-8(up), %r11
	not	%r8
	mov	%r8, -8(rp)
L(01):	shld	%cl, %r10, %r9
	mov	-16(up), %r8
	not	%r9
	mov	%r9, -16(rp)
L(00):	shld	%cl, %r11, %r10
	mov	-24(up), %r9
	not	%r10
	mov	%r10, -24(rp)
	add	$-32, up		C step both pointers down by four limbs
	lea	-32(rp), rp
	sub	$4, n
	jnc	L(top)			C repeat until the count borrows

	C Wind-down: r11, r8 and r9 hold the last three source limbs, from
	C higher to lower significance.
L(end):	shld	%cl, %r8, %r11
	not	%r11
	mov	%r11, (rp)
	shld	%cl, %r9, %r8
	not	%r8
	mov	%r8, -8(rp)
	shl	%cl, %r9		C lowest limb: zeros shifted in, ones after not
	not	%r9
	mov	%r9, -16(rp)
	ret
EPILOGUE()