aorrlsh1_n.asm revision 1.1.1.2
dnl  AMD64 mpn_addlsh1_n -- rp[] = up[] + (vp[] << 1)
dnl  AMD64 mpn_rsblsh1_n -- rp[] = (vp[] << 1) - up[]

dnl  Copyright 2003, 2005, 2006, 2007, 2008, 2009, 2011, 2012 Free Software
dnl  Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb
C AMD K8,K9	 2
C AMD K10	 2
C Intel P4	 13
C Intel core2	 3.45
C Intel corei	 3.45
C Intel atom	 ?
C VIA nano	 ?


C Sometimes speed degenerates, presumably because some operand alignments
C cause cache conflicts.

C The speed is limited by decoding/issue bandwidth.  There are 22 instructions
C in the loop, which corresponds to ceil(22/3)/4 = 2 c/l.

C INPUT PARAMETERS
define(`rp',`%rdi')
define(`up',`%rsi')
define(`vp',`%rdx')
define(`n', `%rcx')

C Select the instruction pair and the entry point name for the operation
C being built.  ADDSUB is used where no carry or borrow comes in (first limb
C of a lead-in block), ADCSBB wherever a carry or borrow is consumed.
ifdef(`OPERATION_addlsh1_n', `
	define(ADDSUB,	      add)
	define(ADCSBB,	      adc)
	define(func,	      mpn_addlsh1_n)')
ifdef(`OPERATION_rsblsh1_n', `
	define(ADDSUB,	      sub)
	define(ADCSBB,	      sbb)
	define(func,	      mpn_rsblsh1_n)')

MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_rsblsh1_n)

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

C func (rp, up, vp, n)
C
C Processes n limbs from index 0 upwards:
C   addlsh1:  rp[] = up[] + (vp[] << 1)
C   rsblsh1:  rp[] = (vp[] << 1) - up[]
C
C Two carry chains are interleaved:
C   scy  the bit shifted out of each vp limb by the doubling
C   acy  the carry (addlsh1) or borrow (rsblsh1) of the operation with up[]
C Only one of them can be in CF at a time, so each is parked in a register
C as 0 or -1 with "sbb reg,reg" and put back into CF with "add reg,reg".
C
C Register use:
C   %rax       n mod 4 during dispatch, afterwards the saved scy (0 or -1)
C   %rbp       saved acy (0 or -1); pushed on entry and popped before return
C   %r8-%r11   limbs being processed
C   n          negated limb count, stepping up towards zero; rp, up and vp
C              are advanced by n limbs so that (ptr,n,8) is the current limb
C
C Return value, formed at L(end):
C   addlsh1:  scy + acy, that is 0, 1 or 2
C   rsblsh1:  scy - acy, that is -1, 0 or 1, sign-extended to 64 bits
C
C The first 1, 2 or 3 limbs are handled by L(b01), L(b10) or L(b11) so that
C the remaining count is a multiple of 4; the main loop does 4 limbs per
C iteration.  When n mod 4 is zero the loop is entered directly at L(b00).
C
C NOTE(review): n = 0 is not treated specially.  It takes the L(b00) path and
C processes four limbs.  Presumably callers guarantee n >= 1 -- confirm.
C
C NOTE(review): FUNC_ENTRY, FUNC_EXIT, R32, L, PROLOGUE and EPILOGUE are
C defined outside this file.  R32 presumably yields the 32-bit name of a
C register and FUNC_ENTRY/FUNC_EXIT presumably adapt the DOS64 calling
C convention -- confirm against the macro definitions.

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(func)
	FUNC_ENTRY(4)
	push	%rbp

	mov	(vp), %r8		C vp[0], fetched before vp is advanced
	mov	R32(n), R32(%rax)
	lea	(rp,n,8), rp		C advance all three pointers by n limbs
	lea	(up,n,8), up
	lea	(vp,n,8), vp
	neg	n			C index limbs with -n ... -1
	xor	R32(%rbp), R32(%rbp)	C acy = 0
	and	$3, R32(%rax)		C n mod 4; leaves CF = 0 for L(b00)
	je	L(b00)			C n mod 4 = 0
	cmp	$2, R32(%rax)
	jc	L(b01)			C n mod 4 = 1
	je	L(b10)			C n mod 4 = 2

C n mod 4 = 3: three lead-in limbs
L(b11):	add	%r8, %r8
	mov	8(vp,n,8), %r9
	adc	%r9, %r9
	mov	16(vp,n,8), %r10
	adc	%r10, %r10
	sbb	R32(%rax), R32(%rax)	C save scy
	ADDSUB	(up,n,8), %r8
	ADCSBB	8(up,n,8), %r9
	mov	%r8, (rp,n,8)
	mov	%r9, 8(rp,n,8)
	ADCSBB	16(up,n,8), %r10
	mov	%r10, 16(rp,n,8)
	sbb	R32(%rbp), R32(%rbp)	C save acy
	add	$3, n
	jmp	L(ent)

C n mod 4 = 2: two lead-in limbs
L(b10):	add	%r8, %r8
	mov	8(vp,n,8), %r9
	adc	%r9, %r9
	sbb	R32(%rax), R32(%rax)	C save scy
	ADDSUB	(up,n,8), %r8
	ADCSBB	8(up,n,8), %r9
	mov	%r8, (rp,n,8)
	mov	%r9, 8(rp,n,8)
	sbb	R32(%rbp), R32(%rbp)	C save acy
	add	$2, n
	jmp	L(ent)

C n mod 4 = 1: one lead-in limb
L(b01):	add	%r8, %r8
	sbb	R32(%rax), R32(%rax)	C save scy
	ADDSUB	(up,n,8), %r8
	mov	%r8, (rp,n,8)
	sbb	R32(%rbp), R32(%rbp)	C save acy
	inc	n
L(ent):	jns	L(end)			C n reached zero: no limbs left

C Main loop, 4 limbs per iteration.  First double four vp limbs on the scy
C chain, then switch CF over to the acy chain and combine with up[].
	ALIGN(16)
L(top):	add	R32(%rax), R32(%rax)	C restore scy

	mov	(vp,n,8), %r8
L(b00):	adc	%r8, %r8
	mov	8(vp,n,8), %r9
	adc	%r9, %r9
	mov	16(vp,n,8), %r10
	adc	%r10, %r10
	mov	24(vp,n,8), %r11
	adc	%r11, %r11

	sbb	R32(%rax), R32(%rax)	C save scy
	add	R32(%rbp), R32(%rbp)	C restore acy

	ADCSBB	(up,n,8), %r8
	nop				C Hammer speedup!
	ADCSBB	8(up,n,8), %r9
	mov	%r8, (rp,n,8)
	mov	%r9, 8(rp,n,8)
	ADCSBB	16(up,n,8), %r10
	ADCSBB	24(up,n,8), %r11
	mov	%r10, 16(rp,n,8)
	mov	%r11, 24(rp,n,8)

	sbb	R32(%rbp), R32(%rbp)	C save acy
	add	$4, n
	js	L(top)			C loop while n is still negative

C Form the return value from the two saved carries, each 0 or -1:
C   addlsh1:  -((-scy) + (-acy)) = scy + acy
C   rsblsh1:  (-acy) - (-scy)    = scy - acy, then sign-extend
L(end):
ifdef(`OPERATION_addlsh1_n',`
	add	R32(%rbp), R32(%rax)
	neg	R32(%rax)')
ifdef(`OPERATION_rsblsh1_n',`
	sub	R32(%rax), R32(%rbp)
	movslq	R32(%rbp), %rax')

	pop	%rbp
	FUNC_EXIT()
	ret
EPILOGUE()