dnl  rsh1aors_n.asm revision 1.1.1.1

dnl  AMD64 mpn_rsh1add_n -- rp[] = (up[] + vp[]) >> 1

dnl  Copyright 2003, 2005, 2009 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb
C K8,K9:	 2.14	(mpn_add_n + mpn_rshift need 4.125)
C K10:		 2.14	(mpn_add_n + mpn_rshift need 4.125)
C P4:		12.75
C P6-15:	 3.75

C TODO
C  * Rewrite to use indexed addressing, like addlsh1.asm and sublsh1.asm.
C  * Try to approach the cache bandwidth 1.5 c/l.  It should be possible.
C INPUT PARAMETERS
C Standard mpn argument order (SysV AMD64 registers).
define(`rp',`%rdi')
define(`up',`%rsi')
define(`vp',`%rdx')
define(`n',`%rcx')
define(`n32',`%ecx')

C One source file yields four entry points: the m4 OPERATION_* symbol
C selects add/adc (rsh1add) or sub/sbb (rsh1sub) flavours.
ifdef(`OPERATION_rsh1add_n', `
	define(ADDSUB, add)
	define(ADCSBB, adc)
	define(func_n, mpn_rsh1add_n)
	define(func_nc, mpn_rsh1add_nc)')
ifdef(`OPERATION_rsh1sub_n', `
	define(ADDSUB, sub)
	define(ADCSBB, sbb)
	define(func_n, mpn_rsh1sub_n)
	define(func_nc, mpn_rsh1sub_nc)')

MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1add_nc mpn_rsh1sub_n mpn_rsh1sub_nc)

ASM_START()
	TEXT

	ALIGN(16)
C func_nc: like func_n but takes an explicit carry-in in %r8; the neg
C turns any nonzero carry-in into a set C flag for the ADCSBB chain.
PROLOGUE(func_nc)
	push	%rbx

	xor	%eax, %eax
	neg	%r8			C set C flag from parameter
	mov	(up), %rbx
	ADCSBB	(vp), %rbx
	jmp	L(ent)
EPILOGUE()

	ALIGN(16)
C func_n: rp[] = (up[] ADDSUB vp[]) >> 1, n limbs.
C Returns (in %rax) the bit shifted out at the low end -- captured by the
C rcr/adc pair at L(ent).
C The add/sub and the 1-bit right shift are fused into one pass: the
C ADDSUB/ADCSBB carry and the bit pending to be shifted in share the
C C flag; "add %rbx,%rbx" restores the arithmetic carry and each "rcr"
C re-saves it while shifting a limb right.
PROLOGUE(func_n)
	push	%rbx

	xor	%eax, %eax
	mov	(up), %rbx
	ADDSUB	(vp), %rbx
L(ent):
	rcr	%rbx			C rotate, save acy
	adc	%eax, %eax		C return value

	mov	n32, R32(%r11)
	and	$3, R32(%r11)		C n mod 4 picks the unrolling entry

	cmp	$1, R32(%r11)
	je	L(do)			C jump if n = 1 5 9 ...

L(n1):	cmp	$2, R32(%r11)
	jne	L(n2)			C jump unless n = 2 6 10 ...
	add	%rbx, %rbx		C rotate carry limb, restore acy
	mov	8(up), %r10
	ADCSBB	8(vp), %r10
	lea	8(up), up
	lea	8(vp), vp
	lea	8(rp), rp
	rcr	%r10
	rcr	%rbx
	mov	%rbx, -8(rp)
	jmp	L(cj1)

L(n2):	cmp	$3, R32(%r11)
	jne	L(n3)			C jump unless n = 3 7 11 ...
	add	%rbx, %rbx		C rotate carry limb, restore acy
	mov	8(up), %r9
	mov	16(up), %r10
	ADCSBB	8(vp), %r9
	ADCSBB	16(vp), %r10
	lea	16(up), up
	lea	16(vp), vp
	lea	16(rp), rp
	rcr	%r10
	rcr	%r9
	rcr	%rbx
	mov	%rbx, -16(rp)
	jmp	L(cj2)

L(n3):	dec	n			C come here for n = 4 8 12 ...
	add	%rbx, %rbx		C rotate carry limb, restore acy
	mov	8(up), %r8
	mov	16(up), %r9
	ADCSBB	8(vp), %r8
	ADCSBB	16(vp), %r9
	mov	24(up), %r10
	ADCSBB	24(vp), %r10
	lea	24(up), up
	lea	24(vp), vp
	lea	24(rp), rp
	rcr	%r10
	rcr	%r9
	rcr	%r8
	rcr	%rbx
	mov	%rbx, -24(rp)
	mov	%r8, -16(rp)
L(cj2):	mov	%r9, -8(rp)
L(cj1):	mov	%r10, %rbx

L(do):
	shr	$2, n			C 4
	je	L(end)			C 2
	ALIGN(16)
C Main loop: 4 limbs per iteration.  %rbx holds the not-yet-stored low
C limb (still waiting for its top bit from the next rcr) across
C iterations.
L(top):	add	%rbx, %rbx		C rotate carry limb, restore acy

	mov	8(up), %r8
	mov	16(up), %r9
	ADCSBB	8(vp), %r8
	ADCSBB	16(vp), %r9
	mov	24(up), %r10
	mov	32(up), %r11
	ADCSBB	24(vp), %r10
	ADCSBB	32(vp), %r11

	lea	32(up), up
	lea	32(vp), vp

	rcr	%r11			C rotate, save acy
	rcr	%r10
	rcr	%r9
	rcr	%r8

	rcr	%rbx
	mov	%rbx, (rp)
	mov	%r8, 8(rp)
	mov	%r9, 16(rp)
	mov	%r10, 24(rp)
	mov	%r11, %rbx

	lea	32(rp), rp
	dec	n
	jne	L(top)

L(end):	mov	%rbx, (rp)
	pop	%rbx
	ret
EPILOGUE()