dnl  AMD64 mpn_addaddmul_1msb0, R = Au + Bv, u,v < 2^63.

dnl  Copyright 2008 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	 2.167
C AMD K10	 2.167
C Intel P4	12.0
C Intel core2	 4.0
C Intel corei	 ?
C Intel atom	 ?
C VIA nano	 ?

C TODO
C  * Perhaps handle various n mod 3 sizes better.  The code now is too large.
44 45C INPUT PARAMETERS 46define(`rp', `%rdi') 47define(`ap', `%rsi') 48define(`bp_param', `%rdx') 49define(`n', `%rcx') 50define(`u0', `%r8') 51define(`v0', `%r9') 52 53 54define(`bp', `%rbp') 55 56ASM_START() 57 TEXT 58 ALIGN(16) 59PROLOGUE(mpn_addaddmul_1msb0) 60 push %r12 61 push %rbp 62 63 lea (ap,n,8), ap 64 lea (bp_param,n,8), bp 65 lea (rp,n,8), rp 66 neg n 67 68 mov (ap,n,8), %rax 69 mul %r8 70 mov %rax, %r12 71 mov (bp,n,8), %rax 72 mov %rdx, %r10 73 add $3, n 74 jns L(end) 75 76 ALIGN(16) 77L(top): mul %r9 78 add %rax, %r12 79 mov -16(ap,n,8), %rax 80 adc %rdx, %r10 81 mov %r12, -24(rp,n,8) 82 mul %r8 83 add %rax, %r10 84 mov -16(bp,n,8), %rax 85 mov $0, R32(%r11) 86 adc %rdx, %r11 87 mul %r9 88 add %rax, %r10 89 mov -8(ap,n,8), %rax 90 adc %rdx, %r11 91 mov %r10, -16(rp,n,8) 92 mul %r8 93 add %rax, %r11 94 mov -8(bp,n,8), %rax 95 mov $0, R32(%r12) 96 adc %rdx, %r12 97 mul %r9 98 add %rax, %r11 99 adc %rdx, %r12 100 mov (ap,n,8), %rax 101 mul %r8 102 add %rax, %r12 103 mov %r11, -8(rp,n,8) 104 mov (bp,n,8), %rax 105 mov $0, R32(%r10) 106 adc %rdx, %r10 107 add $3, n 108 js L(top) 109 110L(end): cmp $1, R32(n) 111 ja 2f 112 jz 1f 113 114 mul %r9 115 add %rax, %r12 116 mov -16(ap), %rax 117 adc %rdx, %r10 118 mov %r12, -24(rp) 119 mul %r8 120 add %rax, %r10 121 mov -16(bp), %rax 122 mov $0, R32(%r11) 123 adc %rdx, %r11 124 mul %r9 125 add %rax, %r10 126 mov -8(ap), %rax 127 adc %rdx, %r11 128 mov %r10, -16(rp) 129 mul %r8 130 add %rax, %r11 131 mov -8(bp), %rax 132 mov $0, R32(%r12) 133 adc %rdx, %r12 134 mul %r9 135 add %rax, %r11 136 adc %rdx, %r12 137 mov %r11, -8(rp) 138 mov %r12, %rax 139 pop %rbp 140 pop %r12 141 ret 142 1431: mul %r9 144 add %rax, %r12 145 mov -8(ap), %rax 146 adc %rdx, %r10 147 mov %r12, -16(rp) 148 mul %r8 149 add %rax, %r10 150 mov -8(bp), %rax 151 mov $0, R32(%r11) 152 adc %rdx, %r11 153 mul %r9 154 add %rax, %r10 155 adc %rdx, %r11 156 mov %r10, -8(rp) 157 mov %r11, %rax 158 pop %rbp 159 pop %r12 160 ret 161 1622: mul %r9 163 
add %rax, %r12 164 mov %r12, -8(rp) 165 adc %rdx, %r10 166 mov %r10, %rax 167 pop %rbp 168 pop %r12 169 ret 170EPILOGUE() 171