dnl  AMD64 mpn_addmul_2 -- Multiply an n-limb vector with a 2-limb vector and
dnl  add the result to a third limb vector.

dnl  Copyright 2008 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C K8,K9:	2.375
C K10:		2.375
C P4:		 ?
C P6 core2:	4.45
C P6 corei7:	4.35

C This code is the result of running a code generation and optimization tool
C suite written by David Harvey and Torbjorn Granlund.

C TODO
C  * Work on feed-in and wind-down code.
C  * Convert "mov $0" to "xor".
C  * Adjust initial lea to save some bytes.
C  * Perhaps adjust n from n_param&3 value?

C INPUT PARAMETERS
define(`rp',      `%rdi')
define(`up',      `%rsi')
define(`n_param', `%rdx')
define(`vp',      `%rcx')

define(`v0', `%r8')
define(`v1', `%r9')
define(`w0', `%rbx')
define(`w1', `%rcx')
define(`w2', `%rbp')
define(`w3', `%r10')
define(`n',  `%r11')

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_addmul_2)
	push	%rbx
	push	%rbp

	mov	(vp), v0
	mov	8(vp), v1

	mov	n_param, n
	neg	n
	lea	-32(up,n_param,8), up
	lea	-32(rp,n_param,8), rp

	and	$3, R32(n_param)
	jz	L(am2p0)
	cmp	$2, R32(n_param)
	jc	L(am2p1)
	jz	L(am2p2)
L(am2p3):
	mov	32(up,n,8), %rax
	mul	v0
	mov	%rax, w1
	mov	32(up,n,8), %rax
	mov	%rdx, w2
	xor	R32(w3), R32(w3)
	add	$2, n
	jmp	L(am3)
L(am2p0):
	mov	32(up,n,8), %rax
	mul	v0
	mov	%rax, w0
	mov	32(up,n,8), %rax
	mov	%rdx, w1
	xor	R32(w2), R32(w2)
	add	$3, n
	jmp	L(am0)
L(am2p1):
	mov	32(up,n,8), %rax
	mul	v0
	mov	%rax, w3
	mov	32(up,n,8), %rax
	mov	%rdx, w0
	xor	R32(w1), R32(w1)
	jmp	L(am1)
L(am2p2):
	mov	32(up,n,8), %rax
	mul	v0
	mov	%rax, w2
	mov	32(up,n,8), %rax
	mov	%rdx, w3
	xor	R32(w0), R32(w0)
	xor	R32(w1), R32(w1)
	add	$1, n
	jmp	L(am2)

	ALIGN(32)
L(top):
	add	w3, (rp,n,8)	C 0 21
	adc	%rax, w0	C 1 24
	mov	8(up,n,8), %rax
	adc	%rdx, w1	C 3 26
	mov	$0, R32(w2)
	mul	v0
	add	%rax, w0	C 2 26
	mov	8(up,n,8), %rax
	adc	%rdx, w1	C 4 28
	adc	$0, R32(w2)	C 6 30
L(am0):	mul	v1
	add	w0, 8(rp,n,8)	C 3 27
	adc	%rax, w1	C 6 30
	adc	%rdx, w2	C 8 32
	mov	16(up,n,8), %rax
	mov	$0, R32(w3)
	mul	v0
	add	%rax, w1	C 8
	mov	16(up,n,8), %rax
	adc	%rdx, w2	C 10
	adc	$0, R32(w3)	C 12
L(am3):	mul	v1
	add	w1, 16(rp,n,8)	C 9
	adc	%rax, w2	C 12
	mov	24(up,n,8), %rax
	adc	%rdx, w3	C 14
	mul	v0
	mov	$0, R32(w0)
	add	%rax, w2	C 14
	adc	%rdx, w3	C 16
	mov	$0, R32(w1)
	mov	24(up,n,8), %rax
	adc	$0, R32(w0)	C 18
L(am2):	mul	v1
	add	w2, 24(rp,n,8)	C 15
	adc	%rax, w3	C 18
	adc	%rdx, w0	C 20
	mov	32(up,n,8), %rax
	mul	v0
	add	%rax, w3	C 20
	mov	32(up,n,8), %rax
	adc	%rdx, w0	C 22
	adc	$0, R32(w1)	C 24
L(am1):	mul	v1
	add	$4, n
	js	L(top)

	add	w3, (rp,n,8)
	adc	%rax, w0
	adc	%rdx, w1
	mov	w0, 8(rp,n,8)
	mov	w1, %rax

	pop	%rbp
	pop	%rbx
	ret
EPILOGUE()
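
C A minimal C sketch of the operation performed above, as read from the
C feed-in, loop, and wind-down code: the routine adds {up,n} * {vp,2} to
C {rp,n}, stores the next product limb at rp[n], and returns the most
C significant limb in %rax.  The sketch assumes 64-bit limbs and a compiler
C providing unsigned __int128; the types mp_limb_t, mp_ptr, mp_srcptr and
C mp_size_t are those of gmp-impl.h, and the name ref_addmul_2 is
C illustrative only, not a GMP entry point.
C
C	mp_limb_t
C	ref_addmul_2 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_srcptr vp)
C	{
C	  mp_limb_t c0 = 0, c1 = 0;	/* two-limb carry window */
C	  mp_size_t i;
C	  for (i = 0; i < n; i++)
C	    {
C	      unsigned __int128 t, u;
C	      /* low product limb, absorbed into rp[i] */
C	      t = (unsigned __int128) up[i] * vp[0] + rp[i] + c0;
C	      /* high product limb, deferred into the carry window */
C	      u = (unsigned __int128) up[i] * vp[1] + c1
C		  + (mp_limb_t) (t >> 64);
C	      rp[i] = (mp_limb_t) t;
C	      c0 = (mp_limb_t) u;
C	      c1 = (mp_limb_t) (u >> 64);
C	    }
C	  rp[n] = c0;	/* the final "mov w0, 8(rp,n,8)" */
C	  return c1;	/* the final "mov w1, %rax" */
C	}
C
C The w0..w3 registers implement the same two-limb carry window, with the
C loop 4-way unrolled and n biased negative so the add/js pair handles both
C induction and termination.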