dnl  AMD64 mpn_addlsh_n and mpn_rsblsh_n.  R = V2^k +- U.
dnl  ("rsb" means reversed subtract, name mandated by mpn_sublsh1_n which
dnl  subtracts the shifted operand from the unshifted operand.)

dnl  Copyright 2006 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C            cycles/limb
C K8,K9:         3.25   (mpn_lshift + mpn_add_n costs about 4.1 c/l)
C K10:           3.25   (mpn_lshift + mpn_add_n costs about 4.1 c/l)
C P4:           14
C P6-15:         4

C This was written quickly and not optimized at all.  Surely one could get
C closer to 3 c/l or perhaps even under 3 c/l.
C Ideas:
C   1) Use indexing to save the 3 LEA
C   2) Write reasonable feed-in code
C   3) Be more clever about register usage
C   4) Unroll more, handling CL negation, carry save/restore cost much now
C   5) Reschedule

C INPUT PARAMETERS
define(`rp',  `%rdi')
define(`up',  `%rsi')
define(`vp',  `%rdx')
define(`n',   `%rcx')
define(`cnt', `%r8')

ifdef(`OPERATION_addlsh_n',`
  define(ADDSUBC, `adc')
  define(func, mpn_addlsh_n)
')
ifdef(`OPERATION_rsblsh_n',`
  define(ADDSUBC, `sbb')
  define(func, mpn_rsblsh_n)
')

MULFUNC_PROLOGUE(mpn_addlsh_n mpn_rsblsh_n)

C What the code below does, limb by limb from the low end:
C   each vp limb is shifted left by the count, the bits shifted out of the
C   previous vp limb are OR-ed in, and the matching up limb is then combined
C   with ADDSUBC (adc: shifted V plus U, sbb: shifted V minus U).  The result
C   limb is stored through rp.
C
C Return value in %rax: the bits shifted out of the last vp limb, adjusted
C by the final carry or borrow with one more ADDSUBC.
C
C Register roles:
C   %rax       limb count, decremented by 4 per pass of the unrolled loop
C   %ebx       carry flag parked between passes, 0 or -1
C   %cl        shift count, negated temporarily for the right shifts
C   %r11d      pass counter of the one-limb loop, later a data limb
C   %r8-%r11   vp limbs being shifted left
C   %r12-%r14  copies of vp limbs for the matching right shifts
C   %r15       bits shifted out of the previous vp limb
C
C NOTE(review): presumably callers pass 1 <= cnt <= 63.  The right shift by
C the negated count acts as a shift by 64-cnt only in that range.  Confirm
C against the documented contract of the function.

ASM_START()
        TEXT
        ALIGN(16)
PROLOGUE(func)

C Preserve the callee-saved registers that serve as scratch below.
        push    %r12
        push    %r13
        push    %r14
        push    %r15
        push    %rbx

        mov     n, %rax                 C limb count, copied before rcx is reused
        xor     %ebx, %ebx              C no carry parked yet
        mov     %r8d, %ecx              C shift count into cl
        xor     %r15d, %r15d            C nothing shifted out yet

        mov     %eax, %r11d
        and     $3, %r11d               C limbs to handle one at a time
        je      L(blk4)
        sub     $1, %r11d               C bias so the loop can exit on borrow

C One limb per pass, run for the limb count modulo 4.
L(lp1):
        mov     0(vp), %r8
        mov     %r8, %r12
        shl     %cl, %r8
        or      %r15, %r8               C merge bits from the previous limb
        neg     %cl                     C negated count for the right shift
        mov     %r12, %r15
        shr     %cl, %r15               C bits to hand to the next limb
        neg     %cl                     C back to the left shift count
        add     %ebx, %ebx              C parked carry back into CF
        ADDSUBC 0(up), %r8
        mov     %r8, 0(rp)
        sbb     %ebx, %ebx              C park CF as 0 or -1
        lea     8(up), up               C lea leaves the flags alone
        lea     8(vp), vp
        lea     8(rp), rp
        sub     $1, %r11d
        jnc     L(lp1)

C Skip the unrolled loop when fewer than 4 limbs remain.
L(blk4):
        sub     $4, %rax
        jc      L(done)

C Four limbs per pass.
L(top):
        mov     0(vp), %r8
        mov     %r8, %r12
        mov     8(vp), %r9
        mov     %r9, %r13
        mov     16(vp), %r10
        mov     %r10, %r14
        mov     24(vp), %r11

        shl     %cl, %r8
        shl     %cl, %r9
        shl     %cl, %r10
        or      %r15, %r8               C bits left over from the previous pass
        mov     %r11, %r15              C copy of the top limb of this pass
        shl     %cl, %r11

        neg     %cl                     C negated count for the right shifts

        shr     %cl, %r12
        shr     %cl, %r13
        shr     %cl, %r14
        shr     %cl, %r15               C kept for the next pass or the return

        or      %r12, %r9
        or      %r13, %r10
        or      %r14, %r11

        neg     %cl                     C back to the left shift count

        add     %ebx, %ebx              C parked carry back into CF

        ADDSUBC 0(up), %r8
        ADDSUBC 8(up), %r9
        ADDSUBC 16(up), %r10
        ADDSUBC 24(up), %r11

        mov     %r8, 0(rp)
        mov     %r9, 8(rp)
        mov     %r10, 16(rp)
        mov     %r11, 24(rp)

        sbb     %ebx, %ebx              C park CF as 0 or -1

        lea     32(up), up
        lea     32(vp), vp
        lea     32(rp), rp

        sub     $4, %rax
        jnc     L(top)

C Fold the last carry or borrow into the shifted-out bits and return them.
L(done):
        add     %ebx, %ebx
        ADDSUBC $0, %r15
        mov     %r15, %rax

C Restore in the reverse of the push order.
        pop     %rbx
        pop     %r15
        pop     %r14
        pop     %r13
        pop     %r12

        ret
EPILOGUE()