dnl  AMD64 mpn_lshsub_n.  R = 2^k(U - V).

dnl  Copyright 2006 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb
C K8,K9:	 3.15	(mpn_sub_n + mpn_lshift costs about 4 c/l)
C K10:		 3.15	(mpn_sub_n + mpn_lshift costs about 4 c/l)
C P4:		16.5
C P6-15:	 4.35

C This was written quickly and not optimized at all, but it runs very well on
C K8.  But perhaps one could get under 3 c/l.  Ideas:
C   1) Use indexing to save the 3 LEA
C   2) Write reasonable feed-in code
C   3) Be more clever about register usage
C   4) Unroll more, handling CL negation, carry save/restore cost much now
C   5) Reschedule

C INPUT PARAMETERS
C The five arguments arrive in rdi, rsi, rdx, rcx, r8, which is the SysV AMD64
C integer argument order.  The cnt name is declared here but the code below
C reads the shift count directly from r8d.
define(`rp', `%rdi')
define(`up', `%rsi')
define(`vp', `%rdx')
define(`n', `%rcx')
define(`cnt', `%r8')

C mpn_lshsub_n (rp, up, vp, n, cnt)
C
C Per the title of this file, computes R = 2^k(U - V): each 64-bit limb of the
C vector at vp is subtracted (with borrow) from the matching limb at up, the
C difference is shifted left by the shift count, and the result limbs are
C stored at rp.  Limbs are walked from the lowest address upwards.
C
C Return value in rax: (final borrow << count) plus the bits shifted out of
C the top of the last difference limb (see the code at the end label).
C
C NOTE(review): the right shifts use the negated count, which the hardware
C reduces mod 64, so they shift by 64-count only for count in 1..63.  A count
C of 0 looks unsupported -- confirm against the callers.
C
C Register roles:
C   rax      copy of n, counted down by 4 in the main loop
C   ebx      saved borrow between limbs/iterations, 0 or -1
C   cl       shift count, negated temporarily around the right shifts
C   r15      high bits of the previous limb, ORed into the next result limb
C   r8-r11   difference limbs, shifted left in place
C   r12-r14  unshifted copies of r8-r10, used for the right shifts
C   r11d     also the iteration counter of the feed-in loop

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_lshsub_n)

C rbx and r12-r15 are used as scratch below and are callee-saved in the SysV
C AMD64 ABI, so preserve them; they are restored in reverse order at the end.
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	push	%rbx

	mov	n, %rax			C rax = n; rcx is about to be reused
	xor	%ebx, %ebx		C clear carry save register
	mov	%r8d, %ecx		C shift count
	xor	%r15d, %r15d		C limb carry

C Feed-in: handle n mod 4 limbs one at a time so that the main loop can work
C on groups of four.
	mov	%eax, %r11d
	and	$3, %r11d		C r11d = n mod 4
	je	L(4)			C nothing to peel off
	sub	$1, %r11d		C loop below runs until this counter borrows

L(oopette):
	add	%ebx, %ebx		C restore carry flag
	mov	0(up), %r8
	lea	8(up), up		C lea leaves the flags alone
	sbb	0(vp), %r8		C r8 = u limb - v limb - borrow
	mov	%r8, %r12		C keep unshifted difference
	sbb	%ebx, %ebx		C save carry flag
	shl	%cl, %r8
	or	%r15, %r8		C merge bits carried from the previous limb
	mov	%r12, %r15
	lea	8(vp), vp
	neg	%cl			C cl = -count, i.e. 64-count mod 64
	shr	%cl, %r15		C r15 = bits to carry into the next limb
	neg	%cl			C back to the left shift count
	mov	%r8, 0(rp)
	lea	8(rp), rp
	sub	$1, %r11d
	jnc	L(oopette)

C rax still holds the full n here.  Since the low two bits were handled above,
C subtracting 4 until it borrows gives floor(n/4) passes of the main loop.
L(4):
	sub	$4, %rax
	jc	L(end)			C fewer than 4 limbs in total

	ALIGN(16)
L(oop):
	add	%ebx, %ebx		C restore carry flag

	mov	0(up), %r8
	mov	8(up), %r9
	mov	16(up), %r10
	mov	24(up), %r11

	lea	32(up), up

C Four-limb subtract-with-borrow chain; the mov and lea in between do not
C touch the flags.  r12-r14 keep the unshifted differences.
	sbb	0(vp), %r8
	mov	%r8, %r12
	sbb	8(vp), %r9
	mov	%r9, %r13
	sbb	16(vp), %r10
	mov	%r10, %r14
	sbb	24(vp), %r11

	sbb	%ebx, %ebx		C save carry flag

	shl	%cl, %r8
	shl	%cl, %r9
	shl	%cl, %r10
	or	%r15, %r8		C bits carried in from the previous limb
	mov	%r11, %r15		C unshifted top limb, becomes next carry
	shl	%cl, %r11

	lea	32(vp), vp

	neg	%cl			C cl = -count, i.e. 64-count mod 64

	shr	%cl, %r12
	shr	%cl, %r13
	shr	%cl, %r14
	shr	%cl, %r15		C used next loop

C Each result limb takes the high bits of the limb below it.
	or	%r12, %r9
	or	%r13, %r10
	or	%r14, %r11

	neg	%cl			C back to the left shift count

	mov	%r8, 0(rp)
	mov	%r9, 8(rp)
	mov	%r10, 16(rp)
	mov	%r11, 24(rp)

	lea	32(rp), rp

	sub	$4, %rax
	jnc	L(oop)
C Build the return value from the saved borrow and the pending high bits.
L(end):
	neg	%ebx			C 0/-1 borrow mask -> 0/1, upper half zeroed
	shl	%cl, %rbx		C borrow << count
C NOTE(review): adc also adds CF, which here is the one left by the shl above.
C For a count in 1..63 the bit shifted out of a 0/1 value is 0, so this acts
C as a plain add -- confirm that the count can never be 0.
	adc	%r15, %rbx
	mov	%rbx, %rax		C return value
	pop	%rbx
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12

	ret
EPILOGUE()