dnl  AMD64 mpn_lshsub_n.  R = 2^k(U - V).

dnl  Copyright 2006, 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb
C AMD K8,K9	 3.15	(mpn_sub_n + mpn_lshift costs about 4 c/l)
C AMD K10	 3.15	(mpn_sub_n + mpn_lshift costs about 4 c/l)
C Intel P4	16.5
C Intel core2	 4.35
C Intel corei	 ?
C Intel atom	 ?
C VIA nano	 ?

C This was written quickly and not optimized at all, but it runs very well on
C K8.  But perhaps one could get under 3 c/l.  Ideas:
C   1) Use indexing to save the 3 LEA
C   2) Write reasonable feed-in code
C   3) Be more clever about register usage
C   4) Unroll more, handling CL negation, carry save/restore cost much now
C   5) Reschedule

C mp_limb_t mpn_lshsub_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp,
C                         mp_size_t n, unsigned int cnt)
C
C Store (U - V) << cnt at rp and fold the final borrow together with the
C out-shifted high bits of the last difference limb into the return value.
C
C NOTE(review): cnt is presumably required to be in [1, 63] -- the "64-cnt"
C right shifts below rely on hardware masking of the shift count mod 64,
C which breaks for cnt == 0.  Confirm against callers.
C
C Register roles in the loops below:
C   rbx  borrow parked as a 0/-1 mask while shifts clobber the flags
C   r15  limb carry: high bits of the previous difference limb
C   cl   shift count; negated in place to get 64-cnt, then negated back
C   rax  remaining limb count

C INPUT PARAMETERS
define(`rp',	`%rdi')
define(`up',	`%rsi')
define(`vp',	`%rdx')
define(`n',	`%rcx')
define(`cnt',	`%r8')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_lshsub_n)
	FUNC_ENTRY(4)
IFDOS(`	mov	56(%rsp), %r8d	')	C Win64: 5th argument lives on the stack

	push	%r12			C callee-saved scratch for the unrolled loop
	push	%r13
	push	%r14
	push	%r15
	push	%rbx

	mov	n, %rax			C copy count; rcx is about to be reused for cnt
	xor	R32(%rbx), R32(%rbx)	C clear carry save register
	mov	R32(%r8), R32(%rcx)	C shift count
	xor	R32(%r15), R32(%r15)	C limb carry

	mov	R32(%rax), R32(%r11)
	and	$3, R32(%r11)		C r11 = n mod 4
	je	L(4)
	sub	$1, R32(%r11)		C bias so the jnc loop below runs n mod 4 times

C Feed-in loop, one limb per iteration, until the remaining count is a
C multiple of 4.
L(oopette):
	add	R32(%rbx), R32(%rbx)	C restore carry flag (CF=1 iff rbx was -1)
	mov	0(up), %r8
	lea	8(up), up
	sbb	0(vp), %r8		C difference limb, consuming incoming borrow
	mov	%r8, %r12		C copy kept for the right-shifted high part
	sbb	R32(%rbx), R32(%rbx)	C save carry flag (rbx = -borrow)
	shl	R8(%rcx), %r8
	or	%r15, %r8		C merge bits shifted out of previous limb
	mov	%r12, %r15
	lea	8(vp), vp
	neg	R8(%rcx)		C cl = 64-cnt (count is masked mod 64)
	shr	R8(%rcx), %r15		C r15 = high bits carried into next limb
	neg	R8(%rcx)		C cl = cnt again
	mov	%r8, 0(rp)
	lea	8(rp), rp
	sub	$1, R32(%r11)
	jnc	L(oopette)

L(4):
	sub	$4, %rax
	jc	L(end)			C fewer than 4 limbs remain => finish up

C Main loop, 4 limbs per iteration.  The borrow lives in CF during the sbb
C chain and is parked in rbx across the shift/or code, which clobbers flags.
	ALIGN(16)
L(oop):
	add	R32(%rbx), R32(%rbx)	C restore carry flag

	mov	0(up), %r8
	mov	8(up), %r9
	mov	16(up), %r10
	mov	24(up), %r11

	lea	32(up), up

	sbb	0(vp), %r8
	mov	%r8, %r12
	sbb	8(vp), %r9
	mov	%r9, %r13
	sbb	16(vp), %r10
	mov	%r10, %r14
	sbb	24(vp), %r11

	sbb	R32(%rbx), R32(%rbx)	C save carry flag

	shl	R8(%rcx), %r8
	shl	R8(%rcx), %r9
	shl	R8(%rcx), %r10
	or	%r15, %r8		C limb carry from previous iteration
	mov	%r11, %r15
	shl	R8(%rcx), %r11

	lea	32(vp), vp

	neg	R8(%rcx)		C cl = 64-cnt

	shr	R8(%rcx), %r12
	shr	R8(%rcx), %r13
	shr	R8(%rcx), %r14
	shr	R8(%rcx), %r15		C used next loop

	or	%r12, %r9		C combine each limb's low part with the
	or	%r13, %r10		C   previous limb's out-shifted high part
	or	%r14, %r11

	neg	R8(%rcx)		C cl = cnt again

	mov	%r8, 0(rp)
	mov	%r9, 8(rp)
	mov	%r10, 16(rp)
	mov	%r11, 24(rp)

	lea	32(rp), rp

	sub	$4, %rax
	jnc	L(oop)
L(end):
C Return value: final borrow shifted into position cnt, combined with the
C high bits shifted out of the last difference limb (left in r15).
	neg	R32(%rbx)		C rbx: 0/-1 mask -> 0/1 borrow
	shl	R8(%rcx), %rbx
	adc	%r15, %rbx
	mov	%rbx, %rax
	pop	%rbx
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12

	FUNC_EXIT()
	ret
EPILOGUE()