dnl  AMD64 mpn_lshsub_n.  R = 2^k(U - V).

dnl  Copyright 2006 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb
C K8,K9:	 3.15	(mpn_sub_n + mpn_lshift costs about 4 c/l)
C K10:		 3.15	(mpn_sub_n + mpn_lshift costs about 4 c/l)
C P4:		16.5
C P6-15:	 4.35

C This was written quickly and not optimized at all, but it runs very well on
C K8.  But perhaps one could get under 3 c/l.  Ideas:
C   1) Use indexing to save the 3 LEA
C   2) Write reasonable feed-in code
C   3) Be more clever about register usage
C   4) Unroll more, handling CL negation, carry save/restore cost much now
C   5) Reschedule

C SYNTAX AND ABI
C AT&T operand order (source first), preprocessed by m4.  TEXT, ALIGN,
C PROLOGUE, EPILOGUE, ASM_START and L() are not defined in this file;
C presumably they come from the included config.m4.
C The five arguments arrive in rdi, rsi, rdx, rcx, r8, which is the System V
C AMD64 integer argument order.  rbx and r12-r15 are callee-saved under that
C ABI, so they are pushed on entry and popped before returning.

C OPERATION
C For each of the n limbs, least significant first:
C   d[i]  = up[i] - vp[i] - borrow
C   rp[i] = (d[i] << cnt) | (d[i-1] >> (64 - cnt))	with d[-1] = 0
C The first n mod 4 limbs go through a one-limb loop, the rest through a
C loop unrolled four ways.
C
C RETURN VALUE (rax)
C   (d[n-1] >> (64 - cnt)) + (final borrow << cnt)
C
C RESTRICTIONS
C The right shift count is formed by negating cl, and the CPU uses only the
C low 6 bits of cl for 64-bit shifts.  With cnt = 0 the negated count is
C also 0, so the previous limb would be merged unshifted.  The routine is
C therefore only meaningful for 1 <= cnt <= 63.

C INPUT PARAMETERS
define(`rp',	`%rdi')
define(`up',	`%rsi')
define(`vp',	`%rdx')
define(`n',	`%rcx')
define(`cnt',	`%r8')

C REGISTER ROLES
C   rax		limbs still to process; finally the return value
C   ebx		borrow saved as 0 or -1 while other code clobbers the flags
C   cl		shift count (rcx is reused once n has been copied to rax)
C   r15		bits shifted out of the top of the previous difference limb
C   r8-r11	difference limbs, shifted in place into result limbs
C		(r11d first serves as counter for the n mod 4 leading limbs)
C   r12-r14	unshifted copies of difference limbs, source of carried bits

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_lshsub_n)

	push	%r12
	push	%r13
	push	%r14
	push	%r15
	push	%rbx

	mov	n, %rax			C limb count; must be copied before rcx is reused
	xor	%ebx, %ebx		C clear carry save register
	mov	%r8d, %ecx		C shift count (r8 is the cnt argument)
	xor	%r15d, %r15d		C limb carry

	mov	%eax, %r11d
	and	$3, %r11d		C r11d = n mod 4
	je	L(4)			C nothing to do before the unrolled loop
	sub	$1, %r11d		C bias so that jnc below runs n mod 4 times

C One limb per iteration, for the n mod 4 least significant limbs.
L(oopette):
	add	%ebx, %ebx		C restore carry flag
	mov	0(up), %r8
	lea	8(up), up		C lea leaves the carry flag alone
	sbb	0(vp), %r8
	mov	%r8, %r12		C keep unshifted difference
	sbb	%ebx, %ebx		C save carry flag
	shl	%cl, %r8
	or	%r15, %r8		C merge bits carried from previous limb
	mov	%r12, %r15
	lea	8(vp), vp
	neg	%cl			C cl = 64 - cnt (mod 64)
	shr	%cl, %r15		C bits carried into the next limb
	neg	%cl			C back to cnt
	mov	%r8, 0(rp)
	lea	8(rp), rp
	sub	$1, %r11d
	jnc	L(oopette)

L(4):
	sub	$4, %rax
	jc	L(end)			C fewer than 4 limbs remain, i.e. none

C Four limbs per iteration.
	ALIGN(16)
L(oop):
	add	%ebx, %ebx		C restore carry flag

	mov	0(up), %r8
	mov	8(up), %r9
	mov	16(up), %r10
	mov	24(up), %r11

	lea	32(up), up		C lea leaves the carry flag alone

	sbb	0(vp), %r8
	mov	%r8, %r12		C unshifted copies feed the limb above
	sbb	8(vp), %r9
	mov	%r9, %r13
	sbb	16(vp), %r10
	mov	%r10, %r14
	sbb	24(vp), %r11

	sbb	%ebx, %ebx		C save carry flag

	shl	%cl, %r8
	shl	%cl, %r9
	shl	%cl, %r10
	or	%r15, %r8		C merge bits carried from previous iteration
	mov	%r11, %r15		C unshifted top limb, shifted right below
	shl	%cl, %r11

	lea	32(vp), vp

	neg	%cl			C cl = 64 - cnt (mod 64)

	shr	%cl, %r12
	shr	%cl, %r13
	shr	%cl, %r14
	shr	%cl, %r15		C used next loop

	or	%r12, %r9
	or	%r13, %r10
	or	%r14, %r11

	neg	%cl			C back to cnt

	mov	%r8, 0(rp)
	mov	%r9, 8(rp)
	mov	%r10, 16(rp)
	mov	%r11, 24(rp)

	lea	32(rp), rp

	sub	$4, %rax
	jnc	L(oop)
L(end):
	neg	%ebx			C saved borrow 0 or -1 becomes 0 or 1
	shl	%cl, %rbx		C borrow << cnt; shifts out only zero bits
	adc	%r15, %rbx		C add bits shifted out of the top limb
	mov	%rbx, %rax		C return value
	pop	%rbx
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12

	ret
EPILOGUE()