dnl  AMD64 mpn_lshsub_n.  R = 2^k(U - V).

dnl  Copyright 2006, 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb
C AMD K8,K9	 3.15	(mpn_sub_n + mpn_lshift costs about 4 c/l)
C AMD K10	 3.15	(mpn_sub_n + mpn_lshift costs about 4 c/l)
C Intel P4	16.5
C Intel core2	 4.35
C Intel corei	 ?
C Intel atom	 ?
C VIA nano	 ?

C This was written quickly and not optimized at all, but it runs very well on
C K8.  But perhaps one could get under 3 c/l.  Ideas:
C 1) Use indexing to save the 3 LEA
C 2) Write reasonable feed-in code
C 3) Be more clever about register usage
C 4) Unroll more, handling CL negation, carry save/restore cost much now
C 5) Reschedule

C INPUT PARAMETERS
define(`rp', `%rdi')		C result limb pointer
define(`up', `%rsi')		C first source limb pointer
define(`vp', `%rdx')		C second source limb pointer
define(`n',  `%rcx')		C limb count
define(`cnt',`%r8')		C left-shift amount k

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_lshsub_n)
	FUNC_ENTRY(4)
IFDOS(`	mov	56(%rsp), %r8d	')	C DOS64 passes the 5th arg on the stack

C Save callee-saved registers used below.
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	push	%rbx

C Register roles for the loops:
C   %rax  remaining limb count
C   %rbx  saved borrow as a mask (0 or -1), restored into CF via rbx+rbx
C   %rcx  cl = shift count; negated around SHR to get the 64-cnt right shift
C   %r15  bits carried between limbs of the left shift
	mov	n, %rax
	xor	R32(%rbx), R32(%rbx)	C clear carry save register
	mov	R32(%r8), R32(%rcx)	C shift count
	xor	R32(%r15), R32(%r15)	C limb carry

	mov	R32(%rax), R32(%r11)
	and	$3, R32(%r11)		C r11 = n mod 4, limbs to peel one-by-one
	je	L(4)
	sub	$1, R32(%r11)

C Feed-in loop: subtract and shift n mod 4 limbs, one per iteration.
L(oopette):
	add	R32(%rbx), R32(%rbx)	C restore carry flag
	mov	0(up), %r8
	lea	8(up), up
	sbb	0(vp), %r8		C r8 = u - v - borrow
	mov	%r8, %r12		C keep difference for the next limb's low bits
	sbb	R32(%rbx), R32(%rbx)	C save carry flag
	shl	R8(%rcx), %r8
	or	%r15, %r8		C merge bits shifted out of previous limb
	mov	%r12, %r15
	lea	8(vp), vp
	neg	R8(%rcx)		C cl := -cnt, i.e. 64-cnt mod 64 for SHR
	shr	R8(%rcx), %r15		C high bits of this limb, for next limb
	neg	R8(%rcx)		C cl := cnt again
	mov	%r8, 0(rp)
	lea	8(rp), rp
	sub	$1, R32(%r11)
	jnc	L(oopette)

L(4):
	sub	$4, %rax
	jc	L(end)

C Main loop: 4 limbs per iteration, same subtract-then-shift scheme.
	ALIGN(16)
L(oop):
	add	R32(%rbx), R32(%rbx)	C restore carry flag

	mov	0(up), %r8
	mov	8(up), %r9
	mov	16(up), %r10
	mov	24(up), %r11

	lea	32(up), up

	sbb	0(vp), %r8
	mov	%r8, %r12		C copies keep the pre-shift differences
	sbb	8(vp), %r9
	mov	%r9, %r13
	sbb	16(vp), %r10
	mov	%r10, %r14
	sbb	24(vp), %r11

	sbb	R32(%rbx), R32(%rbx)	C save carry flag

	shl	R8(%rcx), %r8
	shl	R8(%rcx), %r9
	shl	R8(%rcx), %r10
	or	%r15, %r8		C low bits come from previous iteration
	mov	%r11, %r15
	shl	R8(%rcx), %r11

	lea	32(vp), vp

	neg	R8(%rcx)		C cl := 64-cnt (mod 64) for the SHRs

	shr	R8(%rcx), %r12
	shr	R8(%rcx), %r13
	shr	R8(%rcx), %r14
	shr	R8(%rcx), %r15		C used next loop

	or	%r12, %r9		C splice high bits of limb i into limb i+1
	or	%r13, %r10
	or	%r14, %r11

	neg	R8(%rcx)		C cl := cnt again

	mov	%r8, 0(rp)
	mov	%r9, 8(rp)
	mov	%r10, 16(rp)
	mov	%r11, 24(rp)

	lea	32(rp), rp

	sub	$4, %rax
	jnc	L(oop)
L(end):
C Fold the final borrow (rbx mask) and the bits shifted out of the top
C limb (r15) into the return value.
	neg	R32(%rbx)		C rbx := 0 or 1 from the borrow mask
	shl	R8(%rcx), %rbx		C position borrow at bit cnt
	adc	%r15, %rbx		C NOTE(review): CF here comes from the SHL;
					C with rbx<=1 and cnt<64 it should be 0
	mov	%rbx, %rax
	pop	%rbx
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12

	FUNC_EXIT()
	ret
EPILOGUE()