dnl  AMD64 mpn_sublshC_n -- rp[] = up[] - (vp[] << C), optimised for Core 2 and
dnl  Core iN.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2008, 2010-2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

C	     cycles/limb
C AMD K8,K9	 4.25
C AMD K10	 ?
C Intel P4	 ?
C Intel core2	 3
C Intel NHM	 3.1
C Intel SBR	 2.47
C Intel atom	 ?
C VIA nano	 ?
43 44C INPUT PARAMETERS 45define(`rp',`%rdi') 46define(`up',`%rsi') 47define(`vp',`%rdx') 48define(`n', `%rcx') 49 50ASM_START() 51 TEXT 52 ALIGN(8) 53PROLOGUE(func) 54 FUNC_ENTRY(4) 55 push %rbx 56 push %r12 57 58 mov R32(%rcx), R32(%rax) 59 lea 24(up,n,8), up 60 lea 24(vp,n,8), vp 61 lea 24(rp,n,8), rp 62 neg n 63 64 xor R32(%r11), R32(%r11) 65 66 mov -24(vp,n,8), %r8 C do first limb early 67 shrd $RSH, %r8, %r11 68 69 and $3, R32(%rax) 70 je L(b0) 71 cmp $2, R32(%rax) 72 jc L(b1) 73 je L(b2) 74 75L(b3): mov -16(vp,n,8), %r9 76 shrd $RSH, %r9, %r8 77 mov -8(vp,n,8), %r10 78 shrd $RSH, %r10, %r9 79 mov -24(up,n,8), %r12 80 ADDSUB %r11, %r12 81 mov %r12, -24(rp,n,8) 82 mov -16(up,n,8), %r12 83 ADCSBB %r8, %r12 84 mov %r12, -16(rp,n,8) 85 mov -8(up,n,8), %r12 86 ADCSBB %r9, %r12 87 mov %r12, -8(rp,n,8) 88 mov %r10, %r11 89 sbb R32(%rax), R32(%rax) C save cy 90 add $3, n 91 js L(top) 92 jmp L(end) 93 94L(b1): mov -24(up,n,8), %r12 95 ADDSUB %r11, %r12 96 mov %r12, -24(rp,n,8) 97 mov %r8, %r11 98 sbb R32(%rax), R32(%rax) C save cy 99 inc n 100 js L(top) 101 jmp L(end) 102 103L(b2): mov -16(vp,n,8), %r9 104 shrd $RSH, %r9, %r8 105 mov -24(up,n,8), %r12 106 ADDSUB %r11, %r12 107 mov %r12, -24(rp,n,8) 108 mov -16(up,n,8), %r12 109 ADCSBB %r8, %r12 110 mov %r12, -16(rp,n,8) 111 mov %r9, %r11 112 sbb R32(%rax), R32(%rax) C save cy 113 add $2, n 114 js L(top) 115 jmp L(end) 116 117 ALIGN(16) 118L(top): mov -24(vp,n,8), %r8 119 shrd $RSH, %r8, %r11 120L(b0): mov -16(vp,n,8), %r9 121 shrd $RSH, %r9, %r8 122 mov -8(vp,n,8), %r10 123 shrd $RSH, %r10, %r9 124 mov (vp,n,8), %rbx 125 shrd $RSH, %rbx, %r10 126 127 add R32(%rax), R32(%rax) C restore cy 128 129 mov -24(up,n,8), %r12 130 ADCSBB %r11, %r12 131 mov %r12, -24(rp,n,8) 132 133 mov -16(up,n,8), %r12 134 ADCSBB %r8, %r12 135 mov %r12, -16(rp,n,8) 136 137 mov -8(up,n,8), %r12 138 ADCSBB %r9, %r12 139 mov %r12, -8(rp,n,8) 140 141 mov (up,n,8), %r12 142 ADCSBB %r10, %r12 143 mov %r12, (rp,n,8) 144 145 mov %rbx, %r11 146 sbb 
R32(%rax), R32(%rax) C save cy 147 148 add $4, n 149 js L(top) 150 151L(end): shr $RSH, %r11 152 pop %r12 153 pop %rbx 154 sub R32(%r11), R32(%rax) 155 neg R32(%rax) 156 FUNC_EXIT() 157 ret 158EPILOGUE() 159