dnl  AMD64 mpn_sublsh1_n optimised for Intel Atom.
dnl  Used also for AMD bd1.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C TODO
C  * This code is slightly large at 501 bytes.
C  * aorrlsh1_n.asm and this file use the same basic pattern.

C                cycles/limb
C AMD K8,K9       ?
C AMD K10         ?
C AMD bd1         2.3
C AMD bobcat      ?
C Intel P4        ?
C Intel core2     ?
C Intel NHM       ?
C Intel SBR       ?
C Intel atom      5      (4.875 is probably possible)
C VIA nano        ?

C ----------------------------------------------------------------------------
C mpn_sublsh1_n  / mpn_sublsh1_nc
C
C For n limbs of 64 bits each, stores  up[i] - 2*vp[i]  (with carries
C propagated across limbs) to rp[i].
C
C Two independent carry chains are kept alive across code sections:
C   scy  carry out of the doubling chain      (add/adc  reg, reg)
C   acy  borrow out of the subtraction chain  (sbb)
C Each one is parked in a register as a mask, 0 or -1, by "sbb r, r" and
C put back into CF by "add r, r":
C   eax  holds scy
C   ebp  holds acy
C
C Return value, in eax:  -(scy + acy), that is 0, 1 or 2.
C
C Work split:  n mod 4 limbs first (b1, b2, b3), then one block of 4 limbs
C when bit 2 of n is set (b0), then blocks of 8 limbs while 8 fit (top).
C
C Pointer updates use lea, which leaves CF untouched between the chains.
C
C Saved registers: rbp and r15 always; r12, r13, r14 and rbx only when the
C 8-limb loop is entered.
C
C NOTE(review): FUNC_ENTRY, FUNC_EXIT and IFDOS come from config.m4 and are
C not visible here; presumably they adapt the DOS64 calling convention to
C the STD64 register assignment used below.  Confirm there.
C NOTE(review): the interleaving of loads, stores and sbb in the main loop
C looks deliberate, presumably for Atom scheduling.  Do not reorder without
C measuring.
C ----------------------------------------------------------------------------

C INPUT PARAMETERS
define(`rp', `%rdi')
define(`up', `%rsi')
define(`vp', `%rdx')
define(`n',  `%rcx')
define(`cy', `%r8')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
        TEXT
        ALIGN(16)
PROLOGUE(mpn_sublsh1_n)
        FUNC_ENTRY(4)
        push    %rbp
        push    %r15
        xor     R32(%rbp), R32(%rbp)            C acy = 0, no incoming borrow

C Common entry, also reached from mpn_sublsh1_nc with acy preset in ebp.
C eax = n mod 4 selects the lead-in.  When it is zero eax doubles as scy = 0.
L(ent): mov     R32(n), R32(%rax)
        and     $3, R32(%rax)
        jz      L(b0)
        cmp     $2, R32(%rax)
        jz      L(b2)
        jg      L(b3)

C n mod 4 = 1: one limb.  First doubling uses add since there is no scy yet.
L(b1):  mov     (vp), %r8
        add     %r8, %r8
        lea     8(vp), vp
        sbb     R32(%rax), R32(%rax)            C save scy
        add     R32(%rbp), R32(%rbp)            C restore acy
        mov     (up), %r15
        sbb     %r8, %r15
        mov     %r15, (rp)
        sbb     R32(%rbp), R32(%rbp)            C save acy
        lea     8(up), up
        lea     8(rp), rp
        jmp     L(b0)

C n mod 4 = 2: two limbs.
L(b2):  mov     (vp), %r8
        add     %r8, %r8
        mov     8(vp), %r9
        adc     %r9, %r9
        lea     16(vp), vp
        sbb     R32(%rax), R32(%rax)            C save scy
        add     R32(%rbp), R32(%rbp)            C restore acy
        mov     (up), %r15
        sbb     %r8, %r15
        mov     %r15, (rp)
        mov     8(up), %r15
        sbb     %r9, %r15
        mov     %r15, 8(rp)
        sbb     R32(%rbp), R32(%rbp)            C save acy
        lea     16(up), up
        lea     16(rp), rp
        jmp     L(b0)

C n mod 4 = 3: three limbs, then fall through to b0.
L(b3):  mov     (vp), %r8
        add     %r8, %r8
        mov     8(vp), %r9
        adc     %r9, %r9
        mov     16(vp), %r10
        adc     %r10, %r10
        lea     24(vp), vp
        sbb     R32(%rax), R32(%rax)            C save scy
        add     R32(%rbp), R32(%rbp)            C restore acy
        mov     (up), %r15
        sbb     %r8, %r15
        mov     %r15, (rp)
        mov     8(up), %r15
        sbb     %r9, %r15
        mov     %r15, 8(rp)
        mov     16(up), %r15
        sbb     %r10, %r15
        mov     %r15, 16(rp)
        sbb     R32(%rbp), R32(%rbp)            C save acy
        lea     24(up), up
        lea     24(rp), rp

C Bit 2 of n set: one block of four limbs, continuing both chains.
L(b0):  test    $4, R8(n)
        jz      L(skp)
        add     R32(%rax), R32(%rax)            C restore scy
        mov     (vp), %r8
        adc     %r8, %r8
        mov     8(vp), %r9
        adc     %r9, %r9
        mov     16(vp), %r10
        adc     %r10, %r10
        mov     24(vp), %r11
        adc     %r11, %r11
        lea     32(vp), vp
        sbb     R32(%rax), R32(%rax)            C save scy
        add     R32(%rbp), R32(%rbp)            C restore acy
        mov     (up), %r15
        sbb     %r8, %r15
        mov     %r15, (rp)
        mov     8(up), %r15
        sbb     %r9, %r15
        mov     %r15, 8(rp)
        mov     16(up), %r15
        sbb     %r10, %r15
        mov     %r15, 16(rp)
        mov     24(up), %r15
        sbb     %r11, %r15
        mov     %r15, 24(rp)
        lea     32(up), up
        lea     32(rp), rp
        sbb     R32(%rbp), R32(%rbp)            C save acy

C n still holds the full count; fewer than 8 means nothing is left to do.
L(skp): cmp     $8, n
        jl      L(rtn)

        push    %r12
        push    %r13
        push    %r14
        push    %rbx
        lea     -64(rp), rp                     C loop bumps rp before storing
        jmp     L(x)

C Main loop, 8 limbs per pass.  First half doubles eight vp limbs into
C r8..r15; second half subtracts them from eight up limbs and stores to rp.
        ALIGN(16)
L(top): mov     (vp), %r8
        add     R32(%rax), R32(%rax)            C restore scy
        lea     64(vp), vp
        adc     %r8, %r8
        mov     -56(vp), %r9
        adc     %r9, %r9
        mov     -48(vp), %r10
        adc     %r10, %r10
        mov     -40(vp), %r11
        adc     %r11, %r11
        mov     -32(vp), %r12
        adc     %r12, %r12
        mov     -24(vp), %r13
        adc     %r13, %r13
        mov     -16(vp), %r14
        adc     %r14, %r14
        mov     -8(vp), %r15
        adc     %r15, %r15
        sbb     R32(%rax), R32(%rax)            C save scy
        add     R32(%rbp), R32(%rbp)            C restore acy
        mov     (up), %rbp                      C rbp reused for data; mov keeps CF
        lea     64(rp), rp
        mov     8(up), %rbx
        sbb     %r8, %rbp
        mov     32(up), %r8
        mov     %rbp, (rp)
        sbb     %r9, %rbx
        mov     16(up), %rbp
        mov     %rbx, 8(rp)
        sbb     %r10, %rbp
        mov     24(up), %rbx
        mov     %rbp, 16(rp)
        sbb     %r11, %rbx
        mov     %rbx, 24(rp)
        sbb     %r12, %r8
        mov     40(up), %r9
        mov     %r8, 32(rp)
        sbb     %r13, %r9
        mov     48(up), %rbp
        mov     %r9, 40(rp)
        sbb     %r14, %rbp
        mov     56(up), %rbx
        mov     %rbp, 48(rp)
        sbb     %r15, %rbx
        lea     64(up), up
        mov     %rbx, 56(rp)
        sbb     R32(%rbp), R32(%rbp)            C save acy
L(x):   sub     $8, n                           C both carries are parked by now
        jge     L(top)

L(end): pop     %rbx
        pop     %r14
        pop     %r13
        pop     %r12
L(rtn):
        add     R32(%rbp), R32(%rax)            C eax = scy + acy, each 0 or -1
        neg     R32(%rax)                       C return 0, 1 or 2

        pop     %r15
        pop     %rbp
        FUNC_EXIT()
        ret
EPILOGUE()

C Variant with a carry-in operand cy.  The carry-in seeds acy, so it is
C applied as an initial borrow on the subtraction chain.  neg sets CF
C exactly when its operand is nonzero, so cy is consumed as zero/nonzero.
C NOTE(review): under DOS64 cy is read from 56(%rsp) after FUNC_ENTRY;
C presumably that is the fifth-argument stack slot there.  Confirm against
C the FUNC_ENTRY definition.
PROLOGUE(mpn_sublsh1_nc)
        FUNC_ENTRY(4)
IFDOS(` mov     56(%rsp), %r8   ')
        push    %rbp
        push    %r15
        neg     %r8                             C set CF
        sbb     R32(%rbp), R32(%rbp)            C save acy
        jmp     L(ent)
EPILOGUE()