dnl  AMD64 mpn_sublsh1_n -- rp[] = up[] - (vp[] << 1)

dnl  Copyright 2003, 2005-2007, 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb
C AMD K8,K9	 2.2
C AMD K10	 2.2
C Intel P4	12.75
C Intel core2	 3.45
C Intel corei	 ?
C Intel atom	 ?
C VIA nano	 3.25

C Sometimes speed degenerates, supposedly related to that some operand
C alignments cause cache conflicts.

C The speed is limited by decoding/issue bandwidth.  There are 26 instructions
C in the loop, which corresponds to 26/3/4 = 2.167 c/l.
C mpn_sublsh1_n (rp, up, vp, n)
C
C Computes rp[i] = up[i] - ((vp[] << 1)[i]) over n 64-bit limbs, processing
C limbs from index 0 upwards.  Two independent carry chains are kept:
C
C   scy  carry out of the doubling of vp[] (the bit shifted out of a limb)
C   acy  borrow out of the subtraction up[] - 2*vp[]
C
C Each chain is parked between uses as a 0 / -1 mask produced by
C "sbb reg, reg" and is turned back into CF by "add reg, reg".
C
C   %rax  scy mask between blocks; return value at exit
C   %rbp  acy mask between blocks; also a data temporary inside a block
C   %rbx  data temporary
C   %r8-%r11  doubled vp limbs
C
C Return value: -(scy mask + acy mask), i.e. 0, 1 or 2.
C
C n = 0 is not handled: the (n mod 4) == 0 path always processes four limbs,
C and vp[0] is loaded unconditionally on entry.
C
C R32(reg) is used as the 32-bit form of reg; R32, L, PROLOGUE, FUNC_ENTRY,
C FUNC_EXIT and friends are macros defined outside this file.

C INPUT PARAMETERS
define(`rp',`%rdi')
define(`up',`%rsi')
define(`vp',`%rdx')
define(`n', `%rcx')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_sublsh1_n)
	FUNC_ENTRY(4)
	push	%rbx			C rbx and rbp are used as temporaries below
	push	%rbp

	mov	(vp), %r8		C preload vp[0] before vp is advanced
	mov	R32(n), R32(%rax)	C copy of n for the n mod 4 dispatch
	lea	(rp,n,8), rp		C point each operand just past its last limb
	lea	(up,n,8), up
	lea	(vp,n,8), vp
	neg	n			C n is now a negative index counting up to 0
	xor	R32(%rbp), R32(%rbp)	C acy mask = 0 (used when entering at b00)
	and	$3, R32(%rax)		C n mod 4; and also leaves CF = 0 for b00
	je	L(b00)			C n mod 4 == 0
	cmp	$2, R32(%rax)
	jc	L(b01)			C n mod 4 == 1
	je	L(b10)			C n mod 4 == 2
					C fall through: n mod 4 == 3

C Three leading limbs.
L(b11):	add	%r8, %r8
	mov	8(vp,n,8), %r9
	adc	%r9, %r9
	mov	16(vp,n,8), %r10
	adc	%r10, %r10
	sbb	R32(%rax), R32(%rax)	C save scy
	mov	(up,n,8), %rbp
	mov	8(up,n,8), %rbx
	sub	%r8, %rbp
	sbb	%r9, %rbx
	mov	%rbp, (rp,n,8)
	mov	%rbx, 8(rp,n,8)
	mov	16(up,n,8), %rbp
	sbb	%r10, %rbp
	mov	%rbp, 16(rp,n,8)
	sbb	R32(%rbp), R32(%rbp)	C save acy
	add	$3, n			C sets SF for the jns at ent
	jmp	L(ent)

C Two leading limbs.
L(b10):	add	%r8, %r8
	mov	8(vp,n,8), %r9
	adc	%r9, %r9
	sbb	R32(%rax), R32(%rax)	C save scy
	mov	(up,n,8), %rbp
	mov	8(up,n,8), %rbx
	sub	%r8, %rbp
	sbb	%r9, %rbx
	mov	%rbp, (rp,n,8)
	mov	%rbx, 8(rp,n,8)
	sbb	R32(%rbp), R32(%rbp)	C save acy
	add	$2, n			C sets SF for the jns at ent
	jmp	L(ent)

C One leading limb.
L(b01):	add	%r8, %r8
	sbb	R32(%rax), R32(%rax)	C save scy
	mov	(up,n,8), %rbp
	sub	%r8, %rbp
	mov	%rbp, (rp,n,8)
	sbb	R32(%rbp), R32(%rbp)	C save acy
	inc	n			C sets SF for the jns at ent
L(ent):	jns	L(end)			C index reached 0: no limbs left

C Main loop, four limbs per iteration.  b00 enters past the scy restore and
C the vp[0] load, since CF is already 0 and r8 already holds vp[0].
	ALIGN(16)
L(top):	add	R32(%rax), R32(%rax)	C restore scy

	mov	(vp,n,8), %r8
L(b00):	adc	%r8, %r8
	mov	8(vp,n,8), %r9
	adc	%r9, %r9
	mov	16(vp,n,8), %r10
	adc	%r10, %r10
	mov	24(vp,n,8), %r11
	adc	%r11, %r11

	sbb	R32(%rax), R32(%rax)	C save scy
	add	R32(%rbp), R32(%rbp)	C restore acy

	mov	(up,n,8), %rbp
	mov	8(up,n,8), %rbx
	sbb	%r8, %rbp
	sbb	%r9, %rbx
	mov	%rbp, (rp,n,8)
	mov	%rbx, 8(rp,n,8)
	mov	16(up,n,8), %rbp
	mov	24(up,n,8), %rbx
	sbb	%r10, %rbp
	sbb	%r11, %rbx
	mov	%rbp, 16(rp,n,8)
	mov	%rbx, 24(rp,n,8)

	sbb	R32(%rbp), R32(%rbp)	C save acy
	add	$4, n
	js	L(top)			C loop while the index is still negative

L(end):	add	R32(%rbp), R32(%rax)	C scy mask + acy mask = 0, -1 or -2
	neg	R32(%rax)		C return 0, 1 or 2

	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret
EPILOGUE()