dnl  AMD64 mpn_sec_tabselect.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2011-2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb     good for cpu
C AMD K8,K9	 1.5		Y
C AMD K10	 1.4
C AMD bd1	 2.64
C AMD bobcat	 2.15		Y
C Intel P4	 4
C Intel core2	 1.38
C Intel NHM	 1.75
C Intel SBR	 1.25
C Intel atom	 2.5		Y
C VIA nano	 1.75		Y

C NOTES
C  * This has not been tuned for any specific processor.  Its speed should not
C    be too bad, though.
C  * Using SSE2/AVX2 could result in many-fold speedup.
C  * The main loop handles 4 limbs per pass; a 2-limb pass and a 1-limb pass
C    after it pick up the remaining n mod 4 limbs, so n does not have to be a
C    multiple of 4.  (An earlier note here said "WORKS FOR n mod 4 = 0 ONLY!",
C    which no longer matches the code.)

C mpn_sec_tabselect (mp_limb_t *rp, mp_limb_t *tp, mp_size_t n, mp_size_t nents, mp_size_t which)
C
C Stores into rp[0..n-1] the OR of every table entry ANDed with a per-entry
C mask.  Entry k (k = 0 .. nents-1) starts n limbs after entry k-1 and gets an
C all-ones mask exactly when k equals which, otherwise an all-zeros mask.  The
C result is therefore entry number which, or all zeros when which lies outside
C 0 .. nents-1.
C
C Every entry is loaded whatever the value of which; no branch and no memory
C address depends on it.
C
C nents is expected to be at least 1: the entry loops test their counter only
C after the first pass.
C
C The limbs are processed in column blocks: as many 4-limb blocks as fit in n,
C then one 2-limb block if bit 1 of n is set, then one 1-limb block if bit 0 of
C n is set.  Each block walks the whole table once.

C Incoming arguments.
define(`rp',	`%rdi')
define(`tp',	`%rsi')
define(`n',	`%rdx')
define(`nents',	`%rcx')
define(`which',	`%r8')

C Loop counters.
define(`ecnt',	`%rbp')		C table entries left in the current block
define(`lcnt',	`%r9')		C limbs left for the 4-limb loop, minus 4

C Register use:
C   rax		selection mask, 0 or all ones
C   rbx		countdown copy of which, reloaded for every block
C   rcx		nents
C   rdx		n
C   rdi		rp
C   rsi		tp (the table)
C   rbp		ecnt
C   r8		which
C   r9		lcnt
C   r10, r11	limbs just loaded from the table
C   r12 - r15	accumulators for the limbs of the current block

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_sec_tabselect)
	FUNC_ENTRY(4)
C Under the DOS64 ABI the fifth argument (which) is read from the stack as a
C 32-bit value.  FUNC_ENTRY and IFDOS are defined outside this file.
IFDOS(`	mov	56(%rsp), %r8d	')

	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	mov	n, lcnt
	add	$-4, lcnt
	js	L(tail)			C fewer than 4 limbs, skip the 4-limb loop

C ---- 4-limb blocks ----
L(blk4):
	mov	nents, ecnt
	push	tp			C keep the block start, tp walks the table
	xor	R32(%r12), R32(%r12)
	xor	R32(%r13), R32(%r13)
	xor	R32(%r14), R32(%r14)
	xor	R32(%r15), R32(%r15)
	mov	which, %rbx

	ALIGN(16)
L(sel4):
	sub	$1, %rbx		C borrow only if the countdown was 0
	sbb	%rax, %rax		C rax = all ones on borrow, else 0
	mov	0(tp), %r10
	mov	8(tp), %r11
	and	%rax, %r10
	and	%rax, %r11
	or	%r10, %r12
	or	%r11, %r13
	mov	16(tp), %r10
	mov	24(tp), %r11
	and	%rax, %r10
	and	%rax, %r11
	or	%r10, %r14
	or	%r11, %r15
	lea	(tp,n,8), tp		C same limbs of the next entry
	add	$-1, ecnt
	jne	L(sel4)

	mov	%r12, 0(rp)
	mov	%r13, 8(rp)
	mov	%r14, 16(rp)
	mov	%r15, 24(rp)
	pop	tp			C back to the block start in entry 0
	lea	32(tp), tp		C advance table and result by 4 limbs
	lea	32(rp), rp
	add	$-4, lcnt
	jns	L(blk4)

C ---- 2-limb block, taken when bit 1 of n is set ----
L(tail):
	test	$2, R8(n)
	jz	L(tail1)
	mov	nents, ecnt
	push	tp
	xor	R32(%r12), R32(%r12)
	xor	R32(%r13), R32(%r13)
	mov	which, %rbx
	ALIGN(16)
L(sel2):
	sub	$1, %rbx
	sbb	%rax, %rax
	mov	0(tp), %r10
	mov	8(tp), %r11
	and	%rax, %r10
	and	%rax, %r11
	or	%r10, %r12
	or	%r11, %r13
	lea	(tp,n,8), tp
	add	$-1, ecnt
	jne	L(sel2)
	mov	%r12, 0(rp)
	mov	%r13, 8(rp)
	pop	tp
	lea	16(tp), tp
	lea	16(rp), rp

C ---- 1-limb block, taken when bit 0 of n is set ----
C tp is not saved here since nothing reads it after this block.
L(tail1):
	test	$1, R8(n)
	jz	L(done)
	mov	nents, ecnt
	xor	R32(%r12), R32(%r12)
	mov	which, %rbx
	ALIGN(16)
L(sel1):
	sub	$1, %rbx
	sbb	%rax, %rax
	mov	0(tp), %r10
	and	%rax, %r10
	or	%r10, %r12
	lea	(tp,n,8), tp
	add	$-1, ecnt
	jne	L(sel1)
	mov	%r12, 0(rp)

C Restore the saved registers in reverse push order.
L(done):
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret
EPILOGUE()