dnl  AMD64 mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- Hensel division by 1-limb divisor,
dnl  returning quotient only.

dnl  Copyright 2001, 2002, 2004-2006, 2009, 2011, 2012, 2017 Free Software
dnl  Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
include(`../config.m4')

C	    cycles/limb
C	    norm/unorm
C AMD K8,K9	10	+
C AMD K10	10	+
C AMD bull	13.7	-
C AMD pile	13.7	+
C AMD steam
C AMD excavator
C AMD bobcat	15	-
C AMD jaguar	16	-
C Intel P4	33	=
C Intel core2	13.25	=
C Intel NHM	14	=
C Intel SBR	 8.5	-
C Intel IBR	 8.5	-
C Intel HWL	 8	=
C Intel BWL	 8	=
C Intel SKL	 8	=
C Intel atom	42	--
C Intel SLM	20.4	--
C VIA nano

C INPUT PARAMETERS
C rp    destination pointer, quotient limbs are stored through it
C up    source pointer, dividend limbs are loaded through it
C n     limb count, the code below handles n = 1 separately from n > 1
C d     the single-limb divisor
C di    inverse of the odd part of d, multiplied into every quotient limb
C ncnt  right-shift count applied to the dividend limbs before dividing
define(`rp',		`%rdi')
define(`up',		`%rsi')
define(`n',		`%rdx')
define(`d',		`%rcx')
define(`di',		`%r8')		C	just mpn_pi1_bdiv_q_1
define(`ncnt',		`%r9')		C	just mpn_pi1_bdiv_q_1

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)

C mpn_bdiv_q_1 (rp, up, n, d)
C
C Front end that prepares the register state expected at L(pi1) inside
C mpn_pi1_bdiv_q_1 and then jumps there; it has no return path of its own.
C On reaching L(pi1) it has set up:
C   rcx  ncnt, 0 for an odd d, otherwise the bit index found by bsf
C   r11  d shifted right by ncnt
C   r8   inverse of that shifted d, refined to 64 bits
C   r10  n
C rbx is pushed here and popped by the exit paths that follow L(pi1).
C NOTE(review): FUNC_ENTRY is defined outside this file; presumably it maps
C the 4 arguments onto the registers named above for DOS64 -- confirm.
PROLOGUE(mpn_bdiv_q_1)
	FUNC_ENTRY(4)
	push	%rbx

	mov	%rcx, %rax		C working copy of d
	xor	R32(%rcx), R32(%rcx)	C ncnt count
	mov	%rdx, %r10		C n, kept in r10 because rdx is reused below

	bt	$0, R32(%rax)
	jnc	L(evn)			C skip bsf unless divisor is even

L(odd):	mov	%rax, %rbx		C rbx = d with low zero bits removed
	shr	R32(%rax)
	and	$127, R32(%rax)		C d/2, 7 bits

	LEA(	binvert_limb_table, %rdx)

	movzbl	(%rdx,%rax), R32(%rax)	C inv 8 bits

	mov	%rbx, %r11		C d without twos

	C Three refinement steps, each applying inv = 2*inv - inv*inv*d.
	C The first two use 32-bit arithmetic, the last one 64-bit.
	lea	(%rax,%rax), R32(%rdx)	C 2*inv
	imul	R32(%rax), R32(%rax)	C inv*inv
	imul	R32(%rbx), R32(%rax)	C inv*inv*d
	sub	R32(%rax), R32(%rdx)	C inv = 2*inv - inv*inv*d, 16 bits

	lea	(%rdx,%rdx), R32(%rax)	C 2*inv
	imul	R32(%rdx), R32(%rdx)	C inv*inv
	imul	R32(%rbx), R32(%rdx)	C inv*inv*d
	sub	R32(%rdx), R32(%rax)	C inv = 2*inv - inv*inv*d, 32 bits

	lea	(%rax,%rax), %r8	C 2*inv
	imul	%rax, %rax		C inv*inv
	imul	%rbx, %rax		C inv*inv*d
	sub	%rax, %r8		C inv = 2*inv - inv*inv*d, 64 bits

	jmp	L(pi1)			C continue in the shared division code

	C Even divisor: record the index of the lowest set bit as ncnt and
	C shift it out of d, then compute the inverse as for an odd divisor.
	C NOTE(review): presumably callers never pass d = 0 -- confirm.
L(evn):	bsf	%rax, %rcx
	shr	R8(%rcx), %rax
	jmp	L(odd)
EPILOGUE()

C mpn_pi1_bdiv_q_1 (rp, up, n, d, di, ncnt)
C
C Same division as above but with the inverse di and the shift count ncnt
C supplied by the caller instead of being computed here.
C Each source limb is shifted right by ncnt, taking the vacated high bits
C from the next limb, the running borrow and carry limb are subtracted,
C and the result is multiplied by the inverse to give one quotient limb.
C NOTE(review): IFDOS and FUNC_ENTRY are defined outside this file; the two
C IFDOS lines presumably expand only for DOS64, where di and ncnt are read
C from the stack -- confirm.
PROLOGUE(mpn_pi1_bdiv_q_1)
	FUNC_ENTRY(4)
IFDOS(`	mov	56(%rsp), %r8	')
IFDOS(`	mov	64(%rsp), %r9	')
	push	%rbx

	mov	%rcx, %r11		C d
	mov	%rdx, %r10		C n
	mov	%r9, %rcx		C ncnt

	C Shared entry: mpn_bdiv_q_1 jumps here with r8, r10, r11, rcx set
	C and rbx already pushed.
L(pi1):	mov	(up), %rax		C up[0]

	dec	%r10
	jz	L(one)			C n = 1, no neighbouring limb to shift in

	mov	8(up), %rdx		C up[1]
	lea	(up,%r10,8), up		C up end
	lea	(rp,%r10,8), rp		C rp end
	neg	%r10			C -n

	shrd	R8(%rcx), %rdx, %rax	C first limb, high bits come from up[1]

	xor	R32(%rbx), R32(%rbx)	C no borrow pending for the first limb
	jmp	L(ent)

	ALIGN(8)
L(top):
	C rax	q
	C rbx	carry bit, 0 or 1
	C rcx	ncnt
	C rdx
	C r10	counter, limbs, negative
	C r11	d

	mul	%r11			C carry limb in rdx
	mov	(up,%r10,8), %rax
	mov	8(up,%r10,8), %r9
	shrd	R8(%rcx), %r9, %rax	C current limb, high bits from the next one
	nop				C NOTE(review): presumably scheduling or alignment padding
	sub	%rbx, %rax		C apply carry bit
	setc	R8(%rbx)		C borrow from that subtraction
	sub	%rdx, %rax		C apply carry limb
	adc	$0, R32(%rbx)		C add the borrow from the second subtraction
L(ent):	imul	%r8, %rax		C quotient limb = adjusted limb * inverse
	mov	%rax, (rp,%r10,8)
	inc	%r10
	jnz	L(top)

	C Last limb: nothing above it to shift in, so a plain shr is used,
	C and the borrow produced here is not recorded.
	mul	%r11			C carry limb in rdx
	mov	(up), %rax		C up high limb
	shr	R8(%rcx), %rax
	sub	%rbx, %rax		C apply carry bit
	sub	%rdx, %rax		C apply carry limb
	imul	%r8, %rax
	mov	%rax, (rp)
	pop	%rbx
	FUNC_EXIT()
	ret

	C Single-limb operand: shift, multiply by the inverse, store.
L(one):	shr	R8(%rcx), %rax
	imul	%r8, %rax
	mov	%rax, (rp)
	pop	%rbx
	FUNC_EXIT()
	ret
EPILOGUE()