dnl  AMD64 mpn_divexact_1 -- mpn by limb exact division.

dnl  Copyright 2001, 2002, 2004-2006, 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb
C AMD K8,K9	10
C AMD K10	10
C Intel P4	33
C Intel core2	13.25
C Intel corei	14
C Intel atom	42
C VIA nano	43

C A quick adoption of the 32-bit K7 code.

C Method, as implemented below:
C   1. If the divisor is even, bsf counts its trailing zero bits into rcx and
C      the divisor is shifted down by that count.  The same count is later
C      shifted out of the dividend, limb by limb, with shrd.
C   2. The inverse of the now odd divisor modulo 2^64 is built from an 8-bit
C      entry of binvert_limb_table, refined by three steps of
C      inv = 2*inv - inv*inv*d  (8 -> 16 -> 32 -> 64 bits).
C   3. Each quotient limb is  (shifted up limb - carry bit - carry limb) * inv,
C      where the carry limb is the high half of  q * d  from the previous limb.
C
C NOTE(review): there is no check for n = 0 or divisor = 0 in this code.
C Callers presumably guarantee n >= 1 and a nonzero divisor -- confirm against
C the mpn interface documentation.


C INPUT PARAMETERS
C rp		rdi
C up		rsi
C n		rdx
C divisor	rcx

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_divexact_1)
	C FUNC_ENTRY and FUNC_EXIT are defined outside this file; presumably they
	C adapt the DOS64 argument registers to the STD64 layout listed above.
	FUNC_ENTRY(4)
	push	%rbx			C rbx is used as scratch and carry bit

	mov	%rcx, %rax		C rax = divisor
	xor	R32(%rcx), R32(%rcx)	C shift count
	mov	%rdx, %r8		C r8 = n, frees rdx for table base and mul

	bt	$0, R32(%rax)		C CF = low bit of divisor
	jnc	L(evn)			C skip bsfq unless divisor is even

L(odd):	mov	%rax, %rbx		C rbx = odd divisor, kept for Newton steps
	shr	R32(%rax)
	and	$127, R32(%rax)		C d/2, 7 bits

	LEA(	binvert_limb_table, %rdx)

	movzbl	(%rdx,%rax), R32(%rax)	C inv 8 bits

	mov	%rbx, %r11		C d without twos

	lea	(%rax,%rax), R32(%rdx)	C 2*inv
	imul	R32(%rax), R32(%rax)	C inv*inv
	imul	R32(%rbx), R32(%rax)	C inv*inv*d
	sub	R32(%rax), R32(%rdx)	C inv = 2*inv - inv*inv*d, 16 bits

	lea	(%rdx,%rdx), R32(%rax)	C 2*inv
	imul	R32(%rdx), R32(%rdx)	C inv*inv
	imul	R32(%rbx), R32(%rdx)	C inv*inv*d
	sub	R32(%rdx), R32(%rax)	C inv = 2*inv - inv*inv*d, 32 bits

	lea	(%rax,%rax), %r10	C 2*inv
	imul	%rax, %rax		C inv*inv
	imul	%rbx, %rax		C inv*inv*d
	sub	%rax, %r10		C inv = 2*inv - inv*inv*d, 64 bits

	C Point both arrays at their ends and count r8 up from -n towards zero,
	C so the loop needs only inc/jnz and no separate compare.
	lea	(%rsi,%r8,8), %rsi	C up end
	lea	-8(%rdi,%r8,8), %rdi	C rp end
	neg	%r8			C -n

	mov	(%rsi,%r8,8), %rax	C up[0]

	inc	%r8
	jz	L(one)			C n = 1, no higher limb to shift in

	mov	(%rsi,%r8,8), %rdx	C up[1]

	shrd	R8(%rcx), %rdx, %rax	C low limb of up >> shift

	xor	R32(%rbx), R32(%rbx)	C no carry bit yet; also clears upper rbx
	jmp	L(ent)

L(evn):	bsf	%rax, %rcx		C rcx = number of trailing zero bits
	shr	R8(%rcx), %rax		C make the divisor odd
	jmp	L(odd)

	ALIGN(8)
L(top):
	C rax	q
	C rbx	carry bit, 0 or 1
	C rcx	shift
	C rdx
	C rsi	up end
	C rdi	rp end
	C r8	counter, limbs, negative
	C r10	d^(-1) mod 2^64
	C r11	d, shifted down

	mul	%r11			C carry limb in rdx	0 10
	mov	-8(%rsi,%r8,8), %rax	C
	mov	(%rsi,%r8,8), %r9	C
	shrd	R8(%rcx), %r9, %rax	C
	nop				C
	sub	%rbx, %rax		C apply carry bit
	setc	%bl			C
	sub	%rdx, %rax		C apply carry limb	5
	adc	$0, %rbx		C			6
L(ent):	imul	%r10, %rax		C			6
	mov	%rax, (%rdi,%r8,8)	C
	inc	%r8			C
	jnz	L(top)

	C Last limb: there is no higher limb to feed shrd, so a plain shr is used,
	C and the outgoing carry is not recorded.
	mul	%r11			C carry limb in rdx
	mov	-8(%rsi), %rax		C up high limb
	shr	R8(%rcx), %rax
	sub	%rbx, %rax		C apply carry bit
	sub	%rdx, %rax		C apply carry limb
	imul	%r10, %rax
	mov	%rax, (%rdi)
	pop	%rbx
	FUNC_EXIT()
	ret

	C Single limb operand: shift, multiply by the inverse, store.
L(one):	shr	R8(%rcx), %rax
	imul	%r10, %rax
	mov	%rax, (%rdi)
	pop	%rbx
	FUNC_EXIT()
	ret

EPILOGUE()