dnl  AMD64 mpn_mod_34lsub1 -- remainder modulo 2^48-1.

dnl  Copyright 2000-2002, 2004, 2005, 2007, 2010-2012 Free Software Foundation,
dnl  Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb
C AMD K8,K9	 1.0
C AMD K10	 1.12
C Intel P4	 3.25
C Intel core2	 1.5
C Intel corei	 1.5
C Intel atom	 2.5
C VIA nano	 1.75


C INPUT PARAMETERS
define(`ap', %rdi)
define(`n',  %rsi)

C mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n)

C TODO
C  * Review feed-in and wind-down code.  In particular, try to avoid adc and
C    sbb to placate Pentium4.
C  * It seems possible to reach 2.67 c/l by using a cleaner 6-way unrolling,
C    without the dual loop exits.
56 57ABI_SUPPORT(DOS64) 58ABI_SUPPORT(STD64) 59 60ASM_START() 61 TEXT 62 ALIGN(32) 63PROLOGUE(mpn_mod_34lsub1) 64 FUNC_ENTRY(2) 65 66 mov $0x0000FFFFFFFFFFFF, %r11 67 68 sub $2, %rsi 69 ja L(gt2) 70 71 mov (ap), %rax 72 nop 73 jb L(1) 74 75 mov 8(ap), %rsi 76 mov %rax, %rdx 77 shr $48, %rax C src[0] low 78 79 and %r11, %rdx C src[0] high 80 add %rdx, %rax 81 mov R32(%rsi), R32(%rdx) 82 83 shr $32, %rsi C src[1] high 84 add %rsi, %rax 85 86 shl $16, %rdx C src[1] low 87 add %rdx, %rax 88 89L(1): FUNC_EXIT() 90 ret 91 92 93 ALIGN(16) 94L(gt2): xor R32(%rax), R32(%rax) 95 xor R32(%rcx), R32(%rcx) 96 xor R32(%rdx), R32(%rdx) 97 xor %r8, %r8 98 xor %r9, %r9 99 xor %r10, %r10 100 101L(top): add (ap), %rax 102 adc $0, %r10 103 add 8(ap), %rcx 104 adc $0, %r8 105 add 16(ap), %rdx 106 adc $0, %r9 107 108 sub $3, %rsi 109 jng L(end) 110 111 add 24(ap), %rax 112 adc $0, %r10 113 add 32(ap), %rcx 114 adc $0, %r8 115 add 40(ap), %rdx 116 lea 48(ap), ap 117 adc $0, %r9 118 119 sub $3, %rsi 120 jg L(top) 121 122 123 add $-24, ap 124L(end): add %r9, %rax 125 adc %r10, %rcx 126 adc %r8, %rdx 127 128 inc %rsi 129 mov $0x1, R32(%r10) 130 js L(combine) 131 132 mov $0x10000, R32(%r10) 133 adc 24(ap), %rax 134 dec %rsi 135 js L(combine) 136 137 adc 32(ap), %rcx 138 mov $0x100000000, %r10 139 140L(combine): 141 sbb %rsi, %rsi C carry 142 mov %rax, %rdi C 0mod3 143 shr $48, %rax C 0mod3 high 144 145 and %r10, %rsi C carry masked 146 and %r11, %rdi C 0mod3 low 147 mov R32(%rcx), R32(%r10) C 1mod3 148 149 add %rsi, %rax C apply carry 150 shr $32, %rcx C 1mod3 high 151 152 add %rdi, %rax C apply 0mod3 low 153 movzwl %dx, R32(%rdi) C 2mod3 154 shl $16, %r10 C 1mod3 low 155 156 add %rcx, %rax C apply 1mod3 high 157 shr $16, %rdx C 2mod3 high 158 159 add %r10, %rax C apply 1mod3 low 160 shl $32, %rdi C 2mod3 low 161 162 add %rdx, %rax C apply 2mod3 high 163 add %rdi, %rax C apply 2mod3 low 164 165 FUNC_EXIT() 166 ret 167EPILOGUE() 168