1dnl Intel P6 mpn_mod_34lsub1 -- remainder modulo 2^24-1. 2 3dnl Copyright 2000-2002, 2004 Free Software Foundation, Inc. 4 5dnl This file is part of the GNU MP Library. 6dnl 7dnl The GNU MP Library is free software; you can redistribute it and/or modify 8dnl it under the terms of either: 9dnl 10dnl * the GNU Lesser General Public License as published by the Free 11dnl Software Foundation; either version 3 of the License, or (at your 12dnl option) any later version. 13dnl 14dnl or 15dnl 16dnl * the GNU General Public License as published by the Free Software 17dnl Foundation; either version 2 of the License, or (at your option) any 18dnl later version. 19dnl 20dnl or both in parallel, as here. 21dnl 22dnl The GNU MP Library is distributed in the hope that it will be useful, but 23dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25dnl for more details. 26dnl 27dnl You should have received copies of the GNU General Public License and the 28dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29dnl see https://www.gnu.org/licenses/. 30 31include(`../config.m4') 32 33 34C P6: 2.0 cycles/limb 35 36C TODO 37C Experiments with more unrolling indicate that 1.5 c/l is possible on P6-13 38C with the current carry handling scheme. 39 40C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size) 41C 42C Groups of three limbs are handled, with carry bits from 0mod3 into 1mod3 43C into 2mod3, but at that point going into a separate carries total so we 44C don't keep the carry flag live across the loop control. Avoiding decl 45C lets us get to 2.0 c/l, as compared to the generic x86 code at 3.66. 
C

C Stack argument slots.  defframe comes from the included m4 support files
C and is not visible in this file.
defframe(PARAM_SIZE, 8)
defframe(PARAM_SRC, 4)

dnl  Re-use the parameter space: both arguments are copied into registers at
dnl  entry, after which their stack slots serve as save areas for ebx and esi.
define(SAVE_EBX, `PARAM_SIZE')
define(SAVE_ESI, `PARAM_SRC')

C mpn_mod_34lsub1
C
C In:	PARAM_SRC	src, address of the 32-bit limbs; src[0] is given the
C			lowest weight, so it is the least significant limb
C	PARAM_SIZE	size, number of limbs.  src[0] is loaded
C			unconditionally, so size is presumably at least 1
C			-- confirm against the callers.
C Out:	eax		sum of the 24-bit pieces of the operand, each piece
C			placed according to its weight modulo 2^24-1.  No
C			further reduction is applied before returning.
C Uses:	ecx, edx and the flags are overwritten.  ebx, esi and edi are only
C	touched on the three_or_more path and are restored before ret.
C
C NOTE(review): FRAME_pushl and FRAME_popl are defined outside this file;
C they presumably keep FRAME in step with the stack pointer so that the
C defframe names stay valid while edi is on the stack -- confirm.

	TEXT
	ALIGN(16)
PROLOGUE(mpn_mod_34lsub1)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx
	movl	PARAM_SRC, %edx

	subl	$2, %ecx		C size-2, sets the flags for both jumps
	movl	(%edx), %eax		C src[0], movl leaves the flags alone
	ja	L(three_or_more)	C size > 2
	jb	L(one)			C size < 2, src[0] is returned as is

	C size==2
	C
	C src[0] has weight 1:    high 8 bits and low 24 bits both go in at bit 0.
	C src[1] has weight 2^32, which is 2^8 modulo 2^24-1: its high 16 bits
	C go in at bit 0 and its low 16 bits at bit 8.

	movl	4(%edx), %ecx		C src[1]

	movl	%eax, %edx		C src[0]
	shrl	$24, %eax		C src[0] high

	andl	$0xFFFFFF, %edx		C src[0] low

	addl	%edx, %eax
	movl	%ecx, %edx		C src[1]
	shrl	$16, %ecx		C src[1] high

	andl	$0xFFFF, %edx
	addl	%ecx, %eax

	shll	$8, %edx		C src[1] low

	addl	%edx, %eax
L(one):
	ret


L(three_or_more):
	C eax	src[0], initial acc 0mod3
	C ebx
	C ecx	size-2
	C edx	src
	C esi
	C edi
	C ebp
	C
	C The first three limbs seed the three accumulators.  The flags from
	C the subl below are consumed by the jng at the end of this block; every
	C instruction in between is a movl or pushl, none of which alter them.
	C That is why edi is cleared with movl $0 and not with xorl.

	movl	%ebx, SAVE_EBX
	movl	4(%edx), %ebx		C src[1], initial 1mod3
	subl	$3, %ecx		C size-5

	movl	%esi, SAVE_ESI
	movl	8(%edx), %esi		C src[2], initial 2mod3

	pushl	%edi		FRAME_pushl()
	movl	$0, %edi		C initial carries 0mod3
	jng	L(done)			C if size < 6


L(top):
	C eax	acc 0mod3
	C ebx	acc 1mod3
	C ecx	counter, limbs (limbs still unprocessed, minus 2)
	C edx	src, with 12(%edx) the next unprocessed limb
	C esi	acc 2mod3
	C edi	carries into 0mod3
	C ebp
	C
	C The carry ripples eax -> ebx -> esi and the carry out of esi is
	C counted in edi.  leal is used to advance the pointer because it does
	C not modify the flags, so the carry from the third add still reaches
	C the adcl into edi.

	addl	12(%edx), %eax
	adcl	16(%edx), %ebx
	adcl	20(%edx), %esi
	leal	12(%edx), %edx
	adcl	$0, %edi

	subl	$3, %ecx
	jg	L(top)			C at least 3 more to process


L(done):
	C ecx is -2, -1 or 0 representing 0, 1 or 2 more limbs respectively
	cmpl	$-1, %ecx
	jl	L(done_0)		C if -2, meaning 0 more limbs

	C 1 or 2 more limbs
	C movl $0 keeps the cmpl flags intact for the je that follows.
	movl	$0, %ecx
	je	L(done_1)		C if -1, meaning 1 more limb only
	movl	16(%edx), %ecx		C second leftover limb, else ecx stays 0
L(done_1):
	addl	12(%edx), %eax		C 0mod3
	adcl	%ecx, %ebx		C 1mod3
	adcl	$0, %esi		C 2mod3
	adcl	$0, %edi		C carries 0mod3

L(done_0):
	C eax	acc 0mod3
	C ebx	acc 1mod3
	C ecx
	C edx
	C esi	acc 2mod3
	C edi	carries 0mod3
	C ebp
	C
	C Fold everything into eax.  Bit positions modulo 2^24-1:
	C   0mod3 and carries, weight 1:     high  8 bits at bit 0, low 24 at bit 0
	C   1mod3,             weight 2^8:   high 16 bits at bit 0, low 16 at bit 8
	C   2mod3,             weight 2^16:  high 24 bits at bit 0, low  8 at bit 16
	C The restores of ebx, edi and esi are interleaved with the arithmetic,
	C each placed after the last use of the register it reloads.

	movl	%eax, %ecx		C 0mod3
	shrl	$24, %eax		C 0mod3 high initial total

	andl	$0xFFFFFF, %ecx		C 0mod3 low
	movl	%edi, %edx		C carries
	shrl	$24, %edi		C carries high

	addl	%ecx, %eax		C add 0mod3 low
	andl	$0xFFFFFF, %edx		C carries 0mod3 low
	movl	%ebx, %ecx		C 1mod3

	shrl	$16, %ebx		C 1mod3 high
	addl	%edi, %eax		C add carries high
	addl	%edx, %eax		C add carries 0mod3 low

	andl	$0xFFFF, %ecx		C 1mod3 low mask
	addl	%ebx, %eax		C add 1mod3 high
	movl	SAVE_EBX, %ebx		C ebx no longer needed, restore it

	shll	$8, %ecx		C 1mod3 low
	movl	%esi, %edx		C 2mod3
	popl	%edi		FRAME_popl()

	shrl	$8, %esi		C 2mod3 high
	andl	$0xFF, %edx		C 2mod3 low mask
	addl	%ecx, %eax		C add 1mod3 low

	shll	$16, %edx		C 2mod3 low
	addl	%esi, %eax		C add 2mod3 high
	movl	SAVE_ESI, %esi		C esi no longer needed, restore it

	addl	%edx, %eax		C add 2mod3 low

	ret

EPILOGUE()