dnl  AMD K6 mpn_add/sub_n -- mpn addition or subtraction.

dnl  Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C K6: normal 3.25 cycles/limb, in-place 2.75 cycles/limb.


dnl  Select the carry-propagating instruction and the exported names for the
dnl  operation being built.  Exactly one of OPERATION_add_n and
dnl  OPERATION_sub_n must be defined, otherwise m4_error is invoked.
ifdef(`OPERATION_add_n', `
	define(M4_inst,        adcl)
	define(M4_function_n,  mpn_add_n)
	define(M4_function_nc, mpn_add_nc)
	define(M4_description, add)
',`ifdef(`OPERATION_sub_n', `
	define(M4_inst,        sbbl)
	define(M4_function_n,  mpn_sub_n)
	define(M4_function_nc, mpn_sub_nc)
	define(M4_description, subtract)
',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
')')')

MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)


C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
C                          mp_size_t size);
C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
C                           mp_size_t size, mp_limb_t carry);
C
C Calculate src1,size M4_description src2,size, and store the result in
C dst,size.  The return value is the carry bit from the top of the result
C (1 or 0).
C
C The _nc version accepts 1 or 0 for an initial carry into the low limb of
C the calculation.  Note values other than 1 or 0 here will lead to garbage
C results.
C
C Instruction decoding limits a normal dst=src1+src2 operation to 3 c/l, and
C an in-place dst+=src to 2.5 c/l.  The unrolled loops have 1 cycle/loop of
C loop control, which with 4 limbs/loop means an extra 0.25 c/l.
C
C Code paths, selected on size and on operand overlap:
C
C   size < UNROLL_THRESHOLD   L(simple), one limb per iteration
C   dst == src1               L(inplace), dst is updated directly in memory
C   dst == src2, add only     L(inplace_reverse), which swaps the source
C                             pointer and falls into L(inplace)
C   otherwise                 L(normal_top), four limbs per iteration
C
C The carry between limbs is held in the processor carry flag for the whole
C calculation, so every instruction placed between two M4_inst operations
C must leave that flag untouched.  That is why pointer and counter updates
C are done with leal, counting with loop and decl, and eax is cleared with
C movl $0 rather than xorl before the final setc.
C
C None of the loops test for size==0; the code as written needs size >= 1.

C Stack arguments, addressed relative to esp.  FRAME is started at 0 and
C FRAME_pushl() is invoked after the pushes that precede a PARAM_ access
C (presumably adding 4 to FRAME each time; the macro is defined outside
C this file).
define(PARAM_CARRY, `FRAME+20(%esp)')
define(PARAM_SIZE,  `FRAME+16(%esp)')
define(PARAM_SRC2,  `FRAME+12(%esp)')
define(PARAM_SRC1,  `FRAME+8(%esp)')
define(PARAM_DST,   `FRAME+4(%esp)')
deflit(`FRAME',0)

dnl  minimum 5 because the unrolled code can't handle less
deflit(UNROLL_THRESHOLD, 5)

	TEXT
	ALIGN(32)

C Entry with an explicit carry-in: load it into eax and join the common
C code at L(start).
PROLOGUE(M4_function_nc)
	movl	PARAM_CARRY, %eax
	jmp	L(start)
EPILOGUE()


C Entry without a carry-in: eax = 0 and fall into the common code.
PROLOGUE(M4_function_n)
	xorl	%eax, %eax
L(start):
	movl	PARAM_SIZE, %ecx
	pushl	%ebx
FRAME_pushl()

	movl	PARAM_SRC1, %ebx
	pushl	%edi
FRAME_pushl()

	movl	PARAM_SRC2, %edx
	cmpl	$UNROLL_THRESHOLD, %ecx

	movl	PARAM_DST, %edi		C movl leaves the cmpl flags intact
	jae	L(unroll)		C unsigned: size >= UNROLL_THRESHOLD


	shrl	%eax		C initial carry flag

	C offset 0x21 here, close enough to aligned
L(simple):
	C eax	scratch
	C ebx	src1
	C ecx	counter
	C edx	src2
	C esi
	C edi	dst
	C ebp
	C
	C The store to (%edi) could be done with a stosl; it'd be smaller
	C code, but there's no speed gain and a cld would have to be added
	C (per mpn/x86/README).
	C
	C One limb per pass.  The pointers are advanced with leal and the
	C count with loop, so the carry out of M4_inst survives to the next
	C pass.

	movl	(%ebx), %eax
	leal	4(%ebx), %ebx

	M4_inst	(%edx), %eax

	movl	%eax, (%edi)
	leal	4(%edi), %edi

	leal	4(%edx), %edx
	loop	L(simple)


	movl	$0, %eax	C movl, not xorl, to keep the carry flag
	popl	%edi

	setc	%al		C return value is the final carry, 0 or 1

	popl	%ebx
	ret


C -----------------------------------------------------------------------------
L(unroll):
	C eax	carry
	C ebx	src1
	C ecx	size
	C edx	src2
	C esi
	C edi	dst
	C ebp
	C
	C esi is saved here without a FRAME_pushl(); no PARAM_ reference
	C appears after this point.

	cmpl	%edi, %ebx
	pushl	%esi		C pushl leaves the cmpl flags intact

	je	L(inplace)	C dst == src1

ifdef(`OPERATION_add_n',`
	cmpl	%edi, %edx

	je	L(inplace_reverse)
')

	movl	%ecx, %esi

	andl	$-4, %ecx	C limbs handled by the unrolled loop
	andl	$3, %esi	C leftover limbs, 0 to 3

	C Point each operand just past the unrolled portion, so that the loop
	C can index upwards from a negative count and finish at zero.
	leal	(%ebx,%ecx,4), %ebx
	leal	(%edx,%ecx,4), %edx
	leal	(%edi,%ecx,4), %edi

	negl	%ecx
	shrl	%eax		C initial carry flag, set after the last
				C flag-changing setup instruction

	ALIGN(32)
L(normal_top):
	C eax	scratch
	C ebx	src1, pointing past the unrolled portion
	C ecx	counter, limbs, negative
	C edx	src2, pointing past the unrolled portion
	C esi	leftover limb count, 0 to 3
	C edi	dst, pointing past the unrolled portion
	C ebp
	C
	C Four limbs per pass.  ecx is stepped by +5 with leal and then by -1
	C with loop, a net +4, and loop falls through when ecx reaches zero.
	C The -20 displacements cancel the 5 limbs (20 bytes) already added
	C to ecx.

	movl	(%ebx,%ecx,4), %eax
	leal	5(%ecx), %ecx
	M4_inst	-20(%edx,%ecx,4), %eax
	movl	%eax, -20(%edi,%ecx,4)

	movl	4-20(%ebx,%ecx,4), %eax
	M4_inst	4-20(%edx,%ecx,4), %eax
	movl	%eax, 4-20(%edi,%ecx,4)

	movl	8-20(%ebx,%ecx,4), %eax
	M4_inst	8-20(%edx,%ecx,4), %eax
	movl	%eax, 8-20(%edi,%ecx,4)

	movl	12-20(%ebx,%ecx,4), %eax
	M4_inst	12-20(%edx,%ecx,4), %eax
	movl	%eax, 12-20(%edi,%ecx,4)

	loop	L(normal_top)


	C Here ecx is 0 and ebx, edx, edi address the first leftover limb.
	C decl sets the zero and sign flags for the branches but keeps the
	C carry flag.

	decl	%esi
	jz	L(normal_finish_one)	C exactly one limb left
	js	L(normal_done)		C nothing left

	C two or three more limbs

	movl	(%ebx), %eax
	M4_inst	(%edx), %eax
	movl	%eax, (%edi)

	movl	4(%ebx), %eax
	M4_inst	4(%edx), %eax
	decl	%esi
	movl	%eax, 4(%edi)		C movl keeps the decl flags for jz

	jz	L(normal_done)		C there were exactly two
	movl	$2, %ecx		C index of the third leftover limb

L(normal_finish_one):
	C ecx is 0 when one limb was left, 2 when three were left
	movl	(%ebx,%ecx,4), %eax
	M4_inst	(%edx,%ecx,4), %eax
	movl	%eax, (%edi,%ecx,4)

L(normal_done):
	popl	%esi
	popl	%edi

	movl	$0, %eax	C movl, not xorl, to keep the carry flag
	popl	%ebx

	setc	%al		C return value is the final carry, 0 or 1

	ret


C -----------------------------------------------------------------------------
C In-place operation, dst op= src, with M4_inst applied straight to the dst
C memory operand.  For the add build a dst==src2 call is handled too, by
C using src1 as the source instead (addition being commutative); there is no
C such entry in the subtract build.

ifdef(`OPERATION_add_n',`
L(inplace_reverse):
	C dst==src2

	movl	%ebx, %edx
')

L(inplace):
	C eax	initial carry
	C ebx
	C ecx	size
	C edx	src
	C esi
	C edi	dst
	C ebp
	C
	C The unrolled loop is given (size-1) rounded down to a multiple of 4
	C limbs, leaving 1 to 4 limbs for the code after it.  The first src
	C limb is fetched into ebx ahead of the loop.

	leal	-1(%ecx), %esi
	decl	%ecx

	andl	$-4, %ecx	C limbs handled by the unrolled loop
	andl	$3, %esi	C (size-1) mod 4

	movl	(%edx), %ebx	C src low limb
	leal	(%edx,%ecx,4), %edx

	leal	(%edi,%ecx,4), %edi
	negl	%ecx

	shrl	%eax		C initial carry flag, set after the last
				C flag-changing setup instruction


	ALIGN(32)
L(inplace_top):
	C eax	scratch, src limb
	C ebx	next src limb
	C ecx	counter, limbs, negative
	C edx	src, pointing past the unrolled portion
	C esi	(size-1) mod 4, 0 to 3
	C edi	dst, pointing past the unrolled portion
	C ebp
	C
	C Four limbs per pass, with the same +5 and -1 counter stepping as
	C L(normal_top).  The last load of each pass fetches the first src
	C limb of the following group into ebx.

	M4_inst	%ebx, (%edi,%ecx,4)

	movl	4(%edx,%ecx,4), %eax
	leal	5(%ecx), %ecx

	M4_inst	%eax, 4-20(%edi,%ecx,4)

	movl	8-20(%edx,%ecx,4), %eax
	movl	12-20(%edx,%ecx,4), %ebx

	M4_inst	%eax, 8-20(%edi,%ecx,4)
	M4_inst	%ebx, 12-20(%edi,%ecx,4)

	movl	16-20(%edx,%ecx,4), %ebx
	loop	L(inplace_top)


	C now %esi is 0 to 3 representing respectively 1 to 4 limbs more
	C
	C ecx is 0 here and ebx already holds the src limb at (%edx).

	M4_inst	%ebx, (%edi)

	decl	%esi			C keeps the carry flag
	jz	L(inplace_finish_one)	C two limbs in total, one to go
	js	L(inplace_done)		C that was the only limb

	C two or three more limbs

	movl	4(%edx), %eax
	movl	8(%edx), %ebx
	M4_inst	%eax, 4(%edi)
	M4_inst	%ebx, 8(%edi)

	decl	%esi
	movl	$2, %ecx		C index used for the fourth limb

	C L(normal_done) is the same register restore and return sequence
	C as L(inplace_done) below.
	jz	L(normal_done)

L(inplace_finish_one):
	C ecx is 0 for the second of two limbs, 2 for the fourth of four
	movl	4(%edx,%ecx,4), %eax
	M4_inst	%eax, 4(%edi,%ecx,4)

L(inplace_done):
	popl	%esi
	popl	%edi

	movl	$0, %eax	C movl, not xorl, to keep the carry flag
	popl	%ebx

	setc	%al		C return value is the final carry, 0 or 1

	ret

EPILOGUE()