dnl  AMD K7 mpn_add_n/mpn_sub_n -- mpn add or subtract.

dnl  Copyright 1999, 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C K7: 1.64 cycles/limb (at 16 limbs/loop).



dnl  K7: UNROLL_COUNT cycles/limb
dnl           8           1.9
dnl          16           1.64
dnl          32           1.7
dnl          64           2.0
dnl  Maximum possible with the current code is 64.
dnl
dnl  UNROLL_COUNT is the tuning knob measured in the table above; 16 is the
dnl  entry with the lowest cycles/limb and matches the "16 limbs/loop" figure
dnl  quoted at the top of the file.
dnl  NOTE(review): UNROLL_LOG2, UNROLL_MASK and UNROLL_BYTES are used by the
dnl  code but are not defined in this file; presumably they are derived from
dnl  UNROLL_COUNT by the included macro files -- confirm there.

deflit(UNROLL_COUNT, 16)


dnl  Select the operation.  Exactly one of OPERATION_add_n / OPERATION_sub_n
dnl  is expected to be defined by the build:
dnl    M4_inst         the carry-propagating instruction (adcl or sbbl)
dnl    M4_function_n   exported name of the entry without a carry-in
dnl    M4_function_nc  exported name of the entry taking a carry-in
dnl    M4_description  word substituted into the comments (add / subtract)
dnl  If neither symbol is defined, m4_error is invoked with a diagnostic.

ifdef(`OPERATION_add_n', `
	define(M4_inst,        adcl)
	define(M4_function_n,  mpn_add_n)
	define(M4_function_nc, mpn_add_nc)
	define(M4_description, add)
',`ifdef(`OPERATION_sub_n', `
	define(M4_inst,        sbbl)
	define(M4_function_n,  mpn_sub_n)
	define(M4_function_nc, mpn_sub_nc)
	define(M4_description, subtract)
',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
')')')

dnl  Lists the four entry points this one source can be built into.
dnl  NOTE(review): MULFUNC_PROLOGUE is defined outside this file; presumably
dnl  it is what lets the build system discover these names -- confirm.

MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)


C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
C                         mp_size_t size);
C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
C	                   mp_size_t size, mp_limb_t carry);
C
C Calculate src1,size M4_description src2,size, and store the result in
C dst,size.
C The return value is the carry bit from the top of the result (1
C or 0).
C
C The _nc version accepts 1 or 0 for an initial carry into the low limb of
C the calculation.  Note values other than 1 or 0 here will lead to garbage
C results.
C
C This code runs at 1.64 cycles/limb, which might be the best possible with
C plain integer operations.  Each limb is 2 loads and 1 store, any 2 of
C which can be done each cycle, leading to 1.5 c/l.
C
C Structure of the code below:
C
C   M4_function_nc  loads the carry argument into eax and joins L(start).
C   M4_function_n   zeros eax (no carry-in) and falls into L(start).
C   L(start)        common setup; sizes below UNROLL_THRESHOLD go to
C                   L(simple), everything else to L(unroll).
C   L(simple)       one limb per iteration.
C   L(unroll)       handles the even part of size with a computed jump into
C                   the unrolled loop at L(entry), then one trailing limb if
C                   size was odd.
C
C Carry handling: "shrl %eax" moves the carry-in bit into the CPU carry
C flag.  From that point until the final "setc %al" every instruction must
C leave the carry flag alone apart from M4_inst itself, which is why the
C code uses leal for pointer updates, incl/decl for counters and
C "movl $0, %eax" (not xorl) to clear eax.

dnl  Must have UNROLL_THRESHOLD >= 2, since the unrolled loop can't handle 1.
dnl  Both branches currently select the same value, so PIC makes no
dnl  difference to the threshold.
ifdef(`PIC',`
deflit(UNROLL_THRESHOLD, 8)
',`
deflit(UNROLL_THRESHOLD, 8)
')

dnl  Incoming stack arguments (positive offsets) and the four register save
dnl  slots (negative offsets) that live in the STACK_SPACE bytes allocated at
dnl  L(start).
dnl  NOTE(review): defframe is defined outside this file; presumably it
dnl  turns these offsets into esp-relative operands using the current FRAME
dnl  value -- confirm.
defframe(PARAM_CARRY,20)
defframe(PARAM_SIZE, 16)
defframe(PARAM_SRC2, 12)
defframe(PARAM_SRC1, 8)
defframe(PARAM_DST,  4)

defframe(SAVE_EBP, -4)
defframe(SAVE_ESI, -8)
defframe(SAVE_EBX, -12)
defframe(SAVE_EDI, -16)
deflit(STACK_SPACE, 16)

	TEXT
	ALIGN(32)
deflit(`FRAME',0)

C Carry-in entry point: fetch the caller's carry and share the main body.
PROLOGUE(M4_function_nc)
	movl	PARAM_CARRY, %eax
	jmp	L(start)
EPILOGUE()

PROLOGUE(M4_function_n)

	xorl	%eax, %eax	C carry
L(start):
	C eax	carry-in (0 or 1)
	movl	PARAM_SIZE, %ecx
	subl	$STACK_SPACE, %esp
deflit(`FRAME',STACK_SPACE)

	movl	%edi, SAVE_EDI
	movl	%ebx, SAVE_EBX
	cmpl	$UNROLL_THRESHOLD, %ecx	C flags consumed by the jae below

	movl	PARAM_SRC2, %edx	C movl leaves the cmpl flags intact
	movl	PARAM_SRC1, %ebx
	jae	L(unroll)		C unsigned: size >= UNROLL_THRESHOLD

	C Small size.  Point ebx/edx/edi just past the end of each operand
	C and run ecx from -size up to zero.
	C NOTE(review): this assumes size >= 1; with size == 0 the counter
	C would start at zero and the incl/jnz loop would not stop after
	C zero iterations -- confirm callers guarantee a non-zero size.
	movl	PARAM_DST, %edi
	leal	(%ebx,%ecx,4), %ebx
	leal	(%edx,%ecx,4), %edx

	leal	(%edi,%ecx,4), %edi
	negl	%ecx
	shrl	%eax			C carry-in bit -> carry flag

	C This loop is in a single 16 byte code block already, so no
	C alignment necessary.
L(simple):
	C eax	scratch
	C ebx	src1
	C ecx	counter
	C edx	src2
	C esi
	C edi	dst
	C ebp

	movl	(%ebx,%ecx,4), %eax
	M4_inst	(%edx,%ecx,4), %eax
	movl	%eax, (%edi,%ecx,4)
	incl	%ecx			C incl does not modify the carry flag
	jnz	L(simple)

	movl	$0, %eax		C movl, not xorl: carry flag must survive
	movl	SAVE_EDI, %edi

	movl	SAVE_EBX, %ebx
	setc	%al			C return value = final carry/borrow
	addl	$STACK_SPACE, %esp

	ret


C -----------------------------------------------------------------------------
	C This is at 0x55, close enough to aligned.
L(unroll):
	C eax	carry-in (0 or 1)
	C ebx	src1
	C ecx	size
	C edx	src2
	C
	C The even part of size is processed by the unrolled loop.  The low
	C bit of size is left in the PARAM_SIZE stack slot (the argument is
	C overwritten in place) and examined after the loop.
deflit(`FRAME',STACK_SPACE)
	movl	%ebp, SAVE_EBP
	andl	$-2, %ecx		C size low bit masked out
	andl	$1, PARAM_SIZE		C size low bit kept

	movl	%ecx, %edi		C edi = even limb count
	decl	%ecx
	movl	PARAM_DST, %ebp

	C ecx becomes the loop counter: L(top) is terminated by decl/jns, so
	C the unrolled body is executed ecx+1 times.
	C NOTE(review): presumably UNROLL_LOG2 is log2(UNROLL_COUNT) and
	C UNROLL_MASK is UNROLL_COUNT-1; neither is defined in this file.
	shrl	$UNROLL_LOG2, %ecx
	negl	%edi
	movl	%esi, SAVE_ESI

	andl	$UNROLL_MASK, %edi	C edi = limbs to skip in the first pass

	C esi = L(entry) + 9*edi, the address inside the unrolled code at
	C which to start.  The PIC variant obtains the same value through
	C L(pic_calc) without using an absolute address.
ifdef(`PIC',`
	call	L(pic_calc)
L(here):
',`
	leal	L(entry) (%edi,%edi,8), %esi	C 9 bytes per
')
	negl	%edi			C edi = -(limbs skipped)
	shrl	%eax			C carry-in bit -> carry flag, eax = 0

	C Move each pointer back by the number of skipped limbs so that the
	C displacements at the computed entry address reach the first real
	C limbs.  When UNROLL_BYTES is 256 the pointers are also biased by
	C +128, matching the -128 applied to disp0 in the loop body
	C (presumably so that displacements stay within a signed byte).
	leal	ifelse(UNROLL_BYTES,256,128) (%ebx,%edi,4), %ebx
	leal	ifelse(UNROLL_BYTES,256,128) (%edx,%edi,4), %edx
	leal	ifelse(UNROLL_BYTES,256,128) (%ebp,%edi,4), %edi

	jmp	*%esi


C PIC helper: the return address found at (%esp) is L(here), so
C esi = 9*edi + (L(entry) - L(here)) + L(here) = L(entry) + 9*edi.
C It runs before the carry flag is loaded, so the addl instructions here
C do not disturb the carry chain.
ifdef(`PIC',`
L(pic_calc):
	C See mpn/x86/README about old gas bugs
	leal	(%edi,%edi,8), %esi
	addl	$L(entry)-L(here), %esi
	addl	(%esp), %esi
	ret_internal
')


C -----------------------------------------------------------------------------
	ALIGN(32)
L(top):
	C eax	zero
	C ebx	src1
	C ecx	counter
	C edx	src2
	C esi	scratch (was computed jump)
	C edi	dst
	C ebp	scratch

	C edx is advanced here at the top rather than at the bottom with
	C ebx and edi.  The first pass enters at or after L(entry) and so
	C skips this instruction; as a result edx trails ebx/edi by
	C UNROLL_BYTES once the loop has finished.
	leal	UNROLL_BYTES(%edx), %edx

L(entry):
	C Each forloop step emits the code for two limbs (CHUNK_COUNT): load
	C both src1 limbs into esi/ebp, apply M4_inst with the matching src2
	C limbs in ascending order so the carry chains through, and store.
	C NOTE(review): the computed jump above needs every limb to occupy
	C exactly 9 code bytes; presumably Zdisp exists to keep the
	C zero-displacement case the same size as the others -- confirm in
	C the macro definitions before touching these instructions.
deflit(CHUNK_COUNT, 2)
forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
	deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
	deflit(`disp1', eval(disp0 + 4))

Zdisp(	movl,	disp0,(%ebx), %esi)
	movl	disp1(%ebx), %ebp
Zdisp(	M4_inst,disp0,(%edx), %esi)
Zdisp(	movl,	%esi, disp0,(%edi))
	M4_inst	disp1(%edx), %ebp
	movl	%ebp, disp1(%edi)
')

	decl	%ecx			C sets the sign flag, leaves carry alone
	leal	UNROLL_BYTES(%ebx), %ebx
	leal	UNROLL_BYTES(%edi), %edi
	jns	L(top)


	C Unrolled part done.  Fetch the saved low bit of size, and clear
	C eax with movl so the carry flag is still that of the last M4_inst.
	mov	PARAM_SIZE, %esi
	movl	SAVE_EBP, %ebp
	movl	$0, %eax

	decl	%esi			C esi was 0 (even size) -> negative
	js	L(even)

	C Odd size: one more limb.  src2 is addressed with an extra
	C UNROLL_BYTES because edx trails the other two pointers.
	movl	(%ebx), %ecx
	M4_inst	UNROLL_BYTES(%edx), %ecx
	movl	%ecx, (%edi)
L(even):

	movl	SAVE_EDI, %edi
	movl	SAVE_EBX, %ebx
	setc	%al			C return value = final carry/borrow

	movl	SAVE_ESI, %esi
	addl	$STACK_SPACE, %esp

	ret

EPILOGUE()