dnl  Intel Atom mpn_addlshC_n/mpn_sublshC_n -- rp[] = up[] +- (vp[] << C)

dnl  Contributed to the GNU project by Marco Bodrato.

dnl  Copyright 2011 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
include(`../config.m4')

C Two-operand entry points, used when dst is also the first source operand:
C mp_limb_t mpn_addlshC_n_ip1 (mp_ptr dst, mp_srcptr src, mp_size_t size);
C mp_limb_t mpn_addlshC_nc_ip1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                               mp_limb_t carry);
C mp_limb_t mpn_sublshC_n_ip1 (mp_ptr dst, mp_srcptr src, mp_size_t size);
C mp_limb_t mpn_sublshC_nc_ip1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                               mp_signed_limb_t borrow);

C Frame offsets of the two-operand argument list
C (dst, src, size, carry-or-borrow).
defframe(PARAM_CORB,	16)
defframe(PARAM_SIZE,	12)
defframe(PARAM_SRC,	 8)
defframe(PARAM_DST,	 4)

C Three-operand entry points:
C mp_limb_t mpn_addlshC_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
C                          mp_size_t size);
C mp_limb_t mpn_addlshC_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
C                           mp_size_t size, mp_limb_t carry);
C mp_limb_t mpn_sublshC_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
C                          mp_size_t size);
C mp_limb_t mpn_sublshC_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
C                           mp_size_t size, mp_limb_t borrow);

C if src1 == dst, _ip1 is used

C                               cycles/limb
C                           dst!=src1,src2  dst==src1
C P5
C P6 model 0-8,10-12
C P6 model 9  (Banias)
C P6 model 13 (Dothan)
C P4 model 0  (Willamette)
C P4 model 1  (?)
C P4 model 2  (Northwood)
C P4 model 3  (Prescott)
C P4 model 4  (Nocona)
C Intel Atom                      7             6
C AMD K6
C AMD K7
C AMD K8
C AMD K10

C Frame offsets that differ for the three-operand argument list
C (dst, src1, src2, size, carry-or-borrow).
defframe(GPARAM_CORB,	20)
defframe(GPARAM_SIZE,	16)
defframe(GPARAM_SRC2,	12)

dnl  re-use parameter space
dnl  Registers that are restored before ret are spilled into argument slots,
dnl  so every argument has to be loaded before the SAVE_ store that reuses
dnl  its slot.  Note that SAVE_EBP shares offset 12 with GPARAM_SRC2.
define(SAVE_EBP,`PARAM_SIZE')
define(SAVE_EBX,`PARAM_SRC')
define(SAVE_UP,`PARAM_DST')

C M = 1 << LSH, used below as the lea scale factor.  LSH, RSH, M4_inst,
C M4_opp and the four M4_*function* names are not defined in this file.
define(M, eval(m4_lshift(1,LSH)))
define(`rp',  `%edi')
define(`up',  `%esi')

ASM_START()
	TEXT
	ALIGN(8)

C ------------------------------------------------------------------------
C M4_ip_function_c: two-operand entry with a carry-or-borrow argument.
C
C Splits PARAM_CORB into
C   %edx = bit LSH of the argument (0 or 1)
C   %ecx = the argument after M4_opp has been applied with that bit
C and continues in the common code at L(start_nc).
C
C NOTE(review): %edx is 0 or 1 here, whereas the loops keep the saved carry
C as 0 or -1 (sbb %edx,%edx) and restore it with add %edx,%edx, which only
C sets CF for -1.  The final shr %edx does set CF for 1.  TODO confirm that
C an argument with bit LSH set is handled as intended when size > 1.
C ------------------------------------------------------------------------
PROLOGUE(M4_ip_function_c)
deflit(`FRAME',0)
	movl	PARAM_CORB, %ecx
	movl	%ecx, %edx
	shr	$LSH, %edx
	andl	$1, %edx		C edx = bit LSH of the argument
	M4_opp	%edx, %ecx
	jmp	L(start_nc)
EPILOGUE()

C ------------------------------------------------------------------------
C M4_ip_function: two-operand entry without carry-or-borrow argument.
C
C Applies M4_inst to each dst limb in memory, with the matching src limb
C shifted left by LSH bits as the other operand.  The bits shifted out of
C one src limb are merged into the next one.
C
C Register use in the loop:
C   rp (%edi)   dst pointer
C   up (%esi)   src pointer
C   %ebx        iteration count, (size+1)/2; two limbs per iteration
C   %eax, %ecx  alternately a src limb and its top bits (after shr $RSH)
C   %ebp        shifted limb assembled by lea
C   %edx        carry flag parked between iterations (sbb / add pair)
C
C Returns in %eax the last src limb shifted right by RSH, plus the final
C carry flag.  %ebx, %ebp, %esi and %edi are restored before returning.
C ------------------------------------------------------------------------
PROLOGUE(M4_ip_function)
deflit(`FRAME',0)

	xor	%ecx, %ecx		C no incoming low bits
	xor	%edx, %edx		C no incoming carry
L(start_nc):
	push	rp			FRAME_pushl()
	mov	PARAM_DST, rp
	mov	up, SAVE_UP		C overwrites the dst slot, read above
	mov	PARAM_SRC, up
	mov	%ebx, SAVE_EBX		C overwrites the src slot, read above
	mov	PARAM_SIZE, %ebx	C size
L(inplace):
C Also entered from M4_function when src1 == dst, with up = src2 and
C %ebx = size already loaded.
	incl	%ebx			C size + 1
	shr	%ebx			C (size+1)\2
	mov	%ebp, SAVE_EBP		C mov leaves CF from the shr intact
	jnc	L(entry)		C size odd

	add	%edx, %edx		C size even
	mov	%ecx, %ebp		C ebp = incoming low bits
	mov	(up), %ecx		C first src limb
	lea	-4(rp), rp		C bias so that 4(rp) is the first dst limb
	lea	(%ebp,%ecx,M), %eax	C eax = limb * M + incoming low bits
	lea	4(up), up
	jmp	L(enteven)

	ALIGN(16)
L(oop):
	lea	(%ecx,%eax,M), %ebp	C ebp = limb * M + top bits of previous limb
	shr	$RSH, %eax		C keep the top bits for the next limb
	mov	4(up), %ecx
	add	%edx, %edx		C bring the parked carry back into CF
	lea	8(up), up
	M4_inst	%ebp, (rp)
	lea	(%eax,%ecx,M), %eax	C lea and mov do not disturb CF

L(enteven):
	M4_inst	%eax, 4(rp)
	lea	8(rp), rp

	sbb	%edx, %edx		C park CF as 0 or -1
	shr	$RSH, %ecx

L(entry):
	mov	(up), %eax
	decl	%ebx
	jnz	L(oop)

C Last limb.
	lea	(%ecx,%eax,M), %ebp
	shr	$RSH, %eax		C eax = bits shifted out of the last limb
	shr	%edx			C bit 0 of the parked carry goes to CF
	M4_inst	%ebp, (rp)
	mov	SAVE_UP, up
	adc	$0, %eax		C return value = shifted-out bits + CF
	mov	SAVE_EBP, %ebp
	mov	SAVE_EBX, %ebx
	pop	rp			FRAME_popl()
	ret
EPILOGUE()

C ------------------------------------------------------------------------
C M4_function_c: three-operand entry with a carry-or-borrow argument.
C
C Same split of the argument as in M4_ip_function_c, read from GPARAM_CORB,
C then continues at L(generic_nc).  The NOTE(review) on the 0/1 value of
C %edx given for M4_ip_function_c applies to this entry as well.
C ------------------------------------------------------------------------
PROLOGUE(M4_function_c)
deflit(`FRAME',0)
	movl	GPARAM_CORB, %ecx
	movl	%ecx, %edx
	shr	$LSH, %edx
	andl	$1, %edx		C edx = bit LSH of the argument
	M4_opp	%edx, %ecx
	jmp	L(generic_nc)
EPILOGUE()

C ------------------------------------------------------------------------
C M4_function: three-operand entry without carry-or-borrow argument.
C
C When src1 == dst the work is handed to L(inplace) above.  Otherwise each
C src1 limb is loaded, combined by M4_inst with the matching src2 limb
C shifted left by LSH bits, and stored to dst.
C
C Register use in the general loop:
C   rp (%edi)    dst pointer
C   %ebx         src1 pointer
C   up (%esi)    src2 pointer (the operand that is shifted)
C   GPARAM_SIZE  iteration count, (size+1)/2, kept in its stack slot
C   %eax, %ecx   alternately a src2 limb and its top bits (after shr $RSH)
C   %ebp         shifted limb assembled by lea
C   %edx         src1 limb / result, and the parked carry between iterations
C
C Returns in %eax the last src2 limb shifted right by RSH, plus the final
C carry flag.  %ebx, %ebp, %esi and %edi are restored before returning.
C ------------------------------------------------------------------------
PROLOGUE(M4_function)
deflit(`FRAME',0)

	xor	%ecx, %ecx		C no incoming low bits
	xor	%edx, %edx		C no incoming carry
L(generic_nc):
	push	rp			FRAME_pushl()
	mov	PARAM_DST, rp
	mov	up, SAVE_UP		C overwrites the dst slot, read above
	mov	PARAM_SRC, up		C src1
	cmp	rp, up
	mov	%ebx, SAVE_EBX		C mov keeps the flags from the cmp
	jne	L(general)
	mov	GPARAM_SIZE, %ebx	C size
	mov	GPARAM_SRC2, up
	jmp	L(inplace)		C src1 == dst, use the in-place loop

L(general):
	mov	GPARAM_SIZE, %eax	C size
C NOTE(review): %ebx was already stored to SAVE_EBX before the jne and has
C not changed since, so this second store looks redundant.  It is kept to
C leave the instruction stream unchanged.
	mov	%ebx, SAVE_EBX
	incl	%eax			C size + 1
	mov	up, %ebx		C vp
	mov	GPARAM_SRC2, up		C up
	shr	%eax			C (size+1)\2
	mov	%ebp, SAVE_EBP		C reuses the src2 slot, read just above
	mov	%eax, GPARAM_SIZE	C loop counter lives in memory
	jnc	L(entry2)		C size odd

	add	%edx, %edx		C size even
	mov	%ecx, %ebp		C ebp = incoming low bits
	mov	(up), %ecx		C first src2 limb
	lea	-4(rp), rp		C bias so that 4(rp) is the first dst limb
	lea	-4(%ebx), %ebx		C same bias for the src1 pointer
	lea	(%ebp,%ecx,M), %eax	C eax = limb * M + incoming low bits
	lea	4(up), up
	jmp	L(enteven2)

	ALIGN(16)
L(oop2):
	lea	(%ecx,%eax,M), %ebp	C ebp = limb * M + top bits of previous limb
	shr	$RSH, %eax		C keep the top bits for the next limb
	mov	4(up), %ecx
	add	%edx, %edx		C bring the parked carry back into CF
	lea	8(up), up
	mov	(%ebx), %edx		C src1 limb; edx is free once CF is restored
	M4_inst	%ebp, %edx
	lea	(%eax,%ecx,M), %eax
	mov	%edx, (rp)
L(enteven2):
	mov	4(%ebx), %edx
	lea	8(%ebx), %ebx
	M4_inst	%eax, %edx
	mov	%edx, 4(rp)
	sbb	%edx, %edx		C park CF as 0 or -1
	shr	$RSH, %ecx
	lea	8(rp), rp
L(entry2):
	mov	(up), %eax
	decl	GPARAM_SIZE
	jnz	L(oop2)

C Last limb.
	lea	(%ecx,%eax,M), %ebp
	shr	$RSH, %eax		C eax = bits shifted out of the last limb
	shr	%edx			C bit 0 of the parked carry goes to CF
	mov	(%ebx), %edx
	M4_inst	%ebp, %edx
	mov	%edx, (rp)
	mov	SAVE_UP, up
	adc	$0, %eax		C return value = shifted-out bits + CF
	mov	SAVE_EBP, %ebp
	mov	SAVE_EBX, %ebx
	pop	rp			FRAME_popl()
	ret
EPILOGUE()

ASM_END()