1dnl Intel Pentium-4 mpn_addlsh1_n -- mpn x+2*y. 2 3dnl Copyright 2001, 2002, 2003, 2004, 2006 Free Software Foundation, Inc. 4dnl 5dnl This file is part of the GNU MP Library. 6dnl 7dnl The GNU MP Library is free software; you can redistribute it and/or 8dnl modify it under the terms of the GNU Lesser General Public License as 9dnl published by the Free Software Foundation; either version 3 of the 10dnl License, or (at your option) any later version. 11dnl 12dnl The GNU MP Library is distributed in the hope that it will be useful, 13dnl but WITHOUT ANY WARRANTY; without even the implied warranty of 14dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15dnl Lesser General Public License for more details. 16dnl 17dnl You should have received a copy of the GNU Lesser General Public License 18dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 19 20include(`../config.m4') 21 22 23C cycles/limb (approx) 24C dst!=src1,2 dst==src1 dst==src2 25C P4 m2: 4.5 ?7.25 ?6.75 26C P4 m3: 5.3 ? ? 27 28C mp_limb_t mpn_addlsh1_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, 29C mp_size_t size); 30C 31C The slightly strange combination of indexing and pointer incrementing 32C that's used seems to work best. Not sure why, but %ecx,4 with src1 and/or 33C src2 is a slowdown. 34C 35C The dependent chain is simply the paddq of x+2*y to the previous carry, 36C then psrlq to get the new carry. That makes 4 c/l the target speed, which 37C is almost achieved for separate src/dst but when src==dst the write 38C combining anomalies slow it down. 
C Parameter slots, listed in argument order (dst, src1, src2, size).
defframe(PARAM_DST,  4)
defframe(PARAM_SRC1, 8)
defframe(PARAM_SRC2, 12)
defframe(PARAM_SIZE, 16)

dnl  The src1 parameter slot doubles as the save area for ebx, so src1 must
dnl  be loaded into a register before ebx is stored there.
define(SAVE_EBX,`PARAM_SRC1')

	TEXT
	ALIGN(8)

PROLOGUE(mpn_addlsh1_n)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx
	movl	PARAM_DST, %edx

	movl	PARAM_SRC1, %eax	C read src1 before its slot is reused
	movl	%ebx, SAVE_EBX		C ebx is restored before returning

	movl	PARAM_SRC2, %ebx

	leal	(%edx,%ecx,4), %edx	C dst + 4*size, one past the last limb
	negl	%ecx			C -size, stepped up towards zero

	pxor	%mm0, %mm0		C no carry into the first limb

	C Register roles inside the loop:
	C
	C eax	src1, current limb, advanced by 4 bytes each pass
	C ebx	src2, current limb, advanced by 4 bytes each pass
	C ecx	negative count of limbs still to do
	C edx	dst end, so (%edx,%ecx,4) addresses the current dst limb
	C mm0	64-bit sum from the previous limb, carry in its high half
	C
	C The instruction order below is kept as it was, since the scheduling
	C is described in the file header as empirically the fastest.

L(loop):
	movd	(%eax), %mm1		C x = src1 limb, zero extended
	movd	(%ebx), %mm2		C y = src2 limb, zero extended
	psrlq	$32, %mm0		C keep only the carry of the previous sum
	leal	4(%eax), %eax		C step src1
	leal	4(%ebx), %ebx		C step src2

	paddq	%mm2, %mm1		C x + y
	paddq	%mm2, %mm1		C x + 2*y

	paddq	%mm1, %mm0		C carry + x + 2*y

	movd	%mm0, (%edx,%ecx,4)	C store the low 32 bits as the dst limb
	addl	$1, %ecx
	jnz	L(loop)			C until the count reaches zero


	movl	SAVE_EBX, %ebx		C give the caller back its ebx
	psrlq	$32, %mm0		C carry out of the most significant limb
	movd	%mm0, %eax		C return value
	emms				C leave the MMX state clean
	ret

EPILOGUE()