1dnl Intel Pentium-4 mpn_sub_n -- mpn subtraction. 2 3dnl Copyright 2001, 2002 Free Software Foundation, Inc. 4dnl 5dnl This file is part of the GNU MP Library. 6dnl 7dnl The GNU MP Library is free software; you can redistribute it and/or 8dnl modify it under the terms of the GNU Lesser General Public License as 9dnl published by the Free Software Foundation; either version 3 of the 10dnl License, or (at your option) any later version. 11dnl 12dnl The GNU MP Library is distributed in the hope that it will be useful, 13dnl but WITHOUT ANY WARRANTY; without even the implied warranty of 14dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15dnl Lesser General Public License for more details. 16dnl 17dnl You should have received a copy of the GNU Lesser General Public License 18dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 19 20include(`../config.m4') 21 22 23C P4 Willamette, Northwood: 4.0 cycles/limb if dst!=src1 and dst!=src2 24C 6.0 cycles/limb if dst==src1 or dst==src2 25C P4 Prescott: >= 5 cycles/limb 26 27 28C mp_limb_t mpn_sub_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, 29C mp_size_t size); 30C mp_limb_t mpn_sub_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, 31C mp_size_t size, mp_limb_t carry); 32C 33C The main loop code is 2x unrolled so that the carry bit can alternate 34C between mm0 and mm1. 
dnl  Stack argument slots, listed by ascending offset.
defframe(PARAM_DST,   4)
defframe(PARAM_SRC1,  8)
defframe(PARAM_SRC2, 12)
defframe(PARAM_SIZE, 16)
defframe(PARAM_CARRY,20)

dnl  The src1 argument slot doubles as the save area for ebx, so src1 has
dnl  to be read into a register before ebx is stored there.
define(SAVE_EBX,`PARAM_SRC1')

	TEXT
	ALIGN(8)

C mpn_sub_nc entry: take the initial borrow from the carry argument, then
C continue in the code shared with mpn_sub_n.

PROLOGUE(mpn_sub_nc)
deflit(`FRAME',0)

	movd	PARAM_CARRY, %mm0	C initial borrow
	jmp	L(entry)

EPILOGUE()

C mpn_sub_n entry: dst = src1 - src2 over size limbs, with the final borrow
C returned in eax.  The loop body below runs before the counter is first
C tested, so there is no handling of size == 0 here; callers are presumably
C required to pass size >= 1.

	ALIGN(8)
PROLOGUE(mpn_sub_n)
deflit(`FRAME',0)

	pxor	%mm0, %mm0		C no initial borrow
L(entry):
	movl	PARAM_SIZE, %ecx
	movl	PARAM_DST, %edx
	movl	PARAM_SRC1, %eax	C fetch src1 before its slot is reused
	movl	%ebx, SAVE_EBX
	movl	PARAM_SRC2, %ebx

	C Point each register just past its last limb and index with a
	C negative counter that climbs to zero.
	leal	(%edx,%ecx,4), %edx	C dst end
	leal	(%eax,%ecx,4), %eax	C src1 end
	leal	(%ebx,%ecx,4), %ebx	C src2 end
	negl	%ecx			C -size

L(pair):
	C eax	src1 end
	C ebx	src2 end
	C ecx	negative limb index, counting up towards zero
	C edx	dst end
	C mm0	incoming borrow bit

	C First limb of the pair: borrow in from mm0, borrow out in mm1.
	movd	(%eax,%ecx,4), %mm1	C src1 limb, zero extended to 64 bits
	movd	(%ebx,%ecx,4), %mm2	C src2 limb, zero extended to 64 bits
	psubq	%mm2, %mm1		C 64-bit src1 - src2

	psubq	%mm0, %mm1		C minus the incoming borrow
	movd	%mm1, (%edx,%ecx,4)	C low 32 bits are the result limb

	psrlq	$63, %mm1		C top bit is the outgoing borrow

	addl	$1, %ecx
	jz	L(odd_exit)		C finished on the first limb of a pair

	C Second limb of the pair: borrow in from mm1, borrow out in mm0.
	movd	(%eax,%ecx,4), %mm0
	movd	(%ebx,%ecx,4), %mm2
	psubq	%mm2, %mm0

	psubq	%mm1, %mm0
	movd	%mm0, (%edx,%ecx,4)

	psrlq	$63, %mm0

	addl	$1, %ecx
	jnz	L(pair)


	C Fell out after the second limb of a pair: the borrow is in mm0.
	movl	SAVE_EBX, %ebx
	movd	%mm0, %eax		C return value
	emms				C leave MMX state before returning
	ret

L(odd_exit):
	C Left after the first limb of a pair: the borrow is in mm1.
	movl	SAVE_EBX, %ebx
	movd	%mm1, %eax		C return value
	emms				C leave MMX state before returning
	ret

EPILOGUE()