submul_1.asm revision 1.1.1.1
dnl  Intel Pentium-4 mpn_submul_1 -- Multiply a limb vector with a limb and
dnl  subtract the result from a second limb vector.

dnl  Copyright 2001, 2002 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C P4: 7 cycles/limb, unstable timing, at least on early Pentium4 silicon
C     (stepping 10).


C mp_limb_t mpn_submul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                         mp_limb_t mult);
C mp_limb_t mpn_submul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                          mp_limb_t mult, mp_limb_t carry);
C
C This code is not particularly good at 7 c/l. The dependent chain is only
C 4 c/l and there are only 4 MMX unit instructions, so it's not clear why
C that speed isn't achieved.
C
C The arrangements made here to get a two-instruction dependent chain are
C slightly subtle. In the loop the carry (or borrow rather) is a negative
C so that a paddq can be used to give a low limb ready to store, and a high
C limb ready to become the new carry after a psrlq.
C
C If the carry were a simple two's complement negative then the psrlq shift
C would need to bring in 0 bits or 1 bits according to whether the high was
C zero or non-zero, since a non-zero value would represent a negative
C needing sign extension. That wouldn't be particularly easy to arrange and
C certainly would add an instruction to the dependent chain, so instead an
C offset is applied so that the high limb will be 0xFFFFFFFF+c. With c in
C the range -0xFFFFFFFF to 0, the value 0xFFFFFFFF+c is in the range 0 to
C 0xFFFFFFFF and is therefore always non-negative and can always have 0 bits
C shifted in, which is what psrlq does.
C
C The extra 0xFFFFFFFF must be subtracted before c is used, but that can be
C done off the dependent chain. The total adjustment then is to add
C 0xFFFFFFFF00000000 to offset the new carry, and subtract
C 0x00000000FFFFFFFF to remove the offset from the current carry, for a net
C add of 0xFFFFFFFE00000001. In the code this is applied to the destination
C limb when fetched.
C
C It's also possible to view the 0xFFFFFFFF adjustment as a ones-complement
C negative, which is how it's undone for the return value, but that doesn't
C seem as clear.
C Argument offsets.  defframe is defined outside this file; presumably each
C PARAM_* name expands to an %esp-relative operand at the given byte
C offset, adjusted by FRAME -- confirm against the m4 macro definitions.
defframe(PARAM_CARRY, 20)
defframe(PARAM_MULTIPLIER,16)
defframe(PARAM_SIZE, 12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)

	TEXT
	ALIGN(16)

C mpn_submul_1c -- entry point that takes an initial borrow.
C
C Loads the carry argument into mm1 and jumps into mpn_submul_1 at
C L(start_1c), which skips the pxor that would otherwise clear mm1.
PROLOGUE(mpn_submul_1c)
deflit(`FRAME',0)
	movd	PARAM_CARRY, %mm1	C initial borrow supplied by the caller
	jmp	L(start_1c)
EPILOGUE()

C mpn_submul_1 -- subtract src[i] * multiplier from dst[i], for size limbs.
C
C Returns the borrow out of the last limb processed, in eax.
C
C Writes eax, ecx, edx, mm0, mm1, mm2, mm6, mm7 and the flags; nothing is
C pushed or saved.  emms is executed before returning.
C
C The loop tests its counter at the bottom, so the body runs at least once
C and size = 0 is not handled here -- presumably callers guarantee
C size >= 1; confirm against the mpn interface documentation.
PROLOGUE(mpn_submul_1)
deflit(`FRAME',0)
	pxor	%mm1, %mm1		C initial borrow = 0

	C Common code for both entry points; mm1 holds the initial borrow.
	C Integer argument loads and MMX constant setup are interleaved --
	C presumably for scheduling, so the order is left as it is.
L(start_1c):
	movl	PARAM_SRC, %eax
	pcmpeqd	%mm0, %mm0		C mm0 = all ones

	movd	PARAM_MULTIPLIER, %mm7
	pcmpeqd	%mm6, %mm6		C mm6 = all ones

	movl	PARAM_DST, %edx
	psrlq	$32, %mm0		C 0x00000000FFFFFFFF

	movl	PARAM_SIZE, %ecx
	psllq	$32, %mm6		C 0xFFFFFFFF00000000

	psubq	%mm0, %mm6		C 0xFFFFFFFE00000001

	psubq	%mm1, %mm0		C 0xFFFFFFFF - borrow


	C eax	src, incrementing
	C ebx
	C ecx	loop counter, decrementing
	C edx	dst, incrementing
	C
	C mm0	0xFFFFFFFF - borrow
	C mm6	0xFFFFFFFE00000001
	C mm7	multiplier
	C
	C Each pass computes, in 64 bits,
	C   mm0 = (0xFFFFFFFF - borrow) + dst + 0xFFFFFFFE00000001 - src*mult
	C       = 0xFFFFFFFF00000000 + dst - borrow - src*mult
	C The low half is the result limb; the high half is
	C 0xFFFFFFFF - (new borrow), ready for the next pass after psrlq.

L(loop):
	movd	(%eax), %mm1		C src limb, zero extended to 64 bits
	leal	4(%eax), %eax		C advance src
	movd	(%edx), %mm2		C dst limb, zero extended to 64 bits
	paddq	%mm6, %mm2		C add 0xFFFFFFFE00000001
	pmuludq	%mm7, %mm1		C 64-bit product src * multiplier
	psubq	%mm1, %mm2		C dst + adjustment - prod
	paddq	%mm2, %mm0		C fold in 0xFFFFFFFF - borrow
	subl	$1, %ecx		C sets ZF for the jnz below
	movd	%mm0, (%edx)		C store low half as the result limb
	psrlq	$32, %mm0		C high half becomes 0xFFFFFFFF - new borrow
	leal	4(%edx), %edx		C advance dst
	jnz	L(loop)			C flags from subl; movd/psrlq/leal leave them alone

	movd	%mm0, %eax		C 0xFFFFFFFF - borrow
	notl	%eax			C ones-complement gives the borrow to return
	emms				C clear MMX state before returning
	ret

EPILOGUE()