dnl  com.asm revision 1.1.1.1
dnl  AMD Athlon mpn_com -- mpn bitwise one's complement.

dnl  Copyright 2002 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C K7: 1.0 cycles/limb


C void mpn_com (mp_ptr dst, mp_srcptr src, mp_size_t size);
C
C The loop form below is necessary for the claimed speed.  It needs to be
C aligned to a 16 byte boundary and only 16 bytes long.  Maybe that's so it
C fits in a BTB entry.  The adjustments to %eax and %edx avoid offsets on
C the movq's and achieve the necessary size.
C
C If both src and dst are 4mod8, the loop runs at 1.5 c/l.  So long as one
C of the two is 0mod8, it runs at 1.0 c/l.  On that basis dst is checked
C (offset by the size, as per the loop addressing) and one high limb
C processed separately to get alignment.
C
C The padding for the nails case is unattractive, but shouldn't cost any
C cycles.  Explicit .byte's guarantee the desired instructions, at a point
C where we're probably stalled waiting for loads anyway.
C
C Enhancements:
C
C The combination load/pxor/store might be able to be unrolled to approach
C 0.5 c/l if desired.
C Named frame slots for the three arguments (size, src, dst).
defframe(PARAM_SIZE,12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)

	TEXT
	ALIGN(16)

C mpn_com: each src limb is XORed with the mask held in %mm7 and the
C result is stored to the same limb position in dst.  Limbs are handled
C from the high end downwards: an optional single high limb, then pairs
C in the L(top) loop, then an optional single low limb at L(one).
PROLOGUE(mpn_com)
deflit(`FRAME',0)

	movl	PARAM_DST, %edx
	movl	PARAM_SIZE, %ecx
	pcmpeqd	%mm7, %mm7		C mm7 = all ones

	C Test bit 2 of dst+4*size.  ZF is set when that address is 0mod8.
	C Nothing between the andl and the jz below writes EFLAGS, which
	C is why %eax can be reloaded with src in the meantime.
	leal	(%edx,%ecx,4), %eax
	andl	$4, %eax
ifelse(GMP_NAIL_BITS,0,,
`	psrld	$GMP_NAIL_BITS, %mm7')		C GMP_NUMB_MASK

	movl	PARAM_SRC, %eax
	movd	-4(%eax,%ecx,4), %mm0		C src high limb

ifelse(GMP_NAIL_BITS,0,,
`	C padding for alignment below
	.byte	0x8d, 0xb6, 0x00, 0x00, 0x00, 0x00	C lea 0(%esi),%esi
	.byte	0x8d, 0xbf, 0x00, 0x00, 0x00, 0x00	C lea 0(%edi),%edi
')

	jz	L(aligned)		C flags still from the andl above

	C dst+4*size is 4mod8: handle the high limb alone so the remaining
	C qword stores in the loop are 0mod8.
	pxor	%mm7, %mm0
	movd	%mm0, -4(%edx,%ecx,4)		C dst high limb
	decl	%ecx
	jz	L(done)			C that was the only limb
L(aligned):

	C Bias both pointers by one limb and the counter by minus one, so
	C the loop can address (%eax,%ecx,4) and (%edx,%ecx,4) with no
	C displacement.
	addl	$4, %eax
	addl	$4, %edx
	decl	%ecx
	jz	L(one)			C exactly one limb remains

	C offset 0x30 for no nails, or 0x40 for nails
	ALIGN(16)
L(top):
	C eax	src
	C ebx
	C ecx	counter
	C edx	dst

	C Two limbs per pass.  The MMX instructions leave EFLAGS alone, so
	C both branches below act on the result of the subl.
	subl	$2, %ecx
	movq	(%eax,%ecx,4), %mm0
	pxor	%mm7, %mm0
	movq	%mm0, (%edx,%ecx,4)
	jg	L(top)

	C Counter is now -1 if the loop consumed everything, or 0 if one
	C low limb is left over.
	jnz	L(done)		C if size even

L(one):
	movd	-4(%eax), %mm0		C src low limb
	pxor	%mm7, %mm0
	movd	%mm0, -4(%edx)		C dst low limb

L(done):
	emms

	ret

EPILOGUE()