1dnl Intel Pentium mpn_com -- mpn ones complement. 2 3dnl Copyright 1996, 2001, 2002, 2006 Free Software Foundation, Inc. 4 5dnl This file is part of the GNU MP Library. 6dnl 7dnl The GNU MP Library is free software; you can redistribute it and/or modify 8dnl it under the terms of either: 9dnl 10dnl * the GNU Lesser General Public License as published by the Free 11dnl Software Foundation; either version 3 of the License, or (at your 12dnl option) any later version. 13dnl 14dnl or 15dnl 16dnl * the GNU General Public License as published by the Free Software 17dnl Foundation; either version 2 of the License, or (at your option) any 18dnl later version. 19dnl 20dnl or both in parallel, as here. 21dnl 22dnl The GNU MP Library is distributed in the hope that it will be useful, but 23dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25dnl for more details. 26dnl 27dnl You should have received copies of the GNU General Public License and the 28dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29dnl see https://www.gnu.org/licenses/. 30 31include(`../config.m4') 32 33 34C P5: 1.75 cycles/limb 35 36 37NAILS_SUPPORT(0-31) 38 39 40C void mpn_com (mp_ptr dst, mp_srcptr src, mp_size_t size); 41C 42C This code is similar to mpn_copyi, basically there's just some "xorl 43C $GMP_NUMB_MASK"s inserted. 44C 45C Alternatives: 46C 47C On P55 some MMX code could be 1.25 c/l (8 limb unrolled) if src and dst 48C are the same alignment mod 8, but it doesn't seem worth the trouble for 49C just that case (there'd need to be some plain integer available too for 50C the unaligned case). 
C void mpn_com (mp_ptr dst, mp_srcptr src, mp_size_t size);
C
C Write to dst the complement of each of the size limbs read from src,
C the complement being an xor with GMP_NUMB_MASK.
C
C Arguments are taken from the stack at the defframe offsets below: dst
C at 4, src at 8, size at 12.  %esi and %edi are pushed on entry and
C popped again before either ret; %eax, %ecx and %edx are overwritten.
C
C Shape of the code: a loop doing 8 limbs per pass, followed by a tail
C that does whatever is left (0 to 7 limbs) as optional groups of 4, 2
C and 1.  Throughout, %ecx holds a biased count so that a sign test is
C enough to decide whether the next group is needed.
C
C NOTE(review): the instruction order and the nops look like deliberate
C Pentium scheduling -- confirm with timings before reordering anything.

defframe(PARAM_SIZE,12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)

	TEXT
	ALIGN(8)
PROLOGUE(mpn_com)
deflit(`FRAME',0)

	movl	PARAM_SRC, %eax
	movl	PARAM_SIZE, %ecx

	pushl	%esi	FRAME_pushl()
	pushl	%edi	FRAME_pushl()

	leal	(%eax,%ecx,4), %eax	C eax = &src[size]
	xorl	$-1, %ecx		C ecx = -size-1

	movl	PARAM_DST, %edx
	addl	$8, %ecx		C ecx = 7-size

	jns	L(tail)			C size <= 7, no full group of 8

	movl	(%edx), %esi		C touch first dst cache line, value unused
	nop

L(loop8):
	C eax	&src[size], not changed in the loop
	C ecx	7 minus the number of limbs still to do, negative here
	C edx	dst, stepped by 32 bytes per pass
	C esi	scratch
	C edi	scratch
	C ebx, ebp are not used

	movl	28(%edx), %esi		C dst prefetch, value overwritten below
	addl	$32, %edx

	C limbs 0 and 1 of this group
	movl	-28(%eax,%ecx,4), %esi
	movl	-24(%eax,%ecx,4), %edi
	xorl	$GMP_NUMB_MASK, %esi
	xorl	$GMP_NUMB_MASK, %edi
	movl	%esi, -32(%edx)
	movl	%edi, -28(%edx)

	C limbs 2 and 3
	movl	-20(%eax,%ecx,4), %esi
	movl	-16(%eax,%ecx,4), %edi
	xorl	$GMP_NUMB_MASK, %esi
	xorl	$GMP_NUMB_MASK, %edi
	movl	%esi, -24(%edx)
	movl	%edi, -20(%edx)

	C limbs 4 and 5
	movl	-12(%eax,%ecx,4), %esi
	movl	-8(%eax,%ecx,4), %edi
	xorl	$GMP_NUMB_MASK, %esi
	xorl	$GMP_NUMB_MASK, %edi
	movl	%esi, -16(%edx)
	movl	%edi, -12(%edx)

	C limbs 6 and 7
	movl	-4(%eax,%ecx,4), %esi
	movl	(%eax,%ecx,4), %edi
	xorl	$GMP_NUMB_MASK, %esi
	xorl	$GMP_NUMB_MASK, %edi
	movl	%esi, -8(%edx)
	movl	%edi, -4(%edx)

	addl	$8, %ecx
	js	L(loop8)		C negative means 8 or more limbs still to do


L(tail):
	C eax	&src[size]
	C ecx	0 to 7, meaning 7 down to 0 limbs still to do
	C edx	dst, address of the next store

	subl	$4, %ecx		C negative if 4 or more limbs are left
	nop

	jns	L(skip4)

	C group of 4 limbs
	movl	-12(%eax,%ecx,4), %esi
	movl	-8(%eax,%ecx,4), %edi
	xorl	$GMP_NUMB_MASK, %esi
	xorl	$GMP_NUMB_MASK, %edi
	movl	%esi, (%edx)
	movl	%edi, 4(%edx)

	movl	-4(%eax,%ecx,4), %esi
	movl	(%eax,%ecx,4), %edi
	xorl	$GMP_NUMB_MASK, %esi
	xorl	$GMP_NUMB_MASK, %edi
	movl	%esi, 8(%edx)
	movl	%edi, 12(%edx)

	addl	$16, %edx
	addl	$4, %ecx		C back to 0 to 3, now 3 minus limbs left
L(skip4):
	C ecx	0 to 3, meaning 3 down to 0 limbs still to do

	subl	$2, %ecx		C negative if 2 or 3 limbs are left
	nop

	jns	L(skip2)

	C group of 2 limbs
	movl	-4(%eax,%ecx,4), %esi
	movl	(%eax,%ecx,4), %edi
	xorl	$GMP_NUMB_MASK, %esi
	xorl	$GMP_NUMB_MASK, %edi
	movl	%esi, (%edx)
	movl	%edi, 4(%edx)

	addl	$8, %edx
	addl	$2, %ecx		C ecx 0 or 1, zero flag set if one limb is left
L(skip2):
	C The flags here come from the subl $2 or the addl $2 above, and
	C ecx is zero exactly when a single limb is still to do.  The popl
	C does not alter the flags, so the jnz still tests that result.

	popl	%edi
	jnz	L(fin)

	movl	-4(%eax), %ecx		C final limb, src[size-1]

	xorl	$GMP_NUMB_MASK, %ecx
	popl	%esi

	movl	%ecx, (%edx)
	ret

L(fin):
	popl	%esi
	ret

EPILOGUE()