mod_34lsub1.asm revision 1.1.1.1
dnl Intel Pentium 4 mpn_mod_34lsub1 -- remainder modulo 2^24-1.

dnl Copyright 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
dnl
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or
dnl modify it under the terms of the GNU Lesser General Public License as
dnl published by the Free Software Foundation; either version 3 of the
dnl License, or (at your option) any later version.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful,
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
dnl Lesser General Public License for more details.
dnl
dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C Pentium4: 1.0 cycles/limb


C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
C
C Enhancements:
C
C There might be a couple of cycles to save by using plain integer code for
C more small sizes. 2 limbs measures about 20 cycles, but 3 limbs jumps to
C about 46 (inclusive of some function call overheads).

C mpn_mod_34lsub1 -- reduce a limb vector modulo 2^24-1.
C
C Inputs:  PARAM_SRC  pointer to 32-bit limbs, least significant first
C          PARAM_SIZE number of limbs.  src[0] is loaded unconditionally,
C                     so size is presumably required to be at least 1.
C Output:  eax = a value congruent to the operand modulo 2^24-1.  It is
C          not fully reduced: the one-limb path returns src[0] unchanged.
C Clobbers: ecx, edx, flags, and on the three-or-more path mm0-mm7.
C
C Method: 2^24 == 1 mod 2^24-1, so limb i, of weight 2^(32*i), has
C residue weight 2^0, 2^8 or 2^16 according to i mod 3.  Limbs are summed
C in three groups by i mod 3, then the groups are combined and folded.

defframe(PARAM_SIZE, 8)
defframe(PARAM_SRC,  4)

dnl re-use parameter space
dnl (SAVE_EBX and SAVE_ESI are not referenced by the code below)
define(SAVE_EBX, `PARAM_SRC')
define(SAVE_ESI, `PARAM_SIZE')

	TEXT
	ALIGN(16)
PROLOGUE(mpn_mod_34lsub1)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx
	movl	PARAM_SRC, %edx
	movl	(%edx), %eax		C eax = src[0], the result if size is 1

	subl	$2, %ecx		C ecx = size-2, flags select the path
	ja	L(three_or_more)	C size > 2 unsigned
	jne	L(one)			C size != 2, return src[0] as it stands

	C Exactly two limbs, plain integer code.
	C src[0] = low24 + high8*2^24, congruent to low24 + high8.

	movl	4(%edx), %edx		C edx = src[1]
	movl	%eax, %ecx
	shrl	$24, %eax		C src[0] bits 24..31

	andl	$0x00FFFFFF, %ecx	C src[0] bits 0..23
	addl	%ecx, %eax

	movl	%edx, %ecx
	shll	$8, %edx

	shrl	$16, %ecx		C src[1] bits 16..31, weight 2^48 == 1
	addl	%ecx, %eax

	andl	$0x00FFFF00, %edx	C src[1] bits 0..15, weight 2^32 == 2^8
	addl	%edx, %eax

L(one):
	ret


L(three_or_more):
	pxor	%mm0, %mm0		C clear the three accumulators
	pxor	%mm1, %mm1
	pxor	%mm2, %mm2

	pcmpeqd	%mm7, %mm7		C all ones
	psrlq	$32, %mm7	C 0x00000000FFFFFFFF, low 32 bits

	pcmpeqd	%mm6, %mm6		C all ones
	psrlq	$40, %mm6	C 0x0000000000FFFFFF, low 24 bits

L(top):
	C eax
	C ebx
	C ecx	counter, size-2 to 0, -1 or -2
	C edx	src, incrementing
	C
	C mm0	sum 0mod3
	C mm1	sum 1mod3
	C mm2	sum 2mod3
	C mm3	scratch, the limb just loaded
	C mm4
	C mm5
	C mm6	0x0000000000FFFFFF
	C mm7	0x00000000FFFFFFFF

	movd	(%edx), %mm3		C movd zero extends the limb to 64 bits
	paddq	%mm3, %mm0

	movd	4(%edx), %mm3
	paddq	%mm3, %mm1

	movd	8(%edx), %mm3
	paddq	%mm3, %mm2

	addl	$12, %edx		C three limbs consumed
	subl	$3, %ecx
	ja	L(top)			C loop while no borrow and not zero


	C ecx is -2, -1 or 0 representing 0, 1 or 2 more limbs, respectively

	addl	$1, %ecx		C sets SF and ZF for both branches below
	js	L(combine)		C 0 more

	movd	(%edx), %mm3		C MMX instructions leave the flags alone
	paddq	%mm3, %mm0

	jz	L(combine)		C 1 more

	movd	4(%edx), %mm3
	paddq	%mm3, %mm1

L(combine):
	C Split each 64-bit sum at 32 bits.  A high half carries an extra
	C factor 2^32 == 2^8, so it moves to the next group up, and the
	C high half of the 2mod3 sum wraps round to the 0mod3 group.

	movq	%mm7, %mm3		C low halves
	pand	%mm0, %mm3

	movq	%mm7, %mm4
	pand	%mm1, %mm4

	movq	%mm7, %mm5
	pand	%mm2, %mm5

	psrlq	$32, %mm0		C high halves
	psrlq	$32, %mm1
	psrlq	$32, %mm2

	paddq	%mm0, %mm4		C fold high halves to give 33 bits each
	paddq	%mm1, %mm5
	paddq	%mm2, %mm3

	psllq	$8, %mm4		C combine at respective offsets
	psllq	$16, %mm5
	paddq	%mm4, %mm3
	paddq	%mm5, %mm3		C 0x000cxxxxxxxxxxxx, 50 bits

	pand	%mm3, %mm6		C fold at 24 bits
	psrlq	$24, %mm3

	paddq	%mm6, %mm3
	movd	%mm3, %eax		C result is the low dword

	ASSERT(z,	C nothing left in high dword
	`psrlq	$32, %mm3
	movd	%mm3, %ecx
	orl	%ecx, %ecx')

	emms				C leave MMX state before returning
	ret

EPILOGUE()