dnl  AMD K6 mpn_mod_34lsub1 -- mpn remainder modulo 2**24-1.

dnl  Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C K6: 2.66 cycles/limb


C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
C
C An attempt was made to use a loop like
C
C L(top):
C	adcl	(%edx), %eax
C	adcl	4(%edx), %ebx
C	adcl	8(%edx), %esi
C	leal	12(%edx), %edx
C	loop	L(top)
C
C with %ecx starting from floor(size/3), but it still measured 2.66 c/l.
C The form used instead can save about 6 cycles by not dividing by 3.
C
C In the code used, putting the "leal"s at the top of the loop is necessary
C for the claimed speed, anywhere else costs an extra cycle per loop.
C Perhaps a tight loop like this needs short decode instructions at the
C branch target, which would explain the leal/loop form above taking 8
C cycles instead of 7 too.
defframe(PARAM_SIZE, 8)
defframe(PARAM_SRC, 4)

dnl  re-use parameter space
define(SAVE_EBX, `PARAM_SIZE')
define(SAVE_ESI, `PARAM_SRC')

C mpn_mod_34lsub1 -- fold the limbs at src into one 32-bit value.
C
C Inputs, as read by the code below:
C   PARAM_SRC	pointer to 32-bit limbs, loaded into edx
C   PARAM_SIZE	number of limbs, loaded into eax
C Output:
C   eax		a value congruent to the operand modulo 2^24-1.  It is a
C		plain sum of pieces and is not reduced below 2^24-1; for a
C		single limb it is simply src[0].
C
C Method: 2^24 == 1 mod 2^24-1, hence 2^32 == 2^8, 2^64 == 2^16 and
C 2^96 == 1.  Limbs are therefore summed into three accumulators by index
C mod 3, and each accumulator is then split and shifted by its weight.
C
C ebx and esi are saved in the parameter slots once the parameters are in
C registers, edi is saved with pushl, and all three are restored before the
C final ret.
C
C NOTE(review): the frame macros and the PROLOGUE/EPILOGUE wrappers are not
C defined in this file; presumably they keep the PARAM_ offsets valid across
C the pushl -- confirm in the m4 headers.
C NOTE(review): size is presumably at least 1.  With size 0 the short path
C would still load src[0].

	TEXT
	ALIGN(16)
PROLOGUE(mpn_mod_34lsub1)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %eax
	movl	PARAM_SRC, %edx

	subl	$2, %eax
	ja	L(three_or_more)	C unsigned compare, size >= 3

Zdisp(	movl,	0,(%edx), %eax)		C avoid code cache line boundary
	jne	L(one)			C flags still from the subl, presuming
					C Zdisp emits only the movl

	C size is 2: return src[0] + src[1]*2^32 folded modulo 2^24-1

	movl	%eax, %ecx
	movl	4(%edx), %edx		C src[1]

	shrl	$24, %eax		C src[0] high 8 bits, weight 2^24 == 1
	andl	$0x00FFFFFF, %ecx	C src[0] low 24 bits

	addl	%ecx, %eax
	movl	%edx, %ecx

	shll	$8, %edx
	andl	$0x00FFFF00, %edx	C src[1] low 16 bits, weight 2^32 == 2^8

	shrl	$16, %ecx		C src[1] high 16 bits, weight 2^48 == 1
	addl	%ecx, %eax

	addl	%edx, %eax

L(one):
	ret


L(three_or_more):
	C eax	size-2
	C ebx
	C ecx
	C edx	src

	movl	%ebx, SAVE_EBX		C parameter slots are free for reuse now
	xorl	%ebx, %ebx

	movl	%esi, SAVE_ESI
	pushl	%edi	FRAME_pushl()

	xorl	%esi, %esi
	xorl	%edi, %edi		C and clear carry flag

L(top):
	C eax	counter, limbs
	C ebx	acc 0mod3
	C ecx
	C edx	src, incrementing
	C esi	acc 1mod3
	C edi	acc 2mod3
	C ebp
	C
	C leal does not touch the flags and decl leaves the carry flag alone,
	C so the carry out of the edi add feeds the ebx add of the next pass.
	C That carry has weight 2^96 == 1, which is the weight of ebx.

	leal	-2(%eax), %eax
	leal	12(%edx), %edx

	adcl	-12(%edx), %ebx
	adcl	-8(%edx), %esi
	adcl	-4(%edx), %edi

	decl	%eax			C eax drops by 3 per pass in total
	jg	L(top)


	C eax is -2, -1 or 0 representing 0, 1 or 2 more limbs, respectively.
	C cl records the weight of the pending carry: shift 0 when the last
	C add went into edi, 8 when into ebx, 16 when into esi.  movb, incl
	C and decl all leave the carry flag as the last adcl set it.

	movb	$0, %cl
	incl	%eax

	js	L(combine)		C 0 more

Zdisp(	adcl,	0,(%edx), %ebx)		C avoid code cache line crossings

	movb	$8, %cl
	decl	%eax

	js	L(combine)		C 1 more

	adcl	4(%edx), %esi

	movb	$16, %cl


L(combine):
	C ebx, esi, edi	accumulators 0mod3, 1mod3, 2mod3
	C cl		shift for the pending carry
	C carry flag	carry out of the last adcl

	sbbl	%edx, %edx		C edx = 0 or -1 from the carry flag

	shll	%cl, %edx		C carry
	movl	%ebx, %eax		C 0mod3

	shrl	$24, %eax		C 0mod3 high
	andl	$0x00FFFFFF, %ebx	C 0mod3 low

	subl	%edx, %eax		C apply carry, subtracting the negative
	movl	%esi, %ecx		C 1mod3

	shrl	$16, %esi		C 1mod3 high
	addl	%ebx, %eax		C apply 0mod3 low

	andl	$0x0000FFFF, %ecx
	addl	%esi, %eax		C apply 1mod3 high

	shll	$8, %ecx		C 1mod3 low
	movl	%edi, %edx		C 2mod3

	shrl	$8, %edx		C 2mod3 high
	addl	%ecx, %eax		C apply 1mod3 low

	addl	%edx, %eax		C apply 2mod3 high
	andl	$0x000000FF, %edi

	shll	$16, %edi		C 2mod3 low
	movl	SAVE_EBX, %ebx

	addl	%edi, %eax		C apply 2mod3 low
	movl	SAVE_ESI, %esi

	popl	%edi

	ret

EPILOGUE()