dnl  AMD K7 mpn_modexact_1_odd -- exact division style remainder.

dnl  Copyright 2000, 2001, 2002, 2004, 2007 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C          cycles/limb
C Athlon:     11.0
C Hammer:      7.0


C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
C                               mp_limb_t divisor);
C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
C                                mp_limb_t divisor, mp_limb_t carry);
C
C With the loop running at just 11 cycles it doesn't seem worth bothering to
C check for high<divisor to save one step.
C
C Using a divl for size==1 measures slower than the modexact method, which
C is not too surprising since for the latter it's only about 24 cycles to
C calculate the modular inverse.
defframe(PARAM_CARRY,  16)
defframe(PARAM_DIVISOR,12)
defframe(PARAM_SIZE,   8)
defframe(PARAM_SRC,    4)

defframe(SAVE_EBX,     -4)
defframe(SAVE_ESI,     -8)
defframe(SAVE_EDI,    -12)
defframe(SAVE_EBP,    -16)

deflit(STACK_SPACE, 16)

C PARAM_* name the incoming stack arguments.  SAVE_* name the four slots
C inside the STACK_SPACE bytes this routine reserves on entry; they hold
C the caller values of ebx, esi, edi and ebp, which are overwritten below
C and reloaded before returning.

	TEXT


C mpn_modexact_1c_odd (src, size, divisor, carry)
C
C Entry point taking a starting carry.  It only fetches the carry
C parameter into ecx and then joins the code shared with
C mpn_modexact_1_odd.

	ALIGN(16)
PROLOGUE(mpn_modexact_1c_odd)
deflit(`FRAME',0)

	movl	PARAM_CARRY, %ecx
	jmp	L(common)

EPILOGUE()


C mpn_modexact_1_odd (src, size, divisor)
C
C Same computation as the 1c entry point, started with a zero carry.
C
C Setup: an inverse of the divisor modulo 2^32 is built in edi.  A byte is
C fetched from binvert_limb_table (defined elsewhere) indexed by bits 1-7
C of the divisor, then refined twice with inv = 2*inv - inv*inv*d.  The
C ASSERT after the setup checks d*inv == 1.
C
C Loop: for each source limb, low to high, the carry bit and the carry
C limb are subtracted, the difference is multiplied by the inverse, and
C the high half of that product times the divisor becomes the next carry
C limb.
C
C Return value (eax): final carry limb plus final carry bit.
C
C The loop body runs before the counter is tested, so size is expected to
C be at least 1.  The instruction order in the setup and the loop is
C deliberate (see the timing note before the loop) and is kept as is.

	ALIGN(16)
PROLOGUE(mpn_modexact_1_odd)
deflit(`FRAME',0)

	xorl	%ecx, %ecx		C no incoming carry
L(common):
	C ecx	starting carry (zero, or the carry parameter of 1c)

	movl	PARAM_DIVISOR, %eax
	subl	$STACK_SPACE, %esp	FRAME_subl_esp(STACK_SPACE)

	movl	%esi, SAVE_ESI
	movl	PARAM_DIVISOR, %esi	C esi = d for the rest of the routine

	movl	%edi, SAVE_EDI

	shrl	%eax			C d/2

	andl	$127, %eax		C 7 bit table index

ifdef(`PIC',`
	LEA(	binvert_limb_table, %edi)
	movzbl	(%eax,%edi), %edi		C inv 8 bits
',`
	movzbl	binvert_limb_table(%eax), %edi	C inv 8 bits
')

	xorl	%edx, %edx		C carry limb starts at zero
	leal	(%edi,%edi), %eax	C 2*inv

	imull	%edi, %edi		C inv*inv

	movl	%ebp, SAVE_EBP
	movl	PARAM_SIZE, %ebp

	movl	%ebx, SAVE_EBX
	movl	PARAM_SRC, %ebx

	imull	%esi, %edi		C inv*inv*d

	subl	%edi, %eax		C first step: 2*inv - inv*inv*d
	leal	(%eax,%eax), %edi	C 2*inv

	imull	%eax, %eax		C inv*inv

	imull	%esi, %eax		C inv*inv*d

	leal	(%ebx,%ebp,4), %ebx	C ebx = src + 4*size, the src end
	negl	%ebp			C ebp = -size, counted up to zero

	subl	%eax, %edi		C second step: 2*inv - inv*inv*d

	ASSERT(e,`	C d*inv == 1 mod 2^GMP_LIMB_BITS
	movl	%esi, %eax
	imull	%edi, %eax
	cmpl	$1, %eax')


C The dependent chain from one limb to the next is
C
C                            cycles
C	subl	%edx, %eax	1
C	imull	%edi, %eax	4
C	mull	%esi		6  (high limb)
C			      ----
C       total		       11
C
C Out of order execution hides the load latency for the source data, so no
C special scheduling is required.

L(loop):
	C eax	src limb, scratch
	C ebx	src end ptr
	C ecx	next carry bit, 0 or 1 (or initial carry param)
	C edx	carry limb, high of last product
	C esi	divisor
	C edi	inverse
	C ebp	counter, limbs, negative

	movl	(%ebx,%ebp,4), %eax

	subl	%ecx, %eax		C apply carry bit
	movl	$0, %ecx		C movl keeps the flags, xorl would not

	setc	%cl			C new carry bit, borrow of the subl above

	subl	%edx, %eax		C apply carry limb
	adcl	$0, %ecx		C add the borrow of this subl as well

	imull	%edi, %eax		C low half of difference * inverse

	mull	%esi			C edx = high half of eax * d

	incl	%ebp
	jnz	L(loop)


	C All limbs done: reload the saved registers, form the result and
	C release the frame.

	movl	SAVE_EBX, %ebx
	movl	SAVE_ESI, %esi
	movl	SAVE_EDI, %edi
	movl	SAVE_EBP, %ebp

	leal	(%ecx,%edx), %eax	C return carry limb + carry bit
	addl	$STACK_SPACE, %esp

	ret

EPILOGUE()