1dnl AMD K6 mpn_modexact_1_odd -- exact division style remainder. 2 3dnl Copyright 2000-2003, 2007 Free Software Foundation, Inc. 4 5dnl This file is part of the GNU MP Library. 6dnl 7dnl The GNU MP Library is free software; you can redistribute it and/or modify 8dnl it under the terms of either: 9dnl 10dnl * the GNU Lesser General Public License as published by the Free 11dnl Software Foundation; either version 3 of the License, or (at your 12dnl option) any later version. 13dnl 14dnl or 15dnl 16dnl * the GNU General Public License as published by the Free Software 17dnl Foundation; either version 2 of the License, or (at your option) any 18dnl later version. 19dnl 20dnl or both in parallel, as here. 21dnl 22dnl The GNU MP Library is distributed in the hope that it will be useful, but 23dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25dnl for more details. 26dnl 27dnl You should have received copies of the GNU General Public License and the 28dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29dnl see https://www.gnu.org/licenses/. 30 31include(`../config.m4') 32 33 34C K6: 10.0 cycles/limb 35 36 37C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size, 38C mp_limb_t divisor); 39C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, 40C mp_limb_t divisor, mp_limb_t carry); 41C 42C A special case for high<divisor at the end measured only about 4 cycles 43C faster, and so isn't used. 44C 45C A special case for size==1 using a divl rather than the inverse measured 46C only about 5 cycles faster, and so isn't used. When size==1 and 47C high<divisor it can skip a division and be a full 24 cycles faster, but 48C this isn't an important case. 49 50defframe(PARAM_CARRY, 16) 51defframe(PARAM_DIVISOR,12) 52defframe(PARAM_SIZE, 8) 53defframe(PARAM_SRC, 4) 54 55 TEXT 56 57 ALIGN(32) 58PROLOGUE(mpn_modexact_1c_odd) 59deflit(`FRAME',0) 60 61 movl PARAM_DIVISOR, %ecx 62 pushl %esi FRAME_pushl() 63 64 movl PARAM_CARRY, %edx 65 jmp L(start_1c) 66 67EPILOGUE() 68 69 70 ALIGN(16) 71PROLOGUE(mpn_modexact_1_odd) 72deflit(`FRAME',0) 73 74 movl PARAM_DIVISOR, %ecx 75 pushl %esi FRAME_pushl() 76 77 xorl %edx, %edx 78L(start_1c): 79 pushl %edi FRAME_pushl() 80 81 shrl %ecx C d/2 82 movl PARAM_DIVISOR, %esi 83 84 andl $127, %ecx C d/2, 7 bits 85 pushl %ebp FRAME_pushl() 86 87ifdef(`PIC',` 88 LEA( binvert_limb_table, %edi) 89Zdisp( movzbl, 0,(%ecx,%edi), %edi) C inv 8 bits 90',` 91 movzbl binvert_limb_table(%ecx), %edi C inv 8 bits 92') 93 leal (%edi,%edi), %ecx C 2*inv 94 95 imull %edi, %edi C inv*inv 96 97 movl PARAM_SRC, %eax 98 movl PARAM_SIZE, %ebp 99 100 imull %esi, %edi C inv*inv*d 101 102 pushl %ebx FRAME_pushl() 103 leal (%eax,%ebp,4), %ebx C src end 104 105 subl %edi, %ecx C inv = 2*inv - inv*inv*d 106 leal (%ecx,%ecx), %edi C 2*inv 107 108 imull %ecx, %ecx C inv*inv 109 110 movl (%eax), %eax C src low limb 111 negl %ebp C -size 112 113 imull %esi, %ecx C inv*inv*d 114 115 subl %ecx, %edi C inv = 2*inv - inv*inv*d 116 117 ASSERT(e,` C d*inv == 1 mod 2^GMP_LIMB_BITS 118 pushl %eax 119 movl %esi, %eax 120 imull %edi, %eax 121 cmpl $1, %eax 122 popl %eax') 123 124 jmp L(entry) 125 126 127C Rotating the mul to the top of the loop saves 1 cycle, presumably by 128C hiding the loop control under the imul latency. 129C 130C The run time is 10 cycles, but decoding is only 9 (and the dependent chain 131C only 8). It's not clear how to get down to 9 cycles. 132C 133C The xor and rcl to handle the carry bit could be an sbb instead, with the 134C the carry bit add becoming a sub, but that doesn't save anything. 135 136L(top): 137 C eax (low product) 138 C ebx src end 139 C ecx carry bit, 0 or 1 140 C edx (high product, being carry limb) 141 C esi divisor 142 C edi inverse 143 C ebp counter, limbs, negative 144 145 mull %esi 146 147 movl (%ebx,%ebp,4), %eax 148 addl %ecx, %edx C apply carry bit to carry limb 149 150L(entry): 151 xorl %ecx, %ecx 152 subl %edx, %eax C apply carry limb 153 154 rcll %ecx 155 156 imull %edi, %eax 157 158 incl %ebp 159 jnz L(top) 160 161 162 163 popl %ebx 164 popl %ebp 165 166 mull %esi 167 168 popl %edi 169 popl %esi 170 171 leal (%ecx,%edx), %eax 172 173 ret 174 175EPILOGUE() 176ASM_END() 177