dnl  mode1o.asm revision 1.1.1.1
dnl  Intel P6 mpn_modexact_1_odd -- exact division style remainder.

dnl  Copyright 2000, 2001, 2002, 2007 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C P6: 10.0 cycles/limb


C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
C                               mp_limb_t divisor);
C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
C                                mp_limb_t divisor, mp_limb_t carry);
C
C It's not worth skipping a step at the end when high<divisor since the main
C loop is only 10 cycles.

C Argument slots, 4 bytes apart and in the same order as the prototypes
C above.  PARAM_CARRY is the extra fourth argument of mpn_modexact_1c_odd
C and is read only by that entry point.

defframe(PARAM_CARRY,  16)
defframe(PARAM_DIVISOR,12)
defframe(PARAM_SIZE,   8)
defframe(PARAM_SRC,    4)

dnl  Not enough room under modexact_1 to make these re-use the parameter
dnl  space, unfortunately.
C Save slots for the three registers that are stored on entry and reloaded
C before the ret.  STACK_SPACE is the number of bytes taken from the stack
C to hold them.

defframe(SAVE_EBX, -4)
defframe(SAVE_ESI, -8)
defframe(SAVE_EDI, -12)
deflit(STACK_SPACE, 12)

	TEXT

C mpn_modexact_1c_odd: entry point taking an initial carry.  It loads the
C carry argument into ecx and joins the common code inside
C mpn_modexact_1_odd.

	ALIGN(16)
PROLOGUE(mpn_modexact_1c_odd)
deflit(`FRAME',0)

	movl	PARAM_CARRY, %ecx
	jmp	L(start_1c)

EPILOGUE()

C mpn_modexact_1_odd: entry point without a carry, which starts from
C ecx = 0.
C
C Common code: build an inverse of the divisor, run one subtract and
C multiply step per source limb, and return the final carry in eax.
C
C NOTE(review): the loop at L(top) is entered without testing the size,
C so size is presumably required to be at least 1 -- confirm against the
C callers.

	ALIGN(16)
PROLOGUE(mpn_modexact_1_odd)
deflit(`FRAME',0)

	xorl	%ecx, %ecx
L(start_1c):
	movl	PARAM_DIVISOR, %eax

	subl	$STACK_SPACE, %esp	FRAME_subl_esp(STACK_SPACE)

	movl	%esi, SAVE_ESI
	movl	PARAM_SRC, %esi

	shrl	%eax			C d/2
	movl	%edi, SAVE_EDI

	andl	$127, %eax		C 7 bit index into the inverse table

ifdef(`PIC',`
	LEA(	binvert_limb_table, %edi)
	movzbl	(%eax,%edi), %edi		C inv 8 bits
',`
	movzbl	binvert_limb_table(%eax), %edi	C inv 8 bits
')

	C The table value is refined twice below with
	C inv = 2*inv - inv*inv*d, and the ASSERT that follows checks the
	C result against d*inv == 1 mod 2^GMP_LIMB_BITS.

	xorl	%edx, %edx		C initial extra carry
	leal	(%edi,%edi), %eax	C 2*inv

	imull	%edi, %edi		C inv*inv

	movl	%ebx, SAVE_EBX
	movl	PARAM_SIZE, %ebx

	imull	PARAM_DIVISOR, %edi	C inv*inv*d

	subl	%edi, %eax		C inv = 2*inv - inv*inv*d
	leal	(%eax,%eax), %edi	C 2*inv

	imull	%eax, %eax		C inv*inv

	imull	PARAM_DIVISOR, %eax	C inv*inv*d

	leal	(%esi,%ebx,4), %esi	C src end
	negl	%ebx			C -size

	subl	%eax, %edi		C inv = 2*inv - inv*inv*d

	ASSERT(e,`	C d*inv == 1 mod 2^GMP_LIMB_BITS
	movl	PARAM_DIVISOR, %eax
	imull	%edi, %eax
	cmpl	$1, %eax')


C The dependent chain here is
C
C	subl	%edx, %eax       1
C	imull	%edi, %eax       4
C	mull	PARAM_DIVISOR    5
C			       ----
C	total			10
C
C and this is the measured speed.  No special scheduling is necessary, out
C of order execution hides the load latency.

L(top):
	C eax	scratch (src limb)
	C ebx	counter, limbs, negative
	C ecx	carry bit, 0 or 1
	C edx	carry limb, high of last product
	C esi	&src[size]
	C edi	inverse
	C ebp
	C
	C On the first pass edx is 0 and ecx holds the initial carry: 0 from
	C the mpn_modexact_1_odd entry, or the whole PARAM_CARRY limb from
	C the mpn_modexact_1c_odd entry.

	movl	(%esi,%ebx,4), %eax	C s = next src limb
	subl	%ecx, %eax		C s -= carry bit

	sbbl	%ecx, %ecx		C ecx = -borrow of that subtract
	subl	%edx, %eax		C s -= carry limb

	sbbl	$0, %ecx		C also count the second borrow

	imull	%edi, %eax		C q = s * inverse

	negl	%ecx			C borrows become a positive carry

	mull	PARAM_DIVISOR		C edx:eax = q * divisor

	incl	%ebx			C counter runs from -size up to 0
	jnz	L(top)


	C Return value is the last carry bit plus the high limb of the
	C last product.  lea forms the sum without touching the flags.

	movl	SAVE_ESI, %esi
	leal	(%ecx,%edx), %eax

	movl	SAVE_EDI, %edi

	movl	SAVE_EBX, %ebx
	addl	$STACK_SPACE, %esp

	ret

EPILOGUE()