1dnl Intel P6 mpn_lshsub_n -- mpn papillion support. 2 3dnl Copyright 2006 Free Software Foundation, Inc. 4 5dnl This file is part of the GNU MP Library. 6dnl 7dnl The GNU MP Library is free software; you can redistribute it and/or modify 8dnl it under the terms of either: 9dnl 10dnl * the GNU Lesser General Public License as published by the Free 11dnl Software Foundation; either version 3 of the License, or (at your 12dnl option) any later version. 13dnl 14dnl or 15dnl 16dnl * the GNU General Public License as published by the Free Software 17dnl Foundation; either version 2 of the License, or (at your option) any 18dnl later version. 19dnl 20dnl or both in parallel, as here. 21dnl 22dnl The GNU MP Library is distributed in the hope that it will be useful, but 23dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25dnl for more details. 26dnl 27dnl You should have received copies of the GNU General Public License and the 28dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29dnl see https://www.gnu.org/licenses/. 30 31include(`../config.m4') 32 33C P6/13: 3.35 cycles/limb (separate mpn_sub_n + mpn_lshift needs 4.12) 34 35C (1) The loop is not scheduled in any way, and scheduling attempts have not 36C improved speed on P6/13. Presumably, the K7 will want scheduling, if it 37C at all wants to use MMX. 38C (2) We could save a register by not alternatingly using eax and edx in the 39C loop. 

define(`rp',	`%edi')
define(`up',	`%esi')
define(`vp',	`%ebx')
define(`n',	`%ecx')
define(`cnt',	`%mm7')

C mpn_lshsub_n -- for each limb compute up[i] - vp[i] - borrow, shift the
C stream of differences left by the bit count, and store the low n limbs of
C the shifted result at rp.
C
C Stack arguments as read below (offsets are after the three pushes):
C	16(%esp)  rp     destination pointer
C	20(%esp)  up     minuend pointer
C	24(%esp)  vp     subtrahend pointer
C	28(%esp)  n      limb count
C	32(%esp)  count  left shift in bits
C
C Return value (eax): (borrow << count) | (top difference limb >> (32-count)).
C
C Register use: edi, esi, ebx are saved and restored; eax, ecx, edx, mm0, mm1
C and mm7 are overwritten; emms is executed before returning.
C
C NOTE(review): assumes n >= 1 and presumably 1 <= count <= 31 -- confirm
C against callers.  With n = 0 the dispatch below would still run one full
C pass over eight limbs.

ASM_START()

	TEXT
	ALIGN(16)

PROLOGUE(mpn_lshsub_n)
	push	%edi
	push	%esi
	push	%ebx

	mov	16(%esp), rp
	mov	20(%esp), up
	mov	24(%esp), vp
	mov	28(%esp), n
	mov	$32, %eax
	sub	32(%esp), %eax
	movd	%eax, cnt			C mm7 = 32 - count

	C Point one limb below the end of each operand.  Every block of the
	C unrolled loop then addresses memory with a nonzero byte displacement
	C (4..32), so all eight blocks assemble to the same 22 bytes.  A zero
	C displacement is dropped by the assembler, which would shorten the
	C first block to 19 bytes and break the 22x computed jump below.
	lea	-4(up,n,4), up
	lea	-4(vp,n,4), vp
	lea	-4(rp,n,4), rp

	C Split -n into a loop index (multiple of 8, in ecx) and the number x
	C of leading blocks to skip on the first pass (0..7, in eax).
	neg	n
	mov	n, %eax
	and	$-8, n
	and	$7, %eax
	shl	%eax				C eax = 2x
	lea	(%eax,%eax,4), %edx		C edx = 10x
ifdef(`PIC',`
	call	L(pic_calc)
L(here):
',`
	lea	L(ent)(%eax,%edx,2), %eax	C eax = L(ent) + 22x
')

	C Both MMX accumulators start at zero so the first limb processed
	C gets zero bits shifted in, whichever block is entered first.
	pxor	%mm1, %mm1
	pxor	%mm0, %mm0

	jmp	*%eax

ifdef(`PIC',`
L(pic_calc):
	C See mpn/x86/README about old gas bugs
	lea	(%eax,%edx,2), %eax		C eax = 22x
	add	$L(ent)-L(here), %eax
	add	(%esp), %eax			C add return address, i.e. L(here)
	ret_internal
')

L(end):	C compute (cy<<cnt) | (edx>>(32-cnt))
	sbb	%eax, %eax
	neg	%eax				C eax = final borrow, 0 or 1
	mov	32(%esp), %ecx			C reload count for the shld
	shld	%cl, %edx, %eax			C edx = last difference limb

	emms

	pop	%ebx
	pop	%esi
	pop	%edi
	ret

	C Unrolled loop, eight limbs per pass.  Blocks alternate between
	C eax/mm0 and edx/mm1: each block packs new:previous difference into
	C one 64-bit MMX register, shifts right by 32-count and stores the low
	C 32 bits.  The last block always leaves its difference in edx for
	C L(end).  Each block must remain exactly 22 bytes.
	ALIGN(16)
L(top):	jecxz	L(end)
L(ent):	mov	4(up,n,4), %eax
	sbb	4(vp,n,4), %eax
	movd	%eax, %mm0
	punpckldq %mm0, %mm1
	psrlq	%mm7, %mm1
	movd	%mm1, 4(rp,n,4)

	mov	8(up,n,4), %edx
	sbb	8(vp,n,4), %edx
	movd	%edx, %mm1
	punpckldq %mm1, %mm0
	psrlq	%mm7, %mm0
	movd	%mm0, 8(rp,n,4)

	mov	12(up,n,4), %eax
	sbb	12(vp,n,4), %eax
	movd	%eax, %mm0
	punpckldq %mm0, %mm1
	psrlq	%mm7, %mm1
	movd	%mm1, 12(rp,n,4)

	mov	16(up,n,4), %edx
	sbb	16(vp,n,4), %edx
	movd	%edx, %mm1
	punpckldq %mm1, %mm0
	psrlq	%mm7, %mm0
	movd	%mm0, 16(rp,n,4)

	mov	20(up,n,4), %eax
	sbb	20(vp,n,4), %eax
	movd	%eax, %mm0
	punpckldq %mm0, %mm1
	psrlq	%mm7, %mm1
	movd	%mm1, 20(rp,n,4)

	mov	24(up,n,4), %edx
	sbb	24(vp,n,4), %edx
	movd	%edx, %mm1
	punpckldq %mm1, %mm0
	psrlq	%mm7, %mm0
	movd	%mm0, 24(rp,n,4)

	mov	28(up,n,4), %eax
	sbb	28(vp,n,4), %eax
	movd	%eax, %mm0
	punpckldq %mm0, %mm1
	psrlq	%mm7, %mm1
	movd	%mm1, 28(rp,n,4)

	mov	32(up,n,4), %edx
	sbb	32(vp,n,4), %edx
	movd	%edx, %mm1
	punpckldq %mm1, %mm0
	psrlq	%mm7, %mm0
	movd	%mm0, 32(rp,n,4)

	lea	8(n), n				C lea keeps the borrow in CF intact
	jmp	L(top)

EPILOGUE()