dnl  mpn_addmul_1 for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).

dnl  Copyright 2005, 2007 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.


include(`../config.m4')

C TODO:
C  * Tweak eax/edx offsets in loop as to save some lea's
C  * Perhaps software pipeline small-case code

C			    cycles/limb
C P6 model 0-8,10-12		-
C P6 model 9   (Banias)		?
C P6 model 13  (Dothan)		5.24
C P4 model 0-1 (Willamette):	5
C P4 model 2   (Northwood):	5
C P4 model 3-4 (Prescott):	5

C INPUT PARAMETERS
C rp		sp + 4
C up		sp + 8
C n		sp + 12
C v0		sp + 16
C cy		sp + 20		(read by mpn_addmul_1c only)

C Multiply the n limbs at up by v0, add the products into the n limbs at rp,
C and return the final carry-out limb in eax.  mpn_addmul_1c additionally
C starts from the carry-in limb found at sp + 20; mpn_addmul_1 starts from 0.
C
C Register use:
C   edx		rp, advanced as limbs are stored
C   eax		up, advanced as limbs are loaded
C   ecx		n on entry; down-counter in the small case, negative
C		up-counter (stepped by 4) in the unrolled loop
C   mm7		v0
C   mm6		running 64-bit sum; its low half is the limb to store, its
C		high half (after psrlq 32) the carry into the next limb
C   mm0-mm3	up limbs and their products with v0
C   mm4,mm5	rp limbs and partial sums
C
C Only eax, ecx, edx and MMX registers are written, nothing is pushed, and
C emms is executed before each ret.
C
C n = 0 is not handled: the small-case loop decrements ecx before testing it.

	TEXT
	ALIGN(16)
PROLOGUE(mpn_addmul_1c)
	mov	4(%esp), %edx		C rp
	mov	8(%esp), %eax		C up
	mov	12(%esp), %ecx		C n
	movd	16(%esp), %mm7		C v0
	movd	20(%esp), %mm6		C carry-in
	jmp	L(ent)			C share the body of mpn_addmul_1
EPILOGUE()
	ALIGN(16)
PROLOGUE(mpn_addmul_1)
	mov	4(%esp), %edx		C rp
	mov	8(%esp), %eax		C up
	mov	12(%esp), %ecx		C n
	movd	16(%esp), %mm7		C v0
	pxor	%mm6, %mm6		C carry-in = 0
L(ent):	cmp	$4, %ecx
	jnc	L(big)			C n >= 4 (unsigned): use unrolled loop

C Small case, n < 4: one limb per iteration, no software pipelining.
L(lp0):	movd	(%eax), %mm0		C up limb
	lea	4(%eax), %eax
	movd	(%edx), %mm4		C rp limb
	lea	4(%edx), %edx
	pmuludq	%mm7, %mm0		C 64-bit product up limb * v0
	paddq	%mm0, %mm4
	paddq	%mm4, %mm6		C add into running sum
	movd	%mm6, -4(%edx)		C store low half to the limb just read
	psrlq	$32, %mm6		C keep high half as carry
	dec	%ecx
	jnz	L(lp0)
	movd	%mm6, %eax		C return carry-out
	emms
	ret

C Large case, n >= 4.  Dispatch on n mod 4 to one of four entry sequences.
C Each sequence preloads the registers the loop expects, biases eax/edx so
C that the fixed offsets used inside the loop address the right limbs, and
C sets ecx = (n mod 4) - n, a negative multiple of 4 that the loop steps
C towards zero.
L(big):	and	$3, %ecx
	je	L(0)
	cmp	$2, %ecx
	jc	L(1)
	je	L(2)
	jmp	L(3)			C FIXME: one case should fall through

C n mod 4 = 0: enter the loop at L(00).
L(0):	movd	(%eax), %mm3
	sub	12(%esp), %ecx		C loop count
	lea	-16(%eax), %eax
	lea	-12(%edx), %edx
	pmuludq	%mm7, %mm3
	movd	20(%eax), %mm0
	movd	12(%edx), %mm5
	pmuludq	%mm7, %mm0
	movd	24(%eax), %mm1
	paddq	%mm3, %mm5
	movd	16(%edx), %mm4
	jmp	L(00)

C n mod 4 = 1: enter the loop at L(01).
L(1):	movd	(%eax), %mm2
	sub	12(%esp), %ecx		C loop count
	lea	-12(%eax), %eax
	lea	-8(%edx), %edx
	movd	8(%edx), %mm4
	pmuludq	%mm7, %mm2
	movd	16(%eax), %mm3
	pmuludq	%mm7, %mm3
	movd	20(%eax), %mm0
	paddq	%mm2, %mm4
	movd	12(%edx), %mm5
	jmp	L(01)

C n mod 4 = 2: enter the loop at L(10).
L(2):	movd	(%eax), %mm1
	sub	12(%esp), %ecx		C loop count
	lea	-8(%eax), %eax
	lea	-4(%edx), %edx
	pmuludq	%mm7, %mm1
	movd	12(%eax), %mm2
	movd	4(%edx), %mm5
	pmuludq	%mm7, %mm2
	movd	16(%eax), %mm3
	paddq	%mm1, %mm5
	movd	8(%edx), %mm4
	jmp	L(10)

C n mod 4 = 3: falls through into L(top); edx needs no bias here.
L(3):	movd	(%eax), %mm0
	sub	12(%esp), %ecx		C loop count
	lea	-4(%eax), %eax
	pmuludq	%mm7, %mm0
	movd	8(%eax), %mm1
	movd	(%edx), %mm4
	pmuludq	%mm7, %mm1
	movd	12(%eax), %mm2
	paddq	%mm0, %mm4
	movd	4(%edx), %mm5

C Main loop, four limbs per pass.  Each of the four stages multiplies one
C previously loaded up limb, adds a finished partial sum into mm6, loads an
C up limb and an rp limb for later stages, stores one result limb and
C shifts the carry down.  L(10), L(01) and L(00) are the mid-loop entry
C points used by the dispatch code above.
	ALIGN(16)
L(top):	pmuludq	%mm7, %mm2
	paddq	%mm4, %mm6
	movd	16(%eax), %mm3
	paddq	%mm1, %mm5
	movd	8(%edx), %mm4
	movd	%mm6, 0(%edx)
	psrlq	$32, %mm6
L(10):	pmuludq	%mm7, %mm3
	paddq	%mm5, %mm6
	movd	20(%eax), %mm0
	paddq	%mm2, %mm4
	movd	12(%edx), %mm5
	movd	%mm6, 4(%edx)
	psrlq	$32, %mm6
L(01):	pmuludq	%mm7, %mm0
	paddq	%mm4, %mm6
	movd	24(%eax), %mm1
	paddq	%mm3, %mm5
	movd	16(%edx), %mm4
	movd	%mm6, 8(%edx)
	psrlq	$32, %mm6
L(00):	pmuludq	%mm7, %mm1
	paddq	%mm5, %mm6
	movd	28(%eax), %mm2
	paddq	%mm0, %mm4
	movd	20(%edx), %mm5
	movd	%mm6, 12(%edx)
	psrlq	$32, %mm6
	lea	16(%eax), %eax		C advance up by four limbs
	lea	16(%edx), %edx		C advance rp by four limbs
	add	$4, %ecx
	jnz	L(top)			C until the negative counter reaches 0

C Wind-down: finish and store the three limbs still in flight, without
C loading any further up limbs.
L(end):	pmuludq	%mm7, %mm2
	paddq	%mm4, %mm6
	paddq	%mm1, %mm5
	movd	8(%edx), %mm4		C last rp limb
	movd	%mm6, 0(%edx)
	psrlq	$32, %mm6
	paddq	%mm5, %mm6
	paddq	%mm2, %mm4
	movd	%mm6, 4(%edx)
	psrlq	$32, %mm6
	paddq	%mm4, %mm6
	movd	%mm6, 8(%edx)
	psrlq	$32, %mm6
	movd	%mm6, %eax		C return carry-out
	emms
	ret
EPILOGUE()