dnl  mpn_mul_1 for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).

dnl  Copyright 2005, 2007, 2011 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C			    cycles/limb
C P6 model 0-8,10-12		-
C P6 model 9   (Banias)		4.17
C P6 model 13  (Dothan)		4.17
C P4 model 0-1 (Willamette)	4
C P4 model 2   (Northwood)	4
C P4 model 3-4 (Prescott)	4.55

C TODO:
C  * Tweak eax/edx offsets in loop as to save some lea's
C  * Perhaps software pipeline small-case code

C INPUT PARAMETERS
C rp		sp + 4
C up		sp + 8
C n		sp + 12
C v0		sp + 16
C
C mpn_mul_1c additionally reads a 32-bit carry-in from sp + 20.

C OVERVIEW
C Each 32-bit limb loaded from up is multiplied by v0 with pmuludq, added
C into the 64-bit accumulator mm6, the low 32 bits of mm6 are stored through
C rp, and mm6 is shifted right by 32 so that it holds the carry for the next
C limb.  The final carry is returned in eax.  emms is executed before each
C ret.
C
C REGISTER USE
C edx	rp pointer
C eax	up pointer, and the return value on exit
C ecx	n on entry; in the big case (n mod 4) - n, see below
C mm7	v0
C mm6	running carry / accumulator (zero for mpn_mul_1, carry-in for
C	mpn_mul_1c)
C mm0-mm3	limbs and products in flight
C
C n < 4 uses a one-limb-per-pass loop.  n >= 4 uses a 4-way unrolled loop
C that is entered at one of four points selected by n mod 4; each entry
C sequence pre-biases eax and edx so that the fixed displacements in the
C loop body address the correct limbs.
C
C NOTE(review): n = 0 is not handled.  The small loop decrements ecx before
C testing it, so a zero count would wrap around.  Presumably callers
C guarantee n >= 1 -- confirm against the mpn interface contract.

	TEXT
	ALIGN(16)
PROLOGUE(mpn_mul_1)
	pxor	%mm6, %mm6		C no carry-in
L(ent):	mov	4(%esp), %edx		C rp
	mov	8(%esp), %eax		C up
	mov	12(%esp), %ecx		C n
	movd	16(%esp), %mm7		C v0
	cmp	$4, %ecx
	jnc	L(big)			C n >= 4 (unsigned)

C Small case, n < 4: one limb per pass.
L(lp0):	movd	(%eax), %mm0
	lea	4(%eax), %eax
	lea	4(%edx), %edx
	pmuludq	%mm7, %mm0
	paddq	%mm0, %mm6
	movd	%mm6, -4(%edx)		C edx was already advanced
	psrlq	$32, %mm6		C keep carry
	dec	%ecx
	jnz	L(lp0)
	movd	%mm6, %eax		C return carry-out
	emms
	ret

C Entry sequences for the big case.  On arrival ecx = n mod 4; the
C subtraction turns it into (n mod 4) - n, a negative multiple of 4 that the
C main loop counts up towards zero in steps of 4.

L(0):	movd	(%eax), %mm3		C n mod 4 == 0
	sub	12(%esp), %ecx		C loop count
	lea	-16(%eax), %eax
	lea	-12(%edx), %edx
	pmuludq	%mm7, %mm3
	movd	20(%eax), %mm0
	pmuludq	%mm7, %mm0
	movd	24(%eax), %mm1
	jmp	L(00)

L(1):	movd	(%eax), %mm2		C n mod 4 == 1
	sub	12(%esp), %ecx		C loop count
	lea	-12(%eax), %eax
	lea	-8(%edx), %edx
	pmuludq	%mm7, %mm2
	movd	16(%eax), %mm3
	pmuludq	%mm7, %mm3
	movd	20(%eax), %mm0
	jmp	L(01)

L(2):	movd	(%eax), %mm1		C n mod 4 == 2
	sub	12(%esp), %ecx		C loop count
	lea	-8(%eax), %eax
	lea	-4(%edx), %edx
	pmuludq	%mm7, %mm1
	movd	12(%eax), %mm2
	pmuludq	%mm7, %mm2
	movd	16(%eax), %mm3
	jmp	L(10)

C Big case, n >= 4: dispatch on n mod 4.
L(big):	and	$3, %ecx
	je	L(0)
	cmp	$2, %ecx
	jc	L(1)			C ecx == 1
	je	L(2)			C ecx == 2
					C ecx == 3 falls through

L(3):	movd	(%eax), %mm0		C n mod 4 == 3
	sub	12(%esp), %ecx		C loop count
	lea	-4(%eax), %eax
	pmuludq	%mm7, %mm0
	movd	8(%eax), %mm1
	pmuludq	%mm7, %mm1
	movd	12(%eax), %mm2

C Main loop, four limbs per pass.  Each stage multiplies a limb loaded
C earlier, adds an already computed product into mm6, loads a later limb,
C and stores one result limb.
	ALIGN(16)
L(top):	pmuludq	%mm7, %mm2
	paddq	%mm0, %mm6
	movd	16(%eax), %mm3
	movd	%mm6, 0(%edx)
	psrlq	$32, %mm6
L(10):	pmuludq	%mm7, %mm3
	paddq	%mm1, %mm6
	movd	20(%eax), %mm0
	movd	%mm6, 4(%edx)
	psrlq	$32, %mm6
L(01):	pmuludq	%mm7, %mm0
	paddq	%mm2, %mm6
	movd	24(%eax), %mm1
	movd	%mm6, 8(%edx)
	psrlq	$32, %mm6
L(00):	pmuludq	%mm7, %mm1
	paddq	%mm3, %mm6
	movd	28(%eax), %mm2
	movd	%mm6, 12(%edx)
	psrlq	$32, %mm6
	lea	16(%eax), %eax
	lea	16(%edx), %edx
	add	$4, %ecx
	ja	L(top)			C ecx is a negative multiple of 4, so
					C the add carries (and gives zero)
					C only on the last pass

C Wind down: the products in mm0 and mm1 and the limb in mm2 are still
C pending; finish them and store the last three result limbs.
L(end):	pmuludq	%mm7, %mm2
	paddq	%mm0, %mm6
	movd	%mm6, 0(%edx)
	psrlq	$32, %mm6
	paddq	%mm1, %mm6
	movd	%mm6, 4(%edx)
	psrlq	$32, %mm6
	paddq	%mm2, %mm6
	movd	%mm6, 8(%edx)
	psrlq	$32, %mm6
	movd	%mm6, %eax		C return carry-out
	emms
	ret
EPILOGUE()

C mpn_mul_1c: same as mpn_mul_1, except that the accumulator starts from the
C 32-bit carry-in at sp + 20 instead of zero.  Control joins mpn_mul_1 at
C its shared entry label.
PROLOGUE(mpn_mul_1c)
	movd	20(%esp), %mm6		C carry-in
	jmp	L(ent)
EPILOGUE()