dnl  AMD K7 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.

dnl  Copyright 1999, 2000, 2001, 2002, 2005, 2008 Free Software Foundation,
dnl  Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C                            cycles/limb
C P5:
C P6 (model 0-8,10-12)
C P6 model 9  (Banias)
C P6 model 13 (Dothan)
C P4 model 0  (Willamette)
C P4 model 1  (?)
C P4 model 2  (Northwood)
C P4 model 3  (Prescott)
C P4 model 4  (Nocona)
C K6:
C K7:                            3.75
C K8:

C TODO
C  * Improve feed-in and wind-down code.  We beat the old code for all n != 1,
C    but lose by 2x for n == 1.
C Operation selection.  Depending on which of OPERATION_addmul_1 and
C OPERATION_submul_1 is defined, ADDSUB becomes the instruction that applies
C each product word to the destination and func becomes the symbol name.
ifdef(`OPERATION_addmul_1',`
      define(`ADDSUB', `add')
      define(`func', `mpn_addmul_1')
')
ifdef(`OPERATION_submul_1',`
      define(`ADDSUB', `sub')
      define(`func', `mpn_submul_1')
')

C NOTE(review): MULFUNC_PROLOGUE, ASM_START, TEXT, ALIGN, PROLOGUE, EPILOGUE,
C ASM_END and the local-label wrapper used below are not defined in this
C file; presumably they come from the included config.m4 -- confirm there.
MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)

C ---------------------------------------------------------------------------
C func -- mpn_addmul_1 or mpn_submul_1, as selected above.
C
C Multiplies each 32-bit limb read through the source pointer by the
C multiplier and adds the products to (addmul) or subtracts them from
C (submul) the limbs at the destination pointer, updating the destination in
C place.  The carry or borrow limb left over after the last limb is returned
C in eax.
C
C Arguments are read from the stack.  Offsets are relative to esp after the
C 16-byte save area has been allocated:
C   20(%esp)  destination pointer, read and written        -> edi
C   24(%esp)  source pointer, read only                    -> esi
C   28(%esp)  limb count; this slot is then overwritten and
C             used as the loop counter, count >> 2
C   32(%esp)  multiplier limb                              -> ecx
C NOTE(review): presumably these are rp, up, n and the multiplier limb of the
C documented GMP prototype for mpn_addmul_1 / mpn_submul_1 -- confirm against
C gmp.h.
C
C Register use in the body:
C   eax       next source limb going into mul, then low half of the product
C   edx       high half of the product; collects the final carry in the tail
C   ebx, ebp  swap between two roles from stage to stage: the word waiting
C             to be applied to the destination, and the accumulator that
C             builds the word after it
C   ebp, ebx, esi, edi are saved on entry and restored before returning.
C
C NOTE(review): a limb count of zero is not handled.  The first source limb
C is loaded unconditionally, and a zero count takes the b0 path into the loop
C with a zero counter, which decl then wraps around -- presumably callers
C guarantee count >= 1; confirm.
C ---------------------------------------------------------------------------
ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(func)
C Allocate 16 bytes and save the four registers the body overwrites.
	add	$-16, %esp
	mov	%ebp, (%esp)
	mov	%ebx, 4(%esp)
	mov	%esi, 8(%esp)
	mov	%edi, 12(%esp)

C Load the arguments, split the count into count >> 2 (stored back into the
C count slot as the loop counter) and count & 3 (in ebx, selects the feed-in
C path), and preload the first source limb into eax.
	mov	20(%esp), %edi
	mov	24(%esp), %esi
	mov	28(%esp), %eax
	mov	32(%esp), %ecx
	mov	%eax, %ebx
	shr	$2, %eax
	mov	%eax, 28(%esp)
	mov	(%esi), %eax
	and	$3, %ebx
	jz	L(b0)
	cmp	$2, %ebx
	jz	L(b2)
	jg	L(b3)

C Feed-in paths.  Each one forms the first product, places its halves in
C ebx and ebp in the order expected by the stage it joins, biases both
C pointers so that the fixed displacements used in the loop address the
C right limbs, and preloads the second source limb into eax.

C count & 3 == 1.  Pointers are biased by one limb.  With a zero loop
C counter the count is exactly 1 and only the final store at cj1 is needed;
C otherwise the loop is joined at stage 1.
L(b1):	lea	-4(%esi), %esi
	lea	-4(%edi), %edi
	mul	%ecx
	mov	%eax, %ebx
	mov	%edx, %ebp
	cmpl	$0, 28(%esp)
	jz	L(cj1)
	mov	8(%esi), %eax
	jmp	L(1)

C count & 3 == 2.  No pointer bias.  With a zero loop counter the count is
C exactly 2 and the two-limb tail at cj2 finishes the job; otherwise the
C loop is joined at stage 2.
L(b2):	mul	%ecx
	mov	%eax, %ebp
	mov	4(%esi), %eax
	mov	%edx, %ebx
	cmpl	$0, 28(%esp)
	jne	L(2)
	jmp	L(cj2)

C count & 3 == 3.  Pointers are biased by three limbs and the loop is joined
C at stage 3, the last one.  That stage runs straight into the counter
C decrement, so the counter is incremented here to pay for the partial pass.
L(b3):	lea	-12(%esi), %esi
	lea	-12(%edi), %edi
	mul	%ecx
	mov	%eax, %ebx
	mov	%edx, %ebp
	mov	16(%esi), %eax
	incl	28(%esp)
	jmp	L(3)

C count & 3 == 0.  Pointers are biased by two limbs and the loop is joined
C at stage 0.
L(b0):	lea	-8(%esi), %esi
	lea	-8(%edi), %edi
	mul	%ecx
	mov	%eax, %ebp
	mov	12(%esi), %eax
	mov	%edx, %ebx
	jmp	L(0)

C Main loop, four limbs per pass, in four stages entered at 2, 1, 0 or 3.
C Every stage does the same work with ebx and ebp swapping roles:
C   mul     edx:eax = source limb already in eax, times the multiplier
C   ADDSUB  apply the pending word to the destination; sets carry/borrow
C   mov $0  clear the register just consumed
C   adc     fold the low product half plus carry into the other register
C   mov     preload the next source limb into eax
C   adc     fold the high product half plus carry into the cleared register
C The carry flag is live between ADDSUB and the second adc of each stage.
C Registers are therefore cleared with mov and pointers advanced with lea,
C neither of which modifies flags.  Do not reorder these instructions.
	ALIGN(16)
L(top):	lea	16(%edi), %edi
L(2):	mul	%ecx
	ADDSUB	%ebp, 0(%edi)
	mov	$0, %ebp
	adc	%eax, %ebx
	mov	8(%esi), %eax
	adc	%edx, %ebp
L(1):	mul	%ecx
	ADDSUB	%ebx, 4(%edi)
	mov	$0, %ebx
	adc	%eax, %ebp
	mov	12(%esi), %eax
	adc	%edx, %ebx
L(0):	mul	%ecx
	ADDSUB	%ebp, 8(%edi)
	mov	$0, %ebp
	adc	%eax, %ebx
	adc	%edx, %ebp
	mov	16(%esi), %eax
C Last stage.  It also advances the source pointer by four limbs and counts
C down the loop counter kept in the count slot.  The destination pointer is
C advanced at the top of the next pass, or at the end label on exit.
L(3):	mul	%ecx
	ADDSUB	%ebx, 12(%edi)
	adc	%eax, %ebp
	mov	20(%esi), %eax
	lea	16(%esi), %esi
	mov	$0, %ebx
	adc	%edx, %ebx
	decl	28(%esp)
	jnz	L(top)

C Wind-down.  On arrival eax holds the last source limb, ebp the word
C pending for the second to last destination limb and ebx the partial word
C for the last one.  The end label is reached by fall-through only; no jump
C in this file targets it.
L(end):	lea	16(%edi), %edi
C Two destination limbs remain: form the last product, apply the pending
C word, and let edx absorb the carry out of the low product half.
L(cj2):	mul	%ecx
	ADDSUB	%ebp, (%edi)
	adc	%eax, %ebx
	adc	$0, %edx
C One destination limb remains: apply it and add the final carry or borrow
C into edx, which becomes the return value.
L(cj1):	ADDSUB	%ebx, 4(%edi)
	adc	$0, %edx
	mov	%edx, %eax
C Restore the saved registers, release the save area and return.
	mov	(%esp), %ebp
	mov	4(%esp), %ebx
	mov	8(%esp), %esi
	mov	12(%esp), %edi
	add	$16, %esp
	ret
EPILOGUE()
ASM_END()