dnl  AMD64 mpn_add_n, mpn_sub_n

dnl  Copyright 2003, 2004, 2005, 2007, 2008, 2010 Free Software Foundation,
dnl  Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	 1.5
C AMD K10	 1.5
C Intel P4	 ?
C Intel core2	 4.9
C Intel corei	 ?
C Intel atom	 4
C VIA nano	 3.25

C The inner loop of this code is the result of running a code generation and
C optimization tool suite written by David Harvey and Torbjorn Granlund.
C INPUT PARAMETERS
C   rp  (rdi)  destination limb pointer
C   up  (rsi)  first source limb pointer
C   vp  (rdx)  second source limb pointer
C   n   (rcx)  limb count; assumes n >= 1 (standard mpn convention) --
C              n == 0 would underflow the remainder handling at L(lt4)
C   cy  (r8)   carry-in, low bit used (only for mpn_add_nc/mpn_sub_nc)
C
C Returns the carry/borrow out (0 or 1) in rax.
C
C Carry-flag discipline: the limb carry is kept in CF between ADCSBB
C instructions.  Every instruction on the paths between them (mov, lea,
C jrcxz, jmp, inc, dec) leaves CF intact -- inc/dec write the other
C arithmetic flags but never CF.  Do not reorder or substitute
C instructions here without checking flag effects.
define(`rp', `%rdi')
define(`up', `%rsi')
define(`vp', `%rdx')
define(`n', `%rcx')
define(`cy', `%r8')		C (only for mpn_add_nc)

ifdef(`OPERATION_add_n', `
	define(ADCSBB,	adc)
	define(func,	mpn_add_n)
	define(func_nc,	mpn_add_nc)')
ifdef(`OPERATION_sub_n', `
	define(ADCSBB,	sbb)
	define(func,	mpn_sub_n)
	define(func_nc,	mpn_sub_nc)')

MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)

ASM_START()
	TEXT
	ALIGN(16)
C Entry point with explicit carry-in (mpn_add_nc / mpn_sub_nc).
PROLOGUE(func_nc)
	mov	R32(n), R32(%rax)
	shr	$2, n			C n/4 = count of 4-limb groups
	and	$3, R32(%rax)		C rax = n mod 4 (also clears CF)
	bt	$0, %r8			C cy flag <- carry parameter
	jrcxz	L(lt4)			C no full group; jrcxz keeps CF

	mov	(up), %r8		C pre-load first two limbs of group
	mov	8(up), %r9
	dec	n			C preserves CF; ZF consumed at L(mid)
	jmp	L(mid)

EPILOGUE()
	ALIGN(16)
C Entry point with zero carry-in (mpn_add_n / mpn_sub_n).
PROLOGUE(func)
	mov	R32(n), R32(%rax)
	shr	$2, n			C n/4 = count of 4-limb groups
	and	$3, R32(%rax)		C rax = n mod 4; AND clears CF,
					C giving the required zero carry-in
	jrcxz	L(lt4)			C fewer than 4 limbs in total

	mov	(up), %r8		C pre-load first two limbs of group
	mov	8(up), %r9
	dec	n			C preserves CF; ZF consumed at L(mid)
	jmp	L(mid)

C Handle the final 1-3 limbs (rax = remaining count, CF = carry-in).
C Reached either directly for n < 4, or from L(end) with up/vp/rp
C already advanced past the 4-limb groups.
L(lt4):	dec	R32(%rax)		C preserves CF
	mov	(up), %r8
	jnz	L(2)			C more than one limb left?
	ADCSBB	(vp), %r8		C exactly 1 limb
	mov	%r8, (rp)
	adc	%eax, %eax		C rax was 0 here, so rax <- CF
	ret

L(2):	dec	R32(%rax)		C preserves CF
	mov	8(up), %r9
	jnz	L(3)			C more than two limbs left?
	ADCSBB	(vp), %r8		C exactly 2 limbs
	ADCSBB	8(vp), %r9
	mov	%r8, (rp)
	mov	%r9, 8(rp)
	adc	%eax, %eax		C rax was 0 here, so rax <- CF
	ret

L(3):	mov	16(up), %r10		C exactly 3 limbs
	ADCSBB	(vp), %r8
	ADCSBB	8(vp), %r9
	ADCSBB	16(vp), %r10
	mov	%r8, (rp)
	mov	%r9, 8(rp)
	mov	%r10, 16(rp)
	setc	R8(%rax)		C rax == 1 here; al <- CF gives 0/1
	ret

C Main loop, 4 limbs per iteration.  Software-pipelined: the up[] limbs
C for an iteration are loaded at the bottom of the previous one (and at
C L(mid) on entry), so n/4 - 1 iterations run through L(top).
	ALIGN(16)
L(top):	ADCSBB	(vp), %r8
	ADCSBB	8(vp), %r9
	ADCSBB	16(vp), %r10
	ADCSBB	24(vp), %r11
	mov	%r8, (rp)
	lea	32(up), up		C lea preserves CF
	mov	%r9, 8(rp)
	mov	%r10, 16(rp)
	dec	n			C preserves CF; sets ZF for jnz below
	mov	%r11, 24(rp)
	lea	32(vp), vp		C lea preserves CF
	mov	(up), %r8		C pre-load next group
	mov	8(up), %r9
	lea	32(rp), rp
L(mid):	mov	16(up), %r10
	mov	24(up), %r11
	jnz	L(top)			C ZF from dec n (CF untouched)

C Last full 4-limb group (its up[] limbs are already in r8-r11).
L(end):	lea	32(up), up
	ADCSBB	(vp), %r8
	ADCSBB	8(vp), %r9
	ADCSBB	16(vp), %r10
	ADCSBB	24(vp), %r11
	lea	32(vp), vp
	mov	%r8, (rp)
	mov	%r9, 8(rp)
	mov	%r10, 16(rp)
	mov	%r11, 24(rp)
	lea	32(rp), rp

	inc	R32(%rax)		C inc/dec pair tests rax (= n mod 4)
	dec	R32(%rax)		C for zero while preserving CF
	jnz	L(lt4)			C 1-3 limbs remain

	adc	%eax, %eax		C rax was 0 here, so rax <- CF
	ret
EPILOGUE()