dnl  X86-64 mpn_add_n, mpn_sub_n, optimized for Intel Atom.

dnl  Copyright 2003, 2004, 2005, 2007, 2008, 2010 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb
C K8,K9:	 1.85
C K10:		 ?
C P4:		 ?
C P6-15 (Core2): ?
C P6-28 (Atom):	 3

C OVERVIEW
C
C Syntax is AT&T operand order (source, destination), preprocessed by m4.
C One source file yields either the add or the sub pair, selected by
C OPERATION_add_n / OPERATION_sub_n: ADCSBB expands to adc or sbb.
C
C   func     rp[i] = up[i] ADCSBB vp[i] for i = 0 .. n-1, carry-in 0
C   func_nc  same, but the carry-in is taken from cy
C
C Register roles on entry (these match the System V AMD64 integer argument
C order):
C   rdi = rp   destination limb pointer
C   rsi = up   first source limb pointer
C   rdx = vp   second source limb pointer
C   rcx = n    limb count
C   r8  = cy   carry-in, read by func_nc only; func zeroes it
C Result:    rax = final carry/borrow flag (0 or 1)
C Modified:  rax, rcx, rdx, rsi, rdi, r8, r9, r10, r11, flags
C Stack:     not used
C
C NOTE(review): n = 0 is not handled.  It would take the b0 path, load
C (%rsi), and decrement rcx from 0, so the loop would not terminate
C sensibly.  Presumably callers guarantee n >= 1 -- confirm.

C INPUT PARAMETERS
define(`rp',	`%rdi')
define(`up',	`%rsi')
define(`vp',	`%rdx')
define(`n',	`%rcx')
define(`cy',	`%r8')		C (only for mpn_add_nc)

ifdef(`OPERATION_add_n', `
	define(ADCSBB,	      adc)
	define(func,	      mpn_add_n)
	define(func_nc,	      mpn_add_nc)')
ifdef(`OPERATION_sub_n', `
	define(ADCSBB,	      sbb)
	define(func,	      mpn_sub_n)
	define(func_nc,	      mpn_sub_nc)')

MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)

ASM_START()
	TEXT
	ALIGN(16)
C func_nc enters at L(ent), skipping the xor below, so r8 still holds the
C carry-in supplied by the caller.
PROLOGUE(func_nc)
	jmp	L(ent)
EPILOGUE()
PROLOGUE(func)
	xor	%r8, %r8			C carry-in = 0
L(ent):
C Split n into a loop count (n / 4) and a remainder (n mod 4), then dispatch
C on the remainder.  Each entry path below preloads up limbs and biases the
C pointers so that the fixed displacements used at its entry label into the
C loop (or into the tail) address the right limbs.
	mov	R32(%rcx), R32(%rax)
	shr	$2, %rcx			C rcx = n / 4
	and	$3, R32(%rax)			C eax = n mod 4
	jz	L(b0)
	cmp	$2, R32(%rax)
	jz	L(b2)
	jg	L(b3)				C eax is 1, 2 or 3 here; 1 falls through

C n mod 4 = 1.  In this and the b2/b3 paths the shr that loads CF comes
C after the test of rcx, because test overwrites CF.
L(b1):	mov	(%rsi), %r10
	test	%rcx, %rcx
	jnz	L(gt1)
	C n = 1: handle the single limb and return
	shr	R32(%r8)			C Set CF from argument
	ADCSBB	(%rdx), %r10
	mov	%r10, (%rdi)
	mov	R32(%rcx), R32(%rax)		C zero rax
	adc	R32(%rax), R32(%rax)		C rax = CF (plain adc in both variants)
	ret
L(gt1):	shr	R32(%r8)			C CF = bit 0 of carry-in
	ADCSBB	(%rdx), %r10			C limb 0
	mov	8(%rsi), %r11			C preload up[1] for L(m0)
	lea	16(%rsi), %rsi
	lea	-16(%rdx), %rdx
	lea	-16(%rdi), %rdi			C so that 16(%rdi) at L(m1) is rp[0]
	jmp	L(m1)

C n mod 4 = 2.
L(b2):	mov	(%rsi), %r9
	mov	8(%rsi), %r10
	lea	-8(%rdx), %rdx
	test	%rcx, %rcx
	jnz	L(gt2)
	C n = 2: go straight to the last two tail steps
	shr	R32(%r8)
	lea	-40(%rdi), %rdi			C so that 40(%rdi) at L(e2) is rp[0]
	jmp	L(e2)
L(gt2):	shr	R32(%r8)
	ADCSBB	8(%rdx), %r9			C limb 0 (rdx was biased by -8)
	mov	16(%rsi), %r11			C preload up[2] for L(m0)
	lea	-8(%rsi), %rsi
	lea	-8(%rdi), %rdi			C so that 8(%rdi) at L(m2) is rp[0]
	jmp	L(m2)

C n mod 4 = 3.
L(b3):	mov	(%rsi), %rax
	mov	8(%rsi), %r9
	mov	16(%rsi), %r10
	test	%rcx, %rcx
	jnz	L(gt3)
	C n = 3: the whole operation is the three tail steps
	shr	R32(%r8)
	lea	-32(%rdi), %rdi			C so that 32(%rdi) at L(e3) is rp[0]
	jmp	L(e3)
L(gt3):	shr	R32(%r8)
	ADCSBB	(%rdx), %rax			C limb 0
	jmp	L(m3)

C n mod 4 = 0.
C NOTE(review): this path derives CF with neg (CF set when r8d is nonzero)
C while the other paths use shr (CF = bit 0 of r8d).  The two agree when cy
C is 0 or 1 -- presumably the caller contract, confirm.
L(b0):	mov	(%rsi), %r11
	neg	R32(%r8)
	lea	-24(%rdx), %rdx			C so that 24(%rdx) at L(m0) is vp[0]
	lea	-24(%rdi), %rdi
	lea	8(%rsi), %rsi
	jmp	L(m0)

C Main loop, four limbs per pass.  Loads from up run ahead of the ADCSBB
C that consumes them.  Between consecutive ADCSBBs only mov, lea and dec
C are used; none of them writes CF, so the carry chain passes from one
C limb to the next and across iterations.
	ALIGN(8)
L(top):	mov	%r11, 24(%rdi)
	ADCSBB	(%rdx), %rax
	lea	32(%rdi), %rdi
L(m3):	mov	%rax, (%rdi)
	ADCSBB	8(%rdx), %r9
	mov	24(%rsi), %r11
L(m2):	mov	%r9, 8(%rdi)
	ADCSBB	16(%rdx), %r10
	lea	32(%rsi), %rsi
L(m1):	mov	%r10, 16(%rdi)
L(m0):	ADCSBB	24(%rdx), %r11
	mov	(%rsi), %rax			C preload the next three up limbs;
	mov	8(%rsi), %r9			C on the last pass these feed the tail
	lea	32(%rdx), %rdx
	dec	%rcx				C leaves CF untouched
	mov	16(%rsi), %r10
	jnz	L(top)

C Tail: store the limb still pending in r11, then finish the last three
C limbs held in rax, r9 and r10.  rcx is zero on every path that reaches
C here, which is what makes the mov below zero rax.  L(e1) is only reached
C by fall-through; no jump in this file targets it.
	mov	%r11, 24(%rdi)
L(e3):	ADCSBB	(%rdx), %rax
	mov	%rax, 32(%rdi)
L(e2):	ADCSBB	8(%rdx), %r9
	mov	%r9, 40(%rdi)
L(e1):	ADCSBB	16(%rdx), %r10
	mov	%r10, 48(%rdi)
	mov	R32(%rcx), R32(%rax)		C zero rax
	adc	R32(%rax), R32(%rax)		C rax = CF (plain adc in both variants)
	ret
EPILOGUE()