dnl  ARM mpn_add_n/mpn_sub_n optimised for A15.

dnl  Copyright 2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb	best
C StrongARM:	 -
C XScale	 ?
C Cortex-A7	 ?
C Cortex-A8	 ?
C Cortex-A9	 3.55		2.5
C Cortex-A15	 1.27		this

C This was a major improvement compared to the code we had before, but it might
C not be the best 8-way code possible.  We've tried some permutations of auto-
C increments and separate pointer updates, but they all ran at the same speed
C on A15.

C Architecture requirements:
C v5	 -
C v5t	 -
C v5te	 ldrd strd
C v6	 -
C v6t2	 -
C v7a	 -

C Register roles.  The four arguments arrive in r0-r3.  The _nc entry point
C additionally reads a carry-in word from the top of the stack.
C   rp  destination limb pointer
C   up  first source limb pointer
C   vp  second source limb pointer
C   n   limb count on entry, later the count of remaining 4-limb groups
define(`n',  `r3')
define(`vp', `r2')
define(`up', `r1')
define(`rp', `r0')

C Operation selection.  Exactly one of the two groups below is expected to be
C active for a given build of this file.
C   ADDSUBC  carry-propagating add (adcs) or subtract (sbcs)
C   SETCY    seed the carry flag from a carry-in register; a nonzero value
C            gives C=1 for add and C=0 (borrow pending) for subtract
C   RETVAL   put the final carry/borrow (0 or 1) in r0, used where n is 0
C   RETVAL2  the same, used where n is -1
C   IFADD    keeps its argument for add, drops it for subtract (not
C            referenced in the code below)
ifdef(`OPERATION_add_n', `
  define(`func',	mpn_add_n)
  define(`func_nc',	mpn_add_nc)
  define(`ADDSUBC',	adcs)
  define(`IFADD',	`$1')
  define(`SETCY',	`cmp	$1, #1')
  define(`RETVAL',	`adc	r0, n, #0')
  define(`RETVAL2',	`adc	r0, n, #1')')
ifdef(`OPERATION_sub_n', `
  define(`func',	mpn_sub_n)
  define(`func_nc',	mpn_sub_nc)
  define(`ADDSUBC',	sbcs)
  define(`IFADD',	`')
  define(`SETCY',	`rsbs	$1, $1, #0')
  define(`RETVAL',	`sbc	r0, r0, r0
			and	r0, r0, #1')
  define(`RETVAL2',	`RETVAL')')

MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)

ASM_START()

C func_nc: pick up the carry-in from the first stack word, then join the
C common body inside func.
PROLOGUE(func_nc)
	ldr	r12, [sp]
	b	L(com)
EPILOGUE()

C func: same operation with a zero carry-in.  Returns the carry (add) or
C borrow (subtract) out of the most significant limb in r0.
PROLOGUE(func)
	mov	r12, #0
L(com):	push	{ r4-r9 }

C Dispatch on n mod 4.  The mov does not set flags, so the beq right after it
C still tests the result of the ands.
	ands	r6, n, #3
	mov	n, n, lsr #2
	beq	L(r0)
	cmp	r6, #2
	bcc	L(r1)
	beq	L(r2)

C n mod 4 = 3: handle one single limb, preload a limb pair, join at L(tail).
L(r3):	ldr	r5, [up], #4
	ldr	r7, [vp], #4
	SETCY(r12)
	ADDSUBC	r9, r5, r7
	ldrd	r4, r5, [up]
	ldrd	r6, r7, [vp]
	str	r9, [rp], #-4
	b	L(tail)

C n mod 4 = 0: preload the first limb pair, bias the pointers for the loop
C offsets, join at L(half).
L(r0):	ldrd	r4, r5, [up], #-8
	ldrd	r6, r7, [vp], #-8
	SETCY(r12)
	sub	rp, rp, #16
	b	L(half)

C n mod 4 = 1: handle one single limb; if no 4-limb group remains we are done,
C otherwise preload a limb pair and join at L(half).
L(r1):	ldr	r5, [up], #-4
	ldr	r7, [vp], #-4
	SETCY(r12)
	ADDSUBC	r9, r5, r7
	str	r9, [rp], #-12
	tst	n, n
	beq	L(ret)
L(r1m):	ldrd	r4, r5, [up, #8]
	ldrd	r6, r7, [vp, #8]
	b	L(half)

C n mod 4 = 2: preload the odd limb pair, bias rp, join at L(tail).
L(r2):	ldrd	r4, r5, [up]
	ldrd	r6, r7, [vp]
	SETCY(r12)
	sub	rp, rp, #8
	b	L(tail)

C Main loop, four limb pairs (8 limbs) per full pass.  Each ldrd pair fetches
C the operands for the next ADDSUBC pair while strd retires the previous
C result from r8/r9.  The group counter is stepped with a plain sub and
C examined with tst so that the carry flag of the limb chain is left intact.
	ALIGN(16)
L(loop):	ldrd	r4, r5, [up, #8]
	ldrd	r6, r7, [vp, #8]
	strd	r8, r9, [rp, #8]
L(half):	ADDSUBC	r8, r4, r6
	ADDSUBC	r9, r5, r7
	ldrd	r4, r5, [up, #16]
	ldrd	r6, r7, [vp, #16]
	strd	r8, r9, [rp, #16]
	ADDSUBC	r8, r4, r6
	ADDSUBC	r9, r5, r7
	sub	n, n, #2
	tst	n, n
	bmi	L(fin2)			C only one group was left: exit mid-pass
	ldrd	r4, r5, [up, #24]
	ldrd	r6, r7, [vp, #24]
	strd	r8, r9, [rp, #24]
	ADDSUBC	r8, r4, r6
	ADDSUBC	r9, r5, r7
	ldrd	r4, r5, [up, #32]!	C writeback advances all three pointers
	ldrd	r6, r7, [vp, #32]!
	strd	r8, r9, [rp, #32]!
L(tail):	ADDSUBC	r8, r4, r6
	ADDSUBC	r9, r5, r7
	tst	n, n
	bne	L(loop)

C Exit with n = 0: store the last pair and return the carry/borrow.
L(fin):	strd	r8, r9, [rp, #8]
L(ret):	RETVAL
	pop	{ r4-r9 }
	bx	lr

C Exit from the middle of a pass with n = -1.
L(fin2):	strd	r8, r9, [rp, #24]
	RETVAL2
	pop	{ r4-r9 }
	bx	lr
EPILOGUE()