dnl  ARM mpn_addmul_1 optimised for A15.

dnl  Copyright 2012, 2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb		best
C StrongARM:     -
C XScale	 ?
C Cortex-A7	 ?
C Cortex-A8	 ?
C Cortex-A9	 6			3.25
C Cortex-A15	 2			this

C This code uses umlal for adding in the rp[] data, keeping the recurrency path
C separate from any multiply instructions.  It performs well on A15, at umlal's
C bandwidth.
C
C An A9 variant should perhaps stick to 3-way unrolling, and use ldm and stm
C for all loads and stores.  Alternatively, it could do 2-way or 4-way, but
C then alignment aware code will be necessary (adding O(1) bookkeeping
C overhead).
C
C We don't use r12 due to ldrd and strd limitations.
C Architecture requirements:
C v5	 -
C v5t	 -
C v5te	 ldrd strd
C v6	 -
C v6t2	 -
C v7a	 -

C Incoming arguments (first four argument registers).
define(`rp', `r0')
define(`up', `r1')
define(`n',  `r2')
define(`v0', `r3')

C Even/odd register pairs used as ldrd destinations:
C   u0,u1  two consecutive limbs loaded from up[]
C   w0,w1  two consecutive limbs loaded from rp[], then overwritten by umlal
C          with the low word of  rp-limb + up-limb * v0
define(`w0', `r10') define(`w1', `r11')
define(`u0', `r8')  define(`u1', `r9')

C mpn_addmul_1
C
C Adds the n-limb product  {up,n} * v0  to  {rp,n}  in place and returns the
C carry-out limb.
C
C In:	rp (r0)  destination limbs, read and written
C	up (r1)  source limbs, read only
C	n  (r2)  limb count; must be at least 1, since every entry path below
C		 unconditionally loads at least one limb
C	v0 (r3)  multiplier limb
C Out:	r0	 most significant (carry) limb of the result
C Saves and restores r4-r11.  The C flag carries the addition chain and so
C stays live from the entry sequences through L(end).
C
C Method: each  umlal wX, rH, uX, v0  is preceded by  mov rH, #0, so it
C yields  rH:wX = uX * v0 + wX, i.e. the rp[] limb is folded in by the
C multiplier.  The high word rH of one limb is then added to the low word wX
C of the next limb by an adcs, and that sum is what gets stored.  The pairs
C r4,r5 and r6,r7 alternate as high-word/result registers so that they can be
C written back with strd.

ASM_START()
PROLOGUE(mpn_addmul_1)
	push	{ r4-r11 }

C Dispatch on n mod 4.  n is biased by -3 so that its sign, tested with
C "tst n, n", tells whether a (further) pass of the 4-limb loop is needed.
	ands	r6, n, #3
	sub	n, n, #3
	beq	L(b00)
	cmp	r6, #2
	bcc	L(b01)
	beq	L(b10)

C n mod 4 = 3: load one limb and step both pointers back by one limb so that
C the fixed offsets used from L(mid) onwards address the right limbs.  The
C cmp above left the carry set here, so it is cleared explicitly.
L(b11):	mov	r6, #0
	cmn	r13, #0			C carry clear
	ldr	u1, [up], #-4
	ldr	w1, [rp], #-4
	mov	r7, #0
	b	L(mid)

C n mod 4 = 0: finish and store limb 0 here; limb 1 (u1, w1) is completed at
C L(mid).  Pointers are left unchanged.
L(b00):	ldrd	u0, u1, [up]
	ldrd	w0, w1, [rp]
	mov	r6, #0
	umlal	w0, r6, u0, v0
	cmn	r13, #0			C carry clear
	mov	r7, #0
	str	w0, [rp]
	b	L(mid)

C n mod 4 = 2: store limb 0, start limb 1 in r5:w1 and advance both pointers
C by two limbs.  If n was exactly 2 the biased count is negative and only the
C tail at L(end) remains.
L(b10):	ldrd	u0, u1, [up], #8
	ldrd	w0, w1, [rp]
	mov	r4, #0
	umlal	w0, r4, u0, v0
	cmn	r13, #0			C carry clear
	mov	r5, #0
	str	w0, [rp], #8
	umlal	w1, r5, u1, v0
	tst	n, n
	bmi	L(end)
	b	L(top)

C n mod 4 = 1: start limb 0 in r5:w1 and advance both pointers by one limb.
C No cmn is needed: bcc is taken only with the carry clear, and none of the
C instructions below modifies it.
L(b01):	mov	r4, #0
	ldr	u1, [up], #4
	ldr	w1, [rp], #4
	mov	r5, #0
	umlal	w1, r5, u1, v0
	tst	n, n
	bmi	L(end)

C Main loop, four limbs per pass.  On entry at L(top) r5:w1 holds the pending
C product limb and r4 the high word of the limb before it.  The loop control
C (sub, add, tst) leaves the C flag untouched so the adcs chain continues
C across passes.  The trailing "C a b" notes are the original author's
C annotations on the four umlal instructions.
	ALIGN(16)
L(top):	ldrd	u0, u1, [up, #0]
	adcs	r4, r4, w1
	ldrd	w0, w1, [rp, #0]
	mov	r6, #0
	umlal	w0, r6, u0, v0		C 1 2
	adcs	r5, r5, w0
	mov	r7, #0
	strd	r4, r5, [rp, #-4]
L(mid):	umlal	w1, r7, u1, v0		C 2 3
	ldrd	u0, u1, [up, #8]
	adcs	r6, r6, w1
	ldrd	w0, w1, [rp, #8]
	mov	r4, #0
	umlal	w0, r4, u0, v0		C 3 4
	adcs	r7, r7, w0
	mov	r5, #0
	strd	r6, r7, [rp, #4]
	umlal	w1, r5, u1, v0		C 0 1
	sub	n, n, #4
	add	up, up, #16
	add	rp, rp, #16
	tst	n, n
	bpl	L(top)

C Tail: complete and store the last limb, then return its high word plus the
C final carry.
L(end):	adcs	r4, r4, w1
	str	r4, [rp, #-4]
	adc	r0, r5, #0
	pop	{ r4-r11 }
	bx	r14
EPILOGUE()