dnl  Power9 mpn_addmul_2.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2018 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C		cycles/limb
C power9:	    1.62

C STATUS
C  * Not written with any power9 pipeline understanding.
C  * The 4x unrolling was not motivated by any timing tests.
C  * No local scheduling for performance tweaking has been done.
C  * Decrease load scheduling!
43 44define(`rp', `r3') 45define(`up', `r4') 46define(`n', `r5') C Note: Reused as scratch 47define(`vp', `r6') C Note: Reused for v1 48 49define(`v0', `r7') 50define(`v1', `r6') 51 52 53ASM_START() 54PROLOGUE(mpn_addmul_2) 55 std r26, -48(r1) 56 std r27, -40(r1) 57 std r28, -32(r1) 58 std r29, -24(r1) 59 std r30, -16(r1) 60 std r31, -8(r1) 61 62 subfic r0, r1, 0 C clear CA 63 subfo r0, r0, r0 C clear OV and r0 64 65 cmpdi cr7, n, 4 66 67 ld v0, 0(vp) 68 ld v1, 8(vp) 69 70 srdi r10, n, 2 71 mtctr r10 72 73 rldicl. r9, n, 0, 63 74 bne cr0, L(bx1) 75 76L(bx0): rldicl. r9, n, 63, 63 77 78 ld r28, 0(rp) 79 ld r8, 0(up) 80 ld r11, 8(rp) 81 ld r9, 8(up) 82 maddld( r26, r8, v0, r28) 83 maddhdu(r31, r8, v0, r28) 84 blt cr7, L(2) 85 ld r28, 16(rp) 86 mulld r5, r8, v1 87 mulhdu r10, r8, v1 88 bne cr0, L(b10) 89 90L(b00): addi up, up, -8 91 addi rp, rp, -24 92 b L(lo0) 93 94L(b10): addi up, up, 8 95 addi rp, rp, -8 96 b L(lo2) 97 98L(2): addi rp, rp, -8 99 mulld r5, r8, v1 100 mulhdu r10, r8, v1 101 b L(cj2) 102 103L(bx1): rldicl. 
r9, n, 63, 63 104 105 ld r29, 0(rp) 106 ld r9, 0(up) 107 ld r10, 8(rp) 108 ld r8, 8(up) 109 maddld( r27, r9, v0, r29) 110 maddhdu(r30, r9, v0, r29) 111 ld r29, 16(rp) 112 mulld r12, r9, v1 113 mulhdu r11, r9, v1 114 bne cr0, L(b11) 115 116L(b01): addi rp, rp, -16 117 b L(lo1) 118L(b11): addi up, up, 16 119 blt cr7, L(end) 120 121L(top): ld r9, 0(up) 122 maddld( r26, r8, v0, r10) C 0 4 -> adde 123 maddhdu(r31, r8, v0, r10) C 1 5 124 adde r0, r27, r0 C 7 11 125 ld r28, 24(rp) 126 std r0, 0(rp) 127 maddld( r5, r8, v1, r29) C 1 5 -> addex 128 maddhdu(r10, r8, v1, r29) C 2 6 129 addex( r0, r12, r30, 0) C 8 12 130L(lo2): ld r8, 8(up) 131 maddld( r27, r9, v0, r11) C 1 5 -> adde 132 maddhdu(r30, r9, v0, r11) C 2 6 133 adde r0, r26, r0 C 8 12 134 ld r29, 32(rp) 135 std r0, 8(rp) 136 maddld( r12, r9, v1, r28) C 2 6 -> addex 137 maddhdu(r11, r9, v1, r28) C 3 7 138 addex( r0, r5, r31, 0) C 5 9 13 139L(lo1): ld r9, 16(up) 140 maddld( r26, r8, v0, r10) C 2 6 -> adde 141 maddhdu(r31, r8, v0, r10) C 3 7 142 adde r0, r27, r0 C 5 9 13 143 ld r28, 40(rp) 144 std r0, 16(rp) 145 maddld( r5, r8, v1, r29) C 3 7 -> addex 146 maddhdu(r10, r8, v1, r29) C 4 8 147 addex( r0, r12, r30, 0) C 6 10 148L(lo0): ld r8, 24(up) 149 maddld( r27, r9, v0, r11) C 3 7 -> adde 150 maddhdu(r30, r9, v0, r11) C 4 8 151 adde r0, r26, r0 C 6 10 152 ld r29, 48(rp) 153 std r0, 24(rp) 154 maddld( r12, r9, v1, r28) C 4 8 -> addex 155 maddhdu(r11, r9, v1, r28) C 5 9 156 addex( r0, r5, r31, 0) C 7 11 157 addi up, up, 32 158 addi rp, rp, 32 159 bdnz L(top) 160 161L(end): ld r9, 0(up) 162 maddld( r26, r8, v0, r10) C 0 4 163 maddhdu(r31, r8, v0, r10) C 1 5 164 adde r0, r27, r0 C 7 11 165 std r0, 0(rp) C -4 166 maddld( r5, r8, v1, r29) C 1 5 167 maddhdu(r10, r8, v1, r29) C 2 6 168 addex( r0, r12, r30, 0) C 8 12 169L(cj2): maddld( r27, r9, v0, r11) C 1 5 -2 170 maddhdu(r30, r9, v0, r11) C 2 6 -1 171 adde r0, r26, r0 C 8 12 -3 172 std r0, 8(rp) C -3 173 mulld r12, r9, v1 C 2 6 -1 174 mulhdu r11, r9, v1 C 3 7 0 = return limb 
175 addex( r0, r5, r31, 0) C 5 9 13 176 adde r0, r27, r0 C 5 9 13 -2 177 std r0, 16(rp) C -2 178 addex( r0, r12, r30, 0) C 6 10 -1 179 adde r0, r0, r10 C -1 180 std r0, 24(rp) C -1 181 li r4, 0 182 addze r3, r11 183 addex( r3, r3, r4, 0) 184 185L(ret): ld r26, -48(r1) 186 ld r27, -40(r1) 187 ld r28, -32(r1) 188 ld r29, -24(r1) 189 ld r30, -16(r1) 190 ld r31, -8(r1) 191 blr 192EPILOGUE() 193ASM_END() 194