dnl  PowerPC-32/VMX and PowerPC-64/VMX mpn_copyi.

dnl  Copyright 2006 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C                     16-byte coaligned    unaligned
C                        cycles/limb      cycles/limb
C 7400,7410 (G4):            0.5             0.64
C 744x,745x (G4+):           0.75            0.82
C 970 (G5):                  0.78            1.02        (64-bit limbs)

C STATUS
C  * Works for all sizes and alignments.

C TODO
C  * Optimize unaligned case.  Some basic tests with 2-way and 4-way unrolling
C    indicate that we can reach 0.56 c/l for 7400, 0.75 c/l for 745x, and 0.80
C    c/l for 970.
C  * Consider using VMX instructions also for head and tail, by using some
C    read-modify-write tricks.
C  * The VMX code is used from the smallest sizes it handles, but measurements
C    show a large speed bump at the cutoff points.  Small copying (perhaps
C    using some read-modify-write technique) should be optimized.
C  * Make an mpn_com based on this code.

define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
define(`LIMBS_PER_VR',  eval(16/GMP_LIMB_BYTES))
define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES))


ifelse(GMP_LIMB_BITS,32,`
	define(`LIMB32',`	$1')
	define(`LIMB64',`')
',`
	define(`LIMB32',`')
	define(`LIMB64',`	$1')
')

C INPUT PARAMETERS
define(`rp',	`r3')
define(`up',	`r4')
define(`n',	`r5')

define(`us',	`v4')
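
C With 32-bit limbs the defines above evaluate to GMP_LIMB_BYTES = 4,
C LIMBS_PER_VR = 4 and LIMBS_PER_2VR = 8; with 64-bit limbs they are
C 8, 2 and 4.  That is, one 16-byte VMX register holds LIMBS_PER_VR
C limbs and each main-loop iteration moves LIMBS_PER_2VR limbs.
C
C For reference, mpn_copyi is an incrementing limb copy, equivalent to
C the following C sketch (an illustration only, not part of the build):
C
C	void
C	mpn_copyi (mp_ptr rp, mp_srcptr up, mp_size_t n)
C	{
C	  mp_size_t i;
C	  for (i = 0; i < n; i++)
C	    rp[i] = up[i];
C	}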

ASM_START()
PROLOGUE(mpn_copyi)

LIMB32(`cmpi	cr7, n, 11	')
LIMB64(`cmpdi	cr7, n, 5	')
	bge	cr7, L(big)

	or.	r0, n, n
	beqlr	cr0

C Handle small cases with plain operations
	mtctr	n
L(topS):
LIMB32(`lwz	r0, 0(up)	')
LIMB64(`ld	r0, 0(up)	')
	addi	up, up, GMP_LIMB_BYTES
LIMB32(`stw	r0, 0(rp)	')
LIMB64(`std	r0, 0(rp)	')
	addi	rp, rp, GMP_LIMB_BYTES
	bdnz	L(topS)
	blr

C Handle large cases with VMX operations
L(big):
	mfspr	r12, 256		C save caller's VRSAVE
	oris	r0, r12, 0xf800		C Set VRSAVE bits 0-4
	mtspr	256, r0

LIMB32(`rlwinm.	r7, rp, 30,30,31')	C (rp >> 2) mod 4
LIMB64(`rlwinm.	r7, rp, 29,31,31')	C (rp >> 3) mod 2
	beq	L(rp_aligned)

	subfic	r7, r7, LIMBS_PER_VR	C limbs needed to align rp
	subf	n, r7, n
L(top0):
LIMB32(`lwz	r0, 0(up)	')
LIMB64(`ld	r0, 0(up)	')
	addi	up, up, GMP_LIMB_BYTES
LIMB32(`addic.	r7, r7, -1	')
LIMB32(`stw	r0, 0(rp)	')
LIMB64(`std	r0, 0(rp)	')
	addi	rp, rp, GMP_LIMB_BYTES
LIMB32(`bne	L(top0)	')

L(rp_aligned):

LIMB32(`rlwinm.	r0, up, 30,30,31')	C (up >> 2) mod 4
LIMB64(`rlwinm.	r0, up, 29,31,31')	C (up >> 3) mod 2

LIMB32(`srwi	r7, n, 3	')	C loop count corresponding to n
LIMB64(`srdi	r7, n, 2	')	C loop count corresponding to n
	mtctr	r7			C copy loop count into ctr

	li	r10, 16

	beq	L(up_aligned)

	lvsl	us, 0, up		C permute mask for up's alignment

LIMB32(`andi.	r0, n, 0x4	')
LIMB64(`andi.	r0, n, 0x2	')
	beq	L(1)
	lvx	v0, 0, up
	lvx	v2, r10, up
	vperm	v3, v0, v2, us
	stvx	v3, 0, rp
	addi	up, up, 32
	addi	rp, rp, 16
	b	L(lpu)
L(1):	lvx	v2, 0, up
	addi	up, up, 16
	b	L(lpu)

	ALIGN(32)
L(lpu):	lvx	v0, 0, up
	vperm	v3, v2, v0, us
	stvx	v3, 0, rp
	lvx	v2, r10, up
	addi	up, up, 32
	vperm	v3, v0, v2, us
	stvx	v3, r10, rp
	addi	rp, rp, 32
	bdnz	L(lpu)

	addi	up, up, -16
	b	L(tail)

L(up_aligned):

LIMB32(`andi.	r0, n, 0x4	')
LIMB64(`andi.	r0, n, 0x2	')
	beq	L(lpa)
	lvx	v0, 0, up
	stvx	v0, 0, rp
	addi	up, up, 16
	addi	rp, rp, 16
	b	L(lpa)

	ALIGN(32)
L(lpa):	lvx	v0, 0, up
	lvx	v1, r10, up
	addi	up, up, 32
	nop
	stvx	v0, 0, rp
	stvx	v1, r10, rp
	addi	rp, rp, 32
	bdnz	L(lpa)

L(tail):
LIMB32(`rlwinm.	r7, n, 0,30,31	')	C r7 = n mod 4
LIMB64(`rlwinm.	r7, n, 0,31,31	')	C r7 = n mod 2
	beq	L(ret)
LIMB32(`li	r10, 0	')
L(top2):
LIMB32(`lwzx	r0, r10, up	')
LIMB64(`ld	r0, 0(up)	')
LIMB32(`addic.	r7, r7, -1	')
LIMB32(`stwx	r0, r10, rp	')
LIMB64(`std	r0, 0(rp)	')
LIMB32(`addi	r10, r10, GMP_LIMB_BYTES')
LIMB32(`bne	L(top2)	')

L(ret):	mtspr	256, r12		C restore caller's VRSAVE
	blr
EPILOGUE()
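
C Appendix (illustration only, not part of the build): the unaligned
C loop L(lpu) realigns with lvsl/vperm.  lvsl builds a permute mask
C whose byte i is (up & 15) + i, so vperm applied to two adjacent
C aligned loads selects the 16 bytes that start at the unaligned
C address.  Roughly, in C, each output vector is formed as
C
C	/* sh = (uintptr_t) up & 15; lo, hi = two adjacent aligned
C	   16-byte blocks; dst = the next 16 output bytes */
C	for (i = 0; i < 16; i++)
C	  dst[i] = i + sh < 16 ? lo[i + sh] : hi[i + sh - 16];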