1dnl PowerPC-32/VMX and PowerPC-64/VMX mpn_copyd. 2 3dnl Copyright 2006 Free Software Foundation, Inc. 4 5dnl This file is part of the GNU MP Library. 6dnl 7dnl The GNU MP Library is free software; you can redistribute it and/or modify 8dnl it under the terms of either: 9dnl 10dnl * the GNU Lesser General Public License as published by the Free 11dnl Software Foundation; either version 3 of the License, or (at your 12dnl option) any later version. 13dnl 14dnl or 15dnl 16dnl * the GNU General Public License as published by the Free Software 17dnl Foundation; either version 2 of the License, or (at your option) any 18dnl later version. 19dnl 20dnl or both in parallel, as here. 21dnl 22dnl The GNU MP Library is distributed in the hope that it will be useful, but 23dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25dnl for more details. 26dnl 27dnl You should have received copies of the GNU General Public License and the 28dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29dnl see https://www.gnu.org/licenses/. 30 31include(`../config.m4') 32 33C 16-byte coaligned unaligned 34C cycles/limb cycles/limb 35C 7400,7410 (G4): 0.5 0.64 36C 744x,745x (G4+): 0.75 0.82 37C 970 (G5): 0.78 1.02 (64-bit limbs) 38 39C STATUS 40C * Works for all sizes and alignments. 41 42C TODO 43C * Optimize unaligned case. Some basic tests with 2-way and 4-way unrolling 44C indicate that we can reach 0.56 c/l for 7400, 0.75 c/l for 745x, and 0.80 45C c/l for 970. 46C * Consider using VMX instructions also for head and tail, by using some 47C read-modify-write tricks. 48C * The VMX code is used from the smallest sizes it handles, but measurements 49C show a large speed bump at the cutoff points. Small copying (perhaps 50C using some read-modify-write technique) should be optimized. 51C * Make an mpn_com based on this code. 52 53define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8)) 54define(`LIMBS_PER_VR', eval(16/GMP_LIMB_BYTES)) 55define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES)) 56 57 58ifelse(GMP_LIMB_BITS,32,` 59 define(`LIMB32',` $1') 60 define(`LIMB64',`') 61',` 62 define(`LIMB32',`') 63 define(`LIMB64',` $1') 64') 65 66C INPUT PARAMETERS 67define(`rp', `r3') 68define(`up', `r4') 69define(`n', `r5') 70 71define(`us', `v4') 72 73 74ASM_START() 75PROLOGUE(mpn_copyd) 76 77LIMB32(`slwi. r0, n, 2 ') 78LIMB64(`sldi. r0, n, 3 ') 79 add rp, rp, r0 80 add up, up, r0 81 82LIMB32(`cmpi cr7, n, 11 ') 83LIMB64(`cmpdi cr7, n, 5 ') 84 bge cr7, L(big) 85 86 beqlr cr0 87 88C Handle small cases with plain operations 89 mtctr n 90L(topS): 91LIMB32(`lwz r0, -4(up) ') 92LIMB64(`ld r0, -8(up) ') 93 addi up, up, -GMP_LIMB_BYTES 94LIMB32(`stw r0, -4(rp) ') 95LIMB64(`std r0, -8(rp) ') 96 addi rp, rp, -GMP_LIMB_BYTES 97 bdnz L(topS) 98 blr 99 100C Handle large cases with VMX operations 101L(big): 102 addi rp, rp, -16 103 addi up, up, -16 104 mfspr r12, 256 105 oris r0, r12, 0xf800 C Set VRSAVE bit 0-4 106 mtspr 256, r0 107 108LIMB32(`rlwinm. r7, rp, 30,30,31') C (rp >> 2) mod 4 109LIMB64(`rlwinm. r7, rp, 29,31,31') C (rp >> 3) mod 2 110 beq L(rp_aligned) 111 112 subf n, r7, n 113L(top0): 114LIMB32(`lwz r0, 12(up) ') 115LIMB64(`ld r0, 8(up) ') 116 addi up, up, -GMP_LIMB_BYTES 117LIMB32(`addic. r7, r7, -1 ') 118LIMB32(`stw r0, 12(rp) ') 119LIMB64(`std r0, 8(rp) ') 120 addi rp, rp, -GMP_LIMB_BYTES 121LIMB32(`bne L(top0) ') 122 123L(rp_aligned): 124 125LIMB32(`rlwinm. r0, up, 30,30,31') C (up >> 2) mod 4 126LIMB64(`rlwinm. r0, up, 29,31,31') C (up >> 3) mod 2 127 128LIMB64(`srdi r7, n, 2 ') C loop count corresponding to n 129LIMB32(`srwi r7, n, 3 ') C loop count corresponding to n 130 mtctr r7 C copy n to count register 131 132 li r10, -16 133 134 beq L(up_aligned) 135 136 lvsl us, 0, up 137 138 addi up, up, 16 139LIMB32(`andi. r0, n, 0x4 ') 140LIMB64(`andi. r0, n, 0x2 ') 141 beq L(1) 142 lvx v0, 0, up 143 lvx v2, r10, up 144 vperm v3, v2, v0, us 145 stvx v3, 0, rp 146 addi up, up, -32 147 addi rp, rp, -16 148 b L(lpu) 149L(1): lvx v2, 0, up 150 addi up, up, -16 151 b L(lpu) 152 153 ALIGN(32) 154L(lpu): lvx v0, 0, up 155 vperm v3, v0, v2, us 156 stvx v3, 0, rp 157 lvx v2, r10, up 158 addi up, up, -32 159 vperm v3, v2, v0, us 160 stvx v3, r10, rp 161 addi rp, rp, -32 162 bdnz L(lpu) 163 164 b L(tail) 165 166L(up_aligned): 167 168LIMB32(`andi. r0, n, 0x4 ') 169LIMB64(`andi. r0, n, 0x2 ') 170 beq L(lpa) 171 lvx v0, 0, up 172 stvx v0, 0, rp 173 addi up, up, -16 174 addi rp, rp, -16 175 b L(lpa) 176 177 ALIGN(32) 178L(lpa): lvx v0, 0, up 179 lvx v1, r10, up 180 addi up, up, -32 181 nop 182 stvx v0, 0, rp 183 stvx v1, r10, rp 184 addi rp, rp, -32 185 bdnz L(lpa) 186 187L(tail): 188LIMB32(`rlwinm. r7, n, 0,30,31 ') C r7 = n mod 4 189LIMB64(`rlwinm. r7, n, 0,31,31 ') C r7 = n mod 2 190 beq L(ret) 191LIMB32(`li r10, 12 ') 192L(top2): 193LIMB32(`lwzx r0, r10, up ') 194LIMB64(`ld r0, 8(up) ') 195LIMB32(`addic. r7, r7, -1 ') 196LIMB32(`stwx r0, r10, rp ') 197LIMB64(`std r0, 8(rp) ') 198LIMB32(`addi r10, r10, -GMP_LIMB_BYTES') 199LIMB32(`bne L(top2) ') 200 201L(ret): mtspr 256, r12 202 blr 203EPILOGUE() 204