dnl  PowerPC-32/VMX and PowerPC-64/VMX mpn_copyd.

dnl  Copyright 2006 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C                  16-byte coaligned   unaligned
C                     cycles/limb     cycles/limb
C 7400,7410 (G4):        0.5             0.64
C 744x,745x (G4+):       0.75            0.82
C 970 (G5):              0.78            1.02       (64-bit limbs)

C STATUS
C  * Works for all sizes and alignments.

C TODO
C  * Optimize unaligned case.  Some basic tests with 2-way and 4-way unrolling
C    indicate that we can reach 0.56 c/l for 7400, 0.75 c/l for 745x, and 0.80
C    c/l for 970.
C  * Consider using VMX instructions also for head and tail, by using some
C    read-modify-write tricks.
C  * The VMX code is used from the smallest sizes it handles, but measurements
C    show a large speed bump at the cutoff points.  Small copying (perhaps
C    using some read-modify-write technique) should be optimized.
C  * Make an mpn_com based on this code.

define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
define(`LIMBS_PER_VR',  eval(16/GMP_LIMB_BYTES))
define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES))


ifelse(GMP_LIMB_BITS,32,`
	define(`LIMB32',`	$1')
	define(`LIMB64',`')
',`
	define(`LIMB32',`')
	define(`LIMB64',`	$1')
')

C INPUT PARAMETERS
define(`rp',	`r3')
define(`up',	`r4')
define(`n',	`r5')

define(`us',	`v4')


ASM_START()
PROLOGUE(mpn_copyd)

C Scale n to a byte count (also testing n == 0), then point rp and up
C just past their operands, since we copy from high addresses downwards.
LIMB32(`slwi.	r0, n, 2	')
LIMB64(`sldi.	r0, n, 3	')
	add	rp, rp, r0
	add	up, up, r0

LIMB32(`cmpi	cr7, n, 11	')
LIMB64(`cmpdi	cr7, n, 5	')
	bge	cr7, L(big)		C use VMX code for large operands

	beqlr	cr0			C return if n == 0

C Handle small cases with plain operations
	mtctr	n
L(topS):
LIMB32(`lwz	r0, -4(up)	')
LIMB64(`ld	r0, -8(up)	')
	addi	up, up, -GMP_LIMB_BYTES
LIMB32(`stw	r0, -4(rp)	')
LIMB64(`std	r0, -8(rp)	')
	addi	rp, rp, -GMP_LIMB_BYTES
	bdnz	L(topS)
	blr

C Handle large cases with VMX operations
L(big):
	addi	rp, rp, -16
	addi	up, up, -16
	mfspr	r12, 256		C save caller's VRSAVE
	oris	r0, r12, 0xf800		C set VRSAVE bits 0-4
	mtspr	256, r0

LIMB32(`rlwinm.	r7, rp, 30,30,31')	C (rp >> 2) mod 4
LIMB64(`rlwinm.	r7, rp, 29,31,31')	C (rp >> 3) mod 2
	beq	L(rp_aligned)

	subf	n, r7, n
L(top0):
LIMB32(`lwz	r0, 12(up)	')
LIMB64(`ld	r0, 8(up)	')
	addi	up, up, -GMP_LIMB_BYTES
LIMB32(`addic.	r7, r7, -1	')
LIMB32(`stw	r0, 12(rp)	')
LIMB64(`std	r0, 8(rp)	')
	addi	rp, rp, -GMP_LIMB_BYTES
LIMB32(`bne	L(top0)	')

L(rp_aligned):
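
C The code below dispatches on whether up shares rp's 16-byte alignment.  If
C it does, the copy proceeds with plain lvx/stvx pairs (L(lpa)).  Otherwise
C it uses the standard Altivec realignment idiom (L(lpu)): lvsl derives a
C permute mask from the low address bits of up, and vperm splices each pair
C of adjacent aligned source vectors into one 16-byte value for an aligned
C store.  Roughly, in C-style pseudocode (a sketch of the idiom only; the
C loops below vary the operand order as they walk downwards):
C
C	mask = lvsl (0, up);           /* mask encodes up mod 16 */
C	v3   = vperm (v0, v2, mask);   /* 16 bytes spanning v0 and v2 */
C	stvx (v3, 0, rp);              /* rp is 16-byte aligned here */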

LIMB32(`rlwinm.	r0, up, 30,30,31')	C (up >> 2) mod 4
LIMB64(`rlwinm.	r0, up, 29,31,31')	C (up >> 3) mod 2

LIMB64(`srdi	r7, n, 2	')	C loop count corresponding to n
LIMB32(`srwi	r7, n, 3	')	C loop count corresponding to n
	mtctr	r7			C copy n to count register

	li	r10, -16		C index for the second vector of each pair

	beq	L(up_aligned)

	lvsl	us, 0, up		C permute mask from up's misalignment

	addi	up, up, 16
LIMB32(`andi.	r0, n, 0x4	')
LIMB64(`andi.	r0, n, 0x2	')
	beq	L(1)
	lvx	v0, 0, up
	lvx	v2, r10, up
	vperm	v3, v2, v0, us
	stvx	v3, 0, rp
	addi	up, up, -32
	addi	rp, rp, -16
	b	L(lpu)
L(1):	lvx	v2, 0, up
	addi	up, up, -16
	b	L(lpu)

	ALIGN(32)
L(lpu):	lvx	v0, 0, up
	vperm	v3, v0, v2, us
	stvx	v3, 0, rp
	lvx	v2, r10, up
	addi	up, up, -32
	vperm	v3, v2, v0, us
	stvx	v3, r10, rp
	addi	rp, rp, -32
	bdnz	L(lpu)

	b	L(tail)

L(up_aligned):

LIMB32(`andi.	r0, n, 0x4	')
LIMB64(`andi.	r0, n, 0x2	')
	beq	L(lpa)
	lvx	v0, 0, up
	stvx	v0, 0, rp
	addi	up, up, -16
	addi	rp, rp, -16
	b	L(lpa)

	ALIGN(32)
L(lpa):	lvx	v0, 0, up
	lvx	v1, r10, up
	addi	up, up, -32
	nop
	stvx	v0, 0, rp
	stvx	v1, r10, rp
	addi	rp, rp, -32
	bdnz	L(lpa)

L(tail):
LIMB32(`rlwinm.	r7, n, 0,30,31	')	C r7 = n mod 4
LIMB64(`rlwinm.	r7, n, 0,31,31	')	C r7 = n mod 2
	beq	L(ret)
LIMB32(`li	r10, 12		')
L(top2):
LIMB32(`lwzx	r0, r10, up	')
LIMB64(`ld	r0, 8(up)	')
LIMB32(`addic.	r7, r7, -1	')
LIMB32(`stwx	r0, r10, rp	')
LIMB64(`std	r0, 8(rp)	')
LIMB32(`addi	r10, r10, -GMP_LIMB_BYTES')
LIMB32(`bne	L(top2)	')

L(ret):	mtspr	256, r12	C restore caller's VRSAVE
	blr
EPILOGUE()
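
C For reference, the function implemented above behaves like the following C
C sketch.  mpn_copyd copies decreasingly (highest limb first), which is what
C makes overlapping operands with rp >= up come out right:
C
C	void
C	mpn_copyd (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n)
C	{
C	  mp_size_t i;
C	  for (i = n - 1; i >= 0; i--)
C	    rp[i] = up[i];
C	}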