dnl  PowerPC-32 mpn_mod_34lsub1 -- mpn remainder mod 2^24-1.

dnl  Copyright 2002, 2003, 2005 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C                   cycles/limb
C 603e:                 ?
C 604e:                 3
C 75x (G3):             3
C 7400,7410 (G4):       3
C 744x,745x (G4+):      3
C power4/ppc970:        2.5
C power5:               2.5

C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
C
C There seems to be no need to schedule the loads back, the code is still
C 3.0 c/l on 750/7400 no matter where they're placed.
C
C Alternatives:
C
C Fetching halfwords would allow a plain add for accumulating, instead of
C adde and its serialization.  An outer loop would be required though, since
C 2^16 halfwords can overflow.  lhz+add would be 2.0 c/l, but if there's
C also a bdz or bdnz for each and a pointer update say every three limbs
C then the total would be 2.67 c/l which isn't much faster than the current
C simpler code.
ASM_START()

C mpn_mod_34lsub1 (src, size), 32-bit limbs.
C
C Adds up the limbs at src in three interleaved accumulators and folds them
C into one word that is congruent to the operand mod 2^24-1.  The result is
C only congruent, not fully reduced: it is a plain sum of fields of at most
C 24 bits each, left in r3 at the blr.
C
C Three limbs are 96 bits, a multiple of 24, so 2^96 == 1 mod 2^24-1 and
C limb i can be added into accumulator number i mod 3.  Relative to bit 0
C the accumulators have weights 2^0, 2^32 == 2^8 and 2^64 == 2^16.  The
C carry out of each adde is the carry in to the next one, wrapping from acc2
C round to acc0, so only the carry still pending when the loop exits needs
C separate treatment.
C
C src[0] is loaded before any count test, so size is presumably required to
C be at least 1 (to be confirmed against the callers).

PROLOGUE(mpn_mod_34lsub1)

	C r3	src
	C r4	size

	mtctr	r4			C CTR = limbs still to be counted off
	addic	r6, r3, 8		C &src[2], and clear CA

	lwz	r3, 0(r3)		C acc0 = src[0]
	bdz	L(done)			C size == 1, give back src[0] as is

	lwz	r4, -4(r6)		C acc1 = src[1]
	bdz	L(two)			C size == 2, fold acc0 and acc1 only

	lwz	r5, 0(r6)		C acc2 = src[2]
	lis	r7, 0			C no carry if just three limbs

	bdz	L(three)		C size == 3, fold with r7 = 0
	lis	r7, 1			C 0x10000 carry pos

L(top):
	C r3	acc0
	C r4	acc1
	C r5	acc2
	C r6	src, incrementing
	C r7	carry pos
	C
	C CTR reaches zero on the last limb, so the loop is left right after
	C whichever adde consumed that limb, with its carry out still in CA.

	lwz	r0, 4(r6)
	adde	r3, r3, r0		C acc0 += limb + CA
	bdz	L(end0)

	lwz	r0, 8(r6)
	adde	r4, r4, r0		C acc1 += limb + CA
	bdz	L(end1)

	lwzu	r0, 12(r6)		C load, and step r6 on by three limbs
	adde	r5, r5, r0		C acc2 += limb + CA
	bdnz	L(top)


	C r7 is adjusted to the weight of the pending carry.  A carry out of
	C acc2 is worth 2^96 == 1 (both srwi executed), out of acc0 it is
	C 2^32 == 2^8 (one srwi), out of acc1 it is 2^64 == 2^16 (no srwi).

	srwi	r7, r7, 8
L(end0):
	srwi	r7, r7, 8
L(end1):
	subfe	r0, r0, r0		C -1 if not CA, 0 if CA

	andc	r7, r7, r0		C final carry, 0x10000, 0x100, 1 or 0
L(three):
	C r7	pending carry term, sum is built up here
	C r6	scratch for each extracted field
	C
	C Each accumulator is split where its weight crosses 2^24, the part
	C above that point wraps round to bit 0.

	rlwinm	r6, r3, 0,8,31		C acc0 low 24 bits, at bit 0

	add	r7, r7, r6
	rlwinm	r6, r3, 8,24,31		C acc0 high 8 bits, wrapped to bit 0

	add	r7, r7, r6
	rlwinm	r6, r4, 8,8,23		C acc1 low 16 bits, at bit 8

	add	r7, r7, r6
	rlwinm	r6, r4, 16,16,31	C acc1 high 16 bits, wrapped to bit 0

	add	r7, r7, r6
	rlwinm	r6, r5, 16,8,15		C acc2 low 8 bits, at bit 16

	add	r7, r7, r6
	rlwinm	r6, r5, 24,8,31		C acc2 high 24 bits, wrapped to bit 0

	add	r3, r7, r6		C result in r3

L(done):
	blr

L(two):
	C r3	acc0
	C r4	acc1
	C
	C Same folding as above for just two limbs, no carry term is needed
	C since no adde has been executed on this path.

	rlwinm	r5, r3, 8,24,31		C acc0 high
	rlwinm	r3, r3, 0,8,31		C acc0 low

	add	r3, r3, r5		C acc0 high + low
	rlwinm	r5, r4, 16,16,31	C acc1 high

	add	r3, r3, r5		C add acc1 high
	rlwinm	r5, r4, 8,8,23		C acc1 low

	add	r3, r3, r5		C add acc1 low

	blr

EPILOGUE()