dnl Alpha mpn_com -- mpn one's complement.

dnl Copyright 2003 Free Software Foundation, Inc.
dnl
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or
dnl modify it under the terms of the GNU Lesser General Public License as
dnl published by the Free Software Foundation; either version 3 of the
dnl License, or (at your option) any later version.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful,
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
dnl Lesser General Public License for more details.
dnl
dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C      cycles/limb
C EV4:    4.75
C EV5:    2.0
C EV6:    1.5


C mp_limb_t mpn_com (mp_ptr dst, mp_srcptr src, mp_size_t size);
C
C For ev5 the main loop is 7 cycles plus 1 taken branch bubble, for a total
C 2.0 c/l. In general, a pattern like this unrolled to N limbs per loop
C will be 1.5+2/N c/l.
C
C 2 cycles of loop control are unavoidable, for pointer updates and the
C taken branch bubble, but also since ldq cannot issue two cycles after stq
C (and with a run of stqs that means neither of two cycles at the end of the
C loop).
C
C The fbeq is forced into the second cycle of the loop using unops, since
C the first time through it must wait for the cvtqt result. Once that
C result is ready (a 1 cycle stall) then both the branch and following loads
C can issue together.
C
C The main loop handles an odd count of limbs, being two limbs loaded before
C each size test, plus one pipelined around from the previous iteration (or
C setup in the entry sequence).
C
C An even number of limbs is handled by an explicit dst[0]=~src[0] in the
C entry sequence, and an increment of the pointers. For an odd size there's
C no increment and the first store in the loop (r24) is a repeat of dst[0].
C
C Note that the load for r24 after the possible pointer increment is done
C before the explicit store to dst[0], in case src==dst.


ASM_START()

C The constant 2.0 as a double.  It is loaded into f1 in the entry sequence
C and subtracted from the f0 loop count once per pass of the main loop.
FLOAT64(L(dat), 2.0)

	ALIGN(16)

C mpn_com stores the bitwise complement of each of the size limbs at src
C into the corresponding limb at dst.  The loop count lives in a floating
C point register (f0), so loop control is done with subt, fbeq and fbge.
C
C src[0] is loaded and dst[0] is stored unconditionally, so this assumes
C size >= 1 -- TODO confirm that callers guarantee it.
C
C NOTE(review): no instruction in this routine writes r0, so no return
C value is produced here -- confirm the intended prototype against gmp.h.
PROLOGUE(mpn_com,gp)

	C r16	dst
	C r17	src
	C r18	size

	lda	r30, -16(r30)		C temporary stack space
	lda	r7, -3(r18)		C size - 3

	C srl is a logical shift, so r6 is not a meaningful count when
	C size-3 is negative.  In that case the blt below goes to done_1,
	C which never tests f0.
	ldq	r20, 0(r17)		C src[0]
	srl	r7, 1, r6		C (size-3)/2

	C The integer count reaches f0 through the stack slot: stq here,
	C ldt f0 further down.
	stq	r6, 8(r30)		C (size-3)/2
	and	r7, 1, r5		C 1 if size even

	C r5 is 0 or 1, so each s8addq adds either 0 or one 8-byte limb.
	LEA(	r8, L(dat))
	s8addq	r5, r17, r17		C skip src[0] if even

	ornot	r31, r20, r20		C ~src[0]
	unop

	ldt	f0, 8(r30)		C (size-3)/2
	ldq	r24, 0(r17)		C src[0 or 1]

	C This store comes after the r24 load above so that an overlapping
	C src==dst still sees the original source limb.
	stq	r20, 0(r16)		C dst[0]
	s8addq	r5, r16, r19		C skip dst[0] if even

	ldt	f1, 0(r8)		C data 2.0
	lda	r30, 16(r30)		C restore stack
	unop
	cvtqt	f0, f0			C (size-3)/2 as float

	C r24 becomes the first pipelined result, ~src[0 or 1].  The two
	C unops after the branch are padding ahead of the aligned loop head.
	ornot	r31, r24, r24
	blt	r7, L(done_1)		C if size<=2
	unop
	unop


	C 16-byte alignment here
L(top):
	C r17	src, incrementing
	C r19	dst, incrementing
	C r24	dst[i] result, ready to store
	C f0	(size-3)/2, decrementing
	C f1	2.0

	ldq	r20, 8(r17)		C src[i+1]
	ldq	r21, 16(r17)		C src[i+2]
	unop
	unop

	C Count zero means r24 and the two limbs just loaded are all that
	C is left, so finish in done_2.  The src[i+3] and src[i+4] loads sit
	C after the fbeq and so are not executed on that final pass.
	fbeq	f0, L(done_2)
	unop
	ldq	r22, 24(r17)		C src[i+3]
	ldq	r23, 32(r17)		C src[i+4]

	stq	r24, 0(r19)		C dst[i]
	ornot	r31, r20, r20
	subt	f0, f1, f0		C count -= 2
	unop

	stq	r20, 8(r19)		C dst[i+1]
	ornot	r31, r21, r21
	unop
	unop

	stq	r21, 16(r19)		C dst[i+2]
	ornot	r31, r22, r22

	C ~src[i+4] goes into r24 rather than being stored now.  After the
	C pointer updates it is the dst[i] result for the next pass, or for
	C done_1.
	stq	r22, 24(r19)		C dst[i+3]
	ornot	r31, r23, r24

	C Loop again while the count is still >= 0.  A negative count falls
	C through to done_1 with one result pending in r24.
	lda	r17, 32(r17)		C src += 4
	lda	r19, 32(r19)		C dst += 4
	unop
	fbge	f0, L(top)


	C Reached by falling out of the loop above, and directly from the
	C entry sequence blt when size<=2.
L(done_1):
	C r19	&dst[size-1]
	C r24	result for dst[size-1]

	stq	r24, 0(r19)		C dst[size-1]
	ret	r31, (r26), 1


	C Reached only from the fbeq in the loop, with three limbs left:
	C one complemented already in r24, two loaded but not yet complemented.
L(done_2):
	C r19	&dst[size-3]
	C r20	src[size-2]
	C r21	src[size-1]
	C r24	result for dst[size-3]

	stq	r24, 0(r19)		C dst[size-3]
	ornot	r31, r20, r20

	stq	r20, 8(r19)		C dst[size-2]
	ornot	r31, r21, r21

	stq	r21, 16(r19)		C dst[size-1]
	ret	r31, (r26), 1

EPILOGUE()
ASM_END()