add_n.s revision 1.1.1.1
; mc88100 mpn_add_n -- Add two limb vectors of the same length > 0 and store
; sum in a third limb vector.

; Copyright 1992, 1994, 1995, 2000 Free Software Foundation, Inc.

; This file is part of the GNU MP Library.

; The GNU MP Library is free software; you can redistribute it and/or modify
; it under the terms of the GNU Lesser General Public License as published by
; the Free Software Foundation; either version 3 of the License, or (at your
; option) any later version.

; The GNU MP Library is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
; License for more details.

; You should have received a copy of the GNU Lesser General Public License
; along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.


; INPUT PARAMETERS
; res_ptr	r2
; s1_ptr	r3
; s2_ptr	r4
; size		r5

; This code has been optimized to run one instruction per clock, avoiding
; load stalls and writeback contention.  As a result, the instruction
; order is not always natural.

; The speed is about 4.6 clocks/limb + 18 clocks/limb-vector on an 88100,
; but on the 88110, it seems to run much slower, 6.6 clocks/limb.
; ___gmpn_add_n -- add two limb vectors of equal length, store the sum in a
; third vector, and return the final carry.
;
; In:    r2 = res_ptr, r3 = s1_ptr, r4 = s2_ptr, r5 = size (limb count, > 0)
; Out:   r2 = carry-out from the most significant limb (0 or 1)
; Uses:  r5-r10 and r12 as scratch; r2-r4 are advanced; returns through r1.
;
; Structure: the loop body is unrolled 8 times.  Each limb occupies one
; 4-instruction (16-byte) slot, and the slots alternate between the register
; pairs r6/r7 and r8/r9.  To handle sizes that are not a multiple of 8, the
; code jumps into the body at Lbase + 16*((-size) & 7) and biases the three
; pointers backwards by 4*((-size) & 7) bytes, so that the fixed displacements
; used by the entered slot address limb 0.  Because the entry point depends on
; instruction offsets from Lbase, no instruction may be added, removed or
; moved between Lbase and the final bcnd.n.
;
; The carry chain is held in the processor carry bit: only the .co/.cio/.ci
; forms below write or read it; the plain addu/subu pointer and counter
; updates are relied on to leave it untouched.

	text
	align	 16
	global	 ___gmpn_add_n
___gmpn_add_n:
	ld	r6,r3,0			; read first limb from s1_ptr
	extu	r10,r5,3		; r10 = size >> 3 (count of 8-limb passes)
	ld	r7,r4,0			; read first limb from s2_ptr

	subu.co	r5,r0,r5		; r5 = -size (clear carry as side effect)
	mak	r5,r5,3<4>		; r5 = ((-size) & 7) << 4: byte offset of entry slot
	bcnd	eq0,r5,Lzero		; size is a multiple of 8: start at the top slot

	or	r12,r0,lo16(Lbase)	; r12 = address of Lbase, built from its
	or.u	r12,r12,hi16(Lbase)	; low and high 16-bit halves
	addu	r12,r12,r5		; r12 is address for entering in loop

	extu	r5,r5,2			; divide by 4: 16 bytes of code per limb -> 4 bytes of data
	subu	r2,r2,r5		; adjust res_ptr
	subu	r3,r3,r5		; adjust s1_ptr
	subu	r4,r4,r5		; adjust s2_ptr

	or	r8,r6,r0		; copy first s1 limb: the entry slot may add r8/r9

	jmp.n	 r12			; enter the unrolled body (delay slot follows)
	 or	r9,r7,r0		; copy first s2 limb (executed in the delay slot)

; Top of a full 8-limb pass: step all three pointers by 32 bytes, store the
; sum left in r8 by the last slot of the previous pass (before r2 is bumped),
; and preload the first limb pair of this pass.
Loop:	addu	r3,r3,32
	st	r8,r2,28
	addu	r4,r4,32
	ld	r6,r3,0
	addu	r2,r2,32
	ld	r7,r4,0
Lzero:	subu	r10,r10,1	; entry for size = 0 + 8r limbs (adj loop cnt)
; Each slot: preload the next s1 limb, add the current pair with carry in/out,
; preload the next s2 limb, store the sum just produced.
Lbase:	ld	r8,r3,4
	addu.cio r6,r6,r7
	ld	r9,r4,4
	st	r6,r2,0
	ld	r6,r3,8		; entry for size = 7 + 8r limbs
	addu.cio r8,r8,r9
	ld	r7,r4,8
	st	r8,r2,4
	ld	r8,r3,12	; entry for size = 6 + 8r limbs
	addu.cio r6,r6,r7
	ld	r9,r4,12
	st	r6,r2,8
	ld	r6,r3,16	; entry for size = 5 + 8r limbs
	addu.cio r8,r8,r9
	ld	r7,r4,16
	st	r8,r2,12
	ld	r8,r3,20	; entry for size = 4 + 8r limbs
	addu.cio r6,r6,r7
	ld	r9,r4,20
	st	r6,r2,16
	ld	r6,r3,24	; entry for size = 3 + 8r limbs
	addu.cio r8,r8,r9
	ld	r7,r4,24
	st	r8,r2,20
	ld	r8,r3,28	; entry for size = 2 + 8r limbs
	addu.cio r6,r6,r7
	ld	r9,r4,28
	st	r6,r2,24
	bcnd.n	ne0,r10,Loop	; entry for size = 1 + 8r limbs; loop while passes remain
	 addu.cio r8,r8,r9	; last add of the pass (delay slot, runs either way)

	st	r8,r2,28		; store most significant limb

	jmp.n	 r1			; return to caller
	 addu.ci r2,r0,r0		; return carry-out from most sign. limb