lshift.asm revision 1.1.1.1
dnl  S/390-64 mpn_lshift.

dnl  Copyright 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C            cycles/limb
C z900		 7
C z990		 3
C z9		 ?
C z10		 ?
C z196		 ?

C NOTES
C  * This uses discrete loads and stores in a software pipeline.  Using lmg and
C    stmg is not faster.
C  * One could assume more pipelining could approach 2.5 c/l, but we have not
C    found any 8-way loop that runs better than the current 4-way loop.
C  * Consider using the same feed-in code for 1 <= n <= 3 as for n mod 4,
C    similarly to the x86_64 sqr_basecase feed-in.
C INPUT PARAMETERS
C   rp   address the result limbs are stored to
C   up   address the source limbs are loaded from
C   n    number of 64-bit limbs to process
C   cnt  left shift count in bits
define(`rp',	`%r2')
define(`up',	`%r3')
define(`n',	`%r4')
define(`cnt',	`%r5')

C Negated shift count.  It is used as the right-shift amount that extracts
C the bits of a source limb which belong in the next higher result limb.
define(`tnc',	`%r6')

C mpn_lshift
C
C Shifts the n limbs at up left by cnt bits and stores the n result limbs at
C rp.  Limbs are handled from the most significant one downwards.  The value
C returned in %r2 is the most significant source limb shifted right by tnc,
C i.e. the bits pushed out at the high end.
C
C Right shifts use tnc = -cnt as the shift amount.  Shift amounts are
C presumably taken modulo 64, making this a right shift by 64-cnt; see the
C z/Architecture description of SRLG.
C
C NOTE(review): the code presumably relies on n >= 1 and 1 <= cnt <= 63, as
C documented for mpn_lshift in the GMP manual; confirm there.  With n = 0 the
C table dispatch below would branch to L(tab)-4, which is not a table entry.
C
C Registers %r6 and up are stored at 48(%r15) before being modified and are
C reloaded before returning (presumably the caller-provided register save
C area of the s390x ELF ABI; confirm against the ABI document).

ASM_START()
PROLOGUE(mpn_lshift)
	cghi	n, 3
	jh	L(gt1)			C n > 3: use the 4-way unrolled loop

C n <= 3: straight-line code, selected through a branch table.
	stmg	%r6, %r7, 48(%r15)	C save r6 (tnc) and r7 (scratch)
	larl	%r1, L(tab)-4		C base biased by -4 so that n = 1 selects L(tab)
	lcgr	tnc, cnt		C tnc = -cnt
	sllg	n, n, 2			C n * 4: table entries are addressed 4 bytes apart
	b	0(n,%r1)		C branch to L(tab) + 4*(n-1)
L(tab):	j	L(n1)
	j	L(n2)
	j	L(n3)

C n = 1: a single limb, nothing to combine.
L(n1):	lg	%r1, 0(up)
	sllg	%r0, %r1, 0(cnt)
	stg	%r0, 0(rp)
	srlg	%r2, %r1, 0(tnc)	C return value; overwrites rp, which is no longer needed
	lg	%r6, 48(%r15)		C restoring r7 not needed
	br	%r14

C n = 2: handle the top limb here, the rest in the shared tail L(cj).
L(n2):	lg	%r1, 8(up)		C most significant source limb
	srlg	%r4, %r1, 0(tnc)	C bits shifted out; kept in r4 (n is dead) until return
	sllg	%r0, %r1, 0(cnt)	C partial result limb 1, completed at L(cj)
	j	L(cj)

C n = 3: handle the top limb and result limb 2, then fall into L(cj).
L(n3):	lg	%r1, 16(up)		C most significant source limb
	srlg	%r4, %r1, 0(tnc)	C bits shifted out; kept in r4 (n is dead) until return
	sllg	%r0, %r1, 0(cnt)
	lg	%r1, 8(up)
	srlg	%r7, %r1, 0(tnc)
	ogr	%r7, %r0		C result limb 2 = (limb 2 << cnt) | (limb 1 >> tnc)
	sllg	%r0, %r1, 0(cnt)	C partial result limb 1, completed at L(cj)
	stg	%r7, 16(rp)
C Shared tail for n = 2 and n = 3: r0 holds the partial result limb 1.
L(cj):	lg	%r1, 0(up)
	srlg	%r7, %r1, 0(tnc)
	ogr	%r7, %r0		C complete result limb 1
	sllg	%r0, %r1, 0(cnt)	C lowest result limb, nothing is ORed into it
	stg	%r7, 8(rp)
	stg	%r0, 0(rp)
	lgr	%r2, %r4		C return the bits shifted out of the top limb
	lmg	%r6, %r7, 48(%r15)
	br	%r14

C n > 3: 4-way unrolled loop.
L(gt1):	stmg	%r6, %r13, 48(%r15)	C save r6..r13, all of which are used below
	lcgr	tnc, cnt		C tnc = -cnt

	sllg	%r1, n, 3		C operand size in bytes
	srlg	%r0, n, 2		C loop count

	agr	up, %r1			C point up at end of U
	agr	rp, %r1			C point rp at end of R
	aghi	up, -56			C now 48(up) is the most significant source limb
	aghi	rp, -40			C now 32(rp) is the most significant result limb

C Select the feed-in code from n mod 4.
	lghi	%r7, 3
	ngr	%r7, n			C r7 = n mod 4
	je	L(b0)			C n mod 4 = 0
	cghi	%r7, 2
	jl	L(b1)			C n mod 4 = 1
	je	L(b2)			C n mod 4 = 2

C Each feed-in block loads the top source limbs, puts the return value in r9,
C adjusts up and rp to match its entry point, and jumps into the loop.

C n mod 4 = 3
L(b3):	lg	%r7, 48(up)
	srlg	%r9, %r7, 0(tnc)	C return value
	sllg	%r11, %r7, 0(cnt)
	lg	%r8, 40(up)
	lg	%r7, 32(up)
	srlg	%r4, %r8, 0(tnc)
	sllg	%r13, %r8, 0(cnt)
	ogr	%r11, %r4		C top result limb, stored at L(lm3)
	la	rp, 16(rp)
	j	L(lm3)

C n mod 4 = 2
L(b2):	lg	%r8, 48(up)
	lg	%r7, 40(up)
	srlg	%r9, %r8, 0(tnc)	C return value
	sllg	%r13, %r8, 0(cnt)	C partial top result limb, completed after L(lm2)
	la	rp, 24(rp)
	la	up, 8(up)
	j	L(lm2)

C n mod 4 = 1
L(b1):	lg	%r7, 48(up)
	srlg	%r9, %r7, 0(tnc)	C return value
	sllg	%r11, %r7, 0(cnt)
	lg	%r8, 40(up)
	lg	%r7, 32(up)
	srlg	%r4, %r8, 0(tnc)
	sllg	%r10, %r8, 0(cnt)
	ogr	%r11, %r4		C top result limb, stored at L(lm1)
	la	rp, 32(rp)
	la	up, 16(up)
	j	L(lm1)

C n mod 4 = 0
L(b0):	lg	%r8, 48(up)
	lg	%r7, 40(up)
	srlg	%r9, %r8, 0(tnc)	C return value
	sllg	%r10, %r8, 0(cnt)	C partial top result limb, completed after L(lm0)
	la	rp, 40(rp)
	la	up, 24(up)
	j	L(lm0)

C Main loop: four limbs per pass, entered at L(lm0)..L(lm3).  Register roles:
C   r7, r8         most recently loaded source limbs
C   r4, r12        a source limb shifted right by tnc
C   r10, r11, r13  a source limb shifted left by cnt; each is ORed with r4 or
C                  r12 to form a result limb and then stored
C   r9             return value, set once by the feed-in code
C   r0             pass counter consumed by brctg
C Loads, shifts and stores belonging to different limbs are interleaved, so
C the instruction order must be kept as is.

C	ALIGN(16)
L(top):	srlg	%r4, %r8, 0(tnc)
	sllg	%r13, %r8, 0(cnt)
	ogr	%r11, %r4
	stg	%r10, 24(rp)
L(lm3):	stg	%r11, 16(rp)
L(lm2):	srlg	%r12, %r7, 0(tnc)
	sllg	%r11, %r7, 0(cnt)
	lg	%r8, 24(up)
	lg	%r7, 16(up)
	ogr	%r13, %r12
	srlg	%r4, %r8, 0(tnc)
	sllg	%r10, %r8, 0(cnt)
	ogr	%r11, %r4
	stg	%r13, 8(rp)
L(lm1):	stg	%r11, 0(rp)
L(lm0):	srlg	%r12, %r7, 0(tnc)
	aghi	rp, -32			C step down four result limbs
	sllg	%r11, %r7, 0(cnt)
	lg	%r8, 8(up)
	lg	%r7, 0(up)
	aghi	up, -32			C step down four source limbs
	ogr	%r10, %r12
	brctg	%r0, L(top)		C decrement r0, loop while it is nonzero

C Wind-down: no further loads; form and store the four lowest result limbs
C from the values still held in registers.
L(end):	srlg	%r4, %r8, 0(tnc)
	sllg	%r13, %r8, 0(cnt)
	ogr	%r11, %r4
	stg	%r10, 24(rp)
	stg	%r11, 16(rp)
	srlg	%r12, %r7, 0(tnc)
	sllg	%r11, %r7, 0(cnt)	C lowest result limb, nothing is ORed into it
	ogr	%r13, %r12
	stg	%r13, 8(rp)
	stg	%r11, 0(rp)
	lgr	%r2, %r9		C return the bits shifted out of the top limb

	lmg	%r6, %r13, 48(%r15)
	br	%r14
EPILOGUE()