dnl  AMD K7 mpn_lshift -- mpn left shift.

dnl  Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C           K7: 1.21 cycles/limb (at 16 limbs/loop).



dnl  K7: UNROLL_COUNT  cycles/limb
dnl           4           1.51
dnl           8           1.26
dnl          16           1.21
dnl          32           1.2
dnl  Maximum possible with the current code is 64.

deflit(UNROLL_COUNT, 16)


C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                       unsigned shift);
C
C Shift src,size left by shift many bits and store the result in dst,size.
C Zeros are shifted in at the right.  The bits shifted out at the left are
C the return value.
C
C The comments in mpn_rshift apply here too.
dnl  The unrolled loop is the same size with or without PIC, so a single
dnl  threshold applies in both configurations.
ifdef(`PIC',`
deflit(UNROLL_THRESHOLD, 10)
',`
deflit(UNROLL_THRESHOLD, 10)
')

defframe(PARAM_SHIFT,16)
defframe(PARAM_SIZE, 12)
defframe(PARAM_SRC,   8)
defframe(PARAM_DST,   4)

defframe(SAVE_EDI,  -4)
defframe(SAVE_ESI,  -8)
defframe(SAVE_EBX, -12)
deflit(SAVE_SIZE, 12)

	TEXT
	ALIGN(32)

PROLOGUE(mpn_lshift)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %eax
	movl	PARAM_SRC, %edx
	subl	$SAVE_SIZE, %esp
deflit(`FRAME',SAVE_SIZE)

	movl	PARAM_SHIFT, %ecx
	movl	%edi, SAVE_EDI

	movl	PARAM_DST, %edi
	decl	%eax
	jnz	L(more_than_one_limb)

	C Single limb: plain 32-bit shifts suffice, no MMX needed.
	movl	(%edx), %edx		C src limb

	shldl(	%cl, %edx, %eax)	C eax was decremented to zero

	shll	%cl, %edx

	movl	%edx, (%edi)		C dst limb
	movl	SAVE_EDI, %edi
	addl	$SAVE_SIZE, %esp

	ret


C -----------------------------------------------------------------------------
L(more_than_one_limb):
	C eax	size-1
	C ebx
	C ecx	shift
	C edx	src
	C esi
	C edi	dst
	C ebp

	movd	PARAM_SHIFT, %mm6
	movd	(%edx,%eax,4), %mm5	C src high limb
	cmp	$UNROLL_THRESHOLD-1, %eax

	jae	L(unroll)
	negl	%ecx
	movd	(%edx), %mm4		C src low limb

	addl	$32, %ecx

	movd	%ecx, %mm7

L(simple_top):
	C eax	loop counter, limbs
	C ebx
	C ecx
	C edx	src
	C esi
	C edi	dst
	C ebp
	C
	C mm0	scratch
	C mm4	src low limb
	C mm5	src high limb
	C mm6	shift
	C mm7	32-shift

	movq	-4(%edx,%eax,4), %mm0
	decl	%eax

	psrlq	%mm7, %mm0

	movd	%mm0, 4(%edi,%eax,4)
	jnz	L(simple_top)


	psllq	%mm6, %mm5
	psllq	%mm6, %mm4

	psrlq	$32, %mm5
	movd	%mm4, (%edi)		C dst low limb

	movd	%mm5, %eax		C return value

	movl	SAVE_EDI, %edi
	addl	$SAVE_SIZE, %esp
	emms

	ret


C -----------------------------------------------------------------------------
	ALIGN(16)
L(unroll):
	C eax	size-1
	C ebx	(saved)
	C ecx	shift
	C edx	src
	C esi
	C edi	dst
	C ebp
	C
	C mm5	src high limb, for return value
	C mm6	lshift

	movl	%esi, SAVE_ESI
	movl	%ebx, SAVE_EBX
	leal	-4(%edx,%eax,4), %edx	C &src[size-2]

	testb	$4, %dl
	movq	(%edx), %mm1		C src high qword

	jz	L(start_src_aligned)


	C src isn't aligned, process high limb (marked xxx) separately to
	C make it so
	C
	C  source     -4(edx,%eax,4)
	C                  |
	C  +-------+-------+-------+--
	C  |               |  xxx  |
	C  +-------+-------+-------+--
	C        0mod8   4mod8   0mod8
	C
	C  dest      -4(edi,%eax,4)
	C                  |
	C  +-------+-------+--
	C  |  xxx  |       |
	C  +-------+-------+--

	psllq	%mm6, %mm1
	subl	$4, %edx
	movl	%eax, PARAM_SIZE	C size-1

	psrlq	$32, %mm1
	decl	%eax			C size-2 is new size-1

	movd	%mm1, 4(%edi,%eax,4)
	movq	(%edx), %mm1		C new src high qword
L(start_src_aligned):


	leal	-4(%edi,%eax,4), %edi	C &dst[size-2]
	psllq	%mm6, %mm5

	testl	$4, %edi
	psrlq	$32, %mm5		C return value

	jz	L(start_dst_aligned)


	C dst isn't aligned, subtract 4 bytes to make it so, and pretend the
	C shift is 32 bits extra.  High limb of dst (marked xxx) handled
	C here separately.
	C
	C  source       %edx
	C  +-------+-------+--
	C  |      mm1      |
	C  +-------+-------+--
	C        0mod8   4mod8
	C
	C  dest         %edi
	C  +-------+-------+-------+--
	C  |  xxx  |               |
	C  +-------+-------+-------+--
	C        0mod8   4mod8   0mod8

	movq	%mm1, %mm0
	psllq	%mm6, %mm1
	addl	$32, %ecx		C shift+32

	psrlq	$32, %mm1

	movd	%mm1, 4(%edi)
	movq	%mm0, %mm1
	subl	$4, %edi

	movd	%ecx, %mm6		C new lshift
L(start_dst_aligned):

	decl	%eax			C size-2, two last limbs handled at end
	movq	%mm1, %mm2		C copy of src high qword
	negl	%ecx

	andl	$-2, %eax		C round size down to even
	addl	$64, %ecx

	movl	%eax, %ebx
	negl	%eax

	andl	$UNROLL_MASK, %eax
	decl	%ebx

	shll	%eax			C eax = 2*(negated masked count), scaled
					C below by 5 to index 10-byte chunks

	movd	%ecx, %mm7		C rshift = 64-lshift

ifdef(`PIC',`
	call	L(pic_calc)
L(here):
',`
	leal	L(entry) (%eax,%eax,4), %esi
')
	shrl	$UNROLL_LOG2, %ebx	C loop counter

	leal	ifelse(UNROLL_BYTES,256,128) -8(%edx,%eax,2), %edx
	leal	ifelse(UNROLL_BYTES,256,128) (%edi,%eax,2), %edi
	movl	PARAM_SIZE, %eax	C for use at end
	jmp	*%esi


ifdef(`PIC',`
L(pic_calc):
	C See mpn/x86/README about old gas bugs
	leal	(%eax,%eax,4), %esi
	addl	$L(entry)-L(here), %esi
	addl	(%esp), %esi

	ret_internal
')


C -----------------------------------------------------------------------------
	ALIGN(32)
L(top):
	C eax	size (for use at end)
	C ebx	loop counter
	C ecx	rshift
	C edx	src
	C esi	computed jump
	C edi	dst
	C ebp
	C
	C mm0	scratch
	C mm1	\ carry (alternating, mm2 first)
	C mm2	/
	C mm6	lshift
	C mm7	rshift
	C
	C 10 code bytes/limb
	C
	C The two chunks differ in whether mm1 or mm2 hold the carry.
	C The computed jump puts the initial carry in both mm1 and mm2.

L(entry):
deflit(CHUNK_COUNT, 4)
forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
	deflit(`disp0', eval(-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
	deflit(`disp1', eval(disp0 - 8))

Zdisp(	movq,	disp0,(%edx), %mm0)
	psllq	%mm6, %mm2

	movq	%mm0, %mm1
	psrlq	%mm7, %mm0

	por	%mm2, %mm0
Zdisp(	movq,	%mm0, disp0,(%edi))


Zdisp(	movq,	disp1,(%edx), %mm0)
	psllq	%mm6, %mm1

	movq	%mm0, %mm2
	psrlq	%mm7, %mm0

	por	%mm1, %mm0
Zdisp(	movq,	%mm0, disp1,(%edi))
')

	subl	$UNROLL_BYTES, %edx
	subl	$UNROLL_BYTES, %edi
	decl	%ebx

	jns	L(top)



define(`disp', `m4_empty_if_zero(eval($1 ifelse(UNROLL_BYTES,256,-128)))')

L(end):
	testb	$1, %al
	movl	SAVE_EBX, %ebx
	psllq	%mm6, %mm2	C wanted left shifted in all cases below

	movd	%mm5, %eax

	movl	SAVE_ESI, %esi
	jz	L(end_even)


L(end_odd):

	C Size odd, destination was aligned.
	C
	C  source          edx+8   edx+4
	C       --+---------------+-------+
	C         |      mm2      |       |
	C       --+---------------+-------+
	C
	C  dest                            edi
	C       --+---------------+---------------+-------+
	C         |   written     |               |       |
	C       --+---------------+---------------+-------+
	C
	C  mm6 = shift
	C  mm7 = ecx = 64-shift


	C Size odd, destination was unaligned.
	C
	C  source          edx+8   edx+4
	C       --+---------------+-------+
	C         |      mm2      |       |
	C       --+---------------+-------+
	C
	C  dest                            edi
	C       --+---------------+---------------+
	C         |   written     |               |
	C       --+---------------+---------------+
	C
	C  mm6 = shift+32
	C  mm7 = ecx = 64-(shift+32)


	C In both cases there's one extra limb of src to fetch and combine
	C with mm2 to make a qword at (%edi), and in the aligned case
	C there's an extra limb of dst to be formed from that extra src limb
	C left shifted.

	movd	disp(4) (%edx), %mm0
	testb	$32, %cl

	movq	%mm0, %mm1
	psllq	$32, %mm0

	psrlq	%mm7, %mm0
	psllq	%mm6, %mm1

	por	%mm2, %mm0

	movq	%mm0, disp(0) (%edi)
	jz	L(end_odd_unaligned)
	movd	%mm1, disp(-4) (%edi)
L(end_odd_unaligned):

	movl	SAVE_EDI, %edi
	addl	$SAVE_SIZE, %esp
	emms

	ret


L(end_even):

	C Size even, destination was aligned.
	C
	C  source          edx+8
	C       --+---------------+
	C         |      mm2      |
	C       --+---------------+
	C
	C  dest                            edi
	C       --+---------------+---------------+
	C         |   written     |               |
	C       --+---------------+---------------+
	C
	C  mm6 = shift
	C  mm7 = ecx = 64-shift


	C Size even, destination was unaligned.
	C
	C  source          edx+8
	C       --+---------------+
	C         |      mm2      |
	C       --+---------------+
	C
	C  dest                    edi+4
	C       --+---------------+-------+
	C         |   written     |       |
	C       --+---------------+-------+
	C
	C  mm6 = shift+32
	C  mm7 = ecx = 64-(shift+32)


	C The movq for the aligned case overwrites the movd for the
	C unaligned case.

	movq	%mm2, %mm0
	psrlq	$32, %mm2

	testb	$32, %cl
	movd	%mm2, disp(4) (%edi)

	jz	L(end_even_unaligned)
	movq	%mm0, disp(0) (%edi)
L(end_even_unaligned):

	movl	SAVE_EDI, %edi
	addl	$SAVE_SIZE, %esp
	emms

	ret

EPILOGUE()