/* Intel Pentium-4 mpn_lshift -- left shift.
 *
 * Copyright 2001, 2002 Free Software Foundation, Inc.
 *
 * This file is part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
 *
 * Note: This code is heavily based on the GNU MP Library.
 *       Actually it's the same code with only minor changes in the
 *       way the data is stored; this is to support the abstraction
 *       of an optional secure memory allocation which may be used
 *       to avoid revealing of sensitive data due to paging etc.
 */


#include "sysdep.h"
#include "asm-syntax.h"


/*******************
 * mpi_limb_t
 * _gcry_mpih_lshift( mpi_ptr_t wp,	(sp + 4)
 *		   mpi_ptr_t up,	(sp + 8)
 *		   mpi_size_t usize,	(sp + 12)
 *		   unsigned cnt)	(sp + 16)
 *
 * Shifts the usize-limb operand at UP left by CNT bits, stores the
 * result at WP, and returns the bits shifted out of the top limb.
 * i386, AT&T syntax; uses MMX registers (64-bit shifts) and is
 * instruction-scheduled for the Pentium-4 -- do not reorder.
 *
 * P4 Willamette, Northwood: 1.75 cycles/limb
 * P4 Prescott: 2.0 cycles/limb
 */

.text
	ALIGN (3)
	.globl C_SYMBOL_NAME(_gcry_mpih_lshift)
C_SYMBOL_NAME(_gcry_mpih_lshift:)

	/* Save the callee-saved registers we use (i386 SysV: ebx, edi).
	 * After these two pushes the stack arguments sit 8 bytes higher
	 * than the offsets in the header comment.  */
	pushl	%ebx
	pushl	%edi


	movl	20(%esp), %eax		/* eax = usize (limb count)   */
	movl	12(%esp), %edx		/* edx = wp (destination ptr) */

	movl	16(%esp), %ebx		/* ebx = up (source ptr)      */
	movl	24(%esp), %ecx		/* ecx = cnt (shift amount)   */

	cmp	$5, %eax
	jae	.Lunroll		/* >= 5 limbs: unrolled loop  */

	movl	-4(%ebx,%eax,4), %edi	/* edi = up[usize-1] (top limb) */
	decl	%eax

	jnz	.Lsimple		/* 2..4 limbs */

	/* usize == 1.  eax is 0 here, so shld fills it with the top
	 * CNT bits of edi: eax = up[0] >> (32-cnt), the return value.  */
	shldl	%cl, %edi, %eax

	shll	%cl, %edi

	movl	%edi, (%edx)		/* wp[0] = up[0] << cnt */
	popl	%edi

	popl	%ebx

	ret




.Lsimple:
	/* 2..4 limbs; eax = usize-1.  One result limb per iteration
	 * using a 64-bit MMX shift on each adjacent source-limb pair.
	 * Register roles:
	 *   mm5 = return value (top limb's shifted-out bits)
	 *   mm6 = cnt (left-shift count)
	 *   mm7 = 32-cnt (complementary right-shift count)
	 */
	movd	(%ebx,%eax,4), %mm5	/* mm5 = up[usize-1] */

	movd	%ecx, %mm6
	negl	%ecx

	psllq	%mm6, %mm5		/* top limb << cnt, 64-bit */
	addl	$32, %ecx		/* ecx = 32-cnt */

	movd	%ecx, %mm7
	psrlq	$32, %mm5		/* mm5 = up[usize-1] >> (32-cnt) */


.Lsimple_top:
	/* Invariant: eax = i >= 1 on entry; each pass stores wp[i].
	 * movq loads the pair up[i-1](low),up[i](high); shifting the
	 * qword right by 32-cnt leaves
	 *   up[i]<<cnt | up[i-1]>>(32-cnt)
	 * in the low 32 bits, which movd stores.  MMX moves and shifts
	 * do not touch eflags, so the jnz still tests the decl here.  */
	movq	-4(%ebx,%eax,4), %mm0
	decl	%eax

	psrlq	%mm7, %mm0



	movd	%mm0, 4(%edx,%eax,4)
	jnz	.Lsimple_top


	movd	(%ebx), %mm0		/* low source limb */

	movd	%mm5, %eax		/* eax = return value */
	psllq	%mm6, %mm0

	popl	%edi
	popl	%ebx

	movd	%mm0, (%edx)		/* wp[0] = up[0] << cnt */

	emms				/* clear MMX state for FPU users */

	ret




	.align 8, 0x90
.Lunroll:
	/* >= 5 limbs.  Main path processes 4 limbs per iteration with
	 * 8-byte-aligned movq stores; the source and then the
	 * destination are first brought to 8-byte alignment, emitting
	 * one fix-up limb each as needed.
	 * Register roles:
	 *   mm5 = return value   mm6 = (effective) left-shift count
	 *   mm7 = 64 - shift     mm2/mm3 = pipelined limb pair
	 */
	movd	-4(%ebx,%eax,4), %mm5	/* mm5 = top limb */
	leal	(%ebx,%eax,4), %edi

	movd	%ecx, %mm6
	andl	$4, %edi		/* test 8-byte alignment of &up[usize] */

	psllq	%mm6, %mm5
	jz	.Lstart_src_aligned


	/* Source misaligned: produce one limb so the remaining movq
	 * loads are 8-byte aligned.  */
	movq	-8(%ebx,%eax,4), %mm0

	psllq	%mm6, %mm0
	decl	%eax

	psrlq	$32, %mm0



	movd	%mm0, (%edx,%eax,4)
.Lstart_src_aligned:

	movq	-8(%ebx,%eax,4), %mm1	/* next source pair */
	leal	(%edx,%eax,4), %edi

	andl	$4, %edi		/* test 8-byte alignment of &wp[usize] */
	psrlq	$32, %mm5		/* mm5 = return value */

	movq	-16(%ebx,%eax,4), %mm3
	jz	.Lstart_dst_aligned

	/* Destination misaligned: store one limb, then compensate by
	 * moving wp down 4 bytes and adding 32 to the shift count, so
	 * the movq stores below land on 8-byte boundaries.  */
	movq	%mm1, %mm0
	addl	$32, %ecx		/* shift += 32 */

	psllq	%mm6, %mm0

	movd	%ecx, %mm6		/* mm6 = boosted shift count */
	psrlq	$32, %mm0



	movd	%mm0, -4(%edx,%eax,4)
	subl	$4, %edx
.Lstart_dst_aligned:


	psllq	%mm6, %mm1
	negl	%ecx			/* ecx = 64 - shift */

	addl	$64, %ecx
	movq	%mm3, %mm2

	movd	%ecx, %mm7		/* mm7 = complementary right shift */
	subl	$8, %eax

	psrlq	%mm7, %mm3

	por	%mm1, %mm3		/* first pipelined output pair */
	jc	.Lfinish		/* CF from subl: fewer than 8 left */




	.align 8, 0x90
.Lunroll_loop:
	/* 4 limbs per iteration via two aligned movq stores.  mm2/mm3
	 * carry the pair straddling iterations; subl's CF (MMX ops
	 * preserve eflags) ends the loop when eax underflows.  */
	movq	8(%ebx,%eax,4), %mm0
	psllq	%mm6, %mm2

	movq	%mm0, %mm1
	psrlq	%mm7, %mm0

	movq	%mm3, 24(%edx,%eax,4)
	por	%mm2, %mm0

	movq	(%ebx,%eax,4), %mm3
	psllq	%mm6, %mm1

	movq	%mm0, 16(%edx,%eax,4)
	movq	%mm3, %mm2

	psrlq	%mm7, %mm3
	subl	$4, %eax

	por	%mm1, %mm3
	jnc	.Lunroll_loop



.Lfinish:
	/* 0..3 limbs remain; the low bits of al encode how many.
	 * Bit 1 set: one more pair to process.  */
	testb	$2, %al

	jz	.Lfinish_no_two

	movq	8(%ebx,%eax,4), %mm0
	psllq	%mm6, %mm2

	movq	%mm0, %mm1
	psrlq	%mm7, %mm0

	movq	%mm3, 24(%edx,%eax,4)
	por	%mm2, %mm0

	movq	%mm1, %mm2
	movq	%mm0, %mm3

	subl	$2, %eax
.Lfinish_no_two:

	/* Bit 0 set: one odd low limb left to fold in.  */
	testb	$1, %al
	movd	%mm5, %eax		/* eax = return value */

	popl	%edi
	jz	.Lfinish_zero


	/* Odd limb left: combine up[0] with the pending pair and store
	 * the bottom three output limbs.  Whether wp[0] gets a separate
	 * store depends on bit 5 of ecx, i.e. whether the destination
	 * alignment fix-up boosted the shift count by 32.  */
	movd	(%ebx), %mm0
	psllq	%mm6, %mm2

	movq	%mm3, 12(%edx)
	psllq	$32, %mm0

	movq	%mm0, %mm1
	psrlq	%mm7, %mm0

	por	%mm2, %mm0
	psllq	%mm6, %mm1

	movq	%mm0, 4(%edx)
	psrlq	$32, %mm1

	andl	$32, %ecx		/* dst-alignment fix-up taken? */
	popl	%ebx

	jz	.Lfinish_one_unaligned

	movd	%mm1, (%edx)		/* wp[0] */
.Lfinish_one_unaligned:

	emms

	ret




.Lfinish_zero:
	/* No odd limb: flush the pending pipelined pair.  As above,
	 * bit 5 of ecx distinguishes the two destination-alignment
	 * cases for the bottom store(s).  */
	movq	%mm3, 8(%edx)
	andl	$32, %ecx		/* dst-alignment fix-up taken? */

	psllq	%mm6, %mm2
	jz	.Lfinish_zero_unaligned

	movq	%mm2, (%edx)
.Lfinish_zero_unaligned:

	psrlq	$32, %mm2
	popl	%ebx

	movd	%mm5, %eax		/* eax = return value */

	movd	%mm2, 4(%edx)

	emms

	ret