/* Intel Pentium-4 mpn_rshift -- right shift.
 *
 * Copyright 2001, 2002 Free Software Foundation, Inc.
 *
 * This file is part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
 *
 * Note: This code is heavily based on the GNU MP Library.
 *       Actually it's the same code with only minor changes in the
 *       way the data is stored; this is to support the abstraction
 *       of an optional secure memory allocation which may be used
 *       to avoid revealing of sensitive data due to paging etc.
*/

#include "sysdep.h"
#include "asm-syntax.h"


/*******************
 * mpi_limb_t
 * _gcry_mpih_rshift( mpi_ptr_t wp,	(sp + 4)
 *		   mpi_ptr_t up,	(sp + 8)
 *		   mpi_size_t usize,	(sp + 12)
 *		   unsigned cnt)	(sp + 16)
 *
 * P4 Willamette, Northwood: 1.75 cycles/limb
 * P4 Prescott: 2.0 cycles/limb
 */

/*
 * Shift {up, usize} right by cnt bits and store the result at {wp, usize}.
 * The return value in %eax is up[0] << (32-cnt), i.e. the bits shifted out
 * at the low end (see the usize==1 path: shrdl into a zeroed %eax).
 *
 * Register roles after the prologue (two pushes, so params sit at +8):
 *   %eax = usize          %edx = wp (destination)
 *   %ebx = up (source)    %ecx / %mm6 = cnt
 *   %mm5 = up[0] << 32, later >> cnt to form the return value
 * %ebx and %edi are callee-saved on this ABI and restored before each ret.
 */
.text
	ALIGN (3)
	.globl C_SYMBOL_NAME(_gcry_mpih_rshift)
C_SYMBOL_NAME(_gcry_mpih_rshift:)
	pushl	%ebx
	pushl	%edi

	movl	20(%esp), %eax		/* usize */
	movl	12(%esp), %edx		/* wp */

	movl	16(%esp), %ebx		/* up */
	movl	24(%esp), %ecx		/* cnt */

	cmp	$5, %eax		/* 5 or more limbs: unrolled MMX loop */
	jae	.Lunroll

	decl	%eax			/* eax = usize-1; ZF set when usize==1 */
	movl	(%ebx), %edi		/* edi = up[0] */

	jnz	.Lsimple		/* 2..4 limbs: simple MMX loop */

	/* usize == 1: plain integer code, no MMX state touched */
	shrdl	%cl, %edi, %eax		/* eax(=0) receives bits shifted out of up[0] */

	shrl	%cl, %edi

	movl	%edi, (%edx)		/* wp[0] = up[0] >> cnt */
	popl	%edi

	popl	%ebx

	ret


	.align	8, 0x90
.Lsimple:
	/*
	 * 2..4 limbs: one result limb per iteration.  Each movq reads
	 * limbs i and i+1 (a 64-bit window), psrlq shifts the pair, and
	 * movd stores the single combined result limb.
	 * On entry: eax = usize-1, edi = up[0] (not used further here).
	 */
	movd	(%ebx), %mm5		/* mm5 = up[0] */
	leal	(%ebx,%eax,4), %ebx	/* ebx = &up[usize-1] */

	movd	%ecx, %mm6		/* mm6 = cnt */
	leal	-4(%edx,%eax,4), %edx	/* edx = &wp[usize-2] */

	psllq	$32, %mm5		/* mm5 = up[0] << 32, return-value source */
	negl	%eax			/* eax = -(usize-1), counts up toward 0 */

.Lsimple_top:
	movq	(%ebx,%eax,4), %mm0	/* 64-bit window: limbs i and i+1 */
	incl	%eax

	psrlq	%mm6, %mm0		/* (up[i+1]:up[i]) >> cnt */

	movd	%mm0, (%edx,%eax,4)	/* wp[i] = low 32 bits of the shift */
	jnz	.Lsimple_top		/* flags are from incl; MMX ops leave EFLAGS alone */

	/* high limb and return value */
	movd	(%ebx), %mm0		/* up[usize-1] alone, zero-extended */
	psrlq	%mm6, %mm5		/* mm5 = up[0] << (32-cnt) */

	psrlq	%mm6, %mm0
	popl	%edi

	movd	%mm5, %eax		/* return the shifted-out low bits */
	popl	%ebx

	movd	%mm0, 4(%edx)		/* wp[usize-1] = up[usize-1] >> cnt */

	emms				/* leave MMX state clean for FPU users */

	ret


	.align	8, 0x90
.Lunroll:
	/*
	 * 5 or more limbs.  First align src, then dst, to 8 bytes; the
	 * main loop then combines each aligned source pair with the next
	 * via (pair >> cnt) | (next pair << (64-cnt)).
	 */
	movd	(%ebx), %mm5		/* mm5 = up[0], for the return value */
	movl	$4, %edi		/* alignment-test mask (bit 2) */

	movd	%ecx, %mm6		/* mm6 = cnt */
	testl	%edi, %ebx		/* src 8-byte aligned? */

	psllq	$32, %mm5
	jz	.Lstart_src_aligned

	/* src unaligned: emit one limb so ebx becomes 8-byte aligned */
	movq	(%ebx), %mm0		/* up[1]:up[0] (unaligned 64-bit load) */

	psrlq	%mm6, %mm0
	addl	$4, %ebx

	decl	%eax			/* one limb consumed */

	movd	%mm0, (%edx)		/* wp[0] */
	addl	$4, %edx
.Lstart_src_aligned:

	movq	(%ebx), %mm1		/* first aligned source pair */
	testl	%edi, %edx		/* dst 8-byte aligned? */

	psrlq	%mm6, %mm5		/* mm5 = up[0] << (32-cnt) = return value */
	jz	.Lstart_dst_aligned

	/*
	 * dst unaligned: store one limb and add 32 to the count so the
	 * aligned 64-bit stores below are effectively offset by one
	 * 32-bit word.  The finish code tests (%ecx & 32) to undo this.
	 */
	movq	%mm1, %mm0
	addl	$32, %ecx		/* cnt += 32 */

	psrlq	%mm6, %mm0

	movd	%ecx, %mm6		/* mm6 = adjusted count */

	movd	%mm0, (%edx)		/* wp[0] */
	addl	$4, %edx
.Lstart_dst_aligned:

	movq	8(%ebx), %mm3		/* second source pair */
	negl	%ecx

	movq	%mm3, %mm2
	addl	$64, %ecx		/* ecx = 64 - cnt */

	movd	%ecx, %mm7		/* mm7 = 64-cnt, left-shift for the carry-in bits */
	psrlq	%mm6, %mm1

	leal	-12(%ebx,%eax,4), %ebx	/* rebase src/dst near the high end so that */
	leal	-20(%edx,%eax,4), %edx	/* the negative index in eax runs up to 0 */

	psllq	%mm7, %mm3
	subl	$7, %eax

	por	%mm1, %mm3		/* mm3 = first combined result pair */
	negl	%eax			/* eax = -(limbs-7); non-negative => skip loop */

	jns	.Lfinish		/* 7 or fewer limbs left: finish code only */


	.align	8, 0x90
.Lunroll_loop:
	/*
	 * Four limbs (two 64-bit pairs) per iteration.  mm2 carries the
	 * previous raw source pair, mm3 the previous combined result.
	 */
	movq	(%ebx,%eax,4), %mm0
	psrlq	%mm6, %mm2

	movq	%mm0, %mm1
	psllq	%mm7, %mm0

	movq	%mm3, -8(%edx,%eax,4)	/* store previously combined pair */
	por	%mm2, %mm0

	movq	8(%ebx,%eax,4), %mm3
	psrlq	%mm6, %mm1

	movq	%mm0, (%edx,%eax,4)
	movq	%mm3, %mm2

	psllq	%mm7, %mm3
	addl	$4, %eax

	por	%mm1, %mm3
	js	.Lunroll_loop


.Lfinish:
	/* 0..3 source limbs remain; the low bits of %al select the cases */
	testb	$2, %al

	jnz	.Lfinish_no_two

	/* two more limbs: produce one more combined pair */
	movq	(%ebx,%eax,4), %mm0
	psrlq	%mm6, %mm2

	movq	%mm0, %mm1
	psllq	%mm7, %mm0

	movq	%mm3, -8(%edx,%eax,4)
	por	%mm2, %mm0

	movq	%mm1, %mm2		/* roll the pipeline forward */
	movq	%mm0, %mm3

	addl	$2, %eax
.Lfinish_no_two:

	testb	$1, %al
	popl	%edi

	movd	%mm5, %eax		/* return value (bits shifted out low) */
	jnz	.Lfinish_zero

	/* one extra source limb beyond the pending pair */
	movd	8(%ebx), %mm0		/* last source limb, zero-extended */
	psrlq	%mm6, %mm2

	movq	%mm0, %mm1
	psllq	%mm7, %mm0

	movq	%mm3, (%edx)		/* flush pending combined pair */
	por	%mm2, %mm0

	psrlq	%mm6, %mm1
	andl	$32, %ecx		/* was cnt bumped by 32 for dst alignment? */

	popl	%ebx
	jz	.Lfinish_one_unaligned

	movd	%mm1, 16(%edx)		/* extra high word from the +32 adjustment */
.Lfinish_one_unaligned:

	movq	%mm0, 8(%edx)

	emms

	ret


.Lfinish_zero:
	/* no extra source limb: flush the pending pair and the high bits */
	movq	%mm3, 4(%edx)
	psrlq	%mm6, %mm2

	movd	%mm2, 12(%edx)		/* high result limb */
	andl	$32, %ecx		/* was cnt bumped by 32 for dst alignment? */

	popl	%ebx
	jz	.Lfinish_zero_unaligned

	movq	%mm2, 12(%edx)		/* 64-bit store for the +32-adjusted case */
.Lfinish_zero_unaligned:

	emms

	ret