/* $FreeBSD: stable/11/secure/lib/libcrypto/arm/armv4-mont.S 337982 2018-08-17 18:32:53Z jkim $ */
/* Do not modify. This file is auto-generated from armv4-mont.pl. */
/*
 * NOTE(review): this copy was recovered from an annotated (blame) listing;
 * the per-line "<lineno><revision>Sjkim" prefixes were removed and the
 * one-statement-per-line layout restored.  The instruction stream is
 * unchanged.  Any lasting change belongs in armv4-mont.pl, not here.
 */
#include "arm_arch.h"

.text
.code   32

#if __ARM_MAX_ARCH__>=7
.align  5
.LOPENSSL_armcap:
.word   OPENSSL_armcap_P-bn_mul_mont    @ offset from bn_mul_mont to the capability word
#endif

/*
 * bn_mul_mont -- Montgomery multiplication, integer (ALU) path plus
 * dispatch to the NEON path.
 *
 * Inputs, as consumed by the code below:
 *   r0        rp   (result; stashed on the stack and reloaded as "rp")
 *   r1        ap
 *   r2        bp
 *   r3        np
 *   [sp,#0]   pointer to n0 (dereferenced once, only the first word is read)
 *   [sp,#4]   num  (count of 32-bit words)
 * NOTE(review): presumably the C prototype is
 *   int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
 *                   const BN_ULONG *np, const BN_ULONG *n0, int num);
 * confirm against the C-side declaration.
 *
 * Result in r0:
 *   0  when num < 2 (nothing is written to rp),
 *   1  when the integer path completes.
 * When built with __ARM_MAX_ARCH__>=7, num is a multiple of 8 and bit 0 of
 * OPENSSL_armcap_P is set, control is transferred to bn_mul8x_mont_neon
 * with the original r0-r3 and sp restored.
 *
 * Stack frame of the integer path, relative to r0 = &tp[num-1]:
 *   sp .. r0+4     tp[0..num]  (num+1 words of scratch)
 *   r0+8  ..       saved r4-r12,lr (10 words)
 *   [r0,#12*4]     saved r0  (rp)
 *   [r0,#13*4]     saved r2  (bp), reused as the running bp pointer
 *   [r0,#14*4]     caller slot for &n0, overwritten with the value *n0
 *   [r0,#15*4]     caller slot for num, overwritten with &bp[num-1]
 * Note that the last two slots live in the caller's outgoing argument area.
 */
.global bn_mul_mont
.type   bn_mul_mont,%function

.align  5
bn_mul_mont:
        ldr     ip,[sp,#4]              @ load num
        stmdb   sp!,{r0,r2}             @ sp points at argument block
#if __ARM_MAX_ARCH__>=7
        tst     ip,#7
        bne     .Lialu                  @ num not a multiple of 8: integer path
        adr     r0,bn_mul_mont
        ldr     r2,.LOPENSSL_armcap
        ldr     r0,[r0,r2]              @ r0 = OPENSSL_armcap_P
        tst     r0,#1                   @ NEON available?
        ldmia   sp, {r0,r2}             @ restore clobbered r0,r2 (flags untouched)
        beq     .Lialu
        add     sp,sp,#8                @ drop {r0,r2}: sp back at entry value
        b       bn_mul8x_mont_neon
.align  4
.Lialu:
#endif
        cmp     ip,#2
        mov     r0,ip                   @ load num
        movlt   r0,#0                   @ num < 2: return 0
        addlt   sp,sp,#2*4
        blt     .Labrt

        stmdb   sp!,{r4-r12,lr}         @ save 10 registers

        mov     r0,r0,lsl#2             @ rescale r0 for byte count
        sub     sp,sp,r0                @ alloca(4*num)
        sub     sp,sp,#4                @ +extra dword
        sub     r0,r0,#4                @ "num=num-1"
        add     r4,r2,r0                @ &bp[num-1]

        add     r0,sp,r0                @ r0 to point at &tp[num-1]
        ldr     r8,[r0,#14*4]           @ &n0
        ldr     r2,[r2]                 @ bp[0]
        ldr     r5,[r1],#4              @ ap[0],ap++
        ldr     r6,[r3],#4              @ np[0],np++
        ldr     r8,[r8]                 @ *n0
        str     r4,[r0,#15*4]           @ save &bp[num-1] (outer-loop end marker)

        umull   r10,r11,r5,r2           @ ap[0]*bp[0]
        str     r8,[r0,#14*4]           @ save n0 value
        mul     r8,r10,r8               @ "tp[0]"*n0
        mov     r12,#0
        umlal   r10,r12,r6,r8           @ np[0]*n0+"t[0]"
        mov     r4,sp

.L1st:
        ldr     r5,[r1],#4              @ ap[j],ap++
        mov     r10,r11
        ldr     r6,[r3],#4              @ np[j],np++
        mov     r11,#0
        umlal   r10,r11,r5,r2           @ ap[j]*bp[0]
        mov     r14,#0
        umlal   r12,r14,r6,r8           @ np[j]*n0
        adds    r12,r12,r10
        str     r12,[r4],#4             @ tp[j-1]=,tp++
        adc     r12,r14,#0
        cmp     r4,r0
        bne     .L1st

        adds    r12,r12,r11
        ldr     r4,[r0,#13*4]           @ restore bp
        mov     r14,#0
        ldr     r8,[r0,#14*4]           @ restore n0
        adc     r14,r14,#0
        str     r12,[r0]                @ tp[num-1]=
        str     r14,[r0,#4]             @ tp[num]=

.Louter:
        sub     r7,r0,sp                @ "original" r0-1 value
        sub     r1,r1,r7                @ "rewind" ap to &ap[1]
        ldr     r2,[r4,#4]!             @ *(++bp)
        sub     r3,r3,r7                @ "rewind" np to &np[1]
        ldr     r5,[r1,#-4]             @ ap[0]
        ldr     r10,[sp]                @ tp[0]
        ldr     r6,[r3,#-4]             @ np[0]
        ldr     r7,[sp,#4]              @ tp[1]

        mov     r11,#0
        umlal   r10,r11,r5,r2           @ ap[0]*bp[i]+tp[0]
        str     r4,[r0,#13*4]           @ save bp
        mul     r8,r10,r8
        mov     r12,#0
        umlal   r10,r12,r6,r8           @ np[0]*n0+"tp[0]"
        mov     r4,sp

.Linner:
        ldr     r5,[r1],#4              @ ap[j],ap++
        adds    r10,r11,r7              @ +=tp[j]
        ldr     r6,[r3],#4              @ np[j],np++
        mov     r11,#0
        umlal   r10,r11,r5,r2           @ ap[j]*bp[i]
        mov     r14,#0
        umlal   r12,r14,r6,r8           @ np[j]*n0
        adc     r11,r11,#0
        ldr     r7,[r4,#8]              @ tp[j+1]
        adds    r12,r12,r10
        str     r12,[r4],#4             @ tp[j-1]=,tp++
        adc     r12,r14,#0
        cmp     r4,r0
        bne     .Linner

        adds    r12,r12,r11
        mov     r14,#0
        ldr     r4,[r0,#13*4]           @ restore bp
        adc     r14,r14,#0
        ldr     r8,[r0,#14*4]           @ restore n0
        adds    r12,r12,r7
        ldr     r7,[r0,#15*4]           @ restore &bp[num-1]
        adc     r14,r14,#0
        str     r12,[r0]                @ tp[num-1]=
        str     r14,[r0,#4]             @ tp[num]=

        cmp     r4,r7                   @ was that the last word of bp?
        bne     .Louter

        ldr     r2,[r0,#12*4]           @ pull rp
        add     r0,r0,#4                @ r0 to point at &tp[num]
        sub     r5,r0,sp                @ "original" num value
        mov     r4,sp                   @ "rewind" r4
        mov     r1,r4                   @ "borrow" r1
        sub     r3,r3,r5                @ "rewind" r3 to &np[0]

        subs    r7,r7,r7                @ "clear" carry flag (C=1, i.e. no borrow)
.Lsub:
        ldr     r7,[r4],#4
        ldr     r6,[r3],#4
        sbcs    r7,r7,r6                @ tp[j]-np[j]
        str     r7,[r2],#4              @ rp[j]=
        teq     r4,r0                   @ preserve carry
        bne     .Lsub
        sbcs    r14,r14,#0              @ upmost carry
        mov     r4,sp                   @ "rewind" r4
        sub     r2,r2,r5                @ "rewind" r2

        @ Select between tp and the already stored tp-np without a
        @ data-dependent branch or address: both words are always loaded
        @ and a word is always stored; only the register move is conditional.
.Lcopy:
        ldr     r7,[r4]                 @ conditional copy
        ldr     r5,[r2]
        str     sp,[r4],#4              @ zap tp
#ifdef  __thumb2__
        it      cc
#endif
        movcc   r5,r7                   @ borrow (C=0): keep tp[j]
        str     r5,[r2],#4
        teq     r4,r0                   @ preserve carry
        bne     .Lcopy

        add     sp,r0,#4                @ skip over tp[num+1]
        ldmia   sp!,{r4-r12,lr}         @ restore registers
        add     sp,sp,#2*4              @ skip over {r0,r2}
        mov     r0,#1
.Labrt:
#if __ARM_ARCH__>=5
        bx      lr                      @ .word 0xe12fff1e
#else
        tst     lr,#1
        moveq   pc,lr                   @ be binary compatible with V4, yet
        .word   0xe12fff1e              @ interoperable with Thumb ISA:-)
#endif
.size   bn_mul_mont,.-bn_mul_mont
#if __ARM_MAX_ARCH__>=7
.arch   armv7-a
.fpu    neon

/*
 * bn_mul8x_mont_neon -- NEON path of the Montgomery multiplication.
 *
 * File-local entry point (no .global); reached only by the branch in
 * bn_mul_mont, i.e. with num a multiple of 8, r0-r3 holding the original
 * rp/ap/bp/np and sp at bn_mul_mont's entry value, so that [sp] is the
 * pointer to n0 and [sp,#4] is num.
 *
 * Saves r4-r11 and d8-d15 (32 + 64 = 96 bytes) below the entry sp, which
 * is kept in ip, then carves a 64-byte aligned scratch frame of at least
 * 16*num + 16 bytes below them.  The scratch frame is overwritten with
 * zeros before returning.
 *
 * r0 is not loaded with an explicit status here: on return it holds rp
 * advanced by num words.
 */
.type   bn_mul8x_mont_neon,%function
.align  5
bn_mul8x_mont_neon:
        mov     ip,sp
        stmdb   sp!,{r4-r11}
        vstmdb  sp!,{d8-d15}            @ ABI specification says so
        ldmia   ip,{r4-r5}              @ load rest of parameter block

        sub     r7,sp,#16
        vld1.32 {d28[0]}, [r2,:32]!
        sub     r7,r7,r5,lsl#4
        vld1.32 {d0-d3}, [r1]!          @ cannot specify :32 :-(
        and     r7,r7,#-64
        vld1.32 {d30[0]}, [r4,:32]
        mov     sp,r7                   @ alloca
        veor    d8,d8,d8
        subs    r8,r5,#8
        vzip.16 d28,d8

        vmull.u32       q6,d28,d0[0]
        vmull.u32       q7,d28,d0[1]
        vmull.u32       q8,d28,d1[0]
        vshl.i64        d10,d13,#16
        vmull.u32       q9,d28,d1[1]

        vadd.u64        d10,d10,d12
        veor    d8,d8,d8
        vmul.u32        d29,d10,d30

        vmull.u32       q10,d28,d2[0]
        vld1.32 {d4-d7}, [r3]!
        vmull.u32       q11,d28,d2[1]
        vmull.u32       q12,d28,d3[0]
        vzip.16 d29,d8
        vmull.u32       q13,d28,d3[1]

        bne     .LNEON_1st              @ flags from "subs r8,r5,#8": num != 8

        @ special case for num=8, everything is in register bank...

        vmlal.u32       q6,d29,d4[0]
        sub     r9,r5,#1
        vmlal.u32       q7,d29,d4[1]
        vmlal.u32       q8,d29,d5[0]
        vmlal.u32       q9,d29,d5[1]

        vmlal.u32       q10,d29,d6[0]
        vmov    q5,q6
        vmlal.u32       q11,d29,d6[1]
        vmov    q6,q7
        vmlal.u32       q12,d29,d7[0]
        vmov    q7,q8
        vmlal.u32       q13,d29,d7[1]
        vmov    q8,q9
        vmov    q9,q10
        vshr.u64        d10,d10,#16
        vmov    q10,q11
        vmov    q11,q12
        vadd.u64        d10,d10,d11
        vmov    q12,q13
        veor    q13,q13
        vshr.u64        d10,d10,#16

        b       .LNEON_outer8

.align  4
.LNEON_outer8:
        vld1.32 {d28[0]}, [r2,:32]!
        veor    d8,d8,d8
        vzip.16 d28,d8
        vadd.u64        d12,d12,d10

        vmlal.u32       q6,d28,d0[0]
        vmlal.u32       q7,d28,d0[1]
        vmlal.u32       q8,d28,d1[0]
        vshl.i64        d10,d13,#16
        vmlal.u32       q9,d28,d1[1]

        vadd.u64        d10,d10,d12
        veor    d8,d8,d8
        subs    r9,r9,#1
        vmul.u32        d29,d10,d30

        vmlal.u32       q10,d28,d2[0]
        vmlal.u32       q11,d28,d2[1]
        vmlal.u32       q12,d28,d3[0]
        vzip.16 d29,d8
        vmlal.u32       q13,d28,d3[1]

        vmlal.u32       q6,d29,d4[0]
        vmlal.u32       q7,d29,d4[1]
        vmlal.u32       q8,d29,d5[0]
        vmlal.u32       q9,d29,d5[1]

        vmlal.u32       q10,d29,d6[0]
        vmov    q5,q6
        vmlal.u32       q11,d29,d6[1]
        vmov    q6,q7
        vmlal.u32       q12,d29,d7[0]
        vmov    q7,q8
        vmlal.u32       q13,d29,d7[1]
        vmov    q8,q9
        vmov    q9,q10
        vshr.u64        d10,d10,#16
        vmov    q10,q11
        vmov    q11,q12
        vadd.u64        d10,d10,d11
        vmov    q12,q13
        veor    q13,q13
        vshr.u64        d10,d10,#16

        bne     .LNEON_outer8

        vadd.u64        d12,d12,d10
        mov     r7,sp
        vshr.u64        d10,d12,#16
        mov     r8,r5
        vadd.u64        d13,d13,d10
        add     r6,sp,#16
        vshr.u64        d10,d13,#16
        vzip.16 d12,d13

        b       .LNEON_tail2

.align  4
.LNEON_1st:
        vmlal.u32       q6,d29,d4[0]
        vld1.32 {d0-d3}, [r1]!
        vmlal.u32       q7,d29,d4[1]
        subs    r8,r8,#8
        vmlal.u32       q8,d29,d5[0]
        vmlal.u32       q9,d29,d5[1]

        vmlal.u32       q10,d29,d6[0]
        vld1.32 {d4-d5}, [r3]!
        vmlal.u32       q11,d29,d6[1]
        vst1.64 {q6-q7}, [r7,:256]!
        vmlal.u32       q12,d29,d7[0]
        vmlal.u32       q13,d29,d7[1]
        vst1.64 {q8-q9}, [r7,:256]!

        vmull.u32       q6,d28,d0[0]
        vld1.32 {d6-d7}, [r3]!
        vmull.u32       q7,d28,d0[1]
        vst1.64 {q10-q11}, [r7,:256]!
        vmull.u32       q8,d28,d1[0]
        vmull.u32       q9,d28,d1[1]
        vst1.64 {q12-q13}, [r7,:256]!

        vmull.u32       q10,d28,d2[0]
        vmull.u32       q11,d28,d2[1]
        vmull.u32       q12,d28,d3[0]
        vmull.u32       q13,d28,d3[1]

        bne     .LNEON_1st

        vmlal.u32       q6,d29,d4[0]
        add     r6,sp,#16
        vmlal.u32       q7,d29,d4[1]
        sub     r1,r1,r5,lsl#2          @ rewind r1
        vmlal.u32       q8,d29,d5[0]
        vld1.64 {q5}, [sp,:128]
        vmlal.u32       q9,d29,d5[1]
        sub     r9,r5,#1

        vmlal.u32       q10,d29,d6[0]
        vst1.64 {q6-q7}, [r7,:256]!
        vmlal.u32       q11,d29,d6[1]
        vshr.u64        d10,d10,#16
        vld1.64 {q6}, [r6, :128]!
        vmlal.u32       q12,d29,d7[0]
        vst1.64 {q8-q9}, [r7,:256]!
        vmlal.u32       q13,d29,d7[1]

        vst1.64 {q10-q11}, [r7,:256]!
        vadd.u64        d10,d10,d11
        veor    q4,q4,q4
        vst1.64 {q12-q13}, [r7,:256]!
        vld1.64 {q7-q8}, [r6, :256]!
        vst1.64 {q4}, [r7,:128]
        vshr.u64        d10,d10,#16

        b       .LNEON_outer

.align  4
.LNEON_outer:
        vld1.32 {d28[0]}, [r2,:32]!
        sub     r3,r3,r5,lsl#2          @ rewind r3
        vld1.32 {d0-d3}, [r1]!
        veor    d8,d8,d8
        mov     r7,sp
        vzip.16 d28,d8
        sub     r8,r5,#8
        vadd.u64        d12,d12,d10

        vmlal.u32       q6,d28,d0[0]
        vld1.64 {q9-q10},[r6,:256]!
        vmlal.u32       q7,d28,d0[1]
        vmlal.u32       q8,d28,d1[0]
        vld1.64 {q11-q12},[r6,:256]!
        vmlal.u32       q9,d28,d1[1]

        vshl.i64        d10,d13,#16
        veor    d8,d8,d8
        vadd.u64        d10,d10,d12
        vld1.64 {q13},[r6,:128]!
        vmul.u32        d29,d10,d30

        vmlal.u32       q10,d28,d2[0]
        vld1.32 {d4-d7}, [r3]!
        vmlal.u32       q11,d28,d2[1]
        vmlal.u32       q12,d28,d3[0]
        vzip.16 d29,d8
        vmlal.u32       q13,d28,d3[1]

.LNEON_inner:
        vmlal.u32       q6,d29,d4[0]
        vld1.32 {d0-d3}, [r1]!
        vmlal.u32       q7,d29,d4[1]
        subs    r8,r8,#8
        vmlal.u32       q8,d29,d5[0]
        vmlal.u32       q9,d29,d5[1]
        vst1.64 {q6-q7}, [r7,:256]!

        vmlal.u32       q10,d29,d6[0]
        vld1.64 {q6}, [r6, :128]!
        vmlal.u32       q11,d29,d6[1]
        vst1.64 {q8-q9}, [r7,:256]!
        vmlal.u32       q12,d29,d7[0]
        vld1.64 {q7-q8}, [r6, :256]!
        vmlal.u32       q13,d29,d7[1]
        vst1.64 {q10-q11}, [r7,:256]!

        vmlal.u32       q6,d28,d0[0]
        vld1.64 {q9-q10}, [r6, :256]!
        vmlal.u32       q7,d28,d0[1]
        vst1.64 {q12-q13}, [r7,:256]!
        vmlal.u32       q8,d28,d1[0]
        vld1.64 {q11-q12}, [r6, :256]!
        vmlal.u32       q9,d28,d1[1]
        vld1.32 {d4-d7}, [r3]!

        vmlal.u32       q10,d28,d2[0]
        vld1.64 {q13}, [r6, :128]!
        vmlal.u32       q11,d28,d2[1]
        vmlal.u32       q12,d28,d3[0]
        vmlal.u32       q13,d28,d3[1]

        bne     .LNEON_inner

        vmlal.u32       q6,d29,d4[0]
        add     r6,sp,#16
        vmlal.u32       q7,d29,d4[1]
        sub     r1,r1,r5,lsl#2          @ rewind r1
        vmlal.u32       q8,d29,d5[0]
        vld1.64 {q5}, [sp,:128]
        vmlal.u32       q9,d29,d5[1]
        subs    r9,r9,#1

        vmlal.u32       q10,d29,d6[0]
        vst1.64 {q6-q7}, [r7,:256]!
        vmlal.u32       q11,d29,d6[1]
        vld1.64 {q6}, [r6, :128]!
        vshr.u64        d10,d10,#16
        vst1.64 {q8-q9}, [r7,:256]!
        vmlal.u32       q12,d29,d7[0]
        vld1.64 {q7-q8}, [r6, :256]!
        vmlal.u32       q13,d29,d7[1]

        vst1.64 {q10-q11}, [r7,:256]!
        vadd.u64        d10,d10,d11
        vst1.64 {q12-q13}, [r7,:256]!
        vshr.u64        d10,d10,#16

        bne     .LNEON_outer

        mov     r7,sp
        mov     r8,r5

.LNEON_tail:
        vadd.u64        d12,d12,d10
        vld1.64 {q9-q10}, [r6, :256]!
        vshr.u64        d10,d12,#16
        vadd.u64        d13,d13,d10
        vld1.64 {q11-q12}, [r6, :256]!
        vshr.u64        d10,d13,#16
        vld1.64 {q13}, [r6, :128]!
        vzip.16 d12,d13

.LNEON_tail2:
        vadd.u64        d14,d14,d10
        vst1.32 {d12[0]}, [r7, :32]!
        vshr.u64        d10,d14,#16
        vadd.u64        d15,d15,d10
        vshr.u64        d10,d15,#16
        vzip.16 d14,d15

        vadd.u64        d16,d16,d10
        vst1.32 {d14[0]}, [r7, :32]!
        vshr.u64        d10,d16,#16
        vadd.u64        d17,d17,d10
        vshr.u64        d10,d17,#16
        vzip.16 d16,d17

        vadd.u64        d18,d18,d10
        vst1.32 {d16[0]}, [r7, :32]!
        vshr.u64        d10,d18,#16
        vadd.u64        d19,d19,d10
        vshr.u64        d10,d19,#16
        vzip.16 d18,d19

        vadd.u64        d20,d20,d10
        vst1.32 {d18[0]}, [r7, :32]!
        vshr.u64        d10,d20,#16
        vadd.u64        d21,d21,d10
        vshr.u64        d10,d21,#16
        vzip.16 d20,d21

        vadd.u64        d22,d22,d10
        vst1.32 {d20[0]}, [r7, :32]!
        vshr.u64        d10,d22,#16
        vadd.u64        d23,d23,d10
        vshr.u64        d10,d23,#16
        vzip.16 d22,d23

        vadd.u64        d24,d24,d10
        vst1.32 {d22[0]}, [r7, :32]!
        vshr.u64        d10,d24,#16
        vadd.u64        d25,d25,d10
        vld1.64 {q6}, [r6, :128]!
        vshr.u64        d10,d25,#16
        vzip.16 d24,d25

        vadd.u64        d26,d26,d10
        vst1.32 {d24[0]}, [r7, :32]!
        vshr.u64        d10,d26,#16
        vadd.u64        d27,d27,d10
        vld1.64 {q7-q8}, [r6, :256]!
        vshr.u64        d10,d27,#16
        vzip.16 d26,d27
        subs    r8,r8,#8
        vst1.32 {d26[0]}, [r7, :32]!

        bne     .LNEON_tail

        vst1.32 {d10[0]}, [r7, :32]     @ top-most bit
        sub     r3,r3,r5,lsl#2          @ rewind r3
        subs    r1,sp,#0                @ clear carry flag
        add     r2,sp,r5,lsl#2

.LNEON_sub:
        ldmia   r1!, {r4-r7}
        ldmia   r3!, {r8-r11}
        sbcs    r8, r4,r8
        sbcs    r9, r5,r9
        sbcs    r10,r6,r10
        sbcs    r11,r7,r11
        teq     r1,r2                   @ preserves carry
        stmia   r0!, {r8-r11}
        bne     .LNEON_sub

        ldr     r10, [r1]               @ load top-most bit
        veor    q0,q0,q0
        sub     r11,r2,sp               @ this is num*4
        veor    q1,q1,q1
        mov     r1,sp
        sub     r0,r0,r11               @ rewind r0
        mov     r3,r2                   @ second 3/4th of frame
        sbcs    r10,r10,#0              @ result is carry flag

.LNEON_copy_n_zap:
        ldmia   r1!, {r4-r7}
        ldmia   r0, {r8-r11}
        movcc   r8, r4
        vst1.64 {q0-q1}, [r3,:256]!     @ wipe
        movcc   r9, r5
        movcc   r10,r6
        vst1.64 {q0-q1}, [r3,:256]!     @ wipe
        movcc   r11,r7
        ldmia   r1, {r4-r7}
        stmia   r0!, {r8-r11}
        sub     r1,r1,#16
        ldmia   r0, {r8-r11}
        movcc   r8, r4
        vst1.64 {q0-q1}, [r1,:256]!     @ wipe
        movcc   r9, r5
        movcc   r10,r6
        vst1.64 {q0-q1}, [r3,:256]!     @ wipe
        movcc   r11,r7
        teq     r1,r2                   @ preserves carry
        stmia   r0!, {r8-r11}
        bne     .LNEON_copy_n_zap

        sub     sp,ip,#96               @ back to the saved d8-d15 / r4-r11 block
        vldmia  sp!,{d8-d15}
        ldmia   sp!,{r4-r11}
        bx      lr                      @ .word 0xe12fff1e
.size   bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
#endif
.asciz  "Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>"
.align  2
#if __ARM_MAX_ARCH__>=7
.comm   OPENSSL_armcap_P,4,4
#endif