#include "arm_arch.h"

.text
.code	32

@-----------------------------------------------------------------------
@ mul_1x1_ialu — internal helper (local label, not exported).
@
@ Carry-less (GF(2)[x]) 32x32 -> 64-bit multiplication, integer-only
@ (non-NEON) path.
@
@ In:    r0  = b
@        r1  = a
@        r12 = 7<<2  (3-bit window mask, pre-scaled by 4 for word
@               indexing; set up by the caller, see bn_GF2m_mul_2x2)
@        sp  -> 32-byte scratch area tab[8] (allocated by the caller)
@ Out:   r5  = low 32 bits,  r4 = high 32 bits of a*b over GF(2)
@ Clobbers: r4-r9, flags.  Returns via lr.
@
@ Method: build tab[i] = (i2*a4) ^ (i1*a2) ^ (i0*a1) for i = 0..7 from
@ the low 30 bits of a (a1 = a & 0x3fffffff, a2 = a1<<1, a4 = a1<<2,
@ so table entries never overflow 32 bits), then scan b in 3-bit
@ windows, accumulating each looked-up entry shifted into the 64-bit
@ result r5:r4.  The two bits of a masked off above (bits 30 and 31)
@ are folded in at the end with conditional eors (tst #1<<30 / #1<<31).
@ Loads and eors are interleaved to hide load-use latency.
@-----------------------------------------------------------------------
.type	mul_1x1_ialu,%function
.align	5
mul_1x1_ialu:
	mov	r4,#0
	bic	r5,r1,#3<<30	@ a1=a&0x3fffffff
	str	r4,[sp,#0]	@ tab[0]=0
	add	r6,r5,r5	@ a2=a1<<1
	str	r5,[sp,#4]	@ tab[1]=a1
	eor	r7,r5,r6	@ a1^a2
	str	r6,[sp,#8]	@ tab[2]=a2
	mov	r8,r5,lsl#2	@ a4=a1<<2
	str	r7,[sp,#12]	@ tab[3]=a1^a2
	eor	r9,r5,r8	@ a1^a4
	str	r8,[sp,#16]	@ tab[4]=a4
	eor	r4,r6,r8	@ a2^a4
	str	r9,[sp,#20]	@ tab[5]=a1^a4
	eor	r7,r7,r8	@ a1^a2^a4
	str	r4,[sp,#24]	@ tab[6]=a2^a4
	and	r8,r12,r0,lsl#2	@ (b & 0x7) << 2, word offset into tab
	str	r7,[sp,#28]	@ tab[7]=a1^a2^a4

	@ Main 3-bit-window scan of b.  Each window's table entry is
	@ eor-ed into r5:r4 at its bit position (lsl into the low word,
	@ matching lsr into the high word).
	and	r9,r12,r0,lsr#1
	ldr	r5,[sp,r8]	@ tab[b & 0x7]
	and	r8,r12,r0,lsr#4
	ldr	r7,[sp,r9]	@ tab[b >> 3 & 0x7]
	and	r9,r12,r0,lsr#7
	ldr	r6,[sp,r8]	@ tab[b >> 6 & 0x7]
	eor	r5,r5,r7,lsl#3	@ stall
	mov	r4,r7,lsr#29
	ldr	r7,[sp,r9]	@ tab[b >> 9 & 0x7]

	and	r8,r12,r0,lsr#10
	eor	r5,r5,r6,lsl#6
	eor	r4,r4,r6,lsr#26
	ldr	r6,[sp,r8]	@ tab[b >> 12 & 0x7]

	and	r9,r12,r0,lsr#13
	eor	r5,r5,r7,lsl#9
	eor	r4,r4,r7,lsr#23
	ldr	r7,[sp,r9]	@ tab[b >> 15 & 0x7]

	and	r8,r12,r0,lsr#16
	eor	r5,r5,r6,lsl#12
	eor	r4,r4,r6,lsr#20
	ldr	r6,[sp,r8]	@ tab[b >> 18 & 0x7]

	and	r9,r12,r0,lsr#19
	eor	r5,r5,r7,lsl#15
	eor	r4,r4,r7,lsr#17
	ldr	r7,[sp,r9]	@ tab[b >> 21 & 0x7]

	and	r8,r12,r0,lsr#22
	eor	r5,r5,r6,lsl#18
	eor	r4,r4,r6,lsr#14
	ldr	r6,[sp,r8]	@ tab[b >> 24 & 0x7]

	and	r9,r12,r0,lsr#25
	eor	r5,r5,r7,lsl#21
	eor	r4,r4,r7,lsr#11
	ldr	r7,[sp,r9]	@ tab[b >> 27 & 0x7]

	@ Last window is only 2 bits wide (b >> 30); meanwhile start
	@ compensating for bit 30 of a, which was masked out of tab[].
	tst	r1,#1<<30
	and	r8,r12,r0,lsr#28
	eor	r5,r5,r6,lsl#24
	eor	r4,r4,r6,lsr#8
	ldr	r6,[sp,r8]	@ tab[b >> 30      ]

	eorne	r5,r5,r0,lsl#30	@ if a bit30 set: fold in b << 30
	eorne	r4,r4,r0,lsr#2
	tst	r1,#1<<31	@ same for bit 31 of a
	eor	r5,r5,r7,lsl#27
	eor	r4,r4,r7,lsr#5
	eorne	r5,r5,r0,lsl#31	@ if a bit31 set: fold in b << 31
	eorne	r4,r4,r0,lsr#1
	eor	r5,r5,r6,lsl#30
	eor	r4,r4,r6,lsr#2

	mov	pc,lr
.size	mul_1x1_ialu,.-mul_1x1_ialu

@ bn_GF2m_mul_2x2 — public entry point; body follows below.
.global	bn_GF2m_mul_2x2
.type	bn_GF2m_mul_2x2,%function
.align	5
bn_GF2m_mul_2x2:
@ Body of bn_GF2m_mul_2x2 (label emitted just above).
@
@ C prototype (from the register/stack usage below — confirm against the
@ declaring header):
@   void bn_GF2m_mul_2x2(u32 r[4], u32 a1, u32 a0, u32 b1, u32 b0);
@ i.e. r0 = result array, r1 = a1, r2 = a0, r3 = b1, [sp] = b0 (5th arg).
@ Computes the 128-bit carry-less product (a1:a0) * (b1:b0) over GF(2)[x]
@ into r[0..3], least-significant word first.
#if __ARM_MAX_ARCH__>=7
	@ Runtime CPU dispatch: load OPENSSL_armcap_P PC-relatively and
	@ branch to the NEON implementation when bit 0 is set
	@ (presumably the NEON capability flag — matches the usual
	@ OpenSSL armcap convention; confirm against arm_arch.h).
	ldr	r12,.LOPENSSL_armcap
.Lpic:	ldr	r12,[pc,r12]
	tst	r12,#1
	bne	.LNEON
#endif
	@ --- integer-only path: Karatsuba with three mul_1x1_ialu calls ---
	stmdb	sp!,{r4-r10,lr}
	mov	r10,r0			@ reassign 1st argument
	mov	r0,r3			@ r0=b1
	ldr	r3,[sp,#32]		@ load b0
	mov	r12,#7<<2		@ window mask for mul_1x1_ialu
	sub	sp,sp,#32		@ allocate tab[8]

	bl	mul_1x1_ialu		@ a1*b1
	str	r5,[r10,#8]		@ high product -> r[2], r[3]
	str	r4,[r10,#12]

	@ Swap (a1,b1) <-> (a0,b0) via triple-eor exchanges.
	eor	r0,r0,r3		@ flip b0 and b1
	eor	r1,r1,r2		@ flip a0 and a1
	eor	r3,r3,r0
	eor	r2,r2,r1
	eor	r0,r0,r3
	eor	r1,r1,r2
	bl	mul_1x1_ialu		@ a0*b0
	str	r5,[r10]		@ low product -> r[0], r[1]
	str	r4,[r10,#4]

	eor	r1,r1,r2		@ r1 = a1^a0, r0 = b1^b0
	eor	r0,r0,r3
	bl	mul_1x1_ialu		@ (a1+a0)*(b1+b0)
	@ Karatsuba middle term: m ^= lo ^ hi, folded into r[1..2].
	ldmia	r10,{r6-r9}		@ r6..r9 = r[0..3]
	eor	r5,r5,r4
	eor	r4,r4,r7
	eor	r5,r5,r6
	eor	r4,r4,r8
	eor	r5,r5,r9
	eor	r4,r4,r9
	str	r4,[r10,#8]		@ r[2] updated
	eor	r5,r5,r4
	add	sp,sp,#32		@ destroy tab[8]
	str	r5,[r10,#4]		@ r[1] updated

#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r10,pc}
#else
	ldmia	sp!,{r4-r10,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	.word	0xe12fff1e		@ interoperable with Thumb ISA:-)
#endif
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

@ --- NEON path: one 64x64 -> 128-bit polynomial multiplication built
@ from 8x8-bit VMULL.P8 partial products; vext rotations of A and B
@ generate the shifted partials and the vand masks cancel the terms
@ that would otherwise be counted twice (per-step comments below are
@ from the original code).
.align	5
.LNEON:
	ldr	r12, [sp]		@ 5th argument
	vmov.32	d26, r2, r1		@ d26 = A = a0:a1
	vmov.32	d27, r12, r3		@ d27 = B = b0:b1
	vmov.i64	d28, #0x0000ffffffffffff
	vmov.i64	d29, #0x00000000ffffffff
	vmov.i64	d30, #0x000000000000ffff

	vext.8	d2, d26, d26, #1	@ A1
	vmull.p8	q1, d2, d27		@ F = A1*B
	vext.8	d0, d27, d27, #1	@ B1
	vmull.p8	q0, d26, d0		@ E = A*B1
	vext.8	d4, d26, d26, #2	@ A2
	vmull.p8	q2, d4, d27		@ H = A2*B
	vext.8	d16, d27, d27, #2	@ B2
	vmull.p8	q8, d26, d16		@ G = A*B2
	vext.8	d6, d26, d26, #3	@ A3
	veor	q1, q1, q0		@ L = E + F
	vmull.p8	q3, d6, d27		@ J = A3*B
	vext.8	d0, d27, d27, #3	@ B3
	veor	q2, q2, q8		@ M = G + H
	vmull.p8	q0, d26, d0		@ I = A*B3
	veor	d2, d2, d3		@ t0 = (L) (P0 + P1) << 8
	vand	d3, d3, d28
	vext.8	d16, d27, d27, #4	@ B4
	veor	d4, d4, d5		@ t1 = (M) (P2 + P3) << 16
	vand	d5, d5, d29
	vmull.p8	q8, d26, d16		@ K = A*B4
	veor	q3, q3, q0		@ N = I + J
	veor	d2, d2, d3
	veor	d4, d4, d5
	veor	d6, d6, d7		@ t2 = (N) (P4 + P5) << 24
	vand	d7, d7, d30
	vext.8	q1, q1, q1, #15
	veor	d16, d16, d17		@ t3 = (K) (P6 + P7) << 32
	vmov.i64	d17, #0
	vext.8	q2, q2, q2, #14
	veor	d6, d6, d7
	vmull.p8	q0, d26, d27		@ D = A*B
	vext.8	q8, q8, q8, #12
	vext.8	q3, q3, q3, #13
	@ Sum all shifted partials into q0 = full 128-bit product.
	veor	q1, q1, q2
	veor	q3, q3, q8
	veor	q0, q0, q1
	veor	q0, q0, q3

	vst1.32	{q0}, [r0]		@ store r[0..3]
	bx	lr			@ bx lr
#endif
.size	bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
#if __ARM_MAX_ARCH__>=7
@ PC-relative offset to OPENSSL_armcap_P, consumed at .Lpic above
@ (+8 accounts for the ARM-mode pipeline PC bias).
.align	5
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-(.Lpic+8)
#endif
.asciz	"GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>"
.align	5

#if __ARM_MAX_ARCH__>=7
.comm	OPENSSL_armcap_P,4,4
#endif