#if defined(__aarch64__)
#include <openssl/arm_arch.h>

.text
.arch	armv8-a+crypto
.globl	gcm_init_v8
.hidden	gcm_init_v8
.type	gcm_init_v8,%function
.align	4
gcm_init_v8:
	ld1	{v17.2d},[x1]		//load input H
	movi	v19.16b,#0xe1
	shl	v19.2d,v19.2d,#57	//0xc2.0
	ext	v3.16b,v17.16b,v17.16b,#8
	ushr	v18.2d,v19.2d,#63
	dup	v17.4s,v17.s[1]
	ext	v16.16b,v18.16b,v19.16b,#8	//t0=0xc2....01
	ushr	v18.2d,v3.2d,#63
	sshr	v17.4s,v17.4s,#31	//broadcast carry bit
	and	v18.16b,v18.16b,v16.16b
	shl	v3.2d,v3.2d,#1
	ext	v18.16b,v18.16b,v18.16b,#8
	and	v16.16b,v16.16b,v17.16b
	orr	v3.16b,v3.16b,v18.16b	//H<<<=1
	eor	v20.16b,v3.16b,v16.16b	//twisted H
	st1	{v20.2d},[x0],#16	//store Htable[0]

	//calculate H^2
	ext	v16.16b,v20.16b,v20.16b,#8	//Karatsuba pre-processing
	pmull	v0.1q,v20.1d,v20.1d
	eor	v16.16b,v16.16b,v20.16b
	pmull2	v2.1q,v20.2d,v20.2d
	pmull	v1.1q,v16.1d,v16.1d

	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d	//1st phase

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v22.16b,v0.16b,v18.16b

	ext	v17.16b,v22.16b,v22.16b,#8	//Karatsuba pre-processing
	eor	v17.16b,v17.16b,v22.16b
	ext	v21.16b,v16.16b,v17.16b,#8	//pack Karatsuba pre-processed
	st1	{v21.2d,v22.2d},[x0]	//store Htable[1..2]

	ret
.size	gcm_init_v8,.-gcm_init_v8
.globl	gcm_gmult_v8
.hidden	gcm_gmult_v8
.type	gcm_gmult_v8,%function
.align	4
gcm_gmult_v8:
	ld1	{v17.2d},[x0]		//load Xi
	movi	v19.16b,#0xe1
	ld1	{v20.2d,v21.2d},[x1]	//load twisted H, ...
	shl	v19.2d,v19.2d,#57
#ifndef	__ARMEB__
	rev64	v17.16b,v17.16b
#endif
	ext	v3.16b,v17.16b,v17.16b,#8

	pmull	v0.1q,v20.1d,v3.1d	//H.lo·Xi.lo
	eor	v17.16b,v17.16b,v3.16b	//Karatsuba pre-processing
	pmull2	v2.1q,v20.2d,v3.2d	//H.hi·Xi.hi
	pmull	v1.1q,v21.1d,v17.1d	//(H.lo+H.hi)·(Xi.lo+Xi.hi)

	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d	//1st phase of reduction

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b

#ifndef	__ARMEB__
	rev64	v0.16b,v0.16b
#endif
	ext	v0.16b,v0.16b,v0.16b,#8
	st1	{v0.2d},[x0]		//write out Xi

	ret
.size	gcm_gmult_v8,.-gcm_gmult_v8
.globl	gcm_ghash_v8
.hidden	gcm_ghash_v8
.type	gcm_ghash_v8,%function
.align	4
gcm_ghash_v8:
	ld1	{v0.2d},[x0]		//load [rotated] Xi
					//"[rotated]" means that
					//loaded value would have
					//to be rotated in order to
					//make it appear as in
					//algorithm specification
	subs	x3,x3,#32		//see if x3 is 32 or larger
	mov	x12,#16			//x12 is used as post-
					//increment for input pointer;
					//as loop is modulo-scheduled
					//x12 is zeroed just in time
					//to preclude overstepping
					//inp[len], which means that
					//last block[s] are actually
					//loaded twice, but last
					//copy is not processed
	ld1	{v20.2d,v21.2d},[x1],#32	//load twisted H, ..., H^2
	movi	v19.16b,#0xe1
	ld1	{v22.2d},[x1]
	csel	x12,xzr,x12,eq		//is it time to zero x12?
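	//The loop below consumes two 16-byte blocks per iteration using
	//aggregated reduction: Xi is folded into I[i], the sum is multiplied
	//by H^2 (v22) while I[i+1] is multiplied by H (v20), the partial
	//products are accumulated, and the 256-bit result is reduced with
	//the two-phase folding against the 0xc2.0 constant held in v19.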
	ext	v0.16b,v0.16b,v0.16b,#8	//rotate Xi
	ld1	{v16.2d},[x2],#16	//load [rotated] I[0]
	shl	v19.2d,v19.2d,#57	//compose 0xc2.0 constant
#ifndef	__ARMEB__
	rev64	v16.16b,v16.16b
	rev64	v0.16b,v0.16b
#endif
	ext	v3.16b,v16.16b,v16.16b,#8	//rotate I[0]
	b.lo	.Lodd_tail_v8		//x3 was less than 32
	ld1	{v17.2d},[x2],x12	//load [rotated] I[1]
#ifndef	__ARMEB__
	rev64	v17.16b,v17.16b
#endif
	ext	v7.16b,v17.16b,v17.16b,#8
	eor	v3.16b,v3.16b,v0.16b	//I[i]^=Xi
	pmull	v4.1q,v20.1d,v7.1d	//H·Ii+1
	eor	v17.16b,v17.16b,v7.16b	//Karatsuba pre-processing
	pmull2	v6.1q,v20.2d,v7.2d
	b	.Loop_mod2x_v8

.align	4
.Loop_mod2x_v8:
	ext	v18.16b,v3.16b,v3.16b,#8
	subs	x3,x3,#32		//is there more data?
	pmull	v0.1q,v22.1d,v3.1d	//H^2.lo·Xi.lo
	csel	x12,xzr,x12,lo		//is it time to zero x12?

	pmull	v5.1q,v21.1d,v17.1d
	eor	v18.16b,v18.16b,v3.16b	//Karatsuba pre-processing
	pmull2	v2.1q,v22.2d,v3.2d	//H^2.hi·Xi.hi
	eor	v0.16b,v0.16b,v4.16b	//accumulate
	pmull2	v1.1q,v21.2d,v18.2d	//(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
	ld1	{v16.2d},[x2],x12	//load [rotated] I[i+2]

	eor	v2.16b,v2.16b,v6.16b
	csel	x12,xzr,x12,eq		//is it time to zero x12?
	eor	v1.16b,v1.16b,v5.16b

	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v17.2d},[x2],x12	//load [rotated] I[i+3]
#ifndef	__ARMEB__
	rev64	v16.16b,v16.16b
#endif
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d	//1st phase of reduction

#ifndef	__ARMEB__
	rev64	v17.16b,v17.16b
#endif
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	ext	v7.16b,v17.16b,v17.16b,#8
	ext	v3.16b,v16.16b,v16.16b,#8
	eor	v0.16b,v1.16b,v18.16b
	pmull	v4.1q,v20.1d,v7.1d	//H·Ii+1
	eor	v3.16b,v3.16b,v2.16b	//accumulate v3.16b early

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v3.16b,v3.16b,v18.16b
	eor	v17.16b,v17.16b,v7.16b	//Karatsuba pre-processing
	eor	v3.16b,v3.16b,v0.16b
	pmull2	v6.1q,v20.2d,v7.2d
	b.hs	.Loop_mod2x_v8		//there was at least 32 more bytes

	eor	v2.16b,v2.16b,v18.16b
	ext	v3.16b,v16.16b,v16.16b,#8	//re-construct v3.16b
	adds	x3,x3,#32		//re-construct x3
	eor	v0.16b,v0.16b,v2.16b	//re-construct v0.16b
	b.eq	.Ldone_v8		//is x3 zero?
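	//.Lodd_tail_v8 handles the final single 16-byte block (remaining
	//length 16): the block is folded into Xi and multiplied by H using
	//the same Karatsuba multiply and two-phase reduction as gcm_gmult_v8.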
.Lodd_tail_v8:
	ext	v18.16b,v0.16b,v0.16b,#8
	eor	v3.16b,v3.16b,v0.16b	//inp^=Xi
	eor	v17.16b,v16.16b,v18.16b	//v17.16b is rotated inp^Xi

	pmull	v0.1q,v20.1d,v3.1d	//H.lo·Xi.lo
	eor	v17.16b,v17.16b,v3.16b	//Karatsuba pre-processing
	pmull2	v2.1q,v20.2d,v3.2d	//H.hi·Xi.hi
	pmull	v1.1q,v21.1d,v17.1d	//(H.lo+H.hi)·(Xi.lo+Xi.hi)

	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d	//1st phase of reduction

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b

.Ldone_v8:
#ifndef	__ARMEB__
	rev64	v0.16b,v0.16b
#endif
	ext	v0.16b,v0.16b,v0.16b,#8
	st1	{v0.2d},[x0]		//write out Xi

	ret
.size	gcm_ghash_v8,.-gcm_ghash_v8
.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
#endif
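/* C-side prototypes for the entry points above, with x0..x3 carrying the
 * arguments in order (x0=Xi or Htable out, x1=H or Htable, x2=inp, x3=len).
 * The exact declarations, in particular the u128 Htable element type, are
 * assumptions based on the usual OpenSSL/BoringSSL GCM glue code and are
 * not part of this file:
 *	void gcm_init_v8(u128 Htable[16], const uint64_t H[2]);
 *	void gcm_gmult_v8(uint64_t Xi[2], const u128 Htable[16]);
 *	void gcm_ghash_v8(uint64_t Xi[2], const u128 Htable[16],
 *	                  const uint8_t *inp, size_t len);
 */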