1/*===--------------- sm4intrin.h - SM4 intrinsics -----------------=== 2 * 3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 * See https://llvm.org/LICENSE.txt for license information. 5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 * 7 *===-----------------------------------------------------------------------=== 8 */ 9 10#ifndef __IMMINTRIN_H 11#error "Never use <sm4intrin.h> directly; include <immintrin.h> instead." 12#endif // __IMMINTRIN_H 13 14#ifndef __SM4INTRIN_H 15#define __SM4INTRIN_H 16 17/// This intrinsic performs four rounds of SM4 key expansion. The intrinsic 18/// operates on independent 128-bit lanes. The calculated results are 19/// stored in \a dst. 20/// \headerfile <immintrin.h> 21/// 22/// \code 23/// __m128i _mm_sm4key4_epi32(__m128i __A, __m128i __B) 24/// \endcode 25/// 26/// This intrinsic corresponds to the \c VSM4KEY4 instruction. 27/// 28/// \param __A 29/// A 128-bit vector of [4 x int]. 30/// \param __B 31/// A 128-bit vector of [4 x int]. 32/// \returns 33/// A 128-bit vector of [4 x int]. 
34/// 35/// \code{.operation} 36/// DEFINE ROL32(dword, n) { 37/// count := n % 32 38/// dest := (dword << count) | (dword >> (32-count)) 39/// RETURN dest 40/// } 41/// DEFINE SBOX_BYTE(dword, i) { 42/// RETURN sbox[dword.byte[i]] 43/// } 44/// DEFINE lower_t(dword) { 45/// tmp.byte[0] := SBOX_BYTE(dword, 0) 46/// tmp.byte[1] := SBOX_BYTE(dword, 1) 47/// tmp.byte[2] := SBOX_BYTE(dword, 2) 48/// tmp.byte[3] := SBOX_BYTE(dword, 3) 49/// RETURN tmp 50/// } 51/// DEFINE L_KEY(dword) { 52/// RETURN dword ^ ROL32(dword, 13) ^ ROL32(dword, 23) 53/// } 54/// DEFINE T_KEY(dword) { 55/// RETURN L_KEY(lower_t(dword)) 56/// } 57/// DEFINE F_KEY(X0, X1, X2, X3, round_key) { 58/// RETURN X0 ^ T_KEY(X1 ^ X2 ^ X3 ^ round_key) 59/// } 60/// FOR i:= 0 to 0 61/// P[0] := __B.xmm[i].dword[0] 62/// P[1] := __B.xmm[i].dword[1] 63/// P[2] := __B.xmm[i].dword[2] 64/// P[3] := __B.xmm[i].dword[3] 65/// C[0] := F_KEY(P[0], P[1], P[2], P[3], __A.xmm[i].dword[0]) 66/// C[1] := F_KEY(P[1], P[2], P[3], C[0], __A.xmm[i].dword[1]) 67/// C[2] := F_KEY(P[2], P[3], C[0], C[1], __A.xmm[i].dword[2]) 68/// C[3] := F_KEY(P[3], C[0], C[1], C[2], __A.xmm[i].dword[3]) 69/// DEST.xmm[i].dword[0] := C[0] 70/// DEST.xmm[i].dword[1] := C[1] 71/// DEST.xmm[i].dword[2] := C[2] 72/// DEST.xmm[i].dword[3] := C[3] 73/// ENDFOR 74/// DEST[MAX:128] := 0 75/// \endcode 76#define _mm_sm4key4_epi32(A, B) \ 77 (__m128i) __builtin_ia32_vsm4key4128((__v4su)A, (__v4su)B) 78 79/// This intrinsic performs four rounds of SM4 key expansion. The intrinsic 80/// operates on independent 128-bit lanes. The calculated results are 81/// stored in \a dst. 82/// \headerfile <immintrin.h> 83/// 84/// \code 85/// __m256i _mm256_sm4key4_epi32(__m256i __A, __m256i __B) 86/// \endcode 87/// 88/// This intrinsic corresponds to the \c VSM4KEY4 instruction. 89/// 90/// \param __A 91/// A 256-bit vector of [8 x int]. 92/// \param __B 93/// A 256-bit vector of [8 x int]. 94/// \returns 95/// A 256-bit vector of [8 x int]. 
///
/// \code{.operation}
/// DEFINE ROL32(dword, n) {
///   count := n % 32
///   dest := (dword << count) | (dword >> (32-count))
///   RETURN dest
/// }
/// DEFINE SBOX_BYTE(dword, i) {
///   RETURN sbox[dword.byte[i]]
/// }
/// DEFINE lower_t(dword) {
///   tmp.byte[0] := SBOX_BYTE(dword, 0)
///   tmp.byte[1] := SBOX_BYTE(dword, 1)
///   tmp.byte[2] := SBOX_BYTE(dword, 2)
///   tmp.byte[3] := SBOX_BYTE(dword, 3)
///   RETURN tmp
/// }
/// DEFINE L_KEY(dword) {
///   RETURN dword ^ ROL32(dword, 13) ^ ROL32(dword, 23)
/// }
/// DEFINE T_KEY(dword) {
///   RETURN L_KEY(lower_t(dword))
/// }
/// DEFINE F_KEY(X0, X1, X2, X3, round_key) {
///   RETURN X0 ^ T_KEY(X1 ^ X2 ^ X3 ^ round_key)
/// }
/// FOR i:= 0 to 1
///   P[0] := __B.xmm[i].dword[0]
///   P[1] := __B.xmm[i].dword[1]
///   P[2] := __B.xmm[i].dword[2]
///   P[3] := __B.xmm[i].dword[3]
///   C[0] := F_KEY(P[0], P[1], P[2], P[3], __A.xmm[i].dword[0])
///   C[1] := F_KEY(P[1], P[2], P[3], C[0], __A.xmm[i].dword[1])
///   C[2] := F_KEY(P[2], P[3], C[0], C[1], __A.xmm[i].dword[2])
///   C[3] := F_KEY(P[3], C[0], C[1], C[2], __A.xmm[i].dword[3])
///   DEST.xmm[i].dword[0] := C[0]
///   DEST.xmm[i].dword[1] := C[1]
///   DEST.xmm[i].dword[2] := C[2]
///   DEST.xmm[i].dword[3] := C[3]
/// ENDFOR
/// DEST[MAX:256] := 0
/// \endcode
/* Parenthesize the macro arguments so cast binding is correct even when a
 * caller passes a compound expression. */
#define _mm256_sm4key4_epi32(A, B)                                             \
  (__m256i)__builtin_ia32_vsm4key4256((__v8su)(A), (__v8su)(B))

/// This intrinsic performs four rounds of SM4 encryption. The intrinsic
/// operates on independent 128-bit lanes. The calculated results are
/// stored in \a dst.
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_sm4rnds4_epi32(__m128i __A, __m128i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VSM4RNDS4 instruction.
///
/// \param __A
///    A 128-bit vector of [4 x int].
/// \param __B
///    A 128-bit vector of [4 x int].
/// \returns
///    A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// DEFINE ROL32(dword, n) {
///   count := n % 32
///   dest := (dword << count) | (dword >> (32-count))
///   RETURN dest
/// }
/// DEFINE SBOX_BYTE(dword, i) {
///   RETURN sbox[dword.byte[i]]
/// }
/// DEFINE lower_t(dword) {
///   tmp.byte[0] := SBOX_BYTE(dword, 0)
///   tmp.byte[1] := SBOX_BYTE(dword, 1)
///   tmp.byte[2] := SBOX_BYTE(dword, 2)
///   tmp.byte[3] := SBOX_BYTE(dword, 3)
///   RETURN tmp
/// }
/// DEFINE L_RND(dword) {
///   tmp := dword
///   tmp := tmp ^ ROL32(dword, 2)
///   tmp := tmp ^ ROL32(dword, 10)
///   tmp := tmp ^ ROL32(dword, 18)
///   tmp := tmp ^ ROL32(dword, 24)
///   RETURN tmp
/// }
/// DEFINE T_RND(dword) {
///   RETURN L_RND(lower_t(dword))
/// }
/// DEFINE F_RND(X0, X1, X2, X3, round_key) {
///   RETURN X0 ^ T_RND(X1 ^ X2 ^ X3 ^ round_key)
/// }
/// FOR i:= 0 to 0
///   P[0] := __B.xmm[i].dword[0]
///   P[1] := __B.xmm[i].dword[1]
///   P[2] := __B.xmm[i].dword[2]
///   P[3] := __B.xmm[i].dword[3]
///   C[0] := F_RND(P[0], P[1], P[2], P[3], __A.xmm[i].dword[0])
///   C[1] := F_RND(P[1], P[2], P[3], C[0], __A.xmm[i].dword[1])
///   C[2] := F_RND(P[2], P[3], C[0], C[1], __A.xmm[i].dword[2])
///   C[3] := F_RND(P[3], C[0], C[1], C[2], __A.xmm[i].dword[3])
///   DEST.xmm[i].dword[0] := C[0]
///   DEST.xmm[i].dword[1] := C[1]
///   DEST.xmm[i].dword[2] := C[2]
///   DEST.xmm[i].dword[3] := C[3]
/// ENDFOR
/// DEST[MAX:128] := 0
/// \endcode
/* Parenthesize the macro arguments so cast binding is correct even when a
 * caller passes a compound expression. */
#define _mm_sm4rnds4_epi32(A, B)                                               \
  (__m128i)__builtin_ia32_vsm4rnds4128((__v4su)(A), (__v4su)(B))

/// This intrinsic performs four rounds of SM4 encryption. The intrinsic
/// operates on independent 128-bit lanes. The calculated results are
/// stored in \a dst.
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_sm4rnds4_epi32(__m256i __A, __m256i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VSM4RNDS4 instruction.
215/// 216/// \param __A 217/// A 256-bit vector of [8 x int]. 218/// \param __B 219/// A 256-bit vector of [8 x int]. 220/// \returns 221/// A 256-bit vector of [8 x int]. 222/// 223/// \code{.operation} 224/// DEFINE ROL32(dword, n) { 225/// count := n % 32 226/// dest := (dword << count) | (dword >> (32-count)) 227/// RETURN dest 228/// } 229/// DEFINE lower_t(dword) { 230/// tmp.byte[0] := SBOX_BYTE(dword, 0) 231/// tmp.byte[1] := SBOX_BYTE(dword, 1) 232/// tmp.byte[2] := SBOX_BYTE(dword, 2) 233/// tmp.byte[3] := SBOX_BYTE(dword, 3) 234/// RETURN tmp 235/// } 236/// DEFINE L_RND(dword) { 237/// tmp := dword 238/// tmp := tmp ^ ROL32(dword, 2) 239/// tmp := tmp ^ ROL32(dword, 10) 240/// tmp := tmp ^ ROL32(dword, 18) 241/// tmp := tmp ^ ROL32(dword, 24) 242/// RETURN tmp 243/// } 244/// DEFINE T_RND(dword) { 245/// RETURN L_RND(lower_t(dword)) 246/// } 247/// DEFINE F_RND(X0, X1, X2, X3, round_key) { 248/// RETURN X0 ^ T_RND(X1 ^ X2 ^ X3 ^ round_key) 249/// } 250/// FOR i:= 0 to 0 251/// P[0] := __B.xmm[i].dword[0] 252/// P[1] := __B.xmm[i].dword[1] 253/// P[2] := __B.xmm[i].dword[2] 254/// P[3] := __B.xmm[i].dword[3] 255/// C[0] := F_RND(P[0], P[1], P[2], P[3], __A.xmm[i].dword[0]) 256/// C[1] := F_RND(P[1], P[2], P[3], C[0], __A.xmm[i].dword[1]) 257/// C[2] := F_RND(P[2], P[3], C[0], C[1], __A.xmm[i].dword[2]) 258/// C[3] := F_RND(P[3], C[0], C[1], C[2], __A.xmm[i].dword[3]) 259/// DEST.xmm[i].dword[0] := C[0] 260/// DEST.xmm[i].dword[1] := C[1] 261/// DEST.xmm[i].dword[2] := C[2] 262/// DEST.xmm[i].dword[3] := C[3] 263/// ENDFOR 264/// DEST[MAX:256] := 0 265/// \endcode 266#define _mm256_sm4rnds4_epi32(A, B) \ 267 (__m256i) __builtin_ia32_vsm4rnds4256((__v8su)A, (__v8su)B) 268 269#endif // __SM4INTRIN_H 270