1*06c3fb27SDimitry Andric /*===--------------- sm4intrin.h - SM4 intrinsics -----------------=== 2*06c3fb27SDimitry Andric * 3*06c3fb27SDimitry Andric * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4*06c3fb27SDimitry Andric * See https://llvm.org/LICENSE.txt for license information. 5*06c3fb27SDimitry Andric * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6*06c3fb27SDimitry Andric * 7*06c3fb27SDimitry Andric *===-----------------------------------------------------------------------=== 8*06c3fb27SDimitry Andric */ 9*06c3fb27SDimitry Andric 10*06c3fb27SDimitry Andric #ifndef __IMMINTRIN_H 11*06c3fb27SDimitry Andric #error "Never use <sm4intrin.h> directly; include <immintrin.h> instead." 12*06c3fb27SDimitry Andric #endif // __IMMINTRIN_H 13*06c3fb27SDimitry Andric 14*06c3fb27SDimitry Andric #ifndef __SM4INTRIN_H 15*06c3fb27SDimitry Andric #define __SM4INTRIN_H 16*06c3fb27SDimitry Andric 17*06c3fb27SDimitry Andric /// This intrinsic performs four rounds of SM4 key expansion. The intrinsic 18*06c3fb27SDimitry Andric /// operates on independent 128-bit lanes. The calculated results are 19*06c3fb27SDimitry Andric /// stored in \a dst. 20*06c3fb27SDimitry Andric /// \headerfile <immintrin.h> 21*06c3fb27SDimitry Andric /// 22*06c3fb27SDimitry Andric /// \code 23*06c3fb27SDimitry Andric /// __m128i _mm_sm4key4_epi32(__m128i __A, __m128i __B) 24*06c3fb27SDimitry Andric /// \endcode 25*06c3fb27SDimitry Andric /// 26*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VSM4KEY4 instruction. 27*06c3fb27SDimitry Andric /// 28*06c3fb27SDimitry Andric /// \param __A 29*06c3fb27SDimitry Andric /// A 128-bit vector of [4 x int]. 30*06c3fb27SDimitry Andric /// \param __B 31*06c3fb27SDimitry Andric /// A 128-bit vector of [4 x int]. 32*06c3fb27SDimitry Andric /// \returns 33*06c3fb27SDimitry Andric /// A 128-bit vector of [4 x int]. 34*06c3fb27SDimitry Andric /// 35*06c3fb27SDimitry Andric /// \code{.operation} 36*06c3fb27SDimitry Andric /// DEFINE ROL32(dword, n) { 37*06c3fb27SDimitry Andric /// count := n % 32 38*06c3fb27SDimitry Andric /// dest := (dword << count) | (dword >> (32-count)) 39*06c3fb27SDimitry Andric /// RETURN dest 40*06c3fb27SDimitry Andric /// } 41*06c3fb27SDimitry Andric /// DEFINE SBOX_BYTE(dword, i) { 42*06c3fb27SDimitry Andric /// RETURN sbox[dword.byte[i]] 43*06c3fb27SDimitry Andric /// } 44*06c3fb27SDimitry Andric /// DEFINE lower_t(dword) { 45*06c3fb27SDimitry Andric /// tmp.byte[0] := SBOX_BYTE(dword, 0) 46*06c3fb27SDimitry Andric /// tmp.byte[1] := SBOX_BYTE(dword, 1) 47*06c3fb27SDimitry Andric /// tmp.byte[2] := SBOX_BYTE(dword, 2) 48*06c3fb27SDimitry Andric /// tmp.byte[3] := SBOX_BYTE(dword, 3) 49*06c3fb27SDimitry Andric /// RETURN tmp 50*06c3fb27SDimitry Andric /// } 51*06c3fb27SDimitry Andric /// DEFINE L_KEY(dword) { 52*06c3fb27SDimitry Andric /// RETURN dword ^ ROL32(dword, 13) ^ ROL32(dword, 23) 53*06c3fb27SDimitry Andric /// } 54*06c3fb27SDimitry Andric /// DEFINE T_KEY(dword) { 55*06c3fb27SDimitry Andric /// RETURN L_KEY(lower_t(dword)) 56*06c3fb27SDimitry Andric /// } 57*06c3fb27SDimitry Andric /// DEFINE F_KEY(X0, X1, X2, X3, round_key) { 58*06c3fb27SDimitry Andric /// RETURN X0 ^ T_KEY(X1 ^ X2 ^ X3 ^ round_key) 59*06c3fb27SDimitry Andric /// } 60*06c3fb27SDimitry Andric /// FOR i:= 0 to 0 61*06c3fb27SDimitry Andric /// P[0] := __B.xmm[i].dword[0] 62*06c3fb27SDimitry Andric /// P[1] := __B.xmm[i].dword[1] 63*06c3fb27SDimitry Andric /// P[2] := __B.xmm[i].dword[2] 64*06c3fb27SDimitry Andric /// P[3] := __B.xmm[i].dword[3] 65*06c3fb27SDimitry Andric /// C[0] := F_KEY(P[0], P[1], P[2], P[3], __A.xmm[i].dword[0]) 66*06c3fb27SDimitry Andric /// C[1] := F_KEY(P[1], P[2], P[3], C[0], __A.xmm[i].dword[1]) 67*06c3fb27SDimitry Andric /// C[2] := F_KEY(P[2], P[3], C[0], C[1], __A.xmm[i].dword[2]) 68*06c3fb27SDimitry Andric /// C[3] := F_KEY(P[3], C[0], C[1], C[2], __A.xmm[i].dword[3]) 69*06c3fb27SDimitry Andric /// DEST.xmm[i].dword[0] := C[0] 70*06c3fb27SDimitry Andric /// DEST.xmm[i].dword[1] := C[1] 71*06c3fb27SDimitry Andric /// DEST.xmm[i].dword[2] := C[2] 72*06c3fb27SDimitry Andric /// DEST.xmm[i].dword[3] := C[3] 73*06c3fb27SDimitry Andric /// ENDFOR 74*06c3fb27SDimitry Andric /// DEST[MAX:128] := 0 75*06c3fb27SDimitry Andric /// \endcode 76*06c3fb27SDimitry Andric #define _mm_sm4key4_epi32(A, B) \ 77*06c3fb27SDimitry Andric (__m128i) __builtin_ia32_vsm4key4128((__v4su)A, (__v4su)B) 78*06c3fb27SDimitry Andric 79*06c3fb27SDimitry Andric /// This intrinsic performs four rounds of SM4 key expansion. The intrinsic 80*06c3fb27SDimitry Andric /// operates on independent 128-bit lanes. The calculated results are 81*06c3fb27SDimitry Andric /// stored in \a dst. 82*06c3fb27SDimitry Andric /// \headerfile <immintrin.h> 83*06c3fb27SDimitry Andric /// 84*06c3fb27SDimitry Andric /// \code 85*06c3fb27SDimitry Andric /// __m256i _mm256_sm4key4_epi32(__m256i __A, __m256i __B) 86*06c3fb27SDimitry Andric /// \endcode 87*06c3fb27SDimitry Andric /// 88*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VSM4KEY4 instruction. 89*06c3fb27SDimitry Andric /// 90*06c3fb27SDimitry Andric /// \param __A 91*06c3fb27SDimitry Andric /// A 256-bit vector of [8 x int]. 92*06c3fb27SDimitry Andric /// \param __B 93*06c3fb27SDimitry Andric /// A 256-bit vector of [8 x int]. 94*06c3fb27SDimitry Andric /// \returns 95*06c3fb27SDimitry Andric /// A 256-bit vector of [8 x int]. 96*06c3fb27SDimitry Andric /// 97*06c3fb27SDimitry Andric /// \code{.operation} 98*06c3fb27SDimitry Andric /// DEFINE ROL32(dword, n) { 99*06c3fb27SDimitry Andric /// count := n % 32 100*06c3fb27SDimitry Andric /// dest := (dword << count) | (dword >> (32-count)) 101*06c3fb27SDimitry Andric /// RETURN dest 102*06c3fb27SDimitry Andric /// } 103*06c3fb27SDimitry Andric /// DEFINE SBOX_BYTE(dword, i) { 104*06c3fb27SDimitry Andric /// RETURN sbox[dword.byte[i]] 105*06c3fb27SDimitry Andric /// } 106*06c3fb27SDimitry Andric /// DEFINE lower_t(dword) { 107*06c3fb27SDimitry Andric /// tmp.byte[0] := SBOX_BYTE(dword, 0) 108*06c3fb27SDimitry Andric /// tmp.byte[1] := SBOX_BYTE(dword, 1) 109*06c3fb27SDimitry Andric /// tmp.byte[2] := SBOX_BYTE(dword, 2) 110*06c3fb27SDimitry Andric /// tmp.byte[3] := SBOX_BYTE(dword, 3) 111*06c3fb27SDimitry Andric /// RETURN tmp 112*06c3fb27SDimitry Andric /// } 113*06c3fb27SDimitry Andric /// DEFINE L_KEY(dword) { 114*06c3fb27SDimitry Andric /// RETURN dword ^ ROL32(dword, 13) ^ ROL32(dword, 23) 115*06c3fb27SDimitry Andric /// } 116*06c3fb27SDimitry Andric /// DEFINE T_KEY(dword) { 117*06c3fb27SDimitry Andric /// RETURN L_KEY(lower_t(dword)) 118*06c3fb27SDimitry Andric /// } 119*06c3fb27SDimitry Andric /// DEFINE F_KEY(X0, X1, X2, X3, round_key) { 120*06c3fb27SDimitry Andric /// RETURN X0 ^ T_KEY(X1 ^ X2 ^ X3 ^ round_key) 121*06c3fb27SDimitry Andric /// } 122*06c3fb27SDimitry Andric /// FOR i:= 0 to 1 123*06c3fb27SDimitry Andric /// P[0] := __B.xmm[i].dword[0] 124*06c3fb27SDimitry Andric /// P[1] := __B.xmm[i].dword[1] 125*06c3fb27SDimitry Andric /// P[2] := __B.xmm[i].dword[2] 126*06c3fb27SDimitry Andric /// P[3] := __B.xmm[i].dword[3] 127*06c3fb27SDimitry Andric /// C[0] := F_KEY(P[0], P[1], P[2], P[3], __A.xmm[i].dword[0]) 128*06c3fb27SDimitry Andric /// C[1] := F_KEY(P[1], P[2], P[3], C[0], __A.xmm[i].dword[1]) 129*06c3fb27SDimitry Andric /// C[2] := F_KEY(P[2], P[3], C[0], C[1], __A.xmm[i].dword[2]) 130*06c3fb27SDimitry Andric /// C[3] := F_KEY(P[3], C[0], C[1], C[2], __A.xmm[i].dword[3]) 131*06c3fb27SDimitry Andric /// DEST.xmm[i].dword[0] := C[0] 132*06c3fb27SDimitry Andric /// DEST.xmm[i].dword[1] := C[1] 133*06c3fb27SDimitry Andric /// DEST.xmm[i].dword[2] := C[2] 134*06c3fb27SDimitry Andric /// DEST.xmm[i].dword[3] := C[3] 135*06c3fb27SDimitry Andric /// ENDFOR 136*06c3fb27SDimitry Andric /// DEST[MAX:256] := 0 137*06c3fb27SDimitry Andric /// \endcode 138*06c3fb27SDimitry Andric #define _mm256_sm4key4_epi32(A, B) \ 139*06c3fb27SDimitry Andric (__m256i) __builtin_ia32_vsm4key4256((__v8su)A, (__v8su)B) 140*06c3fb27SDimitry Andric 141*06c3fb27SDimitry Andric /// This intrinisc performs four rounds of SM4 encryption. The intrinisc 142*06c3fb27SDimitry Andric /// operates on independent 128-bit lanes. The calculated results are 143*06c3fb27SDimitry Andric /// stored in \a dst. 144*06c3fb27SDimitry Andric /// \headerfile <immintrin.h> 145*06c3fb27SDimitry Andric /// 146*06c3fb27SDimitry Andric /// \code 147*06c3fb27SDimitry Andric /// __m128i _mm_sm4rnds4_epi32(__m128i __A, __m128i __B) 148*06c3fb27SDimitry Andric /// \endcode 149*06c3fb27SDimitry Andric /// 150*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VSM4RNDS4 instruction. 151*06c3fb27SDimitry Andric /// 152*06c3fb27SDimitry Andric /// \param __A 153*06c3fb27SDimitry Andric /// A 128-bit vector of [4 x int]. 154*06c3fb27SDimitry Andric /// \param __B 155*06c3fb27SDimitry Andric /// A 128-bit vector of [4 x int]. 156*06c3fb27SDimitry Andric /// \returns 157*06c3fb27SDimitry Andric /// A 128-bit vector of [4 x int]. 158*06c3fb27SDimitry Andric /// 159*06c3fb27SDimitry Andric /// \code{.operation} 160*06c3fb27SDimitry Andric /// DEFINE ROL32(dword, n) { 161*06c3fb27SDimitry Andric /// count := n % 32 162*06c3fb27SDimitry Andric /// dest := (dword << count) | (dword >> (32-count)) 163*06c3fb27SDimitry Andric /// RETURN dest 164*06c3fb27SDimitry Andric /// } 165*06c3fb27SDimitry Andric /// DEFINE lower_t(dword) { 166*06c3fb27SDimitry Andric /// tmp.byte[0] := SBOX_BYTE(dword, 0) 167*06c3fb27SDimitry Andric /// tmp.byte[1] := SBOX_BYTE(dword, 1) 168*06c3fb27SDimitry Andric /// tmp.byte[2] := SBOX_BYTE(dword, 2) 169*06c3fb27SDimitry Andric /// tmp.byte[3] := SBOX_BYTE(dword, 3) 170*06c3fb27SDimitry Andric /// RETURN tmp 171*06c3fb27SDimitry Andric /// } 172*06c3fb27SDimitry Andric /// DEFINE L_RND(dword) { 173*06c3fb27SDimitry Andric /// tmp := dword 174*06c3fb27SDimitry Andric /// tmp := tmp ^ ROL32(dword, 2) 175*06c3fb27SDimitry Andric /// tmp := tmp ^ ROL32(dword, 10) 176*06c3fb27SDimitry Andric /// tmp := tmp ^ ROL32(dword, 18) 177*06c3fb27SDimitry Andric /// tmp := tmp ^ ROL32(dword, 24) 178*06c3fb27SDimitry Andric /// RETURN tmp 179*06c3fb27SDimitry Andric /// } 180*06c3fb27SDimitry Andric /// DEFINE T_RND(dword) { 181*06c3fb27SDimitry Andric /// RETURN L_RND(lower_t(dword)) 182*06c3fb27SDimitry Andric /// } 183*06c3fb27SDimitry Andric /// DEFINE F_RND(X0, X1, X2, X3, round_key) { 184*06c3fb27SDimitry Andric /// RETURN X0 ^ T_RND(X1 ^ X2 ^ X3 ^ round_key) 185*06c3fb27SDimitry Andric /// } 186*06c3fb27SDimitry Andric /// FOR i:= 0 to 0 187*06c3fb27SDimitry Andric /// P[0] := __B.xmm[i].dword[0] 188*06c3fb27SDimitry Andric /// P[1] := __B.xmm[i].dword[1] 189*06c3fb27SDimitry Andric /// P[2] := __B.xmm[i].dword[2] 190*06c3fb27SDimitry Andric /// P[3] := __B.xmm[i].dword[3] 191*06c3fb27SDimitry Andric /// C[0] := F_RND(P[0], P[1], P[2], P[3], __A.xmm[i].dword[0]) 192*06c3fb27SDimitry Andric /// C[1] := F_RND(P[1], P[2], P[3], C[0], __A.xmm[i].dword[1]) 193*06c3fb27SDimitry Andric /// C[2] := F_RND(P[2], P[3], C[0], C[1], __A.xmm[i].dword[2]) 194*06c3fb27SDimitry Andric /// C[3] := F_RND(P[3], C[0], C[1], C[2], __A.xmm[i].dword[3]) 195*06c3fb27SDimitry Andric /// DEST.xmm[i].dword[0] := C[0] 196*06c3fb27SDimitry Andric /// DEST.xmm[i].dword[1] := C[1] 197*06c3fb27SDimitry Andric /// DEST.xmm[i].dword[2] := C[2] 198*06c3fb27SDimitry Andric /// DEST.xmm[i].dword[3] := C[3] 199*06c3fb27SDimitry Andric /// ENDFOR 200*06c3fb27SDimitry Andric /// DEST[MAX:128] := 0 201*06c3fb27SDimitry Andric /// \endcode 202*06c3fb27SDimitry Andric #define _mm_sm4rnds4_epi32(A, B) \ 203*06c3fb27SDimitry Andric (__m128i) __builtin_ia32_vsm4rnds4128((__v4su)A, (__v4su)B) 204*06c3fb27SDimitry Andric 205*06c3fb27SDimitry Andric /// This intrinisc performs four rounds of SM4 encryption. The intrinisc 206*06c3fb27SDimitry Andric /// operates on independent 128-bit lanes. The calculated results are 207*06c3fb27SDimitry Andric /// stored in \a dst. 208*06c3fb27SDimitry Andric /// \headerfile <immintrin.h> 209*06c3fb27SDimitry Andric /// 210*06c3fb27SDimitry Andric /// \code 211*06c3fb27SDimitry Andric /// __m256i _mm256_sm4rnds4_epi32(__m256i __A, __m256i __B) 212*06c3fb27SDimitry Andric /// \endcode 213*06c3fb27SDimitry Andric /// 214*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VSM4RNDS4 instruction. 215*06c3fb27SDimitry Andric /// 216*06c3fb27SDimitry Andric /// \param __A 217*06c3fb27SDimitry Andric /// A 256-bit vector of [8 x int]. 218*06c3fb27SDimitry Andric /// \param __B 219*06c3fb27SDimitry Andric /// A 256-bit vector of [8 x int]. 220*06c3fb27SDimitry Andric /// \returns 221*06c3fb27SDimitry Andric /// A 256-bit vector of [8 x int]. 222*06c3fb27SDimitry Andric /// 223*06c3fb27SDimitry Andric /// \code{.operation} 224*06c3fb27SDimitry Andric /// DEFINE ROL32(dword, n) { 225*06c3fb27SDimitry Andric /// count := n % 32 226*06c3fb27SDimitry Andric /// dest := (dword << count) | (dword >> (32-count)) 227*06c3fb27SDimitry Andric /// RETURN dest 228*06c3fb27SDimitry Andric /// } 229*06c3fb27SDimitry Andric /// DEFINE lower_t(dword) { 230*06c3fb27SDimitry Andric /// tmp.byte[0] := SBOX_BYTE(dword, 0) 231*06c3fb27SDimitry Andric /// tmp.byte[1] := SBOX_BYTE(dword, 1) 232*06c3fb27SDimitry Andric /// tmp.byte[2] := SBOX_BYTE(dword, 2) 233*06c3fb27SDimitry Andric /// tmp.byte[3] := SBOX_BYTE(dword, 3) 234*06c3fb27SDimitry Andric /// RETURN tmp 235*06c3fb27SDimitry Andric /// } 236*06c3fb27SDimitry Andric /// DEFINE L_RND(dword) { 237*06c3fb27SDimitry Andric /// tmp := dword 238*06c3fb27SDimitry Andric /// tmp := tmp ^ ROL32(dword, 2) 239*06c3fb27SDimitry Andric /// tmp := tmp ^ ROL32(dword, 10) 240*06c3fb27SDimitry Andric /// tmp := tmp ^ ROL32(dword, 18) 241*06c3fb27SDimitry Andric /// tmp := tmp ^ ROL32(dword, 24) 242*06c3fb27SDimitry Andric /// RETURN tmp 243*06c3fb27SDimitry Andric /// } 244*06c3fb27SDimitry Andric /// DEFINE T_RND(dword) { 245*06c3fb27SDimitry Andric /// RETURN L_RND(lower_t(dword)) 246*06c3fb27SDimitry Andric /// } 247*06c3fb27SDimitry Andric /// DEFINE F_RND(X0, X1, X2, X3, round_key) { 248*06c3fb27SDimitry Andric /// RETURN X0 ^ T_RND(X1 ^ X2 ^ X3 ^ round_key) 249*06c3fb27SDimitry Andric /// } 250*06c3fb27SDimitry Andric /// FOR i:= 0 to 0 251*06c3fb27SDimitry Andric /// P[0] := __B.xmm[i].dword[0] 252*06c3fb27SDimitry Andric /// P[1] := __B.xmm[i].dword[1] 253*06c3fb27SDimitry Andric /// P[2] := __B.xmm[i].dword[2] 254*06c3fb27SDimitry Andric /// P[3] := __B.xmm[i].dword[3] 255*06c3fb27SDimitry Andric /// C[0] := F_RND(P[0], P[1], P[2], P[3], __A.xmm[i].dword[0]) 256*06c3fb27SDimitry Andric /// C[1] := F_RND(P[1], P[2], P[3], C[0], __A.xmm[i].dword[1]) 257*06c3fb27SDimitry Andric /// C[2] := F_RND(P[2], P[3], C[0], C[1], __A.xmm[i].dword[2]) 258*06c3fb27SDimitry Andric /// C[3] := F_RND(P[3], C[0], C[1], C[2], __A.xmm[i].dword[3]) 259*06c3fb27SDimitry Andric /// DEST.xmm[i].dword[0] := C[0] 260*06c3fb27SDimitry Andric /// DEST.xmm[i].dword[1] := C[1] 261*06c3fb27SDimitry Andric /// DEST.xmm[i].dword[2] := C[2] 262*06c3fb27SDimitry Andric /// DEST.xmm[i].dword[3] := C[3] 263*06c3fb27SDimitry Andric /// ENDFOR 264*06c3fb27SDimitry Andric /// DEST[MAX:256] := 0 265*06c3fb27SDimitry Andric /// \endcode 266*06c3fb27SDimitry Andric #define _mm256_sm4rnds4_epi32(A, B) \ 267*06c3fb27SDimitry Andric (__m256i) __builtin_ia32_vsm4rnds4256((__v8su)A, (__v8su)B) 268*06c3fb27SDimitry Andric 269*06c3fb27SDimitry Andric #endif // __SM4INTRIN_H 270