xref: /freebsd-src/contrib/llvm-project/clang/lib/Headers/sm4intrin.h (revision 06c3fb2749bda94cb5201f81ffdb8fa6c3161b2e)
1*06c3fb27SDimitry Andric /*===--------------- sm4intrin.h - SM4 intrinsics -----------------===
2*06c3fb27SDimitry Andric  *
3*06c3fb27SDimitry Andric  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4*06c3fb27SDimitry Andric  * See https://llvm.org/LICENSE.txt for license information.
5*06c3fb27SDimitry Andric  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6*06c3fb27SDimitry Andric  *
7*06c3fb27SDimitry Andric  *===-----------------------------------------------------------------------===
8*06c3fb27SDimitry Andric  */
9*06c3fb27SDimitry Andric 
10*06c3fb27SDimitry Andric #ifndef __IMMINTRIN_H
11*06c3fb27SDimitry Andric #error "Never use <sm4intrin.h> directly; include <immintrin.h> instead."
12*06c3fb27SDimitry Andric #endif // __IMMINTRIN_H
13*06c3fb27SDimitry Andric 
14*06c3fb27SDimitry Andric #ifndef __SM4INTRIN_H
15*06c3fb27SDimitry Andric #define __SM4INTRIN_H
16*06c3fb27SDimitry Andric 
/// This intrinsic performs four rounds of SM4 key expansion. The intrinsic
///    operates on independent 128-bit lanes. The calculated results are
///    returned.
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_sm4key4_epi32(__m128i __A, __m128i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VSM4KEY4 instruction.
///
/// \param __A
///    A 128-bit vector of [4 x int].
/// \param __B
///    A 128-bit vector of [4 x int].
/// \returns
///    A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// DEFINE ROL32(dword, n) {
/// 	count := n % 32
/// 	dest := (dword << count) | (dword >> (32-count))
/// 	RETURN dest
/// }
/// DEFINE SBOX_BYTE(dword, i) {
/// 	RETURN sbox[dword.byte[i]]
/// }
/// DEFINE lower_t(dword) {
/// 	tmp.byte[0] := SBOX_BYTE(dword, 0)
/// 	tmp.byte[1] := SBOX_BYTE(dword, 1)
/// 	tmp.byte[2] := SBOX_BYTE(dword, 2)
/// 	tmp.byte[3] := SBOX_BYTE(dword, 3)
/// 	RETURN tmp
/// }
/// DEFINE L_KEY(dword) {
/// 	RETURN dword ^ ROL32(dword, 13) ^ ROL32(dword, 23)
/// }
/// DEFINE T_KEY(dword) {
/// 	RETURN L_KEY(lower_t(dword))
/// }
/// DEFINE F_KEY(X0, X1, X2, X3, round_key) {
/// 	RETURN X0 ^ T_KEY(X1 ^ X2 ^ X3 ^ round_key)
/// }
/// FOR i:= 0 to 0
/// 	P[0] := __B.xmm[i].dword[0]
/// 	P[1] := __B.xmm[i].dword[1]
/// 	P[2] := __B.xmm[i].dword[2]
/// 	P[3] := __B.xmm[i].dword[3]
/// 	C[0] := F_KEY(P[0], P[1], P[2], P[3], __A.xmm[i].dword[0])
/// 	C[1] := F_KEY(P[1], P[2], P[3], C[0], __A.xmm[i].dword[1])
/// 	C[2] := F_KEY(P[2], P[3], C[0], C[1], __A.xmm[i].dword[2])
/// 	C[3] := F_KEY(P[3], C[0], C[1], C[2], __A.xmm[i].dword[3])
/// 	DEST.xmm[i].dword[0] := C[0]
/// 	DEST.xmm[i].dword[1] := C[1]
/// 	DEST.xmm[i].dword[2] := C[2]
/// 	DEST.xmm[i].dword[3] := C[3]
/// ENDFOR
/// DEST[MAX:128] := 0
/// \endcode
///
/// \note This intrinsic is implemented as a macro. Each argument appears once
///    in the expansion, and both the arguments and the whole expansion are
///    parenthesized so that operator expressions bind as written.
#define _mm_sm4key4_epi32(A, B)                                                \
  ((__m128i)__builtin_ia32_vsm4key4128((__v4su)(A), (__v4su)(B)))
78*06c3fb27SDimitry Andric 
/// This intrinsic performs four rounds of SM4 key expansion. The intrinsic
///    operates on independent 128-bit lanes. The calculated results are
///    returned.
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_sm4key4_epi32(__m256i __A, __m256i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VSM4KEY4 instruction.
///
/// \param __A
///    A 256-bit vector of [8 x int].
/// \param __B
///    A 256-bit vector of [8 x int].
/// \returns
///    A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// DEFINE ROL32(dword, n) {
/// 	count := n % 32
/// 	dest := (dword << count) | (dword >> (32-count))
/// 	RETURN dest
/// }
/// DEFINE SBOX_BYTE(dword, i) {
/// 	RETURN sbox[dword.byte[i]]
/// }
/// DEFINE lower_t(dword) {
/// 	tmp.byte[0] := SBOX_BYTE(dword, 0)
/// 	tmp.byte[1] := SBOX_BYTE(dword, 1)
/// 	tmp.byte[2] := SBOX_BYTE(dword, 2)
/// 	tmp.byte[3] := SBOX_BYTE(dword, 3)
/// 	RETURN tmp
/// }
/// DEFINE L_KEY(dword) {
/// 	RETURN dword ^ ROL32(dword, 13) ^ ROL32(dword, 23)
/// }
/// DEFINE T_KEY(dword) {
/// 	RETURN L_KEY(lower_t(dword))
/// }
/// DEFINE F_KEY(X0, X1, X2, X3, round_key) {
/// 	RETURN X0 ^ T_KEY(X1 ^ X2 ^ X3 ^ round_key)
/// }
/// FOR i:= 0 to 1
/// 	P[0] := __B.xmm[i].dword[0]
/// 	P[1] := __B.xmm[i].dword[1]
/// 	P[2] := __B.xmm[i].dword[2]
/// 	P[3] := __B.xmm[i].dword[3]
/// 	C[0] := F_KEY(P[0], P[1], P[2], P[3], __A.xmm[i].dword[0])
/// 	C[1] := F_KEY(P[1], P[2], P[3], C[0], __A.xmm[i].dword[1])
/// 	C[2] := F_KEY(P[2], P[3], C[0], C[1], __A.xmm[i].dword[2])
/// 	C[3] := F_KEY(P[3], C[0], C[1], C[2], __A.xmm[i].dword[3])
/// 	DEST.xmm[i].dword[0] := C[0]
/// 	DEST.xmm[i].dword[1] := C[1]
/// 	DEST.xmm[i].dword[2] := C[2]
/// 	DEST.xmm[i].dword[3] := C[3]
/// ENDFOR
/// DEST[MAX:256] := 0
/// \endcode
///
/// \note This intrinsic is implemented as a macro. Each argument appears once
///    in the expansion, and both the arguments and the whole expansion are
///    parenthesized so that operator expressions bind as written.
#define _mm256_sm4key4_epi32(A, B)                                             \
  ((__m256i)__builtin_ia32_vsm4key4256((__v8su)(A), (__v8su)(B)))
140*06c3fb27SDimitry Andric 
/// This intrinsic performs four rounds of SM4 encryption. The intrinsic
///    operates on independent 128-bit lanes. The calculated results are
///    returned.
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_sm4rnds4_epi32(__m128i __A, __m128i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VSM4RNDS4 instruction.
///
/// \param __A
///    A 128-bit vector of [4 x int].
/// \param __B
///    A 128-bit vector of [4 x int].
/// \returns
///    A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// DEFINE ROL32(dword, n) {
/// 	count := n % 32
/// 	dest := (dword << count) | (dword >> (32-count))
/// 	RETURN dest
/// }
/// DEFINE SBOX_BYTE(dword, i) {
/// 	RETURN sbox[dword.byte[i]]
/// }
/// DEFINE lower_t(dword) {
/// 	tmp.byte[0] := SBOX_BYTE(dword, 0)
/// 	tmp.byte[1] := SBOX_BYTE(dword, 1)
/// 	tmp.byte[2] := SBOX_BYTE(dword, 2)
/// 	tmp.byte[3] := SBOX_BYTE(dword, 3)
/// 	RETURN tmp
/// }
/// DEFINE L_RND(dword) {
/// 	tmp := dword
/// 	tmp := tmp ^ ROL32(dword, 2)
/// 	tmp := tmp ^ ROL32(dword, 10)
/// 	tmp := tmp ^ ROL32(dword, 18)
/// 	tmp := tmp ^ ROL32(dword, 24)
/// 	RETURN tmp
/// }
/// DEFINE T_RND(dword) {
/// 	RETURN L_RND(lower_t(dword))
/// }
/// DEFINE F_RND(X0, X1, X2, X3, round_key) {
/// 	RETURN X0 ^ T_RND(X1 ^ X2 ^ X3 ^ round_key)
/// }
/// FOR i:= 0 to 0
/// 	P[0] := __B.xmm[i].dword[0]
/// 	P[1] := __B.xmm[i].dword[1]
/// 	P[2] := __B.xmm[i].dword[2]
/// 	P[3] := __B.xmm[i].dword[3]
/// 	C[0] := F_RND(P[0], P[1], P[2], P[3], __A.xmm[i].dword[0])
/// 	C[1] := F_RND(P[1], P[2], P[3], C[0], __A.xmm[i].dword[1])
/// 	C[2] := F_RND(P[2], P[3], C[0], C[1], __A.xmm[i].dword[2])
/// 	C[3] := F_RND(P[3], C[0], C[1], C[2], __A.xmm[i].dword[3])
/// 	DEST.xmm[i].dword[0] := C[0]
/// 	DEST.xmm[i].dword[1] := C[1]
/// 	DEST.xmm[i].dword[2] := C[2]
/// 	DEST.xmm[i].dword[3] := C[3]
/// ENDFOR
/// DEST[MAX:128] := 0
/// \endcode
///
/// \note This intrinsic is implemented as a macro. Each argument appears once
///    in the expansion, and both the arguments and the whole expansion are
///    parenthesized so that operator expressions bind as written.
#define _mm_sm4rnds4_epi32(A, B)                                               \
  ((__m128i)__builtin_ia32_vsm4rnds4128((__v4su)(A), (__v4su)(B)))
204*06c3fb27SDimitry Andric 
/// This intrinsic performs four rounds of SM4 encryption. The intrinsic
///    operates on independent 128-bit lanes. The calculated results are
///    returned.
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_sm4rnds4_epi32(__m256i __A, __m256i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VSM4RNDS4 instruction.
///
/// \param __A
///    A 256-bit vector of [8 x int].
/// \param __B
///    A 256-bit vector of [8 x int].
/// \returns
///    A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// DEFINE ROL32(dword, n) {
/// 	count := n % 32
/// 	dest := (dword << count) | (dword >> (32-count))
/// 	RETURN dest
/// }
/// DEFINE SBOX_BYTE(dword, i) {
/// 	RETURN sbox[dword.byte[i]]
/// }
/// DEFINE lower_t(dword) {
/// 	tmp.byte[0] := SBOX_BYTE(dword, 0)
/// 	tmp.byte[1] := SBOX_BYTE(dword, 1)
/// 	tmp.byte[2] := SBOX_BYTE(dword, 2)
/// 	tmp.byte[3] := SBOX_BYTE(dword, 3)
/// 	RETURN tmp
/// }
/// DEFINE L_RND(dword) {
/// 	tmp := dword
/// 	tmp := tmp ^ ROL32(dword, 2)
/// 	tmp := tmp ^ ROL32(dword, 10)
/// 	tmp := tmp ^ ROL32(dword, 18)
/// 	tmp := tmp ^ ROL32(dword, 24)
/// 	RETURN tmp
/// }
/// DEFINE T_RND(dword) {
/// 	RETURN L_RND(lower_t(dword))
/// }
/// DEFINE F_RND(X0, X1, X2, X3, round_key) {
/// 	RETURN X0 ^ T_RND(X1 ^ X2 ^ X3 ^ round_key)
/// }
/// FOR i:= 0 to 1
/// 	P[0] := __B.xmm[i].dword[0]
/// 	P[1] := __B.xmm[i].dword[1]
/// 	P[2] := __B.xmm[i].dword[2]
/// 	P[3] := __B.xmm[i].dword[3]
/// 	C[0] := F_RND(P[0], P[1], P[2], P[3], __A.xmm[i].dword[0])
/// 	C[1] := F_RND(P[1], P[2], P[3], C[0], __A.xmm[i].dword[1])
/// 	C[2] := F_RND(P[2], P[3], C[0], C[1], __A.xmm[i].dword[2])
/// 	C[3] := F_RND(P[3], C[0], C[1], C[2], __A.xmm[i].dword[3])
/// 	DEST.xmm[i].dword[0] := C[0]
/// 	DEST.xmm[i].dword[1] := C[1]
/// 	DEST.xmm[i].dword[2] := C[2]
/// 	DEST.xmm[i].dword[3] := C[3]
/// ENDFOR
/// DEST[MAX:256] := 0
/// \endcode
///
/// \note This intrinsic is implemented as a macro. Each argument appears once
///    in the expansion, and both the arguments and the whole expansion are
///    parenthesized so that operator expressions bind as written.
#define _mm256_sm4rnds4_epi32(A, B)                                            \
  ((__m256i)__builtin_ia32_vsm4rnds4256((__v8su)(A), (__v8su)(B)))
268*06c3fb27SDimitry Andric 
269*06c3fb27SDimitry Andric #endif // __SM4INTRIN_H
270