xref: /llvm-project/llvm/test/CodeGen/X86/keylocker-intrinsics-fast-isel.ll (revision de3e4a9dfe89dfc0a4d6d5e0891c542f6c82ca57)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+kl,+widekl | FileCheck %s
3
4; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/X86/keylocker-builtins.c
5
; Fast-isel lowering of @llvm.x86.loadiwkey.
; The intrinsic is called with the three vector parameters in their incoming
; order, followed by %ctl (the function's first parameter).
; The expected assembly copies %edi into %eax before the instruction and shows
; only %xmm2 and %xmm1 as explicit operands.
; NOTE(review): %xmm0 and %eax are presumably implicit inputs of loadiwkey -
; confirm against the Key Locker ISA reference.
6define void @test_loadiwkey(i32 %ctl, <2 x i64> %intkey, <2 x i64> %enkey_lo, <2 x i64> %enkey_hi) {
7; CHECK-LABEL: test_loadiwkey:
8; CHECK:       # %bb.0: # %entry
9; CHECK-NEXT:    movl %edi, %eax
10; CHECK-NEXT:    loadiwkey %xmm2, %xmm1
11; CHECK-NEXT:    retq
12entry:
13  tail call void @llvm.x86.loadiwkey(<2 x i64> %intkey, <2 x i64> %enkey_lo, <2 x i64> %enkey_hi, i32 %ctl)
14  ret void
15}
16
; Fast-isel lowering of @llvm.x86.encodekey128.
; The intrinsic returns a struct of one i32 followed by six <2 x i64> values.
; Vector elements 1..6 are stored to %h at byte offsets 0, 16, 32, 48, 64 and
; 80 with align 1 (hence the unaligned movups stores), and element 0 is the
; function's return value.
; Note that the expected stores use %xmm0-%xmm2 and %xmm4-%xmm6; %xmm3 does
; not appear in the expected output.
17define i32 @test_encodekey128_u32(i32 %htype, <2 x i64> %key, ptr nocapture %h) {
18; CHECK-LABEL: test_encodekey128_u32:
19; CHECK:       # %bb.0: # %entry
20; CHECK-NEXT:    encodekey128 %edi, %eax
21; CHECK-NEXT:    movups %xmm0, (%rsi)
22; CHECK-NEXT:    movups %xmm1, 16(%rsi)
23; CHECK-NEXT:    movups %xmm2, 32(%rsi)
24; CHECK-NEXT:    movups %xmm4, 48(%rsi)
25; CHECK-NEXT:    movups %xmm5, 64(%rsi)
26; CHECK-NEXT:    movups %xmm6, 80(%rsi)
27; CHECK-NEXT:    retq
28entry:
29  %0 = tail call { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.encodekey128(i32 %htype, <2 x i64> %key)
; Store each vector result to consecutive 16-byte slots of %h.
30  %1 = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 1
31  store <2 x i64> %1, ptr %h, align 1
32  %2 = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 2
33  %3 = getelementptr i8, ptr %h, i64 16
34  store <2 x i64> %2, ptr %3, align 1
35  %4 = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 3
36  %5 = getelementptr i8, ptr %h, i64 32
37  store <2 x i64> %4, ptr %5, align 1
38  %6 = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 4
39  %7 = getelementptr i8, ptr %h, i64 48
40  store <2 x i64> %6, ptr %7, align 1
41  %8 = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 5
42  %9 = getelementptr i8, ptr %h, i64 64
43  store <2 x i64> %8, ptr %9, align 1
44  %10 = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 6
45  %11 = getelementptr i8, ptr %h, i64 80
46  store <2 x i64> %10, ptr %11, align 1
; The scalar i32 in element 0 is returned to the caller.
47  %12 = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 0
48  ret i32 %12
49}
50
; Fast-isel lowering of @llvm.x86.encodekey256.
; The intrinsic takes the handle type plus the low and high key halves and
; returns a struct of one i32 followed by seven <2 x i64> values.
; Vector elements 1..7 are stored to %h at byte offsets 0 through 96 in
; 16-byte steps with align 1 (hence movups), and element 0 is returned.
; The expected stores use %xmm0 through %xmm6 in order.
51define i32 @test_encodekey256_u32(i32 %htype, <2 x i64> %key_lo, <2 x i64> %key_hi, ptr nocapture %h) {
52; CHECK-LABEL: test_encodekey256_u32:
53; CHECK:       # %bb.0: # %entry
54; CHECK-NEXT:    encodekey256 %edi, %eax
55; CHECK-NEXT:    movups %xmm0, (%rsi)
56; CHECK-NEXT:    movups %xmm1, 16(%rsi)
57; CHECK-NEXT:    movups %xmm2, 32(%rsi)
58; CHECK-NEXT:    movups %xmm3, 48(%rsi)
59; CHECK-NEXT:    movups %xmm4, 64(%rsi)
60; CHECK-NEXT:    movups %xmm5, 80(%rsi)
61; CHECK-NEXT:    movups %xmm6, 96(%rsi)
62; CHECK-NEXT:    retq
63entry:
64  %0 = tail call { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.encodekey256(i32 %htype, <2 x i64> %key_lo, <2 x i64> %key_hi)
; Store each vector result to consecutive 16-byte slots of %h.
65  %1 = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 1
66  store <2 x i64> %1, ptr %h, align 1
67  %2 = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 2
68  %3 = getelementptr i8, ptr %h, i64 16
69  store <2 x i64> %2, ptr %3, align 1
70  %4 = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 3
71  %5 = getelementptr i8, ptr %h, i64 32
72  store <2 x i64> %4, ptr %5, align 1
73  %6 = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 4
74  %7 = getelementptr i8, ptr %h, i64 48
75  store <2 x i64> %6, ptr %7, align 1
76  %8 = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 5
77  %9 = getelementptr i8, ptr %h, i64 64
78  store <2 x i64> %8, ptr %9, align 1
79  %10 = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 6
80  %11 = getelementptr i8, ptr %h, i64 80
81  store <2 x i64> %10, ptr %11, align 1
82  %12 = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 7
83  %13 = getelementptr i8, ptr %h, i64 96
84  store <2 x i64> %12, ptr %13, align 1
; The scalar i32 in element 0 is returned to the caller.
85  %14 = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 0
86  ret i32 %14
87}
88
; Fast-isel lowering of @llvm.x86.aesenc256kl.
; The intrinsic takes the input vector %idata and the handle pointer %h and
; returns { i8, <2 x i64> }. The vector (element 1) is stored to %odata with
; align 16 (hence the aligned movaps store), and the i8 (element 0) is the
; function's return value, which the expected assembly materializes with sete.
89define zeroext i8 @test_mm_aesenc256kl_u8(ptr %odata, <2 x i64> %idata, ptr %h) {
90; CHECK-LABEL: test_mm_aesenc256kl_u8:
91; CHECK:       # %bb.0: # %entry
92; CHECK-NEXT:    aesenc256kl (%rsi), %xmm0
93; CHECK-NEXT:    sete %al
94; CHECK-NEXT:    movaps %xmm0, (%rdi)
95; CHECK-NEXT:    retq
96entry:
97  %0 = tail call { i8, <2 x i64> } @llvm.x86.aesenc256kl(<2 x i64> %idata, ptr %h) #1
98  %1 = extractvalue { i8, <2 x i64> } %0, 1
99  store <2 x i64> %1, ptr %odata, align 16
100  %2 = extractvalue { i8, <2 x i64> } %0, 0
101  ret i8 %2
102}
103
; Fast-isel lowering of @llvm.x86.aesdec256kl.
; The intrinsic takes the input vector %idata and the handle pointer %h and
; returns { i8, <2 x i64> }. The vector (element 1) is stored to %odata with
; align 16 (hence the aligned movaps store), and the i8 (element 0) is the
; function's return value, which the expected assembly materializes with sete.
104define zeroext i8 @test_mm_aesdec256kl_u8(ptr %odata, <2 x i64> %idata, ptr %h) {
105; CHECK-LABEL: test_mm_aesdec256kl_u8:
106; CHECK:       # %bb.0: # %entry
107; CHECK-NEXT:    aesdec256kl (%rsi), %xmm0
108; CHECK-NEXT:    sete %al
109; CHECK-NEXT:    movaps %xmm0, (%rdi)
110; CHECK-NEXT:    retq
111entry:
112  %0 = tail call { i8, <2 x i64> } @llvm.x86.aesdec256kl(<2 x i64> %idata, ptr %h) #1
113  %1 = extractvalue { i8, <2 x i64> } %0, 1
114  store <2 x i64> %1, ptr %odata, align 16
115  %2 = extractvalue { i8, <2 x i64> } %0, 0
116  ret i8 %2
117}
118
; Fast-isel lowering of @llvm.x86.aesenc128kl.
; The intrinsic takes the input vector %idata and the handle pointer %h and
; returns { i8, <2 x i64> }. The vector (element 1) is stored to %odata with
; align 16 (hence the aligned movaps store), and the i8 (element 0) is the
; function's return value, which the expected assembly materializes with sete.
119define zeroext i8 @test_mm_aesenc128kl_u8(ptr %odata, <2 x i64> %idata, ptr %h) {
120; CHECK-LABEL: test_mm_aesenc128kl_u8:
121; CHECK:       # %bb.0: # %entry
122; CHECK-NEXT:    aesenc128kl (%rsi), %xmm0
123; CHECK-NEXT:    sete %al
124; CHECK-NEXT:    movaps %xmm0, (%rdi)
125; CHECK-NEXT:    retq
126entry:
127  %0 = tail call { i8, <2 x i64> } @llvm.x86.aesenc128kl(<2 x i64> %idata, ptr %h) #1
128  %1 = extractvalue { i8, <2 x i64> } %0, 1
129  store <2 x i64> %1, ptr %odata, align 16
130  %2 = extractvalue { i8, <2 x i64> } %0, 0
131  ret i8 %2
132}
133
; Fast-isel lowering of @llvm.x86.aesdec128kl.
; The intrinsic takes the input vector %idata and the handle pointer %h and
; returns { i8, <2 x i64> }. The vector (element 1) is stored to %odata with
; align 16 (hence the aligned movaps store), and the i8 (element 0) is the
; function's return value, which the expected assembly materializes with sete.
134define zeroext i8 @test_mm_aesdec128kl_u8(ptr %odata, <2 x i64> %idata, ptr %h) {
135; CHECK-LABEL: test_mm_aesdec128kl_u8:
136; CHECK:       # %bb.0: # %entry
137; CHECK-NEXT:    aesdec128kl (%rsi), %xmm0
138; CHECK-NEXT:    sete %al
139; CHECK-NEXT:    movaps %xmm0, (%rdi)
140; CHECK-NEXT:    retq
141entry:
142  %0 = tail call { i8, <2 x i64> } @llvm.x86.aesdec128kl(<2 x i64> %idata, ptr %h) #1
143  %1 = extractvalue { i8, <2 x i64> } %0, 1
144  store <2 x i64> %1, ptr %odata, align 16
145  %2 = extractvalue { i8, <2 x i64> } %0, 0
146  ret i8 %2
147}
148
; Fast-isel lowering of @llvm.x86.aesencwide128kl.
; Eight <2 x i64> vectors are loaded from %idata (element indices 0..7,
; align 16) and passed to the intrinsic after the handle pointer %h.
; The intrinsic returns one i8 followed by eight vectors; elements 1..8 are
; stored to %odata at element indices 0..7 (align 16) and element 0 is the
; function's return value.
; The expected assembly loads into %xmm0-%xmm7 before the instruction, stores
; back from the same registers afterwards, and materializes the i8 with sete.
149define zeroext i8 @test__mm_aesencwide128kl_u8(ptr %odata, ptr %idata, ptr %h) {
150; CHECK-LABEL: test__mm_aesencwide128kl_u8:
151; CHECK:       # %bb.0: # %entry
152; CHECK-NEXT:    movaps (%rsi), %xmm0
153; CHECK-NEXT:    movaps 16(%rsi), %xmm1
154; CHECK-NEXT:    movaps 32(%rsi), %xmm2
155; CHECK-NEXT:    movaps 48(%rsi), %xmm3
156; CHECK-NEXT:    movaps 64(%rsi), %xmm4
157; CHECK-NEXT:    movaps 80(%rsi), %xmm5
158; CHECK-NEXT:    movaps 96(%rsi), %xmm6
159; CHECK-NEXT:    movaps 112(%rsi), %xmm7
160; CHECK-NEXT:    aesencwide128kl (%rdx)
161; CHECK-NEXT:    sete %al
162; CHECK-NEXT:    movaps %xmm0, (%rdi)
163; CHECK-NEXT:    movaps %xmm1, 16(%rdi)
164; CHECK-NEXT:    movaps %xmm2, 32(%rdi)
165; CHECK-NEXT:    movaps %xmm3, 48(%rdi)
166; CHECK-NEXT:    movaps %xmm4, 64(%rdi)
167; CHECK-NEXT:    movaps %xmm5, 80(%rdi)
168; CHECK-NEXT:    movaps %xmm6, 96(%rdi)
169; CHECK-NEXT:    movaps %xmm7, 112(%rdi)
170; CHECK-NEXT:    retq
171entry:
; Load the eight input vectors from %idata.
172  %0 = load <2 x i64>, ptr %idata, align 16
173  %1 = getelementptr <2 x i64>, ptr %idata, i64 1
174  %2 = load <2 x i64>, ptr %1, align 16
175  %3 = getelementptr <2 x i64>, ptr %idata, i64 2
176  %4 = load <2 x i64>, ptr %3, align 16
177  %5 = getelementptr <2 x i64>, ptr %idata, i64 3
178  %6 = load <2 x i64>, ptr %5, align 16
179  %7 = getelementptr <2 x i64>, ptr %idata, i64 4
180  %8 = load <2 x i64>, ptr %7, align 16
181  %9 = getelementptr <2 x i64>, ptr %idata, i64 5
182  %10 = load <2 x i64>, ptr %9, align 16
183  %11 = getelementptr <2 x i64>, ptr %idata, i64 6
184  %12 = load <2 x i64>, ptr %11, align 16
185  %13 = getelementptr <2 x i64>, ptr %idata, i64 7
186  %14 = load <2 x i64>, ptr %13, align 16
187  %15 = tail call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesencwide128kl(ptr %h, <2 x i64> %0, <2 x i64> %2, <2 x i64> %4, <2 x i64> %6, <2 x i64> %8, <2 x i64> %10, <2 x i64> %12, <2 x i64> %14) #1
; Store the eight result vectors to %odata.
188  %16 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 1
189  store <2 x i64> %16, ptr %odata, align 16
190  %17 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 2
191  %18 = getelementptr <2 x i64>, ptr %odata, i64 1
192  store <2 x i64> %17, ptr %18, align 16
193  %19 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 3
194  %20 = getelementptr <2 x i64>, ptr %odata, i64 2
195  store <2 x i64> %19, ptr %20, align 16
196  %21 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 4
197  %22 = getelementptr <2 x i64>, ptr %odata, i64 3
198  store <2 x i64> %21, ptr %22, align 16
199  %23 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 5
200  %24 = getelementptr <2 x i64>, ptr %odata, i64 4
201  store <2 x i64> %23, ptr %24, align 16
202  %25 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 6
203  %26 = getelementptr <2 x i64>, ptr %odata, i64 5
204  store <2 x i64> %25, ptr %26, align 16
205  %27 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 7
206  %28 = getelementptr <2 x i64>, ptr %odata, i64 6
207  store <2 x i64> %27, ptr %28, align 16
208  %29 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 8
209  %30 = getelementptr <2 x i64>, ptr %odata, i64 7
210  store <2 x i64> %29, ptr %30, align 16
; The i8 in element 0 is returned to the caller.
211  %31 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 0
212  ret i8 %31
213}
214
; Fast-isel lowering of @llvm.x86.aesdecwide128kl.
; Eight <2 x i64> vectors are loaded from %idata (element indices 0..7,
; align 16) and passed to the intrinsic after the handle pointer %h.
; The intrinsic returns one i8 followed by eight vectors; elements 1..8 are
; stored to %odata at element indices 0..7 (align 16) and element 0 is the
; function's return value.
; The expected assembly loads into %xmm0-%xmm7 before the instruction, stores
; back from the same registers afterwards, and materializes the i8 with sete.
215define zeroext i8 @test__mm_aesdecwide128kl_u8(ptr %odata, ptr %idata, ptr %h) {
216; CHECK-LABEL: test__mm_aesdecwide128kl_u8:
217; CHECK:       # %bb.0: # %entry
218; CHECK-NEXT:    movaps (%rsi), %xmm0
219; CHECK-NEXT:    movaps 16(%rsi), %xmm1
220; CHECK-NEXT:    movaps 32(%rsi), %xmm2
221; CHECK-NEXT:    movaps 48(%rsi), %xmm3
222; CHECK-NEXT:    movaps 64(%rsi), %xmm4
223; CHECK-NEXT:    movaps 80(%rsi), %xmm5
224; CHECK-NEXT:    movaps 96(%rsi), %xmm6
225; CHECK-NEXT:    movaps 112(%rsi), %xmm7
226; CHECK-NEXT:    aesdecwide128kl (%rdx)
227; CHECK-NEXT:    sete %al
228; CHECK-NEXT:    movaps %xmm0, (%rdi)
229; CHECK-NEXT:    movaps %xmm1, 16(%rdi)
230; CHECK-NEXT:    movaps %xmm2, 32(%rdi)
231; CHECK-NEXT:    movaps %xmm3, 48(%rdi)
232; CHECK-NEXT:    movaps %xmm4, 64(%rdi)
233; CHECK-NEXT:    movaps %xmm5, 80(%rdi)
234; CHECK-NEXT:    movaps %xmm6, 96(%rdi)
235; CHECK-NEXT:    movaps %xmm7, 112(%rdi)
236; CHECK-NEXT:    retq
237entry:
; Load the eight input vectors from %idata.
238  %0 = load <2 x i64>, ptr %idata, align 16
239  %1 = getelementptr <2 x i64>, ptr %idata, i64 1
240  %2 = load <2 x i64>, ptr %1, align 16
241  %3 = getelementptr <2 x i64>, ptr %idata, i64 2
242  %4 = load <2 x i64>, ptr %3, align 16
243  %5 = getelementptr <2 x i64>, ptr %idata, i64 3
244  %6 = load <2 x i64>, ptr %5, align 16
245  %7 = getelementptr <2 x i64>, ptr %idata, i64 4
246  %8 = load <2 x i64>, ptr %7, align 16
247  %9 = getelementptr <2 x i64>, ptr %idata, i64 5
248  %10 = load <2 x i64>, ptr %9, align 16
249  %11 = getelementptr <2 x i64>, ptr %idata, i64 6
250  %12 = load <2 x i64>, ptr %11, align 16
251  %13 = getelementptr <2 x i64>, ptr %idata, i64 7
252  %14 = load <2 x i64>, ptr %13, align 16
253  %15 = tail call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesdecwide128kl(ptr %h, <2 x i64> %0, <2 x i64> %2, <2 x i64> %4, <2 x i64> %6, <2 x i64> %8, <2 x i64> %10, <2 x i64> %12, <2 x i64> %14) #1
; Store the eight result vectors to %odata.
254  %16 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 1
255  store <2 x i64> %16, ptr %odata, align 16
256  %17 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 2
257  %18 = getelementptr <2 x i64>, ptr %odata, i64 1
258  store <2 x i64> %17, ptr %18, align 16
259  %19 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 3
260  %20 = getelementptr <2 x i64>, ptr %odata, i64 2
261  store <2 x i64> %19, ptr %20, align 16
262  %21 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 4
263  %22 = getelementptr <2 x i64>, ptr %odata, i64 3
264  store <2 x i64> %21, ptr %22, align 16
265  %23 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 5
266  %24 = getelementptr <2 x i64>, ptr %odata, i64 4
267  store <2 x i64> %23, ptr %24, align 16
268  %25 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 6
269  %26 = getelementptr <2 x i64>, ptr %odata, i64 5
270  store <2 x i64> %25, ptr %26, align 16
271  %27 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 7
272  %28 = getelementptr <2 x i64>, ptr %odata, i64 6
273  store <2 x i64> %27, ptr %28, align 16
274  %29 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 8
275  %30 = getelementptr <2 x i64>, ptr %odata, i64 7
276  store <2 x i64> %29, ptr %30, align 16
; The i8 in element 0 is returned to the caller.
277  %31 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 0
278  ret i8 %31
279}
280
; Fast-isel lowering of @llvm.x86.aesencwide256kl.
; Eight <2 x i64> vectors are loaded from %idata (element indices 0..7,
; align 16) and passed to the intrinsic after the handle pointer %h.
; The intrinsic returns one i8 followed by eight vectors; elements 1..8 are
; stored to %odata at element indices 0..7 (align 16) and element 0 is the
; function's return value.
; The expected assembly loads into %xmm0-%xmm7 before the instruction, stores
; back from the same registers afterwards, and materializes the i8 with sete.
281define zeroext i8 @test__mm_aesencwide256kl_u8(ptr %odata, ptr %idata, ptr %h) {
282; CHECK-LABEL: test__mm_aesencwide256kl_u8:
283; CHECK:       # %bb.0: # %entry
284; CHECK-NEXT:    movaps (%rsi), %xmm0
285; CHECK-NEXT:    movaps 16(%rsi), %xmm1
286; CHECK-NEXT:    movaps 32(%rsi), %xmm2
287; CHECK-NEXT:    movaps 48(%rsi), %xmm3
288; CHECK-NEXT:    movaps 64(%rsi), %xmm4
289; CHECK-NEXT:    movaps 80(%rsi), %xmm5
290; CHECK-NEXT:    movaps 96(%rsi), %xmm6
291; CHECK-NEXT:    movaps 112(%rsi), %xmm7
292; CHECK-NEXT:    aesencwide256kl (%rdx)
293; CHECK-NEXT:    sete %al
294; CHECK-NEXT:    movaps %xmm0, (%rdi)
295; CHECK-NEXT:    movaps %xmm1, 16(%rdi)
296; CHECK-NEXT:    movaps %xmm2, 32(%rdi)
297; CHECK-NEXT:    movaps %xmm3, 48(%rdi)
298; CHECK-NEXT:    movaps %xmm4, 64(%rdi)
299; CHECK-NEXT:    movaps %xmm5, 80(%rdi)
300; CHECK-NEXT:    movaps %xmm6, 96(%rdi)
301; CHECK-NEXT:    movaps %xmm7, 112(%rdi)
302; CHECK-NEXT:    retq
303entry:
; Load the eight input vectors from %idata.
304  %0 = load <2 x i64>, ptr %idata, align 16
305  %1 = getelementptr <2 x i64>, ptr %idata, i64 1
306  %2 = load <2 x i64>, ptr %1, align 16
307  %3 = getelementptr <2 x i64>, ptr %idata, i64 2
308  %4 = load <2 x i64>, ptr %3, align 16
309  %5 = getelementptr <2 x i64>, ptr %idata, i64 3
310  %6 = load <2 x i64>, ptr %5, align 16
311  %7 = getelementptr <2 x i64>, ptr %idata, i64 4
312  %8 = load <2 x i64>, ptr %7, align 16
313  %9 = getelementptr <2 x i64>, ptr %idata, i64 5
314  %10 = load <2 x i64>, ptr %9, align 16
315  %11 = getelementptr <2 x i64>, ptr %idata, i64 6
316  %12 = load <2 x i64>, ptr %11, align 16
317  %13 = getelementptr <2 x i64>, ptr %idata, i64 7
318  %14 = load <2 x i64>, ptr %13, align 16
319  %15 = tail call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesencwide256kl(ptr %h, <2 x i64> %0, <2 x i64> %2, <2 x i64> %4, <2 x i64> %6, <2 x i64> %8, <2 x i64> %10, <2 x i64> %12, <2 x i64> %14) #1
; Store the eight result vectors to %odata.
320  %16 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 1
321  store <2 x i64> %16, ptr %odata, align 16
322  %17 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 2
323  %18 = getelementptr <2 x i64>, ptr %odata, i64 1
324  store <2 x i64> %17, ptr %18, align 16
325  %19 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 3
326  %20 = getelementptr <2 x i64>, ptr %odata, i64 2
327  store <2 x i64> %19, ptr %20, align 16
328  %21 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 4
329  %22 = getelementptr <2 x i64>, ptr %odata, i64 3
330  store <2 x i64> %21, ptr %22, align 16
331  %23 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 5
332  %24 = getelementptr <2 x i64>, ptr %odata, i64 4
333  store <2 x i64> %23, ptr %24, align 16
334  %25 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 6
335  %26 = getelementptr <2 x i64>, ptr %odata, i64 5
336  store <2 x i64> %25, ptr %26, align 16
337  %27 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 7
338  %28 = getelementptr <2 x i64>, ptr %odata, i64 6
339  store <2 x i64> %27, ptr %28, align 16
340  %29 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 8
341  %30 = getelementptr <2 x i64>, ptr %odata, i64 7
342  store <2 x i64> %29, ptr %30, align 16
; The i8 in element 0 is returned to the caller.
343  %31 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 0
344  ret i8 %31
345}
346
; Fast-isel lowering of @llvm.x86.aesdecwide256kl.
; Eight <2 x i64> vectors are loaded from %idata (element indices 0..7,
; align 16) and passed to the intrinsic after the handle pointer %h.
; The intrinsic returns one i8 followed by eight vectors; elements 1..8 are
; stored to %odata at element indices 0..7 (align 16) and element 0 is the
; function's return value.
; The expected assembly loads into %xmm0-%xmm7 before the instruction, stores
; back from the same registers afterwards, and materializes the i8 with sete.
347define zeroext i8 @test__mm_aesdecwide256kl_u8(ptr %odata, ptr %idata, ptr %h) {
348; CHECK-LABEL: test__mm_aesdecwide256kl_u8:
349; CHECK:       # %bb.0: # %entry
350; CHECK-NEXT:    movaps (%rsi), %xmm0
351; CHECK-NEXT:    movaps 16(%rsi), %xmm1
352; CHECK-NEXT:    movaps 32(%rsi), %xmm2
353; CHECK-NEXT:    movaps 48(%rsi), %xmm3
354; CHECK-NEXT:    movaps 64(%rsi), %xmm4
355; CHECK-NEXT:    movaps 80(%rsi), %xmm5
356; CHECK-NEXT:    movaps 96(%rsi), %xmm6
357; CHECK-NEXT:    movaps 112(%rsi), %xmm7
358; CHECK-NEXT:    aesdecwide256kl (%rdx)
359; CHECK-NEXT:    sete %al
360; CHECK-NEXT:    movaps %xmm0, (%rdi)
361; CHECK-NEXT:    movaps %xmm1, 16(%rdi)
362; CHECK-NEXT:    movaps %xmm2, 32(%rdi)
363; CHECK-NEXT:    movaps %xmm3, 48(%rdi)
364; CHECK-NEXT:    movaps %xmm4, 64(%rdi)
365; CHECK-NEXT:    movaps %xmm5, 80(%rdi)
366; CHECK-NEXT:    movaps %xmm6, 96(%rdi)
367; CHECK-NEXT:    movaps %xmm7, 112(%rdi)
368; CHECK-NEXT:    retq
369entry:
; Load the eight input vectors from %idata.
370  %0 = load <2 x i64>, ptr %idata, align 16
371  %1 = getelementptr <2 x i64>, ptr %idata, i64 1
372  %2 = load <2 x i64>, ptr %1, align 16
373  %3 = getelementptr <2 x i64>, ptr %idata, i64 2
374  %4 = load <2 x i64>, ptr %3, align 16
375  %5 = getelementptr <2 x i64>, ptr %idata, i64 3
376  %6 = load <2 x i64>, ptr %5, align 16
377  %7 = getelementptr <2 x i64>, ptr %idata, i64 4
378  %8 = load <2 x i64>, ptr %7, align 16
379  %9 = getelementptr <2 x i64>, ptr %idata, i64 5
380  %10 = load <2 x i64>, ptr %9, align 16
381  %11 = getelementptr <2 x i64>, ptr %idata, i64 6
382  %12 = load <2 x i64>, ptr %11, align 16
383  %13 = getelementptr <2 x i64>, ptr %idata, i64 7
384  %14 = load <2 x i64>, ptr %13, align 16
385  %15 = tail call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesdecwide256kl(ptr %h, <2 x i64> %0, <2 x i64> %2, <2 x i64> %4, <2 x i64> %6, <2 x i64> %8, <2 x i64> %10, <2 x i64> %12, <2 x i64> %14) #1
; Store the eight result vectors to %odata.
386  %16 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 1
387  store <2 x i64> %16, ptr %odata, align 16
388  %17 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 2
389  %18 = getelementptr <2 x i64>, ptr %odata, i64 1
390  store <2 x i64> %17, ptr %18, align 16
391  %19 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 3
392  %20 = getelementptr <2 x i64>, ptr %odata, i64 2
393  store <2 x i64> %19, ptr %20, align 16
394  %21 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 4
395  %22 = getelementptr <2 x i64>, ptr %odata, i64 3
396  store <2 x i64> %21, ptr %22, align 16
397  %23 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 5
398  %24 = getelementptr <2 x i64>, ptr %odata, i64 4
399  store <2 x i64> %23, ptr %24, align 16
400  %25 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 6
401  %26 = getelementptr <2 x i64>, ptr %odata, i64 5
402  store <2 x i64> %25, ptr %26, align 16
403  %27 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 7
404  %28 = getelementptr <2 x i64>, ptr %odata, i64 6
405  store <2 x i64> %27, ptr %28, align 16
406  %29 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 8
407  %30 = getelementptr <2 x i64>, ptr %odata, i64 7
408  store <2 x i64> %29, ptr %30, align 16
; The i8 in element 0 is returned to the caller.
409  %31 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %15, 0
410  ret i8 %31
411}
412
; Declarations of the intrinsics exercised by the tests in this file.
; - loadiwkey: three vectors plus an i32, no result.
; - encodekey128 / encodekey256: an i32 plus one or two key vectors; the result
;   struct is an i32 followed by six or seven vectors respectively.
; - aes{enc,dec}{128,256}kl: one vector plus a pointer; the result struct is an
;   i8 followed by one vector.
; - aes{enc,dec}wide{128,256}kl: a pointer plus eight vectors; the result
;   struct is an i8 followed by eight vectors.
413declare void @llvm.x86.loadiwkey(<2 x i64>, <2 x i64>, <2 x i64>, i32)
414declare { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.encodekey128(i32, <2 x i64>)
415declare { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.encodekey256(i32, <2 x i64>, <2 x i64>)
416declare { i8, <2 x i64> } @llvm.x86.aesenc256kl(<2 x i64>, ptr)
417declare { i8, <2 x i64> } @llvm.x86.aesdec256kl(<2 x i64>, ptr)
418declare { i8, <2 x i64> } @llvm.x86.aesenc128kl(<2 x i64>, ptr)
419declare { i8, <2 x i64> } @llvm.x86.aesdec128kl(<2 x i64>, ptr)
420declare { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesencwide128kl(ptr, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>)
421declare { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesdecwide128kl(ptr, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>)
422declare { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesencwide256kl(ptr, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>)
423declare { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesdecwide256kl(ptr, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>)
424