; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown -mattr=+kl,widekl | FileCheck %s --check-prefix=X64
; RUN: llc < %s -verify-machineinstrs -mtriple=i386-unknown-unknown -mattr=+kl,widekl -mattr=+avx2 | FileCheck %s --check-prefix=X86
; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown -mattr=+widekl | FileCheck %s --check-prefix=X64
; RUN: llc < %s -verify-machineinstrs -mtriple=i386-unknown-unknown -mattr=+widekl -mattr=+avx2 | FileCheck %s --check-prefix=X86

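; Declarations of the Key Locker intrinsics exercised by the tests below.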
declare void @llvm.x86.loadiwkey(<2 x i64>, <2 x i64>, <2 x i64>, i32)
declare { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.encodekey128(i32, <2 x i64>)
declare { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.encodekey256(i32, <2 x i64>, <2 x i64>)
declare { i8, <2 x i64> } @llvm.x86.aesenc128kl(<2 x i64>, ptr)
declare { i8, <2 x i64> } @llvm.x86.aesdec128kl(<2 x i64>, ptr)
declare { i8, <2 x i64> } @llvm.x86.aesenc256kl(<2 x i64>, ptr)
declare { i8, <2 x i64> } @llvm.x86.aesdec256kl(<2 x i64>, ptr)
declare { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesencwide128kl(ptr, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>)
declare { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesdecwide128kl(ptr, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>)
declare { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesencwide256kl(ptr, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>)
declare { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesdecwide256kl(ptr, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>)

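; LOADIWKEY takes its control value in EAX and the key material in XMM
; registers, so the i32 %ctl argument is copied into EAX before the instruction.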
define void @test_loadiwkey(i32 %ctl, <2 x i64> %intkey, <2 x i64> %enkey_lo, <2 x i64> %enkey_hi) {
; X64-LABEL: test_loadiwkey:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    loadiwkey %xmm2, %xmm1
; X64-NEXT:    retq
;
; X86-LABEL: test_loadiwkey:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    loadiwkey %xmm2, %xmm1
; X86-NEXT:    retl
entry:
  tail call void @llvm.x86.loadiwkey(<2 x i64> %intkey, <2 x i64> %enkey_lo, <2 x i64> %enkey_hi, i32 %ctl)
  ret void
}

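; ENCODEKEY128/ENCODEKEY256 return an i32 result in a GPR and the key handle in
; XMM0-XMM2 (plus XMM3 for the 256-bit form); the tests store the handle pieces
; through the pointer arguments and return the i32 value.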
define i32 @test_encodekey128_u32(i32 %htype, <2 x i64> %key, ptr nocapture %h0, ptr nocapture %h1, ptr nocapture %h2, ptr nocapture %h3, ptr nocapture %h4, ptr nocapture %h5) nounwind {
; X64-LABEL: test_encodekey128_u32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    encodekey128 %edi, %eax
; X64-NEXT:    movaps %xmm0, (%rsi)
; X64-NEXT:    movaps %xmm1, (%rdx)
; X64-NEXT:    movaps %xmm2, (%rcx)
; X64-NEXT:    retq
;
; X86-LABEL: test_encodekey128_u32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    encodekey128 %eax, %eax
; X86-NEXT:    vmovaps %xmm0, (%esi)
; X86-NEXT:    vmovaps %xmm1, (%edx)
; X86-NEXT:    vmovaps %xmm2, (%ecx)
; X86-NEXT:    popl %esi
; X86-NEXT:    retl
entry:
  %0 = tail call { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.encodekey128(i32 %htype, <2 x i64> %key)
  %1 = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 1
  store <2 x i64> %1, ptr %h0, align 16
  %2 = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 2
  store <2 x i64> %2, ptr %h1, align 16
  %3 = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 3
  store <2 x i64> %3, ptr %h2, align 16
  %4 = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 0
  ret i32 %4
}

define i32 @test_encodekey256_u32(i32 %htype, <2 x i64> %key_lo, <2 x i64> %key_hi, ptr nocapture %h0, ptr nocapture %h1, ptr nocapture %h2, ptr nocapture %h3, ptr nocapture %h4, ptr nocapture %h5, ptr nocapture readnone %h6) nounwind {
; X64-LABEL: test_encodekey256_u32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    encodekey256 %edi, %eax
; X64-NEXT:    movaps %xmm0, (%rsi)
; X64-NEXT:    movaps %xmm1, (%rdx)
; X64-NEXT:    movaps %xmm2, (%rcx)
; X64-NEXT:    movaps %xmm3, (%r8)
; X64-NEXT:    retq
;
; X86-LABEL: test_encodekey256_u32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    encodekey256 %eax, %eax
; X86-NEXT:    vmovaps %xmm0, (%edi)
; X86-NEXT:    vmovaps %xmm1, (%esi)
; X86-NEXT:    vmovaps %xmm2, (%edx)
; X86-NEXT:    vmovaps %xmm3, (%ecx)
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    retl
entry:
  %0 = tail call { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.encodekey256(i32 %htype, <2 x i64> %key_lo, <2 x i64> %key_hi)
  %1 = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 1
  store <2 x i64> %1, ptr %h0, align 16
  %2 = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 2
  store <2 x i64> %2, ptr %h1, align 16
  %3 = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 3
  store <2 x i64> %3, ptr %h2, align 16
  %4 = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 4
  store <2 x i64> %4, ptr %h3, align 16
  %5 = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 0
  ret i32 %5
}

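; The single-block AES*KL forms take the handle as a memory operand and report
; their outcome through ZF; SETE materializes that flag into the returned i8.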
define i8 @test_mm_aesenc128kl_u8(<2 x i64> %data, ptr %h, ptr %out) {
; X64-LABEL: test_mm_aesenc128kl_u8:
; X64:       # %bb.0: # %entry
; X64-NEXT:    aesenc128kl (%rdi), %xmm0
; X64-NEXT:    sete %al
; X64-NEXT:    movaps %xmm0, (%rsi)
; X64-NEXT:    retq
;
; X86-LABEL: test_mm_aesenc128kl_u8:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    aesenc128kl (%eax), %xmm0
; X86-NEXT:    sete %al
; X86-NEXT:    vmovaps %xmm0, (%ecx)
; X86-NEXT:    retl
entry:
  %0 = tail call { i8, <2 x i64> } @llvm.x86.aesenc128kl(<2 x i64> %data, ptr %h)
  %1 = extractvalue { i8, <2 x i64> } %0, 1
  store <2 x i64> %1, ptr %out
  %2 = extractvalue { i8, <2 x i64> } %0, 0
  ret i8 %2
}

define i8 @test_mm_aesdec128kl_u8(<2 x i64> %data, ptr %h, ptr %out) {
; X64-LABEL: test_mm_aesdec128kl_u8:
; X64:       # %bb.0: # %entry
; X64-NEXT:    aesdec128kl (%rdi), %xmm0
; X64-NEXT:    sete %al
; X64-NEXT:    movaps %xmm0, (%rsi)
; X64-NEXT:    retq
;
; X86-LABEL: test_mm_aesdec128kl_u8:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    aesdec128kl (%eax), %xmm0
; X86-NEXT:    sete %al
; X86-NEXT:    vmovaps %xmm0, (%ecx)
; X86-NEXT:    retl
entry:
  %0 = tail call { i8, <2 x i64> } @llvm.x86.aesdec128kl(<2 x i64> %data, ptr %h)
  %1 = extractvalue { i8, <2 x i64> } %0, 1
  store <2 x i64> %1, ptr %out
  %2 = extractvalue { i8, <2 x i64> } %0, 0
  ret i8 %2
}

define i8 @test_mm_aesenc256kl_u8(<2 x i64> %data, ptr %h, ptr %out) {
; X64-LABEL: test_mm_aesenc256kl_u8:
; X64:       # %bb.0: # %entry
; X64-NEXT:    aesenc256kl (%rdi), %xmm0
; X64-NEXT:    sete %al
; X64-NEXT:    movaps %xmm0, (%rsi)
; X64-NEXT:    retq
;
; X86-LABEL: test_mm_aesenc256kl_u8:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    aesenc256kl (%eax), %xmm0
; X86-NEXT:    sete %al
; X86-NEXT:    vmovaps %xmm0, (%ecx)
; X86-NEXT:    retl
entry:
  %0 = tail call { i8, <2 x i64> } @llvm.x86.aesenc256kl(<2 x i64> %data, ptr %h)
  %1 = extractvalue { i8, <2 x i64> } %0, 1
  store <2 x i64> %1, ptr %out
  %2 = extractvalue { i8, <2 x i64> } %0, 0
  ret i8 %2
}

define i8 @test_mm_aesdec256kl_u8(<2 x i64> %data, ptr %h, ptr %out) {
; X64-LABEL: test_mm_aesdec256kl_u8:
; X64:       # %bb.0: # %entry
; X64-NEXT:    aesdec256kl (%rdi), %xmm0
; X64-NEXT:    sete %al
; X64-NEXT:    movaps %xmm0, (%rsi)
; X64-NEXT:    retq
;
; X86-LABEL: test_mm_aesdec256kl_u8:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    aesdec256kl (%eax), %xmm0
; X86-NEXT:    sete %al
; X86-NEXT:    vmovaps %xmm0, (%ecx)
; X86-NEXT:    retl
entry:
  %0 = tail call { i8, <2 x i64> } @llvm.x86.aesdec256kl(<2 x i64> %data, ptr %h)
  %1 = extractvalue { i8, <2 x i64> } %0, 1
  store <2 x i64> %1, ptr %out
  %2 = extractvalue { i8, <2 x i64> } %0, 0
  ret i8 %2
}

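; The wide forms process eight blocks at once: the data blocks are passed
; implicitly in XMM0-XMM7 and only the handle is an explicit memory operand, so
; the 32-bit lowering first loads the stack-passed vectors into XMM3-XMM7.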
define i8 @test_mm_aesencwide128kl_u8(ptr %p, <2 x i64> %v0, <2 x i64> %v1, <2 x i64> %v2, <2 x i64> %v3, <2 x i64> %v4, <2 x i64> %v5, <2 x i64> %v6, <2 x i64> %v7, ptr %out0, ptr %out1, ptr %out2, ptr %out3, ptr %out4, ptr %out5, ptr %out6, ptr %out7) nounwind {
; X64-LABEL: test_mm_aesencwide128kl_u8:
; X64:       # %bb.0: # %entry
; X64-NEXT:    pushq %rbx
; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r10
; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r11
; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
; X64-NEXT:    aesencwide128kl (%rdi)
; X64-NEXT:    sete %al
; X64-NEXT:    movaps %xmm0, (%rsi)
; X64-NEXT:    movaps %xmm1, (%rdx)
; X64-NEXT:    movaps %xmm1, (%rcx)
; X64-NEXT:    movaps %xmm1, (%r8)
; X64-NEXT:    movaps %xmm1, (%r9)
; X64-NEXT:    movaps %xmm1, (%rbx)
; X64-NEXT:    movaps %xmm1, (%r11)
; X64-NEXT:    movaps %xmm1, (%r10)
; X64-NEXT:    popq %rbx
; X64-NEXT:    retq
;
; X86-LABEL: test_mm_aesencwide128kl_u8:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-16, %esp
; X86-NEXT:    subl $16, %esp
; X86-NEXT:    vmovaps 24(%ebp), %xmm3
; X86-NEXT:    vmovaps 40(%ebp), %xmm4
; X86-NEXT:    vmovaps 56(%ebp), %xmm5
; X86-NEXT:    vmovaps 72(%ebp), %xmm6
; X86-NEXT:    vmovaps 88(%ebp), %xmm7
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    aesencwide128kl (%eax)
; X86-NEXT:    movl 104(%ebp), %eax
; X86-NEXT:    vmovaps %xmm0, (%eax)
; X86-NEXT:    movl 108(%ebp), %eax
; X86-NEXT:    vmovaps %xmm1, (%eax)
; X86-NEXT:    movl 112(%ebp), %eax
; X86-NEXT:    vmovaps %xmm1, (%eax)
; X86-NEXT:    movl 116(%ebp), %eax
; X86-NEXT:    vmovaps %xmm1, (%eax)
; X86-NEXT:    movl 120(%ebp), %eax
; X86-NEXT:    vmovaps %xmm1, (%eax)
; X86-NEXT:    movl 124(%ebp), %eax
; X86-NEXT:    vmovaps %xmm1, (%eax)
; X86-NEXT:    movl 128(%ebp), %eax
; X86-NEXT:    vmovaps %xmm1, (%eax)
; X86-NEXT:    movl 132(%ebp), %eax
; X86-NEXT:    vmovaps %xmm1, (%eax)
; X86-NEXT:    sete %al
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
entry:
  %0 = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesencwide128kl(ptr %p, <2 x i64> %v0, <2 x i64> %v1, <2 x i64> %v2, <2 x i64> %v3, <2 x i64> %v4, <2 x i64> %v5, <2 x i64> %v6, <2 x i64> %v7)
  %1 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 1
  store <2 x i64> %1, ptr %out0
  %2 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 2
  store <2 x i64> %2, ptr %out1
  %3 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 3
  store <2 x i64> %2, ptr %out2
  %4 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 4
  store <2 x i64> %2, ptr %out3
  %5 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 5
  store <2 x i64> %2, ptr %out4
  %6 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 6
  store <2 x i64> %2, ptr %out5
  %7 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 7
  store <2 x i64> %2, ptr %out6
  %8 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 8
  store <2 x i64> %2, ptr %out7
  %9 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 0
  ret i8 %9
}

define i8 @test_mm_aesdecwide128kl_u8(ptr %p, <2 x i64> %v0, <2 x i64> %v1, <2 x i64> %v2, <2 x i64> %v3, <2 x i64> %v4, <2 x i64> %v5, <2 x i64> %v6, <2 x i64> %v7, ptr %out0, ptr %out1, ptr %out2, ptr %out3, ptr %out4, ptr %out5, ptr %out6, ptr %out7) nounwind {
; X64-LABEL: test_mm_aesdecwide128kl_u8:
; X64:       # %bb.0: # %entry
; X64-NEXT:    pushq %rbx
; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r10
; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r11
; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
; X64-NEXT:    aesdecwide128kl (%rdi)
; X64-NEXT:    sete %al
; X64-NEXT:    movaps %xmm0, (%rsi)
; X64-NEXT:    movaps %xmm1, (%rdx)
; X64-NEXT:    movaps %xmm1, (%rcx)
; X64-NEXT:    movaps %xmm1, (%r8)
; X64-NEXT:    movaps %xmm1, (%r9)
; X64-NEXT:    movaps %xmm1, (%rbx)
; X64-NEXT:    movaps %xmm1, (%r11)
; X64-NEXT:    movaps %xmm1, (%r10)
; X64-NEXT:    popq %rbx
; X64-NEXT:    retq
;
; X86-LABEL: test_mm_aesdecwide128kl_u8:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-16, %esp
; X86-NEXT:    subl $16, %esp
; X86-NEXT:    vmovaps 24(%ebp), %xmm3
; X86-NEXT:    vmovaps 40(%ebp), %xmm4
; X86-NEXT:    vmovaps 56(%ebp), %xmm5
; X86-NEXT:    vmovaps 72(%ebp), %xmm6
; X86-NEXT:    vmovaps 88(%ebp), %xmm7
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    aesdecwide128kl (%eax)
; X86-NEXT:    movl 104(%ebp), %eax
; X86-NEXT:    vmovaps %xmm0, (%eax)
; X86-NEXT:    movl 108(%ebp), %eax
; X86-NEXT:    vmovaps %xmm1, (%eax)
; X86-NEXT:    movl 112(%ebp), %eax
; X86-NEXT:    vmovaps %xmm1, (%eax)
; X86-NEXT:    movl 116(%ebp), %eax
; X86-NEXT:    vmovaps %xmm1, (%eax)
; X86-NEXT:    movl 120(%ebp), %eax
; X86-NEXT:    vmovaps %xmm1, (%eax)
; X86-NEXT:    movl 124(%ebp), %eax
; X86-NEXT:    vmovaps %xmm1, (%eax)
; X86-NEXT:    movl 128(%ebp), %eax
; X86-NEXT:    vmovaps %xmm1, (%eax)
; X86-NEXT:    movl 132(%ebp), %eax
; X86-NEXT:    vmovaps %xmm1, (%eax)
; X86-NEXT:    sete %al
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
entry:
  %0 = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesdecwide128kl(ptr %p, <2 x i64> %v0, <2 x i64> %v1, <2 x i64> %v2, <2 x i64> %v3, <2 x i64> %v4, <2 x i64> %v5, <2 x i64> %v6, <2 x i64> %v7)
  %1 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 1
  store <2 x i64> %1, ptr %out0
  %2 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 2
  store <2 x i64> %2, ptr %out1
  %3 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 3
  store <2 x i64> %2, ptr %out2
  %4 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 4
  store <2 x i64> %2, ptr %out3
  %5 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 5
  store <2 x i64> %2, ptr %out4
  %6 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 6
  store <2 x i64> %2, ptr %out5
  %7 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 7
  store <2 x i64> %2, ptr %out6
  %8 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 8
  store <2 x i64> %2, ptr %out7
  %9 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 0
  ret i8 %9
}

define i8 @test_mm_aesencwide256kl_u8(ptr %p, <2 x i64> %v0, <2 x i64> %v1, <2 x i64> %v2, <2 x i64> %v3, <2 x i64> %v4, <2 x i64> %v5, <2 x i64> %v6, <2 x i64> %v7, ptr %out0, ptr %out1, ptr %out2, ptr %out3, ptr %out4, ptr %out5, ptr %out6, ptr %out7) nounwind {
; X64-LABEL: test_mm_aesencwide256kl_u8:
; X64:       # %bb.0: # %entry
; X64-NEXT:    pushq %rbx
; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r10
; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r11
; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
; X64-NEXT:    aesencwide256kl (%rdi)
; X64-NEXT:    sete %al
; X64-NEXT:    movaps %xmm0, (%rsi)
; X64-NEXT:    movaps %xmm1, (%rdx)
; X64-NEXT:    movaps %xmm1, (%rcx)
; X64-NEXT:    movaps %xmm1, (%r8)
; X64-NEXT:    movaps %xmm1, (%r9)
; X64-NEXT:    movaps %xmm1, (%rbx)
; X64-NEXT:    movaps %xmm1, (%r11)
; X64-NEXT:    movaps %xmm1, (%r10)
; X64-NEXT:    popq %rbx
; X64-NEXT:    retq
;
; X86-LABEL: test_mm_aesencwide256kl_u8:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-16, %esp
; X86-NEXT:    subl $16, %esp
; X86-NEXT:    vmovaps 24(%ebp), %xmm3
; X86-NEXT:    vmovaps 40(%ebp), %xmm4
; X86-NEXT:    vmovaps 56(%ebp), %xmm5
; X86-NEXT:    vmovaps 72(%ebp), %xmm6
; X86-NEXT:    vmovaps 88(%ebp), %xmm7
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    aesencwide256kl (%eax)
; X86-NEXT:    movl 104(%ebp), %eax
; X86-NEXT:    vmovaps %xmm0, (%eax)
; X86-NEXT:    movl 108(%ebp), %eax
; X86-NEXT:    vmovaps %xmm1, (%eax)
; X86-NEXT:    movl 112(%ebp), %eax
; X86-NEXT:    vmovaps %xmm1, (%eax)
; X86-NEXT:    movl 116(%ebp), %eax
; X86-NEXT:    vmovaps %xmm1, (%eax)
; X86-NEXT:    movl 120(%ebp), %eax
; X86-NEXT:    vmovaps %xmm1, (%eax)
; X86-NEXT:    movl 124(%ebp), %eax
; X86-NEXT:    vmovaps %xmm1, (%eax)
; X86-NEXT:    movl 128(%ebp), %eax
; X86-NEXT:    vmovaps %xmm1, (%eax)
; X86-NEXT:    movl 132(%ebp), %eax
; X86-NEXT:    vmovaps %xmm1, (%eax)
; X86-NEXT:    sete %al
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
entry:
  %0 = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesencwide256kl(ptr %p, <2 x i64> %v0, <2 x i64> %v1, <2 x i64> %v2, <2 x i64> %v3, <2 x i64> %v4, <2 x i64> %v5, <2 x i64> %v6, <2 x i64> %v7)
  %1 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 1
  store <2 x i64> %1, ptr %out0
  %2 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 2
  store <2 x i64> %2, ptr %out1
  %3 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 3
  store <2 x i64> %2, ptr %out2
  %4 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 4
  store <2 x i64> %2, ptr %out3
  %5 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 5
  store <2 x i64> %2, ptr %out4
  %6 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 6
  store <2 x i64> %2, ptr %out5
  %7 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 7
  store <2 x i64> %2, ptr %out6
  %8 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 8
  store <2 x i64> %2, ptr %out7
  %9 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 0
  ret i8 %9
}

define i8 @test_mm_aesdecwide256kl_u8(ptr %p, <2 x i64> %v0, <2 x i64> %v1, <2 x i64> %v2, <2 x i64> %v3, <2 x i64> %v4, <2 x i64> %v5, <2 x i64> %v6, <2 x i64> %v7, ptr %out0, ptr %out1, ptr %out2, ptr %out3, ptr %out4, ptr %out5, ptr %out6, ptr %out7) nounwind {
; X64-LABEL: test_mm_aesdecwide256kl_u8:
; X64:       # %bb.0: # %entry
; X64-NEXT:    pushq %rbx
; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r10
; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r11
; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
; X64-NEXT:    aesdecwide256kl (%rdi)
; X64-NEXT:    sete %al
; X64-NEXT:    movaps %xmm0, (%rsi)
; X64-NEXT:    movaps %xmm1, (%rdx)
; X64-NEXT:    movaps %xmm1, (%rcx)
; X64-NEXT:    movaps %xmm1, (%r8)
; X64-NEXT:    movaps %xmm1, (%r9)
; X64-NEXT:    movaps %xmm1, (%rbx)
; X64-NEXT:    movaps %xmm1, (%r11)
; X64-NEXT:    movaps %xmm1, (%r10)
; X64-NEXT:    popq %rbx
; X64-NEXT:    retq
;
; X86-LABEL: test_mm_aesdecwide256kl_u8:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-16, %esp
; X86-NEXT:    subl $16, %esp
; X86-NEXT:    vmovaps 24(%ebp), %xmm3
; X86-NEXT:    vmovaps 40(%ebp), %xmm4
; X86-NEXT:    vmovaps 56(%ebp), %xmm5
; X86-NEXT:    vmovaps 72(%ebp), %xmm6
; X86-NEXT:    vmovaps 88(%ebp), %xmm7
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    aesdecwide256kl (%eax)
; X86-NEXT:    movl 104(%ebp), %eax
; X86-NEXT:    vmovaps %xmm0, (%eax)
; X86-NEXT:    movl 108(%ebp), %eax
; X86-NEXT:    vmovaps %xmm1, (%eax)
; X86-NEXT:    movl 112(%ebp), %eax
; X86-NEXT:    vmovaps %xmm1, (%eax)
; X86-NEXT:    movl 116(%ebp), %eax
; X86-NEXT:    vmovaps %xmm1, (%eax)
; X86-NEXT:    movl 120(%ebp), %eax
; X86-NEXT:    vmovaps %xmm1, (%eax)
; X86-NEXT:    movl 124(%ebp), %eax
; X86-NEXT:    vmovaps %xmm1, (%eax)
; X86-NEXT:    movl 128(%ebp), %eax
; X86-NEXT:    vmovaps %xmm1, (%eax)
; X86-NEXT:    movl 132(%ebp), %eax
; X86-NEXT:    vmovaps %xmm1, (%eax)
; X86-NEXT:    sete %al
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
entry:
  %0 = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesdecwide256kl(ptr %p, <2 x i64> %v0, <2 x i64> %v1, <2 x i64> %v2, <2 x i64> %v3, <2 x i64> %v4, <2 x i64> %v5, <2 x i64> %v6, <2 x i64> %v7)
  %1 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 1
  store <2 x i64> %1, ptr %out0
  %2 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 2
  store <2 x i64> %2, ptr %out1
  %3 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 3
  store <2 x i64> %2, ptr %out2
  %4 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 4
  store <2 x i64> %2, ptr %out3
  %5 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 5
  store <2 x i64> %2, ptr %out4
  %6 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 6
  store <2 x i64> %2, ptr %out5
  %7 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 7
  store <2 x i64> %2, ptr %out6
  %8 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 8
  store <2 x i64> %2, ptr %out7
  %9 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 0
  ret i8 %9
}

; Tests to make sure we can select an appropriate addressing mode for a global.

@foo = external dso_local global [64 x i8]

define i8 @test_mm_aesenc256kl_u8_global(<2 x i64> %data, ptr %out) {
; X64-LABEL: test_mm_aesenc256kl_u8_global:
; X64:       # %bb.0: # %entry
; X64-NEXT:    aesenc256kl foo(%rip), %xmm0
; X64-NEXT:    sete %al
; X64-NEXT:    movaps %xmm0, (%rdi)
; X64-NEXT:    retq
;
; X86-LABEL: test_mm_aesenc256kl_u8_global:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    aesenc256kl foo, %xmm0
; X86-NEXT:    sete %al
; X86-NEXT:    vmovaps %xmm0, (%ecx)
; X86-NEXT:    retl
entry:
  %0 = tail call { i8, <2 x i64> } @llvm.x86.aesenc256kl(<2 x i64> %data, ptr @foo)
  %1 = extractvalue { i8, <2 x i64> } %0, 1
  store <2 x i64> %1, ptr %out
  %2 = extractvalue { i8, <2 x i64> } %0, 0
  ret i8 %2
}

define i8 @test_mm_aesdecwide256kl_u8_global(<2 x i64> %v0, <2 x i64> %v1, <2 x i64> %v2, <2 x i64> %v3, <2 x i64> %v4, <2 x i64> %v5, <2 x i64> %v6, <2 x i64> %v7, ptr %out0, ptr %out1, ptr %out2, ptr %out3, ptr %out4, ptr %out5, ptr %out6, ptr %out7) nounwind {
; X64-LABEL: test_mm_aesdecwide256kl_u8_global:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r10
; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r11
; X64-NEXT:    aesdecwide256kl foo(%rip)
; X64-NEXT:    sete %al
; X64-NEXT:    movaps %xmm0, (%rdi)
; X64-NEXT:    movaps %xmm1, (%rsi)
; X64-NEXT:    movaps %xmm1, (%rdx)
; X64-NEXT:    movaps %xmm1, (%rcx)
; X64-NEXT:    movaps %xmm1, (%r8)
; X64-NEXT:    movaps %xmm1, (%r9)
; X64-NEXT:    movaps %xmm1, (%r11)
; X64-NEXT:    movaps %xmm1, (%r10)
; X64-NEXT:    retq
;
; X86-LABEL: test_mm_aesdecwide256kl_u8_global:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-16, %esp
; X86-NEXT:    subl $16, %esp
; X86-NEXT:    movl 88(%ebp), %eax
; X86-NEXT:    vmovaps 8(%ebp), %xmm3
; X86-NEXT:    vmovaps 24(%ebp), %xmm4
; X86-NEXT:    vmovaps 40(%ebp), %xmm5
; X86-NEXT:    vmovaps 56(%ebp), %xmm6
; X86-NEXT:    vmovaps 72(%ebp), %xmm7
; X86-NEXT:    aesdecwide256kl foo
; X86-NEXT:    vmovaps %xmm0, (%eax)
; X86-NEXT:    movl 92(%ebp), %eax
; X86-NEXT:    vmovaps %xmm1, (%eax)
; X86-NEXT:    movl 96(%ebp), %eax
; X86-NEXT:    vmovaps %xmm1, (%eax)
; X86-NEXT:    movl 100(%ebp), %eax
; X86-NEXT:    vmovaps %xmm1, (%eax)
; X86-NEXT:    movl 104(%ebp), %eax
; X86-NEXT:    vmovaps %xmm1, (%eax)
; X86-NEXT:    movl 108(%ebp), %eax
; X86-NEXT:    vmovaps %xmm1, (%eax)
; X86-NEXT:    movl 112(%ebp), %eax
; X86-NEXT:    vmovaps %xmm1, (%eax)
; X86-NEXT:    movl 116(%ebp), %eax
; X86-NEXT:    vmovaps %xmm1, (%eax)
; X86-NEXT:    sete %al
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
entry:
  %0 = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesdecwide256kl(ptr @foo, <2 x i64> %v0, <2 x i64> %v1, <2 x i64> %v2, <2 x i64> %v3, <2 x i64> %v4, <2 x i64> %v5, <2 x i64> %v6, <2 x i64> %v7)
  %1 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 1
  store <2 x i64> %1, ptr %out0
  %2 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 2
  store <2 x i64> %2, ptr %out1
  %3 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 3
  store <2 x i64> %2, ptr %out2
  %4 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 4
  store <2 x i64> %2, ptr %out3
  %5 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 5
  store <2 x i64> %2, ptr %out4
  %6 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 6
  store <2 x i64> %2, ptr %out5
  %7 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 7
  store <2 x i64> %2, ptr %out6
  %8 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 8
  store <2 x i64> %2, ptr %out7
  %9 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 0
  ret i8 %9
}