; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+bmi,+bmi2 < %s | FileCheck %s

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"

; Stack reload folding tests.
;
; By including a nop call with side effects we can force a partial register spill of the
; relevant registers and check that the reload is correctly folded into the instruction.
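;
; The inline asm "nop" clobbers all of the general purpose registers (only
; %rsp is left alone), so the function arguments must be spilled to stack
; slots across it; the CHECK lines then verify that each intrinsic consumes
; its operand directly from the spill slot (the "Folded Reload" instructions)
; rather than via a separate reload.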
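; BZHI (BMI2) copies the source operand and zeroes every bit position greater
; than or equal to the index in the low 8 bits of the control operand; in the
; intrinsic below, %a0 is the source and %a1 supplies the index.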
define i32 @stack_fold_bzhi_u32(i32 %a0, i32 %a1)   {
; CHECK-LABEL: stack_fold_bzhi_u32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    pushq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    pushq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    pushq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    pushq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    pushq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 56
; CHECK-NEXT:    .cfi_offset %rbx, -56
; CHECK-NEXT:    .cfi_offset %r12, -48
; CHECK-NEXT:    .cfi_offset %r13, -40
; CHECK-NEXT:    .cfi_offset %r14, -32
; CHECK-NEXT:    .cfi_offset %r15, -24
; CHECK-NEXT:    .cfi_offset %rbp, -16
; CHECK-NEXT:    movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; CHECK-NEXT:    bzhil %eax, {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload
; CHECK-NEXT:    popq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    popq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    popq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    popq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    popq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    popq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  %1 = tail call i32 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  %2 = tail call i32 @llvm.x86.bmi.bzhi.32(i32 %a0, i32 %a1)
  ret i32 %2
}
declare i32 @llvm.x86.bmi.bzhi.32(i32, i32)

define i64 @stack_fold_bzhi_u64(i64 %a0, i64 %a1)   {
; CHECK-LABEL: stack_fold_bzhi_u64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    pushq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    pushq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    pushq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    pushq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    pushq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 56
; CHECK-NEXT:    .cfi_offset %rbx, -56
; CHECK-NEXT:    .cfi_offset %r12, -48
; CHECK-NEXT:    .cfi_offset %r13, -40
; CHECK-NEXT:    .cfi_offset %r14, -32
; CHECK-NEXT:    .cfi_offset %r15, -24
; CHECK-NEXT:    .cfi_offset %rbp, -16
; CHECK-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; CHECK-NEXT:    bzhiq %rax, {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
; CHECK-NEXT:    popq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    popq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    popq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    popq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    popq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    popq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  %2 = tail call i64 @llvm.x86.bmi.bzhi.64(i64 %a0, i64 %a1)
  ret i64 %2
}
declare i64 @llvm.x86.bmi.bzhi.64(i64, i64)
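; PDEP (BMI2) deposits the low-order bits of the source into the bit
; positions selected by the set bits of the mask; in the intrinsic below,
; %a0 is the source and %a1 the mask.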
define i32 @stack_fold_pdep_u32(i32 %a0, i32 %a1)   {
; CHECK-LABEL: stack_fold_pdep_u32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    pushq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    pushq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    pushq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    pushq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    pushq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 56
; CHECK-NEXT:    .cfi_offset %rbx, -56
; CHECK-NEXT:    .cfi_offset %r12, -48
; CHECK-NEXT:    .cfi_offset %r13, -40
; CHECK-NEXT:    .cfi_offset %r14, -32
; CHECK-NEXT:    .cfi_offset %r15, -24
; CHECK-NEXT:    .cfi_offset %rbp, -16
; CHECK-NEXT:    movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; CHECK-NEXT:    pdepl {{[-0-9]+}}(%r{{[sb]}}p), %eax, %eax # 4-byte Folded Reload
; CHECK-NEXT:    popq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    popq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    popq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    popq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    popq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    popq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  %1 = tail call i32 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  %2 = tail call i32 @llvm.x86.bmi.pdep.32(i32 %a0, i32 %a1)
  ret i32 %2
}
declare i32 @llvm.x86.bmi.pdep.32(i32, i32)

define i64 @stack_fold_pdep_u64(i64 %a0, i64 %a1)   {
; CHECK-LABEL: stack_fold_pdep_u64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    pushq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    pushq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    pushq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    pushq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    pushq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 56
; CHECK-NEXT:    .cfi_offset %rbx, -56
; CHECK-NEXT:    .cfi_offset %r12, -48
; CHECK-NEXT:    .cfi_offset %r13, -40
; CHECK-NEXT:    .cfi_offset %r14, -32
; CHECK-NEXT:    .cfi_offset %r15, -24
; CHECK-NEXT:    .cfi_offset %rbp, -16
; CHECK-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; CHECK-NEXT:    pdepq {{[-0-9]+}}(%r{{[sb]}}p), %rax, %rax # 8-byte Folded Reload
; CHECK-NEXT:    popq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    popq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    popq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    popq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    popq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    popq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  %2 = tail call i64 @llvm.x86.bmi.pdep.64(i64 %a0, i64 %a1)
  ret i64 %2
}
declare i64 @llvm.x86.bmi.pdep.64(i64, i64)
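; PEXT (BMI2) extracts the source bits at the positions selected by the set
; bits of the mask and packs them into the low-order bits of the result; in
; the intrinsic below, %a0 is the source and %a1 the mask.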
define i32 @stack_fold_pext_u32(i32 %a0, i32 %a1)   {
; CHECK-LABEL: stack_fold_pext_u32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    pushq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    pushq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    pushq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    pushq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    pushq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 56
; CHECK-NEXT:    .cfi_offset %rbx, -56
; CHECK-NEXT:    .cfi_offset %r12, -48
; CHECK-NEXT:    .cfi_offset %r13, -40
; CHECK-NEXT:    .cfi_offset %r14, -32
; CHECK-NEXT:    .cfi_offset %r15, -24
; CHECK-NEXT:    .cfi_offset %rbp, -16
; CHECK-NEXT:    movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; CHECK-NEXT:    pextl {{[-0-9]+}}(%r{{[sb]}}p), %eax, %eax # 4-byte Folded Reload
; CHECK-NEXT:    popq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    popq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    popq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    popq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    popq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    popq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  %1 = tail call i32 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  %2 = tail call i32 @llvm.x86.bmi.pext.32(i32 %a0, i32 %a1)
  ret i32 %2
}
declare i32 @llvm.x86.bmi.pext.32(i32, i32)

define i64 @stack_fold_pext_u64(i64 %a0, i64 %a1)   {
; CHECK-LABEL: stack_fold_pext_u64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    pushq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    pushq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    pushq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    pushq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    pushq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 56
; CHECK-NEXT:    .cfi_offset %rbx, -56
; CHECK-NEXT:    .cfi_offset %r12, -48
; CHECK-NEXT:    .cfi_offset %r13, -40
; CHECK-NEXT:    .cfi_offset %r14, -32
; CHECK-NEXT:    .cfi_offset %r15, -24
; CHECK-NEXT:    .cfi_offset %rbp, -16
; CHECK-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; CHECK-NEXT:    pextq {{[-0-9]+}}(%r{{[sb]}}p), %rax, %rax # 8-byte Folded Reload
; CHECK-NEXT:    popq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    popq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    popq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    popq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    popq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    popq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  %2 = tail call i64 @llvm.x86.bmi.pext.64(i64 %a0, i64 %a1)
  ret i64 %2
}
declare i64 @llvm.x86.bmi.pext.64(i64, i64)