xref: /llvm-project/llvm/test/CodeGen/AMDGPU/select64.ll (revision 859ebca744e634dcc89a2294ffa41574f947bd62)
1; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck -check-prefixes=SI,GCN %s
2; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=VI,GCN %s
3
4; GCN-LABEL: {{^}}select0:
5; i64 select should be split into two i32 selects, and we shouldn't need
6; to use a shift to extract the hi dword of the input.
7; GCN-NOT: s_lshr_b64
8; GCN: v_cndmask
9; GCN: v_cndmask
define amdgpu_kernel void @select0(i64 addrspace(1)* %out, i32 %cond, i64 %in) {
; Stores i64 0 to %out when %cond is unsigned-greater-than 5, otherwise
; stores %in unchanged. The full 64-bit select result reaches the store.
entry:
  %cmp = icmp ugt i32 %cond, 5
  %sel = select i1 %cmp, i64 0, i64 %in
  store i64 %sel, i64 addrspace(1)* %out
  ret void
}
64
65; GCN-LABEL: {{^}}select_trunc_i64:
66; VI: s_cselect_b32
67; VI-NOT: s_cselect_b32
68; SI: v_cndmask_b32
69; SI-NOT: v_cndmask_b32
define amdgpu_kernel void @select_trunc_i64(i32 addrspace(1)* %out, i32 %cond, i64 %in) nounwind {
; Selects between i64 0 and %in on (%cond >u 5), then truncates the result
; to i32 before storing, so only the low 32 bits of the select are used.
  %cmp = icmp ugt i32 %cond, 5
  %sel = select i1 %cmp, i64 0, i64 %in
  %trunc = trunc i64 %sel to i32
  store i32 %trunc, i32 addrspace(1)* %out, align 4
  ret void
}
121
122; GCN-LABEL: {{^}}select_trunc_i64_2:
123; VI: s_cselect_b32
124; VI-NOT: s_cselect_b32
125; SI: v_cndmask_b32
126; SI-NOT: v_cndmask_b32
define amdgpu_kernel void @select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 %a, i64 %b) nounwind {
; Selects between the two i64 kernel arguments %a and %b on (%cond >u 5),
; then truncates the result to i32 before storing, so only the low 32 bits
; of the selected value are used.
  %cmp = icmp ugt i32 %cond, 5
  %sel = select i1 %cmp, i64 %a, i64 %b
  %trunc = trunc i64 %sel to i32
  store i32 %trunc, i32 addrspace(1)* %out, align 4
  ret void
}
179
180; GCN-LABEL: {{^}}v_select_trunc_i64_2:
181; VI: s_cselect_b32
182; VI-NOT: s_cselect_b32
183; SI: v_cndmask_b32
184; SI-NOT: v_cndmask_b32
define amdgpu_kernel void @v_select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
; Same shape as select_trunc_i64_2, except that the two i64 select operands
; are loaded from addrspace(1) pointers instead of being passed by value.
; Only the low 32 bits of the selected value are stored.
  %cmp = icmp ugt i32 %cond, 5
  %a = load i64, i64 addrspace(1)* %aptr, align 8
  %b = load i64, i64 addrspace(1)* %bptr, align 8
  %sel = select i1 %cmp, i64 %a, i64 %b
  %trunc = trunc i64 %sel to i32
  store i32 %trunc, i32 addrspace(1)* %out, align 4
  ret void
}
248
249; GCN-LABEL: {{^}}v_select_i64_split_imm:
250; GCN-DAG: v_cndmask_b32_e32 {{v[0-9]+}}, 0, {{v[0-9]+}}
251; GCN-DAG: v_cndmask_b32_e32 {{v[0-9]+}}, 63, {{v[0-9]+}}
252; GCN: s_endpgm
define amdgpu_kernel void @v_select_i64_split_imm(i64 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
; Selects between the loaded value %a and a 64-bit immediate on
; (%cond >u 5) and stores the full i64 result. The immediate is 63 << 32,
; i.e. its low dword is 0 and its high dword is 63.
  %cmp = icmp ugt i32 %cond, 5
  %a = load i64, i64 addrspace(1)* %aptr, align 8
  ; %b is loaded but does not feed the select below.
  %b = load i64, i64 addrspace(1)* %bptr, align 8
  %sel = select i1 %cmp, i64 %a, i64 270582939648 ; 63 << 32
  store i64 %sel, i64 addrspace(1)* %out, align 8
  ret void
}
318