xref: /llvm-project/llvm/test/CodeGen/AMDGPU/select-vectors.ll (revision 758444ca3e7163a1504eeced3383af861d01d761)
1; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
2; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89 %s
3; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s
4
5; Test expansion of scalar selects on vectors.
6; Evergreen not enabled since it seems to be having problems with doubles.
7
; Scalar-condition select between two <2 x i8> vectors loaded through
; addrspace(1) pointers. The condition is (%c == 0).
; Tahiti and gfx900 must emit exactly one v_cndmask_b32; tonga is checked for
; an s_cselect_b64 followed by a v_cndmask_b32.
; GCN-LABEL: {{^}}v_select_v2i8:
; SI: v_cndmask_b32
; SI-NOT: cndmask

; GFX9: v_cndmask_b32
; GFX9-NOT: cndmask

; This is worse when i16 is legal and packed is not because
; SelectionDAGBuilder for some reason changes the select type.
; VI: s_cselect_b64
; VI: v_cndmask_b32
define amdgpu_kernel void @v_select_v2i8(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
  %lhs = load <2 x i8>, ptr addrspace(1) %a.ptr, align 2
  %rhs = load <2 x i8>, ptr addrspace(1) %b.ptr, align 2
  %is.zero = icmp eq i32 %c, 0
  %picked = select i1 %is.zero, <2 x i8> %lhs, <2 x i8> %rhs
  store <2 x i8> %picked, ptr addrspace(1) %out, align 2
  ret void
}
27
; Scalar-condition select between two loaded <4 x i8> vectors.
; Every target must produce exactly one v_cndmask_b32_e32.
; GCN-LABEL: {{^}}v_select_v4i8:
; GCN: v_cndmask_b32_e32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v4i8(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
  %lhs = load <4 x i8>, ptr addrspace(1) %a.ptr
  %rhs = load <4 x i8>, ptr addrspace(1) %b.ptr
  %is.zero = icmp eq i32 %c, 0
  %picked = select i1 %is.zero, <4 x i8> %lhs, <4 x i8> %rhs
  store <4 x i8> %picked, ptr addrspace(1) %out, align 4
  ret void
}
39
; Scalar-condition select between two loaded <8 x i8> vectors.
; Every target must produce exactly two v_cndmask_b32_e32.
; GCN-LABEL: {{^}}v_select_v8i8:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v8i8(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
  %lhs = load <8 x i8>, ptr addrspace(1) %a.ptr
  %rhs = load <8 x i8>, ptr addrspace(1) %b.ptr
  %is.zero = icmp eq i32 %c, 0
  %picked = select i1 %is.zero, <8 x i8> %lhs, <8 x i8> %rhs
  store <8 x i8> %picked, ptr addrspace(1) %out, align 4
  ret void
}
52
; Scalar-condition select between two loaded <16 x i8> vectors.
; Every target must produce exactly four v_cndmask_b32_e32.
; GCN-LABEL: {{^}}v_select_v16i8:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v16i8(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
  %lhs = load <16 x i8>, ptr addrspace(1) %a.ptr
  %rhs = load <16 x i8>, ptr addrspace(1) %b.ptr
  %is.zero = icmp eq i32 %c, 0
  %picked = select i1 %is.zero, <16 x i8> %lhs, <16 x i8> %rhs
  store <16 x i8> %picked, ptr addrspace(1) %out, align 4
  ret void
}
67
; Select between two <4 x i8> vectors passed directly as kernel arguments,
; with an i8 condition operand compared against zero.
; tonga/gfx900 must use exactly one s_cselect_b32; tahiti must use
; s_cselect_b32 and no cndmask.
; GCN-LABEL: {{^}}select_v4i8:
; GFX89: s_cselect_b32
; GFX89-NOT: s_cselect_b32

; SI: s_cselect_b32
; SI-NOT: cndmask
define amdgpu_kernel void @select_v4i8(ptr addrspace(1) %out, <4 x i8> %a, <4 x i8> %b, i8 %c) #0 {
  %is.zero = icmp eq i8 %c, 0
  %picked = select i1 %is.zero, <4 x i8> %a, <4 x i8> %b
  store <4 x i8> %picked, ptr addrspace(1) %out, align 4
  ret void
}
80
; Select between two <2 x i16> vectors passed directly as kernel arguments,
; with the condition (%c == 0).
; tonga/gfx900 must load the arguments with s_load_dwordx4 and use exactly one
; s_cselect_b32. tahiti must use s_cselect_b32 and no cndmask of any encoding.
; The negative tahiti check previously read "v_cndmask_b32e", which matches no
; real mnemonic (the encodings are spelled _e32/_e64) and so could never fire.
; GCN-LABEL: {{^}}select_v2i16:
; GFX89: s_load_dwordx4
; GFX89: s_cselect_b32
; GFX89-NOT: s_cselect_b32

; SI: s_cselect_b32
; SI-NOT: cndmask
define amdgpu_kernel void @select_v2i16(ptr addrspace(1) %out, <2 x i16> %a, <2 x i16> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <2 x i16> %a, <2 x i16> %b
  store <2 x i16> %select, ptr addrspace(1) %out, align 4
  ret void
}
94
; Scalar-condition select between two loaded <2 x i16> vectors.
; Each operand must be fetched with a single buffer_load_dword and the select
; must lower to exactly one v_cndmask_b32.
; GCN-LABEL: {{^}}v_select_v2i16:
; GCN: buffer_load_dword v
; GCN: buffer_load_dword v
; GCN: v_cndmask_b32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
  %lhs = load <2 x i16>, ptr addrspace(1) %a.ptr
  %rhs = load <2 x i16>, ptr addrspace(1) %b.ptr
  %is.zero = icmp eq i32 %c, 0
  %picked = select i1 %is.zero, <2 x i16> %lhs, <2 x i16> %rhs
  store <2 x i16> %picked, ptr addrspace(1) %out, align 4
  ret void
}
108
; Scalar-condition select between two loaded <3 x i16> vectors.
; tahiti must emit exactly two cndmask instructions; tonga is only checked for
; an s_cselect_b64; gfx900 is checked for two cndmask instructions.
; GCN-LABEL: {{^}}v_select_v3i16:
; SI: v_cndmask_b32_e32
; SI: cndmask
; SI-NOT: cndmask

; VI: s_cselect_b64
; GFX9: cndmask
; GFX9: cndmask
define amdgpu_kernel void @v_select_v3i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
  %lhs = load <3 x i16>, ptr addrspace(1) %a.ptr
  %rhs = load <3 x i16>, ptr addrspace(1) %b.ptr
  %is.zero = icmp eq i32 %c, 0
  %picked = select i1 %is.zero, <3 x i16> %lhs, <3 x i16> %rhs
  store <3 x i16> %picked, ptr addrspace(1) %out, align 4
  ret void
}
125
; Scalar-condition select between two loaded <4 x i16> vectors.
; Every target must produce exactly two v_cndmask_b32_e32.
; GCN-LABEL: {{^}}v_select_v4i16:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
  %lhs = load <4 x i16>, ptr addrspace(1) %a.ptr
  %rhs = load <4 x i16>, ptr addrspace(1) %b.ptr
  %is.zero = icmp eq i32 %c, 0
  %picked = select i1 %is.zero, <4 x i16> %lhs, <4 x i16> %rhs
  store <4 x i16> %picked, ptr addrspace(1) %out, align 4
  ret void
}
138
; Scalar-condition select between two loaded <8 x i16> vectors.
; Every target must produce exactly four v_cndmask_b32_e32.
; GCN-LABEL: {{^}}v_select_v8i16:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v8i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
  %lhs = load <8 x i16>, ptr addrspace(1) %a.ptr
  %rhs = load <8 x i16>, ptr addrspace(1) %b.ptr
  %is.zero = icmp eq i32 %c, 0
  %picked = select i1 %is.zero, <8 x i16> %lhs, <8 x i16> %rhs
  store <8 x i16> %picked, ptr addrspace(1) %out, align 4
  ret void
}
153
; Scalar-condition select between two loaded <16 x i16> vectors.
; Every target must produce exactly eight v_cndmask_b32_e32.
; GCN-LABEL: {{^}}v_select_v16i16:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v16i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
  %lhs = load <16 x i16>, ptr addrspace(1) %a.ptr
  %rhs = load <16 x i16>, ptr addrspace(1) %b.ptr
  %is.zero = icmp eq i32 %c, 0
  %picked = select i1 %is.zero, <16 x i16> %lhs, <16 x i16> %rhs
  store <16 x i16> %picked, ptr addrspace(1) %out, align 4
  ret void
}
172
; Scalar-condition select between two loaded <32 x i16> vectors.
; Every target must produce exactly sixteen v_cndmask_b32_e32.
; GCN-LABEL: {{^}}v_select_v32i16:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v32i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
  %lhs = load <32 x i16>, ptr addrspace(1) %a.ptr
  %rhs = load <32 x i16>, ptr addrspace(1) %b.ptr
  %is.zero = icmp eq i32 %c, 0
  %picked = select i1 %is.zero, <32 x i16> %lhs, <32 x i16> %rhs
  store <32 x i16> %picked, ptr addrspace(1) %out, align 4
  ret void
}
199
200; FIXME: Expansion with bitwise operations may be better if doing a
201; vector select with SGPR inputs.
202
; Select between two <2 x i32> kernel arguments on the condition (%c == 0).
; Expects two s_cselect_b32 followed by a single buffer_store_dwordx2.
; GCN-LABEL: {{^}}s_select_v2i32:
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: buffer_store_dwordx2
define amdgpu_kernel void @s_select_v2i32(ptr addrspace(1) %out, <2 x i32> %a, <2 x i32> %b, i32 %c) #0 {
  %is.zero = icmp eq i32 %c, 0
  %picked = select i1 %is.zero, <2 x i32> %a, <2 x i32> %b
  store <2 x i32> %picked, ptr addrspace(1) %out, align 8
  ret void
}
213
; Select between two <4 x i32> kernel arguments on the condition (%c == 0).
; Expects four s_cselect_b32 followed by a single buffer_store_dwordx4.
; GCN-LABEL: {{^}}s_select_v4i32:
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @s_select_v4i32(ptr addrspace(1) %out, <4 x i32> %a, <4 x i32> %b, i32 %c) #0 {
  %is.zero = icmp eq i32 %c, 0
  %picked = select i1 %is.zero, <4 x i32> %a, <4 x i32> %b
  store <4 x i32> %picked, ptr addrspace(1) %out, align 16
  ret void
}
226
; Select between a loaded <4 x i32> and the zero vector on the unsigned
; condition (%cond < 32).
; Expects a dwordx4 load, an s_cmp_lt_u32 against 32 feeding vcc through
; s_cselect_b64, four v_cndmask_b32_e32 with a literal 0 operand, and a
; dwordx4 store.
; GCN-LABEL: {{^}}v_select_v4i32:
; GCN: buffer_load_dwordx4
; GCN: s_cmp_lt_u32 s{{[0-9]+}}, 32
; GCN: s_cselect_b64 vcc, -1, 0
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @v_select_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %cond) #0 {
entry:
  %below.32 = icmp ult i32 %cond, 32
  %loaded = load <4 x i32>, ptr addrspace(1) %in
  %masked = select i1 %below.32, <4 x i32> %loaded, <4 x i32> zeroinitializer
  store <4 x i32> %masked, ptr addrspace(1) %out, align 16
  ret void
}
244
; Select between two <8 x i32> kernel arguments on the condition (%c == 0).
; Expects at least eight s_cselect_b32.
; GCN-LABEL: {{^}}select_v8i32:
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
define amdgpu_kernel void @select_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x i32> %b, i32 %c) #0 {
  %is.zero = icmp eq i32 %c, 0
  %picked = select i1 %is.zero, <8 x i32> %a, <8 x i32> %b
  store <8 x i32> %picked, ptr addrspace(1) %out, align 16
  ret void
}
260
; Select between two <2 x float> kernel arguments on the condition (%c == 0).
; Expects an s_cmp_eq_u32 against 0 and two s_cselect_b32 (matched in any
; order), then a buffer_store_dwordx2.
; GCN-LABEL: {{^}}s_select_v2f32:
; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 0{{$}}
; GCN-DAG: s_cselect_b32
; GCN-DAG: s_cselect_b32
; GCN: buffer_store_dwordx2
define amdgpu_kernel void @s_select_v2f32(ptr addrspace(1) %out, <2 x float> %a, <2 x float> %b, i32 %c) #0 {
  %is.zero = icmp eq i32 %c, 0
  %picked = select i1 %is.zero, <2 x float> %a, <2 x float> %b
  store <2 x float> %picked, ptr addrspace(1) %out, align 16
  ret void
}
272
; Select between two <3 x float> kernel arguments on the condition (%c == 0).
; Expects an s_cmp_eq_u32 against 0, three s_cselect_b32, and a store whose
; mnemonic starts with buffer_store_dwordx (the width suffix is not pinned).
; GCN-LABEL: {{^}}s_select_v3f32:
; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0{{$}}

; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32

; GCN: buffer_store_dwordx
define amdgpu_kernel void @s_select_v3f32(ptr addrspace(1) %out, <3 x float> %a, <3 x float> %b, i32 %c) #0 {
  %is.zero = icmp eq i32 %c, 0
  %picked = select i1 %is.zero, <3 x float> %a, <3 x float> %b
  store <3 x float> %picked, ptr addrspace(1) %out, align 16
  ret void
}
287
; Select between two <4 x float> kernel arguments on the condition (%c == 0).
; Expects an s_load_dwordx8, an s_cmp_eq_u32 against 0, four s_cselect_b32,
; and a buffer_store_dwordx4.
; GCN-LABEL: {{^}}s_select_v4f32:
; GCN: s_load_dwordx8
; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0{{$}}

; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32

; GCN: buffer_store_dwordx4
define amdgpu_kernel void @s_select_v4f32(ptr addrspace(1) %out, <4 x float> %a, <4 x float> %b, i32 %c) #0 {
  %is.zero = icmp eq i32 %c, 0
  %picked = select i1 %is.zero, <4 x float> %a, <4 x float> %b
  store <4 x float> %picked, ptr addrspace(1) %out, align 16
  ret void
}
304
; Select between a loaded <4 x float> and the zero vector on the unsigned
; condition (%cond < 32).
; Expects a dwordx4 load, an s_cmp_lt_u32 against 32 feeding vcc through
; s_cselect_b64, four v_cndmask_b32_e32 with a literal 0 operand, and a
; dwordx4 store.
; GCN-LABEL: {{^}}v_select_v4f32:
; GCN: buffer_load_dwordx4
; GCN: s_cmp_lt_u32 s{{[0-9]+}}, 32
; GCN: s_cselect_b64 vcc, -1, 0
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @v_select_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %cond) #0 {
entry:
  %below.32 = icmp ult i32 %cond, 32
  %loaded = load <4 x float>, ptr addrspace(1) %in
  %masked = select i1 %below.32, <4 x float> %loaded, <4 x float> zeroinitializer
  store <4 x float> %masked, ptr addrspace(1) %out, align 16
  ret void
}
322
; Select between two <5 x float> kernel arguments on the condition (%c == 0).
; Expects an s_cmp_eq_u32 against 0, five s_cselect_b32, and a store whose
; mnemonic starts with buffer_store_dwordx (the width suffix is not pinned).
; GCN-LABEL: {{^}}s_select_v5f32:
; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0{{$}}

; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32

; GCN: buffer_store_dwordx
define amdgpu_kernel void @s_select_v5f32(ptr addrspace(1) %out, <5 x float> %a, <5 x float> %b, i32 %c) #0 {
  %is.zero = icmp eq i32 %c, 0
  %picked = select i1 %is.zero, <5 x float> %a, <5 x float> %b
  store <5 x float> %picked, ptr addrspace(1) %out, align 16
  ret void
}
339
; Select between two <8 x float> kernel arguments on the condition (%c == 0).
; Expects at least eight v_cndmask_b32_e32.
; GCN-LABEL: {{^}}select_v8f32:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
define amdgpu_kernel void @select_v8f32(ptr addrspace(1) %out, <8 x float> %a, <8 x float> %b, i32 %c) #0 {
  %is.zero = icmp eq i32 %c, 0
  %picked = select i1 %is.zero, <8 x float> %a, <8 x float> %b
  store <8 x float> %picked, ptr addrspace(1) %out, align 16
  ret void
}
355
; Select between two <2 x double> kernel arguments on the condition (%c == 0).
; Expects at least four s_cselect_b32.
; GCN-LABEL: {{^}}select_v2f64:
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
define amdgpu_kernel void @select_v2f64(ptr addrspace(1) %out, <2 x double> %a, <2 x double> %b, i32 %c) #0 {
  %is.zero = icmp eq i32 %c, 0
  %picked = select i1 %is.zero, <2 x double> %a, <2 x double> %b
  store <2 x double> %picked, ptr addrspace(1) %out, align 16
  ret void
}
367
; Select between two <4 x double> kernel arguments on the condition (%c == 0).
; Expects at least eight s_cselect_b32.
; GCN-LABEL: {{^}}select_v4f64:
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
define amdgpu_kernel void @select_v4f64(ptr addrspace(1) %out, <4 x double> %a, <4 x double> %b, i32 %c) #0 {
  %is.zero = icmp eq i32 %c, 0
  %picked = select i1 %is.zero, <4 x double> %a, <4 x double> %b
  store <4 x double> %picked, ptr addrspace(1) %out, align 16
  ret void
}
383
; Select between two <8 x double> kernel arguments on the condition (%c == 0).
; Expects at least sixteen s_cselect_b32.
; GCN-LABEL: {{^}}select_v8f64:
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
define amdgpu_kernel void @select_v8f64(ptr addrspace(1) %out, <8 x double> %a, <8 x double> %b, i32 %c) #0 {
  %is.zero = icmp eq i32 %c, 0
  %picked = select i1 %is.zero, <8 x double> %a, <8 x double> %b
  store <8 x double> %picked, ptr addrspace(1) %out, align 16
  ret void
}
407
; Scalar-condition select between two loaded <2 x half> vectors.
; Every target must produce exactly one v_cndmask_b32.
; GCN-LABEL: {{^}}v_select_v2f16:
; GCN: v_cndmask_b32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
  %lhs = load <2 x half>, ptr addrspace(1) %a.ptr
  %rhs = load <2 x half>, ptr addrspace(1) %b.ptr
  %is.zero = icmp eq i32 %c, 0
  %picked = select i1 %is.zero, <2 x half> %lhs, <2 x half> %rhs
  store <2 x half> %picked, ptr addrspace(1) %out, align 4
  ret void
}
419
; Scalar-condition select between two loaded <3 x half> vectors.
; Every target must produce exactly two v_cndmask_b32_e32.
; GCN-LABEL: {{^}}v_select_v3f16:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v3f16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
  %lhs = load <3 x half>, ptr addrspace(1) %a.ptr
  %rhs = load <3 x half>, ptr addrspace(1) %b.ptr
  %is.zero = icmp eq i32 %c, 0
  %picked = select i1 %is.zero, <3 x half> %lhs, <3 x half> %rhs
  store <3 x half> %picked, ptr addrspace(1) %out, align 4
  ret void
}
432
; Scalar-condition select between two loaded <4 x half> vectors.
; Every target must produce exactly two v_cndmask_b32_e32.
; GCN-LABEL: {{^}}v_select_v4f16:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
  %lhs = load <4 x half>, ptr addrspace(1) %a.ptr
  %rhs = load <4 x half>, ptr addrspace(1) %b.ptr
  %is.zero = icmp eq i32 %c, 0
  %picked = select i1 %is.zero, <4 x half> %lhs, <4 x half> %rhs
  store <4 x half> %picked, ptr addrspace(1) %out, align 4
  ret void
}
445
; Attribute group #0 is attached to every kernel definition in this file;
; group #1 is attached to the intrinsic declaration below.
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }

; Function Attrs: nounwind readnone
; No kernel visible in this file calls this intrinsic.
declare i32 @llvm.amdgcn.workitem.id.x() #1
451