xref: /llvm-project/llvm/test/CodeGen/AMDGPU/uniform-select.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
2; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A %s
3; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940 %s
4; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX1030 %s
5; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX1100 %s
6
; Codegen test: dynamically indexed extractelement/insertelement on <4 x i32>
; values carried around a self-looping block, with the lane index (%q) and the
; OR operand (%p) both taken from the kernel arguments.
;
; In the checks below the dynamic lane accesses show up as chains of
; s_cmp_eq_u32 against the constants 0..3 followed by s_cselect_b32, i.e.
; scalar compare/select sequences. The gfx90a and gfx940 checks keep the
; condition in 64-bit register pairs (s_cselect_b64, s_and_b64 with exec),
; while the gfx1030 and gfx1100 checks use 32-bit registers (s_cselect_b32,
; s_and_b32 with exec_lo).
;
; The check lines are autogenerated (see the NOTE line at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of editing them
; by hand.
7define amdgpu_kernel void @test_insert_extract(i32 %p, i32 %q) {
8; GFX90A-LABEL: test_insert_extract:
9; GFX90A:       ; %bb.0: ; %entry
10; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
11; GFX90A-NEXT:    s_mov_b32 s2, 0
12; GFX90A-NEXT:    s_and_b64 vcc, exec, -1
13; GFX90A-NEXT:    s_mov_b32 s3, 0
14; GFX90A-NEXT:    s_mov_b32 s4, 0
15; GFX90A-NEXT:    s_mov_b32 s5, 0
16; GFX90A-NEXT:    s_mov_b32 s6, 0
17; GFX90A-NEXT:  .LBB0_1: ; %for.body
18; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
19; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
20; GFX90A-NEXT:    s_cmp_eq_u32 s1, 1
21; GFX90A-NEXT:    s_cselect_b64 s[8:9], -1, 0
22; GFX90A-NEXT:    s_and_b64 s[8:9], s[8:9], exec
23; GFX90A-NEXT:    s_cselect_b32 s7, s4, s3
24; GFX90A-NEXT:    s_cmp_eq_u32 s1, 2
25; GFX90A-NEXT:    s_cselect_b64 s[8:9], -1, 0
26; GFX90A-NEXT:    s_and_b64 s[8:9], s[8:9], exec
27; GFX90A-NEXT:    s_cselect_b32 s7, s5, s7
28; GFX90A-NEXT:    s_cmp_eq_u32 s1, 3
29; GFX90A-NEXT:    s_cselect_b64 s[8:9], -1, 0
30; GFX90A-NEXT:    s_and_b64 s[8:9], s[8:9], exec
31; GFX90A-NEXT:    s_cselect_b32 s7, s6, s7
32; GFX90A-NEXT:    s_or_b32 s7, s7, s0
33; GFX90A-NEXT:    s_cmp_eq_u32 s1, 1
34; GFX90A-NEXT:    s_cselect_b64 s[8:9], -1, 0
35; GFX90A-NEXT:    s_and_b64 s[10:11], s[8:9], exec
36; GFX90A-NEXT:    s_cselect_b32 s4, s7, s4
37; GFX90A-NEXT:    s_cmp_eq_u32 s1, 3
38; GFX90A-NEXT:    s_cselect_b64 s[10:11], -1, 0
39; GFX90A-NEXT:    s_and_b64 s[12:13], s[10:11], exec
40; GFX90A-NEXT:    s_cselect_b32 s6, s7, s6
41; GFX90A-NEXT:    s_cmp_eq_u32 s1, 2
42; GFX90A-NEXT:    s_cselect_b64 s[12:13], -1, 0
43; GFX90A-NEXT:    s_and_b64 s[14:15], s[12:13], exec
44; GFX90A-NEXT:    s_cselect_b32 s5, s7, s5
45; GFX90A-NEXT:    s_cmp_eq_u32 s1, 0
46; GFX90A-NEXT:    s_cselect_b32 s3, s7, s3
47; GFX90A-NEXT:    s_or_b64 s[8:9], s[12:13], s[8:9]
48; GFX90A-NEXT:    s_or_b64 s[8:9], s[10:11], s[8:9]
49; GFX90A-NEXT:    s_and_b64 s[8:9], s[8:9], exec
50; GFX90A-NEXT:    s_cselect_b32 s2, 0, s2
51; GFX90A-NEXT:    s_mov_b64 vcc, vcc
52; GFX90A-NEXT:    s_cbranch_vccnz .LBB0_1
53; GFX90A-NEXT:  ; %bb.2: ; %DummyReturnBlock
54; GFX90A-NEXT:    s_endpgm
55;
56; GFX940-LABEL: test_insert_extract:
57; GFX940:       ; %bb.0: ; %entry
58; GFX940-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
59; GFX940-NEXT:    s_mov_b32 s2, 0
60; GFX940-NEXT:    s_and_b64 vcc, exec, -1
61; GFX940-NEXT:    s_mov_b32 s3, 0
62; GFX940-NEXT:    s_mov_b32 s4, 0
63; GFX940-NEXT:    s_mov_b32 s5, 0
64; GFX940-NEXT:    s_mov_b32 s6, 0
65; GFX940-NEXT:  .LBB0_1: ; %for.body
66; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
67; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
68; GFX940-NEXT:    s_cmp_eq_u32 s1, 1
69; GFX940-NEXT:    s_cselect_b64 s[8:9], -1, 0
70; GFX940-NEXT:    s_and_b64 s[8:9], s[8:9], exec
71; GFX940-NEXT:    s_cselect_b32 s7, s4, s3
72; GFX940-NEXT:    s_cmp_eq_u32 s1, 2
73; GFX940-NEXT:    s_cselect_b64 s[8:9], -1, 0
74; GFX940-NEXT:    s_and_b64 s[8:9], s[8:9], exec
75; GFX940-NEXT:    s_cselect_b32 s7, s5, s7
76; GFX940-NEXT:    s_cmp_eq_u32 s1, 3
77; GFX940-NEXT:    s_cselect_b64 s[8:9], -1, 0
78; GFX940-NEXT:    s_and_b64 s[8:9], s[8:9], exec
79; GFX940-NEXT:    s_cselect_b32 s7, s6, s7
80; GFX940-NEXT:    s_or_b32 s7, s7, s0
81; GFX940-NEXT:    s_cmp_eq_u32 s1, 1
82; GFX940-NEXT:    s_cselect_b64 s[8:9], -1, 0
83; GFX940-NEXT:    s_and_b64 s[10:11], s[8:9], exec
84; GFX940-NEXT:    s_cselect_b32 s4, s7, s4
85; GFX940-NEXT:    s_cmp_eq_u32 s1, 3
86; GFX940-NEXT:    s_cselect_b64 s[10:11], -1, 0
87; GFX940-NEXT:    s_and_b64 s[12:13], s[10:11], exec
88; GFX940-NEXT:    s_cselect_b32 s6, s7, s6
89; GFX940-NEXT:    s_cmp_eq_u32 s1, 2
90; GFX940-NEXT:    s_cselect_b64 s[12:13], -1, 0
91; GFX940-NEXT:    s_and_b64 s[14:15], s[12:13], exec
92; GFX940-NEXT:    s_cselect_b32 s5, s7, s5
93; GFX940-NEXT:    s_cmp_eq_u32 s1, 0
94; GFX940-NEXT:    s_cselect_b32 s3, s7, s3
95; GFX940-NEXT:    s_or_b64 s[8:9], s[12:13], s[8:9]
96; GFX940-NEXT:    s_or_b64 s[8:9], s[10:11], s[8:9]
97; GFX940-NEXT:    s_and_b64 s[8:9], s[8:9], exec
98; GFX940-NEXT:    s_cselect_b32 s2, 0, s2
99; GFX940-NEXT:    s_mov_b64 vcc, vcc
100; GFX940-NEXT:    s_cbranch_vccnz .LBB0_1
101; GFX940-NEXT:  ; %bb.2: ; %DummyReturnBlock
102; GFX940-NEXT:    s_endpgm
103;
104; GFX1030-LABEL: test_insert_extract:
105; GFX1030:       ; %bb.0: ; %entry
106; GFX1030-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
107; GFX1030-NEXT:    s_mov_b32 s2, 0
108; GFX1030-NEXT:    s_mov_b32 s3, 0
109; GFX1030-NEXT:    s_mov_b32 s4, 0
110; GFX1030-NEXT:    s_mov_b32 s5, 0
111; GFX1030-NEXT:    s_mov_b32 s6, 0
112; GFX1030-NEXT:    s_mov_b32 vcc_lo, exec_lo
113; GFX1030-NEXT:    .p2align 6
114; GFX1030-NEXT:  .LBB0_1: ; %for.body
115; GFX1030-NEXT:    ; =>This Inner Loop Header: Depth=1
116; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
117; GFX1030-NEXT:    s_cmp_eq_u32 s1, 1
118; GFX1030-NEXT:    s_cselect_b32 s7, -1, 0
119; GFX1030-NEXT:    s_and_b32 s7, s7, exec_lo
120; GFX1030-NEXT:    s_cselect_b32 s7, s4, s3
121; GFX1030-NEXT:    s_cmp_eq_u32 s1, 2
122; GFX1030-NEXT:    s_cselect_b32 s8, -1, 0
123; GFX1030-NEXT:    s_and_b32 s8, s8, exec_lo
124; GFX1030-NEXT:    s_cselect_b32 s7, s5, s7
125; GFX1030-NEXT:    s_cmp_eq_u32 s1, 3
126; GFX1030-NEXT:    s_cselect_b32 s8, -1, 0
127; GFX1030-NEXT:    s_and_b32 s8, s8, exec_lo
128; GFX1030-NEXT:    s_cselect_b32 s7, s6, s7
129; GFX1030-NEXT:    s_or_b32 s7, s7, s0
130; GFX1030-NEXT:    s_cmp_eq_u32 s1, 1
131; GFX1030-NEXT:    s_cselect_b32 s8, -1, 0
132; GFX1030-NEXT:    s_and_b32 s9, s8, exec_lo
133; GFX1030-NEXT:    s_cselect_b32 s4, s7, s4
134; GFX1030-NEXT:    s_cmp_eq_u32 s1, 3
135; GFX1030-NEXT:    s_cselect_b32 s9, -1, 0
136; GFX1030-NEXT:    s_and_b32 s10, s9, exec_lo
137; GFX1030-NEXT:    s_cselect_b32 s6, s7, s6
138; GFX1030-NEXT:    s_cmp_eq_u32 s1, 2
139; GFX1030-NEXT:    s_cselect_b32 s10, -1, 0
140; GFX1030-NEXT:    s_and_b32 s11, s10, exec_lo
141; GFX1030-NEXT:    s_cselect_b32 s5, s7, s5
142; GFX1030-NEXT:    s_cmp_eq_u32 s1, 0
143; GFX1030-NEXT:    s_cselect_b32 s3, s7, s3
144; GFX1030-NEXT:    s_or_b32 s7, s10, s8
145; GFX1030-NEXT:    s_or_b32 s7, s9, s7
146; GFX1030-NEXT:    s_and_b32 s7, s7, exec_lo
147; GFX1030-NEXT:    s_cselect_b32 s2, 0, s2
148; GFX1030-NEXT:    s_cbranch_vccnz .LBB0_1
149; GFX1030-NEXT:  ; %bb.2: ; %DummyReturnBlock
150; GFX1030-NEXT:    s_endpgm
151;
152; GFX1100-LABEL: test_insert_extract:
153; GFX1100:       ; %bb.0: ; %entry
154; GFX1100-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
155; GFX1100-NEXT:    s_mov_b32 s2, 0
156; GFX1100-NEXT:    s_mov_b32 s3, 0
157; GFX1100-NEXT:    s_mov_b32 s4, 0
158; GFX1100-NEXT:    s_mov_b32 s5, 0
159; GFX1100-NEXT:    s_mov_b32 s6, 0
160; GFX1100-NEXT:    s_mov_b32 vcc_lo, exec_lo
161; GFX1100-NEXT:    .p2align 6
162; GFX1100-NEXT:  .LBB0_1: ; %for.body
163; GFX1100-NEXT:    ; =>This Inner Loop Header: Depth=1
164; GFX1100-NEXT:    s_waitcnt lgkmcnt(0)
165; GFX1100-NEXT:    s_cmp_eq_u32 s1, 1
166; GFX1100-NEXT:    s_cselect_b32 s7, -1, 0
167; GFX1100-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
168; GFX1100-NEXT:    s_and_b32 s7, s7, exec_lo
169; GFX1100-NEXT:    s_cselect_b32 s7, s4, s3
170; GFX1100-NEXT:    s_cmp_eq_u32 s1, 2
171; GFX1100-NEXT:    s_cselect_b32 s8, -1, 0
172; GFX1100-NEXT:    s_and_b32 s8, s8, exec_lo
173; GFX1100-NEXT:    s_cselect_b32 s7, s5, s7
174; GFX1100-NEXT:    s_cmp_eq_u32 s1, 3
175; GFX1100-NEXT:    s_cselect_b32 s8, -1, 0
176; GFX1100-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
177; GFX1100-NEXT:    s_and_b32 s8, s8, exec_lo
178; GFX1100-NEXT:    s_cselect_b32 s7, s6, s7
179; GFX1100-NEXT:    s_or_b32 s7, s7, s0
180; GFX1100-NEXT:    s_cmp_eq_u32 s1, 1
181; GFX1100-NEXT:    s_cselect_b32 s8, -1, 0
182; GFX1100-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
183; GFX1100-NEXT:    s_and_b32 s9, s8, exec_lo
184; GFX1100-NEXT:    s_cselect_b32 s4, s7, s4
185; GFX1100-NEXT:    s_cmp_eq_u32 s1, 3
186; GFX1100-NEXT:    s_cselect_b32 s9, -1, 0
187; GFX1100-NEXT:    s_and_b32 s10, s9, exec_lo
188; GFX1100-NEXT:    s_cselect_b32 s6, s7, s6
189; GFX1100-NEXT:    s_cmp_eq_u32 s1, 2
190; GFX1100-NEXT:    s_cselect_b32 s10, -1, 0
191; GFX1100-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
192; GFX1100-NEXT:    s_and_b32 s11, s10, exec_lo
193; GFX1100-NEXT:    s_cselect_b32 s5, s7, s5
194; GFX1100-NEXT:    s_cmp_eq_u32 s1, 0
195; GFX1100-NEXT:    s_cselect_b32 s3, s7, s3
196; GFX1100-NEXT:    s_or_b32 s7, s10, s8
197; GFX1100-NEXT:    s_or_b32 s7, s9, s7
198; GFX1100-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
199; GFX1100-NEXT:    s_and_b32 s7, s7, exec_lo
200; GFX1100-NEXT:    s_cselect_b32 s2, 0, s2
201; GFX1100-NEXT:    s_cbranch_vccnz .LBB0_1
202; GFX1100-NEXT:  ; %bb.2: ; %DummyReturnBlock
203; GFX1100-NEXT:    s_endpgm
204entry:
  ; Writing 0 into lane 0 of an all-zero vector: %init is still all zeros.
205  %init = insertelement <4 x i32> zeroinitializer, i32 0, i64 0
206  br label %for.body
207
208for.body:                                     ; preds = %for.body, %entry
  ; Two loop-carried vectors, both zero on entry; each is updated below and
  ; fed back through its phi on the back-edge.
209  %x1 = phi <4 x i32> [ %init, %entry ], [ %i4, %for.body ]
210  %x2 = phi <4 x i32> [ zeroinitializer, %entry ], [ %i2, %for.body ]
  ; Lane index: the i32 kernel argument %q zero-extended to i64.
211  %idxprom = zext i32 %q to i64
  ; %x2 update: read lane %q, OR it with %p, write the result back to lane %q.
212  %e1 = extractelement <4 x i32> %x2, i64 %idxprom
213  %add = or i32 %e1, %p
214  %i2 = insertelement <4 x i32> %x2, i32 %add, i64 %idxprom
  ; %x1 update: read lane %q (dynamic index) and store it into lane 0
  ; (constant index).
215  %e3 = extractelement <4 x i32> %x1, i64 %idxprom
216  %i4 = insertelement <4 x i32> %x1, i32 %e3, i64 0
  ; Unconditional back-edge: the IR has no exit from this loop. The checks
  ; above nevertheless end with a %DummyReturnBlock holding s_endpgm
  ; (presumably added by the backend -- confirm before relying on it).
217  br label %for.body
218}
219
220