xref: /llvm-project/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll (revision b60c118f53e6f7e5328e54dc26b4d6787030c02b)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=OLD_RBS %s
3; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -new-reg-bank-select < %s | FileCheck -check-prefix=NEW_RBS %s
4
5; If the instruction is uniform and a matching SALU instruction is available, select the SALU instruction.
; fptoui has no SALU form, so the uniform result lands in a VGPR; NEW_RBS
; inserts v_readfirstlane_b32 so the uniform add selects as s_add_i32,
; while OLD_RBS keeps the add on the VALU (v_add_nc_u32_e32).
6define amdgpu_ps void @uniform_in_vgpr(float inreg %a, i32 inreg %b, ptr addrspace(1) %ptr) {
7; OLD_RBS-LABEL: uniform_in_vgpr:
8; OLD_RBS:       ; %bb.0:
9; OLD_RBS-NEXT:    v_cvt_u32_f32_e32 v2, s0
10; OLD_RBS-NEXT:    v_add_nc_u32_e32 v2, s1, v2
11; OLD_RBS-NEXT:    global_store_dword v[0:1], v2, off
12; OLD_RBS-NEXT:    s_endpgm
13;
14; NEW_RBS-LABEL: uniform_in_vgpr:
15; NEW_RBS:       ; %bb.0:
16; NEW_RBS-NEXT:    v_cvt_u32_f32_e32 v2, s0
17; NEW_RBS-NEXT:    v_readfirstlane_b32 s0, v2
18; NEW_RBS-NEXT:    s_add_i32 s0, s0, s1
19; NEW_RBS-NEXT:    v_mov_b32_e32 v2, s0
20; NEW_RBS-NEXT:    global_store_dword v[0:1], v2, off
21; NEW_RBS-NEXT:    s_endpgm
22  %a.i32 = fptoui float %a to i32
23  %res = add i32 %a.i32, %b
24  store i32 %res, ptr addrspace(1) %ptr
25  ret void
26}
27
28; copy sgpr to vgpr + readfirstlane vgpr to sgpr combine from rb-legalize
; Two back-to-back VALU-only uniform ops (fadd, fptoui) feed a uniform add;
; NEW_RBS readfirstlanes once and does the final add on the SALU.
29define amdgpu_ps void @back_to_back_uniform_in_vgpr(float inreg %a, float inreg %b, i32 inreg %c, ptr addrspace(1) %ptr) {
30; OLD_RBS-LABEL: back_to_back_uniform_in_vgpr:
31; OLD_RBS:       ; %bb.0:
32; OLD_RBS-NEXT:    v_add_f32_e64 v2, s0, s1
33; OLD_RBS-NEXT:    v_cvt_u32_f32_e32 v2, v2
34; OLD_RBS-NEXT:    v_add_nc_u32_e32 v2, s2, v2
35; OLD_RBS-NEXT:    global_store_dword v[0:1], v2, off
36; OLD_RBS-NEXT:    s_endpgm
37;
38; NEW_RBS-LABEL: back_to_back_uniform_in_vgpr:
39; NEW_RBS:       ; %bb.0:
40; NEW_RBS-NEXT:    v_add_f32_e64 v2, s0, s1
41; NEW_RBS-NEXT:    v_cvt_u32_f32_e32 v2, v2
42; NEW_RBS-NEXT:    v_readfirstlane_b32 s0, v2
43; NEW_RBS-NEXT:    s_add_i32 s0, s0, s2
44; NEW_RBS-NEXT:    v_mov_b32_e32 v2, s0
45; NEW_RBS-NEXT:    global_store_dword v[0:1], v2, off
46; NEW_RBS-NEXT:    s_endpgm
47  %add = fadd float %a, %b
48  %add.i32 = fptoui float %add to i32
49  %res = add i32 %add.i32, %c
50  store i32 %res, ptr addrspace(1) %ptr
51  ret void
52}
53
54; fast rules for vector instructions
; Uniform use of a vector buffer-load element; NEW_RBS readfirstlanes the
; extracted lane and does the +1 on the SALU.
; NOTE(review): the intrinsic returns <4 x i32>, so the correct overload
; mangling is .v4i32 (matching the declaration at the end of the file);
; the previous .v4f32 suffix relied on intrinsic auto-remangling.
55define amdgpu_cs void @buffer_load_uniform(<4 x i32> inreg %rsrc, i32 inreg %voffset, ptr addrspace(1) %ptr) {
56; OLD_RBS-LABEL: buffer_load_uniform:
57; OLD_RBS:       ; %bb.0: ; %.entry
58; OLD_RBS-NEXT:    v_mov_b32_e32 v2, s4
59; OLD_RBS-NEXT:    buffer_load_dwordx4 v[2:5], v2, s[0:3], 0 offen
60; OLD_RBS-NEXT:    s_waitcnt vmcnt(0)
61; OLD_RBS-NEXT:    v_add_nc_u32_e32 v2, 1, v3
62; OLD_RBS-NEXT:    global_store_dword v[0:1], v2, off
63; OLD_RBS-NEXT:    s_endpgm
64;
65; NEW_RBS-LABEL: buffer_load_uniform:
66; NEW_RBS:       ; %bb.0: ; %.entry
67; NEW_RBS-NEXT:    v_mov_b32_e32 v2, s4
68; NEW_RBS-NEXT:    buffer_load_dwordx4 v[2:5], v2, s[0:3], 0 offen
69; NEW_RBS-NEXT:    s_waitcnt vmcnt(0)
70; NEW_RBS-NEXT:    v_readfirstlane_b32 s0, v3
71; NEW_RBS-NEXT:    s_add_i32 s0, s0, 1
72; NEW_RBS-NEXT:    v_mov_b32_e32 v2, s0
73; NEW_RBS-NEXT:    global_store_dword v[0:1], v2, off
74; NEW_RBS-NEXT:    s_endpgm
75.entry:
76  %vec = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> %rsrc, i32 %voffset, i32 0, i32 0)
77  %el1 = extractelement <4 x i32> %vec, i64 1
78  %res = add i32 %el1, 1
79  store i32 %res, ptr addrspace(1) %ptr
80  ret void
81}
82
; Divergent voffset: the whole chain stays on the VALU and both selectors
; produce identical code.
; NOTE(review): intrinsic suffix fixed to .v4i32 to match the <4 x i32>
; return type and the declaration at the end of the file (was .v4f32).
83define amdgpu_cs void @buffer_load_divergent(<4 x i32> inreg %rsrc, i32 %voffset, ptr addrspace(1) %ptr) {
84; OLD_RBS-LABEL: buffer_load_divergent:
85; OLD_RBS:       ; %bb.0: ; %.entry
86; OLD_RBS-NEXT:    buffer_load_dwordx4 v[3:6], v0, s[0:3], 0 offen
87; OLD_RBS-NEXT:    s_waitcnt vmcnt(0)
88; OLD_RBS-NEXT:    v_add_nc_u32_e32 v0, 1, v4
89; OLD_RBS-NEXT:    global_store_dword v[1:2], v0, off
90; OLD_RBS-NEXT:    s_endpgm
91;
92; NEW_RBS-LABEL: buffer_load_divergent:
93; NEW_RBS:       ; %bb.0: ; %.entry
94; NEW_RBS-NEXT:    buffer_load_dwordx4 v[3:6], v0, s[0:3], 0 offen
95; NEW_RBS-NEXT:    s_waitcnt vmcnt(0)
96; NEW_RBS-NEXT:    v_add_nc_u32_e32 v0, 1, v4
97; NEW_RBS-NEXT:    global_store_dword v[1:2], v0, off
98; NEW_RBS-NEXT:    s_endpgm
99.entry:
100  %vec = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> %rsrc, i32 %voffset, i32 0, i32 0)
101  %el1 = extractelement <4 x i32> %vec, i64 1
102  %res = add i32 %el1, 1
103  store i32 %res, ptr addrspace(1) %ptr
104  ret void
105}
106
107; lowering in rb-legalize (sgpr S64 is legal, vgpr has to be split to S32)
; Divergent i64 AND: the S64 value is split into two S32 v_and_b32 halves;
; both selectors emit identical code.
108define amdgpu_ps void @vgpr_and_i64(i64 %a, i64 %b, ptr addrspace(1) %ptr) {
109; OLD_RBS-LABEL: vgpr_and_i64:
110; OLD_RBS:       ; %bb.0:
111; OLD_RBS-NEXT:    v_and_b32_e32 v0, v0, v2
112; OLD_RBS-NEXT:    v_and_b32_e32 v1, v1, v3
113; OLD_RBS-NEXT:    global_store_dwordx2 v[4:5], v[0:1], off
114; OLD_RBS-NEXT:    s_endpgm
115;
116; NEW_RBS-LABEL: vgpr_and_i64:
117; NEW_RBS:       ; %bb.0:
118; NEW_RBS-NEXT:    v_and_b32_e32 v0, v0, v2
119; NEW_RBS-NEXT:    v_and_b32_e32 v1, v1, v3
120; NEW_RBS-NEXT:    global_store_dwordx2 v[4:5], v[0:1], off
121; NEW_RBS-NEXT:    s_endpgm
122  %res = and i64 %a, %b
123  store i64 %res, ptr addrspace(1) %ptr
124  ret void
125}
126
127; It is up to user instruction to deal with potential truncated bits in reg.
128; Here G_ABS needs to sign extend S16 in reg to S32 and then do S32 G_ABS.
; Uniform i16 abs: sign-extend the S16 to S32 (s_sext_i32_i16), then
; s_abs_i32; identical output from both selectors.
129define amdgpu_ps void @abs_sgpr_i16(i16 inreg %arg, ptr addrspace(1) %ptr) {
130; OLD_RBS-LABEL: abs_sgpr_i16:
131; OLD_RBS:       ; %bb.0:
132; OLD_RBS-NEXT:    s_sext_i32_i16 s0, s0
133; OLD_RBS-NEXT:    s_abs_i32 s0, s0
134; OLD_RBS-NEXT:    v_mov_b32_e32 v2, s0
135; OLD_RBS-NEXT:    global_store_short v[0:1], v2, off
136; OLD_RBS-NEXT:    s_endpgm
137;
138; NEW_RBS-LABEL: abs_sgpr_i16:
139; NEW_RBS:       ; %bb.0:
140; NEW_RBS-NEXT:    s_sext_i32_i16 s0, s0
141; NEW_RBS-NEXT:    s_abs_i32 s0, s0
142; NEW_RBS-NEXT:    v_mov_b32_e32 v2, s0
143; NEW_RBS-NEXT:    global_store_short v[0:1], v2, off
144; NEW_RBS-NEXT:    s_endpgm
145  %res = call i16 @llvm.abs.i16(i16 %arg, i1 false)
146  store i16 %res, ptr addrspace(1) %ptr
147  ret void
148}
149
; Uniform i1 phi feeding a select. OLD_RBS sign-extends the phi with
; s_bfe_i32; NEW_RBS re-materializes the i1 as -1/0 via s_cmp_lg_u32 +
; s_cselect_b32 before the select lowering.
150define amdgpu_ps void @uniform_i1_phi(ptr addrspace(1) %out, i32 inreg %tid, i32 inreg %cond) {
151; OLD_RBS-LABEL: uniform_i1_phi:
152; OLD_RBS:       ; %bb.0: ; %A
153; OLD_RBS-NEXT:    s_cmp_ge_u32 s0, 6
154; OLD_RBS-NEXT:    s_cselect_b32 s2, 1, 0
155; OLD_RBS-NEXT:    s_cmp_lg_u32 s1, 0
156; OLD_RBS-NEXT:    s_cbranch_scc1 .LBB6_2
157; OLD_RBS-NEXT:  ; %bb.1: ; %B
158; OLD_RBS-NEXT:    s_cmp_lt_u32 s0, 1
159; OLD_RBS-NEXT:    s_cselect_b32 s2, 1, 0
160; OLD_RBS-NEXT:  .LBB6_2: ; %exit
161; OLD_RBS-NEXT:    s_bfe_i32 s0, s2, 0x10000
162; OLD_RBS-NEXT:    s_add_i32 s0, s0, 2
163; OLD_RBS-NEXT:    v_mov_b32_e32 v2, s0
164; OLD_RBS-NEXT:    global_store_dword v[0:1], v2, off
165; OLD_RBS-NEXT:    s_endpgm
166;
167; NEW_RBS-LABEL: uniform_i1_phi:
168; NEW_RBS:       ; %bb.0: ; %A
169; NEW_RBS-NEXT:    s_cmp_ge_u32 s0, 6
170; NEW_RBS-NEXT:    s_cselect_b32 s2, 1, 0
171; NEW_RBS-NEXT:    s_cmp_lg_u32 s1, 0
172; NEW_RBS-NEXT:    s_cbranch_scc1 .LBB6_2
173; NEW_RBS-NEXT:  ; %bb.1: ; %B
174; NEW_RBS-NEXT:    s_cmp_lt_u32 s0, 1
175; NEW_RBS-NEXT:    s_cselect_b32 s2, 1, 0
176; NEW_RBS-NEXT:  .LBB6_2: ; %exit
177; NEW_RBS-NEXT:    s_cmp_lg_u32 s2, 0
178; NEW_RBS-NEXT:    s_cselect_b32 s0, -1, 0
179; NEW_RBS-NEXT:    s_add_i32 s0, s0, 2
180; NEW_RBS-NEXT:    v_mov_b32_e32 v2, s0
181; NEW_RBS-NEXT:    global_store_dword v[0:1], v2, off
182; NEW_RBS-NEXT:    s_endpgm
183A:
184  %val_A = icmp uge i32 %tid, 6
185  %cmp = icmp eq i32 %cond, 0
186  br i1 %cmp, label %B, label %exit
187
188B:
189  %val_B = icmp ult i32 %tid, 1
190  br label %exit
191
192exit:
193  %phi = phi i1 [ %val_A, %A ], [ %val_B, %B ]
194  %sel = select i1 %phi, i32 1, i32 2
195  store i32 %sel, ptr addrspace(1) %out
196  ret void
197}
198
199; this is kind of i1 readfirstlane
200; uniform i1 result on instruction that is only available on VALU
; Uniform i1 produced by a VALU-only fcmp (vcc-style result in s0).
; OLD_RBS keeps the select on the VALU (v_cndmask); NEW_RBS copies the
; condition to scc (s_cmp_lg_u32) and selects on the SALU (s_cselect_b32).
201define amdgpu_ps void @vcc_to_scc(float inreg %a, i32 inreg %b, i32 inreg %c, ptr addrspace(1) %ptr) {
202; OLD_RBS-LABEL: vcc_to_scc:
203; OLD_RBS:       ; %bb.0:
204; OLD_RBS-NEXT:    v_mov_b32_e32 v2, s2
205; OLD_RBS-NEXT:    v_cmp_eq_f32_e64 s0, s0, 0
206; OLD_RBS-NEXT:    v_cndmask_b32_e64 v2, v2, s1, s0
207; OLD_RBS-NEXT:    global_store_dword v[0:1], v2, off
208; OLD_RBS-NEXT:    s_endpgm
209;
210; NEW_RBS-LABEL: vcc_to_scc:
211; NEW_RBS:       ; %bb.0:
212; NEW_RBS-NEXT:    v_cmp_eq_f32_e64 s0, s0, 0
213; NEW_RBS-NEXT:    s_cmp_lg_u32 s0, 0
214; NEW_RBS-NEXT:    s_cselect_b32 s0, 1, 0
215; NEW_RBS-NEXT:    s_and_b32 s0, s0, 1
216; NEW_RBS-NEXT:    s_cmp_lg_u32 s0, 0
217; NEW_RBS-NEXT:    s_cselect_b32 s0, s1, s2
218; NEW_RBS-NEXT:    v_mov_b32_e32 v2, s0
219; NEW_RBS-NEXT:    global_store_dword v[0:1], v2, off
220; NEW_RBS-NEXT:    s_endpgm
221  %vcc_to_scc = fcmp oeq float %a, 0.0
222  %select = select i1 %vcc_to_scc, i32 %b, i32 %c
223  store i32 %select, ptr addrspace(1) %ptr
224  ret void
225}
226
227; combiner in rb-legalize recognizes sgpr S1 to vcc copy
; Uniform i1 condition with divergent select operands. NEW_RBS combines
; the sgpr-S1-to-vcc copy into a single s_cselect_b32 vcc_lo, exec_lo, 0,
; dropping OLD_RBS's s_and + v_cmp_ne round-trip.
228define amdgpu_ps void @scc_to_vcc(i32 inreg %a, i32 %b, i32 %c, ptr addrspace(1) %ptr) {
229; OLD_RBS-LABEL: scc_to_vcc:
230; OLD_RBS:       ; %bb.0:
231; OLD_RBS-NEXT:    s_cmp_eq_u32 s0, 0
232; OLD_RBS-NEXT:    s_cselect_b32 s0, 1, 0
233; OLD_RBS-NEXT:    s_and_b32 s0, 1, s0
234; OLD_RBS-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
235; OLD_RBS-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
236; OLD_RBS-NEXT:    global_store_dword v[2:3], v0, off
237; OLD_RBS-NEXT:    s_endpgm
238;
239; NEW_RBS-LABEL: scc_to_vcc:
240; NEW_RBS:       ; %bb.0:
241; NEW_RBS-NEXT:    s_cmp_eq_u32 s0, 0
242; NEW_RBS-NEXT:    s_cselect_b32 vcc_lo, exec_lo, 0
243; NEW_RBS-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
244; NEW_RBS-NEXT:    global_store_dword v[2:3], v0, off
245; NEW_RBS-NEXT:    s_endpgm
246  %scc_to_vcc = icmp eq i32 %a, 0
247  %select = select i1 %scc_to_vcc, i32 %b, i32 %c
248  store i32 %select, ptr addrspace(1) %ptr
249  ret void
250}
251
252; this is only G_TRUNC that is not no-op in global-isel for AMDGPU
; Divergent trunc i32 -> i1: mask the low bit (v_and 1) and compare to 0
; to form vcc; identical output from both selectors.
253define amdgpu_ps void @vgpr_to_vcc_trunc(i32 %a, i32 %b, i32 %c, ptr addrspace(1) %ptr) {
254; OLD_RBS-LABEL: vgpr_to_vcc_trunc:
255; OLD_RBS:       ; %bb.0:
256; OLD_RBS-NEXT:    v_and_b32_e32 v0, 1, v0
257; OLD_RBS-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
258; OLD_RBS-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
259; OLD_RBS-NEXT:    global_store_dword v[3:4], v0, off
260; OLD_RBS-NEXT:    s_endpgm
261;
262; NEW_RBS-LABEL: vgpr_to_vcc_trunc:
263; NEW_RBS:       ; %bb.0:
264; NEW_RBS-NEXT:    v_and_b32_e32 v0, 1, v0
265; NEW_RBS-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
266; NEW_RBS-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
267; NEW_RBS-NEXT:    global_store_dword v[3:4], v0, off
268; NEW_RBS-NEXT:    s_endpgm
269  %vcc = trunc i32 %a to i1
270  %select = select i1 %vcc, i32 %b, i32 %c
271  store i32 %select, ptr addrspace(1) %ptr
272  ret void
273}
274
275; i1 input to zext and sext is something that survived legalizer (not trunc)
276; lower to select
; zext of a uniform i1: lowered to a 1/0 select (s_cselect_b32);
; identical output from both selectors.
277define amdgpu_ps void @zext(i32 inreg %a, ptr addrspace(1) %ptr) {
278; OLD_RBS-LABEL: zext:
279; OLD_RBS:       ; %bb.0:
280; OLD_RBS-NEXT:    s_cmp_eq_u32 s0, 10
281; OLD_RBS-NEXT:    s_cselect_b32 s0, 1, 0
282; OLD_RBS-NEXT:    v_mov_b32_e32 v2, s0
283; OLD_RBS-NEXT:    global_store_dword v[0:1], v2, off
284; OLD_RBS-NEXT:    s_endpgm
285;
286; NEW_RBS-LABEL: zext:
287; NEW_RBS:       ; %bb.0:
288; NEW_RBS-NEXT:    s_cmp_eq_u32 s0, 10
289; NEW_RBS-NEXT:    s_cselect_b32 s0, 1, 0
290; NEW_RBS-NEXT:    v_mov_b32_e32 v2, s0
291; NEW_RBS-NEXT:    global_store_dword v[0:1], v2, off
292; NEW_RBS-NEXT:    s_endpgm
293  %bool = icmp eq i32 %a, 10
294  %zext = zext i1 %bool to i32
295  store i32 %zext, ptr addrspace(1) %ptr
296  ret void
297}
298
; sext of a uniform i1: NEW_RBS folds OLD_RBS's s_bfe_i32 sign-extend
; into a single s_cselect_b32 s0, -1, 0.
299define amdgpu_ps void @sext(i32 inreg %a, ptr addrspace(1) %ptr) {
300; OLD_RBS-LABEL: sext:
301; OLD_RBS:       ; %bb.0:
302; OLD_RBS-NEXT:    s_cmp_eq_u32 s0, 10
303; OLD_RBS-NEXT:    s_cselect_b32 s0, 1, 0
304; OLD_RBS-NEXT:    s_bfe_i32 s0, s0, 0x10000
305; OLD_RBS-NEXT:    v_mov_b32_e32 v2, s0
306; OLD_RBS-NEXT:    global_store_dword v[0:1], v2, off
307; OLD_RBS-NEXT:    s_endpgm
308;
309; NEW_RBS-LABEL: sext:
310; NEW_RBS:       ; %bb.0:
311; NEW_RBS-NEXT:    s_cmp_eq_u32 s0, 10
312; NEW_RBS-NEXT:    s_cselect_b32 s0, -1, 0
313; NEW_RBS-NEXT:    v_mov_b32_e32 v2, s0
314; NEW_RBS-NEXT:    global_store_dword v[0:1], v2, off
315; NEW_RBS-NEXT:    s_endpgm
316  %bool = icmp eq i32 %a, 10
317  %sext = sext i1 %bool to i32
318  store i32 %sext, ptr addrspace(1) %ptr
319  ret void
320}
321
322; divergent i1 bitwise, i1 vcc.
323; inst selected into s_and_b32 on wave32 or s_and_b64 on wave64.
; Divergent i1 AND of two vcc values: selected as s_and_b32 on the lane
; masks (wave32 here); identical output from both selectors.
324define amdgpu_ps void @and_i1_vcc(i32 %a, i32 %b, ptr addrspace(1) %ptr) {
325; OLD_RBS-LABEL: and_i1_vcc:
326; OLD_RBS:       ; %bb.0:
327; OLD_RBS-NEXT:    v_cmp_le_u32_e32 vcc_lo, 10, v0
328; OLD_RBS-NEXT:    v_cmp_le_u32_e64 s0, 20, v1
329; OLD_RBS-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
330; OLD_RBS-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
331; OLD_RBS-NEXT:    global_store_dword v[2:3], v0, off
332; OLD_RBS-NEXT:    s_endpgm
333;
334; NEW_RBS-LABEL: and_i1_vcc:
335; NEW_RBS:       ; %bb.0:
336; NEW_RBS-NEXT:    v_cmp_le_u32_e32 vcc_lo, 10, v0
337; NEW_RBS-NEXT:    v_cmp_le_u32_e64 s0, 20, v1
338; NEW_RBS-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
339; NEW_RBS-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
340; NEW_RBS-NEXT:    global_store_dword v[2:3], v0, off
341; NEW_RBS-NEXT:    s_endpgm
342  %cmp_a = icmp uge i32 %a, 10
343  %cmp_b = icmp uge i32 %b, 20
344  %cc = and i1 %cmp_a, %cmp_b
345  %res = select i1 %cc, i32 %a, i32 %b
346  store i32 %res, ptr addrspace(1) %ptr
347  ret void
348}
349
350; uniform i1 bitwise, i32 sgpr. inst selected into s_and_b32.
; Uniform i1 AND in a 32-bit sgpr. NEW_RBS drops OLD_RBS's redundant
; "s_and_b32 s2, s2, 1" mask before the scc test.
351define amdgpu_ps void @and_i1_scc(i32 inreg %a, i32 inreg %b, ptr addrspace(1) %ptr) {
352; OLD_RBS-LABEL: and_i1_scc:
353; OLD_RBS:       ; %bb.0:
354; OLD_RBS-NEXT:    s_cmp_ge_u32 s0, 10
355; OLD_RBS-NEXT:    s_cselect_b32 s2, 1, 0
356; OLD_RBS-NEXT:    s_cmp_ge_u32 s1, 20
357; OLD_RBS-NEXT:    s_cselect_b32 s3, 1, 0
358; OLD_RBS-NEXT:    s_and_b32 s2, s2, s3
359; OLD_RBS-NEXT:    s_and_b32 s2, s2, 1
360; OLD_RBS-NEXT:    s_cmp_lg_u32 s2, 0
361; OLD_RBS-NEXT:    s_cselect_b32 s0, s0, s1
362; OLD_RBS-NEXT:    v_mov_b32_e32 v2, s0
363; OLD_RBS-NEXT:    global_store_dword v[0:1], v2, off
364; OLD_RBS-NEXT:    s_endpgm
365;
366; NEW_RBS-LABEL: and_i1_scc:
367; NEW_RBS:       ; %bb.0:
368; NEW_RBS-NEXT:    s_cmp_ge_u32 s0, 10
369; NEW_RBS-NEXT:    s_cselect_b32 s2, 1, 0
370; NEW_RBS-NEXT:    s_cmp_ge_u32 s1, 20
371; NEW_RBS-NEXT:    s_cselect_b32 s3, 1, 0
372; NEW_RBS-NEXT:    s_and_b32 s2, s2, s3
373; NEW_RBS-NEXT:    s_cmp_lg_u32 s2, 0
374; NEW_RBS-NEXT:    s_cselect_b32 s0, s0, s1
375; NEW_RBS-NEXT:    v_mov_b32_e32 v2, s0
376; NEW_RBS-NEXT:    global_store_dword v[0:1], v2, off
377; NEW_RBS-NEXT:    s_endpgm
378  %cmp_a = icmp uge i32 %a, 10
379  %cmp_b = icmp uge i32 %b, 20
380  %cc = and i1 %cmp_a, %cmp_b
381  %res = select i1 %cc, i32 %a, i32 %b
382  store i32 %res, ptr addrspace(1) %ptr
383  ret void
384}
385
386; old RBS selects sgpr phi because it had sgpr inputs.
; Phi over a divergent branch with constant (sgpr) inputs. OLD_RBS keeps
; the phi in an sgpr (s0); NEW_RBS copies each input into v0 so the phi
; lives in a vgpr across the exec-masked region.
387define amdgpu_ps void @divergent_phi_with_uniform_inputs(i32 %a, ptr addrspace(1) %out) {
388; OLD_RBS-LABEL: divergent_phi_with_uniform_inputs:
389; OLD_RBS:       ; %bb.0: ; %A
390; OLD_RBS-NEXT:    s_mov_b32 s0, 0
391; OLD_RBS-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
392; OLD_RBS-NEXT:    s_and_saveexec_b32 s1, vcc_lo
393; OLD_RBS-NEXT:  ; %bb.1: ; %B
394; OLD_RBS-NEXT:    s_mov_b32 s0, 1
395; OLD_RBS-NEXT:  ; %bb.2: ; %exit
396; OLD_RBS-NEXT:    s_or_b32 exec_lo, exec_lo, s1
397; OLD_RBS-NEXT:    v_mov_b32_e32 v0, s0
398; OLD_RBS-NEXT:    global_store_dword v[1:2], v0, off
399; OLD_RBS-NEXT:    s_endpgm
400;
401; NEW_RBS-LABEL: divergent_phi_with_uniform_inputs:
402; NEW_RBS:       ; %bb.0: ; %A
403; NEW_RBS-NEXT:    s_mov_b32 s0, 0
404; NEW_RBS-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
405; NEW_RBS-NEXT:    v_mov_b32_e32 v0, s0
406; NEW_RBS-NEXT:    s_and_saveexec_b32 s0, vcc_lo
407; NEW_RBS-NEXT:  ; %bb.1: ; %B
408; NEW_RBS-NEXT:    s_mov_b32 s1, 1
409; NEW_RBS-NEXT:    v_mov_b32_e32 v0, s1
410; NEW_RBS-NEXT:  ; %bb.2: ; %exit
411; NEW_RBS-NEXT:    s_or_b32 exec_lo, exec_lo, s0
412; NEW_RBS-NEXT:    global_store_dword v[1:2], v0, off
413; NEW_RBS-NEXT:    s_endpgm
414A:
415  %cmp = icmp eq i32 %a, 0
416  br i1 %cmp, label %B, label %exit
417
418B:
419  br label %exit
420
421exit:
422  %phi = phi i32 [ 0, %A ], [ 1, %B ]
423  store i32 %phi, ptr addrspace(1) %out
424  ret void
425}
426
427; old RBS assigned vgpr to uniform phi (because one input had undetermined bank)
428; and it propagated to mul, which was not wrong.
429; new RBS assigns vgpr to destination of mul even though both inputs are sgpr.
430; TODO: implement temporal divergence lowering
; Loop counter used outside the loop (temporal divergence). NEW_RBS keeps
; the counter in an sgpr inside the loop (s_add_i32) and copies it to v0
; for the divergent mul in the exit block; OLD_RBS keeps it in a vgpr
; throughout.
431define amdgpu_ps void @divergent_because_of_temporal_divergent_use(float %val, ptr addrspace(1) %addr) {
432; OLD_RBS-LABEL: divergent_because_of_temporal_divergent_use:
433; OLD_RBS:       ; %bb.0: ; %entry
434; OLD_RBS-NEXT:    s_mov_b32 s0, -1
435; OLD_RBS-NEXT:    v_mov_b32_e32 v3, s0
436; OLD_RBS-NEXT:    s_mov_b32 s0, 0
437; OLD_RBS-NEXT:  .LBB15_1: ; %loop
438; OLD_RBS-NEXT:    ; =>This Inner Loop Header: Depth=1
439; OLD_RBS-NEXT:    v_add_nc_u32_e32 v3, 1, v3
440; OLD_RBS-NEXT:    v_cvt_f32_u32_e32 v4, v3
441; OLD_RBS-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v4, v0
442; OLD_RBS-NEXT:    s_or_b32 s0, vcc_lo, s0
443; OLD_RBS-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
444; OLD_RBS-NEXT:    s_cbranch_execnz .LBB15_1
445; OLD_RBS-NEXT:  ; %bb.2: ; %exit
446; OLD_RBS-NEXT:    s_or_b32 exec_lo, exec_lo, s0
447; OLD_RBS-NEXT:    v_mul_lo_u32 v0, v3, 10
448; OLD_RBS-NEXT:    global_store_dword v[1:2], v0, off
449; OLD_RBS-NEXT:    s_endpgm
450;
451; NEW_RBS-LABEL: divergent_because_of_temporal_divergent_use:
452; NEW_RBS:       ; %bb.0: ; %entry
453; NEW_RBS-NEXT:    s_mov_b32 s0, -1
454; NEW_RBS-NEXT:    s_mov_b32 s1, 0
455; NEW_RBS-NEXT:  .LBB15_1: ; %loop
456; NEW_RBS-NEXT:    ; =>This Inner Loop Header: Depth=1
457; NEW_RBS-NEXT:    s_add_i32 s0, s0, 1
458; NEW_RBS-NEXT:    v_cvt_f32_u32_e32 v3, s0
459; NEW_RBS-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v3, v0
460; NEW_RBS-NEXT:    s_or_b32 s1, vcc_lo, s1
461; NEW_RBS-NEXT:    s_andn2_b32 exec_lo, exec_lo, s1
462; NEW_RBS-NEXT:    s_cbranch_execnz .LBB15_1
463; NEW_RBS-NEXT:  ; %bb.2: ; %exit
464; NEW_RBS-NEXT:    s_or_b32 exec_lo, exec_lo, s1
465; NEW_RBS-NEXT:    v_mov_b32_e32 v0, s0
466; NEW_RBS-NEXT:    v_mul_lo_u32 v0, v0, 10
467; NEW_RBS-NEXT:    global_store_dword v[1:2], v0, off
468; NEW_RBS-NEXT:    s_endpgm
469entry:
470  br label %loop
471
472loop:
473  %counter = phi i32 [ 0, %entry ], [ %counter.plus.1, %loop ]
474  %f.counter = uitofp i32 %counter to float
475  %cond = fcmp ogt float %f.counter, %val
476  %counter.plus.1 = add i32 %counter, 1
477  br i1 %cond, label %exit, label %loop
478
479exit:
480  %ceilx10 = mul i32 %counter, 10
481  store i32 %ceilx10, ptr addrspace(1) %addr
482  ret void
483}
484
485; Variables that handle the counter can be allocated to sgprs.
; Loop with two divergent breaks (structurized into Flow/Flow3 blocks).
; NEW_RBS keeps the counter and address arithmetic on the SALU
; (s_ashr_i32/s_lshl_b64/s_add_i32/s_cmpk_lt_u32) and only copies to
; vgprs for the address adds; OLD_RBS does all of it on the VALU.
486define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) %a, ptr addrspace(1) %b) {
487; OLD_RBS-LABEL: loop_with_2breaks:
488; OLD_RBS:       ; %bb.0: ; %entry
489; OLD_RBS-NEXT:    s_mov_b32 s0, 0
490; OLD_RBS-NEXT:    ; implicit-def: $sgpr1
491; OLD_RBS-NEXT:    v_mov_b32_e32 v6, s0
492; OLD_RBS-NEXT:    s_branch .LBB16_3
493; OLD_RBS-NEXT:  .LBB16_1: ; %Flow3
494; OLD_RBS-NEXT:    ; in Loop: Header=BB16_3 Depth=1
495; OLD_RBS-NEXT:    s_waitcnt_depctr 0xffe3
496; OLD_RBS-NEXT:    s_or_b32 exec_lo, exec_lo, s3
497; OLD_RBS-NEXT:    s_andn2_b32 s1, s1, exec_lo
498; OLD_RBS-NEXT:    s_and_b32 s3, exec_lo, s4
499; OLD_RBS-NEXT:    s_or_b32 s1, s1, s3
500; OLD_RBS-NEXT:  .LBB16_2: ; %Flow
501; OLD_RBS-NEXT:    ; in Loop: Header=BB16_3 Depth=1
502; OLD_RBS-NEXT:    s_or_b32 exec_lo, exec_lo, s2
503; OLD_RBS-NEXT:    s_and_b32 s2, exec_lo, s1
504; OLD_RBS-NEXT:    s_or_b32 s0, s2, s0
505; OLD_RBS-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
506; OLD_RBS-NEXT:    s_cbranch_execz .LBB16_6
507; OLD_RBS-NEXT:  .LBB16_3: ; %A
508; OLD_RBS-NEXT:    ; =>This Inner Loop Header: Depth=1
509; OLD_RBS-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
510; OLD_RBS-NEXT:    s_andn2_b32 s1, s1, exec_lo
511; OLD_RBS-NEXT:    s_and_b32 s2, exec_lo, -1
512; OLD_RBS-NEXT:    s_or_b32 s1, s1, s2
513; OLD_RBS-NEXT:    v_lshlrev_b64 v[7:8], 2, v[6:7]
514; OLD_RBS-NEXT:    v_add_co_u32 v9, vcc_lo, v2, v7
515; OLD_RBS-NEXT:    v_add_co_ci_u32_e32 v10, vcc_lo, v3, v8, vcc_lo
516; OLD_RBS-NEXT:    global_load_dword v9, v[9:10], off
517; OLD_RBS-NEXT:    s_waitcnt vmcnt(0)
518; OLD_RBS-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v9
519; OLD_RBS-NEXT:    s_and_saveexec_b32 s2, vcc_lo
520; OLD_RBS-NEXT:    s_cbranch_execz .LBB16_2
521; OLD_RBS-NEXT:  ; %bb.4: ; %B
522; OLD_RBS-NEXT:    ; in Loop: Header=BB16_3 Depth=1
523; OLD_RBS-NEXT:    v_add_co_u32 v9, vcc_lo, v4, v7
524; OLD_RBS-NEXT:    v_add_co_ci_u32_e32 v10, vcc_lo, v5, v8, vcc_lo
525; OLD_RBS-NEXT:    s_mov_b32 s4, -1
526; OLD_RBS-NEXT:    global_load_dword v9, v[9:10], off
527; OLD_RBS-NEXT:    s_waitcnt vmcnt(0)
528; OLD_RBS-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v9
529; OLD_RBS-NEXT:    s_and_saveexec_b32 s3, vcc_lo
530; OLD_RBS-NEXT:    s_cbranch_execz .LBB16_1
531; OLD_RBS-NEXT:  ; %bb.5: ; %loop.body
532; OLD_RBS-NEXT:    ; in Loop: Header=BB16_3 Depth=1
533; OLD_RBS-NEXT:    v_add_co_u32 v7, vcc_lo, v0, v7
534; OLD_RBS-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, v1, v8, vcc_lo
535; OLD_RBS-NEXT:    v_add_nc_u32_e32 v10, 1, v6
536; OLD_RBS-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 0x64, v6
537; OLD_RBS-NEXT:    s_andn2_b32 s4, -1, exec_lo
538; OLD_RBS-NEXT:    global_load_dword v9, v[7:8], off
539; OLD_RBS-NEXT:    v_mov_b32_e32 v6, v10
540; OLD_RBS-NEXT:    s_and_b32 s5, exec_lo, vcc_lo
541; OLD_RBS-NEXT:    s_or_b32 s4, s4, s5
542; OLD_RBS-NEXT:    s_waitcnt vmcnt(0)
543; OLD_RBS-NEXT:    v_add_nc_u32_e32 v9, 1, v9
544; OLD_RBS-NEXT:    global_store_dword v[7:8], v9, off
545; OLD_RBS-NEXT:    s_branch .LBB16_1
546; OLD_RBS-NEXT:  .LBB16_6: ; %exit
547; OLD_RBS-NEXT:    s_endpgm
548;
549; NEW_RBS-LABEL: loop_with_2breaks:
550; NEW_RBS:       ; %bb.0: ; %entry
551; NEW_RBS-NEXT:    s_mov_b32 s4, 0
552; NEW_RBS-NEXT:    s_mov_b32 s0, 0
553; NEW_RBS-NEXT:    ; implicit-def: $sgpr5
554; NEW_RBS-NEXT:    s_branch .LBB16_3
555; NEW_RBS-NEXT:  .LBB16_1: ; %Flow3
556; NEW_RBS-NEXT:    ; in Loop: Header=BB16_3 Depth=1
557; NEW_RBS-NEXT:    s_waitcnt_depctr 0xffe3
558; NEW_RBS-NEXT:    s_or_b32 exec_lo, exec_lo, s7
559; NEW_RBS-NEXT:    s_andn2_b32 s2, s5, exec_lo
560; NEW_RBS-NEXT:    s_and_b32 s3, exec_lo, s6
561; NEW_RBS-NEXT:    s_or_b32 s5, s2, s3
562; NEW_RBS-NEXT:  .LBB16_2: ; %Flow
563; NEW_RBS-NEXT:    ; in Loop: Header=BB16_3 Depth=1
564; NEW_RBS-NEXT:    s_or_b32 exec_lo, exec_lo, s1
565; NEW_RBS-NEXT:    s_and_b32 s1, exec_lo, s5
566; NEW_RBS-NEXT:    s_or_b32 s4, s1, s4
567; NEW_RBS-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
568; NEW_RBS-NEXT:    s_cbranch_execz .LBB16_6
569; NEW_RBS-NEXT:  .LBB16_3: ; %A
570; NEW_RBS-NEXT:    ; =>This Inner Loop Header: Depth=1
571; NEW_RBS-NEXT:    s_ashr_i32 s1, s0, 31
572; NEW_RBS-NEXT:    s_lshl_b64 s[2:3], s[0:1], 2
573; NEW_RBS-NEXT:    s_andn2_b32 s1, s5, exec_lo
574; NEW_RBS-NEXT:    v_mov_b32_e32 v7, s3
575; NEW_RBS-NEXT:    v_mov_b32_e32 v6, s2
576; NEW_RBS-NEXT:    s_and_b32 s5, exec_lo, exec_lo
577; NEW_RBS-NEXT:    s_or_b32 s5, s1, s5
578; NEW_RBS-NEXT:    v_add_co_u32 v6, vcc_lo, v2, v6
579; NEW_RBS-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, v3, v7, vcc_lo
580; NEW_RBS-NEXT:    global_load_dword v6, v[6:7], off
581; NEW_RBS-NEXT:    s_waitcnt vmcnt(0)
582; NEW_RBS-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v6
583; NEW_RBS-NEXT:    s_and_saveexec_b32 s1, vcc_lo
584; NEW_RBS-NEXT:    s_cbranch_execz .LBB16_2
585; NEW_RBS-NEXT:  ; %bb.4: ; %B
586; NEW_RBS-NEXT:    ; in Loop: Header=BB16_3 Depth=1
587; NEW_RBS-NEXT:    v_mov_b32_e32 v7, s3
588; NEW_RBS-NEXT:    v_mov_b32_e32 v6, s2
589; NEW_RBS-NEXT:    s_mov_b32 s6, exec_lo
590; NEW_RBS-NEXT:    v_add_co_u32 v6, vcc_lo, v4, v6
591; NEW_RBS-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, v5, v7, vcc_lo
592; NEW_RBS-NEXT:    global_load_dword v6, v[6:7], off
593; NEW_RBS-NEXT:    s_waitcnt vmcnt(0)
594; NEW_RBS-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v6
595; NEW_RBS-NEXT:    s_and_saveexec_b32 s7, vcc_lo
596; NEW_RBS-NEXT:    s_cbranch_execz .LBB16_1
597; NEW_RBS-NEXT:  ; %bb.5: ; %loop.body
598; NEW_RBS-NEXT:    ; in Loop: Header=BB16_3 Depth=1
599; NEW_RBS-NEXT:    v_mov_b32_e32 v7, s3
600; NEW_RBS-NEXT:    v_mov_b32_e32 v6, s2
601; NEW_RBS-NEXT:    s_add_i32 s2, s0, 1
602; NEW_RBS-NEXT:    s_cmpk_lt_u32 s0, 0x64
603; NEW_RBS-NEXT:    s_cselect_b32 s0, exec_lo, 0
604; NEW_RBS-NEXT:    v_add_co_u32 v6, vcc_lo, v0, v6
605; NEW_RBS-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, v1, v7, vcc_lo
606; NEW_RBS-NEXT:    s_andn2_b32 s3, s6, exec_lo
607; NEW_RBS-NEXT:    s_and_b32 s0, exec_lo, s0
608; NEW_RBS-NEXT:    s_or_b32 s6, s3, s0
609; NEW_RBS-NEXT:    global_load_dword v8, v[6:7], off
610; NEW_RBS-NEXT:    s_mov_b32 s0, s2
611; NEW_RBS-NEXT:    s_waitcnt vmcnt(0)
612; NEW_RBS-NEXT:    v_add_nc_u32_e32 v8, 1, v8
613; NEW_RBS-NEXT:    global_store_dword v[6:7], v8, off
614; NEW_RBS-NEXT:    s_branch .LBB16_1
615; NEW_RBS-NEXT:  .LBB16_6: ; %exit
616; NEW_RBS-NEXT:    s_endpgm
617entry:
618  br label %A
619
620A:
621  %counter = phi i32 [ %counter.plus.1, %loop.body ], [ 0, %entry ]
622  %a.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %a, i32 %counter
623  %a.val = load i32, ptr addrspace(1) %a.plus.counter
624  %a.cond = icmp eq i32 %a.val, 0
625  br i1 %a.cond, label %exit, label %B
626
627B:
628  %b.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %b, i32 %counter
629  %b.val = load i32, ptr addrspace(1) %b.plus.counter
630  %b.cond = icmp eq i32 %b.val, 0
631  br i1 %b.cond, label %exit, label %loop.body
632
633loop.body:
634  %x.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %x, i32 %counter
635  %x.val = load i32, ptr addrspace(1) %x.plus.counter
636  %x.val.plus.1 = add i32 %x.val, 1
637  store i32 %x.val.plus.1, ptr addrspace(1) %x.plus.counter
638  %counter.plus.1 = add i32 %counter, 1
639  %x.cond = icmp ult i32 %counter, 100
640  br i1 %x.cond, label %exit, label %A
641
642exit:
643  ret void
644}
645
646declare i16 @llvm.abs.i16(i16, i1)
647declare <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32>, i32, i32, i32 immarg)
648