; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX1010 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX1100 %s

; Test that unused lanes in the s_xor result are masked out with v_cndmask.

define i32 @combine_add_zext_xor() {
; GFX1010-LABEL: combine_add_zext_xor:
; GFX1010:       ; %bb.0: ; %.entry
; GFX1010-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1010-NEXT:    v_mov_b32_e32 v1, 0
; GFX1010-NEXT:    s_branch .LBB0_2
; GFX1010-NEXT:  .LBB0_1: ; %bb9
; GFX1010-NEXT:    ; in Loop: Header=BB0_2 Depth=1
; GFX1010-NEXT:    s_xor_b32 s4, s4, -1
; GFX1010-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1
; GFX1010-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
; GFX1010-NEXT:    v_add_nc_u32_e32 v2, v1, v0
; GFX1010-NEXT:    v_mov_b32_e32 v1, v2
; GFX1010-NEXT:    s_cbranch_vccz .LBB0_4
; GFX1010-NEXT:  .LBB0_2: ; %.a
; GFX1010-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX1010-NEXT:    ; implicit-def: $sgpr4
; GFX1010-NEXT:    s_cbranch_scc1 .LBB0_1
; GFX1010-NEXT:  ; %bb.3: ; %bb
; GFX1010-NEXT:    ; in Loop: Header=BB0_2 Depth=1
; GFX1010-NEXT:    buffer_load_dword v0, v1, s[4:7], 64 offen glc
; GFX1010-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-NEXT:    v_cmp_eq_u32_e64 s4, 0, v0
; GFX1010-NEXT:    s_branch .LBB0_1
; GFX1010-NEXT:  .LBB0_4: ; %.exit
; GFX1010-NEXT:    s_setpc_b64 s[30:31]
;
; GFX1100-LABEL: combine_add_zext_xor:
; GFX1100:       ; %bb.0: ; %.entry
; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-NEXT:    v_mov_b32_e32 v1, 0
; GFX1100-NEXT:    s_branch .LBB0_2
; GFX1100-NEXT:  .LBB0_1: ; %bb9
; GFX1100-NEXT:    ; in Loop: Header=BB0_2 Depth=1
; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-NEXT:    s_xor_b32 s0, s0, -1
; GFX1100-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1
; GFX1100-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-NEXT:    v_add_nc_u32_e32 v2, v1, v0
; GFX1100-NEXT:    v_mov_b32_e32 v1, v2
; GFX1100-NEXT:    s_cbranch_vccz .LBB0_4
; GFX1100-NEXT:  .LBB0_2: ; %.a
; GFX1100-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX1100-NEXT:    ; implicit-def: $sgpr0
; GFX1100-NEXT:    s_cbranch_scc1 .LBB0_1
; GFX1100-NEXT:  ; %bb.3: ; %bb
; GFX1100-NEXT:    ; in Loop: Header=BB0_2 Depth=1
; GFX1100-NEXT:    buffer_load_b32 v0, v1, s[0:3], 64 offen glc
; GFX1100-NEXT:    s_waitcnt vmcnt(0)
; GFX1100-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
; GFX1100-NEXT:    s_branch .LBB0_1
; GFX1100-NEXT:  .LBB0_4: ; %.exit
; GFX1100-NEXT:    s_setpc_b64 s[30:31]
.entry:
  br label %.a

.a:                                               ; preds = %bb9, %.entry
  %.2 = phi i32 [ 0, %.entry ], [ %i11, %bb9 ]
  br i1 undef, label %bb9, label %bb

bb:                                               ; preds = %.a
  %.i3 = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) undef, i32 %.2, i32 64, i32 1)
  %i5 = icmp eq i32 %.i3, 0
  br label %bb9

bb9:                                              ; preds = %bb, %.a
  %.2.0.in.in = phi i1 [ %i5, %bb ], [ undef, %.a ]
  %.2.0.in = xor i1 %.2.0.in.in, true
  %.2.0 = zext i1 %.2.0.in to i32
  %i11 = add i32 %.2, %.2.0
  %i12 = icmp sgt i32 %.2, -1050
  br i1 %i12, label %.a, label %.exit

.exit:                                            ; preds = %bb9
  ret i32 %.2.0
}

; Test that unused lanes in the s_xor result are masked out with v_cndmask.

define i32 @combine_sub_zext_xor() {
; GFX1010-LABEL: combine_sub_zext_xor:
; GFX1010:       ; %bb.0: ; %.entry
; GFX1010-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1010-NEXT:    v_mov_b32_e32 v1, 0
; GFX1010-NEXT:    s_branch .LBB1_2
; GFX1010-NEXT:  .LBB1_1: ; %bb9
; GFX1010-NEXT:    ; in Loop: Header=BB1_2 Depth=1
; GFX1010-NEXT:    s_xor_b32 s4, s4, -1
; GFX1010-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1
; GFX1010-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
; GFX1010-NEXT:    v_sub_nc_u32_e32 v2, v1, v0
; GFX1010-NEXT:    v_mov_b32_e32 v1, v2
; GFX1010-NEXT:    s_cbranch_vccz .LBB1_4
; GFX1010-NEXT:  .LBB1_2: ; %.a
; GFX1010-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX1010-NEXT:    ; implicit-def: $sgpr4
; GFX1010-NEXT:    s_cbranch_scc1 .LBB1_1
; GFX1010-NEXT:  ; %bb.3: ; %bb
; GFX1010-NEXT:    ; in Loop: Header=BB1_2 Depth=1
; GFX1010-NEXT:    buffer_load_dword v0, v1, s[4:7], 64 offen glc
; GFX1010-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-NEXT:    v_cmp_eq_u32_e64 s4, 0, v0
; GFX1010-NEXT:    s_branch .LBB1_1
; GFX1010-NEXT:  .LBB1_4: ; %.exit
; GFX1010-NEXT:    s_setpc_b64 s[30:31]
;
; GFX1100-LABEL: combine_sub_zext_xor:
; GFX1100:       ; %bb.0: ; %.entry
; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-NEXT:    v_mov_b32_e32 v1, 0
; GFX1100-NEXT:    s_branch .LBB1_2
; GFX1100-NEXT:  .LBB1_1: ; %bb9
; GFX1100-NEXT:    ; in Loop: Header=BB1_2 Depth=1
; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-NEXT:    s_xor_b32 s0, s0, -1
; GFX1100-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1
; GFX1100-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-NEXT:    v_sub_nc_u32_e32 v2, v1, v0
; GFX1100-NEXT:    v_mov_b32_e32 v1, v2
; GFX1100-NEXT:    s_cbranch_vccz .LBB1_4
; GFX1100-NEXT:  .LBB1_2: ; %.a
; GFX1100-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX1100-NEXT:    ; implicit-def: $sgpr0
; GFX1100-NEXT:    s_cbranch_scc1 .LBB1_1
; GFX1100-NEXT:  ; %bb.3: ; %bb
; GFX1100-NEXT:    ; in Loop: Header=BB1_2 Depth=1
; GFX1100-NEXT:    buffer_load_b32 v0, v1, s[0:3], 64 offen glc
; GFX1100-NEXT:    s_waitcnt vmcnt(0)
; GFX1100-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
; GFX1100-NEXT:    s_branch .LBB1_1
; GFX1100-NEXT:  .LBB1_4: ; %.exit
; GFX1100-NEXT:    s_setpc_b64 s[30:31]
.entry:
  br label %.a

.a:                                               ; preds = %bb9, %.entry
  %.2 = phi i32 [ 0, %.entry ], [ %i11, %bb9 ]
  br i1 undef, label %bb9, label %bb

bb:                                               ; preds = %.a
  %.i3 = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) undef, i32 %.2, i32 64, i32 1)
  %i5 = icmp eq i32 %.i3, 0
  br label %bb9

bb9:                                              ; preds = %bb, %.a
  %.2.0.in.in = phi i1 [ %i5, %bb ], [ undef, %.a ]
  %.2.0.in = xor i1 %.2.0.in.in, true
  %.2.0 = zext i1 %.2.0.in to i32
  %i11 = sub i32 %.2, %.2.0
  %i12 = icmp sgt i32 %.2, -1050
  br i1 %i12, label %.a, label %.exit

.exit:                                            ; preds = %bb9
  ret i32 %.2.0
}

; Test that unused lanes in the s_or result are masked out with v_cndmask.

define i32 @combine_add_zext_or() {
; GFX1010-LABEL: combine_add_zext_or:
; GFX1010:       ; %bb.0: ; %.entry
; GFX1010-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1010-NEXT:    s_mov_b32 s4, 0
; GFX1010-NEXT:    s_branch .LBB2_2
; GFX1010-NEXT:  .LBB2_1: ; %bb9
; GFX1010-NEXT:    ; in Loop: Header=BB2_2 Depth=1
; GFX1010-NEXT:    s_cmpk_gt_i32 s4, 0xfbe6
; GFX1010-NEXT:    s_cselect_b32 s6, -1, 0
; GFX1010-NEXT:    s_add_i32 s4, s4, 1
; GFX1010-NEXT:    s_and_b32 vcc_lo, exec_lo, s6
; GFX1010-NEXT:    s_cbranch_vccz .LBB2_4
; GFX1010-NEXT:  .LBB2_2: ; %.a
; GFX1010-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX1010-NEXT:    ; implicit-def: $sgpr5
; GFX1010-NEXT:    s_cbranch_scc1 .LBB2_1
; GFX1010-NEXT:  ; %bb.3: ; %bb
; GFX1010-NEXT:    ; in Loop: Header=BB2_2 Depth=1
; GFX1010-NEXT:    v_mov_b32_e32 v0, s4
; GFX1010-NEXT:    buffer_load_dword v0, v0, s[4:7], 64 offen glc
; GFX1010-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-NEXT:    v_cmp_eq_u32_e64 s5, 0, v0
; GFX1010-NEXT:    s_branch .LBB2_1
; GFX1010-NEXT:  .LBB2_4: ; %.exit
; GFX1010-NEXT:    s_or_b32 s4, s5, s6
; GFX1010-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
; GFX1010-NEXT:    s_setpc_b64 s[30:31]
;
; GFX1100-LABEL: combine_add_zext_or:
; GFX1100:       ; %bb.0: ; %.entry
; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-NEXT:    s_mov_b32 s0, 0
; GFX1100-NEXT:    s_branch .LBB2_2
; GFX1100-NEXT:  .LBB2_1: ; %bb9
; GFX1100-NEXT:    ; in Loop: Header=BB2_2 Depth=1
; GFX1100-NEXT:    s_cmpk_gt_i32 s0, 0xfbe6
; GFX1100-NEXT:    s_cselect_b32 s2, -1, 0
; GFX1100-NEXT:    s_add_i32 s0, s0, 1
; GFX1100-NEXT:    s_and_b32 vcc_lo, exec_lo, s2
; GFX1100-NEXT:    s_cbranch_vccz .LBB2_4
; GFX1100-NEXT:  .LBB2_2: ; %.a
; GFX1100-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX1100-NEXT:    ; implicit-def: $sgpr1
; GFX1100-NEXT:    s_cbranch_scc1 .LBB2_1
; GFX1100-NEXT:  ; %bb.3: ; %bb
; GFX1100-NEXT:    ; in Loop: Header=BB2_2 Depth=1
; GFX1100-NEXT:    v_mov_b32_e32 v0, s0
; GFX1100-NEXT:    buffer_load_b32 v0, v0, s[0:3], 64 offen glc
; GFX1100-NEXT:    s_waitcnt vmcnt(0)
; GFX1100-NEXT:    v_cmp_eq_u32_e64 s1, 0, v0
; GFX1100-NEXT:    s_branch .LBB2_1
; GFX1100-NEXT:  .LBB2_4: ; %.exit
; GFX1100-NEXT:    s_or_b32 s0, s1, s2
; GFX1100-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX1100-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
; GFX1100-NEXT:    s_setpc_b64 s[30:31]
.entry:
  br label %.a

.a:                                               ; preds = %bb9, %.entry
  %.2 = phi i32 [ 0, %.entry ], [ %i11, %bb9 ]
  br i1 undef, label %bb9, label %bb

bb:                                               ; preds = %.a
  %.i3 = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) undef, i32 %.2, i32 64, i32 1)
  %i5 = icmp eq i32 %.i3, 0
  br label %bb9

bb9:                                              ; preds = %bb, %.a
  %.2.0.in.in = phi i1 [ %i5, %bb ], [ undef, %.a ]
  %t = icmp sgt i32 %.2, -1050
  %.2.0.in = or i1 %.2.0.in.in, %t
  %.2.0 = zext i1 %.2.0.in to i32
  %i11 = add i32 %.2, %.2.0
  %i12 = icmp sgt i32 %.2, -1050
  br i1 %i12, label %.a, label %.exit

.exit:                                            ; preds = %bb9
  ret i32 %.2.0
}

; Test that unused lanes in the s_or result are masked out with v_cndmask.

define i32 @combine_sub_zext_or() {
; GFX1010-LABEL: combine_sub_zext_or:
; GFX1010:       ; %bb.0: ; %.entry
; GFX1010-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1010-NEXT:    s_mov_b32 s4, 0
; GFX1010-NEXT:    s_branch .LBB3_2
; GFX1010-NEXT:  .LBB3_1: ; %bb9
; GFX1010-NEXT:    ; in Loop: Header=BB3_2 Depth=1
; GFX1010-NEXT:    s_cmpk_gt_i32 s4, 0xfbe6
; GFX1010-NEXT:    s_cselect_b32 s6, -1, 0
; GFX1010-NEXT:    s_add_i32 s4, s4, -1
; GFX1010-NEXT:    s_and_b32 vcc_lo, exec_lo, s6
; GFX1010-NEXT:    s_cbranch_vccz .LBB3_4
; GFX1010-NEXT:  .LBB3_2: ; %.a
; GFX1010-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX1010-NEXT:    ; implicit-def: $sgpr5
; GFX1010-NEXT:    s_cbranch_scc1 .LBB3_1
; GFX1010-NEXT:  ; %bb.3: ; %bb
; GFX1010-NEXT:    ; in Loop: Header=BB3_2 Depth=1
; GFX1010-NEXT:    v_mov_b32_e32 v0, s4
; GFX1010-NEXT:    buffer_load_dword v0, v0, s[4:7], 64 offen glc
; GFX1010-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-NEXT:    v_cmp_eq_u32_e64 s5, 0, v0
; GFX1010-NEXT:    s_branch .LBB3_1
; GFX1010-NEXT:  .LBB3_4: ; %.exit
; GFX1010-NEXT:    s_or_b32 s4, s5, s6
; GFX1010-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
; GFX1010-NEXT:    s_setpc_b64 s[30:31]
;
; GFX1100-LABEL: combine_sub_zext_or:
; GFX1100:       ; %bb.0: ; %.entry
; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-NEXT:    s_mov_b32 s0, 0
; GFX1100-NEXT:    s_branch .LBB3_2
; GFX1100-NEXT:  .LBB3_1: ; %bb9
; GFX1100-NEXT:    ; in Loop: Header=BB3_2 Depth=1
; GFX1100-NEXT:    s_cmpk_gt_i32 s0, 0xfbe6
; GFX1100-NEXT:    s_cselect_b32 s2, -1, 0
; GFX1100-NEXT:    s_add_i32 s0, s0, -1
; GFX1100-NEXT:    s_and_b32 vcc_lo, exec_lo, s2
; GFX1100-NEXT:    s_cbranch_vccz .LBB3_4
; GFX1100-NEXT:  .LBB3_2: ; %.a
; GFX1100-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX1100-NEXT:    ; implicit-def: $sgpr1
; GFX1100-NEXT:    s_cbranch_scc1 .LBB3_1
; GFX1100-NEXT:  ; %bb.3: ; %bb
; GFX1100-NEXT:    ; in Loop: Header=BB3_2 Depth=1
; GFX1100-NEXT:    v_mov_b32_e32 v0, s0
; GFX1100-NEXT:    buffer_load_b32 v0, v0, s[0:3], 64 offen glc
; GFX1100-NEXT:    s_waitcnt vmcnt(0)
; GFX1100-NEXT:    v_cmp_eq_u32_e64 s1, 0, v0
; GFX1100-NEXT:    s_branch .LBB3_1
; GFX1100-NEXT:  .LBB3_4: ; %.exit
; GFX1100-NEXT:    s_or_b32 s0, s1, s2
; GFX1100-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX1100-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
; GFX1100-NEXT:    s_setpc_b64 s[30:31]
.entry:
  br label %.a

.a:                                               ; preds = %bb9, %.entry
  %.2 = phi i32 [ 0, %.entry ], [ %i11, %bb9 ]
  br i1 undef, label %bb9, label %bb

bb:                                               ; preds = %.a
  %.i3 = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) undef, i32 %.2, i32 64, i32 1)
  %i5 = icmp eq i32 %.i3, 0
  br label %bb9

bb9:                                              ; preds = %bb, %.a
  %.2.0.in.in = phi i1 [ %i5, %bb ], [ undef, %.a ]
  %t = icmp sgt i32 %.2, -1050
  %.2.0.in = or i1 %.2.0.in.in, %t
  %.2.0 = zext i1 %.2.0.in to i32
  %i11 = sub i32 %.2, %.2.0
  %i12 = icmp sgt i32 %.2, -1050
  br i1 %i12, label %.a, label %.exit

.exit:                                            ; preds = %bb9
  ret i32 %.2.0
}

; Test that unused lanes in the s_and result are masked out with v_cndmask.

define i32 @combine_add_zext_and() {
; GFX1010-LABEL: combine_add_zext_and:
; GFX1010:       ; %bb.0: ; %.entry
; GFX1010-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1010-NEXT:    v_mov_b32_e32 v1, 0
; GFX1010-NEXT:    s_branch .LBB4_2
; GFX1010-NEXT:  .LBB4_1: ; %bb9
; GFX1010-NEXT:    ; in Loop: Header=BB4_2 Depth=1
; GFX1010-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1
; GFX1010-NEXT:    s_and_b32 s4, s4, vcc_lo
; GFX1010-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
; GFX1010-NEXT:    v_add_nc_u32_e32 v1, v1, v0
; GFX1010-NEXT:    s_cbranch_vccz .LBB4_4
; GFX1010-NEXT:  .LBB4_2: ; %.a
; GFX1010-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX1010-NEXT:    ; implicit-def: $sgpr4
; GFX1010-NEXT:    s_cbranch_scc1 .LBB4_1
; GFX1010-NEXT:  ; %bb.3: ; %bb
; GFX1010-NEXT:    ; in Loop: Header=BB4_2 Depth=1
; GFX1010-NEXT:    buffer_load_dword v0, v1, s[4:7], 64 offen glc
; GFX1010-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-NEXT:    v_cmp_eq_u32_e64 s4, 0, v0
; GFX1010-NEXT:    s_branch .LBB4_1
; GFX1010-NEXT:  .LBB4_4: ; %.exit
; GFX1010-NEXT:    s_setpc_b64 s[30:31]
;
; GFX1100-LABEL: combine_add_zext_and:
; GFX1100:       ; %bb.0: ; %.entry
; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-NEXT:    v_mov_b32_e32 v1, 0
; GFX1100-NEXT:    s_branch .LBB4_2
; GFX1100-NEXT:  .LBB4_1: ; %bb9
; GFX1100-NEXT:    ; in Loop: Header=BB4_2 Depth=1
; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1100-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1
; GFX1100-NEXT:    s_and_b32 s0, s0, vcc_lo
; GFX1100-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
; GFX1100-NEXT:    v_add_nc_u32_e32 v1, v1, v0
; GFX1100-NEXT:    s_cbranch_vccz .LBB4_4
; GFX1100-NEXT:  .LBB4_2: ; %.a
; GFX1100-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX1100-NEXT:    ; implicit-def: $sgpr0
; GFX1100-NEXT:    s_cbranch_scc1 .LBB4_1
; GFX1100-NEXT:  ; %bb.3: ; %bb
; GFX1100-NEXT:    ; in Loop: Header=BB4_2 Depth=1
; GFX1100-NEXT:    buffer_load_b32 v0, v1, s[0:3], 64 offen glc
; GFX1100-NEXT:    s_waitcnt vmcnt(0)
; GFX1100-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
; GFX1100-NEXT:    s_branch .LBB4_1
; GFX1100-NEXT:  .LBB4_4: ; %.exit
; GFX1100-NEXT:    s_setpc_b64 s[30:31]
.entry:
  br label %.a

.a:                                               ; preds = %bb9, %.entry
  %.2 = phi i32 [ 0, %.entry ], [ %i11, %bb9 ]
  br i1 undef, label %bb9, label %bb

bb:                                               ; preds = %.a
  %.i3 = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) undef, i32 %.2, i32 64, i32 1)
  %i5 = icmp eq i32 %.i3, 0
  br label %bb9

bb9:                                              ; preds = %bb, %.a
  %.2.0.in.in = phi i1 [ %i5, %bb ], [ undef, %.a ]
  %t = icmp sgt i32 %.2, -1050
  %.2.0.in = and i1 %.2.0.in.in, %t
  %.2.0 = zext i1 %.2.0.in to i32
  %i11 = add i32 %.2, %.2.0
  %i12 = icmp sgt i32 %.2, -1050
  br i1 %i12, label %.a, label %.exit

.exit:                                            ; preds = %bb9
  ret i32 %.2.0
}

; Test that unused lanes in the s_and result are masked out with v_cndmask.

define i32 @combine_sub_zext_and() {
; GFX1010-LABEL: combine_sub_zext_and:
; GFX1010:       ; %bb.0: ; %.entry
; GFX1010-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1010-NEXT:    v_mov_b32_e32 v1, 0
; GFX1010-NEXT:    s_branch .LBB5_2
; GFX1010-NEXT:  .LBB5_1: ; %bb9
; GFX1010-NEXT:    ; in Loop: Header=BB5_2 Depth=1
; GFX1010-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1
; GFX1010-NEXT:    s_and_b32 s4, s4, vcc_lo
; GFX1010-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
; GFX1010-NEXT:    v_sub_nc_u32_e32 v1, v1, v0
; GFX1010-NEXT:    s_cbranch_vccz .LBB5_4
; GFX1010-NEXT:  .LBB5_2: ; %.a
; GFX1010-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX1010-NEXT:    ; implicit-def: $sgpr4
; GFX1010-NEXT:    s_cbranch_scc1 .LBB5_1
; GFX1010-NEXT:  ; %bb.3: ; %bb
; GFX1010-NEXT:    ; in Loop: Header=BB5_2 Depth=1
; GFX1010-NEXT:    buffer_load_dword v0, v1, s[4:7], 64 offen glc
; GFX1010-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-NEXT:    v_cmp_eq_u32_e64 s4, 0, v0
; GFX1010-NEXT:    s_branch .LBB5_1
; GFX1010-NEXT:  .LBB5_4: ; %.exit
; GFX1010-NEXT:    s_setpc_b64 s[30:31]
;
; GFX1100-LABEL: combine_sub_zext_and:
; GFX1100:       ; %bb.0: ; %.entry
; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-NEXT:    v_mov_b32_e32 v1, 0
; GFX1100-NEXT:    s_branch .LBB5_2
; GFX1100-NEXT:  .LBB5_1: ; %bb9
; GFX1100-NEXT:    ; in Loop: Header=BB5_2 Depth=1
; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1100-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1
; GFX1100-NEXT:    s_and_b32 s0, s0, vcc_lo
; GFX1100-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
; GFX1100-NEXT:    v_sub_nc_u32_e32 v1, v1, v0
; GFX1100-NEXT:    s_cbranch_vccz .LBB5_4
; GFX1100-NEXT:  .LBB5_2: ; %.a
; GFX1100-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX1100-NEXT:    ; implicit-def: $sgpr0
; GFX1100-NEXT:    s_cbranch_scc1 .LBB5_1
; GFX1100-NEXT:  ; %bb.3: ; %bb
; GFX1100-NEXT:    ; in Loop: Header=BB5_2 Depth=1
; GFX1100-NEXT:    buffer_load_b32 v0, v1, s[0:3], 64 offen glc
; GFX1100-NEXT:    s_waitcnt vmcnt(0)
; GFX1100-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
; GFX1100-NEXT:    s_branch .LBB5_1
; GFX1100-NEXT:  .LBB5_4: ; %.exit
; GFX1100-NEXT:    s_setpc_b64 s[30:31]
.entry:
  br label %.a

.a:                                               ; preds = %bb9, %.entry
  %.2 = phi i32 [ 0, %.entry ], [ %i11, %bb9 ]
  br i1 undef, label %bb9, label %bb

bb:                                               ; preds = %.a
  %.i3 = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) undef, i32 %.2, i32 64, i32 1)
  %i5 = icmp eq i32 %.i3, 0
  br label %bb9

bb9:                                              ; preds = %bb, %.a
  %.2.0.in.in = phi i1 [ %i5, %bb ], [ undef, %.a ]
  %t = icmp sgt i32 %.2, -1050
  %.2.0.in = and i1 %.2.0.in.in, %t
  %.2.0 = zext i1 %.2.0.in to i32
  %i11 = sub i32 %.2, %.2.0
  %i12 = icmp sgt i32 %.2, -1050
  br i1 %i12, label %.a, label %.exit

.exit:                                            ; preds = %bb9
  ret i32 %.2.0
}


; Function Attrs: nounwind readonly willreturn
declare i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) nocapture, i32, i32, i32 immarg) #0

attributes #0 = { nounwind willreturn memory(argmem: read) }