xref: /llvm-project/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefix=GFX906 %s
3
; v3i8_liveout: a <3 x i8> vector is loaded in %entry and consumed by a phi in
; %bb.2, so the value has to survive a basic-block boundary. The phi merges
; %vec1 (edge from %entry) with %vec2 (edge through the empty %bb.1).
; The assertion lines inside the function are autogenerated (see the NOTE at
; the top of the file); regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
; In the expected output the three bytes are written with a short store plus a
; d16_hi byte store at offset 2.
4define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
5; GFX906-LABEL: v3i8_liveout:
6; GFX906:       ; %bb.0: ; %entry
7; GFX906-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
8; GFX906-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
9; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
10; GFX906-NEXT:    v_mov_b32_e32 v4, 8
11; GFX906-NEXT:    v_mov_b32_e32 v5, 16
12; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
13; GFX906-NEXT:    global_load_dword v3, v2, s[0:1]
14; GFX906-NEXT:    v_mov_b32_e32 v1, 0xff
15; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
16; GFX906-NEXT:    s_waitcnt vmcnt(0)
17; GFX906-NEXT:    v_and_b32_e32 v6, 0xff, v3
18; GFX906-NEXT:    v_lshlrev_b32_sdwa v7, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
19; GFX906-NEXT:    v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
20; GFX906-NEXT:    v_or3_b32 v3, v6, v7, v3
21; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
22; GFX906-NEXT:    s_cbranch_execz .LBB0_2
23; GFX906-NEXT:  ; %bb.1: ; %bb.1
24; GFX906-NEXT:    global_load_dword v0, v2, s[2:3]
25; GFX906-NEXT:    s_waitcnt vmcnt(0)
26; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v0
27; GFX906-NEXT:    v_lshlrev_b32_sdwa v3, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
28; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
29; GFX906-NEXT:    v_or3_b32 v3, v2, v3, v0
30; GFX906-NEXT:  .LBB0_2: ; %bb.2
31; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
32; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v3
33; GFX906-NEXT:    v_and_b32_e32 v0, 0xff, v0
34; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
35; GFX906-NEXT:    v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
36; GFX906-NEXT:    v_and_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
37; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
38; GFX906-NEXT:    v_and_b32_e32 v1, 0xffff, v1
39; GFX906-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
40; GFX906-NEXT:    v_mov_b32_e32 v1, 0
41; GFX906-NEXT:    global_store_short v1, v0, s[6:7]
42; GFX906-NEXT:    global_store_byte_d16_hi v1, v0, s[6:7] offset:2
43; GFX906-NEXT:    s_endpgm
44entry:
  ; Both source buffers are indexed by the workitem id, one <3 x i8> element
  ; per workitem; both loads are written in %entry.
45  %idx = call i32 @llvm.amdgcn.workitem.id.x()
46  %gep1 = getelementptr <3 x i8>, ptr addrspace(1) %src1, i32 %idx
47  %vec1 = load <3 x i8>, ptr addrspace(1) %gep1
48  %gep2 = getelementptr <3 x i8>, ptr addrspace(1) %src2, i32 %idx
49  %vec2 = load <3 x i8>, ptr addrspace(1) %gep2
50  %cmp = icmp ult i32 %idx, 15
51  br i1 %cmp, label %bb.1, label %bb.2
  ; %bb.1 contains only a branch; it gives the phi below a second incoming edge.
52bb.1:
53  br label %bb.2
54
55bb.2:
  ; Cross-block use of the i8 vectors: %vec1 from %entry, %vec2 via %bb.1.
56  %tmp5 = phi <3 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
57  store <3 x i8> %tmp5, ptr addrspace(1) %dst, align 4
58  ret void
59}
60
; v4i8_liveout: same control-flow shape as the other *_liveout kernels, with a
; <4 x i8> vector. The value is loaded in %entry and merged by a phi in %bb.2
; (%vec1 from %entry, %vec2 via the empty %bb.1) before being stored to %dst.
; The expected output loads and stores the whole vector with single dword
; instructions, with no per-byte unpacking between the load and the store.
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
61define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
62; GFX906-LABEL: v4i8_liveout:
63; GFX906:       ; %bb.0: ; %entry
64; GFX906-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
65; GFX906-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
66; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
67; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
68; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
69; GFX906-NEXT:    global_load_dword v1, v2, s[0:1]
70; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
71; GFX906-NEXT:    s_cbranch_execz .LBB1_2
72; GFX906-NEXT:  ; %bb.1: ; %bb.1
73; GFX906-NEXT:    global_load_dword v1, v2, s[2:3]
74; GFX906-NEXT:  .LBB1_2: ; %bb.2
75; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
76; GFX906-NEXT:    v_mov_b32_e32 v0, 0
77; GFX906-NEXT:    s_waitcnt vmcnt(0)
78; GFX906-NEXT:    global_store_dword v0, v1, s[6:7]
79; GFX906-NEXT:    s_endpgm
80entry:
  ; Both source buffers are indexed by the workitem id, one <4 x i8> element
  ; per workitem.
81  %idx = call i32 @llvm.amdgcn.workitem.id.x()
82  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
83  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
84  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
85  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
86  %cmp = icmp ult i32 %idx, 15
87  br i1 %cmp, label %bb.1, label %bb.2
  ; %bb.1 contains only a branch; it gives the phi below a second incoming edge.
88bb.1:
89  br label %bb.2
90
91bb.2:
  ; Cross-block use of the i8 vectors: %vec1 from %entry, %vec2 via %bb.1.
92  %tmp5 = phi <4 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
93  store <4 x i8> %tmp5, ptr addrspace(1) %dst, align 4
94  ret void
95}
96
; v5i8_liveout: liveout test for an odd-sized <5 x i8> vector. The value is
; loaded in %entry and merged by a phi in %bb.2 (%vec1 from %entry, %vec2 via
; the empty %bb.1) before being stored to %dst.
; The expected output fetches the vector with a dwordx2 load, masks the second
; dword with 0xff, and writes the result back as five separate byte stores.
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
97define amdgpu_kernel void @v5i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
98; GFX906-LABEL: v5i8_liveout:
99; GFX906:       ; %bb.0: ; %entry
100; GFX906-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
101; GFX906-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
102; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
103; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
104; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
105; GFX906-NEXT:    global_load_dwordx2 v[1:2], v3, s[0:1]
106; GFX906-NEXT:    s_waitcnt vmcnt(0)
107; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v2
108; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
109; GFX906-NEXT:    s_cbranch_execz .LBB2_2
110; GFX906-NEXT:  ; %bb.1: ; %bb.1
111; GFX906-NEXT:    global_load_dwordx2 v[1:2], v3, s[2:3]
112; GFX906-NEXT:    s_waitcnt vmcnt(0)
113; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v2
114; GFX906-NEXT:  .LBB2_2: ; %bb.2
115; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
116; GFX906-NEXT:    v_mov_b32_e32 v4, 0
117; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
118; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
119; GFX906-NEXT:    global_store_byte v4, v1, s[6:7]
120; GFX906-NEXT:    global_store_byte v4, v0, s[6:7] offset:1
121; GFX906-NEXT:    global_store_byte_d16_hi v4, v1, s[6:7] offset:2
122; GFX906-NEXT:    global_store_byte v4, v3, s[6:7] offset:3
123; GFX906-NEXT:    global_store_byte v4, v2, s[6:7] offset:4
124; GFX906-NEXT:    s_endpgm
125entry:
  ; Both source buffers are indexed by the workitem id, one <5 x i8> element
  ; per workitem.
126  %idx = call i32 @llvm.amdgcn.workitem.id.x()
127  %gep1 = getelementptr <5 x i8>, ptr addrspace(1) %src1, i32 %idx
128  %vec1 = load <5 x i8>, ptr addrspace(1) %gep1
129  %gep2 = getelementptr <5 x i8>, ptr addrspace(1) %src2, i32 %idx
130  %vec2 = load <5 x i8>, ptr addrspace(1) %gep2
131  %cmp = icmp ult i32 %idx, 15
132  br i1 %cmp, label %bb.1, label %bb.2
  ; %bb.1 contains only a branch; it gives the phi below a second incoming edge.
133bb.1:
134  br label %bb.2
135
136bb.2:
  ; Cross-block use of the i8 vectors: %vec1 from %entry, %vec2 via %bb.1.
137  %tmp5 = phi <5 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
138  store <5 x i8> %tmp5, ptr addrspace(1) %dst, align 4
139  ret void
140}
141
; v8i8_liveout: liveout test for an <8 x i8> vector. The value is loaded in
; %entry and merged by a phi in %bb.2 (%vec1 from %entry, %vec2 via the empty
; %bb.1) before being stored to %dst.
; The expected output moves the vector with dwordx2 loads and a single dwordx2
; store, with no per-byte unpacking in between.
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
142define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
143; GFX906-LABEL: v8i8_liveout:
144; GFX906:       ; %bb.0: ; %entry
145; GFX906-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
146; GFX906-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
147; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
148; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
149; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
150; GFX906-NEXT:    global_load_dwordx2 v[1:2], v3, s[0:1]
151; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
152; GFX906-NEXT:    s_cbranch_execz .LBB3_2
153; GFX906-NEXT:  ; %bb.1: ; %bb.1
154; GFX906-NEXT:    global_load_dwordx2 v[1:2], v3, s[2:3]
155; GFX906-NEXT:  .LBB3_2: ; %bb.2
156; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
157; GFX906-NEXT:    v_mov_b32_e32 v0, 0
158; GFX906-NEXT:    s_waitcnt vmcnt(0)
159; GFX906-NEXT:    global_store_dwordx2 v0, v[1:2], s[6:7]
160; GFX906-NEXT:    s_endpgm
161entry:
  ; Both source buffers are indexed by the workitem id, one <8 x i8> element
  ; per workitem.
162  %idx = call i32 @llvm.amdgcn.workitem.id.x()
163  %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
164  %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
165  %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
166  %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
167  %cmp = icmp ult i32 %idx, 15
168  br i1 %cmp, label %bb.1, label %bb.2
  ; %bb.1 contains only a branch; it gives the phi below a second incoming edge.
169bb.1:
170  br label %bb.2
171
172bb.2:
  ; Cross-block use of the i8 vectors: %vec1 from %entry, %vec2 via %bb.1.
173  %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
174  store <8 x i8> %tmp5, ptr addrspace(1) %dst, align 4
175  ret void
176}
177
; v16i8_liveout: liveout test for a <16 x i8> vector. The value is loaded in
; %entry and merged by a phi in %bb.2 (%vec1 from %entry, %vec2 via the empty
; %bb.1) before being stored to %dst.
; The expected output moves the vector with dwordx4 loads and a single dwordx4
; store, with no per-byte unpacking in between.
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
178define amdgpu_kernel void @v16i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
179; GFX906-LABEL: v16i8_liveout:
180; GFX906:       ; %bb.0: ; %entry
181; GFX906-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
182; GFX906-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
183; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 4, v0
184; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
185; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
186; GFX906-NEXT:    global_load_dwordx4 v[1:4], v5, s[0:1]
187; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
188; GFX906-NEXT:    s_cbranch_execz .LBB4_2
189; GFX906-NEXT:  ; %bb.1: ; %bb.1
190; GFX906-NEXT:    global_load_dwordx4 v[1:4], v5, s[2:3]
191; GFX906-NEXT:  .LBB4_2: ; %bb.2
192; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
193; GFX906-NEXT:    v_mov_b32_e32 v0, 0
194; GFX906-NEXT:    s_waitcnt vmcnt(0)
195; GFX906-NEXT:    global_store_dwordx4 v0, v[1:4], s[6:7]
196; GFX906-NEXT:    s_endpgm
197entry:
  ; Both source buffers are indexed by the workitem id, one <16 x i8> element
  ; per workitem.
198  %idx = call i32 @llvm.amdgcn.workitem.id.x()
199  %gep1 = getelementptr <16 x i8>, ptr addrspace(1) %src1, i32 %idx
200  %vec1 = load <16 x i8>, ptr addrspace(1) %gep1
201  %gep2 = getelementptr <16 x i8>, ptr addrspace(1) %src2, i32 %idx
202  %vec2 = load <16 x i8>, ptr addrspace(1) %gep2
203  %cmp = icmp ult i32 %idx, 15
204  br i1 %cmp, label %bb.1, label %bb.2
  ; %bb.1 contains only a branch; it gives the phi below a second incoming edge.
205bb.1:
206  br label %bb.2
207
208bb.2:
  ; Cross-block use of the i8 vectors: %vec1 from %entry, %vec2 via %bb.1.
209  %tmp5 = phi <16 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
210  store <16 x i8> %tmp5, ptr addrspace(1) %dst, align 4
211  ret void
212}
213
; v32i8_liveout: liveout test for a <32 x i8> vector. The value is loaded in
; %entry and merged by a phi in %bb.2 (%vec1 from %entry, %vec2 via the empty
; %bb.1) before being stored to %dst.
; The expected output splits the vector into two dwordx4 halves (offsets 0 and
; 16) for both the loads and the stores.
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
214define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
215; GFX906-LABEL: v32i8_liveout:
216; GFX906:       ; %bb.0: ; %entry
217; GFX906-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
218; GFX906-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
219; GFX906-NEXT:    v_lshlrev_b32_e32 v9, 5, v0
220; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
221; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
222; GFX906-NEXT:    global_load_dwordx4 v[1:4], v9, s[0:1]
223; GFX906-NEXT:    global_load_dwordx4 v[5:8], v9, s[0:1] offset:16
224; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
225; GFX906-NEXT:    s_cbranch_execz .LBB5_2
226; GFX906-NEXT:  ; %bb.1: ; %bb.1
227; GFX906-NEXT:    global_load_dwordx4 v[1:4], v9, s[2:3]
228; GFX906-NEXT:    global_load_dwordx4 v[5:8], v9, s[2:3] offset:16
229; GFX906-NEXT:  .LBB5_2: ; %bb.2
230; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
231; GFX906-NEXT:    v_mov_b32_e32 v0, 0
232; GFX906-NEXT:    s_waitcnt vmcnt(1)
233; GFX906-NEXT:    global_store_dwordx4 v0, v[1:4], s[6:7]
234; GFX906-NEXT:    s_waitcnt vmcnt(1)
235; GFX906-NEXT:    global_store_dwordx4 v0, v[5:8], s[6:7] offset:16
236; GFX906-NEXT:    s_endpgm
237entry:
  ; Both source buffers are indexed by the workitem id, one <32 x i8> element
  ; per workitem.
238  %idx = call i32 @llvm.amdgcn.workitem.id.x()
239  %gep1 = getelementptr <32 x i8>, ptr addrspace(1) %src1, i32 %idx
240  %vec1 = load <32 x i8>, ptr addrspace(1) %gep1
241  %gep2 = getelementptr <32 x i8>, ptr addrspace(1) %src2, i32 %idx
242  %vec2 = load <32 x i8>, ptr addrspace(1) %gep2
243  %cmp = icmp ult i32 %idx, 15
244  br i1 %cmp, label %bb.1, label %bb.2
  ; %bb.1 contains only a branch; it gives the phi below a second incoming edge.
245bb.1:
246  br label %bb.2
247
248bb.2:
  ; Cross-block use of the i8 vectors: %vec1 from %entry, %vec2 via %bb.1.
249  %tmp5 = phi <32 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
250  store <32 x i8> %tmp5, ptr addrspace(1) %dst, align 4
251  ret void
252}
253
; v256i8_liveout: liveout test for a very wide <256 x i8> vector. The value is
; loaded in %entry and merged by a phi in %bb.2 (%vec1 from %entry, %vec2 via
; the empty %bb.1) before being stored to %dst.
; The expected output moves the vector as sixteen dwordx4 pieces and contains
; scratch traffic (buffer_store_dword / buffer_load_dword marked "Folded
; Spill" / "Folded Reload"), i.e. the value does not fit in the registers the
; allocator used here.
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
254define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
255; GFX906-LABEL: v256i8_liveout:
256; GFX906:       ; %bb.0: ; %entry
257; GFX906-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
258; GFX906-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
259; GFX906-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
260; GFX906-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
261; GFX906-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
262; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
263; GFX906-NEXT:    global_load_dwordx4 v[5:8], v4, s[0:1]
264; GFX906-NEXT:    s_mov_b32 s14, -1
265; GFX906-NEXT:    s_mov_b32 s15, 0xe00000
266; GFX906-NEXT:    s_add_u32 s12, s12, s11
267; GFX906-NEXT:    s_addc_u32 s13, s13, 0
268; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
269; GFX906-NEXT:    s_waitcnt vmcnt(0)
270; GFX906-NEXT:    buffer_store_dword v5, off, s[12:15], 0 ; 4-byte Folded Spill
271; GFX906-NEXT:    s_nop 0
272; GFX906-NEXT:    buffer_store_dword v6, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
273; GFX906-NEXT:    buffer_store_dword v7, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
274; GFX906-NEXT:    buffer_store_dword v8, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
275; GFX906-NEXT:    global_load_dwordx4 v[5:8], v4, s[0:1] offset:16
276; GFX906-NEXT:    s_nop 0
277; GFX906-NEXT:    global_load_dwordx4 v[9:12], v4, s[0:1] offset:32
278; GFX906-NEXT:    global_load_dwordx4 v[13:16], v4, s[0:1] offset:48
279; GFX906-NEXT:    global_load_dwordx4 v[17:20], v4, s[0:1] offset:64
280; GFX906-NEXT:    global_load_dwordx4 v[21:24], v4, s[0:1] offset:80
281; GFX906-NEXT:    global_load_dwordx4 v[25:28], v4, s[0:1] offset:96
282; GFX906-NEXT:    global_load_dwordx4 v[29:32], v4, s[0:1] offset:112
283; GFX906-NEXT:    global_load_dwordx4 v[33:36], v4, s[0:1] offset:128
284; GFX906-NEXT:    global_load_dwordx4 v[37:40], v4, s[0:1] offset:144
285; GFX906-NEXT:    global_load_dwordx4 v[41:44], v4, s[0:1] offset:160
286; GFX906-NEXT:    global_load_dwordx4 v[45:48], v4, s[0:1] offset:176
287; GFX906-NEXT:    global_load_dwordx4 v[49:52], v4, s[0:1] offset:192
288; GFX906-NEXT:    global_load_dwordx4 v[53:56], v4, s[0:1] offset:208
289; GFX906-NEXT:    global_load_dwordx4 v[57:60], v4, s[0:1] offset:224
290; GFX906-NEXT:    global_load_dwordx4 v[0:3], v4, s[0:1] offset:240
291; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
292; GFX906-NEXT:    s_cbranch_execz .LBB6_2
293; GFX906-NEXT:  ; %bb.1: ; %bb.1
294; GFX906-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3]
295; GFX906-NEXT:    s_waitcnt vmcnt(0)
296; GFX906-NEXT:    buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
297; GFX906-NEXT:    s_nop 0
298; GFX906-NEXT:    buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
299; GFX906-NEXT:    buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
300; GFX906-NEXT:    buffer_store_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
301; GFX906-NEXT:    global_load_dwordx4 v[5:8], v4, s[2:3] offset:16
302; GFX906-NEXT:    global_load_dwordx4 v[9:12], v4, s[2:3] offset:32
303; GFX906-NEXT:    global_load_dwordx4 v[13:16], v4, s[2:3] offset:48
304; GFX906-NEXT:    global_load_dwordx4 v[17:20], v4, s[2:3] offset:64
305; GFX906-NEXT:    global_load_dwordx4 v[21:24], v4, s[2:3] offset:80
306; GFX906-NEXT:    global_load_dwordx4 v[25:28], v4, s[2:3] offset:96
307; GFX906-NEXT:    global_load_dwordx4 v[29:32], v4, s[2:3] offset:112
308; GFX906-NEXT:    global_load_dwordx4 v[33:36], v4, s[2:3] offset:128
309; GFX906-NEXT:    global_load_dwordx4 v[37:40], v4, s[2:3] offset:144
310; GFX906-NEXT:    global_load_dwordx4 v[41:44], v4, s[2:3] offset:160
311; GFX906-NEXT:    global_load_dwordx4 v[45:48], v4, s[2:3] offset:176
312; GFX906-NEXT:    global_load_dwordx4 v[49:52], v4, s[2:3] offset:192
313; GFX906-NEXT:    global_load_dwordx4 v[53:56], v4, s[2:3] offset:208
314; GFX906-NEXT:    global_load_dwordx4 v[57:60], v4, s[2:3] offset:224
315; GFX906-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3] offset:240
316; GFX906-NEXT:  .LBB6_2: ; %bb.2
317; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
318; GFX906-NEXT:    s_waitcnt vmcnt(0)
319; GFX906-NEXT:    buffer_store_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
320; GFX906-NEXT:    s_nop 0
321; GFX906-NEXT:    buffer_store_dword v1, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
322; GFX906-NEXT:    buffer_store_dword v2, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
323; GFX906-NEXT:    buffer_store_dword v3, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
324; GFX906-NEXT:    v_mov_b32_e32 v0, v57
325; GFX906-NEXT:    v_mov_b32_e32 v1, v58
326; GFX906-NEXT:    v_mov_b32_e32 v2, v59
327; GFX906-NEXT:    v_mov_b32_e32 v3, v60
328; GFX906-NEXT:    v_mov_b32_e32 v60, v56
329; GFX906-NEXT:    v_mov_b32_e32 v59, v55
330; GFX906-NEXT:    v_mov_b32_e32 v58, v54
331; GFX906-NEXT:    v_mov_b32_e32 v57, v53
332; GFX906-NEXT:    v_mov_b32_e32 v56, v52
333; GFX906-NEXT:    v_mov_b32_e32 v55, v51
334; GFX906-NEXT:    v_mov_b32_e32 v54, v50
335; GFX906-NEXT:    v_mov_b32_e32 v53, v49
336; GFX906-NEXT:    v_mov_b32_e32 v52, v48
337; GFX906-NEXT:    v_mov_b32_e32 v51, v47
338; GFX906-NEXT:    v_mov_b32_e32 v50, v46
339; GFX906-NEXT:    v_mov_b32_e32 v49, v45
340; GFX906-NEXT:    v_mov_b32_e32 v48, v44
341; GFX906-NEXT:    v_mov_b32_e32 v47, v43
342; GFX906-NEXT:    v_mov_b32_e32 v46, v42
343; GFX906-NEXT:    v_mov_b32_e32 v45, v41
344; GFX906-NEXT:    v_mov_b32_e32 v44, v40
345; GFX906-NEXT:    v_mov_b32_e32 v43, v39
346; GFX906-NEXT:    v_mov_b32_e32 v42, v38
347; GFX906-NEXT:    v_mov_b32_e32 v41, v37
348; GFX906-NEXT:    v_mov_b32_e32 v40, v36
349; GFX906-NEXT:    v_mov_b32_e32 v39, v35
350; GFX906-NEXT:    v_mov_b32_e32 v38, v34
351; GFX906-NEXT:    v_mov_b32_e32 v37, v33
352; GFX906-NEXT:    v_mov_b32_e32 v36, v32
353; GFX906-NEXT:    v_mov_b32_e32 v35, v31
354; GFX906-NEXT:    v_mov_b32_e32 v34, v30
355; GFX906-NEXT:    v_mov_b32_e32 v33, v29
356; GFX906-NEXT:    v_mov_b32_e32 v32, v28
357; GFX906-NEXT:    v_mov_b32_e32 v31, v27
358; GFX906-NEXT:    v_mov_b32_e32 v30, v26
359; GFX906-NEXT:    v_mov_b32_e32 v29, v25
360; GFX906-NEXT:    v_mov_b32_e32 v28, v24
361; GFX906-NEXT:    v_mov_b32_e32 v27, v23
362; GFX906-NEXT:    v_mov_b32_e32 v26, v22
363; GFX906-NEXT:    v_mov_b32_e32 v25, v21
364; GFX906-NEXT:    v_mov_b32_e32 v24, v20
365; GFX906-NEXT:    v_mov_b32_e32 v23, v19
366; GFX906-NEXT:    v_mov_b32_e32 v22, v18
367; GFX906-NEXT:    v_mov_b32_e32 v21, v17
368; GFX906-NEXT:    v_mov_b32_e32 v20, v16
369; GFX906-NEXT:    v_mov_b32_e32 v19, v15
370; GFX906-NEXT:    v_mov_b32_e32 v18, v14
371; GFX906-NEXT:    v_mov_b32_e32 v17, v13
372; GFX906-NEXT:    v_mov_b32_e32 v16, v12
373; GFX906-NEXT:    v_mov_b32_e32 v15, v11
374; GFX906-NEXT:    v_mov_b32_e32 v14, v10
375; GFX906-NEXT:    v_mov_b32_e32 v13, v9
376; GFX906-NEXT:    v_mov_b32_e32 v12, v8
377; GFX906-NEXT:    v_mov_b32_e32 v11, v7
378; GFX906-NEXT:    v_mov_b32_e32 v10, v6
379; GFX906-NEXT:    v_mov_b32_e32 v9, v5
380; GFX906-NEXT:    buffer_load_dword v5, off, s[12:15], 0 ; 4-byte Folded Reload
381; GFX906-NEXT:    buffer_load_dword v6, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
382; GFX906-NEXT:    buffer_load_dword v7, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
383; GFX906-NEXT:    buffer_load_dword v8, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
384; GFX906-NEXT:    v_mov_b32_e32 v4, 0
385; GFX906-NEXT:    s_waitcnt vmcnt(0)
386; GFX906-NEXT:    global_store_dwordx4 v4, v[5:8], s[6:7]
387; GFX906-NEXT:    global_store_dwordx4 v4, v[9:12], s[6:7] offset:16
388; GFX906-NEXT:    global_store_dwordx4 v4, v[13:16], s[6:7] offset:32
389; GFX906-NEXT:    global_store_dwordx4 v4, v[17:20], s[6:7] offset:48
390; GFX906-NEXT:    global_store_dwordx4 v4, v[21:24], s[6:7] offset:64
391; GFX906-NEXT:    global_store_dwordx4 v4, v[25:28], s[6:7] offset:80
392; GFX906-NEXT:    global_store_dwordx4 v4, v[29:32], s[6:7] offset:96
393; GFX906-NEXT:    global_store_dwordx4 v4, v[33:36], s[6:7] offset:112
394; GFX906-NEXT:    global_store_dwordx4 v4, v[37:40], s[6:7] offset:128
395; GFX906-NEXT:    global_store_dwordx4 v4, v[41:44], s[6:7] offset:144
396; GFX906-NEXT:    global_store_dwordx4 v4, v[45:48], s[6:7] offset:160
397; GFX906-NEXT:    global_store_dwordx4 v4, v[49:52], s[6:7] offset:176
398; GFX906-NEXT:    global_store_dwordx4 v4, v[53:56], s[6:7] offset:192
399; GFX906-NEXT:    global_store_dwordx4 v4, v[57:60], s[6:7] offset:208
400; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7] offset:224
401; GFX906-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
402; GFX906-NEXT:    s_nop 0
403; GFX906-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
404; GFX906-NEXT:    buffer_load_dword v2, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
405; GFX906-NEXT:    buffer_load_dword v3, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload
406; GFX906-NEXT:    s_waitcnt vmcnt(0)
407; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7] offset:240
408; GFX906-NEXT:    s_endpgm
409entry:
  ; NOTE(review): the GEPs below step by <8 x i8> while the loads read
  ; <256 x i8>; the other *_liveout kernels use the same vector type for both.
  ; This matches the shift-by-3 in the expected output above. Possibly a
  ; copy-paste leftover -- confirm intent before changing, and regenerate the
  ; assertions if the GEP type is altered.
410  %idx = call i32 @llvm.amdgcn.workitem.id.x()
411  %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
412  %vec1 = load <256 x i8>, ptr addrspace(1) %gep1
413  %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
414  %vec2 = load <256 x i8>, ptr addrspace(1) %gep2
415  %cmp = icmp ult i32 %idx, 15
416  br i1 %cmp, label %bb.1, label %bb.2
  ; %bb.1 contains only a branch; it gives the phi below a second incoming edge.
417bb.1:
418  br label %bb.2
419
420bb.2:
  ; Cross-block use of the i8 vectors: %vec1 from %entry, %vec2 via %bb.1.
421  %tmp5 = phi <256 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
422  store <256 x i8> %tmp5, ptr addrspace(1) %dst, align 4
423  ret void
424}
425
426
; repeat_successor: the switch in %entry names %return.sink.split as the
; successor for two different case values (1 and 2), so the phi in that block
; lists %entry twice with the same incoming value %vec1. Case 3 reaches the
; phi through %sw.bb5 with %vec2; every other value of %in goes to %return,
; which stores nothing.
; In the expected output the switch appears as compare-and-branch blocks
; (%LeafBlock, %LeafBlock5) and each global load sits on the path that uses it.
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
427define amdgpu_kernel void @repeat_successor(i32 %in, ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
428; GFX906-LABEL: repeat_successor:
429; GFX906:       ; %bb.0: ; %entry
430; GFX906-NEXT:    s_load_dword s6, s[4:5], 0x24
431; GFX906-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
432; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
433; GFX906-NEXT:    s_cmp_lt_i32 s6, 3
434; GFX906-NEXT:    s_cbranch_scc0 .LBB7_3
435; GFX906-NEXT:  ; %bb.1: ; %LeafBlock
436; GFX906-NEXT:    s_cmp_ge_i32 s6, 1
437; GFX906-NEXT:    s_cbranch_scc0 .LBB7_6
438; GFX906-NEXT:  ; %bb.2:
439; GFX906-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
440; GFX906-NEXT:    global_load_dword v0, v0, s[0:1]
441; GFX906-NEXT:    s_branch .LBB7_5
442; GFX906-NEXT:  .LBB7_3: ; %LeafBlock5
443; GFX906-NEXT:    s_cmp_eq_u32 s6, 3
444; GFX906-NEXT:    s_cbranch_scc0 .LBB7_6
445; GFX906-NEXT:  ; %bb.4: ; %sw.bb5
446; GFX906-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
447; GFX906-NEXT:    global_load_dword v0, v0, s[2:3]
448; GFX906-NEXT:  .LBB7_5: ; %return.sink.split
449; GFX906-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x3c
450; GFX906-NEXT:    v_mov_b32_e32 v1, 0
451; GFX906-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
452; GFX906-NEXT:    global_store_dword v1, v0, s[0:1]
453; GFX906-NEXT:  .LBB7_6: ; %return
454; GFX906-NEXT:    s_endpgm
455entry:
  ; Both <4 x i8> loads are written in %entry, ahead of the switch.
456  %idx = call i32 @llvm.amdgcn.workitem.id.x()
457  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
458  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
459  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
460  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
  ; Cases 1 and 2 share a successor: this is the "repeated successor" under test.
461  switch i32 %in, label %return [
462    i32 1, label %return.sink.split
463    i32 2, label %return.sink.split
464    i32 3, label %sw.bb5
465  ]
466
467sw.bb5:
468  br label %return.sink.split
469
470return.sink.split:
  ; One phi entry per incoming edge, hence %entry appears twice.
471  %tmp5 = phi <4 x i8> [ %vec2, %sw.bb5 ], [ %vec1, %entry ], [ %vec1, %entry ]
472  store <4 x i8> %tmp5, ptr addrspace(1) %dst, align 4
473  ret void
474
475return:
476  ret void
477}
478
; v8i8_phi_chain: an <8 x i8> value flows through two chained phis. %tmp5 in
; %bb.2 merges %vec1 (from %entry) and %vec2 (from %bb.1); %tmp7 in %bb.3 then
; merges %vec2 (from %bb.1) with %tmp5 (from %bb.2), so one phi feeds another.
; %bb.2 stores to %dst0 and %bb.3 stores to %dst1.
; The expected output keeps the value in one register pair (v[1:2]) for both
; dwordx2 stores.
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
479define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) {
480; GFX906-LABEL: v8i8_phi_chain:
481; GFX906:       ; %bb.0: ; %entry
482; GFX906-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
483; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
484; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
485; GFX906-NEXT:    s_xor_b64 s[0:1], vcc, -1
486; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
487; GFX906-NEXT:    global_load_dwordx2 v[1:2], v3, s[8:9]
488; GFX906-NEXT:    s_and_saveexec_b64 s[2:3], vcc
489; GFX906-NEXT:    s_cbranch_execz .LBB8_2
490; GFX906-NEXT:  ; %bb.1: ; %bb.1
491; GFX906-NEXT:    global_load_dwordx2 v[1:2], v3, s[10:11]
492; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 7, v0
493; GFX906-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
494; GFX906-NEXT:    s_and_b64 s[4:5], exec, vcc
495; GFX906-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
496; GFX906-NEXT:  .LBB8_2: ; %Flow
497; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
498; GFX906-NEXT:    s_and_saveexec_b64 s[2:3], s[0:1]
499; GFX906-NEXT:    s_cbranch_execz .LBB8_4
500; GFX906-NEXT:  ; %bb.3: ; %bb.2
501; GFX906-NEXT:    v_mov_b32_e32 v0, 0
502; GFX906-NEXT:    s_waitcnt vmcnt(0)
503; GFX906-NEXT:    global_store_dwordx2 v0, v[1:2], s[12:13]
504; GFX906-NEXT:  .LBB8_4: ; %bb.3
505; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
506; GFX906-NEXT:    v_mov_b32_e32 v0, 0
507; GFX906-NEXT:    s_waitcnt vmcnt(0)
508; GFX906-NEXT:    global_store_dwordx2 v0, v[1:2], s[14:15]
509; GFX906-NEXT:    s_endpgm
510entry:
  ; Both <8 x i8> loads are written in %entry.
511  %idx = call i32 @llvm.amdgcn.workitem.id.x()
512  %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
513  %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
514  %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
515  %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
516  %cmp = icmp ult i32 %idx, 15
517  br i1 %cmp, label %bb.1, label %bb.2
518bb.1:
  ; Second split: idx < 7 goes through %bb.2, otherwise straight to %bb.3.
519  %cmp2 = icmp ult i32 %idx, 7
520  br i1 %cmp2, label %bb.2, label %bb.3
521
522bb.2:
  ; First link of the chain.
523  %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
524  store <8 x i8> %tmp5, ptr addrspace(1) %dst0, align 4
525  br label %bb.3
526
527bb.3:
  ; Second link: consumes the previous phi on the edge from %bb.2.
528  %tmp7 = phi <8 x i8> [ %vec2, %bb.1], [%tmp5, %bb.2]
529  store <8 x i8> %tmp7, ptr addrspace(1) %dst1, align 4
530  ret void
531}
532
; v8i8_multi_block: the <8 x i8> value %vec1 defined in %entry has two uses in
; other blocks: a plain store in %bb.2 (reached via %bb.1) and the phi in
; %bb.3. The phi takes %vec1 on the edge from %entry and %vec2 on the edges
; from both %bb.1 and %bb.2.
; In the expected output %vec1 stays in v[3:4] for the %bb.2 store while the
; phi result lives in v[1:2].
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
533define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) {
534; GFX906-LABEL: v8i8_multi_block:
535; GFX906:       ; %bb.0: ; %entry
536; GFX906-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
537; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 3, v0
538; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
539; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
540; GFX906-NEXT:    global_load_dwordx2 v[3:4], v5, s[8:9]
541; GFX906-NEXT:    s_waitcnt vmcnt(0)
542; GFX906-NEXT:    v_mov_b32_e32 v1, v3
543; GFX906-NEXT:    v_mov_b32_e32 v2, v4
544; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
545; GFX906-NEXT:    s_cbranch_execz .LBB9_4
546; GFX906-NEXT:  ; %bb.1: ; %bb.1
547; GFX906-NEXT:    global_load_dwordx2 v[1:2], v5, s[10:11]
548; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 7, v0
549; GFX906-NEXT:    s_and_saveexec_b64 s[2:3], vcc
550; GFX906-NEXT:    s_cbranch_execz .LBB9_3
551; GFX906-NEXT:  ; %bb.2: ; %bb.2
552; GFX906-NEXT:    v_mov_b32_e32 v0, 0
553; GFX906-NEXT:    global_store_dwordx2 v0, v[3:4], s[12:13]
554; GFX906-NEXT:  .LBB9_3: ; %Flow
555; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
556; GFX906-NEXT:  .LBB9_4: ; %bb.3
557; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
558; GFX906-NEXT:    v_mov_b32_e32 v0, 0
559; GFX906-NEXT:    s_waitcnt vmcnt(0)
560; GFX906-NEXT:    global_store_dwordx2 v0, v[1:2], s[14:15]
561; GFX906-NEXT:    s_endpgm
562entry:
  ; Both <8 x i8> loads are written in %entry.
563  %idx = call i32 @llvm.amdgcn.workitem.id.x()
564  %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
565  %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
566  %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
567  %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
568  %cmp = icmp ult i32 %idx, 15
569  br i1 %cmp, label %bb.1, label %bb.3
570bb.1:
571  %cmp2 = icmp ult i32 %idx, 7
572  br i1 %cmp2, label %bb.2, label %bb.3
573
574bb.2:
  ; Non-phi use of %vec1 two blocks away from its definition.
575  store <8 x i8> %vec1, ptr addrspace(1) %dst0, align 4
576  br label %bb.3
577
578bb.3:
  ; Three incoming edges; %vec2 is the value on both the %bb.1 and %bb.2 edges.
579  %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ], [ %vec2, %bb.2]
580  store <8 x i8> %tmp5, ptr addrspace(1) %dst1, align 4
581  ret void
582}
583
; v32i8_loop_carried: an i8 vector carried around a loop. %temp in %bb.1 is a
; phi whose back-edge value %vec2 is computed from %temp itself by a
; shufflevector, and %vec2 is also used after the loop by the store in %bb.2.
; NOTE(review): despite the "v32i8" name, the loaded, shuffled and stored type
; is <4 x i8>; only the GEP uses <32 x i8>. Confirm whether that is intended.
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
584define amdgpu_kernel void @v32i8_loop_carried(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
585; GFX906-LABEL: v32i8_loop_carried:
586; GFX906:       ; %bb.0: ; %entry
587; GFX906-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
588; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 5, v0
589; GFX906-NEXT:    v_mov_b32_e32 v3, 8
590; GFX906-NEXT:    v_mov_b32_e32 v2, 0xff
591; GFX906-NEXT:    v_cmp_le_u32_e32 vcc, 15, v0
592; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
593; GFX906-NEXT:    global_load_dword v1, v1, s[0:1]
594; GFX906-NEXT:    s_mov_b64 s[0:1], 0
595; GFX906-NEXT:    s_waitcnt vmcnt(0)
596; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
597; GFX906-NEXT:    v_and_or_b32 v0, v1, v2, v0
598; GFX906-NEXT:    v_mov_b32_e32 v2, 24
599; GFX906-NEXT:  .LBB10_1: ; %bb.1
600; GFX906-NEXT:    ; =>This Inner Loop Header: Depth=1
601; GFX906-NEXT:    v_and_b32_e32 v3, 0xff, v1
602; GFX906-NEXT:    s_and_b64 s[2:3], exec, vcc
603; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
604; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
605; GFX906-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
606; GFX906-NEXT:    v_or3_b32 v1, v0, v3, v1
607; GFX906-NEXT:    s_andn2_b64 exec, exec, s[0:1]
608; GFX906-NEXT:    s_cbranch_execnz .LBB10_1
609; GFX906-NEXT:  ; %bb.2: ; %bb.2.loopexit
610; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
611; GFX906-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
612; GFX906-NEXT:    v_mov_b32_e32 v0, 0
613; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
614; GFX906-NEXT:    global_store_dword v0, v1, s[0:1]
615; GFX906-NEXT:    s_endpgm
616entry:
  ; %src2 is not referenced anywhere in this body.
617  %idx = call i32 @llvm.amdgcn.workitem.id.x()
618  %gep1 = getelementptr <32 x i8>, ptr addrspace(1) %src1, i32 %idx
619  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
620  br label %bb.1
621
622bb.1:
  ; Loop-carried value: starts as %vec1, then takes the previous iteration's
  ; shuffle result.
623  %temp = phi <4 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
  ; Mask indices 0 and 2 pick from %vec1; 4 and 6 pick elements 0 and 2 of %temp.
624  %vec2 = shufflevector <4 x i8> %vec1, <4 x i8> %temp, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  ; The back-edge condition depends only on %idx, so it does not change
  ; between iterations.
625  %cmp = icmp ult i32 %idx, 15
626  br i1 %cmp, label %bb.1, label %bb.2
  ; NOTE(review): the branch below follows the block's terminator, so nothing
  ; can reach it. It looks like a leftover, but the "%bb.2.loopexit" block name
  ; in the assertions above may depend on it -- regenerate the assertions if
  ; it is removed.
627  br label %bb.2
628
629bb.2:
630  store <4 x i8> %vec2, ptr addrspace(1) %dst, align 4
631  ret void
632}
633
634
; Intrinsic declaration: the kernels in this test call it to obtain the i32
; %idx used to index the source buffers and to drive the branch conditions.
635declare i32 @llvm.amdgcn.workitem.id.x()
636
637