; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 -verify-machineinstrs -o - %s | FileCheck %s

; Make sure the waterfall loop does not fail the verifier after regalloc fast
;
; FIXME: There are a lot of extra spills that aren't needed. This is because the unmerge_merge
;        combine runs after RegBankSelect and inserts a lot of COPY instructions, while the
;        original merge instruction (G_BUILD_VECTOR) stays because it has more than one use.
;        Those spills are not present when optimizations are enabled.
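;
; A rough sketch of the pre-combine pattern the FIXME refers to (hypothetical generic MIR for
; illustration only, not taken from this test; register names and banks are made up):
;
;   %rsrc:vgpr(<8 x s32>) = G_BUILD_VECTOR %v0:vgpr(s32), %v1:vgpr(s32), ..., %v7:vgpr(s32)
;   %e0:vgpr(s32), %e1:vgpr(s32), ..., %e7:vgpr(s32) = G_UNMERGE_VALUES %rsrc:vgpr(<8 x s32>)
;
; The combine replaces the G_UNMERGE_VALUES results with COPYs of the G_BUILD_VECTOR sources,
; but the G_BUILD_VECTOR itself is kept for its remaining use, which is where the extra values
; that fast regalloc ends up spilling come from.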
define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
; CHECK-LABEL: waterfall_loop:
; CHECK:       ; %bb.0: ; %bb
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_xor_saveexec_b32 s4, -1
; CHECK-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; CHECK-NEXT:    s_mov_b32 exec_lo, s4
; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; CHECK-NEXT:    v_mov_b32_e32 v14, v1
; CHECK-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; CHECK-NEXT:    v_mov_b32_e32 v13, v2
; CHECK-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; CHECK-NEXT:    v_mov_b32_e32 v12, v3
; CHECK-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; CHECK-NEXT:    v_mov_b32_e32 v11, v4
; CHECK-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; CHECK-NEXT:    v_mov_b32_e32 v10, v5
; CHECK-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; CHECK-NEXT:    v_mov_b32_e32 v9, v6
; CHECK-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; CHECK-NEXT:    v_mov_b32_e32 v8, v7
; CHECK-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; CHECK-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 killed $exec
; CHECK-NEXT:    v_mov_b32_e32 v1, v14
; CHECK-NEXT:    v_mov_b32_e32 v2, v13
; CHECK-NEXT:    v_mov_b32_e32 v3, v12
; CHECK-NEXT:    v_mov_b32_e32 v4, v11
; CHECK-NEXT:    v_mov_b32_e32 v5, v10
; CHECK-NEXT:    v_mov_b32_e32 v6, v9
; CHECK-NEXT:    v_mov_b32_e32 v7, v8
; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; CHECK-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; CHECK-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; CHECK-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; CHECK-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; CHECK-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; CHECK-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; CHECK-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; CHECK-NEXT:    s_mov_b32 s8, 0
; CHECK-NEXT:    s_mov_b32 s4, s8
; CHECK-NEXT:    s_mov_b32 s5, s8
; CHECK-NEXT:    s_mov_b32 s6, s8
; CHECK-NEXT:    s_mov_b32 s7, s8
; CHECK-NEXT:    ; implicit-def: $vgpr16 : SGPR spill to VGPR lane
; CHECK-NEXT:    v_writelane_b32 v16, s4, 0
; CHECK-NEXT:    v_writelane_b32 v16, s5, 1
; CHECK-NEXT:    v_writelane_b32 v16, s6, 2
; CHECK-NEXT:    v_writelane_b32 v16, s7, 3
; CHECK-NEXT:    s_mov_b32 s4, 0
; CHECK-NEXT:    v_mov_b32_e32 v0, s4
; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; CHECK-NEXT:    v_mov_b32_e32 v0, s4
; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT:    s_mov_b32 s4, exec_lo
; CHECK-NEXT:    v_writelane_b32 v16, s4, 4
; CHECK-NEXT:    s_or_saveexec_b32 s21, -1
; CHECK-NEXT:    buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
; CHECK-NEXT:    s_mov_b32 exec_lo, s21
; CHECK-NEXT:  .LBB0_1: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    s_or_saveexec_b32 s21, -1
; CHECK-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
; CHECK-NEXT:    s_mov_b32 exec_lo, s21
; CHECK-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; CHECK-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; CHECK-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; CHECK-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; CHECK-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; CHECK-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; CHECK-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; CHECK-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; CHECK-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; CHECK-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; CHECK-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; CHECK-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; CHECK-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; CHECK-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; CHECK-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; CHECK-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_readfirstlane_b32 s12, v7
; CHECK-NEXT:    v_readfirstlane_b32 s10, v6
; CHECK-NEXT:    v_readfirstlane_b32 s9, v5
; CHECK-NEXT:    v_readfirstlane_b32 s8, v4
; CHECK-NEXT:    v_readfirstlane_b32 s7, v3
; CHECK-NEXT:    v_readfirstlane_b32 s6, v2
; CHECK-NEXT:    v_readfirstlane_b32 s5, v1
; CHECK-NEXT:    v_readfirstlane_b32 s4, v0
; CHECK-NEXT:    ; kill: def $sgpr12 killed $sgpr12 def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
; CHECK-NEXT:    s_mov_b32 s13, s10
; CHECK-NEXT:    s_mov_b32 s14, s9
; CHECK-NEXT:    s_mov_b32 s15, s8
; CHECK-NEXT:    s_mov_b32 s16, s7
; CHECK-NEXT:    s_mov_b32 s17, s6
; CHECK-NEXT:    s_mov_b32 s18, s5
; CHECK-NEXT:    s_mov_b32 s19, s4
; CHECK-NEXT:    v_writelane_b32 v16, s12, 5
; CHECK-NEXT:    v_writelane_b32 v16, s13, 6
; CHECK-NEXT:    v_writelane_b32 v16, s14, 7
; CHECK-NEXT:    v_writelane_b32 v16, s15, 8
; CHECK-NEXT:    v_writelane_b32 v16, s16, 9
; CHECK-NEXT:    v_writelane_b32 v16, s17, 10
; CHECK-NEXT:    v_writelane_b32 v16, s18, 11
; CHECK-NEXT:    v_writelane_b32 v16, s19, 12
; CHECK-NEXT:    v_mov_b32_e32 v6, v8
; CHECK-NEXT:    v_mov_b32_e32 v7, v9
; CHECK-NEXT:    v_mov_b32_e32 v4, v10
; CHECK-NEXT:    v_mov_b32_e32 v5, v11
; CHECK-NEXT:    v_mov_b32_e32 v2, v12
; CHECK-NEXT:    v_mov_b32_e32 v3, v13
; CHECK-NEXT:    v_mov_b32_e32 v0, v14
; CHECK-NEXT:    v_mov_b32_e32 v1, v15
; CHECK-NEXT:    s_mov_b64 s[4:5], s[12:13]
; CHECK-NEXT:    s_mov_b64 s[10:11], s[14:15]
; CHECK-NEXT:    s_mov_b64 s[8:9], s[16:17]
; CHECK-NEXT:    s_mov_b64 s[6:7], s[18:19]
; CHECK-NEXT:    v_cmp_eq_u64_e64 s4, s[4:5], v[6:7]
; CHECK-NEXT:    v_cmp_eq_u64_e64 s5, s[10:11], v[4:5]
; CHECK-NEXT:    s_and_b32 s4, s4, s5
; CHECK-NEXT:    v_cmp_eq_u64_e64 s5, s[8:9], v[2:3]
; CHECK-NEXT:    s_and_b32 s4, s4, s5
; CHECK-NEXT:    v_cmp_eq_u64_e64 s5, s[6:7], v[0:1]
; CHECK-NEXT:    s_and_b32 s4, s4, s5
; CHECK-NEXT:    s_and_saveexec_b32 s4, s4
; CHECK-NEXT:    v_writelane_b32 v16, s4, 13
; CHECK-NEXT:    s_or_saveexec_b32 s21, -1
; CHECK-NEXT:    buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
; CHECK-NEXT:    s_mov_b32 exec_lo, s21
; CHECK-NEXT:  ; %bb.2: ; in Loop: Header=BB0_1 Depth=1
; CHECK-NEXT:    s_or_saveexec_b32 s21, -1
; CHECK-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
; CHECK-NEXT:    s_mov_b32 exec_lo, s21
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_readlane_b32 s4, v16, 13
; CHECK-NEXT:    v_readlane_b32 s8, v16, 5
; CHECK-NEXT:    v_readlane_b32 s9, v16, 6
; CHECK-NEXT:    v_readlane_b32 s10, v16, 7
; CHECK-NEXT:    v_readlane_b32 s11, v16, 8
; CHECK-NEXT:    v_readlane_b32 s12, v16, 9
; CHECK-NEXT:    v_readlane_b32 s13, v16, 10
; CHECK-NEXT:    v_readlane_b32 s14, v16, 11
; CHECK-NEXT:    v_readlane_b32 s15, v16, 12
; CHECK-NEXT:    v_readlane_b32 s16, v16, 0
; CHECK-NEXT:    v_readlane_b32 s17, v16, 1
; CHECK-NEXT:    v_readlane_b32 s18, v16, 2
; CHECK-NEXT:    v_readlane_b32 s19, v16, 3
; CHECK-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; CHECK-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    image_sample v0, [v0, v1], s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_2D
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; CHECK-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
; CHECK-NEXT:    s_cbranch_execnz .LBB0_1
; CHECK-NEXT:  ; %bb.3:
; CHECK-NEXT:    s_or_saveexec_b32 s21, -1
; CHECK-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
; CHECK-NEXT:    s_mov_b32 exec_lo, s21
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_readlane_b32 s4, v16, 4
; CHECK-NEXT:    s_mov_b32 exec_lo, s4
; CHECK-NEXT:  ; %bb.4:
; CHECK-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; CHECK-NEXT:    ; implicit-def: $sgpr4
; CHECK-NEXT:    v_mov_b32_e32 v1, s4
; CHECK-NEXT:    v_mov_b32_e32 v2, s4
; CHECK-NEXT:    v_mov_b32_e32 v3, s4
; CHECK-NEXT:    s_xor_saveexec_b32 s4, -1
; CHECK-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; CHECK-NEXT:    s_mov_b32 exec_lo, s4
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
bb:
  %ret = tail call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 1, float 0.000000e+00, float 0.000000e+00, <8 x i32> %vgpr_srd, <4 x i32> zeroinitializer, i1 false, i32 0, i32 0)
  ret <4 x float> %ret
}

declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 immarg, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0

attributes #0 = { nounwind readonly willreturn }