xref: /llvm-project/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
2; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s
3
; @main: reduced reproducer taking a single i1 argument.
; NOTE(review): the test's file name (identical-subrange-spill-infloop) suggests
; this guards against an infinite loop while spilling identical subranges --
; confirm against the commit that introduced the test.
;
; Shape of the IR (see the body after the CHECK lines):
;   * bb    - builds an addrspace(4) base pointer from the PC, performs ten
;             vector loads at fixed offsets, three image samples, then branches
;             on %arg.
;   * bb32  - branches on the same %arg again (bb33 or bb43).
;   * bb33  - self-looping block with no exit edge.
;   * bb43  - the only block that returns; does two raw buffer stores.
;   * bb48  - one image sample, then enters bb50.
;   * bb50  - self-looping block with no exit edge.
;
; The CHECK lines below are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script rather than editing by hand. They show
; SGPRs being spilled to lanes of v5, v6 and v7.
4define void @main(i1 %arg) #0 {
5; CHECK-LABEL: main:
6; CHECK:       ; %bb.0: ; %bb
7; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
9; CHECK-NEXT:    buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill
10; CHECK-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
11; CHECK-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
12; CHECK-NEXT:    s_mov_b64 exec, s[4:5]
13; CHECK-NEXT:    v_writelane_b32 v5, s30, 0
14; CHECK-NEXT:    v_writelane_b32 v5, s31, 1
15; CHECK-NEXT:    v_writelane_b32 v5, s36, 2
16; CHECK-NEXT:    v_writelane_b32 v5, s37, 3
17; CHECK-NEXT:    v_writelane_b32 v5, s38, 4
18; CHECK-NEXT:    v_writelane_b32 v5, s39, 5
19; CHECK-NEXT:    v_writelane_b32 v5, s40, 6
20; CHECK-NEXT:    v_writelane_b32 v5, s41, 7
21; CHECK-NEXT:    v_writelane_b32 v5, s42, 8
22; CHECK-NEXT:    v_writelane_b32 v5, s43, 9
23; CHECK-NEXT:    v_writelane_b32 v5, s44, 10
24; CHECK-NEXT:    v_writelane_b32 v5, s45, 11
25; CHECK-NEXT:    v_writelane_b32 v5, s46, 12
26; CHECK-NEXT:    v_writelane_b32 v5, s47, 13
27; CHECK-NEXT:    v_writelane_b32 v5, s48, 14
28; CHECK-NEXT:    v_writelane_b32 v5, s49, 15
29; CHECK-NEXT:    s_getpc_b64 s[24:25]
30; CHECK-NEXT:    v_writelane_b32 v5, s50, 16
31; CHECK-NEXT:    s_movk_i32 s4, 0xf0
32; CHECK-NEXT:    s_mov_b32 s5, s24
33; CHECK-NEXT:    v_writelane_b32 v5, s51, 17
34; CHECK-NEXT:    s_load_dwordx16 s[36:51], s[4:5], 0x0
35; CHECK-NEXT:    ; implicit-def: $vgpr7 : SGPR spill to VGPR lane
36; CHECK-NEXT:    s_mov_b64 s[4:5], 0
37; CHECK-NEXT:    s_load_dwordx4 s[28:31], s[4:5], 0x0
38; CHECK-NEXT:    s_movk_i32 s20, 0x130
39; CHECK-NEXT:    s_mov_b32 s21, s24
40; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
41; CHECK-NEXT:    v_writelane_b32 v7, s36, 0
42; CHECK-NEXT:    v_writelane_b32 v7, s37, 1
43; CHECK-NEXT:    v_writelane_b32 v7, s38, 2
44; CHECK-NEXT:    v_writelane_b32 v7, s39, 3
45; CHECK-NEXT:    v_writelane_b32 v7, s40, 4
46; CHECK-NEXT:    v_writelane_b32 v7, s41, 5
47; CHECK-NEXT:    v_writelane_b32 v7, s42, 6
48; CHECK-NEXT:    v_writelane_b32 v7, s43, 7
49; CHECK-NEXT:    v_writelane_b32 v7, s44, 8
50; CHECK-NEXT:    v_writelane_b32 v7, s45, 9
51; CHECK-NEXT:    v_writelane_b32 v7, s46, 10
52; CHECK-NEXT:    s_load_dwordx16 s[4:19], s[20:21], 0x0
53; CHECK-NEXT:    v_writelane_b32 v7, s47, 11
54; CHECK-NEXT:    v_writelane_b32 v7, s48, 12
55; CHECK-NEXT:    s_mov_b32 s20, 0
56; CHECK-NEXT:    v_mov_b32_e32 v1, 0
57; CHECK-NEXT:    v_writelane_b32 v7, s49, 13
58; CHECK-NEXT:    v_mov_b32_e32 v2, s28
59; CHECK-NEXT:    v_mov_b32_e32 v3, v1
60; CHECK-NEXT:    s_mov_b32 s21, s20
61; CHECK-NEXT:    s_mov_b32 s22, s20
62; CHECK-NEXT:    s_mov_b32 s23, s20
63; CHECK-NEXT:    v_writelane_b32 v7, s50, 14
64; CHECK-NEXT:    v_writelane_b32 v7, s51, 15
65; CHECK-NEXT:    image_sample_lz v3, v[2:3], s[44:51], s[20:23] dmask:0x1
66; CHECK-NEXT:    v_mov_b32_e32 v2, v1
67; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
68; CHECK-NEXT:    v_writelane_b32 v7, s4, 16
69; CHECK-NEXT:    v_writelane_b32 v7, s5, 17
70; CHECK-NEXT:    v_writelane_b32 v7, s6, 18
71; CHECK-NEXT:    v_writelane_b32 v7, s7, 19
72; CHECK-NEXT:    v_writelane_b32 v7, s8, 20
73; CHECK-NEXT:    v_writelane_b32 v7, s9, 21
74; CHECK-NEXT:    image_sample_lz v4, v[1:2], s[4:11], s[20:23] dmask:0x1
75; CHECK-NEXT:    v_writelane_b32 v7, s10, 22
76; CHECK-NEXT:    v_writelane_b32 v7, s11, 23
77; CHECK-NEXT:    v_writelane_b32 v7, s12, 24
78; CHECK-NEXT:    v_writelane_b32 v7, s13, 25
79; CHECK-NEXT:    v_writelane_b32 v7, s14, 26
80; CHECK-NEXT:    v_writelane_b32 v7, s15, 27
81; CHECK-NEXT:    v_writelane_b32 v5, s52, 18
82; CHECK-NEXT:    v_writelane_b32 v7, s16, 28
83; CHECK-NEXT:    v_writelane_b32 v5, s53, 19
84; CHECK-NEXT:    v_writelane_b32 v7, s17, 29
85; CHECK-NEXT:    v_writelane_b32 v5, s54, 20
86; CHECK-NEXT:    v_writelane_b32 v7, s18, 30
87; CHECK-NEXT:    s_mov_b32 s26, 48
88; CHECK-NEXT:    s_mov_b32 s27, s24
89; CHECK-NEXT:    v_writelane_b32 v5, s55, 21
90; CHECK-NEXT:    v_writelane_b32 v7, s19, 31
91; CHECK-NEXT:    s_load_dwordx8 s[4:11], s[26:27], 0x0
92; CHECK-NEXT:    v_writelane_b32 v5, s56, 22
93; CHECK-NEXT:    v_writelane_b32 v5, s57, 23
94; CHECK-NEXT:    v_writelane_b32 v5, s58, 24
95; CHECK-NEXT:    v_writelane_b32 v5, s59, 25
96; CHECK-NEXT:    v_writelane_b32 v5, s60, 26
97; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
98; CHECK-NEXT:    v_writelane_b32 v7, s4, 32
99; CHECK-NEXT:    v_writelane_b32 v5, s61, 27
100; CHECK-NEXT:    v_writelane_b32 v7, s5, 33
101; CHECK-NEXT:    v_writelane_b32 v5, s62, 28
102; CHECK-NEXT:    v_writelane_b32 v7, s6, 34
103; CHECK-NEXT:    v_writelane_b32 v5, s63, 29
104; CHECK-NEXT:    v_writelane_b32 v7, s7, 35
105; CHECK-NEXT:    v_writelane_b32 v5, s64, 30
106; CHECK-NEXT:    v_writelane_b32 v7, s8, 36
107; CHECK-NEXT:    v_writelane_b32 v5, s65, 31
108; CHECK-NEXT:    v_writelane_b32 v7, s9, 37
109; CHECK-NEXT:    v_writelane_b32 v5, s66, 32
110; CHECK-NEXT:    s_movk_i32 s28, 0x1f0
111; CHECK-NEXT:    s_movk_i32 s30, 0x2f0
112; CHECK-NEXT:    s_mov_b32 s29, s24
113; CHECK-NEXT:    s_mov_b32 s31, s24
114; CHECK-NEXT:    v_writelane_b32 v7, s10, 38
115; CHECK-NEXT:    v_writelane_b32 v5, s67, 33
116; CHECK-NEXT:    v_writelane_b32 v7, s11, 39
117; CHECK-NEXT:    s_load_dwordx16 s[52:67], s[28:29], 0x0
118; CHECK-NEXT:    s_load_dwordx16 s[4:19], s[30:31], 0x0
119; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
120; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
121; CHECK-NEXT:    s_xor_b64 s[24:25], vcc, -1
122; CHECK-NEXT:    s_waitcnt vmcnt(0)
123; CHECK-NEXT:    v_mul_f32_e32 v0, v4, v3
124; CHECK-NEXT:    s_and_saveexec_b64 s[26:27], s[24:25]
125; CHECK-NEXT:    s_xor_b64 s[26:27], exec, s[26:27]
126; CHECK-NEXT:    s_cbranch_execz .LBB0_3
127; CHECK-NEXT:  ; %bb.1: ; %bb48
128; CHECK-NEXT:    v_readlane_b32 s36, v7, 0
129; CHECK-NEXT:    v_readlane_b32 s44, v7, 8
130; CHECK-NEXT:    v_readlane_b32 s45, v7, 9
131; CHECK-NEXT:    v_readlane_b32 s46, v7, 10
132; CHECK-NEXT:    v_readlane_b32 s47, v7, 11
133; CHECK-NEXT:    v_readlane_b32 s48, v7, 12
134; CHECK-NEXT:    v_readlane_b32 s49, v7, 13
135; CHECK-NEXT:    v_readlane_b32 s50, v7, 14
136; CHECK-NEXT:    v_readlane_b32 s51, v7, 15
137; CHECK-NEXT:    s_and_b64 vcc, exec, -1
138; CHECK-NEXT:    v_readlane_b32 s37, v7, 1
139; CHECK-NEXT:    v_readlane_b32 s38, v7, 2
140; CHECK-NEXT:    v_readlane_b32 s39, v7, 3
141; CHECK-NEXT:    v_readlane_b32 s40, v7, 4
142; CHECK-NEXT:    image_sample_lz v3, v[1:2], s[44:51], s[20:23] dmask:0x1
143; CHECK-NEXT:    v_mov_b32_e32 v2, 0
144; CHECK-NEXT:    v_readlane_b32 s41, v7, 5
145; CHECK-NEXT:    v_readlane_b32 s42, v7, 6
146; CHECK-NEXT:    v_readlane_b32 s43, v7, 7
147; CHECK-NEXT:  .LBB0_2: ; %bb50
148; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
149; CHECK-NEXT:    v_readlane_b32 s36, v7, 32
150; CHECK-NEXT:    v_readlane_b32 s40, v7, 36
151; CHECK-NEXT:    v_readlane_b32 s41, v7, 37
152; CHECK-NEXT:    v_readlane_b32 s42, v7, 38
153; CHECK-NEXT:    v_readlane_b32 s43, v7, 39
154; CHECK-NEXT:    s_mov_b32 s21, s20
155; CHECK-NEXT:    s_mov_b32 s22, s20
156; CHECK-NEXT:    s_mov_b32 s23, s20
157; CHECK-NEXT:    v_readlane_b32 s37, v7, 33
158; CHECK-NEXT:    v_readlane_b32 s38, v7, 34
159; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
160; CHECK-NEXT:    image_sample_lz v4, v[1:2], s[60:67], s[40:43] dmask:0x1
161; CHECK-NEXT:    v_readlane_b32 s39, v7, 35
162; CHECK-NEXT:    image_sample_lz v1, v[1:2], s[12:19], s[20:23] dmask:0x1
163; CHECK-NEXT:    s_waitcnt vmcnt(0)
164; CHECK-NEXT:    v_sub_f32_e32 v1, v1, v4
165; CHECK-NEXT:    v_mul_f32_e32 v1, v1, v0
166; CHECK-NEXT:    v_mul_f32_e32 v1, v1, v3
167; CHECK-NEXT:    s_mov_b64 vcc, vcc
168; CHECK-NEXT:    s_cbranch_vccnz .LBB0_2
169; CHECK-NEXT:  .LBB0_3: ; %Flow14
170; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
171; CHECK-NEXT:    v_readlane_b32 s12, v7, 32
172; CHECK-NEXT:    v_readlane_b32 s13, v7, 33
173; CHECK-NEXT:    v_readlane_b32 s14, v7, 34
174; CHECK-NEXT:    v_readlane_b32 s15, v7, 35
175; CHECK-NEXT:    v_readlane_b32 s16, v7, 36
176; CHECK-NEXT:    v_readlane_b32 s17, v7, 37
177; CHECK-NEXT:    v_readlane_b32 s18, v7, 38
178; CHECK-NEXT:    v_readlane_b32 s19, v7, 39
179; CHECK-NEXT:    v_writelane_b32 v7, s4, 40
180; CHECK-NEXT:    v_writelane_b32 v7, s5, 41
181; CHECK-NEXT:    v_writelane_b32 v7, s6, 42
182; CHECK-NEXT:    v_writelane_b32 v7, s7, 43
183; CHECK-NEXT:    v_writelane_b32 v7, s8, 44
184; CHECK-NEXT:    v_writelane_b32 v7, s9, 45
185; CHECK-NEXT:    v_writelane_b32 v7, s10, 46
186; CHECK-NEXT:    v_writelane_b32 v7, s11, 47
187; CHECK-NEXT:    v_writelane_b32 v7, s12, 48
188; CHECK-NEXT:    v_writelane_b32 v7, s13, 49
189; CHECK-NEXT:    v_writelane_b32 v7, s14, 50
190; CHECK-NEXT:    v_writelane_b32 v7, s15, 51
191; CHECK-NEXT:    v_writelane_b32 v7, s16, 52
192; CHECK-NEXT:    v_writelane_b32 v7, s17, 53
193; CHECK-NEXT:    v_writelane_b32 v7, s18, 54
194; CHECK-NEXT:    v_writelane_b32 v7, s19, 55
195; CHECK-NEXT:    ; implicit-def: $vgpr6 : SGPR spill to VGPR lane
196; CHECK-NEXT:    v_writelane_b32 v7, s52, 56
197; CHECK-NEXT:    v_writelane_b32 v6, s60, 0
198; CHECK-NEXT:    v_writelane_b32 v7, s53, 57
199; CHECK-NEXT:    v_writelane_b32 v6, s61, 1
200; CHECK-NEXT:    v_writelane_b32 v7, s54, 58
201; CHECK-NEXT:    v_writelane_b32 v6, s62, 2
202; CHECK-NEXT:    v_writelane_b32 v7, s55, 59
203; CHECK-NEXT:    v_writelane_b32 v6, s63, 3
204; CHECK-NEXT:    v_writelane_b32 v7, s56, 60
205; CHECK-NEXT:    v_writelane_b32 v6, s64, 4
206; CHECK-NEXT:    v_writelane_b32 v7, s57, 61
207; CHECK-NEXT:    v_writelane_b32 v6, s65, 5
208; CHECK-NEXT:    v_writelane_b32 v7, s58, 62
209; CHECK-NEXT:    v_writelane_b32 v6, s66, 6
210; CHECK-NEXT:    v_writelane_b32 v7, s59, 63
211; CHECK-NEXT:    v_writelane_b32 v6, s67, 7
212; CHECK-NEXT:    s_andn2_saveexec_b64 s[20:21], s[26:27]
213; CHECK-NEXT:    s_cbranch_execz .LBB0_10
214; CHECK-NEXT:  ; %bb.4: ; %bb32
215; CHECK-NEXT:    s_and_saveexec_b64 s[8:9], s[24:25]
216; CHECK-NEXT:    s_xor_b64 s[22:23], exec, s[8:9]
217; CHECK-NEXT:    s_cbranch_execz .LBB0_6
218; CHECK-NEXT:  ; %bb.5: ; %bb43
219; CHECK-NEXT:    s_mov_b32 s8, 0
220; CHECK-NEXT:    s_mov_b32 s9, s8
221; CHECK-NEXT:    v_mov_b32_e32 v0, s8
222; CHECK-NEXT:    v_readlane_b32 s36, v7, 0
223; CHECK-NEXT:    v_mov_b32_e32 v1, s9
224; CHECK-NEXT:    s_mov_b32 s10, s8
225; CHECK-NEXT:    s_mov_b32 s11, s8
226; CHECK-NEXT:    v_readlane_b32 s37, v7, 1
227; CHECK-NEXT:    v_readlane_b32 s38, v7, 2
228; CHECK-NEXT:    v_readlane_b32 s39, v7, 3
229; CHECK-NEXT:    v_readlane_b32 s40, v7, 4
230; CHECK-NEXT:    v_readlane_b32 s41, v7, 5
231; CHECK-NEXT:    v_readlane_b32 s42, v7, 6
232; CHECK-NEXT:    v_readlane_b32 s43, v7, 7
233; CHECK-NEXT:    v_readlane_b32 s44, v7, 8
234; CHECK-NEXT:    v_readlane_b32 s45, v7, 9
235; CHECK-NEXT:    v_readlane_b32 s46, v7, 10
236; CHECK-NEXT:    v_readlane_b32 s47, v7, 11
237; CHECK-NEXT:    v_readlane_b32 s48, v7, 12
238; CHECK-NEXT:    v_readlane_b32 s49, v7, 13
239; CHECK-NEXT:    v_readlane_b32 s50, v7, 14
240; CHECK-NEXT:    v_readlane_b32 s51, v7, 15
241; CHECK-NEXT:    image_sample_lz v2, v[0:1], s[36:43], s[8:11] dmask:0x1
242; CHECK-NEXT:    v_readlane_b32 s36, v7, 16
243; CHECK-NEXT:    v_readlane_b32 s44, v7, 24
244; CHECK-NEXT:    v_readlane_b32 s45, v7, 25
245; CHECK-NEXT:    v_readlane_b32 s46, v7, 26
246; CHECK-NEXT:    v_readlane_b32 s47, v7, 27
247; CHECK-NEXT:    v_readlane_b32 s48, v7, 28
248; CHECK-NEXT:    v_readlane_b32 s49, v7, 29
249; CHECK-NEXT:    v_readlane_b32 s50, v7, 30
250; CHECK-NEXT:    v_readlane_b32 s51, v7, 31
251; CHECK-NEXT:    v_mov_b32_e32 v3, 0
252; CHECK-NEXT:    v_mov_b32_e32 v4, v3
253; CHECK-NEXT:    v_readlane_b32 s37, v7, 17
254; CHECK-NEXT:    v_readlane_b32 s38, v7, 18
255; CHECK-NEXT:    v_readlane_b32 s39, v7, 19
256; CHECK-NEXT:    image_sample_lz v0, v[0:1], s[44:51], s[12:15] dmask:0x1
257; CHECK-NEXT:    v_readlane_b32 s40, v7, 20
258; CHECK-NEXT:    v_readlane_b32 s41, v7, 21
259; CHECK-NEXT:    v_readlane_b32 s42, v7, 22
260; CHECK-NEXT:    v_readlane_b32 s43, v7, 23
261; CHECK-NEXT:    s_waitcnt vmcnt(1)
262; CHECK-NEXT:    buffer_store_dwordx3 v[2:4], off, s[8:11], 0
263; CHECK-NEXT:    s_waitcnt vmcnt(1)
264; CHECK-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
265; CHECK-NEXT:    ; implicit-def: $vgpr0
266; CHECK-NEXT:  .LBB0_6: ; %Flow12
267; CHECK-NEXT:    s_or_saveexec_b64 s[4:5], s[22:23]
268; CHECK-NEXT:    v_readlane_b32 s52, v7, 40
269; CHECK-NEXT:    v_readlane_b32 s53, v7, 41
270; CHECK-NEXT:    v_readlane_b32 s54, v7, 42
271; CHECK-NEXT:    v_readlane_b32 s55, v7, 43
272; CHECK-NEXT:    v_readlane_b32 s56, v7, 44
273; CHECK-NEXT:    v_readlane_b32 s57, v7, 45
274; CHECK-NEXT:    v_readlane_b32 s58, v7, 46
275; CHECK-NEXT:    v_readlane_b32 s59, v7, 47
276; CHECK-NEXT:    v_readlane_b32 s60, v7, 48
277; CHECK-NEXT:    v_readlane_b32 s61, v7, 49
278; CHECK-NEXT:    v_readlane_b32 s62, v7, 50
279; CHECK-NEXT:    v_readlane_b32 s63, v7, 51
280; CHECK-NEXT:    v_readlane_b32 s64, v7, 52
281; CHECK-NEXT:    v_readlane_b32 s65, v7, 53
282; CHECK-NEXT:    v_readlane_b32 s66, v7, 54
283; CHECK-NEXT:    v_readlane_b32 s67, v7, 55
284; CHECK-NEXT:    s_xor_b64 exec, exec, s[4:5]
285; CHECK-NEXT:    s_cbranch_execz .LBB0_9
286; CHECK-NEXT:  ; %bb.7: ; %bb33.preheader
287; CHECK-NEXT:    s_mov_b32 s8, 0
288; CHECK-NEXT:    s_mov_b32 s6, s8
289; CHECK-NEXT:    s_mov_b32 s7, s8
290; CHECK-NEXT:    v_mov_b32_e32 v1, s6
291; CHECK-NEXT:    v_readlane_b32 s36, v7, 56
292; CHECK-NEXT:    s_mov_b32 s9, s8
293; CHECK-NEXT:    s_mov_b32 s10, s8
294; CHECK-NEXT:    s_mov_b32 s11, s8
295; CHECK-NEXT:    v_mov_b32_e32 v2, s7
296; CHECK-NEXT:    v_readlane_b32 s37, v7, 57
297; CHECK-NEXT:    v_readlane_b32 s38, v7, 58
298; CHECK-NEXT:    v_readlane_b32 s39, v7, 59
299; CHECK-NEXT:    v_readlane_b32 s40, v7, 60
300; CHECK-NEXT:    v_readlane_b32 s41, v7, 61
301; CHECK-NEXT:    v_readlane_b32 s42, v7, 62
302; CHECK-NEXT:    v_readlane_b32 s43, v7, 63
303; CHECK-NEXT:    s_nop 4
304; CHECK-NEXT:    image_sample_lz v3, v[1:2], s[36:43], s[8:11] dmask:0x1
305; CHECK-NEXT:    image_sample_lz v4, v[1:2], s[52:59], s[8:11] dmask:0x1
306; CHECK-NEXT:    ; kill: killed $vgpr1_vgpr2
307; CHECK-NEXT:    s_mov_b64 s[12:13], s[36:37]
308; CHECK-NEXT:    s_and_b64 vcc, exec, 0
309; CHECK-NEXT:    v_readlane_b32 s44, v6, 0
310; CHECK-NEXT:    v_readlane_b32 s45, v6, 1
311; CHECK-NEXT:    v_readlane_b32 s46, v6, 2
312; CHECK-NEXT:    v_readlane_b32 s47, v6, 3
313; CHECK-NEXT:    v_readlane_b32 s48, v6, 4
314; CHECK-NEXT:    v_readlane_b32 s49, v6, 5
315; CHECK-NEXT:    v_readlane_b32 s50, v6, 6
316; CHECK-NEXT:    v_readlane_b32 s51, v6, 7
317; CHECK-NEXT:    s_mov_b64 s[14:15], s[38:39]
318; CHECK-NEXT:    s_mov_b64 s[16:17], s[40:41]
319; CHECK-NEXT:    s_mov_b64 s[18:19], s[42:43]
320; CHECK-NEXT:    ; kill: killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
321; CHECK-NEXT:    ; kill: killed $sgpr8_sgpr9_sgpr10 killed $sgpr11
322; CHECK-NEXT:    ; kill: killed $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59
323; CHECK-NEXT:    s_waitcnt vmcnt(0)
324; CHECK-NEXT:    v_sub_f32_e32 v1, v4, v3
325; CHECK-NEXT:    v_mul_f32_e32 v0, v1, v0
326; CHECK-NEXT:    v_mov_b32_e32 v1, 0
327; CHECK-NEXT:  .LBB0_8: ; %bb33
328; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
329; CHECK-NEXT:    v_add_f32_e32 v2, v1, v0
330; CHECK-NEXT:    v_sub_f32_e32 v1, v1, v2
331; CHECK-NEXT:    s_mov_b64 vcc, vcc
332; CHECK-NEXT:    s_cbranch_vccz .LBB0_8
333; CHECK-NEXT:  .LBB0_9: ; %Flow13
334; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
335; CHECK-NEXT:  .LBB0_10: ; %UnifiedReturnBlock
336; CHECK-NEXT:    s_or_b64 exec, exec, s[20:21]
337; CHECK-NEXT:    v_readlane_b32 s67, v5, 33
338; CHECK-NEXT:    v_readlane_b32 s66, v5, 32
339; CHECK-NEXT:    v_readlane_b32 s65, v5, 31
340; CHECK-NEXT:    v_readlane_b32 s64, v5, 30
341; CHECK-NEXT:    v_readlane_b32 s63, v5, 29
342; CHECK-NEXT:    v_readlane_b32 s62, v5, 28
343; CHECK-NEXT:    v_readlane_b32 s61, v5, 27
344; CHECK-NEXT:    v_readlane_b32 s60, v5, 26
345; CHECK-NEXT:    v_readlane_b32 s59, v5, 25
346; CHECK-NEXT:    v_readlane_b32 s58, v5, 24
347; CHECK-NEXT:    v_readlane_b32 s57, v5, 23
348; CHECK-NEXT:    v_readlane_b32 s56, v5, 22
349; CHECK-NEXT:    v_readlane_b32 s55, v5, 21
350; CHECK-NEXT:    v_readlane_b32 s54, v5, 20
351; CHECK-NEXT:    v_readlane_b32 s53, v5, 19
352; CHECK-NEXT:    v_readlane_b32 s52, v5, 18
353; CHECK-NEXT:    v_readlane_b32 s51, v5, 17
354; CHECK-NEXT:    v_readlane_b32 s50, v5, 16
355; CHECK-NEXT:    v_readlane_b32 s49, v5, 15
356; CHECK-NEXT:    v_readlane_b32 s48, v5, 14
357; CHECK-NEXT:    v_readlane_b32 s47, v5, 13
358; CHECK-NEXT:    v_readlane_b32 s46, v5, 12
359; CHECK-NEXT:    v_readlane_b32 s45, v5, 11
360; CHECK-NEXT:    v_readlane_b32 s44, v5, 10
361; CHECK-NEXT:    v_readlane_b32 s43, v5, 9
362; CHECK-NEXT:    v_readlane_b32 s42, v5, 8
363; CHECK-NEXT:    v_readlane_b32 s41, v5, 7
364; CHECK-NEXT:    v_readlane_b32 s40, v5, 6
365; CHECK-NEXT:    v_readlane_b32 s39, v5, 5
366; CHECK-NEXT:    v_readlane_b32 s38, v5, 4
367; CHECK-NEXT:    v_readlane_b32 s37, v5, 3
368; CHECK-NEXT:    v_readlane_b32 s36, v5, 2
369; CHECK-NEXT:    v_readlane_b32 s31, v5, 1
370; CHECK-NEXT:    v_readlane_b32 s30, v5, 0
371; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
372; CHECK-NEXT:    buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload
373; CHECK-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
374; CHECK-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
375; CHECK-NEXT:    s_mov_b64 exec, s[4:5]
376; CHECK-NEXT:    s_waitcnt vmcnt(0)
377; CHECK-NEXT:    s_setpc_b64 s[30:31]
378bb:
  ; Base pointer: the truncated (low 32-bit) PC is inserted as element 1 of a
  ; <2 x i32> whose element 0 stays zero, and the pair is reinterpreted as a
  ; 64-bit addrspace(4) pointer. In the CHECK lines this shows up as the low
  ; dword holding the byte offset and the high dword copied from s24.
379  %i = call i64 @llvm.amdgcn.s.getpc()
380  %i1 = trunc i64 %i to i32
381  %i2 = insertelement <2 x i32> zeroinitializer, i32 %i1, i64 1
382  %i3 = bitcast <2 x i32> %i2 to i64
383  %i4 = inttoptr i64 %i3 to ptr addrspace(4)
  ; Ten loads at fixed byte offsets from the base: two <4 x i32> (48, 64) and
  ; eight <8 x i32> (240, 272, 304, 336, 496, 528, 752, 784). They feed the
  ; <8 x i32> / <4 x i32> operands of the image sample calls below and in the
  ; successor blocks, so all of them are live across the branch.
384  %i5 = getelementptr i8, ptr addrspace(4) %i4, i64 48
385  %i6 = load <4 x i32>, ptr addrspace(4) %i5, align 16
386  %i7 = getelementptr i8, ptr addrspace(4) %i4, i64 64
387  %i8 = load <4 x i32>, ptr addrspace(4) %i7, align 16
388  %i9 = getelementptr i8, ptr addrspace(4) %i4, i64 240
389  %i10 = load <8 x i32>, ptr addrspace(4) %i9, align 32
390  %i11 = getelementptr i8, ptr addrspace(4) %i4, i64 272
391  %i12 = load <8 x i32>, ptr addrspace(4) %i11, align 32
392  %i13 = getelementptr i8, ptr addrspace(4) %i4, i64 304
393  %i14 = load <8 x i32>, ptr addrspace(4) %i13, align 32
394  %i15 = getelementptr i8, ptr addrspace(4) %i4, i64 336
395  %i16 = load <8 x i32>, ptr addrspace(4) %i15, align 32
396  %i17 = getelementptr i8, ptr addrspace(4) %i4, i64 496
397  %i18 = load <8 x i32>, ptr addrspace(4) %i17, align 32
398  %i19 = getelementptr i8, ptr addrspace(4) %i4, i64 528
399  %i20 = load <8 x i32>, ptr addrspace(4) %i19, align 32
400  %i21 = getelementptr i8, ptr addrspace(4) %i4, i64 752
401  %i22 = load <8 x i32>, ptr addrspace(4) %i21, align 32
402  %i23 = getelementptr i8, ptr addrspace(4) %i4, i64 784
403  %i24 = load <8 x i32>, ptr addrspace(4) %i23, align 32
  ; <4 x float> loaded from the null addrspace(4) pointer; only element 0 is
  ; used, as the first float operand of the %i27 sample.
404  %i25 = load <4 x float>, ptr addrspace(4) null, align 16
405  %i26 = extractelement <4 x float> %i25, i64 0
  ; %i30 (product of the %i12 and %i14 samples) is consumed in both loops,
  ; bb33 and bb50. %i31 is consumed only by the second buffer store in bb43.
406  %i27 = call float @llvm.amdgcn.image.sample.lz.2d.f32.f32(i32 1, float %i26, float 0.000000e+00, <8 x i32> %i12, <4 x i32> zeroinitializer, i1 false, i32 0, i32 0)
407  %i28 = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32(i32 1, float 0.000000e+00, float 0.000000e+00, <8 x i32> %i14, <4 x i32> zeroinitializer, i1 false, i32 0, i32 0)
408  %i29 = extractelement <4 x float> %i28, i64 0
409  %i30 = fmul float %i29, %i27
410  %i31 = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32(i32 1, float 0.000000e+00, float 0.000000e+00, <8 x i32> %i16, <4 x i32> %i6, i1 false, i32 0, i32 0)
411  br i1 %arg, label %bb32, label %bb48
412
413bb32:                                             ; preds = %bb
  ; Second branch on the same %arg value that selected this block.
414  br i1 %arg, label %bb33, label %bb43
415
416bb33:                                             ; preds = %bb33, %bb32
  ; Self-looping block with no exit edge. The two samples take only constant
  ; float operands (they do not depend on the phi %i34); in the CHECK lines
  ; they are emitted in %bb33.preheader and the loop .LBB0_8 keeps only the
  ; add/sub pair.
417  %i34 = phi float [ %i42, %bb33 ], [ 0.000000e+00, %bb32 ]
418  %i35 = call <2 x float> @llvm.amdgcn.image.sample.lz.2d.v2f32.f32(i32 1, float 0.000000e+00, float 0.000000e+00, <8 x i32> %i18, <4 x i32> zeroinitializer, i1 false, i32 0, i32 0)
419  %i36 = extractelement <2 x float> %i35, i64 0
420  %i37 = call <2 x float> @llvm.amdgcn.image.sample.lz.2d.v2f32.f32(i32 1, float 0.000000e+00, float 0.000000e+00, <8 x i32> %i22, <4 x i32> zeroinitializer, i1 false, i32 0, i32 0)
421  %i38 = extractelement <2 x float> %i37, i64 0
422  %i39 = fsub float %i38, %i36
423  %i40 = fmul float %i39, %i30
424  %i41 = fadd float %i34, %i40
425  %i42 = fsub float %i34, %i41
426  br label %bb33
427
428bb43:                                             ; preds = %bb32
  ; The only returning block: stores the %i10 sample as element 0 of an
  ; otherwise-zero <3 x i32>, then %i31 as <4 x i32>, both through raw buffer
  ; stores whose <4 x i32> resource operand is zeroinitializer.
429  %i44 = call float @llvm.amdgcn.image.sample.lz.2d.f32.f32(i32 1, float 0.000000e+00, float 0.000000e+00, <8 x i32> %i10, <4 x i32> zeroinitializer, i1 false, i32 0, i32 0)
430  %i45 = bitcast float %i44 to i32
431  %i46 = insertelement <3 x i32> zeroinitializer, i32 %i45, i64 0
432  call void @llvm.amdgcn.raw.buffer.store.v3i32(<3 x i32> %i46, <4 x i32> zeroinitializer, i32 0, i32 0, i32 0)
433  %i47 = bitcast <4 x float> %i31 to <4 x i32>
434  call void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32> %i47, <4 x i32> zeroinitializer, i32 0, i32 0, i32 0)
435  ret void
436
437bb48:                                             ; preds = %bb
  ; Samples %i12 once more (all-constant float operands), then enters bb50.
438  %i49 = call float @llvm.amdgcn.image.sample.lz.2d.f32.f32(i32 1, float 0.000000e+00, float 0.000000e+00, <8 x i32> %i12, <4 x i32> zeroinitializer, i1 false, i32 0, i32 0)
439  br label %bb50
440
441bb50:                                             ; preds = %bb50, %bb48
  ; Self-looping block with no exit edge. Both samples take the phi %i51 as
  ; their first float operand, and the CHECK lines show both image_sample_lz
  ; instructions inside the loop .LBB0_2.
442  %i51 = phi float [ 0.000000e+00, %bb48 ], [ %i58, %bb50 ]
443  %i52 = call <2 x float> @llvm.amdgcn.image.sample.lz.2d.v2f32.f32(i32 1, float %i51, float 0.000000e+00, <8 x i32> %i20, <4 x i32> %i8, i1 false, i32 0, i32 0)
444  %i53 = extractelement <2 x float> %i52, i64 0
445  %i54 = call <2 x float> @llvm.amdgcn.image.sample.lz.2d.v2f32.f32(i32 1, float %i51, float 0.000000e+00, <8 x i32> %i24, <4 x i32> zeroinitializer, i1 false, i32 0, i32 0)
446  %i55 = extractelement <2 x float> %i54, i64 0
447  %i56 = fsub float %i55, %i53
448  %i57 = fmul float %i56, %i30
449  %i58 = fmul float %i57, %i49
450  br label %bb50
451}
452
; Intrinsics called by @main: the PC read (#1), the 2D image sample_lz
; variants returning <4 x float>, float and <2 x float> (#2), and the raw
; buffer stores of <3 x i32> and <4 x i32> (#3).
453declare i64 @llvm.amdgcn.s.getpc() #1
454declare <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32(i32 immarg, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #2
455declare float @llvm.amdgcn.image.sample.lz.2d.f32.f32(i32 immarg, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #2
456declare <2 x float> @llvm.amdgcn.image.sample.lz.2d.v2f32.f32(i32 immarg, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #2
457declare void @llvm.amdgcn.raw.buffer.store.v3i32(<3 x i32>, <4 x i32>, i32, i32, i32 immarg) #3
458declare void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32 immarg) #3
459
; Attribute groups:
;   #0 - @main; sets "amdgpu-waves-per-eu" to "10,10".
;        NOTE(review): presumably chosen to restrict the register budget so
;        that the SGPR-to-VGPR-lane spills seen in the CHECK lines occur --
;        confirm before changing the value.
;   #1 - llvm.amdgcn.s.getpc declaration (memory(none), speculatable).
;   #2 - image sample declarations (memory(read)).
;   #3 - raw buffer store declarations (memory(write)).
460attributes #0 = { "amdgpu-waves-per-eu"="10,10" }
461attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
462attributes #2 = { nocallback nofree nosync nounwind willreturn memory(read) }
463attributes #3 = { nocallback nofree nosync nounwind willreturn memory(write) }
464