xref: /llvm-project/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll (revision 11b040192640ef3b1f481124c440f464ed6ec86a)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
3
4; The SelectionDAG builder was using the IR value kind to decide how to
5; split the types for copyToRegs/copyFromRegs in all contexts. This
6; was incorrect if an ABI-like value, such as a call, was used outside
7; of the block. The value in that case is not used directly, but
8; through another set of copies to potentially different register
9; types in the parent block.
10
11; This would then end up producing inconsistent pairs of copies with
12; the wrong sizes when the vector type result from the call was split
13; into multiple pieces, but expected to be a single register in the
14; cross-block copy.
15;
16; This isn't exactly ideal for AMDGPU, since in reality the
17; intermediate vector register type is undesirable anyway, but it
18; requires more work to be able to split all vector copies in all
19; contexts.
20;
21; This was only an issue if the value was used directly in another
22; block. If there was an intermediate operation or a phi it was fine,
23; since that didn't look like an ABI copy.
24
25
26define float @call_split_type_used_outside_block_v2f32() #0 {
27; GCN-LABEL: call_split_type_used_outside_block_v2f32:
28; GCN:       ; %bb.0: ; %bb0
29; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30; GCN-NEXT:    s_mov_b32 s16, s33
31; GCN-NEXT:    s_mov_b32 s33, s32
32; GCN-NEXT:    s_or_saveexec_b64 s[18:19], -1
33; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
34; GCN-NEXT:    s_mov_b64 exec, s[18:19]
35; GCN-NEXT:    v_writelane_b32 v40, s16, 2
36; GCN-NEXT:    s_addk_i32 s32, 0x400
37; GCN-NEXT:    v_writelane_b32 v40, s30, 0
38; GCN-NEXT:    v_writelane_b32 v40, s31, 1
39; GCN-NEXT:    s_getpc_b64 s[16:17]
40; GCN-NEXT:    s_add_u32 s16, s16, func_v2f32@rel32@lo+4
41; GCN-NEXT:    s_addc_u32 s17, s17, func_v2f32@rel32@hi+12
42; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
43; GCN-NEXT:    v_readlane_b32 s31, v40, 1
44; GCN-NEXT:    v_readlane_b32 s30, v40, 0
45; GCN-NEXT:    s_mov_b32 s32, s33
46; GCN-NEXT:    v_readlane_b32 s4, v40, 2
47; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
48; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
49; GCN-NEXT:    s_mov_b64 exec, s[6:7]
50; GCN-NEXT:    s_mov_b32 s33, s4
51; GCN-NEXT:    s_waitcnt vmcnt(0)
52; GCN-NEXT:    s_setpc_b64 s[30:31]
53bb0:
54  %split.ret.type = call <2 x float> @func_v2f32()
55  br label %bb1
56
57bb1:
58  %extract = extractelement <2 x float> %split.ret.type, i32 0
59  ret float %extract
60}
61
; <3 x float> case: @func_v3f32 is called in %bb0 and its result is only
; consumed in %bb1, where element 0 is extracted and returned. Same
; cross-block direct use of a call result as the v2f32 test, with a
; three-element vector return type.
; The GCN lines are the autogenerated expected llc output for this function.
62define float @call_split_type_used_outside_block_v3f32() #0 {
63; GCN-LABEL: call_split_type_used_outside_block_v3f32:
64; GCN:       ; %bb.0: ; %bb0
65; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
66; GCN-NEXT:    s_mov_b32 s16, s33
67; GCN-NEXT:    s_mov_b32 s33, s32
68; GCN-NEXT:    s_or_saveexec_b64 s[18:19], -1
69; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
70; GCN-NEXT:    s_mov_b64 exec, s[18:19]
71; GCN-NEXT:    v_writelane_b32 v40, s16, 2
72; GCN-NEXT:    s_addk_i32 s32, 0x400
73; GCN-NEXT:    v_writelane_b32 v40, s30, 0
74; GCN-NEXT:    v_writelane_b32 v40, s31, 1
75; GCN-NEXT:    s_getpc_b64 s[16:17]
76; GCN-NEXT:    s_add_u32 s16, s16, func_v3f32@rel32@lo+4
77; GCN-NEXT:    s_addc_u32 s17, s17, func_v3f32@rel32@hi+12
78; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
79; GCN-NEXT:    v_readlane_b32 s31, v40, 1
80; GCN-NEXT:    v_readlane_b32 s30, v40, 0
81; GCN-NEXT:    s_mov_b32 s32, s33
82; GCN-NEXT:    v_readlane_b32 s4, v40, 2
83; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
84; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
85; GCN-NEXT:    s_mov_b64 exec, s[6:7]
86; GCN-NEXT:    s_mov_b32 s33, s4
87; GCN-NEXT:    s_waitcnt vmcnt(0)
88; GCN-NEXT:    s_setpc_b64 s[30:31]
89bb0:
90  %split.ret.type = call <3 x float> @func_v3f32()
91  br label %bb1
92
93bb1:
94  %extract = extractelement <3 x float> %split.ret.type, i32 0
95  ret float %extract
96}
97
; <4 x half> case: @func_v4f16 is called in %bb0 and its result is only
; consumed in %bb1, where element 0 is extracted and returned as a half.
; Same cross-block direct use of a call result as the float vector tests,
; with a 16-bit element type.
; The GCN lines are the autogenerated expected llc output for this function.
98define half @call_split_type_used_outside_block_v4f16() #0 {
99; GCN-LABEL: call_split_type_used_outside_block_v4f16:
100; GCN:       ; %bb.0: ; %bb0
101; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
102; GCN-NEXT:    s_mov_b32 s16, s33
103; GCN-NEXT:    s_mov_b32 s33, s32
104; GCN-NEXT:    s_or_saveexec_b64 s[18:19], -1
105; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
106; GCN-NEXT:    s_mov_b64 exec, s[18:19]
107; GCN-NEXT:    v_writelane_b32 v40, s16, 2
108; GCN-NEXT:    s_addk_i32 s32, 0x400
109; GCN-NEXT:    v_writelane_b32 v40, s30, 0
110; GCN-NEXT:    v_writelane_b32 v40, s31, 1
111; GCN-NEXT:    s_getpc_b64 s[16:17]
112; GCN-NEXT:    s_add_u32 s16, s16, func_v4f16@rel32@lo+4
113; GCN-NEXT:    s_addc_u32 s17, s17, func_v4f16@rel32@hi+12
114; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
115; GCN-NEXT:    v_readlane_b32 s31, v40, 1
116; GCN-NEXT:    v_readlane_b32 s30, v40, 0
117; GCN-NEXT:    s_mov_b32 s32, s33
118; GCN-NEXT:    v_readlane_b32 s4, v40, 2
119; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
120; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
121; GCN-NEXT:    s_mov_b64 exec, s[6:7]
122; GCN-NEXT:    s_mov_b32 s33, s4
123; GCN-NEXT:    s_waitcnt vmcnt(0)
124; GCN-NEXT:    s_setpc_b64 s[30:31]
125bb0:
126  %split.ret.type = call <4 x half> @func_v4f16()
127  br label %bb1
128
129bb1:
130  %extract = extractelement <4 x half> %split.ret.type, i32 0
131  ret half %extract
132}
133
; Aggregate case: @func_struct returns { <4 x i32>, <4 x half> } in %bb0 and
; the aggregate is only taken apart in %bb1. There, element 0 of each vector
; field is extracted and repacked into the { i32, half } return value.
; Compared with the vector-only tests, the expected output contains one
; additional instruction after the call, `v_mov_b32_e32 v1, v4`.
; NOTE(review): presumably this moves element 0 of the <4 x half> field into
; the second return register -- confirm against the calling convention.
; The GCN lines are the autogenerated expected llc output for this function.
134define { i32, half } @call_split_type_used_outside_block_struct() #0 {
135; GCN-LABEL: call_split_type_used_outside_block_struct:
136; GCN:       ; %bb.0: ; %bb0
137; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
138; GCN-NEXT:    s_mov_b32 s16, s33
139; GCN-NEXT:    s_mov_b32 s33, s32
140; GCN-NEXT:    s_or_saveexec_b64 s[18:19], -1
141; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
142; GCN-NEXT:    s_mov_b64 exec, s[18:19]
143; GCN-NEXT:    v_writelane_b32 v40, s16, 2
144; GCN-NEXT:    s_addk_i32 s32, 0x400
145; GCN-NEXT:    v_writelane_b32 v40, s30, 0
146; GCN-NEXT:    v_writelane_b32 v40, s31, 1
147; GCN-NEXT:    s_getpc_b64 s[16:17]
148; GCN-NEXT:    s_add_u32 s16, s16, func_struct@rel32@lo+4
149; GCN-NEXT:    s_addc_u32 s17, s17, func_struct@rel32@hi+12
150; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
151; GCN-NEXT:    v_readlane_b32 s31, v40, 1
152; GCN-NEXT:    v_readlane_b32 s30, v40, 0
153; GCN-NEXT:    v_mov_b32_e32 v1, v4
154; GCN-NEXT:    s_mov_b32 s32, s33
155; GCN-NEXT:    v_readlane_b32 s4, v40, 2
156; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
157; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
158; GCN-NEXT:    s_mov_b64 exec, s[6:7]
159; GCN-NEXT:    s_mov_b32 s33, s4
160; GCN-NEXT:    s_waitcnt vmcnt(0)
161; GCN-NEXT:    s_setpc_b64 s[30:31]
162bb0:
163  %split.ret.type = call { <4 x i32>, <4 x half> } @func_struct()
164  br label %bb1
165
166bb1:
167  %val0 = extractvalue { <4 x i32>, <4 x half> } %split.ret.type, 0
168  %val1 = extractvalue { <4 x i32>, <4 x half> } %split.ret.type, 1
169  %extract0 = extractelement <4 x i32> %val0, i32 0
170  %extract1 = extractelement <4 x half> %val1, i32 0
171  %ins0 = insertvalue { i32, half } undef, i32 %extract0, 0
172  %ins1 = insertvalue { i32, half } %ins0, half %extract1, 1
173  ret { i32, half } %ins1
174}
175
; Kernel variant with a <3 x i16> call result that reaches its user through
; a phi. %entry branches on %cond: %if.else tail-calls @func_v3i16, while
; %if.then falls straight through. %if.end merges the call result with
; zeroinitializer and stores the merged vector.
; The expected output zeroes v0 and v1 on the path that skips the call
; (.LBB4_2) and then stores from v1 (short) and v0 (dword) in %if.end.
; NOTE(review): the store address is `undef`, so only the data registers in
; the checked stores are meaningful, not the address operand.
; The GCN lines are the autogenerated expected llc output for this function.
176define amdgpu_kernel void @v3i16_registers(i1 %cond) #0 {
177; GCN-LABEL: v3i16_registers:
178; GCN:       ; %bb.0: ; %entry
179; GCN-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
180; GCN-NEXT:    s_load_dword s12, s[8:9], 0x0
181; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
182; GCN-NEXT:    s_add_u32 s0, s0, s17
183; GCN-NEXT:    s_addc_u32 s1, s1, 0
184; GCN-NEXT:    s_mov_b32 s32, 0
185; GCN-NEXT:    s_waitcnt lgkmcnt(0)
186; GCN-NEXT:    s_bitcmp1_b32 s12, 0
187; GCN-NEXT:    s_cselect_b64 s[12:13], -1, 0
188; GCN-NEXT:    s_and_b64 vcc, exec, s[12:13]
189; GCN-NEXT:    s_cbranch_vccnz .LBB4_2
190; GCN-NEXT:  ; %bb.1: ; %if.else
191; GCN-NEXT:    s_add_u32 s8, s8, 8
192; GCN-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
193; GCN-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
194; GCN-NEXT:    s_addc_u32 s9, s9, 0
195; GCN-NEXT:    v_or3_b32 v31, v0, v1, v2
196; GCN-NEXT:    s_mov_b32 s12, s14
197; GCN-NEXT:    s_mov_b32 s13, s15
198; GCN-NEXT:    s_mov_b32 s14, s16
199; GCN-NEXT:    s_getpc_b64 s[18:19]
200; GCN-NEXT:    s_add_u32 s18, s18, func_v3i16@rel32@lo+4
201; GCN-NEXT:    s_addc_u32 s19, s19, func_v3i16@rel32@hi+12
202; GCN-NEXT:    s_swappc_b64 s[30:31], s[18:19]
203; GCN-NEXT:    s_branch .LBB4_3
204; GCN-NEXT:  .LBB4_2:
205; GCN-NEXT:    v_mov_b32_e32 v0, 0
206; GCN-NEXT:    v_mov_b32_e32 v1, 0
207; GCN-NEXT:  .LBB4_3: ; %if.end
208; GCN-NEXT:    global_store_short v[0:1], v1, off
209; GCN-NEXT:    global_store_dword v[0:1], v0, off
210; GCN-NEXT:    s_endpgm
211entry:
212  br i1 %cond, label %if.then, label %if.else
213
214if.then:                                          ; preds = %entry
215  br label %if.end
216
217if.else:                                          ; preds = %entry
218  %call6 = tail call <3 x i16> @func_v3i16() #0
219  br label %if.end
220
221if.end:                                           ; preds = %if.else, %if.then
222  %call6.sink = phi <3 x i16> [ %call6, %if.else ], [ zeroinitializer, %if.then ]
223  store <3 x i16> %call6.sink, ptr addrspace(1) undef
224  ret void
225}
226
; Kernel variant with a <3 x half> call result that reaches its user through
; a phi; structurally identical to @v3i16_registers but with a half element
; type. %entry branches on %cond: %if.else tail-calls @func_v3f16, while
; %if.then falls straight through. %if.end merges the call result with
; zeroinitializer and stores the merged vector.
; The expected output zeroes v0 and v1 on the path that skips the call
; (.LBB5_2) and then stores from v1 (short) and v0 (dword) in %if.end.
; NOTE(review): the store address is `undef`, so only the data registers in
; the checked stores are meaningful, not the address operand.
; The GCN lines are the autogenerated expected llc output for this function.
227define amdgpu_kernel void @v3f16_registers(i1 %cond) #0 {
228; GCN-LABEL: v3f16_registers:
229; GCN:       ; %bb.0: ; %entry
230; GCN-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
231; GCN-NEXT:    s_load_dword s12, s[8:9], 0x0
232; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
233; GCN-NEXT:    s_add_u32 s0, s0, s17
234; GCN-NEXT:    s_addc_u32 s1, s1, 0
235; GCN-NEXT:    s_mov_b32 s32, 0
236; GCN-NEXT:    s_waitcnt lgkmcnt(0)
237; GCN-NEXT:    s_bitcmp1_b32 s12, 0
238; GCN-NEXT:    s_cselect_b64 s[12:13], -1, 0
239; GCN-NEXT:    s_and_b64 vcc, exec, s[12:13]
240; GCN-NEXT:    s_cbranch_vccnz .LBB5_2
241; GCN-NEXT:  ; %bb.1: ; %if.else
242; GCN-NEXT:    s_add_u32 s8, s8, 8
243; GCN-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
244; GCN-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
245; GCN-NEXT:    s_addc_u32 s9, s9, 0
246; GCN-NEXT:    v_or3_b32 v31, v0, v1, v2
247; GCN-NEXT:    s_mov_b32 s12, s14
248; GCN-NEXT:    s_mov_b32 s13, s15
249; GCN-NEXT:    s_mov_b32 s14, s16
250; GCN-NEXT:    s_getpc_b64 s[18:19]
251; GCN-NEXT:    s_add_u32 s18, s18, func_v3f16@rel32@lo+4
252; GCN-NEXT:    s_addc_u32 s19, s19, func_v3f16@rel32@hi+12
253; GCN-NEXT:    s_swappc_b64 s[30:31], s[18:19]
254; GCN-NEXT:    s_branch .LBB5_3
255; GCN-NEXT:  .LBB5_2:
256; GCN-NEXT:    v_mov_b32_e32 v0, 0
257; GCN-NEXT:    v_mov_b32_e32 v1, 0
258; GCN-NEXT:  .LBB5_3: ; %if.end
259; GCN-NEXT:    global_store_short v[0:1], v1, off
260; GCN-NEXT:    global_store_dword v[0:1], v0, off
261; GCN-NEXT:    s_endpgm
262entry:
263  br i1 %cond, label %if.then, label %if.else
264
265if.then:                                          ; preds = %entry
266  br label %if.end
267
268if.else:                                          ; preds = %entry
269  %call6 = tail call <3 x half> @func_v3f16() #0
270  br label %if.end
271
272if.end:                                           ; preds = %if.else, %if.then
273  %call6.sink = phi <3 x half> [ %call6, %if.else ], [ zeroinitializer, %if.then ]
274  store <3 x half> %call6.sink, ptr addrspace(1) undef
275  ret void
276}
277
; External callees for the tests in this file. Each is a bodiless `hidden`
; declaration, so the tests only exercise the caller side of the call.
; NOTE(review): @func_v4f32 is not called by any function visible in this
; file -- confirm whether it is still needed.
; NOTE(review): @func_v3i16 and @func_v3f16 are declared without #0, although
; their call sites carry #0 -- confirm this difference is intentional.
278declare hidden <2 x float> @func_v2f32() #0
279declare hidden <3 x float> @func_v3f32() #0
280declare hidden <4 x float> @func_v4f32() #0
281declare hidden <4 x half> @func_v4f16() #0
282declare hidden <3 x i16> @func_v3i16()
283declare hidden <3 x half> @func_v3f16()
284
; Callee whose aggregate return type mixes a 32-bit and a 16-bit vector field.
285declare hidden { <4 x i32>, <4 x half> } @func_struct() #0
286
; Attribute group #0 carries only `nounwind`.
287attributes #0 = { nounwind}
288
; Module flag setting amdhsa_code_object_version to 500.
; NOTE(review): presumably this selects HSA code object v5 -- confirm.
289!llvm.module.flags = !{!0}
290!0 = !{i32 1, !"amdhsa_code_object_version", i32 500}
291