; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

; SelectionDAG builder was using the IR value kind to decide how to
; split the types for copyToRegs/copyFromRegs in all contexts. This
; was incorrect if the ABI-like value such as a call was used outside
; of the block. The value in that case is not used directly, but
; through another set of copies to potentially different register
; types in the parent block.

; This would then end up producing inconsistent pairs of copies with
; the wrong sizes when the vector type result from the call was split
; into multiple pieces, but expected to be a single register in the
; cross-block copy.
;
; This isn't exactly ideal for AMDGPU, since in reality the
; intermediate vector register type is undesirable anyway, but it
; requires more work to be able to split all vector copies in all
; contexts.
;
; This was only an issue if the value was used directly in another
; block. If there was an intermediate operation or a phi it was fine,
; since that didn't look like an ABI copy.


; <2 x float> case: the call result is defined in %bb0 and its only use
; (extractelement of lane 0, which is then returned) is in %bb1, so the
; value has to cross a basic-block boundary.
define float @call_split_type_used_outside_block_v2f32() #0 {
; GCN-LABEL: call_split_type_used_outside_block_v2f32:
; GCN:       ; %bb.0: ; %bb0
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    s_mov_b32 s16, s33
; GCN-NEXT:    s_mov_b32 s33, s32
; GCN-NEXT:    s_or_saveexec_b64 s[18:19], -1
; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT:    s_mov_b64 exec, s[18:19]
; GCN-NEXT:    v_writelane_b32 v40, s16, 2
; GCN-NEXT:    s_addk_i32 s32, 0x400
; GCN-NEXT:    v_writelane_b32 v40, s30, 0
; GCN-NEXT:    v_writelane_b32 v40, s31, 1
; GCN-NEXT:    s_getpc_b64 s[16:17]
; GCN-NEXT:    s_add_u32 s16, s16, func_v2f32@rel32@lo+4
; GCN-NEXT:    s_addc_u32 s17, s17, func_v2f32@rel32@hi+12
; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; GCN-NEXT:    v_readlane_b32 s31, v40, 1
; GCN-NEXT:    v_readlane_b32 s30, v40, 0
; GCN-NEXT:    s_mov_b32 s32, s33
; GCN-NEXT:    v_readlane_b32 s4, v40, 2
; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT:    s_mov_b64 exec, s[6:7]
; GCN-NEXT:    s_mov_b32 s33, s4
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    s_setpc_b64 s[30:31]
bb0:
  %split.ret.type = call <2 x float> @func_v2f32()
  br label %bb1

bb1:
  %extract = extractelement <2 x float> %split.ret.type, i32 0
  ret float %extract
}

; <3 x float> case: same shape as the v2f32 test, with a three-element
; vector returned by the callee and lane 0 extracted in %bb1.
define float @call_split_type_used_outside_block_v3f32() #0 {
; GCN-LABEL: call_split_type_used_outside_block_v3f32:
; GCN:       ; %bb.0: ; %bb0
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    s_mov_b32 s16, s33
; GCN-NEXT:    s_mov_b32 s33, s32
; GCN-NEXT:    s_or_saveexec_b64 s[18:19], -1
; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT:    s_mov_b64 exec, s[18:19]
; GCN-NEXT:    v_writelane_b32 v40, s16, 2
; GCN-NEXT:    s_addk_i32 s32, 0x400
; GCN-NEXT:    v_writelane_b32 v40, s30, 0
; GCN-NEXT:    v_writelane_b32 v40, s31, 1
; GCN-NEXT:    s_getpc_b64 s[16:17]
; GCN-NEXT:    s_add_u32 s16, s16, func_v3f32@rel32@lo+4
; GCN-NEXT:    s_addc_u32 s17, s17, func_v3f32@rel32@hi+12
; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; GCN-NEXT:    v_readlane_b32 s31, v40, 1
; GCN-NEXT:    v_readlane_b32 s30, v40, 0
; GCN-NEXT:    s_mov_b32 s32, s33
; GCN-NEXT:    v_readlane_b32 s4, v40, 2
; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT:    s_mov_b64 exec, s[6:7]
; GCN-NEXT:    s_mov_b32 s33, s4
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    s_setpc_b64 s[30:31]
bb0:
  %split.ret.type = call <3 x float> @func_v3f32()
  br label %bb1

bb1:
  %extract = extractelement <3 x float> %split.ret.type, i32 0
  ret float %extract
}

; <4 x half> case: the callee returns a vector of halves; %bb1 extracts
; lane 0 and returns it as a scalar half.
define half @call_split_type_used_outside_block_v4f16() #0 {
; GCN-LABEL: call_split_type_used_outside_block_v4f16:
; GCN:       ; %bb.0: ; %bb0
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    s_mov_b32 s16, s33
; GCN-NEXT:    s_mov_b32 s33, s32
; GCN-NEXT:    s_or_saveexec_b64 s[18:19], -1
; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT:    s_mov_b64 exec, s[18:19]
; GCN-NEXT:    v_writelane_b32 v40, s16, 2
; GCN-NEXT:    s_addk_i32 s32, 0x400
; GCN-NEXT:    v_writelane_b32 v40, s30, 0
; GCN-NEXT:    v_writelane_b32 v40, s31, 1
; GCN-NEXT:    s_getpc_b64 s[16:17]
; GCN-NEXT:    s_add_u32 s16, s16, func_v4f16@rel32@lo+4
; GCN-NEXT:    s_addc_u32 s17, s17, func_v4f16@rel32@hi+12
; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; GCN-NEXT:    v_readlane_b32 s31, v40, 1
; GCN-NEXT:    v_readlane_b32 s30, v40, 0
; GCN-NEXT:    s_mov_b32 s32, s33
; GCN-NEXT:    v_readlane_b32 s4, v40, 2
; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT:    s_mov_b64 exec, s[6:7]
; GCN-NEXT:    s_mov_b32 s33, s4
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    s_setpc_b64 s[30:31]
bb0:
  %split.ret.type = call <4 x half> @func_v4f16()
  br label %bb1

bb1:
  %extract = extractelement <4 x half> %split.ret.type, i32 0
  ret half %extract
}

; Aggregate case: the callee returns { <4 x i32>, <4 x half> }. %bb1 pulls
; lane 0 out of each member and repacks the two scalars into the
; { i32, half } return value.
define { i32, half } @call_split_type_used_outside_block_struct() #0 {
; GCN-LABEL: call_split_type_used_outside_block_struct:
; GCN:       ; %bb.0: ; %bb0
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    s_mov_b32 s16, s33
; GCN-NEXT:    s_mov_b32 s33, s32
; GCN-NEXT:    s_or_saveexec_b64 s[18:19], -1
; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT:    s_mov_b64 exec, s[18:19]
; GCN-NEXT:    v_writelane_b32 v40, s16, 2
; GCN-NEXT:    s_addk_i32 s32, 0x400
; GCN-NEXT:    v_writelane_b32 v40, s30, 0
; GCN-NEXT:    v_writelane_b32 v40, s31, 1
; GCN-NEXT:    s_getpc_b64 s[16:17]
; GCN-NEXT:    s_add_u32 s16, s16, func_struct@rel32@lo+4
; GCN-NEXT:    s_addc_u32 s17, s17, func_struct@rel32@hi+12
; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; GCN-NEXT:    v_readlane_b32 s31, v40, 1
; GCN-NEXT:    v_readlane_b32 s30, v40, 0
; GCN-NEXT:    v_mov_b32_e32 v1, v4
; GCN-NEXT:    s_mov_b32 s32, s33
; GCN-NEXT:    v_readlane_b32 s4, v40, 2
; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT:    s_mov_b64 exec, s[6:7]
; GCN-NEXT:    s_mov_b32 s33, s4
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    s_setpc_b64 s[30:31]
bb0:
  %split.ret.type = call { <4 x i32>, <4 x half> } @func_struct()
  br label %bb1

bb1:
  %val0 = extractvalue { <4 x i32>, <4 x half> } %split.ret.type, 0
  %val1 = extractvalue { <4 x i32>, <4 x half> } %split.ret.type, 1
  %extract0 = extractelement <4 x i32> %val0, i32 0
  %extract1 = extractelement <4 x half> %val1, i32 0
  %ins0 = insertvalue { i32, half } undef, i32 %extract0, 0
  %ins1 = insertvalue { i32, half } %ins0, half %extract1, 1
  ret { i32, half } %ins1
}

; Kernel case with a phi: the <3 x i16> call result produced in %if.else
; is merged with zeroinitializer (incoming from %if.then) in %if.end and
; then stored. The store address is undef in the IR.
define amdgpu_kernel void @v3i16_registers(i1 %cond) #0 {
; GCN-LABEL: v3i16_registers:
; GCN:       ; %bb.0: ; %entry
; GCN-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
; GCN-NEXT:    s_load_dword s12, s[8:9], 0x0
; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
; GCN-NEXT:    s_add_u32 s0, s0, s17
; GCN-NEXT:    s_addc_u32 s1, s1, 0
; GCN-NEXT:    s_mov_b32 s32, 0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_bitcmp1_b32 s12, 0
; GCN-NEXT:    s_cselect_b64 s[12:13], -1, 0
; GCN-NEXT:    s_and_b64 vcc, exec, s[12:13]
; GCN-NEXT:    s_cbranch_vccnz .LBB4_2
; GCN-NEXT:  ; %bb.1: ; %if.else
; GCN-NEXT:    s_add_u32 s8, s8, 8
; GCN-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
; GCN-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
; GCN-NEXT:    s_addc_u32 s9, s9, 0
; GCN-NEXT:    v_or3_b32 v31, v0, v1, v2
; GCN-NEXT:    s_mov_b32 s12, s14
; GCN-NEXT:    s_mov_b32 s13, s15
; GCN-NEXT:    s_mov_b32 s14, s16
; GCN-NEXT:    s_getpc_b64 s[18:19]
; GCN-NEXT:    s_add_u32 s18, s18, func_v3i16@rel32@lo+4
; GCN-NEXT:    s_addc_u32 s19, s19, func_v3i16@rel32@hi+12
; GCN-NEXT:    s_swappc_b64 s[30:31], s[18:19]
; GCN-NEXT:    s_branch .LBB4_3
; GCN-NEXT:  .LBB4_2:
; GCN-NEXT:    v_mov_b32_e32 v0, 0
; GCN-NEXT:    v_mov_b32_e32 v1, 0
; GCN-NEXT:  .LBB4_3: ; %if.end
; GCN-NEXT:    global_store_short v[0:1], v1, off
; GCN-NEXT:    global_store_dword v[0:1], v0, off
; GCN-NEXT:    s_endpgm
entry:
  br i1 %cond, label %if.then, label %if.else

if.then:                                          ; preds = %entry
  br label %if.end

if.else:                                          ; preds = %entry
  %call6 = tail call <3 x i16> @func_v3i16() #0
  br label %if.end

if.end:                                           ; preds = %if.else, %if.then
  %call6.sink = phi <3 x i16> [ %call6, %if.else ], [ zeroinitializer, %if.then ]
  store <3 x i16> %call6.sink, ptr addrspace(1) undef
  ret void
}

; Kernel case with a phi, <3 x half> flavour: identical control flow to
; @v3i16_registers, but the callee returns <3 x half>.
define amdgpu_kernel void @v3f16_registers(i1 %cond) #0 {
; GCN-LABEL: v3f16_registers:
; GCN:       ; %bb.0: ; %entry
; GCN-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
; GCN-NEXT:    s_load_dword s12, s[8:9], 0x0
; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
; GCN-NEXT:    s_add_u32 s0, s0, s17
; GCN-NEXT:    s_addc_u32 s1, s1, 0
; GCN-NEXT:    s_mov_b32 s32, 0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_bitcmp1_b32 s12, 0
; GCN-NEXT:    s_cselect_b64 s[12:13], -1, 0
; GCN-NEXT:    s_and_b64 vcc, exec, s[12:13]
; GCN-NEXT:    s_cbranch_vccnz .LBB5_2
; GCN-NEXT:  ; %bb.1: ; %if.else
; GCN-NEXT:    s_add_u32 s8, s8, 8
; GCN-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
; GCN-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
; GCN-NEXT:    s_addc_u32 s9, s9, 0
; GCN-NEXT:    v_or3_b32 v31, v0, v1, v2
; GCN-NEXT:    s_mov_b32 s12, s14
; GCN-NEXT:    s_mov_b32 s13, s15
; GCN-NEXT:    s_mov_b32 s14, s16
; GCN-NEXT:    s_getpc_b64 s[18:19]
; GCN-NEXT:    s_add_u32 s18, s18, func_v3f16@rel32@lo+4
; GCN-NEXT:    s_addc_u32 s19, s19, func_v3f16@rel32@hi+12
; GCN-NEXT:    s_swappc_b64 s[30:31], s[18:19]
; GCN-NEXT:    s_branch .LBB5_3
; GCN-NEXT:  .LBB5_2:
; GCN-NEXT:    v_mov_b32_e32 v0, 0
; GCN-NEXT:    v_mov_b32_e32 v1, 0
; GCN-NEXT:  .LBB5_3: ; %if.end
; GCN-NEXT:    global_store_short v[0:1], v1, off
; GCN-NEXT:    global_store_dword v[0:1], v0, off
; GCN-NEXT:    s_endpgm
entry:
  br i1 %cond, label %if.then, label %if.else

if.then:                                          ; preds = %entry
  br label %if.end

if.else:                                          ; preds = %entry
  %call6 = tail call <3 x half> @func_v3f16() #0
  br label %if.end

if.end:                                           ; preds = %if.else, %if.then
  %call6.sink = phi <3 x half> [ %call6, %if.else ], [ zeroinitializer, %if.then ]
  store <3 x half> %call6.sink, ptr addrspace(1) undef
  ret void
}

; External callees. Only their return types matter to the tests above.
; NOTE(review): @func_v4f32 is declared but has no call site in this file;
; kept as-is, confirm whether it can be dropped.
declare hidden <2 x float> @func_v2f32() #0
declare hidden <3 x float> @func_v3f32() #0
declare hidden <4 x float> @func_v4f32() #0
declare hidden <4 x half> @func_v4f16() #0
declare hidden <3 x i16> @func_v3i16()
declare hidden <3 x half> @func_v3f16()

declare hidden { <4 x i32>, <4 x half> } @func_struct() #0

attributes #0 = { nounwind }

!llvm.module.flags = !{!0}
!0 = !{i32 1, !"amdhsa_code_object_version", i32 500}