; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 -verify-machineinstrs -o - %s | FileCheck %s

; Make sure the waterfall loop does not fail the verifier after regalloc fast.
;
; FIXME: There are a lot of extra spills that aren't needed. This is due to the
; unmerge_merge combine running after RegBankSelect, which inserts a lot of COPY
; instructions, while the original merge instruction (G_BUILD_VECTOR) stays
; because it has more than one use.
; Those spills are not present when optimizations are enabled.
;
; What the expected output below shows: the <8 x i32> argument arrives in v0-v7
; (it is copied and spilled from those registers on entry), while image_sample
; consumes its 8-dword operand from SGPRs (s[8:15]). The generated code reads
; the value with v_readfirstlane_b32, compares it against the per-lane values
; with v_cmp_eq_u64, restricts exec with s_and_saveexec_b32, issues the sample,
; and branches back to .LBB0_1 via s_cbranch_execnz until no lanes remain.
; The original exec mask is kept in lane 4 of v16 and restored in %bb.3.
define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
; CHECK-LABEL: waterfall_loop:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_xor_saveexec_b32 s4, -1
; CHECK-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b32 exec_lo, s4
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; CHECK-NEXT: v_mov_b32_e32 v14, v1
; CHECK-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; CHECK-NEXT: v_mov_b32_e32 v13, v2
; CHECK-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; CHECK-NEXT: v_mov_b32_e32 v12, v3
; CHECK-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; CHECK-NEXT: v_mov_b32_e32 v11, v4
; CHECK-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; CHECK-NEXT: v_mov_b32_e32 v10, v5
; CHECK-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; CHECK-NEXT: v_mov_b32_e32 v9, v6
; CHECK-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; CHECK-NEXT: v_mov_b32_e32 v8, v7
; CHECK-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 killed $exec
; CHECK-NEXT: v_mov_b32_e32 v1, v14
; CHECK-NEXT: v_mov_b32_e32 v2, v13
; CHECK-NEXT: v_mov_b32_e32 v3, v12
; CHECK-NEXT: v_mov_b32_e32 v4, v11
; CHECK-NEXT: v_mov_b32_e32 v5, v10
; CHECK-NEXT: v_mov_b32_e32 v6, v9
; CHECK-NEXT: v_mov_b32_e32 v7, v8
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b32 s8, 0
; CHECK-NEXT: s_mov_b32 s4, s8
; CHECK-NEXT: s_mov_b32 s5, s8
; CHECK-NEXT: s_mov_b32 s6, s8
; CHECK-NEXT: s_mov_b32 s7, s8
; CHECK-NEXT: ; implicit-def: $vgpr16 : SGPR spill to VGPR lane
; CHECK-NEXT: v_writelane_b32 v16, s4, 0
; CHECK-NEXT: v_writelane_b32 v16, s5, 1
; CHECK-NEXT: v_writelane_b32 v16, s6, 2
; CHECK-NEXT: v_writelane_b32 v16, s7, 3
; CHECK-NEXT: s_mov_b32 s4, 0
; CHECK-NEXT: v_mov_b32_e32 v0, s4
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; CHECK-NEXT: v_mov_b32_e32 v0, s4
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b32 s4, exec_lo
; CHECK-NEXT: v_writelane_b32 v16, s4, 4
; CHECK-NEXT: s_or_saveexec_b32 s21, -1
; CHECK-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b32 exec_lo, s21
; CHECK-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: s_or_saveexec_b32 s21, -1
; CHECK-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b32 exec_lo, s21
; CHECK-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_readfirstlane_b32 s12, v7
; CHECK-NEXT: v_readfirstlane_b32 s10, v6
; CHECK-NEXT: v_readfirstlane_b32 s9, v5
; CHECK-NEXT: v_readfirstlane_b32 s8, v4
; CHECK-NEXT: v_readfirstlane_b32 s7, v3
; CHECK-NEXT: v_readfirstlane_b32 s6, v2
; CHECK-NEXT: v_readfirstlane_b32 s5, v1
; CHECK-NEXT: v_readfirstlane_b32 s4, v0
; CHECK-NEXT: ; kill: def $sgpr12 killed $sgpr12 def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
; CHECK-NEXT: s_mov_b32 s13, s10
; CHECK-NEXT: s_mov_b32 s14, s9
; CHECK-NEXT: s_mov_b32 s15, s8
; CHECK-NEXT: s_mov_b32 s16, s7
; CHECK-NEXT: s_mov_b32 s17, s6
; CHECK-NEXT: s_mov_b32 s18, s5
; CHECK-NEXT: s_mov_b32 s19, s4
; CHECK-NEXT: v_writelane_b32 v16, s12, 5
; CHECK-NEXT: v_writelane_b32 v16, s13, 6
; CHECK-NEXT: v_writelane_b32 v16, s14, 7
; CHECK-NEXT: v_writelane_b32 v16, s15, 8
; CHECK-NEXT: v_writelane_b32 v16, s16, 9
; CHECK-NEXT: v_writelane_b32 v16, s17, 10
; CHECK-NEXT: v_writelane_b32 v16, s18, 11
; CHECK-NEXT: v_writelane_b32 v16, s19, 12
; CHECK-NEXT: v_mov_b32_e32 v6, v8
; CHECK-NEXT: v_mov_b32_e32 v7, v9
; CHECK-NEXT: v_mov_b32_e32 v4, v10
; CHECK-NEXT: v_mov_b32_e32 v5, v11
; CHECK-NEXT: v_mov_b32_e32 v2, v12
; CHECK-NEXT: v_mov_b32_e32 v3, v13
; CHECK-NEXT: v_mov_b32_e32 v0, v14
; CHECK-NEXT: v_mov_b32_e32 v1, v15
; CHECK-NEXT: s_mov_b64 s[4:5], s[12:13]
; CHECK-NEXT: s_mov_b64 s[10:11], s[14:15]
; CHECK-NEXT: s_mov_b64 s[8:9], s[16:17]
; CHECK-NEXT: s_mov_b64 s[6:7], s[18:19]
; CHECK-NEXT: v_cmp_eq_u64_e64 s4, s[4:5], v[6:7]
; CHECK-NEXT: v_cmp_eq_u64_e64 s5, s[10:11], v[4:5]
; CHECK-NEXT: s_and_b32 s4, s4, s5
; CHECK-NEXT: v_cmp_eq_u64_e64 s5, s[8:9], v[2:3]
; CHECK-NEXT: s_and_b32 s4, s4, s5
; CHECK-NEXT: v_cmp_eq_u64_e64 s5, s[6:7], v[0:1]
; CHECK-NEXT: s_and_b32 s4, s4, s5
; CHECK-NEXT: s_and_saveexec_b32 s4, s4
; CHECK-NEXT: v_writelane_b32 v16, s4, 13
; CHECK-NEXT: s_or_saveexec_b32 s21, -1
; CHECK-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b32 exec_lo, s21
; CHECK-NEXT: ; %bb.2: ; in Loop: Header=BB0_1 Depth=1
; CHECK-NEXT: s_or_saveexec_b32 s21, -1
; CHECK-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b32 exec_lo, s21
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_readlane_b32 s4, v16, 13
; CHECK-NEXT: v_readlane_b32 s8, v16, 5
; CHECK-NEXT: v_readlane_b32 s9, v16, 6
; CHECK-NEXT: v_readlane_b32 s10, v16, 7
; CHECK-NEXT: v_readlane_b32 s11, v16, 8
; CHECK-NEXT: v_readlane_b32 s12, v16, 9
; CHECK-NEXT: v_readlane_b32 s13, v16, 10
; CHECK-NEXT: v_readlane_b32 s14, v16, 11
; CHECK-NEXT: v_readlane_b32 s15, v16, 12
; CHECK-NEXT: v_readlane_b32 s16, v16, 0
; CHECK-NEXT: v_readlane_b32 s17, v16, 1
; CHECK-NEXT: v_readlane_b32 s18, v16, 2
; CHECK-NEXT: v_readlane_b32 s19, v16, 3
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: image_sample v0, [v0, v1], s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_2D
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; CHECK-NEXT: s_xor_b32 exec_lo, exec_lo, s4
; CHECK-NEXT: s_cbranch_execnz .LBB0_1
; CHECK-NEXT: ; %bb.3:
; CHECK-NEXT: s_or_saveexec_b32 s21, -1
; CHECK-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b32 exec_lo, s21
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_readlane_b32 s4, v16, 4
; CHECK-NEXT: s_mov_b32 exec_lo, s4
; CHECK-NEXT: ; %bb.4:
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; CHECK-NEXT: ; implicit-def: $sgpr4
; CHECK-NEXT: v_mov_b32_e32 v1, s4
; CHECK-NEXT: v_mov_b32_e32 v2, s4
; CHECK-NEXT: v_mov_b32_e32 v3, s4
; CHECK-NEXT: s_xor_saveexec_b32 s4, -1
; CHECK-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b32 exec_lo, s4
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
bb:
  ; Single 2D sample with the first operand set to 1 (emitted as dmask:0x1
  ; above) and both float operands 0.0. %vgpr_srd is passed as the <8 x i32>
  ; operand; the <4 x i32> operand is all zeros and the trailing immediate
  ; operands are false/0.
  %ret = tail call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 1, float 0.000000e+00, float 0.000000e+00, <8 x i32> %vgpr_srd, <4 x i32> zeroinitializer, i1 false, i32 0, i32 0)
  ret <4 x float> %ret
}

; Intrinsic exercised by the test. Operands marked immarg must be immediate
; constants at every call site.
declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 immarg, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0

attributes #0 = { nounwind readonly willreturn }