; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx940 < %s | FileCheck -check-prefix=GFX940 %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX1200 %s

; Every function below calls a struct.ptr.buffer.atomic.fadd intrinsic and
; discards its result (each function returns void), as the "noret" part of the
; function names indicates. The check lines are autogenerated; regenerate them
; with utils/update_llc_test_checks.py instead of editing them by hand.

; f32 add with rsrc and soffset passed inreg and vindex/voffset passed as
; ordinary arguments; the trailing immediate operand of the intrinsic is 0.
define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) #0 {
; GFX908-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT:    buffer_atomic_add_f32 v0, v[1:2], s[16:19], s20 idxen offen
; GFX908-NEXT:    s_waitcnt vmcnt(0)
; GFX908-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
; GFX90A-NEXT:    v_mov_b32_e32 v2, v1
; GFX90A-NEXT:    buffer_atomic_add_f32 v0, v[2:3], s[16:19], s20 idxen offen
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v3, v2
; GFX940-NEXT:    v_mov_b32_e32 v2, v1
; GFX940-NEXT:    buffer_atomic_add_f32 v0, v[2:3], s[0:3], s16 idxen offen
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX1200-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
; GFX1200:       ; %bb.0:
; GFX1200-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX1200-NEXT:    s_wait_expcnt 0x0
; GFX1200-NEXT:    s_wait_samplecnt 0x0
; GFX1200-NEXT:    s_wait_bvhcnt 0x0
; GFX1200-NEXT:    s_wait_kmcnt 0x0
; GFX1200-NEXT:    buffer_atomic_add_f32 v0, v[1:2], s[0:3], s16 idxen offen
; GFX1200-NEXT:    s_setpc_b64 s[30:31]
  %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
  ret void
}

; Natural mapping, no voffset
; The voffset operand is the constant 0, and the expected instructions carry
; only "idxen" (no "offen").
define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 inreg %soffset) #0 {
; GFX908-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT:    buffer_atomic_add_f32 v0, v1, s[16:19], s20 idxen
; GFX908-NEXT:    s_waitcnt vmcnt(0)
; GFX908-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    buffer_atomic_add_f32 v0, v1, s[16:19], s20 idxen
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    buffer_atomic_add_f32 v0, v1, s[0:3], s16 idxen
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX1200-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
; GFX1200:       ; %bb.0:
; GFX1200-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX1200-NEXT:    s_wait_expcnt 0x0
; GFX1200-NEXT:    s_wait_samplecnt 0x0
; GFX1200-NEXT:    s_wait_bvhcnt 0x0
; GFX1200-NEXT:    s_wait_kmcnt 0x0
; GFX1200-NEXT:    buffer_atomic_add_f32 v0, v1, s[0:3], s16 idxen
; GFX1200-NEXT:    s_setpc_b64 s[30:31]
  %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
  ret void
}

; Same operands as the first test, but the trailing immediate operand is 2.
; The expected instruction gains a modifier: "slc" for gfx908 and gfx90a,
; "nt" for gfx940, and "th:TH_ATOMIC_NT" for gfx1200.
define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) #0 {
; GFX908-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT:    buffer_atomic_add_f32 v0, v[1:2], s[16:19], s20 idxen offen slc
; GFX908-NEXT:    s_waitcnt vmcnt(0)
; GFX908-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
; GFX90A-NEXT:    v_mov_b32_e32 v2, v1
; GFX90A-NEXT:    buffer_atomic_add_f32 v0, v[2:3], s[16:19], s20 idxen offen slc
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v3, v2
; GFX940-NEXT:    v_mov_b32_e32 v2, v1
; GFX940-NEXT:    buffer_atomic_add_f32 v0, v[2:3], s[0:3], s16 idxen offen nt
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX1200-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
; GFX1200:       ; %bb.0:
; GFX1200-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX1200-NEXT:    s_wait_expcnt 0x0
; GFX1200-NEXT:    s_wait_samplecnt 0x0
; GFX1200-NEXT:    s_wait_bvhcnt 0x0
; GFX1200-NEXT:    s_wait_kmcnt 0x0
; GFX1200-NEXT:    buffer_atomic_add_f32 v0, v[1:2], s[0:3], s16 idxen offen th:TH_ATOMIC_NT
; GFX1200-NEXT:    s_setpc_b64 s[30:31]
  %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
  ret void
}

; <2 x half> variant of the first test; the expected instruction is
; buffer_atomic_pk_add_f16.
define void @struct_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<2 x half> %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) #0 {
; GFX908-LABEL: struct_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT:    buffer_atomic_pk_add_f16 v0, v[1:2], s[16:19], s20 idxen offen
; GFX908-NEXT:    s_waitcnt vmcnt(0)
; GFX908-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: struct_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
; GFX90A-NEXT:    v_mov_b32_e32 v2, v1
; GFX90A-NEXT:    buffer_atomic_pk_add_f16 v0, v[2:3], s[16:19], s20 idxen offen
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: struct_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v3, v2
; GFX940-NEXT:    v_mov_b32_e32 v2, v1
; GFX940-NEXT:    buffer_atomic_pk_add_f16 v0, v[2:3], s[0:3], s16 idxen offen
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX1200-LABEL: struct_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
; GFX1200:       ; %bb.0:
; GFX1200-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX1200-NEXT:    s_wait_expcnt 0x0
; GFX1200-NEXT:    s_wait_samplecnt 0x0
; GFX1200-NEXT:    s_wait_bvhcnt 0x0
; GFX1200-NEXT:    s_wait_kmcnt 0x0
; GFX1200-NEXT:    buffer_atomic_pk_add_f16 v0, v[1:2], s[0:3], s16 idxen offen
; GFX1200-NEXT:    s_setpc_b64 s[30:31]
  %ret = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
  ret void
}

; Test waterfall loop
; Neither rsrc nor soffset is marked inreg here. The expected code reads them
; with v_readfirstlane_b32 and loops (s_cbranch_execnz) until exec is empty.
define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__vgpr_rsrc__vgpr_voffset__vgpr_soffset(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset) #0 {
; GFX908-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__vgpr_rsrc__vgpr_voffset__vgpr_soffset:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT:    s_mov_b64 s[6:7], exec
; GFX908-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT:    v_readfirstlane_b32 s8, v1
; GFX908-NEXT:    v_readfirstlane_b32 s9, v2
; GFX908-NEXT:    v_readfirstlane_b32 s10, v3
; GFX908-NEXT:    v_readfirstlane_b32 s11, v4
; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[1:2]
; GFX908-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[3:4]
; GFX908-NEXT:    v_readfirstlane_b32 s12, v7
; GFX908-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, s12, v7
; GFX908-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT:    s_nop 0
; GFX908-NEXT:    buffer_atomic_add_f32 v0, v[5:6], s[8:11], s12 idxen offen
; GFX908-NEXT:    ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
; GFX908-NEXT:    ; implicit-def: $vgpr7
; GFX908-NEXT:    ; implicit-def: $vgpr0
; GFX908-NEXT:    ; implicit-def: $vgpr5_vgpr6
; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT:    s_cbranch_execnz .LBB4_1
; GFX908-NEXT:  ; %bb.2:
; GFX908-NEXT:    s_mov_b64 exec, s[6:7]
; GFX908-NEXT:    s_waitcnt vmcnt(0)
; GFX908-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__vgpr_rsrc__vgpr_voffset__vgpr_soffset:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    v_mov_b32_e32 v9, v6
; GFX90A-NEXT:    v_mov_b32_e32 v8, v5
; GFX90A-NEXT:    v_mov_b32_e32 v5, v4
; GFX90A-NEXT:    v_mov_b32_e32 v4, v3
; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
; GFX90A-NEXT:    v_mov_b32_e32 v2, v1
; GFX90A-NEXT:    s_mov_b64 s[6:7], exec
; GFX90A-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT:    v_readfirstlane_b32 s8, v2
; GFX90A-NEXT:    v_readfirstlane_b32 s9, v3
; GFX90A-NEXT:    v_readfirstlane_b32 s10, v4
; GFX90A-NEXT:    v_readfirstlane_b32 s11, v5
; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[2:3]
; GFX90A-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[4:5]
; GFX90A-NEXT:    v_readfirstlane_b32 s12, v7
; GFX90A-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, s12, v7
; GFX90A-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT:    s_nop 0
; GFX90A-NEXT:    buffer_atomic_add_f32 v0, v[8:9], s[8:11], s12 idxen offen
; GFX90A-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
; GFX90A-NEXT:    ; implicit-def: $vgpr7
; GFX90A-NEXT:    ; implicit-def: $vgpr0
; GFX90A-NEXT:    ; implicit-def: $vgpr8_vgpr9
; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT:    s_cbranch_execnz .LBB4_1
; GFX90A-NEXT:  ; %bb.2:
; GFX90A-NEXT:    s_mov_b64 exec, s[6:7]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__vgpr_rsrc__vgpr_voffset__vgpr_soffset:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v9, v6
; GFX940-NEXT:    v_mov_b32_e32 v8, v5
; GFX940-NEXT:    v_mov_b32_e32 v5, v4
; GFX940-NEXT:    v_mov_b32_e32 v4, v3
; GFX940-NEXT:    v_mov_b32_e32 v3, v2
; GFX940-NEXT:    v_mov_b32_e32 v2, v1
; GFX940-NEXT:    s_mov_b64 s[2:3], exec
; GFX940-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT:    v_readfirstlane_b32 s4, v2
; GFX940-NEXT:    v_readfirstlane_b32 s5, v3
; GFX940-NEXT:    v_readfirstlane_b32 s6, v4
; GFX940-NEXT:    v_readfirstlane_b32 s7, v5
; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[2:3]
; GFX940-NEXT:    v_readfirstlane_b32 s8, v7
; GFX940-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[4:5]
; GFX940-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v7
; GFX940-NEXT:    s_and_b64 s[0:1], s[0:1], vcc
; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
; GFX940-NEXT:    buffer_atomic_add_f32 v0, v[8:9], s[4:7], s8 idxen offen
; GFX940-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
; GFX940-NEXT:    ; implicit-def: $vgpr7
; GFX940-NEXT:    ; implicit-def: $vgpr0
; GFX940-NEXT:    ; implicit-def: $vgpr8_vgpr9
; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
; GFX940-NEXT:    s_cbranch_execnz .LBB4_1
; GFX940-NEXT:  ; %bb.2:
; GFX940-NEXT:    s_mov_b64 exec, s[2:3]
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX1200-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__vgpr_rsrc__vgpr_voffset__vgpr_soffset:
; GFX1200:       ; %bb.0:
; GFX1200-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX1200-NEXT:    s_wait_expcnt 0x0
; GFX1200-NEXT:    s_wait_samplecnt 0x0
; GFX1200-NEXT:    s_wait_bvhcnt 0x0
; GFX1200-NEXT:    s_wait_kmcnt 0x0
; GFX1200-NEXT:    s_mov_b32 s2, exec_lo
; GFX1200-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
; GFX1200-NEXT:    v_readfirstlane_b32 s4, v1
; GFX1200-NEXT:    v_readfirstlane_b32 s5, v2
; GFX1200-NEXT:    v_readfirstlane_b32 s6, v3
; GFX1200-NEXT:    v_readfirstlane_b32 s7, v4
; GFX1200-NEXT:    v_readfirstlane_b32 s3, v7
; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1200-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
; GFX1200-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1200-NEXT:    v_cmp_eq_u32_e64 s1, s3, v7
; GFX1200-NEXT:    s_wait_alu 0xfffe
; GFX1200-NEXT:    s_and_b32 s0, vcc_lo, s0
; GFX1200-NEXT:    s_wait_alu 0xfffe
; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX1200-NEXT:    s_and_b32 s0, s0, s1
; GFX1200-NEXT:    s_wait_alu 0xfffe
; GFX1200-NEXT:    s_and_saveexec_b32 s0, s0
; GFX1200-NEXT:    buffer_atomic_add_f32 v0, v[5:6], s[4:7], s3 idxen offen
; GFX1200-NEXT:    ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
; GFX1200-NEXT:    ; implicit-def: $vgpr7
; GFX1200-NEXT:    ; implicit-def: $vgpr0
; GFX1200-NEXT:    ; implicit-def: $vgpr5_vgpr6
; GFX1200-NEXT:    s_wait_alu 0xfffe
; GFX1200-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
; GFX1200-NEXT:    s_cbranch_execnz .LBB4_1
; GFX1200-NEXT:  ; %bb.2:
; GFX1200-NEXT:    s_mov_b32 exec_lo, s2
; GFX1200-NEXT:    s_wait_alu 0xfffe
; GFX1200-NEXT:    s_setpc_b64 s[30:31]
  %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
  ret void
}

; Test waterfall loop
; <2 x half> variant of the previous test; the same loop shape is expected
; around buffer_atomic_pk_add_f16.
define void @struct_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__vgpr_rsrc__vgpr_voffset__vgpr_soffset(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset) #0 {
; GFX908-LABEL: struct_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__vgpr_rsrc__vgpr_voffset__vgpr_soffset:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT:    s_mov_b64 s[6:7], exec
; GFX908-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT:    v_readfirstlane_b32 s8, v1
; GFX908-NEXT:    v_readfirstlane_b32 s9, v2
; GFX908-NEXT:    v_readfirstlane_b32 s10, v3
; GFX908-NEXT:    v_readfirstlane_b32 s11, v4
; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[1:2]
; GFX908-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[3:4]
; GFX908-NEXT:    v_readfirstlane_b32 s12, v7
; GFX908-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, s12, v7
; GFX908-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT:    s_nop 0
; GFX908-NEXT:    buffer_atomic_pk_add_f16 v0, v[5:6], s[8:11], s12 idxen offen
; GFX908-NEXT:    ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
; GFX908-NEXT:    ; implicit-def: $vgpr7
; GFX908-NEXT:    ; implicit-def: $vgpr0
; GFX908-NEXT:    ; implicit-def: $vgpr5_vgpr6
; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT:    s_cbranch_execnz .LBB5_1
; GFX908-NEXT:  ; %bb.2:
; GFX908-NEXT:    s_mov_b64 exec, s[6:7]
; GFX908-NEXT:    s_waitcnt vmcnt(0)
; GFX908-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: struct_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__vgpr_rsrc__vgpr_voffset__vgpr_soffset:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    v_mov_b32_e32 v9, v6
; GFX90A-NEXT:    v_mov_b32_e32 v8, v5
; GFX90A-NEXT:    v_mov_b32_e32 v5, v4
; GFX90A-NEXT:    v_mov_b32_e32 v4, v3
; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
; GFX90A-NEXT:    v_mov_b32_e32 v2, v1
; GFX90A-NEXT:    s_mov_b64 s[6:7], exec
; GFX90A-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT:    v_readfirstlane_b32 s8, v2
; GFX90A-NEXT:    v_readfirstlane_b32 s9, v3
; GFX90A-NEXT:    v_readfirstlane_b32 s10, v4
; GFX90A-NEXT:    v_readfirstlane_b32 s11, v5
; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[2:3]
; GFX90A-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[4:5]
; GFX90A-NEXT:    v_readfirstlane_b32 s12, v7
; GFX90A-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, s12, v7
; GFX90A-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT:    s_nop 0
; GFX90A-NEXT:    buffer_atomic_pk_add_f16 v0, v[8:9], s[8:11], s12 idxen offen
; GFX90A-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
; GFX90A-NEXT:    ; implicit-def: $vgpr7
; GFX90A-NEXT:    ; implicit-def: $vgpr0
; GFX90A-NEXT:    ; implicit-def: $vgpr8_vgpr9
; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT:    s_cbranch_execnz .LBB5_1
; GFX90A-NEXT:  ; %bb.2:
; GFX90A-NEXT:    s_mov_b64 exec, s[6:7]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: struct_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__vgpr_rsrc__vgpr_voffset__vgpr_soffset:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v9, v6
; GFX940-NEXT:    v_mov_b32_e32 v8, v5
; GFX940-NEXT:    v_mov_b32_e32 v5, v4
; GFX940-NEXT:    v_mov_b32_e32 v4, v3
; GFX940-NEXT:    v_mov_b32_e32 v3, v2
; GFX940-NEXT:    v_mov_b32_e32 v2, v1
; GFX940-NEXT:    s_mov_b64 s[2:3], exec
; GFX940-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT:    v_readfirstlane_b32 s4, v2
; GFX940-NEXT:    v_readfirstlane_b32 s5, v3
; GFX940-NEXT:    v_readfirstlane_b32 s6, v4
; GFX940-NEXT:    v_readfirstlane_b32 s7, v5
; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[2:3]
; GFX940-NEXT:    v_readfirstlane_b32 s8, v7
; GFX940-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[4:5]
; GFX940-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v7
; GFX940-NEXT:    s_and_b64 s[0:1], s[0:1], vcc
; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
; GFX940-NEXT:    buffer_atomic_pk_add_f16 v0, v[8:9], s[4:7], s8 idxen offen
; GFX940-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
; GFX940-NEXT:    ; implicit-def: $vgpr7
; GFX940-NEXT:    ; implicit-def: $vgpr0
; GFX940-NEXT:    ; implicit-def: $vgpr8_vgpr9
; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
; GFX940-NEXT:    s_cbranch_execnz .LBB5_1
; GFX940-NEXT:  ; %bb.2:
; GFX940-NEXT:    s_mov_b64 exec, s[2:3]
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX1200-LABEL: struct_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__vgpr_rsrc__vgpr_voffset__vgpr_soffset:
; GFX1200:       ; %bb.0:
; GFX1200-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX1200-NEXT:    s_wait_expcnt 0x0
; GFX1200-NEXT:    s_wait_samplecnt 0x0
; GFX1200-NEXT:    s_wait_bvhcnt 0x0
; GFX1200-NEXT:    s_wait_kmcnt 0x0
; GFX1200-NEXT:    s_mov_b32 s2, exec_lo
; GFX1200-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
; GFX1200-NEXT:    v_readfirstlane_b32 s4, v1
; GFX1200-NEXT:    v_readfirstlane_b32 s5, v2
; GFX1200-NEXT:    v_readfirstlane_b32 s6, v3
; GFX1200-NEXT:    v_readfirstlane_b32 s7, v4
; GFX1200-NEXT:    v_readfirstlane_b32 s3, v7
; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1200-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
; GFX1200-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1200-NEXT:    v_cmp_eq_u32_e64 s1, s3, v7
; GFX1200-NEXT:    s_wait_alu 0xfffe
; GFX1200-NEXT:    s_and_b32 s0, vcc_lo, s0
; GFX1200-NEXT:    s_wait_alu 0xfffe
; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX1200-NEXT:    s_and_b32 s0, s0, s1
; GFX1200-NEXT:    s_wait_alu 0xfffe
; GFX1200-NEXT:    s_and_saveexec_b32 s0, s0
; GFX1200-NEXT:    buffer_atomic_pk_add_f16 v0, v[5:6], s[4:7], s3 idxen offen
; GFX1200-NEXT:    ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
; GFX1200-NEXT:    ; implicit-def: $vgpr7
; GFX1200-NEXT:    ; implicit-def: $vgpr0
; GFX1200-NEXT:    ; implicit-def: $vgpr5_vgpr6
; GFX1200-NEXT:    s_wait_alu 0xfffe
; GFX1200-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
; GFX1200-NEXT:    s_cbranch_execnz .LBB5_1
; GFX1200-NEXT:  ; %bb.2:
; GFX1200-NEXT:    s_mov_b32 exec_lo, s2
; GFX1200-NEXT:    s_wait_alu 0xfffe
; GFX1200-NEXT:    s_setpc_b64 s[30:31]
  %ret = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
  ret void
}

; Intrinsic declarations; the final i32 operand of each must be an immediate.
declare float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float, ptr addrspace(8), i32, i32, i32, i32 immarg)
declare <2 x half> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2f16(<2 x half>, ptr addrspace(8), i32, i32, i32, i32 immarg)

attributes #0 = { nounwind }