; Test overview: every RUN line below compiles this file with llc for an amdgcn
; target, passing -amdgpu-atomic-optimizer-strategy=Iterative and
; -verify-machineinstrs, and pipes the assembly to FileCheck. The check prefixes
; select the expected output per target: GFX6 (no -mcpu), GFX8 (tonga),
; GFX9 (gfx900), GFX10 (gfx1010), GFX11 (gfx1100) and GFX12 (gfx1200); the
; W64/W32 suffixes correspond to the +wavefrontsize64 / +wavefrontsize32 runs.
; The NOTE line states that the assertions are autogenerated by
; utils/update_llc_test_checks.py - presumably they should be regenerated with
; that script rather than edited by hand (confirm against project practice).
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX6 %s 3; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s 4; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s 5; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s 6; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s 7; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s 8; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s 9; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W64 %s 10; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W32 %s 11 12declare i32 @llvm.amdgcn.workitem.id.x() 13declare i32 @llvm.amdgcn.struct.ptr.buffer.atomic.add(i32, ptr addrspace(8), i32, i32, i32, i32) 14declare i32 @llvm.amdgcn.struct.ptr.buffer.atomic.sub(i32, ptr addrspace(8), i32, 
i32, i32, i32) 15 16; Show what the atomic optimization pass will do for struct buffers. 17 18define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace(8) %inout) { 19; GFX6-LABEL: add_i32_constant: 20; GFX6: ; %bb.0: ; %entry 21; GFX6-NEXT: s_mov_b64 s[2:3], exec 22; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 23; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 24; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 25; GFX6-NEXT: ; implicit-def: $vgpr1 26; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc 27; GFX6-NEXT: s_cbranch_execz .LBB0_2 28; GFX6-NEXT: ; %bb.1: 29; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd 30; GFX6-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 31; GFX6-NEXT: s_mul_i32 s2, s2, 5 32; GFX6-NEXT: v_mov_b32_e32 v1, s2 33; GFX6-NEXT: v_mov_b32_e32 v2, 0 34; GFX6-NEXT: s_waitcnt lgkmcnt(0) 35; GFX6-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc 36; GFX6-NEXT: .LBB0_2: 37; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] 38; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 39; GFX6-NEXT: s_mov_b32 s3, 0xf000 40; GFX6-NEXT: s_mov_b32 s2, -1 41; GFX6-NEXT: s_waitcnt vmcnt(0) 42; GFX6-NEXT: v_readfirstlane_b32 s4, v1 43; GFX6-NEXT: v_mad_u32_u24 v0, v0, 5, s4 44; GFX6-NEXT: s_waitcnt lgkmcnt(0) 45; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 46; GFX6-NEXT: s_endpgm 47; 48; GFX8-LABEL: add_i32_constant: 49; GFX8: ; %bb.0: ; %entry 50; GFX8-NEXT: s_mov_b64 s[2:3], exec 51; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 52; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 53; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 54; GFX8-NEXT: ; implicit-def: $vgpr1 55; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc 56; GFX8-NEXT: s_cbranch_execz .LBB0_2 57; GFX8-NEXT: ; %bb.1: 58; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 59; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 60; GFX8-NEXT: s_mul_i32 s2, s2, 5 61; GFX8-NEXT: v_mov_b32_e32 v1, s2 62; GFX8-NEXT: v_mov_b32_e32 v2, 0 63; GFX8-NEXT: s_waitcnt lgkmcnt(0) 64; GFX8-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc 65; GFX8-NEXT: .LBB0_2: 66; 
GFX8-NEXT: s_or_b64 exec, exec, s[0:1] 67; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 68; GFX8-NEXT: s_waitcnt vmcnt(0) 69; GFX8-NEXT: v_readfirstlane_b32 s2, v1 70; GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, s2 71; GFX8-NEXT: s_waitcnt lgkmcnt(0) 72; GFX8-NEXT: v_mov_b32_e32 v0, s0 73; GFX8-NEXT: v_mov_b32_e32 v1, s1 74; GFX8-NEXT: flat_store_dword v[0:1], v2 75; GFX8-NEXT: s_endpgm 76; 77; GFX9-LABEL: add_i32_constant: 78; GFX9: ; %bb.0: ; %entry 79; GFX9-NEXT: s_mov_b64 s[2:3], exec 80; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 81; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 82; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 83; GFX9-NEXT: ; implicit-def: $vgpr1 84; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 85; GFX9-NEXT: s_cbranch_execz .LBB0_2 86; GFX9-NEXT: ; %bb.1: 87; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 88; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 89; GFX9-NEXT: s_mul_i32 s2, s2, 5 90; GFX9-NEXT: v_mov_b32_e32 v1, s2 91; GFX9-NEXT: v_mov_b32_e32 v2, 0 92; GFX9-NEXT: s_waitcnt lgkmcnt(0) 93; GFX9-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc 94; GFX9-NEXT: .LBB0_2: 95; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 96; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 97; GFX9-NEXT: s_waitcnt vmcnt(0) 98; GFX9-NEXT: v_readfirstlane_b32 s2, v1 99; GFX9-NEXT: v_mov_b32_e32 v2, 0 100; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2 101; GFX9-NEXT: s_waitcnt lgkmcnt(0) 102; GFX9-NEXT: global_store_dword v2, v0, s[0:1] 103; GFX9-NEXT: s_endpgm 104; 105; GFX10W64-LABEL: add_i32_constant: 106; GFX10W64: ; %bb.0: ; %entry 107; GFX10W64-NEXT: s_mov_b64 s[2:3], exec 108; GFX10W64-NEXT: ; implicit-def: $vgpr1 109; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 110; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 111; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 112; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc 113; GFX10W64-NEXT: s_cbranch_execz .LBB0_2 114; GFX10W64-NEXT: ; %bb.1: 115; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 116; GFX10W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 117; 
GFX10W64-NEXT: v_mov_b32_e32 v2, 0 118; GFX10W64-NEXT: s_mul_i32 s2, s2, 5 119; GFX10W64-NEXT: v_mov_b32_e32 v1, s2 120; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 121; GFX10W64-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc 122; GFX10W64-NEXT: .LBB0_2: 123; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 124; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] 125; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 126; GFX10W64-NEXT: s_waitcnt vmcnt(0) 127; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 128; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 129; GFX10W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2 130; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 131; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1] 132; GFX10W64-NEXT: s_endpgm 133; 134; GFX10W32-LABEL: add_i32_constant: 135; GFX10W32: ; %bb.0: ; %entry 136; GFX10W32-NEXT: s_mov_b32 s1, exec_lo 137; GFX10W32-NEXT: ; implicit-def: $vgpr1 138; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 139; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 140; GFX10W32-NEXT: s_and_saveexec_b32 s0, vcc_lo 141; GFX10W32-NEXT: s_cbranch_execz .LBB0_2 142; GFX10W32-NEXT: ; %bb.1: 143; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 144; GFX10W32-NEXT: s_bcnt1_i32_b32 s1, s1 145; GFX10W32-NEXT: v_mov_b32_e32 v2, 0 146; GFX10W32-NEXT: s_mul_i32 s1, s1, 5 147; GFX10W32-NEXT: v_mov_b32_e32 v1, s1 148; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 149; GFX10W32-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc 150; GFX10W32-NEXT: .LBB0_2: 151; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 152; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 153; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 154; GFX10W32-NEXT: s_waitcnt vmcnt(0) 155; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1 156; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 157; GFX10W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2 158; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 159; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1] 160; GFX10W32-NEXT: s_endpgm 161; 162; GFX11W64-LABEL: add_i32_constant: 163; GFX11W64: ; %bb.0: ; %entry 164; GFX11W64-NEXT: s_mov_b64 
s[2:3], exec 165; GFX11W64-NEXT: s_mov_b64 s[0:1], exec 166; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 167; GFX11W64-NEXT: ; implicit-def: $vgpr1 168; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 169; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 170; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 171; GFX11W64-NEXT: s_cbranch_execz .LBB0_2 172; GFX11W64-NEXT: ; %bb.1: 173; GFX11W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 174; GFX11W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 175; GFX11W64-NEXT: v_mov_b32_e32 v2, 0 176; GFX11W64-NEXT: s_mul_i32 s2, s2, 5 177; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 178; GFX11W64-NEXT: v_mov_b32_e32 v1, s2 179; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) 180; GFX11W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc 181; GFX11W64-NEXT: .LBB0_2: 182; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] 183; GFX11W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 184; GFX11W64-NEXT: s_waitcnt vmcnt(0) 185; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 186; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 187; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) 188; GFX11W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2 189; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) 190; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1] 191; GFX11W64-NEXT: s_endpgm 192; 193; GFX11W32-LABEL: add_i32_constant: 194; GFX11W32: ; %bb.0: ; %entry 195; GFX11W32-NEXT: s_mov_b32 s1, exec_lo 196; GFX11W32-NEXT: s_mov_b32 s0, exec_lo 197; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 198; GFX11W32-NEXT: ; implicit-def: $vgpr1 199; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) 200; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 201; GFX11W32-NEXT: s_cbranch_execz .LBB0_2 202; GFX11W32-NEXT: ; %bb.1: 203; GFX11W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 204; GFX11W32-NEXT: s_bcnt1_i32_b32 s1, s1 205; GFX11W32-NEXT: v_mov_b32_e32 v2, 0 206; GFX11W32-NEXT: s_mul_i32 s1, s1, 5 207; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 208; GFX11W32-NEXT: v_mov_b32_e32 v1, s1 209; GFX11W32-NEXT: s_waitcnt 
lgkmcnt(0) 210; GFX11W32-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc 211; GFX11W32-NEXT: .LBB0_2: 212; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 213; GFX11W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 214; GFX11W32-NEXT: s_waitcnt vmcnt(0) 215; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 216; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 217; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) 218; GFX11W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2 219; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) 220; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] 221; GFX11W32-NEXT: s_endpgm 222; 223; GFX12W64-LABEL: add_i32_constant: 224; GFX12W64: ; %bb.0: ; %entry 225; GFX12W64-NEXT: s_mov_b64 s[2:3], exec 226; GFX12W64-NEXT: s_mov_b64 s[0:1], exec 227; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 228; GFX12W64-NEXT: ; implicit-def: $vgpr1 229; GFX12W64-NEXT: s_wait_alu 0xfffe 230; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 231; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 232; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 233; GFX12W64-NEXT: s_cbranch_execz .LBB0_2 234; GFX12W64-NEXT: ; %bb.1: 235; GFX12W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 236; GFX12W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 237; GFX12W64-NEXT: v_mov_b32_e32 v2, 0 238; GFX12W64-NEXT: s_wait_alu 0xfffe 239; GFX12W64-NEXT: s_mul_i32 s2, s2, 5 240; GFX12W64-NEXT: s_wait_alu 0xfffe 241; GFX12W64-NEXT: v_mov_b32_e32 v1, s2 242; GFX12W64-NEXT: s_wait_kmcnt 0x0 243; GFX12W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN 244; GFX12W64-NEXT: .LBB0_2: 245; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] 246; GFX12W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 247; GFX12W64-NEXT: s_wait_loadcnt 0x0 248; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 249; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 250; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) 251; GFX12W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2 252; GFX12W64-NEXT: s_wait_kmcnt 0x0 253; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] 254; 
GFX12W64-NEXT: s_endpgm 255; 256; GFX12W32-LABEL: add_i32_constant: 257; GFX12W32: ; %bb.0: ; %entry 258; GFX12W32-NEXT: s_mov_b32 s1, exec_lo 259; GFX12W32-NEXT: s_mov_b32 s0, exec_lo 260; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 261; GFX12W32-NEXT: ; implicit-def: $vgpr1 262; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) 263; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 264; GFX12W32-NEXT: s_cbranch_execz .LBB0_2 265; GFX12W32-NEXT: ; %bb.1: 266; GFX12W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 267; GFX12W32-NEXT: s_wait_alu 0xfffe 268; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1 269; GFX12W32-NEXT: s_wait_alu 0xfffe 270; GFX12W32-NEXT: s_mul_i32 s1, s1, 5 271; GFX12W32-NEXT: s_wait_alu 0xfffe 272; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 273; GFX12W32-NEXT: s_wait_kmcnt 0x0 274; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN 275; GFX12W32-NEXT: .LBB0_2: 276; GFX12W32-NEXT: s_wait_alu 0xfffe 277; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 278; GFX12W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 279; GFX12W32-NEXT: s_wait_loadcnt 0x0 280; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 281; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 282; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) 283; GFX12W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2 284; GFX12W32-NEXT: s_wait_kmcnt 0x0 285; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] 286; GFX12W32-NEXT: s_endpgm 287entry: 288 %old = call i32 @llvm.amdgcn.struct.ptr.buffer.atomic.add(i32 5, ptr addrspace(8) %inout, i32 0, i32 0, i32 0, i32 0) 289 store i32 %old, ptr addrspace(1) %out 290 ret void 291} 292 293define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(8) %inout, i32 %additive) { 294; GFX6-LABEL: add_i32_uniform: 295; GFX6: ; %bb.0: ; %entry 296; GFX6-NEXT: s_mov_b64 s[2:3], exec 297; GFX6-NEXT: s_load_dword s6, s[4:5], 0x11 298; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 299; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 300; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, 
v0 301; GFX6-NEXT: ; implicit-def: $vgpr1 302; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc 303; GFX6-NEXT: s_cbranch_execz .LBB1_2 304; GFX6-NEXT: ; %bb.1: 305; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd 306; GFX6-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 307; GFX6-NEXT: s_waitcnt lgkmcnt(0) 308; GFX6-NEXT: s_mul_i32 s2, s6, s2 309; GFX6-NEXT: v_mov_b32_e32 v1, s2 310; GFX6-NEXT: v_mov_b32_e32 v2, 0 311; GFX6-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc 312; GFX6-NEXT: .LBB1_2: 313; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] 314; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 315; GFX6-NEXT: s_mov_b32 s3, 0xf000 316; GFX6-NEXT: s_mov_b32 s2, -1 317; GFX6-NEXT: s_waitcnt vmcnt(0) 318; GFX6-NEXT: v_readfirstlane_b32 s4, v1 319; GFX6-NEXT: s_waitcnt lgkmcnt(0) 320; GFX6-NEXT: v_mul_lo_u32 v0, s6, v0 321; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v0 322; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 323; GFX6-NEXT: s_endpgm 324; 325; GFX8-LABEL: add_i32_uniform: 326; GFX8: ; %bb.0: ; %entry 327; GFX8-NEXT: s_load_dword s6, s[4:5], 0x44 328; GFX8-NEXT: s_mov_b64 s[2:3], exec 329; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 330; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 331; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 332; GFX8-NEXT: ; implicit-def: $vgpr1 333; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc 334; GFX8-NEXT: s_cbranch_execz .LBB1_2 335; GFX8-NEXT: ; %bb.1: 336; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 337; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 338; GFX8-NEXT: s_waitcnt lgkmcnt(0) 339; GFX8-NEXT: s_mul_i32 s2, s6, s2 340; GFX8-NEXT: v_mov_b32_e32 v1, s2 341; GFX8-NEXT: v_mov_b32_e32 v2, 0 342; GFX8-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc 343; GFX8-NEXT: .LBB1_2: 344; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] 345; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 346; GFX8-NEXT: s_waitcnt lgkmcnt(0) 347; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 348; GFX8-NEXT: s_waitcnt vmcnt(0) 349; GFX8-NEXT: v_readfirstlane_b32 s2, v1 350; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v0 
351; GFX8-NEXT: v_mov_b32_e32 v0, s0 352; GFX8-NEXT: v_mov_b32_e32 v1, s1 353; GFX8-NEXT: flat_store_dword v[0:1], v2 354; GFX8-NEXT: s_endpgm 355; 356; GFX9-LABEL: add_i32_uniform: 357; GFX9: ; %bb.0: ; %entry 358; GFX9-NEXT: s_load_dword s6, s[4:5], 0x44 359; GFX9-NEXT: s_mov_b64 s[2:3], exec 360; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 361; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 362; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 363; GFX9-NEXT: ; implicit-def: $vgpr1 364; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 365; GFX9-NEXT: s_cbranch_execz .LBB1_2 366; GFX9-NEXT: ; %bb.1: 367; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 368; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 369; GFX9-NEXT: s_waitcnt lgkmcnt(0) 370; GFX9-NEXT: s_mul_i32 s2, s6, s2 371; GFX9-NEXT: v_mov_b32_e32 v1, s2 372; GFX9-NEXT: v_mov_b32_e32 v2, 0 373; GFX9-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc 374; GFX9-NEXT: .LBB1_2: 375; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 376; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 377; GFX9-NEXT: s_waitcnt lgkmcnt(0) 378; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 379; GFX9-NEXT: s_waitcnt vmcnt(0) 380; GFX9-NEXT: v_readfirstlane_b32 s2, v1 381; GFX9-NEXT: v_mov_b32_e32 v2, 0 382; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 383; GFX9-NEXT: global_store_dword v2, v0, s[0:1] 384; GFX9-NEXT: s_endpgm 385; 386; GFX10W64-LABEL: add_i32_uniform: 387; GFX10W64: ; %bb.0: ; %entry 388; GFX10W64-NEXT: s_load_dword s6, s[4:5], 0x44 389; GFX10W64-NEXT: s_mov_b64 s[2:3], exec 390; GFX10W64-NEXT: ; implicit-def: $vgpr1 391; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 392; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 393; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 394; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc 395; GFX10W64-NEXT: s_cbranch_execz .LBB1_2 396; GFX10W64-NEXT: ; %bb.1: 397; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 398; GFX10W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 399; GFX10W64-NEXT: v_mov_b32_e32 v2, 0 400; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 401; 
GFX10W64-NEXT: s_mul_i32 s2, s6, s2 402; GFX10W64-NEXT: v_mov_b32_e32 v1, s2 403; GFX10W64-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc 404; GFX10W64-NEXT: .LBB1_2: 405; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 406; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] 407; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 408; GFX10W64-NEXT: s_waitcnt vmcnt(0) 409; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 410; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 411; GFX10W64-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s6, v0, s[2:3] 412; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 413; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1] 414; GFX10W64-NEXT: s_endpgm 415; 416; GFX10W32-LABEL: add_i32_uniform: 417; GFX10W32: ; %bb.0: ; %entry 418; GFX10W32-NEXT: s_load_dword s0, s[4:5], 0x44 419; GFX10W32-NEXT: s_mov_b32 s2, exec_lo 420; GFX10W32-NEXT: ; implicit-def: $vgpr1 421; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 422; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 423; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo 424; GFX10W32-NEXT: s_cbranch_execz .LBB1_2 425; GFX10W32-NEXT: ; %bb.1: 426; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 427; GFX10W32-NEXT: s_bcnt1_i32_b32 s2, s2 428; GFX10W32-NEXT: v_mov_b32_e32 v2, 0 429; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 430; GFX10W32-NEXT: s_mul_i32 s2, s0, s2 431; GFX10W32-NEXT: v_mov_b32_e32 v1, s2 432; GFX10W32-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc 433; GFX10W32-NEXT: .LBB1_2: 434; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 435; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 436; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 437; GFX10W32-NEXT: s_waitcnt vmcnt(0) 438; GFX10W32-NEXT: s_mov_b32 null, 0 439; GFX10W32-NEXT: v_readfirstlane_b32 s4, v1 440; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 441; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s0, s0, v0, s[4:5] 442; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 443; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] 444; GFX10W32-NEXT: s_endpgm 445; 446; GFX11W64-LABEL: add_i32_uniform: 447; GFX11W64: ; %bb.0: ; 
%entry 448; GFX11W64-NEXT: s_load_b32 s6, s[4:5], 0x44 449; GFX11W64-NEXT: s_mov_b64 s[2:3], exec 450; GFX11W64-NEXT: s_mov_b64 s[0:1], exec 451; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 452; GFX11W64-NEXT: ; implicit-def: $vgpr1 453; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 454; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 455; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 456; GFX11W64-NEXT: s_cbranch_execz .LBB1_2 457; GFX11W64-NEXT: ; %bb.1: 458; GFX11W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 459; GFX11W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 460; GFX11W64-NEXT: v_mov_b32_e32 v2, 0 461; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) 462; GFX11W64-NEXT: s_mul_i32 s2, s6, s2 463; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 464; GFX11W64-NEXT: v_mov_b32_e32 v1, s2 465; GFX11W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc 466; GFX11W64-NEXT: .LBB1_2: 467; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] 468; GFX11W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 469; GFX11W64-NEXT: s_waitcnt vmcnt(0) 470; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 471; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) 472; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) 473; GFX11W64-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[2:3] 474; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 475; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] 476; GFX11W64-NEXT: s_endpgm 477; 478; GFX11W32-LABEL: add_i32_uniform: 479; GFX11W32: ; %bb.0: ; %entry 480; GFX11W32-NEXT: s_load_b32 s0, s[4:5], 0x44 481; GFX11W32-NEXT: s_mov_b32 s2, exec_lo 482; GFX11W32-NEXT: s_mov_b32 s1, exec_lo 483; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 484; GFX11W32-NEXT: ; implicit-def: $vgpr1 485; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) 486; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 487; GFX11W32-NEXT: s_cbranch_execz .LBB1_2 488; GFX11W32-NEXT: ; %bb.1: 489; GFX11W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 490; GFX11W32-NEXT: s_bcnt1_i32_b32 s2, s2 491; GFX11W32-NEXT: v_mov_b32_e32 v2, 0 492; GFX11W32-NEXT: 
s_waitcnt lgkmcnt(0) 493; GFX11W32-NEXT: s_mul_i32 s2, s0, s2 494; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 495; GFX11W32-NEXT: v_mov_b32_e32 v1, s2 496; GFX11W32-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc 497; GFX11W32-NEXT: .LBB1_2: 498; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 499; GFX11W32-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 500; GFX11W32-NEXT: s_waitcnt vmcnt(0) 501; GFX11W32-NEXT: v_readfirstlane_b32 s4, v1 502; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) 503; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) 504; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s0, v0, s[4:5] 505; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 506; GFX11W32-NEXT: global_store_b32 v0, v1, s[2:3] 507; GFX11W32-NEXT: s_endpgm 508; 509; GFX12W64-LABEL: add_i32_uniform: 510; GFX12W64: ; %bb.0: ; %entry 511; GFX12W64-NEXT: s_load_b32 s6, s[4:5], 0x44 512; GFX12W64-NEXT: s_mov_b64 s[2:3], exec 513; GFX12W64-NEXT: s_mov_b64 s[0:1], exec 514; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 515; GFX12W64-NEXT: ; implicit-def: $vgpr1 516; GFX12W64-NEXT: s_wait_alu 0xfffe 517; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 518; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 519; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 520; GFX12W64-NEXT: s_cbranch_execz .LBB1_2 521; GFX12W64-NEXT: ; %bb.1: 522; GFX12W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 523; GFX12W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 524; GFX12W64-NEXT: v_mov_b32_e32 v2, 0 525; GFX12W64-NEXT: s_wait_kmcnt 0x0 526; GFX12W64-NEXT: s_wait_alu 0xfffe 527; GFX12W64-NEXT: s_mul_i32 s2, s6, s2 528; GFX12W64-NEXT: s_wait_alu 0xfffe 529; GFX12W64-NEXT: v_mov_b32_e32 v1, s2 530; GFX12W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN 531; GFX12W64-NEXT: .LBB1_2: 532; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] 533; GFX12W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 534; GFX12W64-NEXT: s_wait_loadcnt 0x0 535; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 536; GFX12W64-NEXT: s_wait_kmcnt 0x0 
537; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) 538; GFX12W64-NEXT: v_mad_co_u64_u32 v[0:1], null, s6, v0, s[2:3] 539; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 540; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] 541; GFX12W64-NEXT: s_endpgm 542; 543; GFX12W32-LABEL: add_i32_uniform: 544; GFX12W32: ; %bb.0: ; %entry 545; GFX12W32-NEXT: s_load_b32 s0, s[4:5], 0x44 546; GFX12W32-NEXT: s_mov_b32 s2, exec_lo 547; GFX12W32-NEXT: s_mov_b32 s1, exec_lo 548; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 549; GFX12W32-NEXT: ; implicit-def: $vgpr1 550; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) 551; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 552; GFX12W32-NEXT: s_cbranch_execz .LBB1_2 553; GFX12W32-NEXT: ; %bb.1: 554; GFX12W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 555; GFX12W32-NEXT: s_wait_alu 0xfffe 556; GFX12W32-NEXT: s_bcnt1_i32_b32 s2, s2 557; GFX12W32-NEXT: s_wait_kmcnt 0x0 558; GFX12W32-NEXT: s_wait_alu 0xfffe 559; GFX12W32-NEXT: s_mul_i32 s2, s0, s2 560; GFX12W32-NEXT: s_wait_alu 0xfffe 561; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 562; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN 563; GFX12W32-NEXT: .LBB1_2: 564; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 565; GFX12W32-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 566; GFX12W32-NEXT: s_wait_loadcnt 0x0 567; GFX12W32-NEXT: v_readfirstlane_b32 s4, v1 568; GFX12W32-NEXT: s_wait_kmcnt 0x0 569; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) 570; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s0, v0, s[4:5] 571; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 572; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3] 573; GFX12W32-NEXT: s_endpgm 574entry: 575 %old = call i32 @llvm.amdgcn.struct.ptr.buffer.atomic.add(i32 %additive, ptr addrspace(8) %inout, i32 0, i32 0, i32 0, i32 0) 576 store i32 %old, ptr addrspace(1) %out 577 ret void 578} 579 580define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout) { 581; GFX6-LABEL: add_i32_varying_vdata: 582; 
GFX6: ; %bb.0: ; %entry 583; GFX6-NEXT: s_mov_b64 s[0:1], exec 584; GFX6-NEXT: s_mov_b32 s2, 0 585; GFX6-NEXT: ; implicit-def: $vgpr1 586; GFX6-NEXT: .LBB2_1: ; %ComputeLoop 587; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 588; GFX6-NEXT: s_ff1_i32_b64 s3, s[0:1] 589; GFX6-NEXT: s_mov_b32 m0, s3 590; GFX6-NEXT: v_readlane_b32 s8, v0, s3 591; GFX6-NEXT: v_writelane_b32 v1, s2, m0 592; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s3 593; GFX6-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] 594; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 595; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7] 596; GFX6-NEXT: s_add_i32 s2, s2, s8 597; GFX6-NEXT: s_cbranch_vccnz .LBB2_1 598; GFX6-NEXT: ; %bb.2: ; %ComputeEnd 599; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 600; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 601; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 602; GFX6-NEXT: ; implicit-def: $vgpr0 603; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc 604; GFX6-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 605; GFX6-NEXT: s_cbranch_execz .LBB2_4 606; GFX6-NEXT: ; %bb.3: 607; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd 608; GFX6-NEXT: v_mov_b32_e32 v0, s2 609; GFX6-NEXT: v_mov_b32_e32 v2, 0 610; GFX6-NEXT: s_waitcnt lgkmcnt(0) 611; GFX6-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc 612; GFX6-NEXT: .LBB2_4: 613; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] 614; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 615; GFX6-NEXT: s_mov_b32 s3, 0xf000 616; GFX6-NEXT: s_mov_b32 s2, -1 617; GFX6-NEXT: s_waitcnt vmcnt(0) 618; GFX6-NEXT: v_readfirstlane_b32 s4, v0 619; GFX6-NEXT: s_waitcnt expcnt(0) 620; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v1 621; GFX6-NEXT: s_waitcnt lgkmcnt(0) 622; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 623; GFX6-NEXT: s_endpgm 624; 625; GFX8-LABEL: add_i32_varying_vdata: 626; GFX8: ; %bb.0: ; %entry 627; GFX8-NEXT: s_mov_b64 s[0:1], exec 628; GFX8-NEXT: s_mov_b32 s2, 0 629; GFX8-NEXT: ; implicit-def: $vgpr1 630; GFX8-NEXT: .LBB2_1: ; %ComputeLoop 631; GFX8-NEXT: ; =>This Inner Loop Header: 
Depth=1 632; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] 633; GFX8-NEXT: s_mov_b32 m0, s3 634; GFX8-NEXT: v_readlane_b32 s8, v0, s3 635; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 636; GFX8-NEXT: v_writelane_b32 v1, s2, m0 637; GFX8-NEXT: s_add_i32 s2, s2, s8 638; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] 639; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 640; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 641; GFX8-NEXT: ; %bb.2: ; %ComputeEnd 642; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 643; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 644; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 645; GFX8-NEXT: ; implicit-def: $vgpr0 646; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc 647; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 648; GFX8-NEXT: s_cbranch_execz .LBB2_4 649; GFX8-NEXT: ; %bb.3: 650; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 651; GFX8-NEXT: v_mov_b32_e32 v0, s2 652; GFX8-NEXT: v_mov_b32_e32 v2, 0 653; GFX8-NEXT: s_waitcnt lgkmcnt(0) 654; GFX8-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc 655; GFX8-NEXT: .LBB2_4: 656; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] 657; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 658; GFX8-NEXT: s_waitcnt vmcnt(0) 659; GFX8-NEXT: v_readfirstlane_b32 s2, v0 660; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v1 661; GFX8-NEXT: s_waitcnt lgkmcnt(0) 662; GFX8-NEXT: v_mov_b32_e32 v0, s0 663; GFX8-NEXT: v_mov_b32_e32 v1, s1 664; GFX8-NEXT: flat_store_dword v[0:1], v2 665; GFX8-NEXT: s_endpgm 666; 667; GFX9-LABEL: add_i32_varying_vdata: 668; GFX9: ; %bb.0: ; %entry 669; GFX9-NEXT: s_mov_b64 s[0:1], exec 670; GFX9-NEXT: s_mov_b32 s2, 0 671; GFX9-NEXT: ; implicit-def: $vgpr1 672; GFX9-NEXT: .LBB2_1: ; %ComputeLoop 673; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 674; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] 675; GFX9-NEXT: s_mov_b32 m0, s3 676; GFX9-NEXT: v_readlane_b32 s8, v0, s3 677; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 678; GFX9-NEXT: v_writelane_b32 v1, s2, m0 679; GFX9-NEXT: s_add_i32 s2, s2, s8 680; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] 681; GFX9-NEXT: s_cmp_lg_u64 
s[0:1], 0 682; GFX9-NEXT: s_cbranch_scc1 .LBB2_1 683; GFX9-NEXT: ; %bb.2: ; %ComputeEnd 684; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 685; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 686; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 687; GFX9-NEXT: ; implicit-def: $vgpr0 688; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 689; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 690; GFX9-NEXT: s_cbranch_execz .LBB2_4 691; GFX9-NEXT: ; %bb.3: 692; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 693; GFX9-NEXT: v_mov_b32_e32 v0, s2 694; GFX9-NEXT: v_mov_b32_e32 v2, 0 695; GFX9-NEXT: s_waitcnt lgkmcnt(0) 696; GFX9-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc 697; GFX9-NEXT: .LBB2_4: 698; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 699; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 700; GFX9-NEXT: s_waitcnt vmcnt(0) 701; GFX9-NEXT: v_readfirstlane_b32 s2, v0 702; GFX9-NEXT: v_mov_b32_e32 v2, 0 703; GFX9-NEXT: v_add_u32_e32 v0, s2, v1 704; GFX9-NEXT: s_waitcnt lgkmcnt(0) 705; GFX9-NEXT: global_store_dword v2, v0, s[0:1] 706; GFX9-NEXT: s_endpgm 707; 708; GFX10W64-LABEL: add_i32_varying_vdata: 709; GFX10W64: ; %bb.0: ; %entry 710; GFX10W64-NEXT: s_mov_b64 s[0:1], exec 711; GFX10W64-NEXT: s_mov_b32 s2, 0 712; GFX10W64-NEXT: ; implicit-def: $vgpr1 713; GFX10W64-NEXT: .LBB2_1: ; %ComputeLoop 714; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 715; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1] 716; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3 717; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 718; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 719; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] 720; GFX10W64-NEXT: s_add_i32 s2, s2, s8 721; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 722; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_1 723; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd 724; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 725; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 726; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 727; GFX10W64-NEXT: ; implicit-def: $vgpr0 728; GFX10W64-NEXT: s_and_saveexec_b64 
s[0:1], vcc 729; GFX10W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 730; GFX10W64-NEXT: s_cbranch_execz .LBB2_4 731; GFX10W64-NEXT: ; %bb.3: 732; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 733; GFX10W64-NEXT: v_mov_b32_e32 v0, s2 734; GFX10W64-NEXT: v_mov_b32_e32 v2, 0 735; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 736; GFX10W64-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc 737; GFX10W64-NEXT: .LBB2_4: 738; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 739; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] 740; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 741; GFX10W64-NEXT: s_waitcnt vmcnt(0) 742; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 743; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 744; GFX10W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 745; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 746; GFX10W64-NEXT: global_store_dword v0, v1, s[0:1] 747; GFX10W64-NEXT: s_endpgm 748; 749; GFX10W32-LABEL: add_i32_varying_vdata: 750; GFX10W32: ; %bb.0: ; %entry 751; GFX10W32-NEXT: s_mov_b32 s1, exec_lo 752; GFX10W32-NEXT: s_mov_b32 s0, 0 753; GFX10W32-NEXT: ; implicit-def: $vgpr1 754; GFX10W32-NEXT: .LBB2_1: ; %ComputeLoop 755; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 756; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 757; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2 758; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2 759; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2 760; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 761; GFX10W32-NEXT: s_add_i32 s0, s0, s3 762; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 763; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1 764; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd 765; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 766; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 767; GFX10W32-NEXT: ; implicit-def: $vgpr0 768; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo 769; GFX10W32-NEXT: s_xor_b32 s1, exec_lo, s1 770; GFX10W32-NEXT: s_cbranch_execz .LBB2_4 771; GFX10W32-NEXT: ; %bb.3: 772; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 773; GFX10W32-NEXT: v_mov_b32_e32 v0, s0 774; GFX10W32-NEXT: v_mov_b32_e32 v2, 0 
775; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 776; GFX10W32-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc 777; GFX10W32-NEXT: .LBB2_4: 778; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 779; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 780; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 781; GFX10W32-NEXT: s_waitcnt vmcnt(0) 782; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 783; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 784; GFX10W32-NEXT: v_add_nc_u32_e32 v1, s2, v1 785; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 786; GFX10W32-NEXT: global_store_dword v0, v1, s[0:1] 787; GFX10W32-NEXT: s_endpgm 788; 789; GFX11W64-LABEL: add_i32_varying_vdata: 790; GFX11W64: ; %bb.0: ; %entry 791; GFX11W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 792; GFX11W64-NEXT: s_mov_b64 s[0:1], exec 793; GFX11W64-NEXT: s_mov_b32 s2, 0 794; GFX11W64-NEXT: ; implicit-def: $vgpr0 795; GFX11W64-NEXT: .LBB2_1: ; %ComputeLoop 796; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 797; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1] 798; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 799; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3 800; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 801; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 802; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] 803; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) 804; GFX11W64-NEXT: s_add_i32 s2, s2, s8 805; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 806; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1 807; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd 808; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 809; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 810; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 811; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 812; GFX11W64-NEXT: ; implicit-def: $vgpr1 813; GFX11W64-NEXT: s_and_saveexec_b64 s[0:1], vcc 814; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 815; GFX11W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 816; GFX11W64-NEXT: s_cbranch_execz .LBB2_4 817; GFX11W64-NEXT: ; %bb.3: 
818; GFX11W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 819; GFX11W64-NEXT: v_mov_b32_e32 v1, s2 820; GFX11W64-NEXT: v_mov_b32_e32 v2, 0 821; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) 822; GFX11W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc 823; GFX11W64-NEXT: .LBB2_4: 824; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] 825; GFX11W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 826; GFX11W64-NEXT: s_waitcnt vmcnt(0) 827; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 828; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 829; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) 830; GFX11W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 831; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) 832; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1] 833; GFX11W64-NEXT: s_endpgm 834; 835; GFX11W32-LABEL: add_i32_varying_vdata: 836; GFX11W32: ; %bb.0: ; %entry 837; GFX11W32-NEXT: v_and_b32_e32 v1, 0x3ff, v0 838; GFX11W32-NEXT: s_mov_b32 s1, exec_lo 839; GFX11W32-NEXT: s_mov_b32 s0, 0 840; GFX11W32-NEXT: ; implicit-def: $vgpr0 841; GFX11W32-NEXT: .LBB2_1: ; %ComputeLoop 842; GFX11W32-NEXT: ; =>This Inner Loop Header: Depth=1 843; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 844; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 845; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2 846; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 847; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 848; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 849; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) 850; GFX11W32-NEXT: s_add_i32 s0, s0, s3 851; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 852; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1 853; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd 854; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 855; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 856; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 857; GFX11W32-NEXT: ; implicit-def: $vgpr1 858; GFX11W32-NEXT: s_and_saveexec_b32 s1, vcc_lo 859; GFX11W32-NEXT: s_xor_b32 s1, exec_lo, s1 860; GFX11W32-NEXT: s_cbranch_execz .LBB2_4 861; GFX11W32-NEXT: ; 
%bb.3: 862; GFX11W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 863; GFX11W32-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, 0 864; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) 865; GFX11W32-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc 866; GFX11W32-NEXT: .LBB2_4: 867; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 868; GFX11W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 869; GFX11W32-NEXT: s_waitcnt vmcnt(0) 870; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 871; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) 872; GFX11W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0 873; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) 874; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] 875; GFX11W32-NEXT: s_endpgm 876; 877; GFX12W64-LABEL: add_i32_varying_vdata: 878; GFX12W64: ; %bb.0: ; %entry 879; GFX12W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 880; GFX12W64-NEXT: s_mov_b64 s[0:1], exec 881; GFX12W64-NEXT: s_mov_b32 s2, 0 882; GFX12W64-NEXT: ; implicit-def: $vgpr0 883; GFX12W64-NEXT: .LBB2_1: ; %ComputeLoop 884; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1 885; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1] 886; GFX12W64-NEXT: s_wait_alu 0xfffe 887; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3 888; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 889; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 890; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] 891; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) 892; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 893; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 894; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1 895; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd 896; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 897; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 898; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 899; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 900; GFX12W64-NEXT: ; implicit-def: $vgpr1 901; GFX12W64-NEXT: s_and_saveexec_b64 s[0:1], vcc 902; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 903; GFX12W64-NEXT: s_xor_b64 s[0:1], exec, 
s[0:1] 904; GFX12W64-NEXT: s_cbranch_execz .LBB2_4 905; GFX12W64-NEXT: ; %bb.3: 906; GFX12W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 907; GFX12W64-NEXT: v_mov_b32_e32 v2, 0 908; GFX12W64-NEXT: s_wait_alu 0xfffe 909; GFX12W64-NEXT: v_mov_b32_e32 v1, s2 910; GFX12W64-NEXT: s_wait_kmcnt 0x0 911; GFX12W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN 912; GFX12W64-NEXT: .LBB2_4: 913; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] 914; GFX12W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 915; GFX12W64-NEXT: s_wait_loadcnt 0x0 916; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 917; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 918; GFX12W64-NEXT: s_wait_alu 0xfffe 919; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) 920; GFX12W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 921; GFX12W64-NEXT: s_wait_kmcnt 0x0 922; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] 923; GFX12W64-NEXT: s_endpgm 924; 925; GFX12W32-LABEL: add_i32_varying_vdata: 926; GFX12W32: ; %bb.0: ; %entry 927; GFX12W32-NEXT: v_and_b32_e32 v1, 0x3ff, v0 928; GFX12W32-NEXT: s_mov_b32 s1, exec_lo 929; GFX12W32-NEXT: s_mov_b32 s0, 0 930; GFX12W32-NEXT: ; implicit-def: $vgpr0 931; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop 932; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 933; GFX12W32-NEXT: s_wait_alu 0xfffe 934; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 935; GFX12W32-NEXT: s_wait_alu 0xfffe 936; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 937; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 938; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 939; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 940; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) 941; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 942; GFX12W32-NEXT: s_wait_alu 0xfffe 943; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 944; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1 945; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd 946; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 947; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) 948; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 949; GFX12W32-NEXT: ; implicit-def: $vgpr1 
; GFX12W32-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1
; GFX12W32-NEXT: s_cbranch_execz .LBB2_4
; GFX12W32-NEXT: ; %bb.3:
; GFX12W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34
; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s0
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB2_4:
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12W32-NEXT: s_endpgm
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = call i32 @llvm.amdgcn.struct.ptr.buffer.atomic.add(i32 %lane, ptr addrspace(8) %inout, i32 0, i32 0, i32 0, i32 0)
  store i32 %old, ptr addrspace(1) %out
  ret void
}

; Buffer atomic add of the constant 1 where the third intrinsic argument (the
; one this test's name calls "vindex") is the work-item id, so it differs per
; lane. The expected code for every target is a single buffer_atomic_add with
; idxen using the per-lane index register; unlike sub_i32_constant below, no
; mbcnt / saveexec / readfirstlane sequence appears.
; The assertions below are autogenerated: regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_kernel void @add_i32_varying_vindex(ptr addrspace(1) %out, ptr addrspace(8) %inout) {
; GFX6-LABEL: add_i32_varying_vindex:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
; GFX6-NEXT: v_mov_b32_e32 v1, 1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: buffer_atomic_add v1, v0, s[0:3], 0 idxen glc
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v1, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: add_i32_varying_vindex:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; GFX8-NEXT: v_mov_b32_e32 v2, 1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_atomic_add v2, v0, s[0:3], 0 idxen glc
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: add_i32_varying_vindex:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; GFX9-NEXT: v_mov_b32_e32 v1, 1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_atomic_add v1, v0, s[0:3], 0 idxen glc
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: add_i32_varying_vindex:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; GFX10-NEXT: v_mov_b32_e32 v1, 1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_atomic_add v1, v0, s[0:3], 0 idxen glc
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11W64-LABEL: add_i32_varying_vindex:
; GFX11W64: ; %bb.0: ; %entry
; GFX11W64-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
; GFX11W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11W64-NEXT: v_mov_b32_e32 v1, 1
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W64-NEXT: buffer_atomic_add_u32 v1, v0, s[0:3], 0 idxen glc
; GFX11W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11W64-NEXT: v_mov_b32_e32 v0, 0
; GFX11W64-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11W64-NEXT: s_endpgm
;
; GFX11W32-LABEL: add_i32_varying_vindex:
; GFX11W32: ; %bb.0: ; %entry
; GFX11W32-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
; GFX11W32-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W32-NEXT: buffer_atomic_add_u32 v1, v0, s[0:3], 0 idxen glc
; GFX11W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11W32-NEXT: v_mov_b32_e32 v0, 0
; GFX11W32-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11W32-NEXT: s_endpgm
;
; GFX12W64-LABEL: add_i32_varying_vindex:
; GFX12W64: ; %bb.0: ; %entry
; GFX12W64-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
; GFX12W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX12W64-NEXT: v_mov_b32_e32 v1, 1
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: buffer_atomic_add_u32 v1, v0, s[0:3], null idxen th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX12W64-NEXT: v_mov_b32_e32 v0, 0
; GFX12W64-NEXT: s_wait_loadcnt 0x0
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12W64-NEXT: s_endpgm
;
; GFX12W32-LABEL: add_i32_varying_vindex:
; GFX12W32: ; %bb.0: ; %entry
; GFX12W32-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
; GFX12W32-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v0, s[0:3], null idxen th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX12W32-NEXT: v_mov_b32_e32 v0, 0
; GFX12W32-NEXT: s_wait_loadcnt 0x0
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12W32-NEXT: s_endpgm
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = call i32 @llvm.amdgcn.struct.ptr.buffer.atomic.add(i32 1, ptr addrspace(8) %inout, i32 %lane, i32 0, i32 0, i32 0)
  store i32 %old, ptr addrspace(1) %out
  ret void
}

; Buffer atomic add of the constant 1 where the fourth intrinsic argument (the
; one this test's name calls "offset") is the work-item id and the third
; argument is 0. The expected code for every target is a single
; buffer_atomic_add with "idxen offen" addressed through the v[0:1] register
; pair (v0 holds the zero index, v1 the per-lane offset); no mbcnt / saveexec /
; readfirstlane sequence appears.
; The assertions below are autogenerated: regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr addrspace(8) %inout) {
; GFX6-LABEL: add_i32_varying_offset:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: v_mov_b32_e32 v1, v0
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd
; GFX6-NEXT: s_mov_b32 s6, 0
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
; GFX6-NEXT: v_mov_b32_e32 v0, s6
; GFX6-NEXT: v_mov_b32_e32 v2, 1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: buffer_atomic_add v2, v[0:1], s[0:3], 0 idxen offen glc
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v2, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: add_i32_varying_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; GFX8-NEXT: s_mov_b32 s6, 0
; GFX8-NEXT: v_mov_b32_e32 v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_mov_b32_e32 v2, 1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_atomic_add v2, v[0:1], s[0:3], 0 idxen offen glc
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: add_i32_varying_offset:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; GFX9-NEXT: s_mov_b32 s6, 0
; GFX9-NEXT: v_mov_b32_e32 v1, v0
; GFX9-NEXT: v_mov_b32_e32 v0, s6
; GFX9-NEXT: v_mov_b32_e32 v2, 1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_atomic_add v2, v[0:1], s[0:3], 0 idxen offen glc
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_store_dword v0, v2, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: add_i32_varying_offset:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; GFX10-NEXT: s_mov_b32 s6, 0
; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: v_mov_b32_e32 v0, s6
; GFX10-NEXT: v_mov_b32_e32 v2, 1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_atomic_add v2, v[0:1], s[0:3], 0 idxen offen glc
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_store_dword v0, v2, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11W64-LABEL: add_i32_varying_offset:
; GFX11W64: ; %bb.0: ; %entry
; GFX11W64-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
; GFX11W64-NEXT: s_mov_b32 s6, 0
; GFX11W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0
; GFX11W64-NEXT: v_mov_b32_e32 v0, s6
; GFX11W64-NEXT: v_mov_b32_e32 v2, 1
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W64-NEXT: buffer_atomic_add_u32 v2, v[0:1], s[0:3], 0 idxen offen glc
; GFX11W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11W64-NEXT: v_mov_b32_e32 v0, 0
; GFX11W64-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11W64-NEXT: global_store_b32 v0, v2, s[0:1]
; GFX11W64-NEXT: s_endpgm
;
; GFX11W32-LABEL: add_i32_varying_offset:
; GFX11W32: ; %bb.0: ; %entry
; GFX11W32-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
; GFX11W32-NEXT: s_mov_b32 s6, 0
; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11W32-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_and_b32 v1, 0x3ff, v0
; GFX11W32-NEXT: v_mov_b32_e32 v2, 1
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W32-NEXT: buffer_atomic_add_u32 v2, v[0:1], s[0:3], 0 idxen offen glc
; GFX11W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11W32-NEXT: v_mov_b32_e32 v0, 0
; GFX11W32-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11W32-NEXT: global_store_b32 v0, v2, s[0:1]
; GFX11W32-NEXT: s_endpgm
;
; GFX12W64-LABEL: add_i32_varying_offset:
; GFX12W64: ; %bb.0: ; %entry
; GFX12W64-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
; GFX12W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0
; GFX12W64-NEXT: v_mov_b32_e32 v0, 0
; GFX12W64-NEXT: v_mov_b32_e32 v2, 1
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: buffer_atomic_add_u32 v2, v[0:1], s[0:3], null idxen offen th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX12W64-NEXT: s_wait_loadcnt 0x0
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: global_store_b32 v0, v2, s[0:1]
; GFX12W64-NEXT: s_endpgm
;
; GFX12W32-LABEL: add_i32_varying_offset:
; GFX12W32: ; %bb.0: ; %entry
; GFX12W32-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
; GFX12W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_and_b32 v1, 0x3ff, v0
; GFX12W32-NEXT: v_mov_b32_e32 v2, 1
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: buffer_atomic_add_u32 v2, v[0:1], s[0:3], null idxen offen th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: global_store_b32 v0, v2, s[0:1]
; GFX12W32-NEXT: s_endpgm
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = call i32 @llvm.amdgcn.struct.ptr.buffer.atomic.add(i32 1, ptr addrspace(8) %inout, i32 0, i32 %lane, i32 0, i32 0)
  store i32 %old, ptr addrspace(1) %out
  ret void
}

; Buffer atomic sub of the constant 5 with every index/offset argument 0.
; The expected code performs the atomic once instead of once per lane:
;   * v_mbcnt_lo/hi over a saved copy of exec gives each lane a count in v0,
;     and only the lane whose count is 0 passes the v_cmp/saveexec (or
;     v_cmpx) guard;
;   * that lane issues buffer_atomic_sub with 5 * s_bcnt1(saved exec);
;   * after exec is restored, every lane computes
;     v_readfirstlane(returned value) - 5 * v0 and stores it to %out.
; The assertions below are autogenerated: regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace(8) %inout) {
; GFX6-LABEL: sub_i32_constant:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[2:3], exec
; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX6-NEXT: ; implicit-def: $vgpr1
; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX6-NEXT: s_cbranch_execz .LBB5_2
; GFX6-NEXT: ; %bb.1:
; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd
; GFX6-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX6-NEXT: s_mul_i32 s2, s2, 5
; GFX6-NEXT: v_mov_b32_e32 v1, s2
; GFX6-NEXT: v_mov_b32_e32 v2, 0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc
; GFX6-NEXT: .LBB5_2:
; GFX6-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_readfirstlane_b32 s4, v1
; GFX6-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: sub_i32_constant:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_mov_b64 s[2:3], exec
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr1
; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8-NEXT: s_cbranch_execz .LBB5_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34
; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX8-NEXT: s_mul_i32 s2, s2, 5
; GFX8-NEXT: v_mov_b32_e32 v1, s2
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc
; GFX8-NEXT: .LBB5_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s2, v1
; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sub_i32_constant:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_mov_b64 s[2:3], exec
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr1
; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9-NEXT: s_cbranch_execz .LBB5_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34
; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX9-NEXT: s_mul_i32 s2, s2, 5
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc
; GFX9-NEXT: .LBB5_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s2, v1
; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: sub_i32_constant:
; GFX10W64: ; %bb.0: ; %entry
; GFX10W64-NEXT: s_mov_b64 s[2:3], exec
; GFX10W64-NEXT: ; implicit-def: $vgpr1
; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX10W64-NEXT: s_cbranch_execz .LBB5_2
; GFX10W64-NEXT: ; %bb.1:
; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34
; GFX10W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX10W64-NEXT: v_mov_b32_e32 v2, 0
; GFX10W64-NEXT: s_mul_i32 s2, s2, 5
; GFX10W64-NEXT: v_mov_b32_e32 v1, s2
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX10W64-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc
; GFX10W64-NEXT: .LBB5_2:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX10W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX10W64-NEXT: v_mov_b32_e32 v1, 0
; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: sub_i32_constant:
; GFX10W32: ; %bb.0: ; %entry
; GFX10W32-NEXT: s_mov_b32 s1, exec_lo
; GFX10W32-NEXT: ; implicit-def: $vgpr1
; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0
; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10W32-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX10W32-NEXT: s_cbranch_execz .LBB5_2
; GFX10W32-NEXT: ; %bb.1:
; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34
; GFX10W32-NEXT: s_bcnt1_i32_b32 s1, s1
; GFX10W32-NEXT: v_mov_b32_e32 v2, 0
; GFX10W32-NEXT: s_mul_i32 s1, s1, 5
; GFX10W32-NEXT: v_mov_b32_e32 v1, s1
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX10W32-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc
; GFX10W32-NEXT: .LBB5_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1
; GFX10W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: sub_i32_constant:
; GFX11W64: ; %bb.0: ; %entry
; GFX11W64-NEXT: s_mov_b64 s[2:3], exec
; GFX11W64-NEXT: s_mov_b64 s[0:1], exec
; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX11W64-NEXT: ; implicit-def: $vgpr1
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11W64-NEXT: s_cbranch_execz .LBB5_2
; GFX11W64-NEXT: ; %bb.1:
; GFX11W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34
; GFX11W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX11W64-NEXT: v_mov_b32_e32 v2, 0
; GFX11W64-NEXT: s_mul_i32 s2, s2, 5
; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11W64-NEXT: v_mov_b32_e32 v1, s2
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], 0 idxen glc
; GFX11W64-NEXT: .LBB5_2:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX11W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX11W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX11W64-NEXT: v_mov_b32_e32 v1, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11W64-NEXT: s_endpgm
;
; GFX11W32-LABEL: sub_i32_constant:
; GFX11W32: ; %bb.0: ; %entry
; GFX11W32-NEXT: s_mov_b32 s1, exec_lo
; GFX11W32-NEXT: s_mov_b32 s0, exec_lo
; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0
; GFX11W32-NEXT: ; implicit-def: $vgpr1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11W32-NEXT: s_cbranch_execz .LBB5_2
; GFX11W32-NEXT: ; %bb.1:
; GFX11W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34
; GFX11W32-NEXT: s_bcnt1_i32_b32 s1, s1
; GFX11W32-NEXT: v_mov_b32_e32 v2, 0
; GFX11W32-NEXT: s_mul_i32 s1, s1, 5
; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11W32-NEXT: v_mov_b32_e32 v1, s1
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], 0 idxen glc
; GFX11W32-NEXT: .LBB5_2:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1
; GFX11W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX11W32-NEXT: v_mov_b32_e32 v1, 0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11W32-NEXT: s_endpgm
;
; GFX12W64-LABEL: sub_i32_constant:
; GFX12W64: ; %bb.0: ; %entry
; GFX12W64-NEXT: s_mov_b64 s[2:3], exec
; GFX12W64-NEXT: s_mov_b64 s[0:1], exec
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX12W64-NEXT: ; implicit-def: $vgpr1
; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX12W64-NEXT: s_cbranch_execz .LBB5_2
; GFX12W64-NEXT: ; %bb.1:
; GFX12W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34
; GFX12W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX12W64-NEXT: v_mov_b32_e32 v2, 0
; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: s_mul_i32 s2, s2, 5
; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_mov_b32_e32 v1, s2
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB5_2:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX12W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX12W64-NEXT: s_wait_loadcnt 0x0
; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX12W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12W64-NEXT: s_endpgm
;
; GFX12W32-LABEL: sub_i32_constant:
; GFX12W32: ; %bb.0: ; %entry
; GFX12W32-NEXT: s_mov_b32 s1, exec_lo
; GFX12W32-NEXT: s_mov_b32 s0, exec_lo
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0
; GFX12W32-NEXT: ; implicit-def: $vgpr1
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX12W32-NEXT: s_cbranch_execz .LBB5_2
; GFX12W32-NEXT: ; %bb.1:
; GFX12W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_mul_i32 s1, s1, 5
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB5_2:
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1
; GFX12W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12W32-NEXT: s_endpgm
entry:
  %old = call i32 @llvm.amdgcn.struct.ptr.buffer.atomic.sub(i32 5, ptr addrspace(8) %inout, i32 0, i32 0, i32 0, i32 0)
  store i32 %old, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(8) %inout, i32 %subitive) {
; GFX6-LABEL: sub_i32_uniform:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[2:3], exec
1504; GFX6-NEXT: s_load_dword s6, s[4:5], 0x11 1505; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1506; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 1507; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1508; GFX6-NEXT: ; implicit-def: $vgpr1 1509; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc 1510; GFX6-NEXT: s_cbranch_execz .LBB6_2 1511; GFX6-NEXT: ; %bb.1: 1512; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd 1513; GFX6-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1514; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1515; GFX6-NEXT: s_mul_i32 s2, s6, s2 1516; GFX6-NEXT: v_mov_b32_e32 v1, s2 1517; GFX6-NEXT: v_mov_b32_e32 v2, 0 1518; GFX6-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc 1519; GFX6-NEXT: .LBB6_2: 1520; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] 1521; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1522; GFX6-NEXT: s_mov_b32 s3, 0xf000 1523; GFX6-NEXT: s_mov_b32 s2, -1 1524; GFX6-NEXT: s_waitcnt vmcnt(0) 1525; GFX6-NEXT: v_readfirstlane_b32 s4, v1 1526; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1527; GFX6-NEXT: v_mul_lo_u32 v0, s6, v0 1528; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 1529; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 1530; GFX6-NEXT: s_endpgm 1531; 1532; GFX8-LABEL: sub_i32_uniform: 1533; GFX8: ; %bb.0: ; %entry 1534; GFX8-NEXT: s_load_dword s6, s[4:5], 0x44 1535; GFX8-NEXT: s_mov_b64 s[2:3], exec 1536; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1537; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1538; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1539; GFX8-NEXT: ; implicit-def: $vgpr1 1540; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc 1541; GFX8-NEXT: s_cbranch_execz .LBB6_2 1542; GFX8-NEXT: ; %bb.1: 1543; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 1544; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1545; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1546; GFX8-NEXT: s_mul_i32 s2, s6, s2 1547; GFX8-NEXT: v_mov_b32_e32 v1, s2 1548; GFX8-NEXT: v_mov_b32_e32 v2, 0 1549; GFX8-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc 1550; GFX8-NEXT: .LBB6_2: 1551; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] 1552; GFX8-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x24 1553; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1554; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 1555; GFX8-NEXT: s_waitcnt vmcnt(0) 1556; GFX8-NEXT: v_readfirstlane_b32 s2, v1 1557; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v0 1558; GFX8-NEXT: v_mov_b32_e32 v0, s0 1559; GFX8-NEXT: v_mov_b32_e32 v1, s1 1560; GFX8-NEXT: flat_store_dword v[0:1], v2 1561; GFX8-NEXT: s_endpgm 1562; 1563; GFX9-LABEL: sub_i32_uniform: 1564; GFX9: ; %bb.0: ; %entry 1565; GFX9-NEXT: s_load_dword s6, s[4:5], 0x44 1566; GFX9-NEXT: s_mov_b64 s[2:3], exec 1567; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1568; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1569; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1570; GFX9-NEXT: ; implicit-def: $vgpr1 1571; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 1572; GFX9-NEXT: s_cbranch_execz .LBB6_2 1573; GFX9-NEXT: ; %bb.1: 1574; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 1575; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1576; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1577; GFX9-NEXT: s_mul_i32 s2, s6, s2 1578; GFX9-NEXT: v_mov_b32_e32 v1, s2 1579; GFX9-NEXT: v_mov_b32_e32 v2, 0 1580; GFX9-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc 1581; GFX9-NEXT: .LBB6_2: 1582; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 1583; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1584; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1585; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 1586; GFX9-NEXT: s_waitcnt vmcnt(0) 1587; GFX9-NEXT: v_readfirstlane_b32 s2, v1 1588; GFX9-NEXT: v_mov_b32_e32 v2, 0 1589; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 1590; GFX9-NEXT: global_store_dword v2, v0, s[0:1] 1591; GFX9-NEXT: s_endpgm 1592; 1593; GFX10W64-LABEL: sub_i32_uniform: 1594; GFX10W64: ; %bb.0: ; %entry 1595; GFX10W64-NEXT: s_load_dword s6, s[4:5], 0x44 1596; GFX10W64-NEXT: s_mov_b64 s[2:3], exec 1597; GFX10W64-NEXT: ; implicit-def: $vgpr1 1598; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1599; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1600; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1601; GFX10W64-NEXT: s_and_saveexec_b64 
s[0:1], vcc 1602; GFX10W64-NEXT: s_cbranch_execz .LBB6_2 1603; GFX10W64-NEXT: ; %bb.1: 1604; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 1605; GFX10W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1606; GFX10W64-NEXT: v_mov_b32_e32 v2, 0 1607; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 1608; GFX10W64-NEXT: s_mul_i32 s2, s6, s2 1609; GFX10W64-NEXT: v_mov_b32_e32 v1, s2 1610; GFX10W64-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc 1611; GFX10W64-NEXT: .LBB6_2: 1612; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 1613; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] 1614; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1615; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 1616; GFX10W64-NEXT: v_mul_lo_u32 v0, s6, v0 1617; GFX10W64-NEXT: s_waitcnt vmcnt(0) 1618; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 1619; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 1620; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1621; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1] 1622; GFX10W64-NEXT: s_endpgm 1623; 1624; GFX10W32-LABEL: sub_i32_uniform: 1625; GFX10W32: ; %bb.0: ; %entry 1626; GFX10W32-NEXT: s_load_dword s0, s[4:5], 0x44 1627; GFX10W32-NEXT: s_mov_b32 s2, exec_lo 1628; GFX10W32-NEXT: ; implicit-def: $vgpr1 1629; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1630; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1631; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo 1632; GFX10W32-NEXT: s_cbranch_execz .LBB6_2 1633; GFX10W32-NEXT: ; %bb.1: 1634; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 1635; GFX10W32-NEXT: s_bcnt1_i32_b32 s2, s2 1636; GFX10W32-NEXT: v_mov_b32_e32 v2, 0 1637; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 1638; GFX10W32-NEXT: s_mul_i32 s2, s0, s2 1639; GFX10W32-NEXT: v_mov_b32_e32 v1, s2 1640; GFX10W32-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc 1641; GFX10W32-NEXT: .LBB6_2: 1642; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 1643; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 1644; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 1645; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 1646; GFX10W32-NEXT: v_mul_lo_u32 
v0, s0, v0 1647; GFX10W32-NEXT: s_waitcnt vmcnt(0) 1648; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1 1649; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 1650; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1651; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] 1652; GFX10W32-NEXT: s_endpgm 1653; 1654; GFX11W64-LABEL: sub_i32_uniform: 1655; GFX11W64: ; %bb.0: ; %entry 1656; GFX11W64-NEXT: s_load_b32 s6, s[4:5], 0x44 1657; GFX11W64-NEXT: s_mov_b64 s[2:3], exec 1658; GFX11W64-NEXT: s_mov_b64 s[0:1], exec 1659; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1660; GFX11W64-NEXT: ; implicit-def: $vgpr1 1661; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1662; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1663; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 1664; GFX11W64-NEXT: s_cbranch_execz .LBB6_2 1665; GFX11W64-NEXT: ; %bb.1: 1666; GFX11W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 1667; GFX11W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1668; GFX11W64-NEXT: v_mov_b32_e32 v2, 0 1669; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) 1670; GFX11W64-NEXT: s_mul_i32 s2, s6, s2 1671; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1672; GFX11W64-NEXT: v_mov_b32_e32 v1, s2 1673; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], 0 idxen glc 1674; GFX11W64-NEXT: .LBB6_2: 1675; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] 1676; GFX11W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1677; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) 1678; GFX11W64-NEXT: v_mul_lo_u32 v0, s6, v0 1679; GFX11W64-NEXT: s_waitcnt vmcnt(0) 1680; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 1681; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 1682; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) 1683; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1684; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1] 1685; GFX11W64-NEXT: s_endpgm 1686; 1687; GFX11W32-LABEL: sub_i32_uniform: 1688; GFX11W32: ; %bb.0: ; %entry 1689; GFX11W32-NEXT: s_load_b32 s0, s[4:5], 0x44 1690; GFX11W32-NEXT: s_mov_b32 s2, exec_lo 1691; GFX11W32-NEXT: s_mov_b32 s1, exec_lo 1692; 
GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1693; GFX11W32-NEXT: ; implicit-def: $vgpr1 1694; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) 1695; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 1696; GFX11W32-NEXT: s_cbranch_execz .LBB6_2 1697; GFX11W32-NEXT: ; %bb.1: 1698; GFX11W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 1699; GFX11W32-NEXT: s_bcnt1_i32_b32 s2, s2 1700; GFX11W32-NEXT: v_mov_b32_e32 v2, 0 1701; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) 1702; GFX11W32-NEXT: s_mul_i32 s2, s0, s2 1703; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1704; GFX11W32-NEXT: v_mov_b32_e32 v1, s2 1705; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], 0 idxen glc 1706; GFX11W32-NEXT: .LBB6_2: 1707; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 1708; GFX11W32-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 1709; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) 1710; GFX11W32-NEXT: v_mul_lo_u32 v0, s0, v0 1711; GFX11W32-NEXT: s_waitcnt vmcnt(0) 1712; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1 1713; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 1714; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) 1715; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1716; GFX11W32-NEXT: global_store_b32 v1, v0, s[2:3] 1717; GFX11W32-NEXT: s_endpgm 1718; 1719; GFX12W64-LABEL: sub_i32_uniform: 1720; GFX12W64: ; %bb.0: ; %entry 1721; GFX12W64-NEXT: s_load_b32 s6, s[4:5], 0x44 1722; GFX12W64-NEXT: s_mov_b64 s[2:3], exec 1723; GFX12W64-NEXT: s_mov_b64 s[0:1], exec 1724; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1725; GFX12W64-NEXT: ; implicit-def: $vgpr1 1726; GFX12W64-NEXT: s_wait_alu 0xfffe 1727; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1728; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1729; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 1730; GFX12W64-NEXT: s_cbranch_execz .LBB6_2 1731; GFX12W64-NEXT: ; %bb.1: 1732; GFX12W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 1733; GFX12W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1734; GFX12W64-NEXT: v_mov_b32_e32 v2, 0 1735; GFX12W64-NEXT: s_wait_kmcnt 0x0 1736; 
GFX12W64-NEXT: s_wait_alu 0xfffe 1737; GFX12W64-NEXT: s_mul_i32 s2, s6, s2 1738; GFX12W64-NEXT: s_wait_alu 0xfffe 1739; GFX12W64-NEXT: v_mov_b32_e32 v1, s2 1740; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN 1741; GFX12W64-NEXT: .LBB6_2: 1742; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] 1743; GFX12W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1744; GFX12W64-NEXT: s_wait_kmcnt 0x0 1745; GFX12W64-NEXT: v_mul_lo_u32 v0, s6, v0 1746; GFX12W64-NEXT: s_wait_loadcnt 0x0 1747; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 1748; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 1749; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) 1750; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1751; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] 1752; GFX12W64-NEXT: s_endpgm 1753; 1754; GFX12W32-LABEL: sub_i32_uniform: 1755; GFX12W32: ; %bb.0: ; %entry 1756; GFX12W32-NEXT: s_load_b32 s0, s[4:5], 0x44 1757; GFX12W32-NEXT: s_mov_b32 s2, exec_lo 1758; GFX12W32-NEXT: s_mov_b32 s1, exec_lo 1759; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1760; GFX12W32-NEXT: ; implicit-def: $vgpr1 1761; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) 1762; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 1763; GFX12W32-NEXT: s_cbranch_execz .LBB6_2 1764; GFX12W32-NEXT: ; %bb.1: 1765; GFX12W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 1766; GFX12W32-NEXT: s_wait_alu 0xfffe 1767; GFX12W32-NEXT: s_bcnt1_i32_b32 s2, s2 1768; GFX12W32-NEXT: s_wait_kmcnt 0x0 1769; GFX12W32-NEXT: s_wait_alu 0xfffe 1770; GFX12W32-NEXT: s_mul_i32 s2, s0, s2 1771; GFX12W32-NEXT: s_wait_alu 0xfffe 1772; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 1773; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN 1774; GFX12W32-NEXT: .LBB6_2: 1775; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 1776; GFX12W32-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 1777; GFX12W32-NEXT: s_wait_kmcnt 0x0 1778; GFX12W32-NEXT: v_mul_lo_u32 v0, s0, v0 1779; GFX12W32-NEXT: s_wait_loadcnt 0x0 1780; GFX12W32-NEXT: 
v_readfirstlane_b32 s0, v1 1781; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 1782; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) 1783; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1784; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3] 1785; GFX12W32-NEXT: s_endpgm 1786entry: 1787 %old = call i32 @llvm.amdgcn.struct.ptr.buffer.atomic.sub(i32 %subitive, ptr addrspace(8) %inout, i32 0, i32 0, i32 0, i32 0) 1788 store i32 %old, ptr addrspace(1) %out 1789 ret void 1790} 1791; Struct-buffer atomic sub whose data operand is the workitem id (%lane), so it differs per lane. The expected code runs a %ComputeLoop over the active lanes (find-first-set on the remaining exec bits, v_readlane of the lane's value, v_writelane of the running sum so far), issues one atomic from the lane whose mbcnt is 0 with the accumulated total, and then computes each lane's result as readfirstlane(old) minus the partial sum written to that lane. 1792define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout) { 1793; GFX6-LABEL: sub_i32_varying_vdata: 1794; GFX6: ; %bb.0: ; %entry 1795; GFX6-NEXT: s_mov_b64 s[0:1], exec 1796; GFX6-NEXT: s_mov_b32 s2, 0 1797; GFX6-NEXT: ; implicit-def: $vgpr1 1798; GFX6-NEXT: .LBB7_1: ; %ComputeLoop 1799; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 1800; GFX6-NEXT: s_ff1_i32_b64 s3, s[0:1] 1801; GFX6-NEXT: s_mov_b32 m0, s3 1802; GFX6-NEXT: v_readlane_b32 s8, v0, s3 1803; GFX6-NEXT: v_writelane_b32 v1, s2, m0 1804; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s3 1805; GFX6-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] 1806; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 1807; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7] 1808; GFX6-NEXT: s_add_i32 s2, s2, s8 1809; GFX6-NEXT: s_cbranch_vccnz .LBB7_1 1810; GFX6-NEXT: ; %bb.2: ; %ComputeEnd 1811; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 1812; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 1813; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1814; GFX6-NEXT: ; implicit-def: $vgpr0 1815; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc 1816; GFX6-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 1817; GFX6-NEXT: s_cbranch_execz .LBB7_4 1818; GFX6-NEXT: ; %bb.3: 1819; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd 1820; GFX6-NEXT: v_mov_b32_e32 v0, s2 1821; GFX6-NEXT: v_mov_b32_e32 v2, 0 1822; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1823; GFX6-NEXT: buffer_atomic_sub v0, v2, s[8:11], 0 idxen glc 1824; GFX6-NEXT: .LBB7_4: 1825; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] 1826; GFX6-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x9 1827; GFX6-NEXT: s_mov_b32 s3, 0xf000 1828; GFX6-NEXT: s_mov_b32 s2, -1 1829; GFX6-NEXT: s_waitcnt vmcnt(0) 1830; GFX6-NEXT: v_readfirstlane_b32 s4, v0 1831; GFX6-NEXT: s_waitcnt expcnt(0) 1832; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v1 1833; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1834; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 1835; GFX6-NEXT: s_endpgm 1836; 1837; GFX8-LABEL: sub_i32_varying_vdata: 1838; GFX8: ; %bb.0: ; %entry 1839; GFX8-NEXT: s_mov_b64 s[0:1], exec 1840; GFX8-NEXT: s_mov_b32 s2, 0 1841; GFX8-NEXT: ; implicit-def: $vgpr1 1842; GFX8-NEXT: .LBB7_1: ; %ComputeLoop 1843; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 1844; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] 1845; GFX8-NEXT: s_mov_b32 m0, s3 1846; GFX8-NEXT: v_readlane_b32 s8, v0, s3 1847; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 1848; GFX8-NEXT: v_writelane_b32 v1, s2, m0 1849; GFX8-NEXT: s_add_i32 s2, s2, s8 1850; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] 1851; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 1852; GFX8-NEXT: s_cbranch_scc1 .LBB7_1 1853; GFX8-NEXT: ; %bb.2: ; %ComputeEnd 1854; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1855; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1856; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1857; GFX8-NEXT: ; implicit-def: $vgpr0 1858; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc 1859; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 1860; GFX8-NEXT: s_cbranch_execz .LBB7_4 1861; GFX8-NEXT: ; %bb.3: 1862; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 1863; GFX8-NEXT: v_mov_b32_e32 v0, s2 1864; GFX8-NEXT: v_mov_b32_e32 v2, 0 1865; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1866; GFX8-NEXT: buffer_atomic_sub v0, v2, s[8:11], 0 idxen glc 1867; GFX8-NEXT: .LBB7_4: 1868; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] 1869; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1870; GFX8-NEXT: s_waitcnt vmcnt(0) 1871; GFX8-NEXT: v_readfirstlane_b32 s2, v0 1872; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v1 1873; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1874; GFX8-NEXT: v_mov_b32_e32 v0, s0 
1875; GFX8-NEXT: v_mov_b32_e32 v1, s1 1876; GFX8-NEXT: flat_store_dword v[0:1], v2 1877; GFX8-NEXT: s_endpgm 1878; 1879; GFX9-LABEL: sub_i32_varying_vdata: 1880; GFX9: ; %bb.0: ; %entry 1881; GFX9-NEXT: s_mov_b64 s[0:1], exec 1882; GFX9-NEXT: s_mov_b32 s2, 0 1883; GFX9-NEXT: ; implicit-def: $vgpr1 1884; GFX9-NEXT: .LBB7_1: ; %ComputeLoop 1885; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 1886; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] 1887; GFX9-NEXT: s_mov_b32 m0, s3 1888; GFX9-NEXT: v_readlane_b32 s8, v0, s3 1889; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 1890; GFX9-NEXT: v_writelane_b32 v1, s2, m0 1891; GFX9-NEXT: s_add_i32 s2, s2, s8 1892; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] 1893; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 1894; GFX9-NEXT: s_cbranch_scc1 .LBB7_1 1895; GFX9-NEXT: ; %bb.2: ; %ComputeEnd 1896; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1897; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1898; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1899; GFX9-NEXT: ; implicit-def: $vgpr0 1900; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 1901; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 1902; GFX9-NEXT: s_cbranch_execz .LBB7_4 1903; GFX9-NEXT: ; %bb.3: 1904; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 1905; GFX9-NEXT: v_mov_b32_e32 v0, s2 1906; GFX9-NEXT: v_mov_b32_e32 v2, 0 1907; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1908; GFX9-NEXT: buffer_atomic_sub v0, v2, s[8:11], 0 idxen glc 1909; GFX9-NEXT: .LBB7_4: 1910; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 1911; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1912; GFX9-NEXT: s_waitcnt vmcnt(0) 1913; GFX9-NEXT: v_readfirstlane_b32 s2, v0 1914; GFX9-NEXT: v_mov_b32_e32 v2, 0 1915; GFX9-NEXT: v_sub_u32_e32 v0, s2, v1 1916; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1917; GFX9-NEXT: global_store_dword v2, v0, s[0:1] 1918; GFX9-NEXT: s_endpgm 1919; 1920; GFX10W64-LABEL: sub_i32_varying_vdata: 1921; GFX10W64: ; %bb.0: ; %entry 1922; GFX10W64-NEXT: s_mov_b64 s[0:1], exec 1923; GFX10W64-NEXT: s_mov_b32 s2, 0 1924; GFX10W64-NEXT: ; implicit-def: $vgpr1 
1925; GFX10W64-NEXT: .LBB7_1: ; %ComputeLoop 1926; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 1927; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1] 1928; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3 1929; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 1930; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 1931; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] 1932; GFX10W64-NEXT: s_add_i32 s2, s2, s8 1933; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 1934; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_1 1935; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd 1936; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1937; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1938; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1939; GFX10W64-NEXT: ; implicit-def: $vgpr0 1940; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc 1941; GFX10W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 1942; GFX10W64-NEXT: s_cbranch_execz .LBB7_4 1943; GFX10W64-NEXT: ; %bb.3: 1944; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 1945; GFX10W64-NEXT: v_mov_b32_e32 v0, s2 1946; GFX10W64-NEXT: v_mov_b32_e32 v2, 0 1947; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 1948; GFX10W64-NEXT: buffer_atomic_sub v0, v2, s[8:11], 0 idxen glc 1949; GFX10W64-NEXT: .LBB7_4: 1950; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 1951; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] 1952; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1953; GFX10W64-NEXT: s_waitcnt vmcnt(0) 1954; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 1955; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 1956; GFX10W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1 1957; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 1958; GFX10W64-NEXT: global_store_dword v0, v1, s[0:1] 1959; GFX10W64-NEXT: s_endpgm 1960; 1961; GFX10W32-LABEL: sub_i32_varying_vdata: 1962; GFX10W32: ; %bb.0: ; %entry 1963; GFX10W32-NEXT: s_mov_b32 s1, exec_lo 1964; GFX10W32-NEXT: s_mov_b32 s0, 0 1965; GFX10W32-NEXT: ; implicit-def: $vgpr1 1966; GFX10W32-NEXT: .LBB7_1: ; %ComputeLoop 1967; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 1968; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 1969; 
GFX10W32-NEXT: v_readlane_b32 s3, v0, s2 1970; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2 1971; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2 1972; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 1973; GFX10W32-NEXT: s_add_i32 s0, s0, s3 1974; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 1975; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_1 1976; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd 1977; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1978; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1979; GFX10W32-NEXT: ; implicit-def: $vgpr0 1980; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo 1981; GFX10W32-NEXT: s_xor_b32 s1, exec_lo, s1 1982; GFX10W32-NEXT: s_cbranch_execz .LBB7_4 1983; GFX10W32-NEXT: ; %bb.3: 1984; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 1985; GFX10W32-NEXT: v_mov_b32_e32 v0, s0 1986; GFX10W32-NEXT: v_mov_b32_e32 v2, 0 1987; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 1988; GFX10W32-NEXT: buffer_atomic_sub v0, v2, s[8:11], 0 idxen glc 1989; GFX10W32-NEXT: .LBB7_4: 1990; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 1991; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 1992; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1993; GFX10W32-NEXT: s_waitcnt vmcnt(0) 1994; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 1995; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 1996; GFX10W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1 1997; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 1998; GFX10W32-NEXT: global_store_dword v0, v1, s[0:1] 1999; GFX10W32-NEXT: s_endpgm 2000; 2001; GFX11W64-LABEL: sub_i32_varying_vdata: 2002; GFX11W64: ; %bb.0: ; %entry 2003; GFX11W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 2004; GFX11W64-NEXT: s_mov_b64 s[0:1], exec 2005; GFX11W64-NEXT: s_mov_b32 s2, 0 2006; GFX11W64-NEXT: ; implicit-def: $vgpr0 2007; GFX11W64-NEXT: .LBB7_1: ; %ComputeLoop 2008; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 2009; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1] 2010; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 2011; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3 2012; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 2013; 
GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 2014; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] 2015; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) 2016; GFX11W64-NEXT: s_add_i32 s2, s2, s8 2017; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 2018; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_1 2019; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd 2020; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 2021; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2022; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 2023; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 2024; GFX11W64-NEXT: ; implicit-def: $vgpr1 2025; GFX11W64-NEXT: s_and_saveexec_b64 s[0:1], vcc 2026; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2027; GFX11W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 2028; GFX11W64-NEXT: s_cbranch_execz .LBB7_4 2029; GFX11W64-NEXT: ; %bb.3: 2030; GFX11W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 2031; GFX11W64-NEXT: v_mov_b32_e32 v1, s2 2032; GFX11W64-NEXT: v_mov_b32_e32 v2, 0 2033; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) 2034; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], 0 idxen glc 2035; GFX11W64-NEXT: .LBB7_4: 2036; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] 2037; GFX11W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 2038; GFX11W64-NEXT: s_waitcnt vmcnt(0) 2039; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 2040; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 2041; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) 2042; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 2043; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) 2044; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1] 2045; GFX11W64-NEXT: s_endpgm 2046; 2047; GFX11W32-LABEL: sub_i32_varying_vdata: 2048; GFX11W32: ; %bb.0: ; %entry 2049; GFX11W32-NEXT: v_and_b32_e32 v1, 0x3ff, v0 2050; GFX11W32-NEXT: s_mov_b32 s1, exec_lo 2051; GFX11W32-NEXT: s_mov_b32 s0, 0 2052; GFX11W32-NEXT: ; implicit-def: $vgpr0 2053; GFX11W32-NEXT: .LBB7_1: ; %ComputeLoop 2054; GFX11W32-NEXT: ; =>This Inner Loop Header: Depth=1 2055; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 
2056; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 2057; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2 2058; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 2059; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 2060; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 2061; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) 2062; GFX11W32-NEXT: s_add_i32 s0, s0, s3 2063; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 2064; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_1 2065; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd 2066; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 2067; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 2068; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 2069; GFX11W32-NEXT: ; implicit-def: $vgpr1 2070; GFX11W32-NEXT: s_and_saveexec_b32 s1, vcc_lo 2071; GFX11W32-NEXT: s_xor_b32 s1, exec_lo, s1 2072; GFX11W32-NEXT: s_cbranch_execz .LBB7_4 2073; GFX11W32-NEXT: ; %bb.3: 2074; GFX11W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 2075; GFX11W32-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, 0 2076; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) 2077; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], 0 idxen glc 2078; GFX11W32-NEXT: .LBB7_4: 2079; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 2080; GFX11W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 2081; GFX11W32-NEXT: s_waitcnt vmcnt(0) 2082; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 2083; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 2084; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) 2085; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 2086; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) 2087; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] 2088; GFX11W32-NEXT: s_endpgm 2089; 2090; GFX12W64-LABEL: sub_i32_varying_vdata: 2091; GFX12W64: ; %bb.0: ; %entry 2092; GFX12W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 2093; GFX12W64-NEXT: s_mov_b64 s[0:1], exec 2094; GFX12W64-NEXT: s_mov_b32 s2, 0 2095; GFX12W64-NEXT: ; implicit-def: $vgpr0 2096; GFX12W64-NEXT: .LBB7_1: ; %ComputeLoop 2097; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1 2098; 
GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1] 2099; GFX12W64-NEXT: s_wait_alu 0xfffe 2100; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3 2101; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 2102; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 2103; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] 2104; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) 2105; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 2106; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 2107; GFX12W64-NEXT: s_cbranch_scc1 .LBB7_1 2108; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd 2109; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 2110; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2111; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 2112; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 2113; GFX12W64-NEXT: ; implicit-def: $vgpr1 2114; GFX12W64-NEXT: s_and_saveexec_b64 s[0:1], vcc 2115; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2116; GFX12W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 2117; GFX12W64-NEXT: s_cbranch_execz .LBB7_4 2118; GFX12W64-NEXT: ; %bb.3: 2119; GFX12W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 2120; GFX12W64-NEXT: v_mov_b32_e32 v2, 0 2121; GFX12W64-NEXT: s_wait_alu 0xfffe 2122; GFX12W64-NEXT: v_mov_b32_e32 v1, s2 2123; GFX12W64-NEXT: s_wait_kmcnt 0x0 2124; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN 2125; GFX12W64-NEXT: .LBB7_4: 2126; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] 2127; GFX12W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 2128; GFX12W64-NEXT: s_wait_loadcnt 0x0 2129; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 2130; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 2131; GFX12W64-NEXT: s_wait_alu 0xfffe 2132; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) 2133; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 2134; GFX12W64-NEXT: s_wait_kmcnt 0x0 2135; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] 2136; GFX12W64-NEXT: s_endpgm 2137; 2138; GFX12W32-LABEL: sub_i32_varying_vdata: 2139; GFX12W32: ; %bb.0: ; %entry 2140; GFX12W32-NEXT: v_and_b32_e32 v1, 0x3ff, 
v0 2141; GFX12W32-NEXT: s_mov_b32 s1, exec_lo 2142; GFX12W32-NEXT: s_mov_b32 s0, 0 2143; GFX12W32-NEXT: ; implicit-def: $vgpr0 2144; GFX12W32-NEXT: .LBB7_1: ; %ComputeLoop 2145; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 2146; GFX12W32-NEXT: s_wait_alu 0xfffe 2147; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 2148; GFX12W32-NEXT: s_wait_alu 0xfffe 2149; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 2150; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 2151; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 2152; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 2153; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) 2154; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 2155; GFX12W32-NEXT: s_wait_alu 0xfffe 2156; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 2157; GFX12W32-NEXT: s_cbranch_scc1 .LBB7_1 2158; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd 2159; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 2160; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) 2161; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 2162; GFX12W32-NEXT: ; implicit-def: $vgpr1 2163; GFX12W32-NEXT: s_and_saveexec_b32 s1, vcc_lo 2164; GFX12W32-NEXT: s_wait_alu 0xfffe 2165; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1 2166; GFX12W32-NEXT: s_cbranch_execz .LBB7_4 2167; GFX12W32-NEXT: ; %bb.3: 2168; GFX12W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 2169; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s0 2170; GFX12W32-NEXT: s_wait_kmcnt 0x0 2171; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN 2172; GFX12W32-NEXT: .LBB7_4: 2173; GFX12W32-NEXT: s_wait_alu 0xfffe 2174; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 2175; GFX12W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 2176; GFX12W32-NEXT: s_wait_loadcnt 0x0 2177; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 2178; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 2179; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) 2180; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 2181; GFX12W32-NEXT: s_wait_kmcnt 0x0 2182; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] 2183; GFX12W32-NEXT: s_endpgm 
2184entry: 2185 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2186 %old = call i32 @llvm.amdgcn.struct.ptr.buffer.atomic.sub(i32 %lane, ptr addrspace(8) %inout, i32 0, i32 0, i32 0, i32 0) 2187 store i32 %old, ptr addrspace(1) %out 2188 ret void 2189} 2190; Struct-buffer atomic sub with the constant 1 as data operand and the workitem id (%lane) as the vindex operand. The expected code issues buffer_atomic_sub directly in every lane with the workitem-id register as the idxen index; there is no mbcnt / single-lane branch / readfirstlane sequence around it. 2191define amdgpu_kernel void @sub_i32_varying_vindex(ptr addrspace(1) %out, ptr addrspace(8) %inout) { 2192; GFX6-LABEL: sub_i32_varying_vindex: 2193; GFX6: ; %bb.0: ; %entry 2194; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd 2195; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 2196; GFX6-NEXT: v_mov_b32_e32 v1, 1 2197; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2198; GFX6-NEXT: buffer_atomic_sub v1, v0, s[0:3], 0 idxen glc 2199; GFX6-NEXT: s_mov_b32 s7, 0xf000 2200; GFX6-NEXT: s_mov_b32 s6, -1 2201; GFX6-NEXT: s_waitcnt vmcnt(0) 2202; GFX6-NEXT: buffer_store_dword v1, off, s[4:7], 0 2203; GFX6-NEXT: s_endpgm 2204; 2205; GFX8-LABEL: sub_i32_varying_vindex: 2206; GFX8: ; %bb.0: ; %entry 2207; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 2208; GFX8-NEXT: v_mov_b32_e32 v2, 1 2209; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2210; GFX8-NEXT: buffer_atomic_sub v2, v0, s[0:3], 0 idxen glc 2211; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 2212; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2213; GFX8-NEXT: v_mov_b32_e32 v0, s0 2214; GFX8-NEXT: v_mov_b32_e32 v1, s1 2215; GFX8-NEXT: s_waitcnt vmcnt(0) 2216; GFX8-NEXT: flat_store_dword v[0:1], v2 2217; GFX8-NEXT: s_endpgm 2218; 2219; GFX9-LABEL: sub_i32_varying_vindex: 2220; GFX9: ; %bb.0: ; %entry 2221; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 2222; GFX9-NEXT: v_mov_b32_e32 v1, 1 2223; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2224; GFX9-NEXT: buffer_atomic_sub v1, v0, s[0:3], 0 idxen glc 2225; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 2226; GFX9-NEXT: v_mov_b32_e32 v0, 0 2227; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2228; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 2229; GFX9-NEXT: s_endpgm 2230; 2231; GFX10-LABEL: sub_i32_varying_vindex: 2232; GFX10: ; %bb.0: ; %entry 2233; GFX10-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x34 2234; GFX10-NEXT: v_mov_b32_e32 v1, 1 2235; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2236; GFX10-NEXT: buffer_atomic_sub v1, v0, s[0:3], 0 idxen glc 2237; GFX10-NEXT: s_waitcnt_depctr 0xffe3 2238; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 2239; GFX10-NEXT: v_mov_b32_e32 v0, 0 2240; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2241; GFX10-NEXT: global_store_dword v0, v1, s[0:1] 2242; GFX10-NEXT: s_endpgm 2243; 2244; GFX11W64-LABEL: sub_i32_varying_vindex: 2245; GFX11W64: ; %bb.0: ; %entry 2246; GFX11W64-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 2247; GFX11W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0 2248; GFX11W64-NEXT: v_mov_b32_e32 v1, 1 2249; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) 2250; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, v0, s[0:3], 0 idxen glc 2251; GFX11W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 2252; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 2253; GFX11W64-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2254; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] 2255; GFX11W64-NEXT: s_endpgm 2256; 2257; GFX11W32-LABEL: sub_i32_varying_vindex: 2258; GFX11W32: ; %bb.0: ; %entry 2259; GFX11W32-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 2260; GFX11W32-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 2261; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) 2262; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, v0, s[0:3], 0 idxen glc 2263; GFX11W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 2264; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 2265; GFX11W32-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2266; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] 2267; GFX11W32-NEXT: s_endpgm 2268; 2269; GFX12W64-LABEL: sub_i32_varying_vindex: 2270; GFX12W64: ; %bb.0: ; %entry 2271; GFX12W64-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 2272; GFX12W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0 2273; GFX12W64-NEXT: v_mov_b32_e32 v1, 1 2274; GFX12W64-NEXT: s_wait_kmcnt 0x0 2275; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, v0, s[0:3], null idxen th:TH_ATOMIC_RETURN 2276; GFX12W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 2277; GFX12W64-NEXT: 
v_mov_b32_e32 v0, 0
; GFX12W64-NEXT: s_wait_loadcnt 0x0
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12W64-NEXT: s_endpgm
;
; GFX12W32-LABEL: sub_i32_varying_vindex:
; GFX12W32: ; %bb.0: ; %entry
; GFX12W32-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
; GFX12W32-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, v0, s[0:3], null idxen th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX12W32-NEXT: v_mov_b32_e32 v0, 0
; GFX12W32-NEXT: s_wait_loadcnt 0x0
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12W32-NEXT: s_endpgm
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = call i32 @llvm.amdgcn.struct.ptr.buffer.atomic.sub(i32 1, ptr addrspace(8) %inout, i32 %lane, i32 0, i32 0, i32 0)
  store i32 %old, ptr addrspace(1) %out
  ret void
}

; Kernel under test: calls llvm.amdgcn.struct.ptr.buffer.atomic.sub with the
; constant 1 on the buffer resource %inout, passing 0 as the third operand and
; the workitem id (x) as the fourth, then stores the value returned by the
; atomic to %out.
; Review note: judging by the function name, the fourth operand is the buffer
; offset; confirm against the intrinsic definition before relying on that.
; For every target checked below, the expected output is a single basic block
; holding one buffer_atomic_sub / buffer_atomic_sub_u32 with "idxen offen".
; The assertions inside the body are produced by
; utils/update_llc_test_checks.py (see the first line of this file);
; regenerate them with that script instead of editing them by hand.
define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr addrspace(8) %inout) {
; GFX6-LABEL: sub_i32_varying_offset:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: v_mov_b32_e32 v1, v0
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd
; GFX6-NEXT: s_mov_b32 s6, 0
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
; GFX6-NEXT: v_mov_b32_e32 v0, s6
; GFX6-NEXT: v_mov_b32_e32 v2, 1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: buffer_atomic_sub v2, v[0:1], s[0:3], 0 idxen offen glc
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v2, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: sub_i32_varying_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; GFX8-NEXT: s_mov_b32 s6, 0
; GFX8-NEXT: v_mov_b32_e32 v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_mov_b32_e32 v2, 1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_atomic_sub v2, v[0:1], s[0:3], 0 idxen offen glc
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sub_i32_varying_offset:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; GFX9-NEXT: s_mov_b32 s6, 0
; GFX9-NEXT: v_mov_b32_e32 v1, v0
; GFX9-NEXT: v_mov_b32_e32 v0, s6
; GFX9-NEXT: v_mov_b32_e32 v2, 1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_atomic_sub v2, v[0:1], s[0:3], 0 idxen offen glc
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_store_dword v0, v2, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: sub_i32_varying_offset:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; GFX10-NEXT: s_mov_b32 s6, 0
; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: v_mov_b32_e32 v0, s6
; GFX10-NEXT: v_mov_b32_e32 v2, 1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_atomic_sub v2, v[0:1], s[0:3], 0 idxen offen glc
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_store_dword v0, v2, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11W64-LABEL: sub_i32_varying_offset:
; GFX11W64: ; %bb.0: ; %entry
; GFX11W64-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
; GFX11W64-NEXT: s_mov_b32 s6, 0
; GFX11W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0
; GFX11W64-NEXT: v_mov_b32_e32 v0, s6
; GFX11W64-NEXT: v_mov_b32_e32 v2, 1
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W64-NEXT: buffer_atomic_sub_u32 v2, v[0:1], s[0:3], 0 idxen offen glc
; GFX11W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11W64-NEXT: v_mov_b32_e32 v0, 0
; GFX11W64-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11W64-NEXT: global_store_b32 v0, v2, s[0:1]
; GFX11W64-NEXT: s_endpgm
;
; GFX11W32-LABEL: sub_i32_varying_offset:
; GFX11W32: ; %bb.0: ; %entry
; GFX11W32-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
; GFX11W32-NEXT: s_mov_b32 s6, 0
; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11W32-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_and_b32 v1, 0x3ff, v0
; GFX11W32-NEXT: v_mov_b32_e32 v2, 1
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W32-NEXT: buffer_atomic_sub_u32 v2, v[0:1], s[0:3], 0 idxen offen glc
; GFX11W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11W32-NEXT: v_mov_b32_e32 v0, 0
; GFX11W32-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11W32-NEXT: global_store_b32 v0, v2, s[0:1]
; GFX11W32-NEXT: s_endpgm
;
; GFX12W64-LABEL: sub_i32_varying_offset:
; GFX12W64: ; %bb.0: ; %entry
; GFX12W64-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
; GFX12W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0
; GFX12W64-NEXT: v_mov_b32_e32 v0, 0
; GFX12W64-NEXT: v_mov_b32_e32 v2, 1
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: buffer_atomic_sub_u32 v2, v[0:1], s[0:3], null idxen offen th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX12W64-NEXT: s_wait_loadcnt 0x0
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: global_store_b32 v0, v2, s[0:1]
; GFX12W64-NEXT: s_endpgm
;
; GFX12W32-LABEL: sub_i32_varying_offset:
; GFX12W32: ; %bb.0: ; %entry
; GFX12W32-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
; GFX12W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_and_b32 v1, 0x3ff, v0
; GFX12W32-NEXT: v_mov_b32_e32 v2, 1
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: buffer_atomic_sub_u32 v2, v[0:1], s[0:3], null idxen offen th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: global_store_b32 v0, v2, s[0:1]
; GFX12W32-NEXT: s_endpgm
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = call i32 @llvm.amdgcn.struct.ptr.buffer.atomic.sub(i32 1, ptr addrspace(8) %inout, i32 0, i32 %lane, i32 0, i32 0)
  store i32 %old, ptr addrspace(1) %out
  ret void
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GFX11: {{.*}}
; GFX12: {{.*}}