; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX6 %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W64 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W32 %s
; Every llc invocation above selects the Iterative atomic-optimizer strategy.
; gfx1010, gfx1100 and gfx1200 are each run in wave64 and wave32 mode; those
; invocations pass a shared per-generation prefix in addition to the
; wave-size-specific one.

; Intrinsic declarations made available to the kernels in this file.
declare i32 @llvm.amdgcn.workitem.id.x()
declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32, ptr addrspace(8), i32, i32, i32)
declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.sub(i32, ptr addrspace(8), i32, i32, i32)

; Show what the atomic optimization pass will do for raw buffers.

; The atomic operand is the constant 5. In the expected code only the lane
; whose mbcnt result is 0 issues the buffer atomic, using popcount(exec) * 5
; (s_bcnt1 followed by s_mul_i32) as the operand; every lane then rebuilds its
; own return value as readfirstlane(result) + 5 * mbcnt (v_mad_u32_u24).
define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace(8) %inout) {
; GFX6-LABEL: add_i32_constant:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_mov_b64 s[2:3], exec
; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX6-NEXT:    ; implicit-def: $vgpr1
; GFX6-NEXT:    s_and_saveexec_b64 s[0:1], vcc
; GFX6-NEXT:    s_cbranch_execz .LBB0_2
; GFX6-NEXT:  ; %bb.1:
; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xd
; GFX6-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
; GFX6-NEXT:    s_mul_i32 s2, s2, 5
; GFX6-NEXT:    v_mov_b32_e32 v1, s2
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX6-NEXT:  .LBB0_2:
; GFX6-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    v_readfirstlane_b32 s4, v1
; GFX6-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: add_i32_constant:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_mov_b64 s[2:3], exec
; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT:    ; implicit-def: $vgpr1
; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], vcc
; GFX8-NEXT:    s_cbranch_execz .LBB0_2
; GFX8-NEXT:  ; %bb.1:
; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
; GFX8-NEXT:    s_mul_i32 s2, s2, 5
; GFX8-NEXT:    v_mov_b32_e32 v1, s2
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX8-NEXT:  .LBB0_2:
; GFX8-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
; GFX8-NEXT:    v_mad_u32_u24 v2, v0, 5, s2
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: add_i32_constant:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_mov_b64 s[2:3], exec
; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT:    ; implicit-def: $vgpr1
; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
; GFX9-NEXT:    s_cbranch_execz .LBB0_2
; GFX9-NEXT:  ; %bb.1:
; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
; GFX9-NEXT:    s_mul_i32 s2, s2, 5
; GFX9-NEXT:    v_mov_b32_e32 v1, s2
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX9-NEXT:  .LBB0_2:
; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX10W64-LABEL: add_i32_constant:
; GFX10W64:       ; %bb.0: ; %entry
; GFX10W64-NEXT:    s_mov_b64 s[2:3], exec
; GFX10W64-NEXT:    ; implicit-def: $vgpr1
; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX10W64-NEXT:    s_and_saveexec_b64 s[0:1], vcc
; GFX10W64-NEXT:    s_cbranch_execz .LBB0_2
; GFX10W64-NEXT:  ; %bb.1:
; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
; GFX10W64-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
; GFX10W64-NEXT:    s_mul_i32 s2, s2, 5
; GFX10W64-NEXT:    v_mov_b32_e32 v1, s2
; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10W64-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX10W64-NEXT:  .LBB0_2:
; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX10W64-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
; GFX10W64-NEXT:    v_readfirstlane_b32 s2, v1
; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
; GFX10W64-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10W64-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10W64-NEXT:    s_endpgm
;
; GFX10W32-LABEL: add_i32_constant:
; GFX10W32:       ; %bb.0: ; %entry
; GFX10W32-NEXT:    s_mov_b32 s1, exec_lo
; GFX10W32-NEXT:    ; implicit-def: $vgpr1
; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s1, 0
; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10W32-NEXT:    s_and_saveexec_b32 s0, vcc_lo
; GFX10W32-NEXT:    s_cbranch_execz .LBB0_2
; GFX10W32-NEXT:  ; %bb.1:
; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
; GFX10W32-NEXT:    s_bcnt1_i32_b32 s1, s1
; GFX10W32-NEXT:    s_mul_i32 s1, s1, 5
; GFX10W32-NEXT:    v_mov_b32_e32 v1, s1
; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10W32-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX10W32-NEXT:  .LBB0_2:
; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
; GFX10W32-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10W32-NEXT:    v_readfirstlane_b32 s2, v1
; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
; GFX10W32-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10W32-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10W32-NEXT:    s_endpgm
;
; GFX11W64-LABEL: add_i32_constant:
; GFX11W64:       ; %bb.0: ; %entry
; GFX11W64-NEXT:    s_mov_b64 s[2:3], exec
; GFX11W64-NEXT:    s_mov_b64 s[0:1], exec
; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX11W64-NEXT:    ; implicit-def: $vgpr1
; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX11W64-NEXT:    v_cmpx_eq_u32_e32 0, v0
; GFX11W64-NEXT:    s_cbranch_execz .LBB0_2
; GFX11W64-NEXT:  ; %bb.1:
; GFX11W64-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
; GFX11W64-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11W64-NEXT:    s_mul_i32 s2, s2, 5
; GFX11W64-NEXT:    v_mov_b32_e32 v1, s2
; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11W64-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
; GFX11W64-NEXT:  .LBB0_2:
; GFX11W64-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX11W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
; GFX11W64-NEXT:    v_readfirstlane_b32 s2, v1
; GFX11W64-NEXT:    v_mov_b32_e32 v1, 0
; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX11W64-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11W64-NEXT:    global_store_b32 v1, v0, s[0:1]
; GFX11W64-NEXT:    s_endpgm
;
; GFX11W32-LABEL: add_i32_constant:
; GFX11W32:       ; %bb.0: ; %entry
; GFX11W32-NEXT:    s_mov_b32 s1, exec_lo
; GFX11W32-NEXT:    s_mov_b32 s0, exec_lo
; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s1, 0
; GFX11W32-NEXT:    ; implicit-def: $vgpr1
; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11W32-NEXT:    v_cmpx_eq_u32_e32 0, v0
; GFX11W32-NEXT:    s_cbranch_execz .LBB0_2
; GFX11W32-NEXT:  ; %bb.1:
; GFX11W32-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
; GFX11W32-NEXT:    s_bcnt1_i32_b32 s1, s1
; GFX11W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11W32-NEXT:    s_mul_i32 s1, s1, 5
; GFX11W32-NEXT:    v_mov_b32_e32 v1, s1
; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11W32-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
; GFX11W32-NEXT:  .LBB0_2:
; GFX11W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
; GFX11W32-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
; GFX11W32-NEXT:    v_readfirstlane_b32 s2, v1
; GFX11W32-NEXT:    v_mov_b32_e32 v1, 0
; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX11W32-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11W32-NEXT:    global_store_b32 v1, v0, s[0:1]
; GFX11W32-NEXT:    s_endpgm
;
; GFX12W64-LABEL: add_i32_constant:
; GFX12W64:       ; %bb.0: ; %entry
; GFX12W64-NEXT:    s_mov_b64 s[2:3], exec
; GFX12W64-NEXT:    s_mov_b64 s[0:1], exec
; GFX12W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX12W64-NEXT:    ; implicit-def: $vgpr1
; GFX12W64-NEXT:    s_wait_alu 0xfffe
; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX12W64-NEXT:    v_cmpx_eq_u32_e32 0, v0
; GFX12W64-NEXT:    s_cbranch_execz .LBB0_2
; GFX12W64-NEXT:  ; %bb.1:
; GFX12W64-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
; GFX12W64-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
; GFX12W64-NEXT:    s_wait_alu 0xfffe
; GFX12W64-NEXT:    s_mul_i32 s2, s2, 5
; GFX12W64-NEXT:    s_wait_alu 0xfffe
; GFX12W64-NEXT:    v_mov_b32_e32 v1, s2
; GFX12W64-NEXT:    s_wait_kmcnt 0x0
; GFX12W64-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W64-NEXT:  .LBB0_2:
; GFX12W64-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX12W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX12W64-NEXT:    s_wait_loadcnt 0x0
; GFX12W64-NEXT:    v_readfirstlane_b32 s2, v1
; GFX12W64-NEXT:    v_mov_b32_e32 v1, 0
; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX12W64-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
; GFX12W64-NEXT:    s_wait_kmcnt 0x0
; GFX12W64-NEXT:    global_store_b32 v1, v0, s[0:1]
; GFX12W64-NEXT:    s_endpgm
;
; GFX12W32-LABEL: add_i32_constant:
; GFX12W32:       ; %bb.0: ; %entry
; GFX12W32-NEXT:    s_mov_b32 s1, exec_lo
; GFX12W32-NEXT:    s_mov_b32 s0, exec_lo
; GFX12W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s1, 0
; GFX12W32-NEXT:    ; implicit-def: $vgpr1
; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12W32-NEXT:    v_cmpx_eq_u32_e32 0, v0
; GFX12W32-NEXT:    s_cbranch_execz .LBB0_2
; GFX12W32-NEXT:  ; %bb.1:
; GFX12W32-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
; GFX12W32-NEXT:    s_wait_alu 0xfffe
; GFX12W32-NEXT:    s_bcnt1_i32_b32 s1, s1
; GFX12W32-NEXT:    s_wait_alu 0xfffe
; GFX12W32-NEXT:    s_mul_i32 s1, s1, 5
; GFX12W32-NEXT:    s_wait_alu 0xfffe
; GFX12W32-NEXT:    v_mov_b32_e32 v1, s1
; GFX12W32-NEXT:    s_wait_kmcnt 0x0
; GFX12W32-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT:  .LBB0_2:
; GFX12W32-NEXT:    s_wait_alu 0xfffe
; GFX12W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
; GFX12W32-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX12W32-NEXT:    s_wait_loadcnt 0x0
; GFX12W32-NEXT:    v_readfirstlane_b32 s2, v1
; GFX12W32-NEXT:    v_mov_b32_e32 v1, 0
; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX12W32-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
; GFX12W32-NEXT:    s_wait_kmcnt 0x0
; GFX12W32-NEXT:    global_store_b32 v1, v0, s[0:1]
; GFX12W32-NEXT:    s_endpgm
entry:
  %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32 5, ptr addrspace(8) %inout, i32 0, i32 0, i32 0)
  store i32 %old, ptr addrspace(1) %out
  ret void
}

; The atomic operand is the kernel argument %additive. The expected code
; multiplies it by popcount(exec) for the single atomic (s_mul_i32) and by the
; per-lane mbcnt value when rebuilding each lane's result (v_mul_lo_u32 plus an
; add on GFX6/GFX8/GFX9, a single mad from GFX10 onwards).
define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(8) %inout, i32 %additive) {
; GFX6-LABEL: add_i32_uniform:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_mov_b64 s[2:3], exec
; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x11
; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX6-NEXT:    ; implicit-def: $vgpr1
; GFX6-NEXT:    s_and_saveexec_b64 s[0:1], vcc
; GFX6-NEXT:    s_cbranch_execz .LBB1_2
; GFX6-NEXT:  ; %bb.1:
; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xd
; GFX6-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_mul_i32 s2, s6, s2
; GFX6-NEXT:    v_mov_b32_e32 v1, s2
; GFX6-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX6-NEXT:  .LBB1_2:
; GFX6-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    v_readfirstlane_b32 s4, v1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mul_lo_u32 v0, s6, v0
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: add_i32_uniform:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dword s6, s[4:5], 0x44
; GFX8-NEXT:    s_mov_b64 s[2:3], exec
; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT:    ; implicit-def: $vgpr1
; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], vcc
; GFX8-NEXT:    s_cbranch_execz .LBB1_2
; GFX8-NEXT:  ; %bb.1:
; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_mul_i32 s2, s6, s2
; GFX8-NEXT:    v_mov_b32_e32 v1, s2
; GFX8-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX8-NEXT:  .LBB1_2:
; GFX8-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mul_lo_u32 v0, s6, v0
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s2, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: add_i32_uniform:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x44
; GFX9-NEXT:    s_mov_b64 s[2:3], exec
; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT:    ; implicit-def: $vgpr1
; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
; GFX9-NEXT:    s_cbranch_execz .LBB1_2
; GFX9-NEXT:  ; %bb.1:
; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mul_i32 s2, s6, s2
; GFX9-NEXT:    v_mov_b32_e32 v1, s2
; GFX9-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX9-NEXT:  .LBB1_2:
; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mul_lo_u32 v0, s6, v0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    v_add_u32_e32 v0, s2, v0
; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX10W64-LABEL: add_i32_uniform:
; GFX10W64:       ; %bb.0: ; %entry
; GFX10W64-NEXT:    s_load_dword s6, s[4:5], 0x44
; GFX10W64-NEXT:    s_mov_b64 s[2:3], exec
; GFX10W64-NEXT:    ; implicit-def: $vgpr1
; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX10W64-NEXT:    s_and_saveexec_b64 s[0:1], vcc
; GFX10W64-NEXT:    s_cbranch_execz .LBB1_2
; GFX10W64-NEXT:  ; %bb.1:
; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
; GFX10W64-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10W64-NEXT:    s_mul_i32 s2, s6, s2
; GFX10W64-NEXT:    v_mov_b32_e32 v1, s2
; GFX10W64-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX10W64-NEXT:  .LBB1_2:
; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX10W64-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
; GFX10W64-NEXT:    v_readfirstlane_b32 s2, v1
; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10W64-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], s6, v0, s[2:3]
; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
; GFX10W64-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10W64-NEXT:    s_endpgm
;
; GFX10W32-LABEL: add_i32_uniform:
; GFX10W32:       ; %bb.0: ; %entry
; GFX10W32-NEXT:    s_load_dword s0, s[4:5], 0x44
; GFX10W32-NEXT:    s_mov_b32 s2, exec_lo
; GFX10W32-NEXT:    ; implicit-def: $vgpr1
; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10W32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
; GFX10W32-NEXT:    s_cbranch_execz .LBB1_2
; GFX10W32-NEXT:  ; %bb.1:
; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
; GFX10W32-NEXT:    s_bcnt1_i32_b32 s2, s2
; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10W32-NEXT:    s_mul_i32 s2, s0, s2
; GFX10W32-NEXT:    v_mov_b32_e32 v1, s2
; GFX10W32-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX10W32-NEXT:  .LBB1_2:
; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10W32-NEXT:    s_mov_b32 null, 0
; GFX10W32-NEXT:    v_readfirstlane_b32 s4, v1
; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10W32-NEXT:    v_mad_u64_u32 v[0:1], s0, s0, v0, s[4:5]
; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
; GFX10W32-NEXT:    s_endpgm
;
; GFX11W64-LABEL: add_i32_uniform:
; GFX11W64:       ; %bb.0: ; %entry
; GFX11W64-NEXT:    s_load_b32 s6, s[4:5], 0x44
; GFX11W64-NEXT:    s_mov_b64 s[2:3], exec
; GFX11W64-NEXT:    s_mov_b64 s[0:1], exec
; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX11W64-NEXT:    ; implicit-def: $vgpr1
; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX11W64-NEXT:    v_cmpx_eq_u32_e32 0, v0
; GFX11W64-NEXT:    s_cbranch_execz .LBB1_2
; GFX11W64-NEXT:  ; %bb.1:
; GFX11W64-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
; GFX11W64-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11W64-NEXT:    s_mul_i32 s2, s6, s2
; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11W64-NEXT:    v_mov_b32_e32 v1, s2
; GFX11W64-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
; GFX11W64-NEXT:  .LBB1_2:
; GFX11W64-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX11W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
; GFX11W64-NEXT:    v_readfirstlane_b32 s2, v1
; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11W64-NEXT:    v_mad_u64_u32 v[1:2], null, s6, v0, s[2:3]
; GFX11W64-NEXT:    v_mov_b32_e32 v0, 0
; GFX11W64-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11W64-NEXT:    s_endpgm
;
; GFX11W32-LABEL: add_i32_uniform:
; GFX11W32:       ; %bb.0: ; %entry
; GFX11W32-NEXT:    s_load_b32 s0, s[4:5], 0x44
; GFX11W32-NEXT:    s_mov_b32 s2, exec_lo
; GFX11W32-NEXT:    s_mov_b32 s1, exec_lo
; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX11W32-NEXT:    ; implicit-def: $vgpr1
; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11W32-NEXT:    v_cmpx_eq_u32_e32 0, v0
; GFX11W32-NEXT:    s_cbranch_execz .LBB1_2
; GFX11W32-NEXT:  ; %bb.1:
; GFX11W32-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
; GFX11W32-NEXT:    s_bcnt1_i32_b32 s2, s2
; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11W32-NEXT:    s_mul_i32 s2, s0, s2
; GFX11W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11W32-NEXT:    v_mov_b32_e32 v1, s2
; GFX11W32-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
; GFX11W32-NEXT:  .LBB1_2:
; GFX11W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
; GFX11W32-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
; GFX11W32-NEXT:    v_readfirstlane_b32 s4, v1
; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11W32-NEXT:    v_mad_u64_u32 v[1:2], null, s0, v0, s[4:5]
; GFX11W32-NEXT:    v_mov_b32_e32 v0, 0
; GFX11W32-NEXT:    global_store_b32 v0, v1, s[2:3]
; GFX11W32-NEXT:    s_endpgm
;
; GFX12W64-LABEL: add_i32_uniform:
; GFX12W64:       ; %bb.0: ; %entry
; GFX12W64-NEXT:    s_load_b32 s6, s[4:5], 0x44
; GFX12W64-NEXT:    s_mov_b64 s[2:3], exec
; GFX12W64-NEXT:    s_mov_b64 s[0:1], exec
; GFX12W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX12W64-NEXT:    ; implicit-def: $vgpr1
; GFX12W64-NEXT:    s_wait_alu 0xfffe
; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX12W64-NEXT:    v_cmpx_eq_u32_e32 0, v0
; GFX12W64-NEXT:    s_cbranch_execz .LBB1_2
; GFX12W64-NEXT:  ; %bb.1:
; GFX12W64-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
; GFX12W64-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
; GFX12W64-NEXT:    s_wait_kmcnt 0x0
; GFX12W64-NEXT:    s_wait_alu 0xfffe
; GFX12W64-NEXT:    s_mul_i32 s2, s6, s2
; GFX12W64-NEXT:    s_wait_alu 0xfffe
; GFX12W64-NEXT:    v_mov_b32_e32 v1, s2
; GFX12W64-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W64-NEXT:  .LBB1_2:
; GFX12W64-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX12W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX12W64-NEXT:    s_wait_loadcnt 0x0
; GFX12W64-NEXT:    v_readfirstlane_b32 s2, v1
; GFX12W64-NEXT:    s_wait_kmcnt 0x0
; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12W64-NEXT:    v_mad_co_u64_u32 v[0:1], null, s6, v0, s[2:3]
; GFX12W64-NEXT:    v_mov_b32_e32 v1, 0
; GFX12W64-NEXT:    global_store_b32 v1, v0, s[0:1]
; GFX12W64-NEXT:    s_endpgm
;
; GFX12W32-LABEL: add_i32_uniform:
; GFX12W32:       ; %bb.0: ; %entry
; GFX12W32-NEXT:    s_load_b32 s0, s[4:5], 0x44
; GFX12W32-NEXT:    s_mov_b32 s2, exec_lo
; GFX12W32-NEXT:    s_mov_b32 s1, exec_lo
; GFX12W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX12W32-NEXT:    ; implicit-def: $vgpr1
; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12W32-NEXT:    v_cmpx_eq_u32_e32 0, v0
; GFX12W32-NEXT:    s_cbranch_execz .LBB1_2
; GFX12W32-NEXT:  ; %bb.1:
; GFX12W32-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
; GFX12W32-NEXT:    s_wait_alu 0xfffe
; GFX12W32-NEXT:    s_bcnt1_i32_b32 s2, s2
; GFX12W32-NEXT:    s_wait_kmcnt 0x0
; GFX12W32-NEXT:    s_wait_alu 0xfffe
; GFX12W32-NEXT:    s_mul_i32 s2, s0, s2
; GFX12W32-NEXT:    s_wait_alu 0xfffe
; GFX12W32-NEXT:    v_mov_b32_e32 v1, s2
; GFX12W32-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT:  .LBB1_2:
; GFX12W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
; GFX12W32-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
; GFX12W32-NEXT:    s_wait_loadcnt 0x0
; GFX12W32-NEXT:    v_readfirstlane_b32 s4, v1
; GFX12W32-NEXT:    s_wait_kmcnt 0x0
; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12W32-NEXT:    v_mad_co_u64_u32 v[0:1], null, s0, v0, s[4:5]
; GFX12W32-NEXT:    v_mov_b32_e32 v1, 0
; GFX12W32-NEXT:    global_store_b32 v1, v0, s[2:3]
; GFX12W32-NEXT:    s_endpgm
entry:
  %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32 %additive, ptr addrspace(8) %inout, i32 0, i32 0, i32 0)
  store i32 %old, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout) {
; GFX6-LABEL: add_i32_varying_vdata:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_mov_b64 s[0:1], exec
; GFX6-NEXT:    s_mov_b32 s2, 0
; GFX6-NEXT:    ; implicit-def: $vgpr1
; GFX6-NEXT:  .LBB2_1: ; %ComputeLoop
; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT:    s_ff1_i32_b64 s3, s[0:1]
; GFX6-NEXT:    s_mov_b32 m0, s3
; GFX6-NEXT:    v_readlane_b32 s8, v0, s3
; GFX6-NEXT:    v_writelane_b32 v1, s2, m0
; GFX6-NEXT:    s_lshl_b64 s[6:7], 1, s3
; GFX6-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX6-NEXT:    v_cmp_ne_u64_e64 s[6:7], s[0:1], 0
; GFX6-NEXT:    s_and_b64 vcc, exec, s[6:7]
; GFX6-NEXT:    s_add_i32 s2, s2, s8
; GFX6-NEXT:    s_cbranch_vccnz .LBB2_1
; GFX6-NEXT:  ; %bb.2: ;
%ComputeEnd 583; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 584; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 585; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 586; GFX6-NEXT: ; implicit-def: $vgpr0 587; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc 588; GFX6-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 589; GFX6-NEXT: s_cbranch_execz .LBB2_4 590; GFX6-NEXT: ; %bb.3: 591; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd 592; GFX6-NEXT: v_mov_b32_e32 v0, s2 593; GFX6-NEXT: s_waitcnt lgkmcnt(0) 594; GFX6-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc 595; GFX6-NEXT: .LBB2_4: 596; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] 597; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 598; GFX6-NEXT: s_mov_b32 s3, 0xf000 599; GFX6-NEXT: s_mov_b32 s2, -1 600; GFX6-NEXT: s_waitcnt vmcnt(0) 601; GFX6-NEXT: v_readfirstlane_b32 s4, v0 602; GFX6-NEXT: s_waitcnt expcnt(0) 603; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v1 604; GFX6-NEXT: s_waitcnt lgkmcnt(0) 605; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 606; GFX6-NEXT: s_endpgm 607; 608; GFX8-LABEL: add_i32_varying_vdata: 609; GFX8: ; %bb.0: ; %entry 610; GFX8-NEXT: s_mov_b64 s[0:1], exec 611; GFX8-NEXT: s_mov_b32 s2, 0 612; GFX8-NEXT: ; implicit-def: $vgpr1 613; GFX8-NEXT: .LBB2_1: ; %ComputeLoop 614; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 615; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] 616; GFX8-NEXT: s_mov_b32 m0, s3 617; GFX8-NEXT: v_readlane_b32 s8, v0, s3 618; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 619; GFX8-NEXT: v_writelane_b32 v1, s2, m0 620; GFX8-NEXT: s_add_i32 s2, s2, s8 621; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] 622; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 623; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 624; GFX8-NEXT: ; %bb.2: ; %ComputeEnd 625; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 626; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 627; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 628; GFX8-NEXT: ; implicit-def: $vgpr0 629; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc 630; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 631; GFX8-NEXT: 
s_cbranch_execz .LBB2_4 632; GFX8-NEXT: ; %bb.3: 633; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 634; GFX8-NEXT: v_mov_b32_e32 v0, s2 635; GFX8-NEXT: s_waitcnt lgkmcnt(0) 636; GFX8-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc 637; GFX8-NEXT: .LBB2_4: 638; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] 639; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 640; GFX8-NEXT: s_waitcnt vmcnt(0) 641; GFX8-NEXT: v_readfirstlane_b32 s2, v0 642; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v1 643; GFX8-NEXT: s_waitcnt lgkmcnt(0) 644; GFX8-NEXT: v_mov_b32_e32 v0, s0 645; GFX8-NEXT: v_mov_b32_e32 v1, s1 646; GFX8-NEXT: flat_store_dword v[0:1], v2 647; GFX8-NEXT: s_endpgm 648; 649; GFX9-LABEL: add_i32_varying_vdata: 650; GFX9: ; %bb.0: ; %entry 651; GFX9-NEXT: s_mov_b64 s[0:1], exec 652; GFX9-NEXT: s_mov_b32 s2, 0 653; GFX9-NEXT: ; implicit-def: $vgpr1 654; GFX9-NEXT: .LBB2_1: ; %ComputeLoop 655; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 656; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] 657; GFX9-NEXT: s_mov_b32 m0, s3 658; GFX9-NEXT: v_readlane_b32 s8, v0, s3 659; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 660; GFX9-NEXT: v_writelane_b32 v1, s2, m0 661; GFX9-NEXT: s_add_i32 s2, s2, s8 662; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] 663; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 664; GFX9-NEXT: s_cbranch_scc1 .LBB2_1 665; GFX9-NEXT: ; %bb.2: ; %ComputeEnd 666; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 667; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 668; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 669; GFX9-NEXT: ; implicit-def: $vgpr0 670; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 671; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 672; GFX9-NEXT: s_cbranch_execz .LBB2_4 673; GFX9-NEXT: ; %bb.3: 674; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 675; GFX9-NEXT: v_mov_b32_e32 v0, s2 676; GFX9-NEXT: s_waitcnt lgkmcnt(0) 677; GFX9-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc 678; GFX9-NEXT: .LBB2_4: 679; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 680; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 681; 
GFX9-NEXT: s_waitcnt vmcnt(0) 682; GFX9-NEXT: v_readfirstlane_b32 s2, v0 683; GFX9-NEXT: v_mov_b32_e32 v2, 0 684; GFX9-NEXT: v_add_u32_e32 v0, s2, v1 685; GFX9-NEXT: s_waitcnt lgkmcnt(0) 686; GFX9-NEXT: global_store_dword v2, v0, s[0:1] 687; GFX9-NEXT: s_endpgm 688; 689; GFX10W64-LABEL: add_i32_varying_vdata: 690; GFX10W64: ; %bb.0: ; %entry 691; GFX10W64-NEXT: s_mov_b64 s[0:1], exec 692; GFX10W64-NEXT: s_mov_b32 s2, 0 693; GFX10W64-NEXT: ; implicit-def: $vgpr1 694; GFX10W64-NEXT: .LBB2_1: ; %ComputeLoop 695; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 696; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1] 697; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3 698; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 699; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 700; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] 701; GFX10W64-NEXT: s_add_i32 s2, s2, s8 702; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 703; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_1 704; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd 705; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 706; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 707; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 708; GFX10W64-NEXT: ; implicit-def: $vgpr0 709; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc 710; GFX10W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 711; GFX10W64-NEXT: s_cbranch_execz .LBB2_4 712; GFX10W64-NEXT: ; %bb.3: 713; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 714; GFX10W64-NEXT: v_mov_b32_e32 v0, s2 715; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 716; GFX10W64-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc 717; GFX10W64-NEXT: .LBB2_4: 718; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 719; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] 720; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 721; GFX10W64-NEXT: s_waitcnt vmcnt(0) 722; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 723; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 724; GFX10W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 725; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 726; GFX10W64-NEXT: global_store_dword v0, v1, s[0:1] 
727; GFX10W64-NEXT: s_endpgm 728; 729; GFX10W32-LABEL: add_i32_varying_vdata: 730; GFX10W32: ; %bb.0: ; %entry 731; GFX10W32-NEXT: s_mov_b32 s1, exec_lo 732; GFX10W32-NEXT: s_mov_b32 s0, 0 733; GFX10W32-NEXT: ; implicit-def: $vgpr1 734; GFX10W32-NEXT: .LBB2_1: ; %ComputeLoop 735; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 736; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 737; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2 738; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2 739; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2 740; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 741; GFX10W32-NEXT: s_add_i32 s0, s0, s3 742; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 743; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1 744; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd 745; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 746; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 747; GFX10W32-NEXT: ; implicit-def: $vgpr0 748; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo 749; GFX10W32-NEXT: s_xor_b32 s1, exec_lo, s1 750; GFX10W32-NEXT: s_cbranch_execz .LBB2_4 751; GFX10W32-NEXT: ; %bb.3: 752; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 753; GFX10W32-NEXT: v_mov_b32_e32 v0, s0 754; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 755; GFX10W32-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc 756; GFX10W32-NEXT: .LBB2_4: 757; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 758; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 759; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 760; GFX10W32-NEXT: s_waitcnt vmcnt(0) 761; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 762; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 763; GFX10W32-NEXT: v_add_nc_u32_e32 v1, s2, v1 764; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 765; GFX10W32-NEXT: global_store_dword v0, v1, s[0:1] 766; GFX10W32-NEXT: s_endpgm 767; 768; GFX11W64-LABEL: add_i32_varying_vdata: 769; GFX11W64: ; %bb.0: ; %entry 770; GFX11W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 771; GFX11W64-NEXT: s_mov_b64 s[0:1], exec 772; GFX11W64-NEXT: s_mov_b32 s2, 0 773; GFX11W64-NEXT: ; implicit-def: $vgpr0 774; GFX11W64-NEXT: .LBB2_1: ; 
%ComputeLoop 775; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 776; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1] 777; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 778; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3 779; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 780; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 781; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] 782; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) 783; GFX11W64-NEXT: s_add_i32 s2, s2, s8 784; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 785; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1 786; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd 787; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 788; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 789; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 790; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 791; GFX11W64-NEXT: ; implicit-def: $vgpr1 792; GFX11W64-NEXT: s_and_saveexec_b64 s[0:1], vcc 793; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 794; GFX11W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 795; GFX11W64-NEXT: s_cbranch_execz .LBB2_4 796; GFX11W64-NEXT: ; %bb.3: 797; GFX11W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 798; GFX11W64-NEXT: v_mov_b32_e32 v1, s2 799; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) 800; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc 801; GFX11W64-NEXT: .LBB2_4: 802; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] 803; GFX11W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 804; GFX11W64-NEXT: s_waitcnt vmcnt(0) 805; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 806; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 807; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) 808; GFX11W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 809; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) 810; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1] 811; GFX11W64-NEXT: s_endpgm 812; 813; GFX11W32-LABEL: add_i32_varying_vdata: 814; GFX11W32: ; %bb.0: ; %entry 815; GFX11W32-NEXT: v_and_b32_e32 v1, 0x3ff, v0 816; GFX11W32-NEXT: s_mov_b32 s1, exec_lo 817; GFX11W32-NEXT: 
s_mov_b32 s0, 0 818; GFX11W32-NEXT: ; implicit-def: $vgpr0 819; GFX11W32-NEXT: .LBB2_1: ; %ComputeLoop 820; GFX11W32-NEXT: ; =>This Inner Loop Header: Depth=1 821; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 822; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 823; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2 824; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 825; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 826; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 827; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) 828; GFX11W32-NEXT: s_add_i32 s0, s0, s3 829; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 830; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1 831; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd 832; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 833; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 834; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 835; GFX11W32-NEXT: ; implicit-def: $vgpr1 836; GFX11W32-NEXT: s_and_saveexec_b32 s1, vcc_lo 837; GFX11W32-NEXT: s_xor_b32 s1, exec_lo, s1 838; GFX11W32-NEXT: s_cbranch_execz .LBB2_4 839; GFX11W32-NEXT: ; %bb.3: 840; GFX11W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 841; GFX11W32-NEXT: v_mov_b32_e32 v1, s0 842; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) 843; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc 844; GFX11W32-NEXT: .LBB2_4: 845; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 846; GFX11W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 847; GFX11W32-NEXT: s_waitcnt vmcnt(0) 848; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 849; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) 850; GFX11W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0 851; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) 852; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] 853; GFX11W32-NEXT: s_endpgm 854; 855; GFX12W64-LABEL: add_i32_varying_vdata: 856; GFX12W64: ; %bb.0: ; %entry 857; GFX12W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 858; GFX12W64-NEXT: s_mov_b64 s[0:1], exec 859; GFX12W64-NEXT: s_mov_b32 s2, 0 860; GFX12W64-NEXT: ; implicit-def: 
$vgpr0 861; GFX12W64-NEXT: .LBB2_1: ; %ComputeLoop 862; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1 863; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1] 864; GFX12W64-NEXT: s_wait_alu 0xfffe 865; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3 866; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 867; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 868; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] 869; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) 870; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 871; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 872; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1 873; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd 874; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 875; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 876; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 877; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 878; GFX12W64-NEXT: ; implicit-def: $vgpr1 879; GFX12W64-NEXT: s_and_saveexec_b64 s[0:1], vcc 880; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 881; GFX12W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 882; GFX12W64-NEXT: s_cbranch_execz .LBB2_4 883; GFX12W64-NEXT: ; %bb.3: 884; GFX12W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 885; GFX12W64-NEXT: s_wait_alu 0xfffe 886; GFX12W64-NEXT: v_mov_b32_e32 v1, s2 887; GFX12W64-NEXT: s_wait_kmcnt 0x0 888; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN 889; GFX12W64-NEXT: .LBB2_4: 890; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] 891; GFX12W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 892; GFX12W64-NEXT: s_wait_loadcnt 0x0 893; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 894; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 895; GFX12W64-NEXT: s_wait_alu 0xfffe 896; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) 897; GFX12W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 898; GFX12W64-NEXT: s_wait_kmcnt 0x0 899; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] 900; GFX12W64-NEXT: s_endpgm 901; 902; GFX12W32-LABEL: add_i32_varying_vdata: 903; GFX12W32: ; %bb.0: ; %entry 904; GFX12W32-NEXT: 
v_and_b32_e32 v1, 0x3ff, v0 905; GFX12W32-NEXT: s_mov_b32 s1, exec_lo 906; GFX12W32-NEXT: s_mov_b32 s0, 0 907; GFX12W32-NEXT: ; implicit-def: $vgpr0 908; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop 909; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 910; GFX12W32-NEXT: s_wait_alu 0xfffe 911; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 912; GFX12W32-NEXT: s_wait_alu 0xfffe 913; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 914; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 915; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 916; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 917; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) 918; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 919; GFX12W32-NEXT: s_wait_alu 0xfffe 920; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 921; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1 922; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd 923; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 924; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) 925; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 926; GFX12W32-NEXT: ; implicit-def: $vgpr1 927; GFX12W32-NEXT: s_and_saveexec_b32 s1, vcc_lo 928; GFX12W32-NEXT: s_wait_alu 0xfffe 929; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1 930; GFX12W32-NEXT: s_cbranch_execz .LBB2_4 931; GFX12W32-NEXT: ; %bb.3: 932; GFX12W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 933; GFX12W32-NEXT: v_mov_b32_e32 v1, s0 934; GFX12W32-NEXT: s_wait_kmcnt 0x0 935; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN 936; GFX12W32-NEXT: .LBB2_4: 937; GFX12W32-NEXT: s_wait_alu 0xfffe 938; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 939; GFX12W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 940; GFX12W32-NEXT: s_wait_loadcnt 0x0 941; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 942; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) 943; GFX12W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0 944; GFX12W32-NEXT: s_wait_kmcnt 0x0 945; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] 946; GFX12W32-NEXT: s_endpgm 947entry: 948 %lane = call i32 @llvm.amdgcn.workitem.id.x() 949 %old = 
call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32 %lane, ptr addrspace(8) %inout, i32 0, i32 0, i32 0) 950 store i32 %old, ptr addrspace(1) %out 951 ret void 952} 953 954define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr addrspace(8) %inout) { 955; GFX6-LABEL: add_i32_varying_offset: 956; GFX6: ; %bb.0: ; %entry 957; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd 958; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 959; GFX6-NEXT: v_mov_b32_e32 v1, 1 960; GFX6-NEXT: s_waitcnt lgkmcnt(0) 961; GFX6-NEXT: buffer_atomic_add v1, v0, s[0:3], 0 offen glc 962; GFX6-NEXT: s_mov_b32 s7, 0xf000 963; GFX6-NEXT: s_mov_b32 s6, -1 964; GFX6-NEXT: s_waitcnt vmcnt(0) 965; GFX6-NEXT: buffer_store_dword v1, off, s[4:7], 0 966; GFX6-NEXT: s_endpgm 967; 968; GFX8-LABEL: add_i32_varying_offset: 969; GFX8: ; %bb.0: ; %entry 970; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 971; GFX8-NEXT: v_mov_b32_e32 v2, 1 972; GFX8-NEXT: s_waitcnt lgkmcnt(0) 973; GFX8-NEXT: buffer_atomic_add v2, v0, s[0:3], 0 offen glc 974; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 975; GFX8-NEXT: s_waitcnt lgkmcnt(0) 976; GFX8-NEXT: v_mov_b32_e32 v0, s0 977; GFX8-NEXT: v_mov_b32_e32 v1, s1 978; GFX8-NEXT: s_waitcnt vmcnt(0) 979; GFX8-NEXT: flat_store_dword v[0:1], v2 980; GFX8-NEXT: s_endpgm 981; 982; GFX9-LABEL: add_i32_varying_offset: 983; GFX9: ; %bb.0: ; %entry 984; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 985; GFX9-NEXT: v_mov_b32_e32 v1, 1 986; GFX9-NEXT: s_waitcnt lgkmcnt(0) 987; GFX9-NEXT: buffer_atomic_add v1, v0, s[0:3], 0 offen glc 988; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 989; GFX9-NEXT: v_mov_b32_e32 v0, 0 990; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 991; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 992; GFX9-NEXT: s_endpgm 993; 994; GFX10-LABEL: add_i32_varying_offset: 995; GFX10: ; %bb.0: ; %entry 996; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 997; GFX10-NEXT: v_mov_b32_e32 v1, 1 998; GFX10-NEXT: s_waitcnt lgkmcnt(0) 999; GFX10-NEXT: 
buffer_atomic_add v1, v0, s[0:3], 0 offen glc 1000; GFX10-NEXT: s_waitcnt_depctr 0xffe3 1001; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1002; GFX10-NEXT: v_mov_b32_e32 v0, 0 1003; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1004; GFX10-NEXT: global_store_dword v0, v1, s[0:1] 1005; GFX10-NEXT: s_endpgm 1006; 1007; GFX11W64-LABEL: add_i32_varying_offset: 1008; GFX11W64: ; %bb.0: ; %entry 1009; GFX11W64-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 1010; GFX11W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1011; GFX11W64-NEXT: v_mov_b32_e32 v1, 1 1012; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) 1013; GFX11W64-NEXT: buffer_atomic_add_u32 v1, v0, s[0:3], 0 offen glc 1014; GFX11W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1015; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 1016; GFX11W64-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1017; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] 1018; GFX11W64-NEXT: s_endpgm 1019; 1020; GFX11W32-LABEL: add_i32_varying_offset: 1021; GFX11W32: ; %bb.0: ; %entry 1022; GFX11W32-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 1023; GFX11W32-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 1024; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) 1025; GFX11W32-NEXT: buffer_atomic_add_u32 v1, v0, s[0:3], 0 offen glc 1026; GFX11W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1027; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 1028; GFX11W32-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1029; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] 1030; GFX11W32-NEXT: s_endpgm 1031; 1032; GFX12W64-LABEL: add_i32_varying_offset: 1033; GFX12W64: ; %bb.0: ; %entry 1034; GFX12W64-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 1035; GFX12W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1036; GFX12W64-NEXT: v_mov_b32_e32 v1, 1 1037; GFX12W64-NEXT: s_wait_kmcnt 0x0 1038; GFX12W64-NEXT: buffer_atomic_add_u32 v1, v0, s[0:3], null offen th:TH_ATOMIC_RETURN 1039; GFX12W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1040; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 1041; GFX12W64-NEXT: s_wait_loadcnt 0x0 1042; GFX12W64-NEXT: s_wait_kmcnt 0x0 1043; GFX12W64-NEXT: 
global_store_b32 v0, v1, s[0:1] 1044; GFX12W64-NEXT: s_endpgm 1045; 1046; GFX12W32-LABEL: add_i32_varying_offset: 1047; GFX12W32: ; %bb.0: ; %entry 1048; GFX12W32-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 1049; GFX12W32-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 1050; GFX12W32-NEXT: s_wait_kmcnt 0x0 1051; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v0, s[0:3], null offen th:TH_ATOMIC_RETURN 1052; GFX12W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1053; GFX12W32-NEXT: v_mov_b32_e32 v0, 0 1054; GFX12W32-NEXT: s_wait_loadcnt 0x0 1055; GFX12W32-NEXT: s_wait_kmcnt 0x0 1056; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1] 1057; GFX12W32-NEXT: s_endpgm 1058entry: 1059 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1060 %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32 1, ptr addrspace(8) %inout, i32 %lane, i32 0, i32 0) 1061 store i32 %old, ptr addrspace(1) %out 1062 ret void 1063} 1064 1065define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace(8) %inout) { 1066; GFX6-LABEL: sub_i32_constant: 1067; GFX6: ; %bb.0: ; %entry 1068; GFX6-NEXT: s_mov_b64 s[2:3], exec 1069; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1070; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 1071; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1072; GFX6-NEXT: ; implicit-def: $vgpr1 1073; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc 1074; GFX6-NEXT: s_cbranch_execz .LBB4_2 1075; GFX6-NEXT: ; %bb.1: 1076; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd 1077; GFX6-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1078; GFX6-NEXT: s_mul_i32 s2, s2, 5 1079; GFX6-NEXT: v_mov_b32_e32 v1, s2 1080; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1081; GFX6-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 1082; GFX6-NEXT: .LBB4_2: 1083; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] 1084; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1085; GFX6-NEXT: s_mov_b32 s3, 0xf000 1086; GFX6-NEXT: s_mov_b32 s2, -1 1087; GFX6-NEXT: s_waitcnt vmcnt(0) 1088; GFX6-NEXT: v_readfirstlane_b32 s4, v1 1089; GFX6-NEXT: v_mul_u32_u24_e32 v0, 5, v0 
1090; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 1091; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1092; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 1093; GFX6-NEXT: s_endpgm 1094; 1095; GFX8-LABEL: sub_i32_constant: 1096; GFX8: ; %bb.0: ; %entry 1097; GFX8-NEXT: s_mov_b64 s[2:3], exec 1098; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1099; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1100; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1101; GFX8-NEXT: ; implicit-def: $vgpr1 1102; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc 1103; GFX8-NEXT: s_cbranch_execz .LBB4_2 1104; GFX8-NEXT: ; %bb.1: 1105; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 1106; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1107; GFX8-NEXT: s_mul_i32 s2, s2, 5 1108; GFX8-NEXT: v_mov_b32_e32 v1, s2 1109; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1110; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 1111; GFX8-NEXT: .LBB4_2: 1112; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] 1113; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1114; GFX8-NEXT: s_waitcnt vmcnt(0) 1115; GFX8-NEXT: v_readfirstlane_b32 s2, v1 1116; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1117; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v0 1118; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1119; GFX8-NEXT: v_mov_b32_e32 v0, s0 1120; GFX8-NEXT: v_mov_b32_e32 v1, s1 1121; GFX8-NEXT: flat_store_dword v[0:1], v2 1122; GFX8-NEXT: s_endpgm 1123; 1124; GFX9-LABEL: sub_i32_constant: 1125; GFX9: ; %bb.0: ; %entry 1126; GFX9-NEXT: s_mov_b64 s[2:3], exec 1127; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1128; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1129; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1130; GFX9-NEXT: ; implicit-def: $vgpr1 1131; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 1132; GFX9-NEXT: s_cbranch_execz .LBB4_2 1133; GFX9-NEXT: ; %bb.1: 1134; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 1135; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1136; GFX9-NEXT: s_mul_i32 s2, s2, 5 1137; GFX9-NEXT: v_mov_b32_e32 v1, s2 1138; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1139; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 
1140; GFX9-NEXT: .LBB4_2: 1141; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 1142; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1143; GFX9-NEXT: s_waitcnt vmcnt(0) 1144; GFX9-NEXT: v_readfirstlane_b32 s2, v1 1145; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1146; GFX9-NEXT: v_mov_b32_e32 v2, 0 1147; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 1148; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1149; GFX9-NEXT: global_store_dword v2, v0, s[0:1] 1150; GFX9-NEXT: s_endpgm 1151; 1152; GFX10W64-LABEL: sub_i32_constant: 1153; GFX10W64: ; %bb.0: ; %entry 1154; GFX10W64-NEXT: s_mov_b64 s[2:3], exec 1155; GFX10W64-NEXT: ; implicit-def: $vgpr1 1156; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1157; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1158; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1159; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc 1160; GFX10W64-NEXT: s_cbranch_execz .LBB4_2 1161; GFX10W64-NEXT: ; %bb.1: 1162; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 1163; GFX10W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1164; GFX10W64-NEXT: s_mul_i32 s2, s2, 5 1165; GFX10W64-NEXT: v_mov_b32_e32 v1, s2 1166; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 1167; GFX10W64-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 1168; GFX10W64-NEXT: .LBB4_2: 1169; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 1170; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] 1171; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1172; GFX10W64-NEXT: s_waitcnt vmcnt(0) 1173; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 1174; GFX10W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1175; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 1176; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1177; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 1178; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1] 1179; GFX10W64-NEXT: s_endpgm 1180; 1181; GFX10W32-LABEL: sub_i32_constant: 1182; GFX10W32: ; %bb.0: ; %entry 1183; GFX10W32-NEXT: s_mov_b32 s1, exec_lo 1184; GFX10W32-NEXT: ; implicit-def: $vgpr1 1185; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 1186; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1187; 
GFX10W32-NEXT: s_and_saveexec_b32 s0, vcc_lo 1188; GFX10W32-NEXT: s_cbranch_execz .LBB4_2 1189; GFX10W32-NEXT: ; %bb.1: 1190; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 1191; GFX10W32-NEXT: s_bcnt1_i32_b32 s1, s1 1192; GFX10W32-NEXT: s_mul_i32 s1, s1, 5 1193; GFX10W32-NEXT: v_mov_b32_e32 v1, s1 1194; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 1195; GFX10W32-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 1196; GFX10W32-NEXT: .LBB4_2: 1197; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 1198; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 1199; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1200; GFX10W32-NEXT: s_waitcnt vmcnt(0) 1201; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1 1202; GFX10W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1203; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 1204; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1205; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 1206; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1] 1207; GFX10W32-NEXT: s_endpgm 1208; 1209; GFX11W64-LABEL: sub_i32_constant: 1210; GFX11W64: ; %bb.0: ; %entry 1211; GFX11W64-NEXT: s_mov_b64 s[2:3], exec 1212; GFX11W64-NEXT: s_mov_b64 s[0:1], exec 1213; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1214; GFX11W64-NEXT: ; implicit-def: $vgpr1 1215; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1216; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1217; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 1218; GFX11W64-NEXT: s_cbranch_execz .LBB4_2 1219; GFX11W64-NEXT: ; %bb.1: 1220; GFX11W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 1221; GFX11W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1222; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 1223; GFX11W64-NEXT: s_mul_i32 s2, s2, 5 1224; GFX11W64-NEXT: v_mov_b32_e32 v1, s2 1225; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) 1226; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc 1227; GFX11W64-NEXT: .LBB4_2: 1228; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] 1229; GFX11W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 
1230; GFX11W64-NEXT: s_waitcnt vmcnt(0) 1231; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 1232; GFX11W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1233; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 1234; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) 1235; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1236; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) 1237; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1] 1238; GFX11W64-NEXT: s_endpgm 1239; 1240; GFX11W32-LABEL: sub_i32_constant: 1241; GFX11W32: ; %bb.0: ; %entry 1242; GFX11W32-NEXT: s_mov_b32 s1, exec_lo 1243; GFX11W32-NEXT: s_mov_b32 s0, exec_lo 1244; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 1245; GFX11W32-NEXT: ; implicit-def: $vgpr1 1246; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) 1247; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 1248; GFX11W32-NEXT: s_cbranch_execz .LBB4_2 1249; GFX11W32-NEXT: ; %bb.1: 1250; GFX11W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 1251; GFX11W32-NEXT: s_bcnt1_i32_b32 s1, s1 1252; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 1253; GFX11W32-NEXT: s_mul_i32 s1, s1, 5 1254; GFX11W32-NEXT: v_mov_b32_e32 v1, s1 1255; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) 1256; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc 1257; GFX11W32-NEXT: .LBB4_2: 1258; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 1259; GFX11W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1260; GFX11W32-NEXT: s_waitcnt vmcnt(0) 1261; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 1262; GFX11W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1263; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 1264; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) 1265; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1266; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) 1267; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] 1268; GFX11W32-NEXT: s_endpgm 1269; 1270; GFX12W64-LABEL: sub_i32_constant: 1271; GFX12W64: ; %bb.0: ; %entry 1272; GFX12W64-NEXT: s_mov_b64 s[2:3], exec 1273; GFX12W64-NEXT: s_mov_b64 s[0:1], exec 1274; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1275; 
GFX12W64-NEXT: ; implicit-def: $vgpr1 1276; GFX12W64-NEXT: s_wait_alu 0xfffe 1277; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1278; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1279; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 1280; GFX12W64-NEXT: s_cbranch_execz .LBB4_2 1281; GFX12W64-NEXT: ; %bb.1: 1282; GFX12W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 1283; GFX12W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1284; GFX12W64-NEXT: s_wait_alu 0xfffe 1285; GFX12W64-NEXT: s_mul_i32 s2, s2, 5 1286; GFX12W64-NEXT: s_wait_alu 0xfffe 1287; GFX12W64-NEXT: v_mov_b32_e32 v1, s2 1288; GFX12W64-NEXT: s_wait_kmcnt 0x0 1289; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN 1290; GFX12W64-NEXT: .LBB4_2: 1291; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] 1292; GFX12W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1293; GFX12W64-NEXT: s_wait_loadcnt 0x0 1294; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 1295; GFX12W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1296; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 1297; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) 1298; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1299; GFX12W64-NEXT: s_wait_kmcnt 0x0 1300; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] 1301; GFX12W64-NEXT: s_endpgm 1302; 1303; GFX12W32-LABEL: sub_i32_constant: 1304; GFX12W32: ; %bb.0: ; %entry 1305; GFX12W32-NEXT: s_mov_b32 s1, exec_lo 1306; GFX12W32-NEXT: s_mov_b32 s0, exec_lo 1307; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 1308; GFX12W32-NEXT: ; implicit-def: $vgpr1 1309; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) 1310; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 1311; GFX12W32-NEXT: s_cbranch_execz .LBB4_2 1312; GFX12W32-NEXT: ; %bb.1: 1313; GFX12W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 1314; GFX12W32-NEXT: s_wait_alu 0xfffe 1315; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1 1316; GFX12W32-NEXT: s_wait_alu 0xfffe 1317; GFX12W32-NEXT: s_mul_i32 s1, s1, 5 1318; GFX12W32-NEXT: s_wait_alu 0xfffe 1319; GFX12W32-NEXT: v_mov_b32_e32 v1, s1 1320; 
GFX12W32-NEXT: s_wait_kmcnt 0x0 1321; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN 1322; GFX12W32-NEXT: .LBB4_2: 1323; GFX12W32-NEXT: s_wait_alu 0xfffe 1324; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 1325; GFX12W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1326; GFX12W32-NEXT: s_wait_loadcnt 0x0 1327; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 1328; GFX12W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1329; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 1330; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) 1331; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1332; GFX12W32-NEXT: s_wait_kmcnt 0x0 1333; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] 1334; GFX12W32-NEXT: s_endpgm 1335entry: 1336 %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.sub(i32 5, ptr addrspace(8) %inout, i32 0, i32 0, i32 0) 1337 store i32 %old, ptr addrspace(1) %out 1338 ret void 1339} 1340 1341define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(8) %inout, i32 %subitive) { 1342; GFX6-LABEL: sub_i32_uniform: 1343; GFX6: ; %bb.0: ; %entry 1344; GFX6-NEXT: s_mov_b64 s[2:3], exec 1345; GFX6-NEXT: s_load_dword s6, s[4:5], 0x11 1346; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1347; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 1348; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1349; GFX6-NEXT: ; implicit-def: $vgpr1 1350; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc 1351; GFX6-NEXT: s_cbranch_execz .LBB5_2 1352; GFX6-NEXT: ; %bb.1: 1353; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd 1354; GFX6-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1355; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1356; GFX6-NEXT: s_mul_i32 s2, s6, s2 1357; GFX6-NEXT: v_mov_b32_e32 v1, s2 1358; GFX6-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 1359; GFX6-NEXT: .LBB5_2: 1360; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] 1361; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1362; GFX6-NEXT: s_mov_b32 s3, 0xf000 1363; GFX6-NEXT: s_mov_b32 s2, -1 1364; GFX6-NEXT: s_waitcnt vmcnt(0) 1365; GFX6-NEXT: v_readfirstlane_b32 s4, v1 1366; 
GFX6-NEXT: s_waitcnt lgkmcnt(0) 1367; GFX6-NEXT: v_mul_lo_u32 v0, s6, v0 1368; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 1369; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 1370; GFX6-NEXT: s_endpgm 1371; 1372; GFX8-LABEL: sub_i32_uniform: 1373; GFX8: ; %bb.0: ; %entry 1374; GFX8-NEXT: s_load_dword s6, s[4:5], 0x44 1375; GFX8-NEXT: s_mov_b64 s[2:3], exec 1376; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1377; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1378; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1379; GFX8-NEXT: ; implicit-def: $vgpr1 1380; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc 1381; GFX8-NEXT: s_cbranch_execz .LBB5_2 1382; GFX8-NEXT: ; %bb.1: 1383; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 1384; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1385; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1386; GFX8-NEXT: s_mul_i32 s2, s6, s2 1387; GFX8-NEXT: v_mov_b32_e32 v1, s2 1388; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 1389; GFX8-NEXT: .LBB5_2: 1390; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] 1391; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1392; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1393; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 1394; GFX8-NEXT: s_waitcnt vmcnt(0) 1395; GFX8-NEXT: v_readfirstlane_b32 s2, v1 1396; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v0 1397; GFX8-NEXT: v_mov_b32_e32 v0, s0 1398; GFX8-NEXT: v_mov_b32_e32 v1, s1 1399; GFX8-NEXT: flat_store_dword v[0:1], v2 1400; GFX8-NEXT: s_endpgm 1401; 1402; GFX9-LABEL: sub_i32_uniform: 1403; GFX9: ; %bb.0: ; %entry 1404; GFX9-NEXT: s_load_dword s6, s[4:5], 0x44 1405; GFX9-NEXT: s_mov_b64 s[2:3], exec 1406; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1407; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1408; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1409; GFX9-NEXT: ; implicit-def: $vgpr1 1410; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 1411; GFX9-NEXT: s_cbranch_execz .LBB5_2 1412; GFX9-NEXT: ; %bb.1: 1413; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 1414; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1415; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1416; 
GFX9-NEXT: s_mul_i32 s2, s6, s2 1417; GFX9-NEXT: v_mov_b32_e32 v1, s2 1418; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 1419; GFX9-NEXT: .LBB5_2: 1420; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 1421; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1422; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1423; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 1424; GFX9-NEXT: s_waitcnt vmcnt(0) 1425; GFX9-NEXT: v_readfirstlane_b32 s2, v1 1426; GFX9-NEXT: v_mov_b32_e32 v2, 0 1427; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 1428; GFX9-NEXT: global_store_dword v2, v0, s[0:1] 1429; GFX9-NEXT: s_endpgm 1430; 1431; GFX10W64-LABEL: sub_i32_uniform: 1432; GFX10W64: ; %bb.0: ; %entry 1433; GFX10W64-NEXT: s_load_dword s6, s[4:5], 0x44 1434; GFX10W64-NEXT: s_mov_b64 s[2:3], exec 1435; GFX10W64-NEXT: ; implicit-def: $vgpr1 1436; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1437; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1438; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1439; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc 1440; GFX10W64-NEXT: s_cbranch_execz .LBB5_2 1441; GFX10W64-NEXT: ; %bb.1: 1442; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 1443; GFX10W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1444; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 1445; GFX10W64-NEXT: s_mul_i32 s2, s6, s2 1446; GFX10W64-NEXT: v_mov_b32_e32 v1, s2 1447; GFX10W64-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 1448; GFX10W64-NEXT: .LBB5_2: 1449; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 1450; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] 1451; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1452; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 1453; GFX10W64-NEXT: v_mul_lo_u32 v0, s6, v0 1454; GFX10W64-NEXT: s_waitcnt vmcnt(0) 1455; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 1456; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 1457; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1458; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1] 1459; GFX10W64-NEXT: s_endpgm 1460; 1461; GFX10W32-LABEL: sub_i32_uniform: 1462; GFX10W32: ; %bb.0: ; %entry 1463; GFX10W32-NEXT: 
s_load_dword s0, s[4:5], 0x44 1464; GFX10W32-NEXT: s_mov_b32 s2, exec_lo 1465; GFX10W32-NEXT: ; implicit-def: $vgpr1 1466; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1467; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1468; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo 1469; GFX10W32-NEXT: s_cbranch_execz .LBB5_2 1470; GFX10W32-NEXT: ; %bb.1: 1471; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 1472; GFX10W32-NEXT: s_bcnt1_i32_b32 s2, s2 1473; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 1474; GFX10W32-NEXT: s_mul_i32 s2, s0, s2 1475; GFX10W32-NEXT: v_mov_b32_e32 v1, s2 1476; GFX10W32-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 1477; GFX10W32-NEXT: .LBB5_2: 1478; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 1479; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 1480; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 1481; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 1482; GFX10W32-NEXT: v_mul_lo_u32 v0, s0, v0 1483; GFX10W32-NEXT: s_waitcnt vmcnt(0) 1484; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1 1485; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 1486; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1487; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] 1488; GFX10W32-NEXT: s_endpgm 1489; 1490; GFX11W64-LABEL: sub_i32_uniform: 1491; GFX11W64: ; %bb.0: ; %entry 1492; GFX11W64-NEXT: s_load_b32 s6, s[4:5], 0x44 1493; GFX11W64-NEXT: s_mov_b64 s[2:3], exec 1494; GFX11W64-NEXT: s_mov_b64 s[0:1], exec 1495; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1496; GFX11W64-NEXT: ; implicit-def: $vgpr1 1497; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1498; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1499; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 1500; GFX11W64-NEXT: s_cbranch_execz .LBB5_2 1501; GFX11W64-NEXT: ; %bb.1: 1502; GFX11W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 1503; GFX11W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1504; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) 1505; GFX11W64-NEXT: s_mul_i32 s2, s6, s2 1506; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1507; 
GFX11W64-NEXT: v_mov_b32_e32 v1, s2 1508; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc 1509; GFX11W64-NEXT: .LBB5_2: 1510; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] 1511; GFX11W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1512; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) 1513; GFX11W64-NEXT: v_mul_lo_u32 v0, s6, v0 1514; GFX11W64-NEXT: s_waitcnt vmcnt(0) 1515; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 1516; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 1517; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) 1518; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1519; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1] 1520; GFX11W64-NEXT: s_endpgm 1521; 1522; GFX11W32-LABEL: sub_i32_uniform: 1523; GFX11W32: ; %bb.0: ; %entry 1524; GFX11W32-NEXT: s_load_b32 s0, s[4:5], 0x44 1525; GFX11W32-NEXT: s_mov_b32 s2, exec_lo 1526; GFX11W32-NEXT: s_mov_b32 s1, exec_lo 1527; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1528; GFX11W32-NEXT: ; implicit-def: $vgpr1 1529; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) 1530; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 1531; GFX11W32-NEXT: s_cbranch_execz .LBB5_2 1532; GFX11W32-NEXT: ; %bb.1: 1533; GFX11W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 1534; GFX11W32-NEXT: s_bcnt1_i32_b32 s2, s2 1535; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) 1536; GFX11W32-NEXT: s_mul_i32 s2, s0, s2 1537; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1538; GFX11W32-NEXT: v_mov_b32_e32 v1, s2 1539; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc 1540; GFX11W32-NEXT: .LBB5_2: 1541; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 1542; GFX11W32-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 1543; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) 1544; GFX11W32-NEXT: v_mul_lo_u32 v0, s0, v0 1545; GFX11W32-NEXT: s_waitcnt vmcnt(0) 1546; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1 1547; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 1548; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) 1549; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1550; GFX11W32-NEXT: global_store_b32 v1, v0, s[2:3] 1551; GFX11W32-NEXT: s_endpgm 
1552; 1553; GFX12W64-LABEL: sub_i32_uniform: 1554; GFX12W64: ; %bb.0: ; %entry 1555; GFX12W64-NEXT: s_load_b32 s6, s[4:5], 0x44 1556; GFX12W64-NEXT: s_mov_b64 s[2:3], exec 1557; GFX12W64-NEXT: s_mov_b64 s[0:1], exec 1558; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1559; GFX12W64-NEXT: ; implicit-def: $vgpr1 1560; GFX12W64-NEXT: s_wait_alu 0xfffe 1561; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1562; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1563; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 1564; GFX12W64-NEXT: s_cbranch_execz .LBB5_2 1565; GFX12W64-NEXT: ; %bb.1: 1566; GFX12W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 1567; GFX12W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1568; GFX12W64-NEXT: s_wait_kmcnt 0x0 1569; GFX12W64-NEXT: s_wait_alu 0xfffe 1570; GFX12W64-NEXT: s_mul_i32 s2, s6, s2 1571; GFX12W64-NEXT: s_wait_alu 0xfffe 1572; GFX12W64-NEXT: v_mov_b32_e32 v1, s2 1573; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN 1574; GFX12W64-NEXT: .LBB5_2: 1575; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] 1576; GFX12W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1577; GFX12W64-NEXT: s_wait_kmcnt 0x0 1578; GFX12W64-NEXT: v_mul_lo_u32 v0, s6, v0 1579; GFX12W64-NEXT: s_wait_loadcnt 0x0 1580; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 1581; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 1582; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) 1583; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1584; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] 1585; GFX12W64-NEXT: s_endpgm 1586; 1587; GFX12W32-LABEL: sub_i32_uniform: 1588; GFX12W32: ; %bb.0: ; %entry 1589; GFX12W32-NEXT: s_load_b32 s0, s[4:5], 0x44 1590; GFX12W32-NEXT: s_mov_b32 s2, exec_lo 1591; GFX12W32-NEXT: s_mov_b32 s1, exec_lo 1592; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1593; GFX12W32-NEXT: ; implicit-def: $vgpr1 1594; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) 1595; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 1596; GFX12W32-NEXT: s_cbranch_execz .LBB5_2 1597; 
GFX12W32-NEXT: ; %bb.1: 1598; GFX12W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 1599; GFX12W32-NEXT: s_wait_alu 0xfffe 1600; GFX12W32-NEXT: s_bcnt1_i32_b32 s2, s2 1601; GFX12W32-NEXT: s_wait_kmcnt 0x0 1602; GFX12W32-NEXT: s_wait_alu 0xfffe 1603; GFX12W32-NEXT: s_mul_i32 s2, s0, s2 1604; GFX12W32-NEXT: s_wait_alu 0xfffe 1605; GFX12W32-NEXT: v_mov_b32_e32 v1, s2 1606; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN 1607; GFX12W32-NEXT: .LBB5_2: 1608; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 1609; GFX12W32-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 1610; GFX12W32-NEXT: s_wait_kmcnt 0x0 1611; GFX12W32-NEXT: v_mul_lo_u32 v0, s0, v0 1612; GFX12W32-NEXT: s_wait_loadcnt 0x0 1613; GFX12W32-NEXT: v_readfirstlane_b32 s0, v1 1614; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 1615; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) 1616; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1617; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3] 1618; GFX12W32-NEXT: s_endpgm 1619entry: 1620 %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.sub(i32 %subitive, ptr addrspace(8) %inout, i32 0, i32 0, i32 0) 1621 store i32 %old, ptr addrspace(1) %out 1622 ret void 1623} 1624 1625define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout) { 1626; GFX6-LABEL: sub_i32_varying_vdata: 1627; GFX6: ; %bb.0: ; %entry 1628; GFX6-NEXT: s_mov_b64 s[0:1], exec 1629; GFX6-NEXT: s_mov_b32 s2, 0 1630; GFX6-NEXT: ; implicit-def: $vgpr1 1631; GFX6-NEXT: .LBB6_1: ; %ComputeLoop 1632; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 1633; GFX6-NEXT: s_ff1_i32_b64 s3, s[0:1] 1634; GFX6-NEXT: s_mov_b32 m0, s3 1635; GFX6-NEXT: v_readlane_b32 s8, v0, s3 1636; GFX6-NEXT: v_writelane_b32 v1, s2, m0 1637; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s3 1638; GFX6-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] 1639; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 1640; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7] 1641; GFX6-NEXT: s_add_i32 s2, s2, s8 1642; GFX6-NEXT: s_cbranch_vccnz .LBB6_1 1643; 
GFX6-NEXT: ; %bb.2: ; %ComputeEnd 1644; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 1645; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 1646; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1647; GFX6-NEXT: ; implicit-def: $vgpr0 1648; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc 1649; GFX6-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 1650; GFX6-NEXT: s_cbranch_execz .LBB6_4 1651; GFX6-NEXT: ; %bb.3: 1652; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd 1653; GFX6-NEXT: v_mov_b32_e32 v0, s2 1654; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1655; GFX6-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc 1656; GFX6-NEXT: .LBB6_4: 1657; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] 1658; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1659; GFX6-NEXT: s_mov_b32 s3, 0xf000 1660; GFX6-NEXT: s_mov_b32 s2, -1 1661; GFX6-NEXT: s_waitcnt vmcnt(0) 1662; GFX6-NEXT: v_readfirstlane_b32 s4, v0 1663; GFX6-NEXT: s_waitcnt expcnt(0) 1664; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v1 1665; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1666; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 1667; GFX6-NEXT: s_endpgm 1668; 1669; GFX8-LABEL: sub_i32_varying_vdata: 1670; GFX8: ; %bb.0: ; %entry 1671; GFX8-NEXT: s_mov_b64 s[0:1], exec 1672; GFX8-NEXT: s_mov_b32 s2, 0 1673; GFX8-NEXT: ; implicit-def: $vgpr1 1674; GFX8-NEXT: .LBB6_1: ; %ComputeLoop 1675; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 1676; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] 1677; GFX8-NEXT: s_mov_b32 m0, s3 1678; GFX8-NEXT: v_readlane_b32 s8, v0, s3 1679; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 1680; GFX8-NEXT: v_writelane_b32 v1, s2, m0 1681; GFX8-NEXT: s_add_i32 s2, s2, s8 1682; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] 1683; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 1684; GFX8-NEXT: s_cbranch_scc1 .LBB6_1 1685; GFX8-NEXT: ; %bb.2: ; %ComputeEnd 1686; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1687; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1688; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1689; GFX8-NEXT: ; implicit-def: $vgpr0 1690; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc 1691; 
GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 1692; GFX8-NEXT: s_cbranch_execz .LBB6_4 1693; GFX8-NEXT: ; %bb.3: 1694; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 1695; GFX8-NEXT: v_mov_b32_e32 v0, s2 1696; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1697; GFX8-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc 1698; GFX8-NEXT: .LBB6_4: 1699; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] 1700; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1701; GFX8-NEXT: s_waitcnt vmcnt(0) 1702; GFX8-NEXT: v_readfirstlane_b32 s2, v0 1703; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v1 1704; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1705; GFX8-NEXT: v_mov_b32_e32 v0, s0 1706; GFX8-NEXT: v_mov_b32_e32 v1, s1 1707; GFX8-NEXT: flat_store_dword v[0:1], v2 1708; GFX8-NEXT: s_endpgm 1709; 1710; GFX9-LABEL: sub_i32_varying_vdata: 1711; GFX9: ; %bb.0: ; %entry 1712; GFX9-NEXT: s_mov_b64 s[0:1], exec 1713; GFX9-NEXT: s_mov_b32 s2, 0 1714; GFX9-NEXT: ; implicit-def: $vgpr1 1715; GFX9-NEXT: .LBB6_1: ; %ComputeLoop 1716; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 1717; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] 1718; GFX9-NEXT: s_mov_b32 m0, s3 1719; GFX9-NEXT: v_readlane_b32 s8, v0, s3 1720; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 1721; GFX9-NEXT: v_writelane_b32 v1, s2, m0 1722; GFX9-NEXT: s_add_i32 s2, s2, s8 1723; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] 1724; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 1725; GFX9-NEXT: s_cbranch_scc1 .LBB6_1 1726; GFX9-NEXT: ; %bb.2: ; %ComputeEnd 1727; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1728; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1729; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1730; GFX9-NEXT: ; implicit-def: $vgpr0 1731; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 1732; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 1733; GFX9-NEXT: s_cbranch_execz .LBB6_4 1734; GFX9-NEXT: ; %bb.3: 1735; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 1736; GFX9-NEXT: v_mov_b32_e32 v0, s2 1737; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1738; GFX9-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc 1739; GFX9-NEXT: .LBB6_4: 
1740; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 1741; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1742; GFX9-NEXT: s_waitcnt vmcnt(0) 1743; GFX9-NEXT: v_readfirstlane_b32 s2, v0 1744; GFX9-NEXT: v_mov_b32_e32 v2, 0 1745; GFX9-NEXT: v_sub_u32_e32 v0, s2, v1 1746; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1747; GFX9-NEXT: global_store_dword v2, v0, s[0:1] 1748; GFX9-NEXT: s_endpgm 1749; 1750; GFX10W64-LABEL: sub_i32_varying_vdata: 1751; GFX10W64: ; %bb.0: ; %entry 1752; GFX10W64-NEXT: s_mov_b64 s[0:1], exec 1753; GFX10W64-NEXT: s_mov_b32 s2, 0 1754; GFX10W64-NEXT: ; implicit-def: $vgpr1 1755; GFX10W64-NEXT: .LBB6_1: ; %ComputeLoop 1756; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 1757; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1] 1758; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3 1759; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 1760; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 1761; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] 1762; GFX10W64-NEXT: s_add_i32 s2, s2, s8 1763; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 1764; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_1 1765; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd 1766; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1767; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1768; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1769; GFX10W64-NEXT: ; implicit-def: $vgpr0 1770; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc 1771; GFX10W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 1772; GFX10W64-NEXT: s_cbranch_execz .LBB6_4 1773; GFX10W64-NEXT: ; %bb.3: 1774; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 1775; GFX10W64-NEXT: v_mov_b32_e32 v0, s2 1776; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 1777; GFX10W64-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc 1778; GFX10W64-NEXT: .LBB6_4: 1779; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 1780; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] 1781; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1782; GFX10W64-NEXT: s_waitcnt vmcnt(0) 1783; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 1784; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 
1785; GFX10W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1 1786; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 1787; GFX10W64-NEXT: global_store_dword v0, v1, s[0:1] 1788; GFX10W64-NEXT: s_endpgm 1789; 1790; GFX10W32-LABEL: sub_i32_varying_vdata: 1791; GFX10W32: ; %bb.0: ; %entry 1792; GFX10W32-NEXT: s_mov_b32 s1, exec_lo 1793; GFX10W32-NEXT: s_mov_b32 s0, 0 1794; GFX10W32-NEXT: ; implicit-def: $vgpr1 1795; GFX10W32-NEXT: .LBB6_1: ; %ComputeLoop 1796; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 1797; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 1798; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2 1799; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2 1800; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2 1801; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 1802; GFX10W32-NEXT: s_add_i32 s0, s0, s3 1803; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 1804; GFX10W32-NEXT: s_cbranch_scc1 .LBB6_1 1805; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd 1806; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1807; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1808; GFX10W32-NEXT: ; implicit-def: $vgpr0 1809; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo 1810; GFX10W32-NEXT: s_xor_b32 s1, exec_lo, s1 1811; GFX10W32-NEXT: s_cbranch_execz .LBB6_4 1812; GFX10W32-NEXT: ; %bb.3: 1813; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 1814; GFX10W32-NEXT: v_mov_b32_e32 v0, s0 1815; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 1816; GFX10W32-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc 1817; GFX10W32-NEXT: .LBB6_4: 1818; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 1819; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 1820; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1821; GFX10W32-NEXT: s_waitcnt vmcnt(0) 1822; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 1823; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 1824; GFX10W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1 1825; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 1826; GFX10W32-NEXT: global_store_dword v0, v1, s[0:1] 1827; GFX10W32-NEXT: s_endpgm 1828; 1829; GFX11W64-LABEL: sub_i32_varying_vdata: 1830; GFX11W64: ; %bb.0: ; %entry 1831; 
GFX11W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 1832; GFX11W64-NEXT: s_mov_b64 s[0:1], exec 1833; GFX11W64-NEXT: s_mov_b32 s2, 0 1834; GFX11W64-NEXT: ; implicit-def: $vgpr0 1835; GFX11W64-NEXT: .LBB6_1: ; %ComputeLoop 1836; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 1837; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1] 1838; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 1839; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3 1840; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 1841; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 1842; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] 1843; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) 1844; GFX11W64-NEXT: s_add_i32 s2, s2, s8 1845; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 1846; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_1 1847; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd 1848; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 1849; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1850; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 1851; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 1852; GFX11W64-NEXT: ; implicit-def: $vgpr1 1853; GFX11W64-NEXT: s_and_saveexec_b64 s[0:1], vcc 1854; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1855; GFX11W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 1856; GFX11W64-NEXT: s_cbranch_execz .LBB6_4 1857; GFX11W64-NEXT: ; %bb.3: 1858; GFX11W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 1859; GFX11W64-NEXT: v_mov_b32_e32 v1, s2 1860; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) 1861; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc 1862; GFX11W64-NEXT: .LBB6_4: 1863; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] 1864; GFX11W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1865; GFX11W64-NEXT: s_waitcnt vmcnt(0) 1866; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 1867; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 1868; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) 1869; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1870; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) 1871; GFX11W64-NEXT: global_store_b32 
v1, v0, s[0:1] 1872; GFX11W64-NEXT: s_endpgm 1873; 1874; GFX11W32-LABEL: sub_i32_varying_vdata: 1875; GFX11W32: ; %bb.0: ; %entry 1876; GFX11W32-NEXT: v_and_b32_e32 v1, 0x3ff, v0 1877; GFX11W32-NEXT: s_mov_b32 s1, exec_lo 1878; GFX11W32-NEXT: s_mov_b32 s0, 0 1879; GFX11W32-NEXT: ; implicit-def: $vgpr0 1880; GFX11W32-NEXT: .LBB6_1: ; %ComputeLoop 1881; GFX11W32-NEXT: ; =>This Inner Loop Header: Depth=1 1882; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 1883; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 1884; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2 1885; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 1886; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 1887; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 1888; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) 1889; GFX11W32-NEXT: s_add_i32 s0, s0, s3 1890; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 1891; GFX11W32-NEXT: s_cbranch_scc1 .LBB6_1 1892; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd 1893; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 1894; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 1895; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 1896; GFX11W32-NEXT: ; implicit-def: $vgpr1 1897; GFX11W32-NEXT: s_and_saveexec_b32 s1, vcc_lo 1898; GFX11W32-NEXT: s_xor_b32 s1, exec_lo, s1 1899; GFX11W32-NEXT: s_cbranch_execz .LBB6_4 1900; GFX11W32-NEXT: ; %bb.3: 1901; GFX11W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 1902; GFX11W32-NEXT: v_mov_b32_e32 v1, s0 1903; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) 1904; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc 1905; GFX11W32-NEXT: .LBB6_4: 1906; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 1907; GFX11W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1908; GFX11W32-NEXT: s_waitcnt vmcnt(0) 1909; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 1910; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 1911; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) 1912; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1913; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) 1914; GFX11W32-NEXT: 
global_store_b32 v1, v0, s[0:1] 1915; GFX11W32-NEXT: s_endpgm 1916; 1917; GFX12W64-LABEL: sub_i32_varying_vdata: 1918; GFX12W64: ; %bb.0: ; %entry 1919; GFX12W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 1920; GFX12W64-NEXT: s_mov_b64 s[0:1], exec 1921; GFX12W64-NEXT: s_mov_b32 s2, 0 1922; GFX12W64-NEXT: ; implicit-def: $vgpr0 1923; GFX12W64-NEXT: .LBB6_1: ; %ComputeLoop 1924; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1 1925; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1] 1926; GFX12W64-NEXT: s_wait_alu 0xfffe 1927; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3 1928; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 1929; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 1930; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] 1931; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) 1932; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 1933; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 1934; GFX12W64-NEXT: s_cbranch_scc1 .LBB6_1 1935; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd 1936; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 1937; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1938; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 1939; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 1940; GFX12W64-NEXT: ; implicit-def: $vgpr1 1941; GFX12W64-NEXT: s_and_saveexec_b64 s[0:1], vcc 1942; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1943; GFX12W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 1944; GFX12W64-NEXT: s_cbranch_execz .LBB6_4 1945; GFX12W64-NEXT: ; %bb.3: 1946; GFX12W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 1947; GFX12W64-NEXT: s_wait_alu 0xfffe 1948; GFX12W64-NEXT: v_mov_b32_e32 v1, s2 1949; GFX12W64-NEXT: s_wait_kmcnt 0x0 1950; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN 1951; GFX12W64-NEXT: .LBB6_4: 1952; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] 1953; GFX12W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1954; GFX12W64-NEXT: s_wait_loadcnt 0x0 1955; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 1956; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 1957; 
GFX12W64-NEXT: s_wait_alu 0xfffe 1958; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) 1959; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1960; GFX12W64-NEXT: s_wait_kmcnt 0x0 1961; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] 1962; GFX12W64-NEXT: s_endpgm 1963; 1964; GFX12W32-LABEL: sub_i32_varying_vdata: 1965; GFX12W32: ; %bb.0: ; %entry 1966; GFX12W32-NEXT: v_and_b32_e32 v1, 0x3ff, v0 1967; GFX12W32-NEXT: s_mov_b32 s1, exec_lo 1968; GFX12W32-NEXT: s_mov_b32 s0, 0 1969; GFX12W32-NEXT: ; implicit-def: $vgpr0 1970; GFX12W32-NEXT: .LBB6_1: ; %ComputeLoop 1971; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 1972; GFX12W32-NEXT: s_wait_alu 0xfffe 1973; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 1974; GFX12W32-NEXT: s_wait_alu 0xfffe 1975; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 1976; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 1977; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 1978; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 1979; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) 1980; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 1981; GFX12W32-NEXT: s_wait_alu 0xfffe 1982; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 1983; GFX12W32-NEXT: s_cbranch_scc1 .LBB6_1 1984; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd 1985; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 1986; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) 1987; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 1988; GFX12W32-NEXT: ; implicit-def: $vgpr1 1989; GFX12W32-NEXT: s_and_saveexec_b32 s1, vcc_lo 1990; GFX12W32-NEXT: s_wait_alu 0xfffe 1991; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1 1992; GFX12W32-NEXT: s_cbranch_execz .LBB6_4 1993; GFX12W32-NEXT: ; %bb.3: 1994; GFX12W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 1995; GFX12W32-NEXT: v_mov_b32_e32 v1, s0 1996; GFX12W32-NEXT: s_wait_kmcnt 0x0 1997; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN 1998; GFX12W32-NEXT: .LBB6_4: 1999; GFX12W32-NEXT: s_wait_alu 0xfffe 2000; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 2001; GFX12W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 
2002; GFX12W32-NEXT: s_wait_loadcnt 0x0 2003; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 2004; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 2005; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) 2006; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 2007; GFX12W32-NEXT: s_wait_kmcnt 0x0 2008; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] 2009; GFX12W32-NEXT: s_endpgm 2010entry: 2011 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2012 %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.sub(i32 %lane, ptr addrspace(8) %inout, i32 0, i32 0, i32 0) 2013 store i32 %old, ptr addrspace(1) %out 2014 ret void 2015} 2016 2017define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr addrspace(8) %inout) { 2018; GFX6-LABEL: sub_i32_varying_offset: 2019; GFX6: ; %bb.0: ; %entry 2020; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd 2021; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 2022; GFX6-NEXT: v_mov_b32_e32 v1, 1 2023; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2024; GFX6-NEXT: buffer_atomic_sub v1, v0, s[0:3], 0 offen glc 2025; GFX6-NEXT: s_mov_b32 s7, 0xf000 2026; GFX6-NEXT: s_mov_b32 s6, -1 2027; GFX6-NEXT: s_waitcnt vmcnt(0) 2028; GFX6-NEXT: buffer_store_dword v1, off, s[4:7], 0 2029; GFX6-NEXT: s_endpgm 2030; 2031; GFX8-LABEL: sub_i32_varying_offset: 2032; GFX8: ; %bb.0: ; %entry 2033; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 2034; GFX8-NEXT: v_mov_b32_e32 v2, 1 2035; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2036; GFX8-NEXT: buffer_atomic_sub v2, v0, s[0:3], 0 offen glc 2037; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 2038; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2039; GFX8-NEXT: v_mov_b32_e32 v0, s0 2040; GFX8-NEXT: v_mov_b32_e32 v1, s1 2041; GFX8-NEXT: s_waitcnt vmcnt(0) 2042; GFX8-NEXT: flat_store_dword v[0:1], v2 2043; GFX8-NEXT: s_endpgm 2044; 2045; GFX9-LABEL: sub_i32_varying_offset: 2046; GFX9: ; %bb.0: ; %entry 2047; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 2048; GFX9-NEXT: v_mov_b32_e32 v1, 1 2049; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2050; GFX9-NEXT: buffer_atomic_sub v1, v0, 
s[0:3], 0 offen glc 2051; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 2052; GFX9-NEXT: v_mov_b32_e32 v0, 0 2053; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2054; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 2055; GFX9-NEXT: s_endpgm 2056; 2057; GFX10-LABEL: sub_i32_varying_offset: 2058; GFX10: ; %bb.0: ; %entry 2059; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 2060; GFX10-NEXT: v_mov_b32_e32 v1, 1 2061; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2062; GFX10-NEXT: buffer_atomic_sub v1, v0, s[0:3], 0 offen glc 2063; GFX10-NEXT: s_waitcnt_depctr 0xffe3 2064; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 2065; GFX10-NEXT: v_mov_b32_e32 v0, 0 2066; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2067; GFX10-NEXT: global_store_dword v0, v1, s[0:1] 2068; GFX10-NEXT: s_endpgm 2069; 2070; GFX11W64-LABEL: sub_i32_varying_offset: 2071; GFX11W64: ; %bb.0: ; %entry 2072; GFX11W64-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 2073; GFX11W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0 2074; GFX11W64-NEXT: v_mov_b32_e32 v1, 1 2075; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) 2076; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, v0, s[0:3], 0 offen glc 2077; GFX11W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 2078; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 2079; GFX11W64-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2080; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] 2081; GFX11W64-NEXT: s_endpgm 2082; 2083; GFX11W32-LABEL: sub_i32_varying_offset: 2084; GFX11W32: ; %bb.0: ; %entry 2085; GFX11W32-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 2086; GFX11W32-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 2087; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) 2088; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, v0, s[0:3], 0 offen glc 2089; GFX11W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 2090; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 2091; GFX11W32-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2092; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] 2093; GFX11W32-NEXT: s_endpgm 2094; 2095; GFX12W64-LABEL: sub_i32_varying_offset: 2096; GFX12W64: ; %bb.0: ; %entry 2097; 
GFX12W64-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 2098; GFX12W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0 2099; GFX12W64-NEXT: v_mov_b32_e32 v1, 1 2100; GFX12W64-NEXT: s_wait_kmcnt 0x0 2101; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, v0, s[0:3], null offen th:TH_ATOMIC_RETURN 2102; GFX12W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 2103; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 2104; GFX12W64-NEXT: s_wait_loadcnt 0x0 2105; GFX12W64-NEXT: s_wait_kmcnt 0x0 2106; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1] 2107; GFX12W64-NEXT: s_endpgm 2108; 2109; GFX12W32-LABEL: sub_i32_varying_offset: 2110; GFX12W32: ; %bb.0: ; %entry 2111; GFX12W32-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 2112; GFX12W32-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 2113; GFX12W32-NEXT: s_wait_kmcnt 0x0 2114; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, v0, s[0:3], null offen th:TH_ATOMIC_RETURN 2115; GFX12W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 2116; GFX12W32-NEXT: v_mov_b32_e32 v0, 0 2117; GFX12W32-NEXT: s_wait_loadcnt 0x0 2118; GFX12W32-NEXT: s_wait_kmcnt 0x0 2119; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1] 2120; GFX12W32-NEXT: s_endpgm 2121entry: 2122 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2123 %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.sub(i32 1, ptr addrspace(8) %inout, i32 %lane, i32 0, i32 0) 2124 store i32 %old, ptr addrspace(1) %out 2125 ret void 2126} 2127;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: 2128; GFX11: {{.*}} 2129; GFX12: {{.*}} 2130