1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX6 %s 3; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s 4; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s 5; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s 6; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s 7; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s 8; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s 9; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W64 %s 10; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W32 %s 11 12declare i32 @llvm.amdgcn.workitem.id.x() 13declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32, ptr addrspace(8), i32, i32, i32 immarg) 14declare i32 @llvm.amdgcn.struct.ptr.buffer.atomic.add(i32, ptr addrspace(8), i32, 
i32, i32, i32 immarg) 15declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.sub(i32, ptr addrspace(8), i32, i32, i32 immarg) 16 17; Show what the atomic optimization pass will do for raw buffers. 18; Constant addend: the atomic add of 5 is expected to execute only in the lanes whose v_mbcnt result is 0, with the operand scaled to 5 * s_bcnt1(exec); every lane then rebuilds its own result as v_readfirstlane(old) + 5 * v_mbcnt. 19define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace(8) %inout) { 20; GFX6-LABEL: add_i32_constant: 21; GFX6: ; %bb.0: ; %entry 22; GFX6-NEXT: s_mov_b64 s[2:3], exec 23; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 24; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 25; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 26; GFX6-NEXT: ; implicit-def: $vgpr1 27; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc 28; GFX6-NEXT: s_cbranch_execz .LBB0_2 29; GFX6-NEXT: ; %bb.1: 30; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd 31; GFX6-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 32; GFX6-NEXT: s_mul_i32 s2, s2, 5 33; GFX6-NEXT: v_mov_b32_e32 v1, s2 34; GFX6-NEXT: s_waitcnt lgkmcnt(0) 35; GFX6-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 36; GFX6-NEXT: .LBB0_2: 37; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] 38; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 39; GFX6-NEXT: s_mov_b32 s3, 0xf000 40; GFX6-NEXT: s_mov_b32 s2, -1 41; GFX6-NEXT: s_waitcnt vmcnt(0) 42; GFX6-NEXT: v_readfirstlane_b32 s4, v1 43; GFX6-NEXT: v_mad_u32_u24 v0, v0, 5, s4 44; GFX6-NEXT: s_waitcnt lgkmcnt(0) 45; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 46; GFX6-NEXT: s_endpgm 47; 48; GFX8-LABEL: add_i32_constant: 49; GFX8: ; %bb.0: ; %entry 50; GFX8-NEXT: s_mov_b64 s[2:3], exec 51; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 52; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 53; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 54; GFX8-NEXT: ; implicit-def: $vgpr1 55; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc 56; GFX8-NEXT: s_cbranch_execz .LBB0_2 57; GFX8-NEXT: ; %bb.1: 58; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 59; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 60; GFX8-NEXT: s_mul_i32 s2, s2, 5 61; GFX8-NEXT: v_mov_b32_e32 v1, s2 62; GFX8-NEXT: s_waitcnt lgkmcnt(0) 63; GFX8-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 64; 
GFX8-NEXT: .LBB0_2: 65; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] 66; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 67; GFX8-NEXT: s_waitcnt vmcnt(0) 68; GFX8-NEXT: v_readfirstlane_b32 s2, v1 69; GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, s2 70; GFX8-NEXT: s_waitcnt lgkmcnt(0) 71; GFX8-NEXT: v_mov_b32_e32 v0, s0 72; GFX8-NEXT: v_mov_b32_e32 v1, s1 73; GFX8-NEXT: flat_store_dword v[0:1], v2 74; GFX8-NEXT: s_endpgm 75; 76; GFX9-LABEL: add_i32_constant: 77; GFX9: ; %bb.0: ; %entry 78; GFX9-NEXT: s_mov_b64 s[2:3], exec 79; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 80; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 81; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 82; GFX9-NEXT: ; implicit-def: $vgpr1 83; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 84; GFX9-NEXT: s_cbranch_execz .LBB0_2 85; GFX9-NEXT: ; %bb.1: 86; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 87; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 88; GFX9-NEXT: s_mul_i32 s2, s2, 5 89; GFX9-NEXT: v_mov_b32_e32 v1, s2 90; GFX9-NEXT: s_waitcnt lgkmcnt(0) 91; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 92; GFX9-NEXT: .LBB0_2: 93; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 94; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 95; GFX9-NEXT: s_waitcnt vmcnt(0) 96; GFX9-NEXT: v_readfirstlane_b32 s2, v1 97; GFX9-NEXT: v_mov_b32_e32 v2, 0 98; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2 99; GFX9-NEXT: s_waitcnt lgkmcnt(0) 100; GFX9-NEXT: global_store_dword v2, v0, s[0:1] 101; GFX9-NEXT: s_endpgm 102; 103; GFX10W64-LABEL: add_i32_constant: 104; GFX10W64: ; %bb.0: ; %entry 105; GFX10W64-NEXT: s_mov_b64 s[2:3], exec 106; GFX10W64-NEXT: ; implicit-def: $vgpr1 107; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 108; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 109; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 110; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc 111; GFX10W64-NEXT: s_cbranch_execz .LBB0_2 112; GFX10W64-NEXT: ; %bb.1: 113; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 114; GFX10W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 115; GFX10W64-NEXT: s_mul_i32 
s2, s2, 5 116; GFX10W64-NEXT: v_mov_b32_e32 v1, s2 117; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 118; GFX10W64-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 119; GFX10W64-NEXT: .LBB0_2: 120; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 121; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] 122; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 123; GFX10W64-NEXT: s_waitcnt vmcnt(0) 124; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 125; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 126; GFX10W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2 127; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 128; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1] 129; GFX10W64-NEXT: s_endpgm 130; 131; GFX10W32-LABEL: add_i32_constant: 132; GFX10W32: ; %bb.0: ; %entry 133; GFX10W32-NEXT: s_mov_b32 s1, exec_lo 134; GFX10W32-NEXT: ; implicit-def: $vgpr1 135; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 136; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 137; GFX10W32-NEXT: s_and_saveexec_b32 s0, vcc_lo 138; GFX10W32-NEXT: s_cbranch_execz .LBB0_2 139; GFX10W32-NEXT: ; %bb.1: 140; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 141; GFX10W32-NEXT: s_bcnt1_i32_b32 s1, s1 142; GFX10W32-NEXT: s_mul_i32 s1, s1, 5 143; GFX10W32-NEXT: v_mov_b32_e32 v1, s1 144; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 145; GFX10W32-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 146; GFX10W32-NEXT: .LBB0_2: 147; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 148; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 149; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 150; GFX10W32-NEXT: s_waitcnt vmcnt(0) 151; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1 152; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 153; GFX10W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2 154; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 155; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1] 156; GFX10W32-NEXT: s_endpgm 157; 158; GFX11W64-LABEL: add_i32_constant: 159; GFX11W64: ; %bb.0: ; %entry 160; GFX11W64-NEXT: s_mov_b64 s[2:3], exec 161; GFX11W64-NEXT: s_mov_b64 s[0:1], exec 162; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 163; 
GFX11W64-NEXT: ; implicit-def: $vgpr1 164; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 165; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 166; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 167; GFX11W64-NEXT: s_cbranch_execz .LBB0_2 168; GFX11W64-NEXT: ; %bb.1: 169; GFX11W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 170; GFX11W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 171; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 172; GFX11W64-NEXT: s_mul_i32 s2, s2, 5 173; GFX11W64-NEXT: v_mov_b32_e32 v1, s2 174; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) 175; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc 176; GFX11W64-NEXT: .LBB0_2: 177; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] 178; GFX11W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 179; GFX11W64-NEXT: s_waitcnt vmcnt(0) 180; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 181; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 182; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) 183; GFX11W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2 184; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) 185; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1] 186; GFX11W64-NEXT: s_endpgm 187; 188; GFX11W32-LABEL: add_i32_constant: 189; GFX11W32: ; %bb.0: ; %entry 190; GFX11W32-NEXT: s_mov_b32 s1, exec_lo 191; GFX11W32-NEXT: s_mov_b32 s0, exec_lo 192; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 193; GFX11W32-NEXT: ; implicit-def: $vgpr1 194; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) 195; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 196; GFX11W32-NEXT: s_cbranch_execz .LBB0_2 197; GFX11W32-NEXT: ; %bb.1: 198; GFX11W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 199; GFX11W32-NEXT: s_bcnt1_i32_b32 s1, s1 200; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 201; GFX11W32-NEXT: s_mul_i32 s1, s1, 5 202; GFX11W32-NEXT: v_mov_b32_e32 v1, s1 203; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) 204; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc 205; GFX11W32-NEXT: .LBB0_2: 206; 
GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 207; GFX11W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 208; GFX11W32-NEXT: s_waitcnt vmcnt(0) 209; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 210; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 211; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) 212; GFX11W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2 213; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) 214; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] 215; GFX11W32-NEXT: s_endpgm 216; 217; GFX12W64-LABEL: add_i32_constant: 218; GFX12W64: ; %bb.0: ; %entry 219; GFX12W64-NEXT: s_mov_b64 s[2:3], exec 220; GFX12W64-NEXT: s_mov_b64 s[0:1], exec 221; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 222; GFX12W64-NEXT: ; implicit-def: $vgpr1 223; GFX12W64-NEXT: s_wait_alu 0xfffe 224; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 225; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 226; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 227; GFX12W64-NEXT: s_cbranch_execz .LBB0_2 228; GFX12W64-NEXT: ; %bb.1: 229; GFX12W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 230; GFX12W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 231; GFX12W64-NEXT: s_wait_alu 0xfffe 232; GFX12W64-NEXT: s_mul_i32 s2, s2, 5 233; GFX12W64-NEXT: s_wait_alu 0xfffe 234; GFX12W64-NEXT: v_mov_b32_e32 v1, s2 235; GFX12W64-NEXT: s_wait_kmcnt 0x0 236; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN 237; GFX12W64-NEXT: .LBB0_2: 238; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] 239; GFX12W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 240; GFX12W64-NEXT: s_wait_loadcnt 0x0 241; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 242; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 243; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) 244; GFX12W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2 245; GFX12W64-NEXT: s_wait_kmcnt 0x0 246; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] 247; GFX12W64-NEXT: s_endpgm 248; 249; GFX12W32-LABEL: add_i32_constant: 250; GFX12W32: ; %bb.0: ; %entry 251; GFX12W32-NEXT: s_mov_b32 s1, exec_lo 252; GFX12W32-NEXT: s_mov_b32 
s0, exec_lo 253; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 254; GFX12W32-NEXT: ; implicit-def: $vgpr1 255; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) 256; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 257; GFX12W32-NEXT: s_cbranch_execz .LBB0_2 258; GFX12W32-NEXT: ; %bb.1: 259; GFX12W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 260; GFX12W32-NEXT: s_wait_alu 0xfffe 261; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1 262; GFX12W32-NEXT: s_wait_alu 0xfffe 263; GFX12W32-NEXT: s_mul_i32 s1, s1, 5 264; GFX12W32-NEXT: s_wait_alu 0xfffe 265; GFX12W32-NEXT: v_mov_b32_e32 v1, s1 266; GFX12W32-NEXT: s_wait_kmcnt 0x0 267; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN 268; GFX12W32-NEXT: .LBB0_2: 269; GFX12W32-NEXT: s_wait_alu 0xfffe 270; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 271; GFX12W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 272; GFX12W32-NEXT: s_wait_loadcnt 0x0 273; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 274; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 275; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) 276; GFX12W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2 277; GFX12W32-NEXT: s_wait_kmcnt 0x0 278; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] 279; GFX12W32-NEXT: s_endpgm 280entry: 281 %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32 5, ptr addrspace(8) %inout, i32 0, i32 0, i32 0) 282 store i32 %old, ptr addrspace(1) %out 283 ret void 284} 285; Uniform addend: %additive is a kernel argument; the checks expect one buffer atomic add of %additive * s_bcnt1(exec) in the lanes whose v_mbcnt result is 0, and every lane then rebuilds its own result as v_readfirstlane(old) + %additive * v_mbcnt. 286define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(8) %inout, i32 %additive) { 287; GFX6-LABEL: add_i32_uniform: 288; GFX6: ; %bb.0: ; %entry 289; GFX6-NEXT: s_mov_b64 s[2:3], exec 290; GFX6-NEXT: s_load_dword s6, s[4:5], 0x11 291; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 292; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 293; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 294; GFX6-NEXT: ; implicit-def: $vgpr1 295; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc 296; GFX6-NEXT: s_cbranch_execz .LBB1_2 297; GFX6-NEXT: ; %bb.1: 298; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd 299; 
GFX6-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 300; GFX6-NEXT: s_waitcnt lgkmcnt(0) 301; GFX6-NEXT: s_mul_i32 s2, s6, s2 302; GFX6-NEXT: v_mov_b32_e32 v1, s2 303; GFX6-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 304; GFX6-NEXT: .LBB1_2: 305; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] 306; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 307; GFX6-NEXT: s_mov_b32 s3, 0xf000 308; GFX6-NEXT: s_mov_b32 s2, -1 309; GFX6-NEXT: s_waitcnt vmcnt(0) 310; GFX6-NEXT: v_readfirstlane_b32 s4, v1 311; GFX6-NEXT: s_waitcnt lgkmcnt(0) 312; GFX6-NEXT: v_mul_lo_u32 v0, s6, v0 313; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v0 314; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 315; GFX6-NEXT: s_endpgm 316; 317; GFX8-LABEL: add_i32_uniform: 318; GFX8: ; %bb.0: ; %entry 319; GFX8-NEXT: s_load_dword s6, s[4:5], 0x44 320; GFX8-NEXT: s_mov_b64 s[2:3], exec 321; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 322; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 323; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 324; GFX8-NEXT: ; implicit-def: $vgpr1 325; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc 326; GFX8-NEXT: s_cbranch_execz .LBB1_2 327; GFX8-NEXT: ; %bb.1: 328; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 329; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 330; GFX8-NEXT: s_waitcnt lgkmcnt(0) 331; GFX8-NEXT: s_mul_i32 s2, s6, s2 332; GFX8-NEXT: v_mov_b32_e32 v1, s2 333; GFX8-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 334; GFX8-NEXT: .LBB1_2: 335; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] 336; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 337; GFX8-NEXT: s_waitcnt lgkmcnt(0) 338; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 339; GFX8-NEXT: s_waitcnt vmcnt(0) 340; GFX8-NEXT: v_readfirstlane_b32 s2, v1 341; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v0 342; GFX8-NEXT: v_mov_b32_e32 v0, s0 343; GFX8-NEXT: v_mov_b32_e32 v1, s1 344; GFX8-NEXT: flat_store_dword v[0:1], v2 345; GFX8-NEXT: s_endpgm 346; 347; GFX9-LABEL: add_i32_uniform: 348; GFX9: ; %bb.0: ; %entry 349; GFX9-NEXT: s_load_dword s6, s[4:5], 0x44 350; GFX9-NEXT: s_mov_b64 s[2:3], exec 
351; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 352; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 353; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 354; GFX9-NEXT: ; implicit-def: $vgpr1 355; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 356; GFX9-NEXT: s_cbranch_execz .LBB1_2 357; GFX9-NEXT: ; %bb.1: 358; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 359; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 360; GFX9-NEXT: s_waitcnt lgkmcnt(0) 361; GFX9-NEXT: s_mul_i32 s2, s6, s2 362; GFX9-NEXT: v_mov_b32_e32 v1, s2 363; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 364; GFX9-NEXT: .LBB1_2: 365; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 366; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 367; GFX9-NEXT: s_waitcnt lgkmcnt(0) 368; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 369; GFX9-NEXT: s_waitcnt vmcnt(0) 370; GFX9-NEXT: v_readfirstlane_b32 s2, v1 371; GFX9-NEXT: v_mov_b32_e32 v2, 0 372; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 373; GFX9-NEXT: global_store_dword v2, v0, s[0:1] 374; GFX9-NEXT: s_endpgm 375; 376; GFX10W64-LABEL: add_i32_uniform: 377; GFX10W64: ; %bb.0: ; %entry 378; GFX10W64-NEXT: s_load_dword s6, s[4:5], 0x44 379; GFX10W64-NEXT: s_mov_b64 s[2:3], exec 380; GFX10W64-NEXT: ; implicit-def: $vgpr1 381; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 382; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 383; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 384; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc 385; GFX10W64-NEXT: s_cbranch_execz .LBB1_2 386; GFX10W64-NEXT: ; %bb.1: 387; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 388; GFX10W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 389; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 390; GFX10W64-NEXT: s_mul_i32 s2, s6, s2 391; GFX10W64-NEXT: v_mov_b32_e32 v1, s2 392; GFX10W64-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 393; GFX10W64-NEXT: .LBB1_2: 394; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 395; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] 396; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 397; GFX10W64-NEXT: s_waitcnt vmcnt(0) 398; GFX10W64-NEXT: 
v_readfirstlane_b32 s2, v1 399; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 400; GFX10W64-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s6, v0, s[2:3] 401; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 402; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1] 403; GFX10W64-NEXT: s_endpgm 404; 405; GFX10W32-LABEL: add_i32_uniform: 406; GFX10W32: ; %bb.0: ; %entry 407; GFX10W32-NEXT: s_load_dword s0, s[4:5], 0x44 408; GFX10W32-NEXT: s_mov_b32 s2, exec_lo 409; GFX10W32-NEXT: ; implicit-def: $vgpr1 410; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 411; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 412; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo 413; GFX10W32-NEXT: s_cbranch_execz .LBB1_2 414; GFX10W32-NEXT: ; %bb.1: 415; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 416; GFX10W32-NEXT: s_bcnt1_i32_b32 s2, s2 417; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 418; GFX10W32-NEXT: s_mul_i32 s2, s0, s2 419; GFX10W32-NEXT: v_mov_b32_e32 v1, s2 420; GFX10W32-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 421; GFX10W32-NEXT: .LBB1_2: 422; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 423; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 424; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 425; GFX10W32-NEXT: s_waitcnt vmcnt(0) 426; GFX10W32-NEXT: s_mov_b32 null, 0 427; GFX10W32-NEXT: v_readfirstlane_b32 s4, v1 428; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 429; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s0, s0, v0, s[4:5] 430; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 431; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] 432; GFX10W32-NEXT: s_endpgm 433; 434; GFX11W64-LABEL: add_i32_uniform: 435; GFX11W64: ; %bb.0: ; %entry 436; GFX11W64-NEXT: s_load_b32 s6, s[4:5], 0x44 437; GFX11W64-NEXT: s_mov_b64 s[2:3], exec 438; GFX11W64-NEXT: s_mov_b64 s[0:1], exec 439; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 440; GFX11W64-NEXT: ; implicit-def: $vgpr1 441; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 442; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 443; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 444; 
GFX11W64-NEXT: s_cbranch_execz .LBB1_2 445; GFX11W64-NEXT: ; %bb.1: 446; GFX11W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 447; GFX11W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 448; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) 449; GFX11W64-NEXT: s_mul_i32 s2, s6, s2 450; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 451; GFX11W64-NEXT: v_mov_b32_e32 v1, s2 452; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc 453; GFX11W64-NEXT: .LBB1_2: 454; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] 455; GFX11W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 456; GFX11W64-NEXT: s_waitcnt vmcnt(0) 457; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 458; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) 459; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) 460; GFX11W64-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[2:3] 461; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 462; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] 463; GFX11W64-NEXT: s_endpgm 464; 465; GFX11W32-LABEL: add_i32_uniform: 466; GFX11W32: ; %bb.0: ; %entry 467; GFX11W32-NEXT: s_load_b32 s0, s[4:5], 0x44 468; GFX11W32-NEXT: s_mov_b32 s2, exec_lo 469; GFX11W32-NEXT: s_mov_b32 s1, exec_lo 470; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 471; GFX11W32-NEXT: ; implicit-def: $vgpr1 472; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) 473; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 474; GFX11W32-NEXT: s_cbranch_execz .LBB1_2 475; GFX11W32-NEXT: ; %bb.1: 476; GFX11W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 477; GFX11W32-NEXT: s_bcnt1_i32_b32 s2, s2 478; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) 479; GFX11W32-NEXT: s_mul_i32 s2, s0, s2 480; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 481; GFX11W32-NEXT: v_mov_b32_e32 v1, s2 482; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc 483; GFX11W32-NEXT: .LBB1_2: 484; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 485; GFX11W32-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 486; GFX11W32-NEXT: s_waitcnt vmcnt(0) 487; GFX11W32-NEXT: v_readfirstlane_b32 s4, v1 488; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) 489; GFX11W32-NEXT: 
s_delay_alu instid0(VALU_DEP_1) 490; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s0, v0, s[4:5] 491; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 492; GFX11W32-NEXT: global_store_b32 v0, v1, s[2:3] 493; GFX11W32-NEXT: s_endpgm 494; 495; GFX12W64-LABEL: add_i32_uniform: 496; GFX12W64: ; %bb.0: ; %entry 497; GFX12W64-NEXT: s_load_b32 s6, s[4:5], 0x44 498; GFX12W64-NEXT: s_mov_b64 s[2:3], exec 499; GFX12W64-NEXT: s_mov_b64 s[0:1], exec 500; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 501; GFX12W64-NEXT: ; implicit-def: $vgpr1 502; GFX12W64-NEXT: s_wait_alu 0xfffe 503; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 504; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 505; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 506; GFX12W64-NEXT: s_cbranch_execz .LBB1_2 507; GFX12W64-NEXT: ; %bb.1: 508; GFX12W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 509; GFX12W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 510; GFX12W64-NEXT: s_wait_kmcnt 0x0 511; GFX12W64-NEXT: s_wait_alu 0xfffe 512; GFX12W64-NEXT: s_mul_i32 s2, s6, s2 513; GFX12W64-NEXT: s_wait_alu 0xfffe 514; GFX12W64-NEXT: v_mov_b32_e32 v1, s2 515; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN 516; GFX12W64-NEXT: .LBB1_2: 517; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] 518; GFX12W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 519; GFX12W64-NEXT: s_wait_loadcnt 0x0 520; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 521; GFX12W64-NEXT: s_wait_kmcnt 0x0 522; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) 523; GFX12W64-NEXT: v_mad_co_u64_u32 v[0:1], null, s6, v0, s[2:3] 524; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 525; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] 526; GFX12W64-NEXT: s_endpgm 527; 528; GFX12W32-LABEL: add_i32_uniform: 529; GFX12W32: ; %bb.0: ; %entry 530; GFX12W32-NEXT: s_load_b32 s0, s[4:5], 0x44 531; GFX12W32-NEXT: s_mov_b32 s2, exec_lo 532; GFX12W32-NEXT: s_mov_b32 s1, exec_lo 533; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 534; GFX12W32-NEXT: ; implicit-def: $vgpr1 535; 
GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) 536; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 537; GFX12W32-NEXT: s_cbranch_execz .LBB1_2 538; GFX12W32-NEXT: ; %bb.1: 539; GFX12W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 540; GFX12W32-NEXT: s_wait_alu 0xfffe 541; GFX12W32-NEXT: s_bcnt1_i32_b32 s2, s2 542; GFX12W32-NEXT: s_wait_kmcnt 0x0 543; GFX12W32-NEXT: s_wait_alu 0xfffe 544; GFX12W32-NEXT: s_mul_i32 s2, s0, s2 545; GFX12W32-NEXT: s_wait_alu 0xfffe 546; GFX12W32-NEXT: v_mov_b32_e32 v1, s2 547; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN 548; GFX12W32-NEXT: .LBB1_2: 549; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 550; GFX12W32-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 551; GFX12W32-NEXT: s_wait_loadcnt 0x0 552; GFX12W32-NEXT: v_readfirstlane_b32 s4, v1 553; GFX12W32-NEXT: s_wait_kmcnt 0x0 554; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) 555; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s0, v0, s[4:5] 556; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 557; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3] 558; GFX12W32-NEXT: s_endpgm 559entry: 560 %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32 %additive, ptr addrspace(8) %inout, i32 0, i32 0, i32 0) 561 store i32 %old, ptr addrspace(1) %out 562 ret void 563} 564 565define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout) { 566; GFX6-LABEL: add_i32_varying_vdata: 567; GFX6: ; %bb.0: ; %entry 568; GFX6-NEXT: s_mov_b64 s[0:1], exec 569; GFX6-NEXT: s_mov_b32 s2, 0 570; GFX6-NEXT: ; implicit-def: $vgpr1 571; GFX6-NEXT: .LBB2_1: ; %ComputeLoop 572; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 573; GFX6-NEXT: s_ff1_i32_b64 s3, s[0:1] 574; GFX6-NEXT: s_mov_b32 m0, s3 575; GFX6-NEXT: v_readlane_b32 s8, v0, s3 576; GFX6-NEXT: v_writelane_b32 v1, s2, m0 577; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s3 578; GFX6-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] 579; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 580; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7] 581; 
GFX6-NEXT: s_add_i32 s2, s2, s8 582; GFX6-NEXT: s_cbranch_vccnz .LBB2_1 583; GFX6-NEXT: ; %bb.2: ; %ComputeEnd 584; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 585; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 586; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 587; GFX6-NEXT: ; implicit-def: $vgpr0 588; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc 589; GFX6-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 590; GFX6-NEXT: s_cbranch_execz .LBB2_4 591; GFX6-NEXT: ; %bb.3: 592; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd 593; GFX6-NEXT: v_mov_b32_e32 v0, s2 594; GFX6-NEXT: s_waitcnt lgkmcnt(0) 595; GFX6-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc 596; GFX6-NEXT: .LBB2_4: 597; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] 598; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 599; GFX6-NEXT: s_mov_b32 s3, 0xf000 600; GFX6-NEXT: s_mov_b32 s2, -1 601; GFX6-NEXT: s_waitcnt vmcnt(0) 602; GFX6-NEXT: v_readfirstlane_b32 s4, v0 603; GFX6-NEXT: s_waitcnt expcnt(0) 604; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v1 605; GFX6-NEXT: s_waitcnt lgkmcnt(0) 606; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 607; GFX6-NEXT: s_endpgm 608; 609; GFX8-LABEL: add_i32_varying_vdata: 610; GFX8: ; %bb.0: ; %entry 611; GFX8-NEXT: s_mov_b64 s[0:1], exec 612; GFX8-NEXT: s_mov_b32 s2, 0 613; GFX8-NEXT: ; implicit-def: $vgpr1 614; GFX8-NEXT: .LBB2_1: ; %ComputeLoop 615; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 616; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] 617; GFX8-NEXT: s_mov_b32 m0, s3 618; GFX8-NEXT: v_readlane_b32 s8, v0, s3 619; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 620; GFX8-NEXT: v_writelane_b32 v1, s2, m0 621; GFX8-NEXT: s_add_i32 s2, s2, s8 622; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] 623; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 624; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 625; GFX8-NEXT: ; %bb.2: ; %ComputeEnd 626; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 627; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 628; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 629; GFX8-NEXT: ; implicit-def: $vgpr0 630; GFX8-NEXT: 
s_and_saveexec_b64 s[0:1], vcc 631; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 632; GFX8-NEXT: s_cbranch_execz .LBB2_4 633; GFX8-NEXT: ; %bb.3: 634; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 635; GFX8-NEXT: v_mov_b32_e32 v0, s2 636; GFX8-NEXT: s_waitcnt lgkmcnt(0) 637; GFX8-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc 638; GFX8-NEXT: .LBB2_4: 639; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] 640; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 641; GFX8-NEXT: s_waitcnt vmcnt(0) 642; GFX8-NEXT: v_readfirstlane_b32 s2, v0 643; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v1 644; GFX8-NEXT: s_waitcnt lgkmcnt(0) 645; GFX8-NEXT: v_mov_b32_e32 v0, s0 646; GFX8-NEXT: v_mov_b32_e32 v1, s1 647; GFX8-NEXT: flat_store_dword v[0:1], v2 648; GFX8-NEXT: s_endpgm 649; 650; GFX9-LABEL: add_i32_varying_vdata: 651; GFX9: ; %bb.0: ; %entry 652; GFX9-NEXT: s_mov_b64 s[0:1], exec 653; GFX9-NEXT: s_mov_b32 s2, 0 654; GFX9-NEXT: ; implicit-def: $vgpr1 655; GFX9-NEXT: .LBB2_1: ; %ComputeLoop 656; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 657; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] 658; GFX9-NEXT: s_mov_b32 m0, s3 659; GFX9-NEXT: v_readlane_b32 s8, v0, s3 660; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 661; GFX9-NEXT: v_writelane_b32 v1, s2, m0 662; GFX9-NEXT: s_add_i32 s2, s2, s8 663; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] 664; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 665; GFX9-NEXT: s_cbranch_scc1 .LBB2_1 666; GFX9-NEXT: ; %bb.2: ; %ComputeEnd 667; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 668; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 669; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 670; GFX9-NEXT: ; implicit-def: $vgpr0 671; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 672; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 673; GFX9-NEXT: s_cbranch_execz .LBB2_4 674; GFX9-NEXT: ; %bb.3: 675; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 676; GFX9-NEXT: v_mov_b32_e32 v0, s2 677; GFX9-NEXT: s_waitcnt lgkmcnt(0) 678; GFX9-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc 679; GFX9-NEXT: .LBB2_4: 680; 
GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 681; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 682; GFX9-NEXT: s_waitcnt vmcnt(0) 683; GFX9-NEXT: v_readfirstlane_b32 s2, v0 684; GFX9-NEXT: v_mov_b32_e32 v2, 0 685; GFX9-NEXT: v_add_u32_e32 v0, s2, v1 686; GFX9-NEXT: s_waitcnt lgkmcnt(0) 687; GFX9-NEXT: global_store_dword v2, v0, s[0:1] 688; GFX9-NEXT: s_endpgm 689; 690; GFX10W64-LABEL: add_i32_varying_vdata: 691; GFX10W64: ; %bb.0: ; %entry 692; GFX10W64-NEXT: s_mov_b64 s[0:1], exec 693; GFX10W64-NEXT: s_mov_b32 s2, 0 694; GFX10W64-NEXT: ; implicit-def: $vgpr1 695; GFX10W64-NEXT: .LBB2_1: ; %ComputeLoop 696; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 697; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1] 698; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3 699; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 700; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 701; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] 702; GFX10W64-NEXT: s_add_i32 s2, s2, s8 703; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 704; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_1 705; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd 706; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 707; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 708; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 709; GFX10W64-NEXT: ; implicit-def: $vgpr0 710; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc 711; GFX10W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 712; GFX10W64-NEXT: s_cbranch_execz .LBB2_4 713; GFX10W64-NEXT: ; %bb.3: 714; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 715; GFX10W64-NEXT: v_mov_b32_e32 v0, s2 716; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 717; GFX10W64-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc 718; GFX10W64-NEXT: .LBB2_4: 719; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 720; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] 721; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 722; GFX10W64-NEXT: s_waitcnt vmcnt(0) 723; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 724; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 725; GFX10W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 
726; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 727; GFX10W64-NEXT: global_store_dword v0, v1, s[0:1] 728; GFX10W64-NEXT: s_endpgm 729; 730; GFX10W32-LABEL: add_i32_varying_vdata: 731; GFX10W32: ; %bb.0: ; %entry 732; GFX10W32-NEXT: s_mov_b32 s1, exec_lo 733; GFX10W32-NEXT: s_mov_b32 s0, 0 734; GFX10W32-NEXT: ; implicit-def: $vgpr1 735; GFX10W32-NEXT: .LBB2_1: ; %ComputeLoop 736; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 737; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 738; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2 739; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2 740; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2 741; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 742; GFX10W32-NEXT: s_add_i32 s0, s0, s3 743; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 744; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1 745; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd 746; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 747; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 748; GFX10W32-NEXT: ; implicit-def: $vgpr0 749; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo 750; GFX10W32-NEXT: s_xor_b32 s1, exec_lo, s1 751; GFX10W32-NEXT: s_cbranch_execz .LBB2_4 752; GFX10W32-NEXT: ; %bb.3: 753; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 754; GFX10W32-NEXT: v_mov_b32_e32 v0, s0 755; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 756; GFX10W32-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc 757; GFX10W32-NEXT: .LBB2_4: 758; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 759; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 760; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 761; GFX10W32-NEXT: s_waitcnt vmcnt(0) 762; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 763; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 764; GFX10W32-NEXT: v_add_nc_u32_e32 v1, s2, v1 765; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 766; GFX10W32-NEXT: global_store_dword v0, v1, s[0:1] 767; GFX10W32-NEXT: s_endpgm 768; 769; GFX11W64-LABEL: add_i32_varying_vdata: 770; GFX11W64: ; %bb.0: ; %entry 771; GFX11W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 772; GFX11W64-NEXT: s_mov_b64 s[0:1], exec 773; GFX11W64-NEXT: 
s_mov_b32 s2, 0 774; GFX11W64-NEXT: ; implicit-def: $vgpr0 775; GFX11W64-NEXT: .LBB2_1: ; %ComputeLoop 776; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 777; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1] 778; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 779; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3 780; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 781; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 782; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] 783; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) 784; GFX11W64-NEXT: s_add_i32 s2, s2, s8 785; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 786; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1 787; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd 788; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 789; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 790; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 791; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 792; GFX11W64-NEXT: ; implicit-def: $vgpr1 793; GFX11W64-NEXT: s_and_saveexec_b64 s[0:1], vcc 794; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 795; GFX11W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 796; GFX11W64-NEXT: s_cbranch_execz .LBB2_4 797; GFX11W64-NEXT: ; %bb.3: 798; GFX11W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 799; GFX11W64-NEXT: v_mov_b32_e32 v1, s2 800; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) 801; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc 802; GFX11W64-NEXT: .LBB2_4: 803; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] 804; GFX11W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 805; GFX11W64-NEXT: s_waitcnt vmcnt(0) 806; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 807; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 808; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) 809; GFX11W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 810; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) 811; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1] 812; GFX11W64-NEXT: s_endpgm 813; 814; GFX11W32-LABEL: add_i32_varying_vdata: 815; GFX11W32: ; %bb.0: ; %entry 816; GFX11W32-NEXT: 
v_and_b32_e32 v1, 0x3ff, v0 817; GFX11W32-NEXT: s_mov_b32 s1, exec_lo 818; GFX11W32-NEXT: s_mov_b32 s0, 0 819; GFX11W32-NEXT: ; implicit-def: $vgpr0 820; GFX11W32-NEXT: .LBB2_1: ; %ComputeLoop 821; GFX11W32-NEXT: ; =>This Inner Loop Header: Depth=1 822; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 823; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 824; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2 825; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 826; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 827; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 828; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) 829; GFX11W32-NEXT: s_add_i32 s0, s0, s3 830; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 831; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1 832; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd 833; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 834; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 835; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 836; GFX11W32-NEXT: ; implicit-def: $vgpr1 837; GFX11W32-NEXT: s_and_saveexec_b32 s1, vcc_lo 838; GFX11W32-NEXT: s_xor_b32 s1, exec_lo, s1 839; GFX11W32-NEXT: s_cbranch_execz .LBB2_4 840; GFX11W32-NEXT: ; %bb.3: 841; GFX11W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 842; GFX11W32-NEXT: v_mov_b32_e32 v1, s0 843; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) 844; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc 845; GFX11W32-NEXT: .LBB2_4: 846; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 847; GFX11W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 848; GFX11W32-NEXT: s_waitcnt vmcnt(0) 849; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 850; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) 851; GFX11W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0 852; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) 853; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] 854; GFX11W32-NEXT: s_endpgm 855; 856; GFX12W64-LABEL: add_i32_varying_vdata: 857; GFX12W64: ; %bb.0: ; %entry 858; GFX12W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 859; GFX12W64-NEXT: 
s_mov_b64 s[0:1], exec 860; GFX12W64-NEXT: s_mov_b32 s2, 0 861; GFX12W64-NEXT: ; implicit-def: $vgpr0 862; GFX12W64-NEXT: .LBB2_1: ; %ComputeLoop 863; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1 864; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1] 865; GFX12W64-NEXT: s_wait_alu 0xfffe 866; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3 867; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 868; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 869; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] 870; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) 871; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 872; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 873; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1 874; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd 875; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 876; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 877; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 878; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 879; GFX12W64-NEXT: ; implicit-def: $vgpr1 880; GFX12W64-NEXT: s_and_saveexec_b64 s[0:1], vcc 881; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 882; GFX12W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 883; GFX12W64-NEXT: s_cbranch_execz .LBB2_4 884; GFX12W64-NEXT: ; %bb.3: 885; GFX12W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 886; GFX12W64-NEXT: s_wait_alu 0xfffe 887; GFX12W64-NEXT: v_mov_b32_e32 v1, s2 888; GFX12W64-NEXT: s_wait_kmcnt 0x0 889; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN 890; GFX12W64-NEXT: .LBB2_4: 891; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] 892; GFX12W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 893; GFX12W64-NEXT: s_wait_loadcnt 0x0 894; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 895; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 896; GFX12W64-NEXT: s_wait_alu 0xfffe 897; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) 898; GFX12W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 899; GFX12W64-NEXT: s_wait_kmcnt 0x0 900; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] 901; GFX12W64-NEXT: s_endpgm 902; 
903; GFX12W32-LABEL: add_i32_varying_vdata: 904; GFX12W32: ; %bb.0: ; %entry 905; GFX12W32-NEXT: v_and_b32_e32 v1, 0x3ff, v0 906; GFX12W32-NEXT: s_mov_b32 s1, exec_lo 907; GFX12W32-NEXT: s_mov_b32 s0, 0 908; GFX12W32-NEXT: ; implicit-def: $vgpr0 909; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop 910; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 911; GFX12W32-NEXT: s_wait_alu 0xfffe 912; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 913; GFX12W32-NEXT: s_wait_alu 0xfffe 914; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 915; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 916; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 917; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 918; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) 919; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 920; GFX12W32-NEXT: s_wait_alu 0xfffe 921; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 922; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1 923; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd 924; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 925; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) 926; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 927; GFX12W32-NEXT: ; implicit-def: $vgpr1 928; GFX12W32-NEXT: s_and_saveexec_b32 s1, vcc_lo 929; GFX12W32-NEXT: s_wait_alu 0xfffe 930; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1 931; GFX12W32-NEXT: s_cbranch_execz .LBB2_4 932; GFX12W32-NEXT: ; %bb.3: 933; GFX12W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 934; GFX12W32-NEXT: v_mov_b32_e32 v1, s0 935; GFX12W32-NEXT: s_wait_kmcnt 0x0 936; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN 937; GFX12W32-NEXT: .LBB2_4: 938; GFX12W32-NEXT: s_wait_alu 0xfffe 939; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 940; GFX12W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 941; GFX12W32-NEXT: s_wait_loadcnt 0x0 942; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 943; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) 944; GFX12W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0 945; GFX12W32-NEXT: s_wait_kmcnt 0x0 946; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] 947; 
GFX12W32-NEXT: s_endpgm 948entry: 949 %lane = call i32 @llvm.amdgcn.workitem.id.x() 950 %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32 %lane, ptr addrspace(8) %inout, i32 0, i32 0, i32 0) 951 store i32 %old, ptr addrspace(1) %out 952 ret void 953} 954 955define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout, i32 %vindex) { 956; GFX6-LABEL: struct_add_i32_varying_vdata: 957; GFX6: ; %bb.0: ; %entry 958; GFX6-NEXT: s_mov_b64 s[0:1], exec 959; GFX6-NEXT: s_mov_b32 s2, 0 960; GFX6-NEXT: ; implicit-def: $vgpr1 961; GFX6-NEXT: .LBB3_1: ; %ComputeLoop 962; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 963; GFX6-NEXT: s_ff1_i32_b64 s3, s[0:1] 964; GFX6-NEXT: s_mov_b32 m0, s3 965; GFX6-NEXT: v_readlane_b32 s8, v0, s3 966; GFX6-NEXT: v_writelane_b32 v1, s2, m0 967; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s3 968; GFX6-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] 969; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 970; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7] 971; GFX6-NEXT: s_add_i32 s2, s2, s8 972; GFX6-NEXT: s_cbranch_vccnz .LBB3_1 973; GFX6-NEXT: ; %bb.2: ; %ComputeEnd 974; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 975; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 976; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 977; GFX6-NEXT: ; implicit-def: $vgpr0 978; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc 979; GFX6-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 980; GFX6-NEXT: s_cbranch_execz .LBB3_4 981; GFX6-NEXT: ; %bb.3: 982; GFX6-NEXT: s_load_dword s3, s[4:5], 0x11 983; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd 984; GFX6-NEXT: v_mov_b32_e32 v0, s2 985; GFX6-NEXT: s_waitcnt lgkmcnt(0) 986; GFX6-NEXT: v_mov_b32_e32 v2, s3 987; GFX6-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc 988; GFX6-NEXT: .LBB3_4: 989; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] 990; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 991; GFX6-NEXT: s_mov_b32 s3, 0xf000 992; GFX6-NEXT: s_mov_b32 s2, -1 993; GFX6-NEXT: s_waitcnt vmcnt(0) 994; GFX6-NEXT: 
v_readfirstlane_b32 s4, v0 995; GFX6-NEXT: s_waitcnt expcnt(0) 996; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v1 997; GFX6-NEXT: s_waitcnt lgkmcnt(0) 998; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 999; GFX6-NEXT: s_endpgm 1000; 1001; GFX8-LABEL: struct_add_i32_varying_vdata: 1002; GFX8: ; %bb.0: ; %entry 1003; GFX8-NEXT: s_mov_b64 s[0:1], exec 1004; GFX8-NEXT: s_mov_b32 s2, 0 1005; GFX8-NEXT: ; implicit-def: $vgpr1 1006; GFX8-NEXT: .LBB3_1: ; %ComputeLoop 1007; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 1008; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] 1009; GFX8-NEXT: s_mov_b32 m0, s3 1010; GFX8-NEXT: v_readlane_b32 s8, v0, s3 1011; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 1012; GFX8-NEXT: v_writelane_b32 v1, s2, m0 1013; GFX8-NEXT: s_add_i32 s2, s2, s8 1014; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] 1015; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 1016; GFX8-NEXT: s_cbranch_scc1 .LBB3_1 1017; GFX8-NEXT: ; %bb.2: ; %ComputeEnd 1018; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1019; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1020; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1021; GFX8-NEXT: ; implicit-def: $vgpr0 1022; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc 1023; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 1024; GFX8-NEXT: s_cbranch_execz .LBB3_4 1025; GFX8-NEXT: ; %bb.3: 1026; GFX8-NEXT: s_load_dword s3, s[4:5], 0x44 1027; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 1028; GFX8-NEXT: v_mov_b32_e32 v0, s2 1029; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1030; GFX8-NEXT: v_mov_b32_e32 v2, s3 1031; GFX8-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc 1032; GFX8-NEXT: .LBB3_4: 1033; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] 1034; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1035; GFX8-NEXT: s_waitcnt vmcnt(0) 1036; GFX8-NEXT: v_readfirstlane_b32 s2, v0 1037; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v1 1038; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1039; GFX8-NEXT: v_mov_b32_e32 v0, s0 1040; GFX8-NEXT: v_mov_b32_e32 v1, s1 1041; GFX8-NEXT: flat_store_dword v[0:1], v2 1042; GFX8-NEXT: s_endpgm 
1043; 1044; GFX9-LABEL: struct_add_i32_varying_vdata: 1045; GFX9: ; %bb.0: ; %entry 1046; GFX9-NEXT: s_mov_b64 s[0:1], exec 1047; GFX9-NEXT: s_mov_b32 s2, 0 1048; GFX9-NEXT: ; implicit-def: $vgpr1 1049; GFX9-NEXT: .LBB3_1: ; %ComputeLoop 1050; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 1051; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] 1052; GFX9-NEXT: s_mov_b32 m0, s3 1053; GFX9-NEXT: v_readlane_b32 s8, v0, s3 1054; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 1055; GFX9-NEXT: v_writelane_b32 v1, s2, m0 1056; GFX9-NEXT: s_add_i32 s2, s2, s8 1057; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] 1058; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 1059; GFX9-NEXT: s_cbranch_scc1 .LBB3_1 1060; GFX9-NEXT: ; %bb.2: ; %ComputeEnd 1061; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1062; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1063; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1064; GFX9-NEXT: ; implicit-def: $vgpr0 1065; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 1066; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 1067; GFX9-NEXT: s_cbranch_execz .LBB3_4 1068; GFX9-NEXT: ; %bb.3: 1069; GFX9-NEXT: s_load_dword s3, s[4:5], 0x44 1070; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 1071; GFX9-NEXT: v_mov_b32_e32 v0, s2 1072; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1073; GFX9-NEXT: v_mov_b32_e32 v2, s3 1074; GFX9-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc 1075; GFX9-NEXT: .LBB3_4: 1076; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 1077; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1078; GFX9-NEXT: s_waitcnt vmcnt(0) 1079; GFX9-NEXT: v_readfirstlane_b32 s2, v0 1080; GFX9-NEXT: v_mov_b32_e32 v2, 0 1081; GFX9-NEXT: v_add_u32_e32 v0, s2, v1 1082; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1083; GFX9-NEXT: global_store_dword v2, v0, s[0:1] 1084; GFX9-NEXT: s_endpgm 1085; 1086; GFX10W64-LABEL: struct_add_i32_varying_vdata: 1087; GFX10W64: ; %bb.0: ; %entry 1088; GFX10W64-NEXT: s_mov_b64 s[0:1], exec 1089; GFX10W64-NEXT: s_mov_b32 s2, 0 1090; GFX10W64-NEXT: ; implicit-def: $vgpr1 1091; GFX10W64-NEXT: .LBB3_1: ; %ComputeLoop 
1092; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 1093; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1] 1094; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3 1095; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 1096; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 1097; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] 1098; GFX10W64-NEXT: s_add_i32 s2, s2, s8 1099; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 1100; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_1 1101; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd 1102; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1103; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1104; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1105; GFX10W64-NEXT: ; implicit-def: $vgpr0 1106; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc 1107; GFX10W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 1108; GFX10W64-NEXT: s_cbranch_execz .LBB3_4 1109; GFX10W64-NEXT: ; %bb.3: 1110; GFX10W64-NEXT: s_clause 0x1 1111; GFX10W64-NEXT: s_load_dword s3, s[4:5], 0x44 1112; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 1113; GFX10W64-NEXT: v_mov_b32_e32 v0, s2 1114; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 1115; GFX10W64-NEXT: v_mov_b32_e32 v2, s3 1116; GFX10W64-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc 1117; GFX10W64-NEXT: .LBB3_4: 1118; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 1119; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] 1120; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1121; GFX10W64-NEXT: s_waitcnt vmcnt(0) 1122; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 1123; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 1124; GFX10W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 1125; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 1126; GFX10W64-NEXT: global_store_dword v0, v1, s[0:1] 1127; GFX10W64-NEXT: s_endpgm 1128; 1129; GFX10W32-LABEL: struct_add_i32_varying_vdata: 1130; GFX10W32: ; %bb.0: ; %entry 1131; GFX10W32-NEXT: s_mov_b32 s1, exec_lo 1132; GFX10W32-NEXT: s_mov_b32 s0, 0 1133; GFX10W32-NEXT: ; implicit-def: $vgpr1 1134; GFX10W32-NEXT: .LBB3_1: ; %ComputeLoop 1135; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 
1136; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 1137; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2 1138; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2 1139; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2 1140; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 1141; GFX10W32-NEXT: s_add_i32 s0, s0, s3 1142; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 1143; GFX10W32-NEXT: s_cbranch_scc1 .LBB3_1 1144; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd 1145; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1146; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1147; GFX10W32-NEXT: ; implicit-def: $vgpr0 1148; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo 1149; GFX10W32-NEXT: s_xor_b32 s1, exec_lo, s1 1150; GFX10W32-NEXT: s_cbranch_execz .LBB3_4 1151; GFX10W32-NEXT: ; %bb.3: 1152; GFX10W32-NEXT: s_clause 0x1 1153; GFX10W32-NEXT: s_load_dword s2, s[4:5], 0x44 1154; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 1155; GFX10W32-NEXT: v_mov_b32_e32 v0, s0 1156; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 1157; GFX10W32-NEXT: v_mov_b32_e32 v2, s2 1158; GFX10W32-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc 1159; GFX10W32-NEXT: .LBB3_4: 1160; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 1161; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 1162; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1163; GFX10W32-NEXT: s_waitcnt vmcnt(0) 1164; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 1165; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 1166; GFX10W32-NEXT: v_add_nc_u32_e32 v1, s2, v1 1167; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 1168; GFX10W32-NEXT: global_store_dword v0, v1, s[0:1] 1169; GFX10W32-NEXT: s_endpgm 1170; 1171; GFX11W64-LABEL: struct_add_i32_varying_vdata: 1172; GFX11W64: ; %bb.0: ; %entry 1173; GFX11W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 1174; GFX11W64-NEXT: s_mov_b64 s[0:1], exec 1175; GFX11W64-NEXT: s_mov_b32 s2, 0 1176; GFX11W64-NEXT: ; implicit-def: $vgpr0 1177; GFX11W64-NEXT: .LBB3_1: ; %ComputeLoop 1178; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 1179; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1] 1180; GFX11W64-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 1181; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3 1182; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 1183; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 1184; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] 1185; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) 1186; GFX11W64-NEXT: s_add_i32 s2, s2, s8 1187; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 1188; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_1 1189; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd 1190; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 1191; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1192; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 1193; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 1194; GFX11W64-NEXT: ; implicit-def: $vgpr1 1195; GFX11W64-NEXT: s_and_saveexec_b64 s[0:1], vcc 1196; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1197; GFX11W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 1198; GFX11W64-NEXT: s_cbranch_execz .LBB3_4 1199; GFX11W64-NEXT: ; %bb.3: 1200; GFX11W64-NEXT: s_clause 0x1 1201; GFX11W64-NEXT: s_load_b32 s3, s[4:5], 0x44 1202; GFX11W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 1203; GFX11W64-NEXT: v_mov_b32_e32 v1, s2 1204; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) 1205; GFX11W64-NEXT: v_mov_b32_e32 v2, s3 1206; GFX11W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc 1207; GFX11W64-NEXT: .LBB3_4: 1208; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] 1209; GFX11W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1210; GFX11W64-NEXT: s_waitcnt vmcnt(0) 1211; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 1212; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 1213; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) 1214; GFX11W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 1215; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) 1216; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1] 1217; GFX11W64-NEXT: s_endpgm 1218; 1219; GFX11W32-LABEL: struct_add_i32_varying_vdata: 1220; GFX11W32: ; %bb.0: ; %entry 1221; GFX11W32-NEXT: v_and_b32_e32 v1, 0x3ff, v0 1222; GFX11W32-NEXT: s_mov_b32 
s1, exec_lo 1223; GFX11W32-NEXT: s_mov_b32 s0, 0 1224; GFX11W32-NEXT: ; implicit-def: $vgpr0 1225; GFX11W32-NEXT: .LBB3_1: ; %ComputeLoop 1226; GFX11W32-NEXT: ; =>This Inner Loop Header: Depth=1 1227; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 1228; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 1229; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2 1230; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 1231; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 1232; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 1233; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) 1234; GFX11W32-NEXT: s_add_i32 s0, s0, s3 1235; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 1236; GFX11W32-NEXT: s_cbranch_scc1 .LBB3_1 1237; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd 1238; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 1239; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 1240; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 1241; GFX11W32-NEXT: ; implicit-def: $vgpr1 1242; GFX11W32-NEXT: s_and_saveexec_b32 s1, vcc_lo 1243; GFX11W32-NEXT: s_xor_b32 s1, exec_lo, s1 1244; GFX11W32-NEXT: s_cbranch_execz .LBB3_4 1245; GFX11W32-NEXT: ; %bb.3: 1246; GFX11W32-NEXT: s_clause 0x1 1247; GFX11W32-NEXT: s_load_b32 s2, s[4:5], 0x44 1248; GFX11W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 1249; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) 1250; GFX11W32-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s2 1251; GFX11W32-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc 1252; GFX11W32-NEXT: .LBB3_4: 1253; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 1254; GFX11W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1255; GFX11W32-NEXT: s_waitcnt vmcnt(0) 1256; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 1257; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) 1258; GFX11W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0 1259; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) 1260; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] 1261; GFX11W32-NEXT: s_endpgm 1262; 1263; GFX12W64-LABEL: struct_add_i32_varying_vdata: 1264; 
GFX12W64: ; %bb.0: ; %entry 1265; GFX12W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 1266; GFX12W64-NEXT: s_mov_b64 s[0:1], exec 1267; GFX12W64-NEXT: s_mov_b32 s2, 0 1268; GFX12W64-NEXT: ; implicit-def: $vgpr0 1269; GFX12W64-NEXT: .LBB3_1: ; %ComputeLoop 1270; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1 1271; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1] 1272; GFX12W64-NEXT: s_wait_alu 0xfffe 1273; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3 1274; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 1275; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 1276; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] 1277; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) 1278; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 1279; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 1280; GFX12W64-NEXT: s_cbranch_scc1 .LBB3_1 1281; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd 1282; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 1283; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1284; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 1285; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 1286; GFX12W64-NEXT: ; implicit-def: $vgpr1 1287; GFX12W64-NEXT: s_and_saveexec_b64 s[0:1], vcc 1288; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1289; GFX12W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 1290; GFX12W64-NEXT: s_cbranch_execz .LBB3_4 1291; GFX12W64-NEXT: ; %bb.3: 1292; GFX12W64-NEXT: s_clause 0x1 1293; GFX12W64-NEXT: s_load_b32 s3, s[4:5], 0x44 1294; GFX12W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 1295; GFX12W64-NEXT: s_wait_alu 0xfffe 1296; GFX12W64-NEXT: v_mov_b32_e32 v1, s2 1297; GFX12W64-NEXT: s_wait_kmcnt 0x0 1298; GFX12W64-NEXT: v_mov_b32_e32 v2, s3 1299; GFX12W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN 1300; GFX12W64-NEXT: .LBB3_4: 1301; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] 1302; GFX12W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1303; GFX12W64-NEXT: s_wait_loadcnt 0x0 1304; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 1305; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 
1306; GFX12W64-NEXT: s_wait_alu 0xfffe 1307; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) 1308; GFX12W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 1309; GFX12W64-NEXT: s_wait_kmcnt 0x0 1310; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] 1311; GFX12W64-NEXT: s_endpgm 1312; 1313; GFX12W32-LABEL: struct_add_i32_varying_vdata: 1314; GFX12W32: ; %bb.0: ; %entry 1315; GFX12W32-NEXT: v_and_b32_e32 v1, 0x3ff, v0 1316; GFX12W32-NEXT: s_mov_b32 s1, exec_lo 1317; GFX12W32-NEXT: s_mov_b32 s0, 0 1318; GFX12W32-NEXT: ; implicit-def: $vgpr0 1319; GFX12W32-NEXT: .LBB3_1: ; %ComputeLoop 1320; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 1321; GFX12W32-NEXT: s_wait_alu 0xfffe 1322; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 1323; GFX12W32-NEXT: s_wait_alu 0xfffe 1324; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 1325; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 1326; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 1327; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 1328; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) 1329; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 1330; GFX12W32-NEXT: s_wait_alu 0xfffe 1331; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 1332; GFX12W32-NEXT: s_cbranch_scc1 .LBB3_1 1333; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd 1334; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 1335; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) 1336; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 1337; GFX12W32-NEXT: ; implicit-def: $vgpr1 1338; GFX12W32-NEXT: s_and_saveexec_b32 s1, vcc_lo 1339; GFX12W32-NEXT: s_wait_alu 0xfffe 1340; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1 1341; GFX12W32-NEXT: s_cbranch_execz .LBB3_4 1342; GFX12W32-NEXT: ; %bb.3: 1343; GFX12W32-NEXT: s_clause 0x1 1344; GFX12W32-NEXT: s_load_b32 s2, s[4:5], 0x44 1345; GFX12W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 1346; GFX12W32-NEXT: s_wait_kmcnt 0x0 1347; GFX12W32-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s2 1348; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN 1349; GFX12W32-NEXT: .LBB3_4: 1350; GFX12W32-NEXT: 
s_wait_alu 0xfffe 1351; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 1352; GFX12W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1353; GFX12W32-NEXT: s_wait_loadcnt 0x0 1354; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 1355; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) 1356; GFX12W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0 1357; GFX12W32-NEXT: s_wait_kmcnt 0x0 1358; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] 1359; GFX12W32-NEXT: s_endpgm 1360entry: 1361 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1362 %old = call i32 @llvm.amdgcn.struct.ptr.buffer.atomic.add(i32 %lane, ptr addrspace(8) %inout, i32 %vindex, i32 0, i32 0, i32 0) 1363 store i32 %old, ptr addrspace(1) %out 1364 ret void 1365} 1366 1367define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr addrspace(8) %inout) { 1368; GFX6-LABEL: add_i32_varying_offset: 1369; GFX6: ; %bb.0: ; %entry 1370; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd 1371; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 1372; GFX6-NEXT: v_mov_b32_e32 v1, 1 1373; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1374; GFX6-NEXT: buffer_atomic_add v1, v0, s[0:3], 0 offen glc 1375; GFX6-NEXT: s_mov_b32 s7, 0xf000 1376; GFX6-NEXT: s_mov_b32 s6, -1 1377; GFX6-NEXT: s_waitcnt vmcnt(0) 1378; GFX6-NEXT: buffer_store_dword v1, off, s[4:7], 0 1379; GFX6-NEXT: s_endpgm 1380; 1381; GFX8-LABEL: add_i32_varying_offset: 1382; GFX8: ; %bb.0: ; %entry 1383; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 1384; GFX8-NEXT: v_mov_b32_e32 v2, 1 1385; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1386; GFX8-NEXT: buffer_atomic_add v2, v0, s[0:3], 0 offen glc 1387; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1388; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1389; GFX8-NEXT: v_mov_b32_e32 v0, s0 1390; GFX8-NEXT: v_mov_b32_e32 v1, s1 1391; GFX8-NEXT: s_waitcnt vmcnt(0) 1392; GFX8-NEXT: flat_store_dword v[0:1], v2 1393; GFX8-NEXT: s_endpgm 1394; 1395; GFX9-LABEL: add_i32_varying_offset: 1396; GFX9: ; %bb.0: ; %entry 1397; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 
0x34 1398; GFX9-NEXT: v_mov_b32_e32 v1, 1 1399; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1400; GFX9-NEXT: buffer_atomic_add v1, v0, s[0:3], 0 offen glc 1401; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1402; GFX9-NEXT: v_mov_b32_e32 v0, 0 1403; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1404; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1405; GFX9-NEXT: s_endpgm 1406; 1407; GFX10-LABEL: add_i32_varying_offset: 1408; GFX10: ; %bb.0: ; %entry 1409; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 1410; GFX10-NEXT: v_mov_b32_e32 v1, 1 1411; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1412; GFX10-NEXT: buffer_atomic_add v1, v0, s[0:3], 0 offen glc 1413; GFX10-NEXT: s_waitcnt_depctr 0xffe3 1414; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1415; GFX10-NEXT: v_mov_b32_e32 v0, 0 1416; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1417; GFX10-NEXT: global_store_dword v0, v1, s[0:1] 1418; GFX10-NEXT: s_endpgm 1419; 1420; GFX11W64-LABEL: add_i32_varying_offset: 1421; GFX11W64: ; %bb.0: ; %entry 1422; GFX11W64-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 1423; GFX11W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1424; GFX11W64-NEXT: v_mov_b32_e32 v1, 1 1425; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) 1426; GFX11W64-NEXT: buffer_atomic_add_u32 v1, v0, s[0:3], 0 offen glc 1427; GFX11W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1428; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 1429; GFX11W64-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1430; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] 1431; GFX11W64-NEXT: s_endpgm 1432; 1433; GFX11W32-LABEL: add_i32_varying_offset: 1434; GFX11W32: ; %bb.0: ; %entry 1435; GFX11W32-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 1436; GFX11W32-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 1437; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) 1438; GFX11W32-NEXT: buffer_atomic_add_u32 v1, v0, s[0:3], 0 offen glc 1439; GFX11W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1440; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 1441; GFX11W32-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1442; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] 1443; 
GFX11W32-NEXT: s_endpgm 1444; 1445; GFX12W64-LABEL: add_i32_varying_offset: 1446; GFX12W64: ; %bb.0: ; %entry 1447; GFX12W64-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 1448; GFX12W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1449; GFX12W64-NEXT: v_mov_b32_e32 v1, 1 1450; GFX12W64-NEXT: s_wait_kmcnt 0x0 1451; GFX12W64-NEXT: buffer_atomic_add_u32 v1, v0, s[0:3], null offen th:TH_ATOMIC_RETURN 1452; GFX12W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1453; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 1454; GFX12W64-NEXT: s_wait_loadcnt 0x0 1455; GFX12W64-NEXT: s_wait_kmcnt 0x0 1456; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1] 1457; GFX12W64-NEXT: s_endpgm 1458; 1459; GFX12W32-LABEL: add_i32_varying_offset: 1460; GFX12W32: ; %bb.0: ; %entry 1461; GFX12W32-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 1462; GFX12W32-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 1463; GFX12W32-NEXT: s_wait_kmcnt 0x0 1464; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v0, s[0:3], null offen th:TH_ATOMIC_RETURN 1465; GFX12W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1466; GFX12W32-NEXT: v_mov_b32_e32 v0, 0 1467; GFX12W32-NEXT: s_wait_loadcnt 0x0 1468; GFX12W32-NEXT: s_wait_kmcnt 0x0 1469; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1] 1470; GFX12W32-NEXT: s_endpgm 1471entry: 1472 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1473 %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32 1, ptr addrspace(8) %inout, i32 %lane, i32 0, i32 0) 1474 store i32 %old, ptr addrspace(1) %out 1475 ret void 1476} 1477 1478define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace(8) %inout) { 1479; GFX6-LABEL: sub_i32_constant: 1480; GFX6: ; %bb.0: ; %entry 1481; GFX6-NEXT: s_mov_b64 s[2:3], exec 1482; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1483; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 1484; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1485; GFX6-NEXT: ; implicit-def: $vgpr1 1486; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc 1487; GFX6-NEXT: s_cbranch_execz .LBB5_2 1488; GFX6-NEXT: ; %bb.1: 1489; 
GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd 1490; GFX6-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1491; GFX6-NEXT: s_mul_i32 s2, s2, 5 1492; GFX6-NEXT: v_mov_b32_e32 v1, s2 1493; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1494; GFX6-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 1495; GFX6-NEXT: .LBB5_2: 1496; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] 1497; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1498; GFX6-NEXT: s_mov_b32 s3, 0xf000 1499; GFX6-NEXT: s_mov_b32 s2, -1 1500; GFX6-NEXT: s_waitcnt vmcnt(0) 1501; GFX6-NEXT: v_readfirstlane_b32 s4, v1 1502; GFX6-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1503; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 1504; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1505; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 1506; GFX6-NEXT: s_endpgm 1507; 1508; GFX8-LABEL: sub_i32_constant: 1509; GFX8: ; %bb.0: ; %entry 1510; GFX8-NEXT: s_mov_b64 s[2:3], exec 1511; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1512; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1513; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1514; GFX8-NEXT: ; implicit-def: $vgpr1 1515; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc 1516; GFX8-NEXT: s_cbranch_execz .LBB5_2 1517; GFX8-NEXT: ; %bb.1: 1518; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 1519; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1520; GFX8-NEXT: s_mul_i32 s2, s2, 5 1521; GFX8-NEXT: v_mov_b32_e32 v1, s2 1522; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1523; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 1524; GFX8-NEXT: .LBB5_2: 1525; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] 1526; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1527; GFX8-NEXT: s_waitcnt vmcnt(0) 1528; GFX8-NEXT: v_readfirstlane_b32 s2, v1 1529; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1530; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v0 1531; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1532; GFX8-NEXT: v_mov_b32_e32 v0, s0 1533; GFX8-NEXT: v_mov_b32_e32 v1, s1 1534; GFX8-NEXT: flat_store_dword v[0:1], v2 1535; GFX8-NEXT: s_endpgm 1536; 1537; GFX9-LABEL: sub_i32_constant: 1538; GFX9: ; %bb.0: ; %entry 1539; GFX9-NEXT: 
s_mov_b64 s[2:3], exec 1540; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1541; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1542; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1543; GFX9-NEXT: ; implicit-def: $vgpr1 1544; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 1545; GFX9-NEXT: s_cbranch_execz .LBB5_2 1546; GFX9-NEXT: ; %bb.1: 1547; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 1548; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1549; GFX9-NEXT: s_mul_i32 s2, s2, 5 1550; GFX9-NEXT: v_mov_b32_e32 v1, s2 1551; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1552; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 1553; GFX9-NEXT: .LBB5_2: 1554; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 1555; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1556; GFX9-NEXT: s_waitcnt vmcnt(0) 1557; GFX9-NEXT: v_readfirstlane_b32 s2, v1 1558; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1559; GFX9-NEXT: v_mov_b32_e32 v2, 0 1560; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 1561; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1562; GFX9-NEXT: global_store_dword v2, v0, s[0:1] 1563; GFX9-NEXT: s_endpgm 1564; 1565; GFX10W64-LABEL: sub_i32_constant: 1566; GFX10W64: ; %bb.0: ; %entry 1567; GFX10W64-NEXT: s_mov_b64 s[2:3], exec 1568; GFX10W64-NEXT: ; implicit-def: $vgpr1 1569; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1570; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1571; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1572; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc 1573; GFX10W64-NEXT: s_cbranch_execz .LBB5_2 1574; GFX10W64-NEXT: ; %bb.1: 1575; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 1576; GFX10W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1577; GFX10W64-NEXT: s_mul_i32 s2, s2, 5 1578; GFX10W64-NEXT: v_mov_b32_e32 v1, s2 1579; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 1580; GFX10W64-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 1581; GFX10W64-NEXT: .LBB5_2: 1582; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 1583; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] 1584; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1585; GFX10W64-NEXT: s_waitcnt vmcnt(0) 1586; 
GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 1587; GFX10W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1588; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 1589; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1590; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 1591; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1] 1592; GFX10W64-NEXT: s_endpgm 1593; 1594; GFX10W32-LABEL: sub_i32_constant: 1595; GFX10W32: ; %bb.0: ; %entry 1596; GFX10W32-NEXT: s_mov_b32 s1, exec_lo 1597; GFX10W32-NEXT: ; implicit-def: $vgpr1 1598; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 1599; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1600; GFX10W32-NEXT: s_and_saveexec_b32 s0, vcc_lo 1601; GFX10W32-NEXT: s_cbranch_execz .LBB5_2 1602; GFX10W32-NEXT: ; %bb.1: 1603; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 1604; GFX10W32-NEXT: s_bcnt1_i32_b32 s1, s1 1605; GFX10W32-NEXT: s_mul_i32 s1, s1, 5 1606; GFX10W32-NEXT: v_mov_b32_e32 v1, s1 1607; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 1608; GFX10W32-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 1609; GFX10W32-NEXT: .LBB5_2: 1610; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 1611; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 1612; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1613; GFX10W32-NEXT: s_waitcnt vmcnt(0) 1614; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1 1615; GFX10W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1616; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 1617; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1618; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 1619; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1] 1620; GFX10W32-NEXT: s_endpgm 1621; 1622; GFX11W64-LABEL: sub_i32_constant: 1623; GFX11W64: ; %bb.0: ; %entry 1624; GFX11W64-NEXT: s_mov_b64 s[2:3], exec 1625; GFX11W64-NEXT: s_mov_b64 s[0:1], exec 1626; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1627; GFX11W64-NEXT: ; implicit-def: $vgpr1 1628; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1629; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1630; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 1631; 
GFX11W64-NEXT: s_cbranch_execz .LBB5_2 1632; GFX11W64-NEXT: ; %bb.1: 1633; GFX11W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 1634; GFX11W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1635; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 1636; GFX11W64-NEXT: s_mul_i32 s2, s2, 5 1637; GFX11W64-NEXT: v_mov_b32_e32 v1, s2 1638; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) 1639; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc 1640; GFX11W64-NEXT: .LBB5_2: 1641; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] 1642; GFX11W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1643; GFX11W64-NEXT: s_waitcnt vmcnt(0) 1644; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 1645; GFX11W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1646; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 1647; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) 1648; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1649; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) 1650; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1] 1651; GFX11W64-NEXT: s_endpgm 1652; 1653; GFX11W32-LABEL: sub_i32_constant: 1654; GFX11W32: ; %bb.0: ; %entry 1655; GFX11W32-NEXT: s_mov_b32 s1, exec_lo 1656; GFX11W32-NEXT: s_mov_b32 s0, exec_lo 1657; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 1658; GFX11W32-NEXT: ; implicit-def: $vgpr1 1659; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) 1660; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 1661; GFX11W32-NEXT: s_cbranch_execz .LBB5_2 1662; GFX11W32-NEXT: ; %bb.1: 1663; GFX11W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 1664; GFX11W32-NEXT: s_bcnt1_i32_b32 s1, s1 1665; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 1666; GFX11W32-NEXT: s_mul_i32 s1, s1, 5 1667; GFX11W32-NEXT: v_mov_b32_e32 v1, s1 1668; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) 1669; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc 1670; GFX11W32-NEXT: .LBB5_2: 1671; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 1672; GFX11W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1673; GFX11W32-NEXT: s_waitcnt vmcnt(0) 1674; 
GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 1675; GFX11W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1676; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 1677; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) 1678; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1679; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) 1680; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] 1681; GFX11W32-NEXT: s_endpgm 1682; 1683; GFX12W64-LABEL: sub_i32_constant: 1684; GFX12W64: ; %bb.0: ; %entry 1685; GFX12W64-NEXT: s_mov_b64 s[2:3], exec 1686; GFX12W64-NEXT: s_mov_b64 s[0:1], exec 1687; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1688; GFX12W64-NEXT: ; implicit-def: $vgpr1 1689; GFX12W64-NEXT: s_wait_alu 0xfffe 1690; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1691; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1692; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 1693; GFX12W64-NEXT: s_cbranch_execz .LBB5_2 1694; GFX12W64-NEXT: ; %bb.1: 1695; GFX12W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 1696; GFX12W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1697; GFX12W64-NEXT: s_wait_alu 0xfffe 1698; GFX12W64-NEXT: s_mul_i32 s2, s2, 5 1699; GFX12W64-NEXT: s_wait_alu 0xfffe 1700; GFX12W64-NEXT: v_mov_b32_e32 v1, s2 1701; GFX12W64-NEXT: s_wait_kmcnt 0x0 1702; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN 1703; GFX12W64-NEXT: .LBB5_2: 1704; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] 1705; GFX12W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1706; GFX12W64-NEXT: s_wait_loadcnt 0x0 1707; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 1708; GFX12W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1709; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 1710; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) 1711; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1712; GFX12W64-NEXT: s_wait_kmcnt 0x0 1713; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] 1714; GFX12W64-NEXT: s_endpgm 1715; 1716; GFX12W32-LABEL: sub_i32_constant: 1717; GFX12W32: ; %bb.0: ; %entry 1718; GFX12W32-NEXT: s_mov_b32 s1, exec_lo 1719; GFX12W32-NEXT: s_mov_b32 
s0, exec_lo 1720; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 1721; GFX12W32-NEXT: ; implicit-def: $vgpr1 1722; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) 1723; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 1724; GFX12W32-NEXT: s_cbranch_execz .LBB5_2 1725; GFX12W32-NEXT: ; %bb.1: 1726; GFX12W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 1727; GFX12W32-NEXT: s_wait_alu 0xfffe 1728; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1 1729; GFX12W32-NEXT: s_wait_alu 0xfffe 1730; GFX12W32-NEXT: s_mul_i32 s1, s1, 5 1731; GFX12W32-NEXT: s_wait_alu 0xfffe 1732; GFX12W32-NEXT: v_mov_b32_e32 v1, s1 1733; GFX12W32-NEXT: s_wait_kmcnt 0x0 1734; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN 1735; GFX12W32-NEXT: .LBB5_2: 1736; GFX12W32-NEXT: s_wait_alu 0xfffe 1737; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 1738; GFX12W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1739; GFX12W32-NEXT: s_wait_loadcnt 0x0 1740; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 1741; GFX12W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1742; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 1743; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) 1744; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1745; GFX12W32-NEXT: s_wait_kmcnt 0x0 1746; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] 1747; GFX12W32-NEXT: s_endpgm 1748entry: 1749 %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.sub(i32 5, ptr addrspace(8) %inout, i32 0, i32 0, i32 0) 1750 store i32 %old, ptr addrspace(1) %out 1751 ret void 1752} 1753 ; sub_i32_uniform: raw buffer atomic sub whose operand is the i32 kernel argument %subitive (loaded into an SGPR in the checks below), with zero voffset/soffset; the returned value is stored to %out. ; The checks expect a single buffer atomic sub of %subitive * popcount(exec), issued only by the lane whose mbcnt is 0; every lane then rebuilds its own result as readfirstlane(old) - %subitive * mbcnt. ; The CHECK lines are autogenerated (see the NOTE at the top of the file): regenerate them with utils/update_llc_test_checks.py instead of editing by hand. 1754define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(8) %inout, i32 %subitive) { 1755; GFX6-LABEL: sub_i32_uniform: 1756; GFX6: ; %bb.0: ; %entry 1757; GFX6-NEXT: s_mov_b64 s[2:3], exec 1758; GFX6-NEXT: s_load_dword s6, s[4:5], 0x11 1759; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1760; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 1761; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1762; GFX6-NEXT: ; implicit-def: $vgpr1 1763; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc 1764; GFX6-NEXT: s_cbranch_execz 
.LBB6_2 1765; GFX6-NEXT: ; %bb.1: 1766; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd 1767; GFX6-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1768; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1769; GFX6-NEXT: s_mul_i32 s2, s6, s2 1770; GFX6-NEXT: v_mov_b32_e32 v1, s2 1771; GFX6-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 1772; GFX6-NEXT: .LBB6_2: 1773; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] 1774; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1775; GFX6-NEXT: s_mov_b32 s3, 0xf000 1776; GFX6-NEXT: s_mov_b32 s2, -1 1777; GFX6-NEXT: s_waitcnt vmcnt(0) 1778; GFX6-NEXT: v_readfirstlane_b32 s4, v1 1779; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1780; GFX6-NEXT: v_mul_lo_u32 v0, s6, v0 1781; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 1782; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 1783; GFX6-NEXT: s_endpgm 1784; 1785; GFX8-LABEL: sub_i32_uniform: 1786; GFX8: ; %bb.0: ; %entry 1787; GFX8-NEXT: s_load_dword s6, s[4:5], 0x44 1788; GFX8-NEXT: s_mov_b64 s[2:3], exec 1789; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1790; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1791; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1792; GFX8-NEXT: ; implicit-def: $vgpr1 1793; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc 1794; GFX8-NEXT: s_cbranch_execz .LBB6_2 1795; GFX8-NEXT: ; %bb.1: 1796; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 1797; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1798; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1799; GFX8-NEXT: s_mul_i32 s2, s6, s2 1800; GFX8-NEXT: v_mov_b32_e32 v1, s2 1801; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 1802; GFX8-NEXT: .LBB6_2: 1803; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] 1804; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1805; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1806; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 1807; GFX8-NEXT: s_waitcnt vmcnt(0) 1808; GFX8-NEXT: v_readfirstlane_b32 s2, v1 1809; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v0 1810; GFX8-NEXT: v_mov_b32_e32 v0, s0 1811; GFX8-NEXT: v_mov_b32_e32 v1, s1 1812; GFX8-NEXT: flat_store_dword v[0:1], v2 1813; GFX8-NEXT: s_endpgm 1814; 1815; 
GFX9-LABEL: sub_i32_uniform: 1816; GFX9: ; %bb.0: ; %entry 1817; GFX9-NEXT: s_load_dword s6, s[4:5], 0x44 1818; GFX9-NEXT: s_mov_b64 s[2:3], exec 1819; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1820; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1821; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1822; GFX9-NEXT: ; implicit-def: $vgpr1 1823; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 1824; GFX9-NEXT: s_cbranch_execz .LBB6_2 1825; GFX9-NEXT: ; %bb.1: 1826; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 1827; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1828; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1829; GFX9-NEXT: s_mul_i32 s2, s6, s2 1830; GFX9-NEXT: v_mov_b32_e32 v1, s2 1831; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 1832; GFX9-NEXT: .LBB6_2: 1833; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 1834; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1835; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1836; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 1837; GFX9-NEXT: s_waitcnt vmcnt(0) 1838; GFX9-NEXT: v_readfirstlane_b32 s2, v1 1839; GFX9-NEXT: v_mov_b32_e32 v2, 0 1840; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 1841; GFX9-NEXT: global_store_dword v2, v0, s[0:1] 1842; GFX9-NEXT: s_endpgm 1843; 1844; GFX10W64-LABEL: sub_i32_uniform: 1845; GFX10W64: ; %bb.0: ; %entry 1846; GFX10W64-NEXT: s_load_dword s6, s[4:5], 0x44 1847; GFX10W64-NEXT: s_mov_b64 s[2:3], exec 1848; GFX10W64-NEXT: ; implicit-def: $vgpr1 1849; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1850; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1851; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1852; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc 1853; GFX10W64-NEXT: s_cbranch_execz .LBB6_2 1854; GFX10W64-NEXT: ; %bb.1: 1855; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 1856; GFX10W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1857; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 1858; GFX10W64-NEXT: s_mul_i32 s2, s6, s2 1859; GFX10W64-NEXT: v_mov_b32_e32 v1, s2 1860; GFX10W64-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 1861; GFX10W64-NEXT: .LBB6_2: 1862; GFX10W64-NEXT: 
s_waitcnt_depctr 0xffe3 1863; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] 1864; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1865; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 1866; GFX10W64-NEXT: v_mul_lo_u32 v0, s6, v0 1867; GFX10W64-NEXT: s_waitcnt vmcnt(0) 1868; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 1869; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 1870; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1871; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1] 1872; GFX10W64-NEXT: s_endpgm 1873; 1874; GFX10W32-LABEL: sub_i32_uniform: 1875; GFX10W32: ; %bb.0: ; %entry 1876; GFX10W32-NEXT: s_load_dword s0, s[4:5], 0x44 1877; GFX10W32-NEXT: s_mov_b32 s2, exec_lo 1878; GFX10W32-NEXT: ; implicit-def: $vgpr1 1879; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1880; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1881; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo 1882; GFX10W32-NEXT: s_cbranch_execz .LBB6_2 1883; GFX10W32-NEXT: ; %bb.1: 1884; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 1885; GFX10W32-NEXT: s_bcnt1_i32_b32 s2, s2 1886; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 1887; GFX10W32-NEXT: s_mul_i32 s2, s0, s2 1888; GFX10W32-NEXT: v_mov_b32_e32 v1, s2 1889; GFX10W32-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 1890; GFX10W32-NEXT: .LBB6_2: 1891; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 1892; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 1893; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 1894; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 1895; GFX10W32-NEXT: v_mul_lo_u32 v0, s0, v0 1896; GFX10W32-NEXT: s_waitcnt vmcnt(0) 1897; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1 1898; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 1899; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1900; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] 1901; GFX10W32-NEXT: s_endpgm 1902; 1903; GFX11W64-LABEL: sub_i32_uniform: 1904; GFX11W64: ; %bb.0: ; %entry 1905; GFX11W64-NEXT: s_load_b32 s6, s[4:5], 0x44 1906; GFX11W64-NEXT: s_mov_b64 s[2:3], exec 1907; GFX11W64-NEXT: s_mov_b64 s[0:1], exec 1908; GFX11W64-NEXT: 
v_mbcnt_lo_u32_b32 v0, s2, 0 1909; GFX11W64-NEXT: ; implicit-def: $vgpr1 1910; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1911; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1912; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 1913; GFX11W64-NEXT: s_cbranch_execz .LBB6_2 1914; GFX11W64-NEXT: ; %bb.1: 1915; GFX11W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 1916; GFX11W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1917; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) 1918; GFX11W64-NEXT: s_mul_i32 s2, s6, s2 1919; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1920; GFX11W64-NEXT: v_mov_b32_e32 v1, s2 1921; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc 1922; GFX11W64-NEXT: .LBB6_2: 1923; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] 1924; GFX11W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1925; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) 1926; GFX11W64-NEXT: v_mul_lo_u32 v0, s6, v0 1927; GFX11W64-NEXT: s_waitcnt vmcnt(0) 1928; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 1929; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 1930; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) 1931; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1932; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1] 1933; GFX11W64-NEXT: s_endpgm 1934; 1935; GFX11W32-LABEL: sub_i32_uniform: 1936; GFX11W32: ; %bb.0: ; %entry 1937; GFX11W32-NEXT: s_load_b32 s0, s[4:5], 0x44 1938; GFX11W32-NEXT: s_mov_b32 s2, exec_lo 1939; GFX11W32-NEXT: s_mov_b32 s1, exec_lo 1940; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1941; GFX11W32-NEXT: ; implicit-def: $vgpr1 1942; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) 1943; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 1944; GFX11W32-NEXT: s_cbranch_execz .LBB6_2 1945; GFX11W32-NEXT: ; %bb.1: 1946; GFX11W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 1947; GFX11W32-NEXT: s_bcnt1_i32_b32 s2, s2 1948; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) 1949; GFX11W32-NEXT: s_mul_i32 s2, s0, s2 1950; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1951; GFX11W32-NEXT: v_mov_b32_e32 v1, s2 1952; 
GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc 1953; GFX11W32-NEXT: .LBB6_2: 1954; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 1955; GFX11W32-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 1956; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) 1957; GFX11W32-NEXT: v_mul_lo_u32 v0, s0, v0 1958; GFX11W32-NEXT: s_waitcnt vmcnt(0) 1959; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1 1960; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 1961; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) 1962; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1963; GFX11W32-NEXT: global_store_b32 v1, v0, s[2:3] 1964; GFX11W32-NEXT: s_endpgm 1965; 1966; GFX12W64-LABEL: sub_i32_uniform: 1967; GFX12W64: ; %bb.0: ; %entry 1968; GFX12W64-NEXT: s_load_b32 s6, s[4:5], 0x44 1969; GFX12W64-NEXT: s_mov_b64 s[2:3], exec 1970; GFX12W64-NEXT: s_mov_b64 s[0:1], exec 1971; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1972; GFX12W64-NEXT: ; implicit-def: $vgpr1 1973; GFX12W64-NEXT: s_wait_alu 0xfffe 1974; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1975; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1976; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 1977; GFX12W64-NEXT: s_cbranch_execz .LBB6_2 1978; GFX12W64-NEXT: ; %bb.1: 1979; GFX12W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 1980; GFX12W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1981; GFX12W64-NEXT: s_wait_kmcnt 0x0 1982; GFX12W64-NEXT: s_wait_alu 0xfffe 1983; GFX12W64-NEXT: s_mul_i32 s2, s6, s2 1984; GFX12W64-NEXT: s_wait_alu 0xfffe 1985; GFX12W64-NEXT: v_mov_b32_e32 v1, s2 1986; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN 1987; GFX12W64-NEXT: .LBB6_2: 1988; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] 1989; GFX12W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1990; GFX12W64-NEXT: s_wait_kmcnt 0x0 1991; GFX12W64-NEXT: v_mul_lo_u32 v0, s6, v0 1992; GFX12W64-NEXT: s_wait_loadcnt 0x0 1993; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 1994; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 1995; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) 1996; 
GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1997; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] 1998; GFX12W64-NEXT: s_endpgm 1999; 2000; GFX12W32-LABEL: sub_i32_uniform: 2001; GFX12W32: ; %bb.0: ; %entry 2002; GFX12W32-NEXT: s_load_b32 s0, s[4:5], 0x44 2003; GFX12W32-NEXT: s_mov_b32 s2, exec_lo 2004; GFX12W32-NEXT: s_mov_b32 s1, exec_lo 2005; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 2006; GFX12W32-NEXT: ; implicit-def: $vgpr1 2007; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) 2008; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 2009; GFX12W32-NEXT: s_cbranch_execz .LBB6_2 2010; GFX12W32-NEXT: ; %bb.1: 2011; GFX12W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 2012; GFX12W32-NEXT: s_wait_alu 0xfffe 2013; GFX12W32-NEXT: s_bcnt1_i32_b32 s2, s2 2014; GFX12W32-NEXT: s_wait_kmcnt 0x0 2015; GFX12W32-NEXT: s_wait_alu 0xfffe 2016; GFX12W32-NEXT: s_mul_i32 s2, s0, s2 2017; GFX12W32-NEXT: s_wait_alu 0xfffe 2018; GFX12W32-NEXT: v_mov_b32_e32 v1, s2 2019; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN 2020; GFX12W32-NEXT: .LBB6_2: 2021; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 2022; GFX12W32-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 2023; GFX12W32-NEXT: s_wait_kmcnt 0x0 2024; GFX12W32-NEXT: v_mul_lo_u32 v0, s0, v0 2025; GFX12W32-NEXT: s_wait_loadcnt 0x0 2026; GFX12W32-NEXT: v_readfirstlane_b32 s0, v1 2027; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 2028; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) 2029; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 2030; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3] 2031; GFX12W32-NEXT: s_endpgm 2032entry: 2033 %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.sub(i32 %subitive, ptr addrspace(8) %inout, i32 0, i32 0, i32 0) 2034 store i32 %old, ptr addrspace(1) %out 2035 ret void 2036} 2037 2038define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout) { 2039; GFX6-LABEL: sub_i32_varying_vdata: 2040; GFX6: ; %bb.0: ; %entry 2041; GFX6-NEXT: s_mov_b64 s[0:1], exec 2042; GFX6-NEXT: 
s_mov_b32 s2, 0 2043; GFX6-NEXT: ; implicit-def: $vgpr1 2044; GFX6-NEXT: .LBB7_1: ; %ComputeLoop 2045; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 2046; GFX6-NEXT: s_ff1_i32_b64 s3, s[0:1] 2047; GFX6-NEXT: s_mov_b32 m0, s3 2048; GFX6-NEXT: v_readlane_b32 s8, v0, s3 2049; GFX6-NEXT: v_writelane_b32 v1, s2, m0 2050; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s3 2051; GFX6-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] 2052; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 2053; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7] 2054; GFX6-NEXT: s_add_i32 s2, s2, s8 2055; GFX6-NEXT: s_cbranch_vccnz .LBB7_1 2056; GFX6-NEXT: ; %bb.2: ; %ComputeEnd 2057; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 2058; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 2059; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2060; GFX6-NEXT: ; implicit-def: $vgpr0 2061; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc 2062; GFX6-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 2063; GFX6-NEXT: s_cbranch_execz .LBB7_4 2064; GFX6-NEXT: ; %bb.3: 2065; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd 2066; GFX6-NEXT: v_mov_b32_e32 v0, s2 2067; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2068; GFX6-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc 2069; GFX6-NEXT: .LBB7_4: 2070; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] 2071; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 2072; GFX6-NEXT: s_mov_b32 s3, 0xf000 2073; GFX6-NEXT: s_mov_b32 s2, -1 2074; GFX6-NEXT: s_waitcnt vmcnt(0) 2075; GFX6-NEXT: v_readfirstlane_b32 s4, v0 2076; GFX6-NEXT: s_waitcnt expcnt(0) 2077; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v1 2078; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2079; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 2080; GFX6-NEXT: s_endpgm 2081; 2082; GFX8-LABEL: sub_i32_varying_vdata: 2083; GFX8: ; %bb.0: ; %entry 2084; GFX8-NEXT: s_mov_b64 s[0:1], exec 2085; GFX8-NEXT: s_mov_b32 s2, 0 2086; GFX8-NEXT: ; implicit-def: $vgpr1 2087; GFX8-NEXT: .LBB7_1: ; %ComputeLoop 2088; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 2089; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] 2090; GFX8-NEXT: 
s_mov_b32 m0, s3 2091; GFX8-NEXT: v_readlane_b32 s8, v0, s3 2092; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 2093; GFX8-NEXT: v_writelane_b32 v1, s2, m0 2094; GFX8-NEXT: s_add_i32 s2, s2, s8 2095; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] 2096; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 2097; GFX8-NEXT: s_cbranch_scc1 .LBB7_1 2098; GFX8-NEXT: ; %bb.2: ; %ComputeEnd 2099; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2100; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2101; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2102; GFX8-NEXT: ; implicit-def: $vgpr0 2103; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc 2104; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 2105; GFX8-NEXT: s_cbranch_execz .LBB7_4 2106; GFX8-NEXT: ; %bb.3: 2107; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 2108; GFX8-NEXT: v_mov_b32_e32 v0, s2 2109; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2110; GFX8-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc 2111; GFX8-NEXT: .LBB7_4: 2112; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] 2113; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 2114; GFX8-NEXT: s_waitcnt vmcnt(0) 2115; GFX8-NEXT: v_readfirstlane_b32 s2, v0 2116; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v1 2117; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2118; GFX8-NEXT: v_mov_b32_e32 v0, s0 2119; GFX8-NEXT: v_mov_b32_e32 v1, s1 2120; GFX8-NEXT: flat_store_dword v[0:1], v2 2121; GFX8-NEXT: s_endpgm 2122; 2123; GFX9-LABEL: sub_i32_varying_vdata: 2124; GFX9: ; %bb.0: ; %entry 2125; GFX9-NEXT: s_mov_b64 s[0:1], exec 2126; GFX9-NEXT: s_mov_b32 s2, 0 2127; GFX9-NEXT: ; implicit-def: $vgpr1 2128; GFX9-NEXT: .LBB7_1: ; %ComputeLoop 2129; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 2130; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] 2131; GFX9-NEXT: s_mov_b32 m0, s3 2132; GFX9-NEXT: v_readlane_b32 s8, v0, s3 2133; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 2134; GFX9-NEXT: v_writelane_b32 v1, s2, m0 2135; GFX9-NEXT: s_add_i32 s2, s2, s8 2136; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] 2137; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 2138; GFX9-NEXT: s_cbranch_scc1 .LBB7_1 2139; 
GFX9-NEXT: ; %bb.2: ; %ComputeEnd 2140; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2141; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2142; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2143; GFX9-NEXT: ; implicit-def: $vgpr0 2144; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 2145; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 2146; GFX9-NEXT: s_cbranch_execz .LBB7_4 2147; GFX9-NEXT: ; %bb.3: 2148; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 2149; GFX9-NEXT: v_mov_b32_e32 v0, s2 2150; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2151; GFX9-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc 2152; GFX9-NEXT: .LBB7_4: 2153; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 2154; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 2155; GFX9-NEXT: s_waitcnt vmcnt(0) 2156; GFX9-NEXT: v_readfirstlane_b32 s2, v0 2157; GFX9-NEXT: v_mov_b32_e32 v2, 0 2158; GFX9-NEXT: v_sub_u32_e32 v0, s2, v1 2159; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2160; GFX9-NEXT: global_store_dword v2, v0, s[0:1] 2161; GFX9-NEXT: s_endpgm 2162; 2163; GFX10W64-LABEL: sub_i32_varying_vdata: 2164; GFX10W64: ; %bb.0: ; %entry 2165; GFX10W64-NEXT: s_mov_b64 s[0:1], exec 2166; GFX10W64-NEXT: s_mov_b32 s2, 0 2167; GFX10W64-NEXT: ; implicit-def: $vgpr1 2168; GFX10W64-NEXT: .LBB7_1: ; %ComputeLoop 2169; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 2170; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1] 2171; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3 2172; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 2173; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 2174; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] 2175; GFX10W64-NEXT: s_add_i32 s2, s2, s8 2176; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 2177; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_1 2178; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd 2179; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2180; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2181; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2182; GFX10W64-NEXT: ; implicit-def: $vgpr0 2183; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc 2184; GFX10W64-NEXT: s_xor_b64 s[0:1], 
exec, s[0:1] 2185; GFX10W64-NEXT: s_cbranch_execz .LBB7_4 2186; GFX10W64-NEXT: ; %bb.3: 2187; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 2188; GFX10W64-NEXT: v_mov_b32_e32 v0, s2 2189; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 2190; GFX10W64-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc 2191; GFX10W64-NEXT: .LBB7_4: 2192; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 2193; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] 2194; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 2195; GFX10W64-NEXT: s_waitcnt vmcnt(0) 2196; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 2197; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 2198; GFX10W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1 2199; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 2200; GFX10W64-NEXT: global_store_dword v0, v1, s[0:1] 2201; GFX10W64-NEXT: s_endpgm 2202; 2203; GFX10W32-LABEL: sub_i32_varying_vdata: 2204; GFX10W32: ; %bb.0: ; %entry 2205; GFX10W32-NEXT: s_mov_b32 s1, exec_lo 2206; GFX10W32-NEXT: s_mov_b32 s0, 0 2207; GFX10W32-NEXT: ; implicit-def: $vgpr1 2208; GFX10W32-NEXT: .LBB7_1: ; %ComputeLoop 2209; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 2210; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 2211; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2 2212; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2 2213; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2 2214; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 2215; GFX10W32-NEXT: s_add_i32 s0, s0, s3 2216; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 2217; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_1 2218; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd 2219; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2220; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2221; GFX10W32-NEXT: ; implicit-def: $vgpr0 2222; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo 2223; GFX10W32-NEXT: s_xor_b32 s1, exec_lo, s1 2224; GFX10W32-NEXT: s_cbranch_execz .LBB7_4 2225; GFX10W32-NEXT: ; %bb.3: 2226; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 2227; GFX10W32-NEXT: v_mov_b32_e32 v0, s0 2228; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 2229; GFX10W32-NEXT: buffer_atomic_sub v0, off, 
s[8:11], 0 glc 2230; GFX10W32-NEXT: .LBB7_4: 2231; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 2232; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 2233; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 2234; GFX10W32-NEXT: s_waitcnt vmcnt(0) 2235; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 2236; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 2237; GFX10W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1 2238; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 2239; GFX10W32-NEXT: global_store_dword v0, v1, s[0:1] 2240; GFX10W32-NEXT: s_endpgm 2241; 2242; GFX11W64-LABEL: sub_i32_varying_vdata: 2243; GFX11W64: ; %bb.0: ; %entry 2244; GFX11W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 2245; GFX11W64-NEXT: s_mov_b64 s[0:1], exec 2246; GFX11W64-NEXT: s_mov_b32 s2, 0 2247; GFX11W64-NEXT: ; implicit-def: $vgpr0 2248; GFX11W64-NEXT: .LBB7_1: ; %ComputeLoop 2249; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 2250; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1] 2251; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 2252; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3 2253; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 2254; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 2255; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] 2256; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) 2257; GFX11W64-NEXT: s_add_i32 s2, s2, s8 2258; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 2259; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_1 2260; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd 2261; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 2262; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2263; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 2264; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 2265; GFX11W64-NEXT: ; implicit-def: $vgpr1 2266; GFX11W64-NEXT: s_and_saveexec_b64 s[0:1], vcc 2267; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2268; GFX11W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 2269; GFX11W64-NEXT: s_cbranch_execz .LBB7_4 2270; GFX11W64-NEXT: ; %bb.3: 2271; GFX11W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 
2272; GFX11W64-NEXT: v_mov_b32_e32 v1, s2 2273; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) 2274; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc 2275; GFX11W64-NEXT: .LBB7_4: 2276; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] 2277; GFX11W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 2278; GFX11W64-NEXT: s_waitcnt vmcnt(0) 2279; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 2280; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 2281; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) 2282; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 2283; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) 2284; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1] 2285; GFX11W64-NEXT: s_endpgm 2286; 2287; GFX11W32-LABEL: sub_i32_varying_vdata: 2288; GFX11W32: ; %bb.0: ; %entry 2289; GFX11W32-NEXT: v_and_b32_e32 v1, 0x3ff, v0 2290; GFX11W32-NEXT: s_mov_b32 s1, exec_lo 2291; GFX11W32-NEXT: s_mov_b32 s0, 0 2292; GFX11W32-NEXT: ; implicit-def: $vgpr0 2293; GFX11W32-NEXT: .LBB7_1: ; %ComputeLoop 2294; GFX11W32-NEXT: ; =>This Inner Loop Header: Depth=1 2295; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 2296; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 2297; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2 2298; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 2299; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 2300; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 2301; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) 2302; GFX11W32-NEXT: s_add_i32 s0, s0, s3 2303; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 2304; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_1 2305; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd 2306; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 2307; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 2308; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 2309; GFX11W32-NEXT: ; implicit-def: $vgpr1 2310; GFX11W32-NEXT: s_and_saveexec_b32 s1, vcc_lo 2311; GFX11W32-NEXT: s_xor_b32 s1, exec_lo, s1 2312; GFX11W32-NEXT: s_cbranch_execz .LBB7_4 2313; GFX11W32-NEXT: ; %bb.3: 2314; GFX11W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 
2315; GFX11W32-NEXT: v_mov_b32_e32 v1, s0 2316; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) 2317; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc 2318; GFX11W32-NEXT: .LBB7_4: 2319; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 2320; GFX11W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 2321; GFX11W32-NEXT: s_waitcnt vmcnt(0) 2322; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 2323; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 2324; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) 2325; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 2326; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) 2327; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] 2328; GFX11W32-NEXT: s_endpgm 2329; 2330; GFX12W64-LABEL: sub_i32_varying_vdata: 2331; GFX12W64: ; %bb.0: ; %entry 2332; GFX12W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 2333; GFX12W64-NEXT: s_mov_b64 s[0:1], exec 2334; GFX12W64-NEXT: s_mov_b32 s2, 0 2335; GFX12W64-NEXT: ; implicit-def: $vgpr0 2336; GFX12W64-NEXT: .LBB7_1: ; %ComputeLoop 2337; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1 2338; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1] 2339; GFX12W64-NEXT: s_wait_alu 0xfffe 2340; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3 2341; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 2342; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 2343; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] 2344; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) 2345; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 2346; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 2347; GFX12W64-NEXT: s_cbranch_scc1 .LBB7_1 2348; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd 2349; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 2350; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2351; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 2352; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 2353; GFX12W64-NEXT: ; implicit-def: $vgpr1 2354; GFX12W64-NEXT: s_and_saveexec_b64 s[0:1], vcc 2355; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2356; GFX12W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 2357; GFX12W64-NEXT: 
s_cbranch_execz .LBB7_4 2358; GFX12W64-NEXT: ; %bb.3: 2359; GFX12W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 2360; GFX12W64-NEXT: s_wait_alu 0xfffe 2361; GFX12W64-NEXT: v_mov_b32_e32 v1, s2 2362; GFX12W64-NEXT: s_wait_kmcnt 0x0 2363; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN 2364; GFX12W64-NEXT: .LBB7_4: 2365; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] 2366; GFX12W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 2367; GFX12W64-NEXT: s_wait_loadcnt 0x0 2368; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 2369; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 2370; GFX12W64-NEXT: s_wait_alu 0xfffe 2371; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) 2372; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 2373; GFX12W64-NEXT: s_wait_kmcnt 0x0 2374; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] 2375; GFX12W64-NEXT: s_endpgm 2376; 2377; GFX12W32-LABEL: sub_i32_varying_vdata: 2378; GFX12W32: ; %bb.0: ; %entry 2379; GFX12W32-NEXT: v_and_b32_e32 v1, 0x3ff, v0 2380; GFX12W32-NEXT: s_mov_b32 s1, exec_lo 2381; GFX12W32-NEXT: s_mov_b32 s0, 0 2382; GFX12W32-NEXT: ; implicit-def: $vgpr0 2383; GFX12W32-NEXT: .LBB7_1: ; %ComputeLoop 2384; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 2385; GFX12W32-NEXT: s_wait_alu 0xfffe 2386; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 2387; GFX12W32-NEXT: s_wait_alu 0xfffe 2388; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 2389; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 2390; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 2391; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 2392; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) 2393; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 2394; GFX12W32-NEXT: s_wait_alu 0xfffe 2395; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 2396; GFX12W32-NEXT: s_cbranch_scc1 .LBB7_1 2397; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd 2398; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 2399; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) 2400; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 2401; GFX12W32-NEXT: ; implicit-def: $vgpr1 2402; GFX12W32-NEXT: 
s_and_saveexec_b32 s1, vcc_lo 2403; GFX12W32-NEXT: s_wait_alu 0xfffe 2404; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1 2405; GFX12W32-NEXT: s_cbranch_execz .LBB7_4 2406; GFX12W32-NEXT: ; %bb.3: 2407; GFX12W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 2408; GFX12W32-NEXT: v_mov_b32_e32 v1, s0 2409; GFX12W32-NEXT: s_wait_kmcnt 0x0 2410; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN 2411; GFX12W32-NEXT: .LBB7_4: 2412; GFX12W32-NEXT: s_wait_alu 0xfffe 2413; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 2414; GFX12W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 2415; GFX12W32-NEXT: s_wait_loadcnt 0x0 2416; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 2417; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 2418; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) 2419; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 2420; GFX12W32-NEXT: s_wait_kmcnt 0x0 2421; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] 2422; GFX12W32-NEXT: s_endpgm 2423entry: 2424 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2425 %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.sub(i32 %lane, ptr addrspace(8) %inout, i32 0, i32 0, i32 0) 2426 store i32 %old, ptr addrspace(1) %out 2427 ret void 2428} 2429; Varying offset (%lane) with constant vdata (1): every check block below expects a direct buffer_atomic_sub with offen and none of the ComputeLoop/mbcnt code seen in sub_i32_varying_vdata. 2430define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr addrspace(8) %inout) { 2431; GFX6-LABEL: sub_i32_varying_offset: 2432; GFX6: ; %bb.0: ; %entry 2433; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd 2434; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 2435; GFX6-NEXT: v_mov_b32_e32 v1, 1 2436; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2437; GFX6-NEXT: buffer_atomic_sub v1, v0, s[0:3], 0 offen glc 2438; GFX6-NEXT: s_mov_b32 s7, 0xf000 2439; GFX6-NEXT: s_mov_b32 s6, -1 2440; GFX6-NEXT: s_waitcnt vmcnt(0) 2441; GFX6-NEXT: buffer_store_dword v1, off, s[4:7], 0 2442; GFX6-NEXT: s_endpgm 2443; 2444; GFX8-LABEL: sub_i32_varying_offset: 2445; GFX8: ; %bb.0: ; %entry 2446; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 2447; GFX8-NEXT: v_mov_b32_e32 v2, 1 2448; GFX8-NEXT: s_waitcnt lgkmcnt(0) 
2449; GFX8-NEXT: buffer_atomic_sub v2, v0, s[0:3], 0 offen glc 2450; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 2451; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2452; GFX8-NEXT: v_mov_b32_e32 v0, s0 2453; GFX8-NEXT: v_mov_b32_e32 v1, s1 2454; GFX8-NEXT: s_waitcnt vmcnt(0) 2455; GFX8-NEXT: flat_store_dword v[0:1], v2 2456; GFX8-NEXT: s_endpgm 2457; 2458; GFX9-LABEL: sub_i32_varying_offset: 2459; GFX9: ; %bb.0: ; %entry 2460; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 2461; GFX9-NEXT: v_mov_b32_e32 v1, 1 2462; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2463; GFX9-NEXT: buffer_atomic_sub v1, v0, s[0:3], 0 offen glc 2464; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 2465; GFX9-NEXT: v_mov_b32_e32 v0, 0 2466; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2467; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 2468; GFX9-NEXT: s_endpgm 2469; 2470; GFX10-LABEL: sub_i32_varying_offset: 2471; GFX10: ; %bb.0: ; %entry 2472; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 2473; GFX10-NEXT: v_mov_b32_e32 v1, 1 2474; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2475; GFX10-NEXT: buffer_atomic_sub v1, v0, s[0:3], 0 offen glc 2476; GFX10-NEXT: s_waitcnt_depctr 0xffe3 2477; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 2478; GFX10-NEXT: v_mov_b32_e32 v0, 0 2479; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2480; GFX10-NEXT: global_store_dword v0, v1, s[0:1] 2481; GFX10-NEXT: s_endpgm 2482; 2483; GFX11W64-LABEL: sub_i32_varying_offset: 2484; GFX11W64: ; %bb.0: ; %entry 2485; GFX11W64-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 2486; GFX11W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0 2487; GFX11W64-NEXT: v_mov_b32_e32 v1, 1 2488; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) 2489; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, v0, s[0:3], 0 offen glc 2490; GFX11W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 2491; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 2492; GFX11W64-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2493; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] 2494; GFX11W64-NEXT: s_endpgm 2495; 2496; GFX11W32-LABEL: sub_i32_varying_offset: 2497; GFX11W32: ; 
%bb.0: ; %entry 2498; GFX11W32-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 2499; GFX11W32-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 2500; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) 2501; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, v0, s[0:3], 0 offen glc 2502; GFX11W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 2503; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 2504; GFX11W32-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2505; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] 2506; GFX11W32-NEXT: s_endpgm 2507; 2508; GFX12W64-LABEL: sub_i32_varying_offset: 2509; GFX12W64: ; %bb.0: ; %entry 2510; GFX12W64-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 2511; GFX12W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0 2512; GFX12W64-NEXT: v_mov_b32_e32 v1, 1 2513; GFX12W64-NEXT: s_wait_kmcnt 0x0 2514; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, v0, s[0:3], null offen th:TH_ATOMIC_RETURN 2515; GFX12W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 2516; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 2517; GFX12W64-NEXT: s_wait_loadcnt 0x0 2518; GFX12W64-NEXT: s_wait_kmcnt 0x0 2519; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1] 2520; GFX12W64-NEXT: s_endpgm 2521; 2522; GFX12W32-LABEL: sub_i32_varying_offset: 2523; GFX12W32: ; %bb.0: ; %entry 2524; GFX12W32-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 2525; GFX12W32-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 2526; GFX12W32-NEXT: s_wait_kmcnt 0x0 2527; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, v0, s[0:3], null offen th:TH_ATOMIC_RETURN 2528; GFX12W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 2529; GFX12W32-NEXT: v_mov_b32_e32 v0, 0 2530; GFX12W32-NEXT: s_wait_loadcnt 0x0 2531; GFX12W32-NEXT: s_wait_kmcnt 0x0 2532; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1] 2533; GFX12W32-NEXT: s_endpgm 2534entry: 2535 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2536 %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.sub(i32 1, ptr addrspace(8) %inout, i32 %lane, i32 0, i32 0) 2537 store i32 %old, ptr addrspace(1) %out 2538 ret void 2539} 2540;; NOTE: These prefixes are unused and the list is 
autogenerated. Do not add tests below this line: 2541; GFX11: {{.*}} 2542; GFX12: {{.*}} 2543