1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS,GFX7LESS_ITERATIVE %s 3; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8,GFX8_ITERATIVE %s 4; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_ITERATIVE %s 5; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_ITERATIVE %s 6; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_ITERATIVE %s 7; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164_ITERATIVE %s 8; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132_ITERATIVE %s 9; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1264,GFX1264_ITERATIVE %s 10; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative 
-verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1232,GFX1232_ITERATIVE %s 11; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS,GFX7LESS_DPP %s 12; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8,GFX8_DPP %s 13; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_DPP %s 14; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_DPP %s 15; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_DPP %s 16; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164_DPP %s 17; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132_DPP %s 18; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1264,GFX1264_DPP %s 19; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope 
-check-prefixes=GFX1232,GFX1232_DPP %s 20 21declare i32 @llvm.amdgcn.workitem.id.x() 22 23; Show what the atomic optimization pass will do for global pointers. 24 25define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace(1) %inout) { 26; GFX7LESS-LABEL: add_i32_constant: 27; GFX7LESS: ; %bb.0: ; %entry 28; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 29; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 30; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 31; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 32; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 33; GFX7LESS-NEXT: ; implicit-def: $vgpr1 34; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 35; GFX7LESS-NEXT: s_cbranch_execz .LBB0_2 36; GFX7LESS-NEXT: ; %bb.1: 37; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 38; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 39; GFX7LESS-NEXT: s_mul_i32 s6, s6, 5 40; GFX7LESS-NEXT: s_mov_b32 s10, -1 41; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 42; GFX7LESS-NEXT: s_mov_b32 s8, s2 43; GFX7LESS-NEXT: s_mov_b32 s9, s3 44; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 45; GFX7LESS-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 46; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 47; GFX7LESS-NEXT: buffer_wbinvl1 48; GFX7LESS-NEXT: .LBB0_2: 49; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 50; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 51; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 52; GFX7LESS-NEXT: s_mov_b32 s2, -1 53; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 54; GFX7LESS-NEXT: v_mad_u32_u24 v0, v0, 5, s4 55; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 56; GFX7LESS-NEXT: s_endpgm 57; 58; GFX8-LABEL: add_i32_constant: 59; GFX8: ; %bb.0: ; %entry 60; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 61; GFX8-NEXT: s_mov_b64 s[6:7], exec 62; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 63; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 64; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 65; GFX8-NEXT: ; implicit-def: $vgpr1 66; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 67; GFX8-NEXT: s_cbranch_execz .LBB0_2 68; GFX8-NEXT: ; %bb.1: 69; 
GFX8-NEXT: s_waitcnt lgkmcnt(0) 70; GFX8-NEXT: s_mov_b32 s8, s2 71; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[6:7] 72; GFX8-NEXT: s_mul_i32 s2, s2, 5 73; GFX8-NEXT: s_mov_b32 s11, 0xf000 74; GFX8-NEXT: s_mov_b32 s10, -1 75; GFX8-NEXT: s_mov_b32 s9, s3 76; GFX8-NEXT: v_mov_b32_e32 v1, s2 77; GFX8-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 78; GFX8-NEXT: s_waitcnt vmcnt(0) 79; GFX8-NEXT: buffer_wbinvl1_vol 80; GFX8-NEXT: .LBB0_2: 81; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 82; GFX8-NEXT: v_readfirstlane_b32 s4, v1 83; GFX8-NEXT: s_waitcnt lgkmcnt(0) 84; GFX8-NEXT: s_mov_b32 s3, 0xf000 85; GFX8-NEXT: s_mov_b32 s2, -1 86; GFX8-NEXT: v_mad_u32_u24 v0, v0, 5, s4 87; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 88; GFX8-NEXT: s_endpgm 89; 90; GFX9-LABEL: add_i32_constant: 91; GFX9: ; %bb.0: ; %entry 92; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 93; GFX9-NEXT: s_mov_b64 s[6:7], exec 94; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 95; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 96; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 97; GFX9-NEXT: ; implicit-def: $vgpr1 98; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 99; GFX9-NEXT: s_cbranch_execz .LBB0_2 100; GFX9-NEXT: ; %bb.1: 101; GFX9-NEXT: s_waitcnt lgkmcnt(0) 102; GFX9-NEXT: s_mov_b32 s8, s2 103; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[6:7] 104; GFX9-NEXT: s_mul_i32 s2, s2, 5 105; GFX9-NEXT: s_mov_b32 s11, 0xf000 106; GFX9-NEXT: s_mov_b32 s10, -1 107; GFX9-NEXT: s_mov_b32 s9, s3 108; GFX9-NEXT: v_mov_b32_e32 v1, s2 109; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 110; GFX9-NEXT: s_waitcnt vmcnt(0) 111; GFX9-NEXT: buffer_wbinvl1_vol 112; GFX9-NEXT: .LBB0_2: 113; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 114; GFX9-NEXT: v_readfirstlane_b32 s4, v1 115; GFX9-NEXT: s_waitcnt lgkmcnt(0) 116; GFX9-NEXT: s_mov_b32 s3, 0xf000 117; GFX9-NEXT: s_mov_b32 s2, -1 118; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s4 119; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 120; GFX9-NEXT: s_endpgm 121; 122; GFX1064-LABEL: add_i32_constant: 123; GFX1064: ; %bb.0: 
; %entry 124; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 125; GFX1064-NEXT: s_mov_b64 s[6:7], exec 126; GFX1064-NEXT: ; implicit-def: $vgpr1 127; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 128; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 129; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 130; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 131; GFX1064-NEXT: s_cbranch_execz .LBB0_2 132; GFX1064-NEXT: ; %bb.1: 133; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 134; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 135; GFX1064-NEXT: s_mul_i32 s6, s6, 5 136; GFX1064-NEXT: s_mov_b32 s10, -1 137; GFX1064-NEXT: v_mov_b32_e32 v1, s6 138; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 139; GFX1064-NEXT: s_mov_b32 s8, s2 140; GFX1064-NEXT: s_mov_b32 s9, s3 141; GFX1064-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 142; GFX1064-NEXT: s_waitcnt vmcnt(0) 143; GFX1064-NEXT: buffer_gl1_inv 144; GFX1064-NEXT: buffer_gl0_inv 145; GFX1064-NEXT: .LBB0_2: 146; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 147; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 148; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 149; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 150; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 151; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s2 152; GFX1064-NEXT: s_mov_b32 s2, -1 153; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 154; GFX1064-NEXT: s_endpgm 155; 156; GFX1032-LABEL: add_i32_constant: 157; GFX1032: ; %bb.0: ; %entry 158; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 159; GFX1032-NEXT: s_mov_b32 s6, exec_lo 160; GFX1032-NEXT: ; implicit-def: $vgpr1 161; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 162; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 163; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 164; GFX1032-NEXT: s_cbranch_execz .LBB0_2 165; GFX1032-NEXT: ; %bb.1: 166; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s6 167; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 168; GFX1032-NEXT: s_mul_i32 s5, s5, 5 169; GFX1032-NEXT: s_mov_b32 s10, -1 170; GFX1032-NEXT: v_mov_b32_e32 v1, s5 171; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 172; 
GFX1032-NEXT: s_mov_b32 s8, s2 173; GFX1032-NEXT: s_mov_b32 s9, s3 174; GFX1032-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 175; GFX1032-NEXT: s_waitcnt vmcnt(0) 176; GFX1032-NEXT: buffer_gl1_inv 177; GFX1032-NEXT: buffer_gl0_inv 178; GFX1032-NEXT: .LBB0_2: 179; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 180; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 181; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 182; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 183; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 184; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s2 185; GFX1032-NEXT: s_mov_b32 s2, -1 186; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 187; GFX1032-NEXT: s_endpgm 188; 189; GFX1164-LABEL: add_i32_constant: 190; GFX1164: ; %bb.0: ; %entry 191; GFX1164-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 192; GFX1164-NEXT: s_mov_b64 s[6:7], exec 193; GFX1164-NEXT: s_mov_b64 s[4:5], exec 194; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 195; GFX1164-NEXT: ; implicit-def: $vgpr1 196; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 197; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 198; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 199; GFX1164-NEXT: s_cbranch_execz .LBB0_2 200; GFX1164-NEXT: ; %bb.1: 201; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 202; GFX1164-NEXT: s_mov_b32 s11, 0x31016000 203; GFX1164-NEXT: s_mul_i32 s6, s6, 5 204; GFX1164-NEXT: s_mov_b32 s10, -1 205; GFX1164-NEXT: v_mov_b32_e32 v1, s6 206; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 207; GFX1164-NEXT: s_mov_b32 s8, s2 208; GFX1164-NEXT: s_mov_b32 s9, s3 209; GFX1164-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc 210; GFX1164-NEXT: s_waitcnt vmcnt(0) 211; GFX1164-NEXT: buffer_gl1_inv 212; GFX1164-NEXT: buffer_gl0_inv 213; GFX1164-NEXT: .LBB0_2: 214; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 215; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 216; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 217; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 218; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 219; GFX1164-NEXT: v_mad_u32_u24 v0, v0, 5, s2 
220; GFX1164-NEXT: s_mov_b32 s2, -1 221; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 222; GFX1164-NEXT: s_endpgm 223; 224; GFX1132-LABEL: add_i32_constant: 225; GFX1132: ; %bb.0: ; %entry 226; GFX1132-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 227; GFX1132-NEXT: s_mov_b32 s6, exec_lo 228; GFX1132-NEXT: s_mov_b32 s4, exec_lo 229; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 230; GFX1132-NEXT: ; implicit-def: $vgpr1 231; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 232; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 233; GFX1132-NEXT: s_cbranch_execz .LBB0_2 234; GFX1132-NEXT: ; %bb.1: 235; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s6 236; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 237; GFX1132-NEXT: s_mul_i32 s5, s5, 5 238; GFX1132-NEXT: s_mov_b32 s10, -1 239; GFX1132-NEXT: v_mov_b32_e32 v1, s5 240; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 241; GFX1132-NEXT: s_mov_b32 s8, s2 242; GFX1132-NEXT: s_mov_b32 s9, s3 243; GFX1132-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc 244; GFX1132-NEXT: s_waitcnt vmcnt(0) 245; GFX1132-NEXT: buffer_gl1_inv 246; GFX1132-NEXT: buffer_gl0_inv 247; GFX1132-NEXT: .LBB0_2: 248; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4 249; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 250; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 251; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 252; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 253; GFX1132-NEXT: v_mad_u32_u24 v0, v0, 5, s2 254; GFX1132-NEXT: s_mov_b32 s2, -1 255; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 256; GFX1132-NEXT: s_endpgm 257; 258; GFX1264-LABEL: add_i32_constant: 259; GFX1264: ; %bb.0: ; %entry 260; GFX1264-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 261; GFX1264-NEXT: s_mov_b64 s[6:7], exec 262; GFX1264-NEXT: s_mov_b64 s[4:5], exec 263; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 264; GFX1264-NEXT: ; implicit-def: $vgpr1 265; GFX1264-NEXT: s_wait_alu 0xfffe 266; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 267; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 268; GFX1264-NEXT: 
v_cmpx_eq_u32_e32 0, v0 269; GFX1264-NEXT: s_cbranch_execz .LBB0_2 270; GFX1264-NEXT: ; %bb.1: 271; GFX1264-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 272; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 273; GFX1264-NEXT: s_wait_alu 0xfffe 274; GFX1264-NEXT: s_mul_i32 s6, s6, 5 275; GFX1264-NEXT: s_mov_b32 s10, -1 276; GFX1264-NEXT: s_wait_alu 0xfffe 277; GFX1264-NEXT: v_mov_b32_e32 v1, s6 278; GFX1264-NEXT: s_wait_kmcnt 0x0 279; GFX1264-NEXT: s_mov_b32 s8, s2 280; GFX1264-NEXT: s_mov_b32 s9, s3 281; GFX1264-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV 282; GFX1264-NEXT: s_wait_loadcnt 0x0 283; GFX1264-NEXT: global_inv scope:SCOPE_DEV 284; GFX1264-NEXT: .LBB0_2: 285; GFX1264-NEXT: s_or_b64 exec, exec, s[4:5] 286; GFX1264-NEXT: s_wait_kmcnt 0x0 287; GFX1264-NEXT: v_readfirstlane_b32 s2, v1 288; GFX1264-NEXT: s_mov_b32 s3, 0x31016000 289; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) 290; GFX1264-NEXT: v_mad_u32_u24 v0, v0, 5, s2 291; GFX1264-NEXT: s_mov_b32 s2, -1 292; GFX1264-NEXT: buffer_store_b32 v0, off, s[0:3], null 293; GFX1264-NEXT: s_endpgm 294; 295; GFX1232-LABEL: add_i32_constant: 296; GFX1232: ; %bb.0: ; %entry 297; GFX1232-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 298; GFX1232-NEXT: s_mov_b32 s6, exec_lo 299; GFX1232-NEXT: s_mov_b32 s4, exec_lo 300; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 301; GFX1232-NEXT: ; implicit-def: $vgpr1 302; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) 303; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v0 304; GFX1232-NEXT: s_cbranch_execz .LBB0_2 305; GFX1232-NEXT: ; %bb.1: 306; GFX1232-NEXT: s_bcnt1_i32_b32 s5, s6 307; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 308; GFX1232-NEXT: s_mul_i32 s5, s5, 5 309; GFX1232-NEXT: s_mov_b32 s10, -1 310; GFX1232-NEXT: v_mov_b32_e32 v1, s5 311; GFX1232-NEXT: s_wait_kmcnt 0x0 312; GFX1232-NEXT: s_mov_b32 s8, s2 313; GFX1232-NEXT: s_mov_b32 s9, s3 314; GFX1232-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV 315; GFX1232-NEXT: s_wait_loadcnt 
0x0 316; GFX1232-NEXT: global_inv scope:SCOPE_DEV 317; GFX1232-NEXT: .LBB0_2: 318; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s4 319; GFX1232-NEXT: s_wait_kmcnt 0x0 320; GFX1232-NEXT: v_readfirstlane_b32 s2, v1 321; GFX1232-NEXT: s_mov_b32 s3, 0x31016000 322; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) 323; GFX1232-NEXT: v_mad_u32_u24 v0, v0, 5, s2 324; GFX1232-NEXT: s_mov_b32 s2, -1 325; GFX1232-NEXT: buffer_store_b32 v0, off, s[0:3], null 326; GFX1232-NEXT: s_endpgm 327entry: 328 %old = atomicrmw add ptr addrspace(1) %inout, i32 5 syncscope("agent") acq_rel 329 store i32 %old, ptr addrspace(1) %out 330 ret void 331} 332 333define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(1) %inout, i32 %additive) { 334; GFX7LESS-LABEL: add_i32_uniform: 335; GFX7LESS: ; %bb.0: ; %entry 336; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 337; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 338; GFX7LESS-NEXT: s_load_dword s8, s[4:5], 0xd 339; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 340; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 341; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 342; GFX7LESS-NEXT: ; implicit-def: $vgpr1 343; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 344; GFX7LESS-NEXT: s_cbranch_execz .LBB1_2 345; GFX7LESS-NEXT: ; %bb.1: 346; GFX7LESS-NEXT: s_mov_b32 s15, 0xf000 347; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 348; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 349; GFX7LESS-NEXT: s_mul_i32 s6, s8, s6 350; GFX7LESS-NEXT: s_mov_b32 s14, -1 351; GFX7LESS-NEXT: s_mov_b32 s12, s2 352; GFX7LESS-NEXT: s_mov_b32 s13, s3 353; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 354; GFX7LESS-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc 355; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 356; GFX7LESS-NEXT: buffer_wbinvl1 357; GFX7LESS-NEXT: .LBB1_2: 358; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 359; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 360; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 361; GFX7LESS-NEXT: s_mov_b32 s2, -1 362; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 363; 
GFX7LESS-NEXT: v_mul_lo_u32 v0, s8, v0 364; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s4, v0 365; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 366; GFX7LESS-NEXT: s_endpgm 367; 368; GFX8-LABEL: add_i32_uniform: 369; GFX8: ; %bb.0: ; %entry 370; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 371; GFX8-NEXT: s_load_dword s8, s[4:5], 0x34 372; GFX8-NEXT: s_mov_b64 s[6:7], exec 373; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 374; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 375; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 376; GFX8-NEXT: ; implicit-def: $vgpr1 377; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 378; GFX8-NEXT: s_cbranch_execz .LBB1_2 379; GFX8-NEXT: ; %bb.1: 380; GFX8-NEXT: s_waitcnt lgkmcnt(0) 381; GFX8-NEXT: s_mov_b32 s12, s2 382; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[6:7] 383; GFX8-NEXT: s_mul_i32 s2, s8, s2 384; GFX8-NEXT: s_mov_b32 s15, 0xf000 385; GFX8-NEXT: s_mov_b32 s14, -1 386; GFX8-NEXT: s_mov_b32 s13, s3 387; GFX8-NEXT: v_mov_b32_e32 v1, s2 388; GFX8-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc 389; GFX8-NEXT: s_waitcnt vmcnt(0) 390; GFX8-NEXT: buffer_wbinvl1_vol 391; GFX8-NEXT: .LBB1_2: 392; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 393; GFX8-NEXT: s_waitcnt lgkmcnt(0) 394; GFX8-NEXT: v_mul_lo_u32 v0, s8, v0 395; GFX8-NEXT: v_readfirstlane_b32 s4, v1 396; GFX8-NEXT: s_mov_b32 s3, 0xf000 397; GFX8-NEXT: s_mov_b32 s2, -1 398; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v0 399; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 400; GFX8-NEXT: s_endpgm 401; 402; GFX9-LABEL: add_i32_uniform: 403; GFX9: ; %bb.0: ; %entry 404; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 405; GFX9-NEXT: s_load_dword s8, s[4:5], 0x34 406; GFX9-NEXT: s_mov_b64 s[6:7], exec 407; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 408; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 409; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 410; GFX9-NEXT: ; implicit-def: $vgpr1 411; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 412; GFX9-NEXT: s_cbranch_execz .LBB1_2 413; GFX9-NEXT: ; %bb.1: 414; GFX9-NEXT: s_waitcnt 
lgkmcnt(0) 415; GFX9-NEXT: s_mov_b32 s12, s2 416; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[6:7] 417; GFX9-NEXT: s_mul_i32 s2, s8, s2 418; GFX9-NEXT: s_mov_b32 s15, 0xf000 419; GFX9-NEXT: s_mov_b32 s14, -1 420; GFX9-NEXT: s_mov_b32 s13, s3 421; GFX9-NEXT: v_mov_b32_e32 v1, s2 422; GFX9-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc 423; GFX9-NEXT: s_waitcnt vmcnt(0) 424; GFX9-NEXT: buffer_wbinvl1_vol 425; GFX9-NEXT: .LBB1_2: 426; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 427; GFX9-NEXT: s_waitcnt lgkmcnt(0) 428; GFX9-NEXT: v_mul_lo_u32 v0, s8, v0 429; GFX9-NEXT: v_readfirstlane_b32 s4, v1 430; GFX9-NEXT: s_mov_b32 s3, 0xf000 431; GFX9-NEXT: s_mov_b32 s2, -1 432; GFX9-NEXT: v_add_u32_e32 v0, s4, v0 433; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 434; GFX9-NEXT: s_endpgm 435; 436; GFX1064-LABEL: add_i32_uniform: 437; GFX1064: ; %bb.0: ; %entry 438; GFX1064-NEXT: s_clause 0x1 439; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 440; GFX1064-NEXT: s_load_dword s8, s[4:5], 0x34 441; GFX1064-NEXT: s_mov_b64 s[6:7], exec 442; GFX1064-NEXT: ; implicit-def: $vgpr1 443; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 444; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 445; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 446; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 447; GFX1064-NEXT: s_cbranch_execz .LBB1_2 448; GFX1064-NEXT: ; %bb.1: 449; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 450; GFX1064-NEXT: s_mov_b32 s15, 0x31016000 451; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 452; GFX1064-NEXT: s_mul_i32 s6, s8, s6 453; GFX1064-NEXT: s_mov_b32 s14, -1 454; GFX1064-NEXT: v_mov_b32_e32 v1, s6 455; GFX1064-NEXT: s_mov_b32 s12, s2 456; GFX1064-NEXT: s_mov_b32 s13, s3 457; GFX1064-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc 458; GFX1064-NEXT: s_waitcnt vmcnt(0) 459; GFX1064-NEXT: buffer_gl1_inv 460; GFX1064-NEXT: buffer_gl0_inv 461; GFX1064-NEXT: .LBB1_2: 462; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 463; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 464; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 465; 
GFX1064-NEXT: v_readfirstlane_b32 s2, v1 466; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s8, v0, s[2:3] 467; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 468; GFX1064-NEXT: s_mov_b32 s2, -1 469; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 470; GFX1064-NEXT: s_endpgm 471; 472; GFX1032-LABEL: add_i32_uniform: 473; GFX1032: ; %bb.0: ; %entry 474; GFX1032-NEXT: s_clause 0x1 475; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 476; GFX1032-NEXT: s_load_dword s6, s[4:5], 0x34 477; GFX1032-NEXT: s_mov_b32 s7, exec_lo 478; GFX1032-NEXT: ; implicit-def: $vgpr1 479; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s7, 0 480; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 481; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 482; GFX1032-NEXT: s_cbranch_execz .LBB1_2 483; GFX1032-NEXT: ; %bb.1: 484; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s7 485; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 486; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 487; GFX1032-NEXT: s_mul_i32 s5, s6, s5 488; GFX1032-NEXT: s_mov_b32 s10, -1 489; GFX1032-NEXT: v_mov_b32_e32 v1, s5 490; GFX1032-NEXT: s_mov_b32 s8, s2 491; GFX1032-NEXT: s_mov_b32 s9, s3 492; GFX1032-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 493; GFX1032-NEXT: s_waitcnt vmcnt(0) 494; GFX1032-NEXT: buffer_gl1_inv 495; GFX1032-NEXT: buffer_gl0_inv 496; GFX1032-NEXT: .LBB1_2: 497; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 498; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 499; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 500; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 501; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, s6, v0, s[2:3] 502; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 503; GFX1032-NEXT: s_mov_b32 s2, -1 504; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 505; GFX1032-NEXT: s_endpgm 506; 507; GFX1164-LABEL: add_i32_uniform: 508; GFX1164: ; %bb.0: ; %entry 509; GFX1164-NEXT: s_clause 0x1 510; GFX1164-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 511; GFX1164-NEXT: s_load_b32 s8, s[4:5], 0x34 512; GFX1164-NEXT: s_mov_b64 s[6:7], exec 513; GFX1164-NEXT: s_mov_b64 s[4:5], exec 514; 
GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 515; GFX1164-NEXT: ; implicit-def: $vgpr1 516; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 517; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 518; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 519; GFX1164-NEXT: s_cbranch_execz .LBB1_2 520; GFX1164-NEXT: ; %bb.1: 521; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 522; GFX1164-NEXT: s_mov_b32 s15, 0x31016000 523; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 524; GFX1164-NEXT: s_mul_i32 s6, s8, s6 525; GFX1164-NEXT: s_mov_b32 s14, -1 526; GFX1164-NEXT: v_mov_b32_e32 v1, s6 527; GFX1164-NEXT: s_mov_b32 s12, s2 528; GFX1164-NEXT: s_mov_b32 s13, s3 529; GFX1164-NEXT: buffer_atomic_add_u32 v1, off, s[12:15], 0 glc 530; GFX1164-NEXT: s_waitcnt vmcnt(0) 531; GFX1164-NEXT: buffer_gl1_inv 532; GFX1164-NEXT: buffer_gl0_inv 533; GFX1164-NEXT: .LBB1_2: 534; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 535; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 536; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 537; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 538; GFX1164-NEXT: v_mad_u64_u32 v[1:2], null, s8, v0, s[2:3] 539; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 540; GFX1164-NEXT: s_mov_b32 s2, -1 541; GFX1164-NEXT: buffer_store_b32 v1, off, s[0:3], 0 542; GFX1164-NEXT: s_endpgm 543; 544; GFX1132-LABEL: add_i32_uniform: 545; GFX1132: ; %bb.0: ; %entry 546; GFX1132-NEXT: s_clause 0x1 547; GFX1132-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 548; GFX1132-NEXT: s_load_b32 s4, s[4:5], 0x34 549; GFX1132-NEXT: s_mov_b32 s6, exec_lo 550; GFX1132-NEXT: s_mov_b32 s5, exec_lo 551; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 552; GFX1132-NEXT: ; implicit-def: $vgpr1 553; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 554; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 555; GFX1132-NEXT: s_cbranch_execz .LBB1_2 556; GFX1132-NEXT: ; %bb.1: 557; GFX1132-NEXT: s_bcnt1_i32_b32 s6, s6 558; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 559; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 560; GFX1132-NEXT: s_mul_i32 s6, s4, s6 561; GFX1132-NEXT: 
s_mov_b32 s10, -1 562; GFX1132-NEXT: v_mov_b32_e32 v1, s6 563; GFX1132-NEXT: s_mov_b32 s8, s2 564; GFX1132-NEXT: s_mov_b32 s9, s3 565; GFX1132-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc 566; GFX1132-NEXT: s_waitcnt vmcnt(0) 567; GFX1132-NEXT: buffer_gl1_inv 568; GFX1132-NEXT: buffer_gl0_inv 569; GFX1132-NEXT: .LBB1_2: 570; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s5 571; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 572; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 573; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 574; GFX1132-NEXT: v_mad_u64_u32 v[1:2], null, s4, v0, s[2:3] 575; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 576; GFX1132-NEXT: s_mov_b32 s2, -1 577; GFX1132-NEXT: buffer_store_b32 v1, off, s[0:3], 0 578; GFX1132-NEXT: s_endpgm 579; 580; GFX1264-LABEL: add_i32_uniform: 581; GFX1264: ; %bb.0: ; %entry 582; GFX1264-NEXT: s_clause 0x1 583; GFX1264-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 584; GFX1264-NEXT: s_load_b32 s8, s[4:5], 0x34 585; GFX1264-NEXT: s_mov_b64 s[6:7], exec 586; GFX1264-NEXT: s_mov_b64 s[4:5], exec 587; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 588; GFX1264-NEXT: ; implicit-def: $vgpr1 589; GFX1264-NEXT: s_wait_alu 0xfffe 590; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 591; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 592; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0 593; GFX1264-NEXT: s_cbranch_execz .LBB1_2 594; GFX1264-NEXT: ; %bb.1: 595; GFX1264-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 596; GFX1264-NEXT: s_mov_b32 s15, 0x31016000 597; GFX1264-NEXT: s_wait_kmcnt 0x0 598; GFX1264-NEXT: s_wait_alu 0xfffe 599; GFX1264-NEXT: s_mul_i32 s6, s8, s6 600; GFX1264-NEXT: s_mov_b32 s14, -1 601; GFX1264-NEXT: s_wait_alu 0xfffe 602; GFX1264-NEXT: v_mov_b32_e32 v1, s6 603; GFX1264-NEXT: s_mov_b32 s12, s2 604; GFX1264-NEXT: s_mov_b32 s13, s3 605; GFX1264-NEXT: buffer_atomic_add_u32 v1, off, s[12:15], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV 606; GFX1264-NEXT: s_wait_loadcnt 0x0 607; GFX1264-NEXT: global_inv scope:SCOPE_DEV 608; 
GFX1264-NEXT: .LBB1_2: 609; GFX1264-NEXT: s_or_b64 exec, exec, s[4:5] 610; GFX1264-NEXT: s_wait_kmcnt 0x0 611; GFX1264-NEXT: v_readfirstlane_b32 s2, v1 612; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) 613; GFX1264-NEXT: v_mad_co_u64_u32 v[0:1], null, s8, v0, s[2:3] 614; GFX1264-NEXT: s_mov_b32 s3, 0x31016000 615; GFX1264-NEXT: s_mov_b32 s2, -1 616; GFX1264-NEXT: buffer_store_b32 v0, off, s[0:3], null 617; GFX1264-NEXT: s_endpgm 618; 619; GFX1232-LABEL: add_i32_uniform: 620; GFX1232: ; %bb.0: ; %entry 621; GFX1232-NEXT: s_clause 0x1 622; GFX1232-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 623; GFX1232-NEXT: s_load_b32 s4, s[4:5], 0x34 624; GFX1232-NEXT: s_mov_b32 s6, exec_lo 625; GFX1232-NEXT: s_mov_b32 s5, exec_lo 626; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 627; GFX1232-NEXT: ; implicit-def: $vgpr1 628; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) 629; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v0 630; GFX1232-NEXT: s_cbranch_execz .LBB1_2 631; GFX1232-NEXT: ; %bb.1: 632; GFX1232-NEXT: s_wait_alu 0xfffe 633; GFX1232-NEXT: s_bcnt1_i32_b32 s6, s6 634; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 635; GFX1232-NEXT: s_wait_kmcnt 0x0 636; GFX1232-NEXT: s_wait_alu 0xfffe 637; GFX1232-NEXT: s_mul_i32 s6, s4, s6 638; GFX1232-NEXT: s_mov_b32 s10, -1 639; GFX1232-NEXT: s_wait_alu 0xfffe 640; GFX1232-NEXT: v_mov_b32_e32 v1, s6 641; GFX1232-NEXT: s_mov_b32 s8, s2 642; GFX1232-NEXT: s_mov_b32 s9, s3 643; GFX1232-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV 644; GFX1232-NEXT: s_wait_loadcnt 0x0 645; GFX1232-NEXT: global_inv scope:SCOPE_DEV 646; GFX1232-NEXT: .LBB1_2: 647; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s5 648; GFX1232-NEXT: s_wait_kmcnt 0x0 649; GFX1232-NEXT: v_readfirstlane_b32 s2, v1 650; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) 651; GFX1232-NEXT: v_mad_co_u64_u32 v[0:1], null, s4, v0, s[2:3] 652; GFX1232-NEXT: s_mov_b32 s3, 0x31016000 653; GFX1232-NEXT: s_mov_b32 s2, -1 654; GFX1232-NEXT: buffer_store_b32 v0, off, s[0:3], null 
655; GFX1232-NEXT: s_endpgm 656entry: 657 %old = atomicrmw add ptr addrspace(1) %inout, i32 %additive syncscope("agent") acq_rel 658 store i32 %old, ptr addrspace(1) %out 659 ret void 660} 661 662define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(1) %inout) { 663; GFX7LESS_ITERATIVE-LABEL: add_i32_varying: 664; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry 665; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 666; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s6, 0 667; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1 668; GFX7LESS_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop 669; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 670; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] 671; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s2 672; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s2 673; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0 674; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 675; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] 676; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 677; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[2:3] 678; GFX7LESS_ITERATIVE-NEXT: s_add_i32 s6, s6, s7 679; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB2_1 680; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 681; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 682; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 683; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 684; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 685; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr0 686; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc 687; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 688; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 689; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: 690; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000 691; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s10, -1 692; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 693; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s8, s2 694; 
GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s9, s3 695; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, s6 696; GFX7LESS_ITERATIVE-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc 697; GFX7LESS_ITERATIVE-NEXT: s_waitcnt vmcnt(0) 698; GFX7LESS_ITERATIVE-NEXT: buffer_wbinvl1 699; GFX7LESS_ITERATIVE-NEXT: .LBB2_4: 700; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] 701; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 702; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 703; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 704; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 705; GFX7LESS_ITERATIVE-NEXT: s_waitcnt expcnt(0) 706; GFX7LESS_ITERATIVE-NEXT: v_add_i32_e32 v0, vcc, s4, v1 707; GFX7LESS_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 708; GFX7LESS_ITERATIVE-NEXT: s_endpgm 709; 710; GFX8_ITERATIVE-LABEL: add_i32_varying: 711; GFX8_ITERATIVE: ; %bb.0: ; %entry 712; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 713; GFX8_ITERATIVE-NEXT: s_mov_b32 s6, 0 714; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 715; GFX8_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop 716; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 717; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] 718; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s2 719; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s2 720; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 721; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0 722; GFX8_ITERATIVE-NEXT: s_add_i32 s6, s6, s7 723; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] 724; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 725; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 726; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 727; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 728; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 729; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 730; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 731; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr0 732; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc 733; GFX8_ITERATIVE-NEXT: 
s_xor_b64 s[4:5], exec, s[4:5] 734; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 735; GFX8_ITERATIVE-NEXT: ; %bb.3: 736; GFX8_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000 737; GFX8_ITERATIVE-NEXT: s_mov_b32 s10, -1 738; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 739; GFX8_ITERATIVE-NEXT: s_mov_b32 s8, s2 740; GFX8_ITERATIVE-NEXT: s_mov_b32 s9, s3 741; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, s6 742; GFX8_ITERATIVE-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc 743; GFX8_ITERATIVE-NEXT: s_waitcnt vmcnt(0) 744; GFX8_ITERATIVE-NEXT: buffer_wbinvl1_vol 745; GFX8_ITERATIVE-NEXT: .LBB2_4: 746; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] 747; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 748; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 749; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 750; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 751; GFX8_ITERATIVE-NEXT: v_add_u32_e32 v0, vcc, s4, v1 752; GFX8_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 753; GFX8_ITERATIVE-NEXT: s_endpgm 754; 755; GFX9_ITERATIVE-LABEL: add_i32_varying: 756; GFX9_ITERATIVE: ; %bb.0: ; %entry 757; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 758; GFX9_ITERATIVE-NEXT: s_mov_b32 s6, 0 759; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 760; GFX9_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop 761; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 762; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] 763; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s2 764; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s2 765; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 766; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0 767; GFX9_ITERATIVE-NEXT: s_add_i32 s6, s6, s7 768; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] 769; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 770; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 771; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 772; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 773; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 774; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, 
exec_hi, v0 775; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 776; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0 777; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc 778; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 779; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 780; GFX9_ITERATIVE-NEXT: ; %bb.3: 781; GFX9_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000 782; GFX9_ITERATIVE-NEXT: s_mov_b32 s10, -1 783; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 784; GFX9_ITERATIVE-NEXT: s_mov_b32 s8, s2 785; GFX9_ITERATIVE-NEXT: s_mov_b32 s9, s3 786; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s6 787; GFX9_ITERATIVE-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc 788; GFX9_ITERATIVE-NEXT: s_waitcnt vmcnt(0) 789; GFX9_ITERATIVE-NEXT: buffer_wbinvl1_vol 790; GFX9_ITERATIVE-NEXT: .LBB2_4: 791; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] 792; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 793; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 794; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 795; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 796; GFX9_ITERATIVE-NEXT: v_add_u32_e32 v0, s4, v1 797; GFX9_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 798; GFX9_ITERATIVE-NEXT: s_endpgm 799; 800; GFX1064_ITERATIVE-LABEL: add_i32_varying: 801; GFX1064_ITERATIVE: ; %bb.0: ; %entry 802; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 803; GFX1064_ITERATIVE-NEXT: s_mov_b32 s6, 0 804; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 805; GFX1064_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop 806; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 807; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s7, s[0:1] 808; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s7 809; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7 810; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s7 811; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] 812; GFX1064_ITERATIVE-NEXT: s_add_i32 s6, s6, s8 813; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 814; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 815; GFX1064_ITERATIVE-NEXT: 
; %bb.2: ; %ComputeEnd 816; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 817; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 818; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 819; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 820; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0 821; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc 822; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 823; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 824; GFX1064_ITERATIVE-NEXT: ; %bb.3: 825; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, s6 826; GFX1064_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 827; GFX1064_ITERATIVE-NEXT: s_mov_b32 s10, -1 828; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 829; GFX1064_ITERATIVE-NEXT: s_mov_b32 s8, s2 830; GFX1064_ITERATIVE-NEXT: s_mov_b32 s9, s3 831; GFX1064_ITERATIVE-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc 832; GFX1064_ITERATIVE-NEXT: s_waitcnt vmcnt(0) 833; GFX1064_ITERATIVE-NEXT: buffer_gl1_inv 834; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv 835; GFX1064_ITERATIVE-NEXT: .LBB2_4: 836; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 837; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] 838; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 839; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 840; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 841; GFX1064_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v1 842; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 843; GFX1064_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 844; GFX1064_ITERATIVE-NEXT: s_endpgm 845; 846; GFX1032_ITERATIVE-LABEL: add_i32_varying: 847; GFX1032_ITERATIVE: ; %bb.0: ; %entry 848; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo 849; GFX1032_ITERATIVE-NEXT: s_mov_b32 s6, 0 850; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 851; GFX1032_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop 852; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 853; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0 854; GFX1032_ITERATIVE-NEXT: 
v_readlane_b32 s2, v0, s1 855; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1 856; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s1 857; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s3 858; GFX1032_ITERATIVE-NEXT: s_add_i32 s6, s6, s2 859; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 860; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 861; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 862; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 863; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 864; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 865; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0 866; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo 867; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 868; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 869; GFX1032_ITERATIVE-NEXT: ; %bb.3: 870; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, s6 871; GFX1032_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 872; GFX1032_ITERATIVE-NEXT: s_mov_b32 s10, -1 873; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 874; GFX1032_ITERATIVE-NEXT: s_mov_b32 s8, s2 875; GFX1032_ITERATIVE-NEXT: s_mov_b32 s9, s3 876; GFX1032_ITERATIVE-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc 877; GFX1032_ITERATIVE-NEXT: s_waitcnt vmcnt(0) 878; GFX1032_ITERATIVE-NEXT: buffer_gl1_inv 879; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv 880; GFX1032_ITERATIVE-NEXT: .LBB2_4: 881; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 882; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 883; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 884; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 885; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 886; GFX1032_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v1 887; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 888; GFX1032_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 889; GFX1032_ITERATIVE-NEXT: s_endpgm 890; 891; GFX1164_ITERATIVE-LABEL: add_i32_varying: 892; GFX1164_ITERATIVE: ; %bb.0: ; %entry 893; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, 
v0 894; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 895; GFX1164_ITERATIVE-NEXT: s_mov_b32 s6, 0 896; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 897; GFX1164_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop 898; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 899; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1] 900; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 901; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7 902; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7 903; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7 904; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] 905; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) 906; GFX1164_ITERATIVE-NEXT: s_add_i32 s6, s6, s8 907; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 908; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 909; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 910; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 911; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 912; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 913; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 914; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 915; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1 916; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc 917; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 918; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 919; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 920; GFX1164_ITERATIVE-NEXT: ; %bb.3: 921; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, s6 922; GFX1164_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 923; GFX1164_ITERATIVE-NEXT: s_mov_b32 s10, -1 924; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 925; GFX1164_ITERATIVE-NEXT: s_mov_b32 s8, s2 926; GFX1164_ITERATIVE-NEXT: s_mov_b32 s9, s3 927; GFX1164_ITERATIVE-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc 928; GFX1164_ITERATIVE-NEXT: s_waitcnt vmcnt(0) 
929; GFX1164_ITERATIVE-NEXT: buffer_gl1_inv 930; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv 931; GFX1164_ITERATIVE-NEXT: .LBB2_4: 932; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] 933; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 934; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 935; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 936; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) 937; GFX1164_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v0 938; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 939; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 940; GFX1164_ITERATIVE-NEXT: s_endpgm 941; 942; GFX1132_ITERATIVE-LABEL: add_i32_varying: 943; GFX1132_ITERATIVE: ; %bb.0: ; %entry 944; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 945; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo 946; GFX1132_ITERATIVE-NEXT: s_mov_b32 s6, 0 947; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 948; GFX1132_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop 949; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 950; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 951; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 952; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v1, s1 953; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1 954; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1 955; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3 956; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) 957; GFX1132_ITERATIVE-NEXT: s_add_i32 s6, s6, s2 958; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 959; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 960; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 961; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 962; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 963; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 964; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 965; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 966; 
GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo 967; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 968; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 969; GFX1132_ITERATIVE-NEXT: ; %bb.3: 970; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v1, s6 971; GFX1132_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 972; GFX1132_ITERATIVE-NEXT: s_mov_b32 s10, -1 973; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 974; GFX1132_ITERATIVE-NEXT: s_mov_b32 s8, s2 975; GFX1132_ITERATIVE-NEXT: s_mov_b32 s9, s3 976; GFX1132_ITERATIVE-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc 977; GFX1132_ITERATIVE-NEXT: s_waitcnt vmcnt(0) 978; GFX1132_ITERATIVE-NEXT: buffer_gl1_inv 979; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv 980; GFX1132_ITERATIVE-NEXT: .LBB2_4: 981; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 982; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 983; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 984; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 985; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) 986; GFX1132_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v0 987; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 988; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 989; GFX1132_ITERATIVE-NEXT: s_endpgm 990; 991; GFX1264_ITERATIVE-LABEL: add_i32_varying: 992; GFX1264_ITERATIVE: ; %bb.0: ; %entry 993; GFX1264_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 994; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 995; GFX1264_ITERATIVE-NEXT: s_mov_b32 s6, 0 996; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0 997; GFX1264_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop 998; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 999; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1] 1000; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe 1001; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7 1002; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7 1003; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7 1004; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] 1005; 
GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) 1006; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s8 1007; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 1008; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 1009; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 1010; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1011; GFX1264_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 1012; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1013; GFX1264_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 1014; GFX1264_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 1015; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr1 1016; GFX1264_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc 1017; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1018; GFX1264_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 1019; GFX1264_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 1020; GFX1264_ITERATIVE-NEXT: ; %bb.3: 1021; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe 1022; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v1, s6 1023; GFX1264_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 1024; GFX1264_ITERATIVE-NEXT: s_mov_b32 s10, -1 1025; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0 1026; GFX1264_ITERATIVE-NEXT: s_mov_b32 s8, s2 1027; GFX1264_ITERATIVE-NEXT: s_mov_b32 s9, s3 1028; GFX1264_ITERATIVE-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV 1029; GFX1264_ITERATIVE-NEXT: s_wait_loadcnt 0x0 1030; GFX1264_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV 1031; GFX1264_ITERATIVE-NEXT: .LBB2_4: 1032; GFX1264_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] 1033; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0 1034; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 1035; GFX1264_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 1036; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) 1037; GFX1264_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v0 1038; GFX1264_ITERATIVE-NEXT: s_mov_b32 s2, -1 1039; GFX1264_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 
null 1040; GFX1264_ITERATIVE-NEXT: s_endpgm 1041; 1042; GFX1232_ITERATIVE-LABEL: add_i32_varying: 1043; GFX1232_ITERATIVE: ; %bb.0: ; %entry 1044; GFX1232_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 1045; GFX1232_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo 1046; GFX1232_ITERATIVE-NEXT: s_mov_b32 s6, 0 1047; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0 1048; GFX1232_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop 1049; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 1050; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe 1051; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 1052; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe 1053; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v1, s1 1054; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1 1055; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1 1056; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3 1057; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) 1058; GFX1232_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s2 1059; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe 1060; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 1061; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 1062; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 1063; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1064; GFX1232_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 1065; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 1066; GFX1232_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 1067; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr1 1068; GFX1232_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo 1069; GFX1232_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 1070; GFX1232_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 1071; GFX1232_ITERATIVE-NEXT: ; %bb.3: 1072; GFX1232_ITERATIVE-NEXT: v_mov_b32_e32 v1, s6 1073; GFX1232_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 1074; GFX1232_ITERATIVE-NEXT: s_mov_b32 s10, -1 1075; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0 1076; GFX1232_ITERATIVE-NEXT: s_mov_b32 s8, s2 1077; GFX1232_ITERATIVE-NEXT: s_mov_b32 
s9, s3 1078; GFX1232_ITERATIVE-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV 1079; GFX1232_ITERATIVE-NEXT: s_wait_loadcnt 0x0 1080; GFX1232_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV 1081; GFX1232_ITERATIVE-NEXT: .LBB2_4: 1082; GFX1232_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 1083; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0 1084; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 1085; GFX1232_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 1086; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) 1087; GFX1232_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v0 1088; GFX1232_ITERATIVE-NEXT: s_mov_b32 s2, -1 1089; GFX1232_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], null 1090; GFX1232_ITERATIVE-NEXT: s_endpgm 1091; 1092; GFX7LESS_DPP-LABEL: add_i32_varying: 1093; GFX7LESS_DPP: ; %bb.0: ; %entry 1094; GFX7LESS_DPP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1095; GFX7LESS_DPP-NEXT: s_mov_b32 s7, 0xf000 1096; GFX7LESS_DPP-NEXT: s_mov_b32 s6, -1 1097; GFX7LESS_DPP-NEXT: s_mov_b32 s10, s6 1098; GFX7LESS_DPP-NEXT: s_mov_b32 s11, s7 1099; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) 1100; GFX7LESS_DPP-NEXT: s_mov_b32 s8, s2 1101; GFX7LESS_DPP-NEXT: s_mov_b32 s9, s3 1102; GFX7LESS_DPP-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc 1103; GFX7LESS_DPP-NEXT: s_waitcnt vmcnt(0) 1104; GFX7LESS_DPP-NEXT: buffer_wbinvl1 1105; GFX7LESS_DPP-NEXT: s_mov_b32 s4, s0 1106; GFX7LESS_DPP-NEXT: s_mov_b32 s5, s1 1107; GFX7LESS_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 1108; GFX7LESS_DPP-NEXT: s_endpgm 1109; 1110; GFX8_DPP-LABEL: add_i32_varying: 1111; GFX8_DPP: ; %bb.0: ; %entry 1112; GFX8_DPP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1113; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 1114; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 1115; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] 1116; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 1117; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 1118; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 1119; GFX8_DPP-NEXT: 
v_cndmask_b32_e64 v2, 0, v0, s[4:5] 1120; GFX8_DPP-NEXT: s_nop 1 1121; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1122; GFX8_DPP-NEXT: s_nop 1 1123; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1124; GFX8_DPP-NEXT: s_nop 1 1125; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1126; GFX8_DPP-NEXT: s_nop 1 1127; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1128; GFX8_DPP-NEXT: s_nop 1 1129; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 1130; GFX8_DPP-NEXT: s_nop 1 1131; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 1132; GFX8_DPP-NEXT: v_readlane_b32 s6, v2, 63 1133; GFX8_DPP-NEXT: s_nop 0 1134; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 1135; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] 1136; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 1137; GFX8_DPP-NEXT: ; implicit-def: $vgpr0 1138; GFX8_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc 1139; GFX8_DPP-NEXT: s_cbranch_execz .LBB2_2 1140; GFX8_DPP-NEXT: ; %bb.1: 1141; GFX8_DPP-NEXT: s_mov_b32 s11, 0xf000 1142; GFX8_DPP-NEXT: s_mov_b32 s10, -1 1143; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) 1144; GFX8_DPP-NEXT: s_mov_b32 s8, s2 1145; GFX8_DPP-NEXT: s_mov_b32 s9, s3 1146; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s6 1147; GFX8_DPP-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc 1148; GFX8_DPP-NEXT: s_waitcnt vmcnt(0) 1149; GFX8_DPP-NEXT: buffer_wbinvl1_vol 1150; GFX8_DPP-NEXT: .LBB2_2: 1151; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[4:5] 1152; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 1153; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v1 1154; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) 1155; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 1156; GFX8_DPP-NEXT: s_mov_b32 s2, -1 1157; GFX8_DPP-NEXT: v_add_u32_e32 v0, vcc, s4, v0 1158; GFX8_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 
0 1159; GFX8_DPP-NEXT: s_endpgm 1160; 1161; GFX9_DPP-LABEL: add_i32_varying: 1162; GFX9_DPP: ; %bb.0: ; %entry 1163; GFX9_DPP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1164; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 1165; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 1166; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] 1167; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 1168; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 1169; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 1170; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[4:5] 1171; GFX9_DPP-NEXT: s_nop 1 1172; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1173; GFX9_DPP-NEXT: s_nop 1 1174; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1175; GFX9_DPP-NEXT: s_nop 1 1176; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1177; GFX9_DPP-NEXT: s_nop 1 1178; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1179; GFX9_DPP-NEXT: s_nop 1 1180; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 1181; GFX9_DPP-NEXT: s_nop 1 1182; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 1183; GFX9_DPP-NEXT: v_readlane_b32 s6, v2, 63 1184; GFX9_DPP-NEXT: s_nop 0 1185; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 1186; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] 1187; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 1188; GFX9_DPP-NEXT: ; implicit-def: $vgpr0 1189; GFX9_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc 1190; GFX9_DPP-NEXT: s_cbranch_execz .LBB2_2 1191; GFX9_DPP-NEXT: ; %bb.1: 1192; GFX9_DPP-NEXT: s_mov_b32 s11, 0xf000 1193; GFX9_DPP-NEXT: s_mov_b32 s10, -1 1194; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) 1195; GFX9_DPP-NEXT: s_mov_b32 s8, s2 1196; GFX9_DPP-NEXT: s_mov_b32 s9, s3 1197; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s6 1198; GFX9_DPP-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc 1199; GFX9_DPP-NEXT: 
s_waitcnt vmcnt(0) 1200; GFX9_DPP-NEXT: buffer_wbinvl1_vol 1201; GFX9_DPP-NEXT: .LBB2_2: 1202; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[4:5] 1203; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 1204; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v1 1205; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) 1206; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 1207; GFX9_DPP-NEXT: s_mov_b32 s2, -1 1208; GFX9_DPP-NEXT: v_add_u32_e32 v0, s4, v0 1209; GFX9_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 1210; GFX9_DPP-NEXT: s_endpgm 1211; 1212; GFX1064_DPP-LABEL: add_i32_varying: 1213; GFX1064_DPP: ; %bb.0: ; %entry 1214; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 1215; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] 1216; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 1217; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1218; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1219; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1220; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1221; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 1222; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1223; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 31 1224; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s2 1225; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 1226; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 15 1227; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 1228; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] 1229; GFX1064_DPP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1230; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 1231; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 31 1232; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 16 1233; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] 1234; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1235; 
GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 1236; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 47 1237; GFX1064_DPP-NEXT: v_readlane_b32 s9, v1, 63 1238; GFX1064_DPP-NEXT: v_writelane_b32 v3, s7, 32 1239; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] 1240; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1241; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 1242; GFX1064_DPP-NEXT: s_mov_b32 s4, s9 1243; GFX1064_DPP-NEXT: v_writelane_b32 v3, s8, 48 1244; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] 1245; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1246; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 1247; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 1248; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[8:9], vcc 1249; GFX1064_DPP-NEXT: s_cbranch_execz .LBB2_2 1250; GFX1064_DPP-NEXT: ; %bb.1: 1251; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s4 1252; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 1253; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) 1254; GFX1064_DPP-NEXT: s_mov_b32 s4, s2 1255; GFX1064_DPP-NEXT: s_mov_b32 s5, s3 1256; GFX1064_DPP-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc 1257; GFX1064_DPP-NEXT: s_waitcnt vmcnt(0) 1258; GFX1064_DPP-NEXT: buffer_gl1_inv 1259; GFX1064_DPP-NEXT: buffer_gl0_inv 1260; GFX1064_DPP-NEXT: .LBB2_2: 1261; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 1262; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[8:9] 1263; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) 1264; GFX1064_DPP-NEXT: v_readfirstlane_b32 s2, v0 1265; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 1266; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 1267; GFX1064_DPP-NEXT: v_add_nc_u32_e32 v0, s2, v0 1268; GFX1064_DPP-NEXT: s_mov_b32 s2, s6 1269; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 1270; GFX1064_DPP-NEXT: s_endpgm 1271; 1272; GFX1032_DPP-LABEL: add_i32_varying: 1273; GFX1032_DPP: ; %bb.0: ; %entry 1274; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 1275; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 1276; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1277; 
GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1278; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1279; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1280; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 1281; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 1282; GFX1032_DPP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1283; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 1284; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1285; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 1286; GFX1032_DPP-NEXT: v_readlane_b32 s6, v1, 31 1287; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 1288; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15 1289; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 1290; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1291; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 1292; GFX1032_DPP-NEXT: v_writelane_b32 v3, s5, 16 1293; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 1294; GFX1032_DPP-NEXT: s_mov_b32 s4, s6 1295; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 1296; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1297; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 1298; GFX1032_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo 1299; GFX1032_DPP-NEXT: s_cbranch_execz .LBB2_2 1300; GFX1032_DPP-NEXT: ; %bb.1: 1301; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s4 1302; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 1303; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) 1304; GFX1032_DPP-NEXT: s_mov_b32 s4, s2 1305; GFX1032_DPP-NEXT: s_mov_b32 s5, s3 1306; GFX1032_DPP-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc 1307; GFX1032_DPP-NEXT: s_waitcnt vmcnt(0) 1308; GFX1032_DPP-NEXT: buffer_gl1_inv 1309; GFX1032_DPP-NEXT: buffer_gl0_inv 1310; GFX1032_DPP-NEXT: .LBB2_2: 1311; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 1312; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 1313; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) 1314; 
GFX1032_DPP-NEXT: v_readfirstlane_b32 s2, v0 1315; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 1316; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 1317; GFX1032_DPP-NEXT: v_add_nc_u32_e32 v0, s2, v0 1318; GFX1032_DPP-NEXT: s_mov_b32 s2, s6 1319; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 1320; GFX1032_DPP-NEXT: s_endpgm 1321; 1322; GFX1164_DPP-LABEL: add_i32_varying: 1323; GFX1164_DPP: ; %bb.0: ; %entry 1324; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1325; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 1326; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 1327; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] 1328; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 1329; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 1330; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1331; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1332; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1333; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1334; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1335; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1336; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 1337; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1338; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1339; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 31 1340; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s2 1341; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1342; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 1343; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 15 1344; 
GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 1345; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] 1346; GFX1164_DPP-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1347; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 1348; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 31 1349; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 16 1350; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] 1351; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1352; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1353; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 1354; GFX1164_DPP-NEXT: v_readlane_b32 s8, v1, 47 1355; GFX1164_DPP-NEXT: v_readlane_b32 s9, v1, 63 1356; GFX1164_DPP-NEXT: v_writelane_b32 v3, s7, 32 1357; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] 1358; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1359; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1360; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 1361; GFX1164_DPP-NEXT: s_mov_b32 s4, s9 1362; GFX1164_DPP-NEXT: v_writelane_b32 v3, s8, 48 1363; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] 1364; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1365; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 1366; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0 1367; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[8:9], vcc 1368; GFX1164_DPP-NEXT: s_cbranch_execz .LBB2_2 1369; GFX1164_DPP-NEXT: ; %bb.1: 1370; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s4 1371; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 1372; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) 1373; GFX1164_DPP-NEXT: s_mov_b32 s4, s2 1374; GFX1164_DPP-NEXT: s_mov_b32 s5, s3 1375; GFX1164_DPP-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], 0 glc 1376; GFX1164_DPP-NEXT: s_waitcnt vmcnt(0) 1377; GFX1164_DPP-NEXT: buffer_gl1_inv 1378; GFX1164_DPP-NEXT: buffer_gl0_inv 1379; GFX1164_DPP-NEXT: .LBB2_2: 1380; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[8:9] 1381; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) 1382; GFX1164_DPP-NEXT: v_readfirstlane_b32 s2, v0 1383; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3 1384; GFX1164_DPP-NEXT: 
s_mov_b32 s3, 0x31016000 1385; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) 1386; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v0, s2, v0 1387; GFX1164_DPP-NEXT: s_mov_b32 s2, s6 1388; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 1389; GFX1164_DPP-NEXT: s_endpgm 1390; 1391; GFX1132_DPP-LABEL: add_i32_varying: 1392; GFX1132_DPP: ; %bb.0: ; %entry 1393; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1394; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 1395; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 1396; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 1397; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1398; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1399; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1400; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1401; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1402; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1403; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) 1404; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 1405; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 1406; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1407; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 1408; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1409; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 1410; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 1411; GFX1132_DPP-NEXT: v_readlane_b32 s6, v1, 31 1412; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 1413; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15 1414; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 1415; GFX1132_DPP-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 1416; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1417; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 1418; GFX1132_DPP-NEXT: v_writelane_b32 v3, s5, 16 1419; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 1420; GFX1132_DPP-NEXT: s_mov_b32 s4, s6 1421; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 1422; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1423; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 1424; GFX1132_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo 1425; GFX1132_DPP-NEXT: s_cbranch_execz .LBB2_2 1426; GFX1132_DPP-NEXT: ; %bb.1: 1427; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, s4 1428; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 1429; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) 1430; GFX1132_DPP-NEXT: s_mov_b32 s4, s2 1431; GFX1132_DPP-NEXT: s_mov_b32 s5, s3 1432; GFX1132_DPP-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], 0 glc 1433; GFX1132_DPP-NEXT: s_waitcnt vmcnt(0) 1434; GFX1132_DPP-NEXT: buffer_gl1_inv 1435; GFX1132_DPP-NEXT: buffer_gl0_inv 1436; GFX1132_DPP-NEXT: .LBB2_2: 1437; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 1438; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) 1439; GFX1132_DPP-NEXT: v_readfirstlane_b32 s2, v0 1440; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3 1441; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 1442; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) 1443; GFX1132_DPP-NEXT: v_add_nc_u32_e32 v0, s2, v0 1444; GFX1132_DPP-NEXT: s_mov_b32 s2, s6 1445; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 1446; GFX1132_DPP-NEXT: s_endpgm 1447; 1448; GFX1264_DPP-LABEL: add_i32_varying: 1449; GFX1264_DPP: ; %bb.0: ; %entry 1450; GFX1264_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1451; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 1452; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 1453; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] 1454; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, 0 1455; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 1456; 
GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1457; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1458; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1459; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1460; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1461; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1462; GFX1264_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 1463; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1464; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1465; GFX1264_DPP-NEXT: v_readlane_b32 s2, v1, 31 1466; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, s2 1467; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1468; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 1469; GFX1264_DPP-NEXT: v_readlane_b32 s6, v1, 15 1470; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 1471; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] 1472; GFX1264_DPP-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1473; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 1474; GFX1264_DPP-NEXT: v_readlane_b32 s7, v1, 31 1475; GFX1264_DPP-NEXT: v_writelane_b32 v3, s6, 16 1476; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5] 1477; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1478; GFX1264_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1479; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 1480; GFX1264_DPP-NEXT: v_readlane_b32 s8, v1, 47 1481; GFX1264_DPP-NEXT: v_readlane_b32 s9, v1, 63 1482; GFX1264_DPP-NEXT: v_writelane_b32 v3, s7, 32 1483; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5] 1484; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1485; 
GFX1264_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1486; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 1487; GFX1264_DPP-NEXT: s_mov_b32 s4, s9 1488; GFX1264_DPP-NEXT: v_writelane_b32 v3, s8, 48 1489; GFX1264_DPP-NEXT: s_wait_alu 0xfffe 1490; GFX1264_DPP-NEXT: s_mov_b64 exec, s[6:7] 1491; GFX1264_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1492; GFX1264_DPP-NEXT: s_mov_b32 s6, -1 1493; GFX1264_DPP-NEXT: ; implicit-def: $vgpr0 1494; GFX1264_DPP-NEXT: s_and_saveexec_b64 s[8:9], vcc 1495; GFX1264_DPP-NEXT: s_cbranch_execz .LBB2_2 1496; GFX1264_DPP-NEXT: ; %bb.1: 1497; GFX1264_DPP-NEXT: v_mov_b32_e32 v0, s4 1498; GFX1264_DPP-NEXT: s_mov_b32 s7, 0x31016000 1499; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0 1500; GFX1264_DPP-NEXT: s_mov_b32 s4, s2 1501; GFX1264_DPP-NEXT: s_mov_b32 s5, s3 1502; GFX1264_DPP-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV 1503; GFX1264_DPP-NEXT: s_wait_loadcnt 0x0 1504; GFX1264_DPP-NEXT: global_inv scope:SCOPE_DEV 1505; GFX1264_DPP-NEXT: .LBB2_2: 1506; GFX1264_DPP-NEXT: s_wait_alu 0xfffe 1507; GFX1264_DPP-NEXT: s_or_b64 exec, exec, s[8:9] 1508; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0 1509; GFX1264_DPP-NEXT: v_readfirstlane_b32 s2, v0 1510; GFX1264_DPP-NEXT: v_mov_b32_e32 v0, v3 1511; GFX1264_DPP-NEXT: s_mov_b32 s3, 0x31016000 1512; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) 1513; GFX1264_DPP-NEXT: v_add_nc_u32_e32 v0, s2, v0 1514; GFX1264_DPP-NEXT: s_mov_b32 s2, s6 1515; GFX1264_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], null 1516; GFX1264_DPP-NEXT: s_endpgm 1517; 1518; GFX1232_DPP-LABEL: add_i32_varying: 1519; GFX1232_DPP: ; %bb.0: ; %entry 1520; GFX1232_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1521; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1 1522; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 1523; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 1524; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1525; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, 
v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1526; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1527; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1528; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1529; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1530; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) 1531; GFX1232_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 1532; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 1533; GFX1232_DPP-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1534; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1 1535; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1536; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, 0 1537; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 1538; GFX1232_DPP-NEXT: v_readlane_b32 s6, v1, 31 1539; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 1540; GFX1232_DPP-NEXT: v_readlane_b32 s5, v1, 15 1541; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 1542; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 1543; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1544; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1 1545; GFX1232_DPP-NEXT: v_writelane_b32 v3, s5, 16 1546; GFX1232_DPP-NEXT: s_wait_alu 0xfffe 1547; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 1548; GFX1232_DPP-NEXT: s_mov_b32 s4, s6 1549; GFX1232_DPP-NEXT: s_mov_b32 s6, -1 1550; GFX1232_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1551; GFX1232_DPP-NEXT: ; implicit-def: $vgpr0 1552; GFX1232_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo 1553; GFX1232_DPP-NEXT: s_cbranch_execz .LBB2_2 1554; GFX1232_DPP-NEXT: ; %bb.1: 1555; GFX1232_DPP-NEXT: s_wait_alu 0xfffe 1556; GFX1232_DPP-NEXT: v_mov_b32_e32 v0, s4 1557; 
GFX1232_DPP-NEXT: s_mov_b32 s7, 0x31016000 1558; GFX1232_DPP-NEXT: s_wait_kmcnt 0x0 1559; GFX1232_DPP-NEXT: s_mov_b32 s4, s2 1560; GFX1232_DPP-NEXT: s_mov_b32 s5, s3 1561; GFX1232_DPP-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV 1562; GFX1232_DPP-NEXT: s_wait_loadcnt 0x0 1563; GFX1232_DPP-NEXT: global_inv scope:SCOPE_DEV 1564; GFX1232_DPP-NEXT: .LBB2_2: 1565; GFX1232_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 1566; GFX1232_DPP-NEXT: s_wait_kmcnt 0x0 1567; GFX1232_DPP-NEXT: v_readfirstlane_b32 s2, v0 1568; GFX1232_DPP-NEXT: v_mov_b32_e32 v0, v3 1569; GFX1232_DPP-NEXT: s_mov_b32 s3, 0x31016000 1570; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) 1571; GFX1232_DPP-NEXT: v_add_nc_u32_e32 v0, s2, v0 1572; GFX1232_DPP-NEXT: s_mov_b32 s2, s6 1573; GFX1232_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], null 1574; GFX1232_DPP-NEXT: s_endpgm 1575entry: 1576 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1577 %old = atomicrmw add ptr addrspace(1) %inout, i32 %lane syncscope("agent") acq_rel 1578 store i32 %old, ptr addrspace(1) %out 1579 ret void 1580} 1581 1582define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace(1) %inout) { 1583; GFX7LESS-LABEL: add_i64_constant: 1584; GFX7LESS: ; %bb.0: ; %entry 1585; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 1586; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1587; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 1588; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 1589; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1590; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 1591; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 1592; GFX7LESS-NEXT: s_cbranch_execz .LBB3_2 1593; GFX7LESS-NEXT: ; %bb.1: 1594; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 1595; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1596; GFX7LESS-NEXT: s_mul_i32 s6, s6, 5 1597; GFX7LESS-NEXT: s_mov_b32 s10, -1 1598; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1599; GFX7LESS-NEXT: s_mov_b32 s8, s2 1600; GFX7LESS-NEXT: s_mov_b32 s9, s3 
1601; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 1602; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 1603; GFX7LESS-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc 1604; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 1605; GFX7LESS-NEXT: buffer_wbinvl1 1606; GFX7LESS-NEXT: .LBB3_2: 1607; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 1608; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1609; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1610; GFX7LESS-NEXT: s_mov_b32 s2, -1 1611; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 1612; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v0 1613; GFX7LESS-NEXT: s_waitcnt expcnt(0) 1614; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 1615; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2 1616; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 1617; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s5, v0 1618; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc 1619; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1620; GFX7LESS-NEXT: s_endpgm 1621; 1622; GFX8-LABEL: add_i64_constant: 1623; GFX8: ; %bb.0: ; %entry 1624; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1625; GFX8-NEXT: s_mov_b64 s[6:7], exec 1626; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1627; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 1628; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1629; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 1630; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 1631; GFX8-NEXT: s_cbranch_execz .LBB3_2 1632; GFX8-NEXT: ; %bb.1: 1633; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1634; GFX8-NEXT: s_mov_b32 s8, s2 1635; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[6:7] 1636; GFX8-NEXT: s_mul_i32 s2, s2, 5 1637; GFX8-NEXT: s_mov_b32 s11, 0xf000 1638; GFX8-NEXT: s_mov_b32 s10, -1 1639; GFX8-NEXT: s_mov_b32 s9, s3 1640; GFX8-NEXT: v_mov_b32_e32 v0, s2 1641; GFX8-NEXT: v_mov_b32_e32 v1, 0 1642; GFX8-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc 1643; GFX8-NEXT: s_waitcnt vmcnt(0) 1644; GFX8-NEXT: buffer_wbinvl1_vol 1645; GFX8-NEXT: .LBB3_2: 1646; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1647; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1648; GFX8-NEXT: v_readfirstlane_b32 
s2, v1 1649; GFX8-NEXT: v_readfirstlane_b32 s3, v0 1650; GFX8-NEXT: v_mov_b32_e32 v0, s3 1651; GFX8-NEXT: v_mov_b32_e32 v1, s2 1652; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1] 1653; GFX8-NEXT: s_mov_b32 s3, 0xf000 1654; GFX8-NEXT: s_mov_b32 s2, -1 1655; GFX8-NEXT: s_nop 2 1656; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1657; GFX8-NEXT: s_endpgm 1658; 1659; GFX9-LABEL: add_i64_constant: 1660; GFX9: ; %bb.0: ; %entry 1661; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1662; GFX9-NEXT: s_mov_b64 s[6:7], exec 1663; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1664; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 1665; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1666; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 1667; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 1668; GFX9-NEXT: s_cbranch_execz .LBB3_2 1669; GFX9-NEXT: ; %bb.1: 1670; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1671; GFX9-NEXT: s_mov_b32 s8, s2 1672; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[6:7] 1673; GFX9-NEXT: s_mul_i32 s2, s2, 5 1674; GFX9-NEXT: s_mov_b32 s11, 0xf000 1675; GFX9-NEXT: s_mov_b32 s10, -1 1676; GFX9-NEXT: s_mov_b32 s9, s3 1677; GFX9-NEXT: v_mov_b32_e32 v0, s2 1678; GFX9-NEXT: v_mov_b32_e32 v1, 0 1679; GFX9-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc 1680; GFX9-NEXT: s_waitcnt vmcnt(0) 1681; GFX9-NEXT: buffer_wbinvl1_vol 1682; GFX9-NEXT: .LBB3_2: 1683; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 1684; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1685; GFX9-NEXT: v_readfirstlane_b32 s2, v1 1686; GFX9-NEXT: v_readfirstlane_b32 s3, v0 1687; GFX9-NEXT: v_mov_b32_e32 v0, s3 1688; GFX9-NEXT: v_mov_b32_e32 v1, s2 1689; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1] 1690; GFX9-NEXT: s_mov_b32 s3, 0xf000 1691; GFX9-NEXT: s_mov_b32 s2, -1 1692; GFX9-NEXT: s_nop 2 1693; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1694; GFX9-NEXT: s_endpgm 1695; 1696; GFX1064-LABEL: add_i64_constant: 1697; GFX1064: ; %bb.0: ; %entry 1698; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1699; GFX1064-NEXT: s_mov_b64 s[6:7], 
exec 1700; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1701; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 1702; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 1703; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1704; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 1705; GFX1064-NEXT: s_cbranch_execz .LBB3_2 1706; GFX1064-NEXT: ; %bb.1: 1707; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1708; GFX1064-NEXT: v_mov_b32_e32 v1, 0 1709; GFX1064-NEXT: s_mul_i32 s6, s6, 5 1710; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 1711; GFX1064-NEXT: v_mov_b32_e32 v0, s6 1712; GFX1064-NEXT: s_mov_b32 s10, -1 1713; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1714; GFX1064-NEXT: s_mov_b32 s8, s2 1715; GFX1064-NEXT: s_mov_b32 s9, s3 1716; GFX1064-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc 1717; GFX1064-NEXT: s_waitcnt vmcnt(0) 1718; GFX1064-NEXT: buffer_gl1_inv 1719; GFX1064-NEXT: buffer_gl0_inv 1720; GFX1064-NEXT: .LBB3_2: 1721; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1722; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 1723; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1724; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 1725; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 1726; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, s[2:3] 1727; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1728; GFX1064-NEXT: s_mov_b32 s2, -1 1729; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1730; GFX1064-NEXT: s_endpgm 1731; 1732; GFX1032-LABEL: add_i64_constant: 1733; GFX1032: ; %bb.0: ; %entry 1734; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1735; GFX1032-NEXT: s_mov_b32 s6, exec_lo 1736; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 1737; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s6, 0 1738; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 1739; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 1740; GFX1032-NEXT: s_cbranch_execz .LBB3_2 1741; GFX1032-NEXT: ; %bb.1: 1742; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s6 1743; GFX1032-NEXT: v_mov_b32_e32 v1, 0 1744; GFX1032-NEXT: s_mul_i32 s5, s5, 5 1745; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 1746; 
GFX1032-NEXT: v_mov_b32_e32 v0, s5 1747; GFX1032-NEXT: s_mov_b32 s10, -1 1748; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1749; GFX1032-NEXT: s_mov_b32 s8, s2 1750; GFX1032-NEXT: s_mov_b32 s9, s3 1751; GFX1032-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc 1752; GFX1032-NEXT: s_waitcnt vmcnt(0) 1753; GFX1032-NEXT: buffer_gl1_inv 1754; GFX1032-NEXT: buffer_gl0_inv 1755; GFX1032-NEXT: .LBB3_2: 1756; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1757; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 1758; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1759; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 1760; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 1761; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v2, 5, s[2:3] 1762; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1763; GFX1032-NEXT: s_mov_b32 s2, -1 1764; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1765; GFX1032-NEXT: s_endpgm 1766; 1767; GFX1164-LABEL: add_i64_constant: 1768; GFX1164: ; %bb.0: ; %entry 1769; GFX1164-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1770; GFX1164-NEXT: s_mov_b64 s[6:7], exec 1771; GFX1164-NEXT: s_mov_b64 s[4:5], exec 1772; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1773; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1774; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 1775; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 1776; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 1777; GFX1164-NEXT: s_cbranch_execz .LBB3_2 1778; GFX1164-NEXT: ; %bb.1: 1779; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1780; GFX1164-NEXT: v_mov_b32_e32 v1, 0 1781; GFX1164-NEXT: s_mul_i32 s6, s6, 5 1782; GFX1164-NEXT: s_mov_b32 s11, 0x31016000 1783; GFX1164-NEXT: v_mov_b32_e32 v0, s6 1784; GFX1164-NEXT: s_mov_b32 s10, -1 1785; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 1786; GFX1164-NEXT: s_mov_b32 s8, s2 1787; GFX1164-NEXT: s_mov_b32 s9, s3 1788; GFX1164-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc 1789; GFX1164-NEXT: s_waitcnt vmcnt(0) 1790; GFX1164-NEXT: buffer_gl1_inv 1791; GFX1164-NEXT: buffer_gl0_inv 1792; GFX1164-NEXT: 
.LBB3_2: 1793; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 1794; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 1795; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 1796; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 1797; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 1798; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3] 1799; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 1800; GFX1164-NEXT: s_mov_b32 s2, -1 1801; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 1802; GFX1164-NEXT: s_endpgm 1803; 1804; GFX1132-LABEL: add_i64_constant: 1805; GFX1132: ; %bb.0: ; %entry 1806; GFX1132-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1807; GFX1132-NEXT: s_mov_b32 s6, exec_lo 1808; GFX1132-NEXT: s_mov_b32 s4, exec_lo 1809; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s6, 0 1810; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 1811; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 1812; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 1813; GFX1132-NEXT: s_cbranch_execz .LBB3_2 1814; GFX1132-NEXT: ; %bb.1: 1815; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s6 1816; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 1817; GFX1132-NEXT: s_mul_i32 s5, s5, 5 1818; GFX1132-NEXT: s_mov_b32 s10, -1 1819; GFX1132-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, 0 1820; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 1821; GFX1132-NEXT: s_mov_b32 s8, s2 1822; GFX1132-NEXT: s_mov_b32 s9, s3 1823; GFX1132-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc 1824; GFX1132-NEXT: s_waitcnt vmcnt(0) 1825; GFX1132-NEXT: buffer_gl1_inv 1826; GFX1132-NEXT: buffer_gl0_inv 1827; GFX1132-NEXT: .LBB3_2: 1828; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4 1829; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 1830; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 1831; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 1832; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 1833; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3] 1834; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 1835; GFX1132-NEXT: s_mov_b32 s2, -1 1836; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 1837; GFX1132-NEXT: s_endpgm 1838; 1839; 
GFX1264-LABEL: add_i64_constant: 1840; GFX1264: ; %bb.0: ; %entry 1841; GFX1264-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1842; GFX1264-NEXT: s_mov_b64 s[6:7], exec 1843; GFX1264-NEXT: s_mov_b32 s9, 0 1844; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1845; GFX1264-NEXT: s_mov_b64 s[4:5], exec 1846; GFX1264-NEXT: s_wait_alu 0xfffe 1847; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1848; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 1849; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1 1850; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v2 1851; GFX1264-NEXT: s_cbranch_execz .LBB3_2 1852; GFX1264-NEXT: ; %bb.1: 1853; GFX1264-NEXT: s_bcnt1_i32_b64 s8, s[6:7] 1854; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 1855; GFX1264-NEXT: s_mul_u64 s[6:7], s[8:9], 5 1856; GFX1264-NEXT: s_mov_b32 s10, -1 1857; GFX1264-NEXT: s_wait_alu 0xfffe 1858; GFX1264-NEXT: v_mov_b32_e32 v0, s6 1859; GFX1264-NEXT: v_mov_b32_e32 v1, s7 1860; GFX1264-NEXT: s_wait_kmcnt 0x0 1861; GFX1264-NEXT: s_mov_b32 s8, s2 1862; GFX1264-NEXT: s_mov_b32 s9, s3 1863; GFX1264-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV 1864; GFX1264-NEXT: s_wait_loadcnt 0x0 1865; GFX1264-NEXT: global_inv scope:SCOPE_DEV 1866; GFX1264-NEXT: .LBB3_2: 1867; GFX1264-NEXT: s_or_b64 exec, exec, s[4:5] 1868; GFX1264-NEXT: s_wait_kmcnt 0x0 1869; GFX1264-NEXT: v_readfirstlane_b32 s3, v1 1870; GFX1264-NEXT: v_readfirstlane_b32 s2, v0 1871; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) 1872; GFX1264-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, 5, s[2:3] 1873; GFX1264-NEXT: s_mov_b32 s3, 0x31016000 1874; GFX1264-NEXT: s_mov_b32 s2, -1 1875; GFX1264-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null 1876; GFX1264-NEXT: s_endpgm 1877; 1878; GFX1232-LABEL: add_i64_constant: 1879; GFX1232: ; %bb.0: ; %entry 1880; GFX1232-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1881; GFX1232-NEXT: s_mov_b32 s7, exec_lo 1882; GFX1232-NEXT: s_mov_b32 s5, 0 1883; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s7, 0 1884; 
GFX1232-NEXT: s_mov_b32 s6, exec_lo 1885; GFX1232-NEXT: ; implicit-def: $vgpr0_vgpr1 1886; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) 1887; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2 1888; GFX1232-NEXT: s_cbranch_execz .LBB3_2 1889; GFX1232-NEXT: ; %bb.1: 1890; GFX1232-NEXT: s_wait_alu 0xfffe 1891; GFX1232-NEXT: s_bcnt1_i32_b32 s4, s7 1892; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 1893; GFX1232-NEXT: s_mul_u64 s[4:5], s[4:5], 5 1894; GFX1232-NEXT: s_mov_b32 s10, -1 1895; GFX1232-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 1896; GFX1232-NEXT: s_wait_kmcnt 0x0 1897; GFX1232-NEXT: s_mov_b32 s8, s2 1898; GFX1232-NEXT: s_mov_b32 s9, s3 1899; GFX1232-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV 1900; GFX1232-NEXT: s_wait_loadcnt 0x0 1901; GFX1232-NEXT: global_inv scope:SCOPE_DEV 1902; GFX1232-NEXT: .LBB3_2: 1903; GFX1232-NEXT: s_wait_alu 0xfffe 1904; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s6 1905; GFX1232-NEXT: s_wait_kmcnt 0x0 1906; GFX1232-NEXT: v_readfirstlane_b32 s3, v1 1907; GFX1232-NEXT: v_readfirstlane_b32 s2, v0 1908; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) 1909; GFX1232-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, 5, s[2:3] 1910; GFX1232-NEXT: s_mov_b32 s3, 0x31016000 1911; GFX1232-NEXT: s_mov_b32 s2, -1 1912; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null 1913; GFX1232-NEXT: s_endpgm 1914entry: 1915 %old = atomicrmw add ptr addrspace(1) %inout, i64 5 syncscope("agent") acq_rel 1916 store i64 %old, ptr addrspace(1) %out 1917 ret void 1918} 1919 1920define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(1) %inout, i64 %additive) { 1921; GFX7LESS-LABEL: add_i64_uniform: 1922; GFX7LESS: ; %bb.0: ; %entry 1923; GFX7LESS-NEXT: s_mov_b64 s[8:9], exec 1924; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1925; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 1926; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 1927; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s9, v0 1928; 
GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1929; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 1930; GFX7LESS-NEXT: s_and_saveexec_b64 s[6:7], vcc 1931; GFX7LESS-NEXT: s_cbranch_execz .LBB4_2 1932; GFX7LESS-NEXT: ; %bb.1: 1933; GFX7LESS-NEXT: s_mov_b32 s15, 0xf000 1934; GFX7LESS-NEXT: s_mov_b32 s14, -1 1935; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1936; GFX7LESS-NEXT: s_mov_b32 s12, s2 1937; GFX7LESS-NEXT: s_mov_b32 s13, s3 1938; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[8:9] 1939; GFX7LESS-NEXT: s_mul_i32 s3, s5, s2 1940; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2 1941; GFX7LESS-NEXT: v_mul_hi_u32 v0, s4, v0 1942; GFX7LESS-NEXT: s_mul_i32 s2, s4, s2 1943; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, s3, v0 1944; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2 1945; GFX7LESS-NEXT: buffer_atomic_add_x2 v[0:1], off, s[12:15], 0 glc 1946; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 1947; GFX7LESS-NEXT: buffer_wbinvl1 1948; GFX7LESS-NEXT: .LBB4_2: 1949; GFX7LESS-NEXT: s_or_b64 exec, exec, s[6:7] 1950; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1951; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1952; GFX7LESS-NEXT: s_mov_b32 s2, -1 1953; GFX7LESS-NEXT: v_readfirstlane_b32 s6, v1 1954; GFX7LESS-NEXT: v_readfirstlane_b32 s7, v0 1955; GFX7LESS-NEXT: s_waitcnt expcnt(0) 1956; GFX7LESS-NEXT: v_mul_lo_u32 v0, s5, v2 1957; GFX7LESS-NEXT: v_mul_hi_u32 v1, s4, v2 1958; GFX7LESS-NEXT: v_mul_lo_u32 v2, s4, v2 1959; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v1, v0 1960; GFX7LESS-NEXT: v_mov_b32_e32 v3, s6 1961; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s7, v2 1962; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc 1963; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1964; GFX7LESS-NEXT: s_endpgm 1965; 1966; GFX8-LABEL: add_i64_uniform: 1967; GFX8: ; %bb.0: ; %entry 1968; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1969; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 1970; GFX8-NEXT: s_mov_b64 s[8:9], exec 1971; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 1972; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 1973; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 
0, v2 1974; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 1975; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc 1976; GFX8-NEXT: s_cbranch_execz .LBB4_2 1977; GFX8-NEXT: ; %bb.1: 1978; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1979; GFX8-NEXT: s_mov_b32 s12, s2 1980; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[8:9] 1981; GFX8-NEXT: v_mov_b32_e32 v0, s2 1982; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[8:9], s4, v0, 0 1983; GFX8-NEXT: s_mul_i32 s2, s5, s2 1984; GFX8-NEXT: s_mov_b32 s15, 0xf000 1985; GFX8-NEXT: s_mov_b32 s14, -1 1986; GFX8-NEXT: s_mov_b32 s13, s3 1987; GFX8-NEXT: v_add_u32_e32 v1, vcc, s2, v1 1988; GFX8-NEXT: buffer_atomic_add_x2 v[0:1], off, s[12:15], 0 glc 1989; GFX8-NEXT: s_waitcnt vmcnt(0) 1990; GFX8-NEXT: buffer_wbinvl1_vol 1991; GFX8-NEXT: .LBB4_2: 1992; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 1993; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1994; GFX8-NEXT: v_readfirstlane_b32 s2, v1 1995; GFX8-NEXT: v_readfirstlane_b32 s3, v0 1996; GFX8-NEXT: v_mov_b32_e32 v0, s3 1997; GFX8-NEXT: v_mov_b32_e32 v1, s2 1998; GFX8-NEXT: v_mul_lo_u32 v3, s5, v2 1999; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s4, v2, v[0:1] 2000; GFX8-NEXT: s_mov_b32 s3, 0xf000 2001; GFX8-NEXT: s_mov_b32 s2, -1 2002; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 2003; GFX8-NEXT: s_nop 1 2004; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2005; GFX8-NEXT: s_endpgm 2006; 2007; GFX9-LABEL: add_i64_uniform: 2008; GFX9: ; %bb.0: ; %entry 2009; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2010; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 2011; GFX9-NEXT: s_mov_b64 s[8:9], exec 2012; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 2013; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 2014; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2015; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 2016; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 2017; GFX9-NEXT: s_cbranch_execz .LBB4_2 2018; GFX9-NEXT: ; %bb.1: 2019; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2020; GFX9-NEXT: s_mov_b32 s12, s2 2021; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[8:9] 2022; GFX9-NEXT: s_mov_b32 s13, s3 2023; 
GFX9-NEXT: s_mul_i32 s3, s7, s2 2024; GFX9-NEXT: s_mul_hi_u32 s8, s6, s2 2025; GFX9-NEXT: s_add_i32 s8, s8, s3 2026; GFX9-NEXT: s_mul_i32 s2, s6, s2 2027; GFX9-NEXT: s_mov_b32 s15, 0xf000 2028; GFX9-NEXT: s_mov_b32 s14, -1 2029; GFX9-NEXT: v_mov_b32_e32 v0, s2 2030; GFX9-NEXT: v_mov_b32_e32 v1, s8 2031; GFX9-NEXT: buffer_atomic_add_x2 v[0:1], off, s[12:15], 0 glc 2032; GFX9-NEXT: s_waitcnt vmcnt(0) 2033; GFX9-NEXT: buffer_wbinvl1_vol 2034; GFX9-NEXT: .LBB4_2: 2035; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 2036; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2037; GFX9-NEXT: v_readfirstlane_b32 s2, v1 2038; GFX9-NEXT: v_readfirstlane_b32 s3, v0 2039; GFX9-NEXT: v_mov_b32_e32 v0, s3 2040; GFX9-NEXT: v_mov_b32_e32 v1, s2 2041; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s6, v2, v[0:1] 2042; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[2:3], s7, v2, v[1:2] 2043; GFX9-NEXT: s_mov_b32 s3, 0xf000 2044; GFX9-NEXT: s_mov_b32 s2, -1 2045; GFX9-NEXT: s_nop 2 2046; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2047; GFX9-NEXT: s_endpgm 2048; 2049; GFX1064-LABEL: add_i64_uniform: 2050; GFX1064: ; %bb.0: ; %entry 2051; GFX1064-NEXT: s_clause 0x1 2052; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2053; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 2054; GFX1064-NEXT: s_mov_b64 s[8:9], exec 2055; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 2056; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 2057; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 2058; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2059; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 2060; GFX1064-NEXT: s_cbranch_execz .LBB4_2 2061; GFX1064-NEXT: ; %bb.1: 2062; GFX1064-NEXT: s_bcnt1_i32_b64 s8, s[8:9] 2063; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 2064; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2065; GFX1064-NEXT: s_mul_i32 s9, s7, s8 2066; GFX1064-NEXT: s_mul_hi_u32 s10, s6, s8 2067; GFX1064-NEXT: s_mul_i32 s8, s6, s8 2068; GFX1064-NEXT: s_add_i32 s10, s10, s9 2069; GFX1064-NEXT: v_mov_b32_e32 v0, s8 2070; GFX1064-NEXT: v_mov_b32_e32 v1, s10 2071; 
GFX1064-NEXT: s_mov_b32 s10, -1 2072; GFX1064-NEXT: s_mov_b32 s8, s2 2073; GFX1064-NEXT: s_mov_b32 s9, s3 2074; GFX1064-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc 2075; GFX1064-NEXT: s_waitcnt vmcnt(0) 2076; GFX1064-NEXT: buffer_gl1_inv 2077; GFX1064-NEXT: buffer_gl0_inv 2078; GFX1064-NEXT: .LBB4_2: 2079; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2080; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 2081; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2082; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 2083; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 2084; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s6, v2, s[2:3] 2085; GFX1064-NEXT: v_mad_u64_u32 v[1:2], s[2:3], s7, v2, v[1:2] 2086; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2087; GFX1064-NEXT: s_mov_b32 s2, -1 2088; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2089; GFX1064-NEXT: s_endpgm 2090; 2091; GFX1032-LABEL: add_i64_uniform: 2092; GFX1032: ; %bb.0: ; %entry 2093; GFX1032-NEXT: s_clause 0x1 2094; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2095; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 2096; GFX1032-NEXT: s_mov_b32 s8, exec_lo 2097; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 2098; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s8, 0 2099; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 2100; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 2101; GFX1032-NEXT: s_cbranch_execz .LBB4_2 2102; GFX1032-NEXT: ; %bb.1: 2103; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s8 2104; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 2105; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2106; GFX1032-NEXT: s_mul_i32 s8, s7, s5 2107; GFX1032-NEXT: s_mul_hi_u32 s9, s6, s5 2108; GFX1032-NEXT: s_mul_i32 s5, s6, s5 2109; GFX1032-NEXT: s_add_i32 s9, s9, s8 2110; GFX1032-NEXT: v_mov_b32_e32 v0, s5 2111; GFX1032-NEXT: v_mov_b32_e32 v1, s9 2112; GFX1032-NEXT: s_mov_b32 s10, -1 2113; GFX1032-NEXT: s_mov_b32 s8, s2 2114; GFX1032-NEXT: s_mov_b32 s9, s3 2115; GFX1032-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc 2116; GFX1032-NEXT: s_waitcnt vmcnt(0) 2117; GFX1032-NEXT: 
buffer_gl1_inv 2118; GFX1032-NEXT: buffer_gl0_inv 2119; GFX1032-NEXT: .LBB4_2: 2120; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2121; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 2122; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2123; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 2124; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 2125; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, s6, v2, s[2:3] 2126; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2127; GFX1032-NEXT: v_mad_u64_u32 v[1:2], s2, s7, v2, v[1:2] 2128; GFX1032-NEXT: s_mov_b32 s2, -1 2129; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2130; GFX1032-NEXT: s_endpgm 2131; 2132; GFX1164-LABEL: add_i64_uniform: 2133; GFX1164: ; %bb.0: ; %entry 2134; GFX1164-NEXT: s_clause 0x1 2135; GFX1164-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 2136; GFX1164-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 2137; GFX1164-NEXT: s_mov_b64 s[8:9], exec 2138; GFX1164-NEXT: s_mov_b64 s[6:7], exec 2139; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 2140; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2141; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 2142; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 2143; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 2144; GFX1164-NEXT: s_cbranch_execz .LBB4_2 2145; GFX1164-NEXT: ; %bb.1: 2146; GFX1164-NEXT: s_bcnt1_i32_b64 s8, s[8:9] 2147; GFX1164-NEXT: s_mov_b32 s11, 0x31016000 2148; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 2149; GFX1164-NEXT: s_mul_i32 s9, s5, s8 2150; GFX1164-NEXT: s_mul_hi_u32 s10, s4, s8 2151; GFX1164-NEXT: s_mul_i32 s8, s4, s8 2152; GFX1164-NEXT: s_add_i32 s10, s10, s9 2153; GFX1164-NEXT: v_mov_b32_e32 v0, s8 2154; GFX1164-NEXT: v_mov_b32_e32 v1, s10 2155; GFX1164-NEXT: s_mov_b32 s10, -1 2156; GFX1164-NEXT: s_mov_b32 s8, s2 2157; GFX1164-NEXT: s_mov_b32 s9, s3 2158; GFX1164-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc 2159; GFX1164-NEXT: s_waitcnt vmcnt(0) 2160; GFX1164-NEXT: buffer_gl1_inv 2161; GFX1164-NEXT: buffer_gl0_inv 2162; GFX1164-NEXT: .LBB4_2: 2163; GFX1164-NEXT: s_or_b64 
exec, exec, s[6:7] 2164; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 2165; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 2166; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 2167; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) 2168; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, s4, v2, s[2:3] 2169; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 2170; GFX1164-NEXT: s_mov_b32 s2, -1 2171; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s5, v2, v[1:2] 2172; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 2173; GFX1164-NEXT: v_mov_b32_e32 v1, v3 2174; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 2175; GFX1164-NEXT: s_endpgm 2176; 2177; GFX1132-LABEL: add_i64_uniform: 2178; GFX1132: ; %bb.0: ; %entry 2179; GFX1132-NEXT: s_clause 0x1 2180; GFX1132-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 2181; GFX1132-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 2182; GFX1132-NEXT: s_mov_b32 s7, exec_lo 2183; GFX1132-NEXT: s_mov_b32 s6, exec_lo 2184; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s7, 0 2185; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 2186; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 2187; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 2188; GFX1132-NEXT: s_cbranch_execz .LBB4_2 2189; GFX1132-NEXT: ; %bb.1: 2190; GFX1132-NEXT: s_bcnt1_i32_b32 s7, s7 2191; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 2192; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 2193; GFX1132-NEXT: s_mul_i32 s8, s5, s7 2194; GFX1132-NEXT: s_mul_hi_u32 s9, s4, s7 2195; GFX1132-NEXT: s_mul_i32 s7, s4, s7 2196; GFX1132-NEXT: s_add_i32 s9, s9, s8 2197; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2198; GFX1132-NEXT: v_dual_mov_b32 v0, s7 :: v_dual_mov_b32 v1, s9 2199; GFX1132-NEXT: s_mov_b32 s10, -1 2200; GFX1132-NEXT: s_mov_b32 s8, s2 2201; GFX1132-NEXT: s_mov_b32 s9, s3 2202; GFX1132-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc 2203; GFX1132-NEXT: s_waitcnt vmcnt(0) 2204; GFX1132-NEXT: buffer_gl1_inv 2205; GFX1132-NEXT: buffer_gl0_inv 2206; GFX1132-NEXT: .LBB4_2: 2207; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s6 2208; 
GFX1132-NEXT: s_waitcnt lgkmcnt(0) 2209; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 2210; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 2211; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) 2212; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, s4, v2, s[2:3] 2213; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 2214; GFX1132-NEXT: s_mov_b32 s2, -1 2215; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s5, v2, v[1:2] 2216; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 2217; GFX1132-NEXT: v_mov_b32_e32 v1, v3 2218; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 2219; GFX1132-NEXT: s_endpgm 2220; 2221; GFX1264-LABEL: add_i64_uniform: 2222; GFX1264: ; %bb.0: ; %entry 2223; GFX1264-NEXT: s_clause 0x1 2224; GFX1264-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 2225; GFX1264-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 2226; GFX1264-NEXT: s_mov_b64 s[8:9], exec 2227; GFX1264-NEXT: s_mov_b32 s11, 0 2228; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 2229; GFX1264-NEXT: s_mov_b64 s[6:7], exec 2230; GFX1264-NEXT: s_wait_alu 0xfffe 2231; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2232; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 2233; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1 2234; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v2 2235; GFX1264-NEXT: s_cbranch_execz .LBB4_2 2236; GFX1264-NEXT: ; %bb.1: 2237; GFX1264-NEXT: s_bcnt1_i32_b64 s10, s[8:9] 2238; GFX1264-NEXT: s_wait_kmcnt 0x0 2239; GFX1264-NEXT: s_mul_u64 s[8:9], s[4:5], s[10:11] 2240; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 2241; GFX1264-NEXT: s_wait_alu 0xfffe 2242; GFX1264-NEXT: v_mov_b32_e32 v0, s8 2243; GFX1264-NEXT: v_mov_b32_e32 v1, s9 2244; GFX1264-NEXT: s_mov_b32 s10, -1 2245; GFX1264-NEXT: s_mov_b32 s8, s2 2246; GFX1264-NEXT: s_mov_b32 s9, s3 2247; GFX1264-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV 2248; GFX1264-NEXT: s_wait_loadcnt 0x0 2249; GFX1264-NEXT: global_inv scope:SCOPE_DEV 2250; GFX1264-NEXT: .LBB4_2: 2251; 
GFX1264-NEXT: s_or_b64 exec, exec, s[6:7] 2252; GFX1264-NEXT: s_wait_kmcnt 0x0 2253; GFX1264-NEXT: v_readfirstlane_b32 s3, v1 2254; GFX1264-NEXT: v_readfirstlane_b32 s2, v0 2255; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) 2256; GFX1264-NEXT: v_mad_co_u64_u32 v[0:1], null, s4, v2, s[2:3] 2257; GFX1264-NEXT: s_mov_b32 s3, 0x31016000 2258; GFX1264-NEXT: s_mov_b32 s2, -1 2259; GFX1264-NEXT: v_mad_co_u64_u32 v[1:2], null, s5, v2, v[1:2] 2260; GFX1264-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null 2261; GFX1264-NEXT: s_endpgm 2262; 2263; GFX1232-LABEL: add_i64_uniform: 2264; GFX1232: ; %bb.0: ; %entry 2265; GFX1232-NEXT: s_clause 0x1 2266; GFX1232-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 2267; GFX1232-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 2268; GFX1232-NEXT: s_mov_b32 s6, exec_lo 2269; GFX1232-NEXT: s_mov_b32 s7, 0 2270; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s6, 0 2271; GFX1232-NEXT: s_mov_b32 s8, exec_lo 2272; GFX1232-NEXT: ; implicit-def: $vgpr0_vgpr1 2273; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) 2274; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2 2275; GFX1232-NEXT: s_cbranch_execz .LBB4_2 2276; GFX1232-NEXT: ; %bb.1: 2277; GFX1232-NEXT: s_wait_alu 0xfffe 2278; GFX1232-NEXT: s_bcnt1_i32_b32 s6, s6 2279; GFX1232-NEXT: s_mov_b32 s15, 0x31016000 2280; GFX1232-NEXT: s_wait_kmcnt 0x0 2281; GFX1232-NEXT: s_wait_alu 0xfffe 2282; GFX1232-NEXT: s_mul_u64 s[6:7], s[4:5], s[6:7] 2283; GFX1232-NEXT: s_mov_b32 s14, -1 2284; GFX1232-NEXT: s_wait_alu 0xfffe 2285; GFX1232-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 2286; GFX1232-NEXT: s_mov_b32 s12, s2 2287; GFX1232-NEXT: s_mov_b32 s13, s3 2288; GFX1232-NEXT: buffer_atomic_add_u64 v[0:1], off, s[12:15], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV 2289; GFX1232-NEXT: s_wait_loadcnt 0x0 2290; GFX1232-NEXT: global_inv scope:SCOPE_DEV 2291; GFX1232-NEXT: .LBB4_2: 2292; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s8 2293; GFX1232-NEXT: s_wait_kmcnt 0x0 2294; GFX1232-NEXT: v_readfirstlane_b32 s3, 
v1 2295; GFX1232-NEXT: v_readfirstlane_b32 s2, v0 2296; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) 2297; GFX1232-NEXT: v_mad_co_u64_u32 v[0:1], null, s4, v2, s[2:3] 2298; GFX1232-NEXT: s_mov_b32 s3, 0x31016000 2299; GFX1232-NEXT: s_mov_b32 s2, -1 2300; GFX1232-NEXT: v_mad_co_u64_u32 v[1:2], null, s5, v2, v[1:2] 2301; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null 2302; GFX1232-NEXT: s_endpgm 2303entry: 2304 %old = atomicrmw add ptr addrspace(1) %inout, i64 %additive syncscope("agent") acq_rel 2305 store i64 %old, ptr addrspace(1) %out 2306 ret void 2307} 2308 2309define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(1) %inout) { 2310; GFX7LESS_ITERATIVE-LABEL: add_i64_varying: 2311; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry 2312; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 2313; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 2314; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 2315; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 2316; GFX7LESS_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop 2317; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 2318; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] 2319; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s2 2320; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s2 2321; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s2 2322; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s7, m0 2323; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0 2324; GFX7LESS_ITERATIVE-NEXT: s_add_u32 s6, s6, s8 2325; GFX7LESS_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3 2326; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 2327; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] 2328; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 2329; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[2:3] 2330; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB5_1 2331; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 2332; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], 
s[4:5], 0x9 2333; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 2334; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 2335; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2336; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 2337; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc 2338; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 2339; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4 2340; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: 2341; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000 2342; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s10, -1 2343; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 2344; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s8, s2 2345; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s9, s3 2346; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s6 2347; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s7 2348; GFX7LESS_ITERATIVE-NEXT: buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc 2349; GFX7LESS_ITERATIVE-NEXT: s_waitcnt vmcnt(0) 2350; GFX7LESS_ITERATIVE-NEXT: buffer_wbinvl1 2351; GFX7LESS_ITERATIVE-NEXT: .LBB5_4: 2352; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] 2353; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 2354; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 2355; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 2356; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 2357; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 2358; GFX7LESS_ITERATIVE-NEXT: s_waitcnt expcnt(0) 2359; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 2360; GFX7LESS_ITERATIVE-NEXT: v_add_i32_e32 v0, vcc, s5, v1 2361; GFX7LESS_ITERATIVE-NEXT: v_addc_u32_e32 v1, vcc, v3, v2, vcc 2362; GFX7LESS_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2363; GFX7LESS_ITERATIVE-NEXT: s_endpgm 2364; 2365; GFX8_ITERATIVE-LABEL: add_i64_varying: 2366; GFX8_ITERATIVE: ; %bb.0: ; %entry 2367; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 2368; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 2369; GFX8_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 2370; GFX8_ITERATIVE-NEXT: ; 
implicit-def: $vgpr1_vgpr2 2371; GFX8_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop 2372; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 2373; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] 2374; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s2 2375; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s2 2376; GFX8_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s2 2377; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0 2378; GFX8_ITERATIVE-NEXT: s_add_u32 s6, s6, s8 2379; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s7, m0 2380; GFX8_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3 2381; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 2382; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] 2383; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 2384; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 2385; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 2386; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2387; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2388; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2389; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2390; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 2391; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc 2392; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 2393; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4 2394; GFX8_ITERATIVE-NEXT: ; %bb.3: 2395; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s6 2396; GFX8_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000 2397; GFX8_ITERATIVE-NEXT: s_mov_b32 s10, -1 2398; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 2399; GFX8_ITERATIVE-NEXT: s_mov_b32 s8, s2 2400; GFX8_ITERATIVE-NEXT: s_mov_b32 s9, s3 2401; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s7 2402; GFX8_ITERATIVE-NEXT: buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc 2403; GFX8_ITERATIVE-NEXT: s_waitcnt vmcnt(0) 2404; GFX8_ITERATIVE-NEXT: buffer_wbinvl1_vol 2405; GFX8_ITERATIVE-NEXT: .LBB5_4: 2406; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] 2407; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 2408; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 
s5, v3 2409; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 2410; GFX8_ITERATIVE-NEXT: v_add_u32_e32 v0, vcc, s5, v1 2411; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 2412; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 2413; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 2414; GFX8_ITERATIVE-NEXT: v_addc_u32_e32 v1, vcc, v3, v2, vcc 2415; GFX8_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2416; GFX8_ITERATIVE-NEXT: s_endpgm 2417; 2418; GFX9_ITERATIVE-LABEL: add_i64_varying: 2419; GFX9_ITERATIVE: ; %bb.0: ; %entry 2420; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 2421; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 2422; GFX9_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 2423; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 2424; GFX9_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop 2425; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 2426; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] 2427; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s2 2428; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s2 2429; GFX9_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s2 2430; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0 2431; GFX9_ITERATIVE-NEXT: s_add_u32 s6, s6, s8 2432; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s7, m0 2433; GFX9_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3 2434; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 2435; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] 2436; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 2437; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 2438; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 2439; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2440; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2441; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2442; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2443; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 2444; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc 2445; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 2446; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4 2447; GFX9_ITERATIVE-NEXT: ; %bb.3: 
2448; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s6 2449; GFX9_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000 2450; GFX9_ITERATIVE-NEXT: s_mov_b32 s10, -1 2451; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 2452; GFX9_ITERATIVE-NEXT: s_mov_b32 s8, s2 2453; GFX9_ITERATIVE-NEXT: s_mov_b32 s9, s3 2454; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s7 2455; GFX9_ITERATIVE-NEXT: buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc 2456; GFX9_ITERATIVE-NEXT: s_waitcnt vmcnt(0) 2457; GFX9_ITERATIVE-NEXT: buffer_wbinvl1_vol 2458; GFX9_ITERATIVE-NEXT: .LBB5_4: 2459; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] 2460; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 2461; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 2462; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 2463; GFX9_ITERATIVE-NEXT: v_add_co_u32_e32 v0, vcc, s5, v1 2464; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 2465; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 2466; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 2467; GFX9_ITERATIVE-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v2, vcc 2468; GFX9_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2469; GFX9_ITERATIVE-NEXT: s_endpgm 2470; 2471; GFX1064_ITERATIVE-LABEL: add_i64_varying: 2472; GFX1064_ITERATIVE: ; %bb.0: ; %entry 2473; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 2474; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 2475; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 2476; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 2477; GFX1064_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop 2478; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 2479; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] 2480; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 2481; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s2 2482; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s2 2483; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s7, s2 2484; GFX1064_ITERATIVE-NEXT: s_add_u32 s6, s6, s3 2485; GFX1064_ITERATIVE-NEXT: s_addc_u32 s7, s7, s8 2486; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 2487; 
GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] 2488; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 2489; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 2490; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 2491; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2492; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2493; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 2494; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2495; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2496; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc 2497; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 2498; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4 2499; GFX1064_ITERATIVE-NEXT: ; %bb.3: 2500; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s6 2501; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s7 2502; GFX1064_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 2503; GFX1064_ITERATIVE-NEXT: s_mov_b32 s10, -1 2504; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 2505; GFX1064_ITERATIVE-NEXT: s_mov_b32 s8, s2 2506; GFX1064_ITERATIVE-NEXT: s_mov_b32 s9, s3 2507; GFX1064_ITERATIVE-NEXT: buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc 2508; GFX1064_ITERATIVE-NEXT: s_waitcnt vmcnt(0) 2509; GFX1064_ITERATIVE-NEXT: buffer_gl1_inv 2510; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv 2511; GFX1064_ITERATIVE-NEXT: .LBB5_4: 2512; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 2513; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] 2514; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 2515; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 2516; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 2517; GFX1064_ITERATIVE-NEXT: v_add_co_u32 v0, vcc, s2, v1 2518; GFX1064_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v2, vcc 2519; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 2520; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 2521; GFX1064_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2522; GFX1064_ITERATIVE-NEXT: s_endpgm 2523; 2524; 
GFX1032_ITERATIVE-LABEL: add_i64_varying: 2525; GFX1032_ITERATIVE: ; %bb.0: ; %entry 2526; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 2527; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo 2528; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 2529; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 2530; GFX1032_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop 2531; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 2532; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0 2533; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1 2534; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1 2535; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s1 2536; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s7, s1 2537; GFX1032_ITERATIVE-NEXT: s_add_u32 s6, s6, s2 2538; GFX1032_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3 2539; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 2540; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s1 2541; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 2542; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 2543; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 2544; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2545; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2546; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 2547; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2548; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo 2549; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 2550; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4 2551; GFX1032_ITERATIVE-NEXT: ; %bb.3: 2552; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s6 2553; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s7 2554; GFX1032_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 2555; GFX1032_ITERATIVE-NEXT: s_mov_b32 s10, -1 2556; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 2557; GFX1032_ITERATIVE-NEXT: s_mov_b32 s8, s2 2558; GFX1032_ITERATIVE-NEXT: s_mov_b32 s9, s3 2559; GFX1032_ITERATIVE-NEXT: buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc 2560; GFX1032_ITERATIVE-NEXT: s_waitcnt vmcnt(0) 2561; 
GFX1032_ITERATIVE-NEXT: buffer_gl1_inv 2562; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv 2563; GFX1032_ITERATIVE-NEXT: .LBB5_4: 2564; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 2565; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 2566; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 2567; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 2568; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 2569; GFX1032_ITERATIVE-NEXT: v_add_co_u32 v0, vcc_lo, s2, v1 2570; GFX1032_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo 2571; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 2572; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 2573; GFX1032_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2574; GFX1032_ITERATIVE-NEXT: s_endpgm 2575; 2576; GFX1164_ITERATIVE-LABEL: add_i64_varying: 2577; GFX1164_ITERATIVE: ; %bb.0: ; %entry 2578; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 2579; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 2580; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 2581; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 2582; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 2583; GFX1164_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop 2584; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 2585; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1] 2586; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) 2587; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s3, v2, s2 2588; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s2 2589; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2 2590; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s2 2591; GFX1164_ITERATIVE-NEXT: s_add_u32 s6, s6, s3 2592; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 2593; GFX1164_ITERATIVE-NEXT: s_addc_u32 s7, s7, s8 2594; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 2595; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] 2596; GFX1164_ITERATIVE-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) 2597; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 2598; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 2599; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 2600; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 2601; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 2602; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2603; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 2604; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2605; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 2606; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc 2607; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2608; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 2609; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4 2610; GFX1164_ITERATIVE-NEXT: ; %bb.3: 2611; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s6 2612; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s7 2613; GFX1164_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 2614; GFX1164_ITERATIVE-NEXT: s_mov_b32 s10, -1 2615; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 2616; GFX1164_ITERATIVE-NEXT: s_mov_b32 s8, s2 2617; GFX1164_ITERATIVE-NEXT: s_mov_b32 s9, s3 2618; GFX1164_ITERATIVE-NEXT: buffer_atomic_add_u64 v[2:3], off, s[8:11], 0 glc 2619; GFX1164_ITERATIVE-NEXT: s_waitcnt vmcnt(0) 2620; GFX1164_ITERATIVE-NEXT: buffer_gl1_inv 2621; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv 2622; GFX1164_ITERATIVE-NEXT: .LBB5_4: 2623; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] 2624; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 2625; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 2626; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 2627; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 2628; GFX1164_ITERATIVE-NEXT: v_add_co_u32 v0, vcc, s2, v0 2629; GFX1164_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc 2630; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 
2631; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 2632; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 2633; GFX1164_ITERATIVE-NEXT: s_endpgm 2634; 2635; GFX1132_ITERATIVE-LABEL: add_i64_varying: 2636; GFX1132_ITERATIVE: ; %bb.0: ; %entry 2637; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 2638; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo 2639; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 2640; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 2641; GFX1132_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop 2642; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 2643; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 2644; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 2645; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s1 2646; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1 2647; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1 2648; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1 2649; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) 2650; GFX1132_ITERATIVE-NEXT: s_add_u32 s6, s6, s2 2651; GFX1132_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3 2652; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 2653; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 2654; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1 2655; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 2656; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 2657; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 2658; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 2659; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 2660; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 2661; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 2662; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 2663; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo 2664; GFX1132_ITERATIVE-NEXT: 
s_xor_b32 s4, exec_lo, s4 2665; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4 2666; GFX1132_ITERATIVE-NEXT: ; %bb.3: 2667; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 2668; GFX1132_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 2669; GFX1132_ITERATIVE-NEXT: s_mov_b32 s10, -1 2670; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 2671; GFX1132_ITERATIVE-NEXT: s_mov_b32 s8, s2 2672; GFX1132_ITERATIVE-NEXT: s_mov_b32 s9, s3 2673; GFX1132_ITERATIVE-NEXT: buffer_atomic_add_u64 v[2:3], off, s[8:11], 0 glc 2674; GFX1132_ITERATIVE-NEXT: s_waitcnt vmcnt(0) 2675; GFX1132_ITERATIVE-NEXT: buffer_gl1_inv 2676; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv 2677; GFX1132_ITERATIVE-NEXT: .LBB5_4: 2678; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 2679; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 2680; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 2681; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 2682; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 2683; GFX1132_ITERATIVE-NEXT: v_add_co_u32 v0, vcc_lo, s2, v0 2684; GFX1132_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo 2685; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 2686; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 2687; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 2688; GFX1132_ITERATIVE-NEXT: s_endpgm 2689; 2690; GFX1264_ITERATIVE-LABEL: add_i64_varying: 2691; GFX1264_ITERATIVE: ; %bb.0: ; %entry 2692; GFX1264_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 2693; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 2694; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 2695; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 2696; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 2697; GFX1264_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop 2698; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 2699; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[0:1] 2700; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe 2701; GFX1264_ITERATIVE-NEXT: 
v_readlane_b32 s3, v3, s10 2702; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s10 2703; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 2704; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s10 2705; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s10 2706; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[8:9] 2707; GFX1264_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3] 2708; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 2709; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 2710; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 2711; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 2712; GFX1264_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 2713; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2714; GFX1264_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 2715; GFX1264_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2716; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 2717; GFX1264_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc 2718; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2719; GFX1264_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 2720; GFX1264_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4 2721; GFX1264_ITERATIVE-NEXT: ; %bb.3: 2722; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe 2723; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v2, s6 2724; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v3, s7 2725; GFX1264_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 2726; GFX1264_ITERATIVE-NEXT: s_mov_b32 s10, -1 2727; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0 2728; GFX1264_ITERATIVE-NEXT: s_mov_b32 s8, s2 2729; GFX1264_ITERATIVE-NEXT: s_mov_b32 s9, s3 2730; GFX1264_ITERATIVE-NEXT: buffer_atomic_add_u64 v[2:3], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV 2731; GFX1264_ITERATIVE-NEXT: s_wait_loadcnt 0x0 2732; GFX1264_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV 2733; GFX1264_ITERATIVE-NEXT: .LBB5_4: 2734; GFX1264_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] 2735; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0 
2736; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 2737; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 2738; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 2739; GFX1264_ITERATIVE-NEXT: v_add_co_u32 v0, vcc, s2, v0 2740; GFX1264_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc 2741; GFX1264_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 2742; GFX1264_ITERATIVE-NEXT: s_mov_b32 s2, -1 2743; GFX1264_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null 2744; GFX1264_ITERATIVE-NEXT: s_endpgm 2745; 2746; GFX1232_ITERATIVE-LABEL: add_i64_varying: 2747; GFX1232_ITERATIVE: ; %bb.0: ; %entry 2748; GFX1232_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 2749; GFX1232_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo 2750; GFX1232_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 2751; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 2752; GFX1232_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop 2753; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 2754; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe 2755; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 2756; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe 2757; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1 2758; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s1 2759; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s1 2760; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1 2761; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1 2762; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s8 2763; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3] 2764; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe 2765; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 2766; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 2767; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 2768; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 2769; GFX1232_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 2770; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | 
instid1(SALU_CYCLE_1) 2771; GFX1232_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 2772; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 2773; GFX1232_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo 2774; GFX1232_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 2775; GFX1232_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4 2776; GFX1232_ITERATIVE-NEXT: ; %bb.3: 2777; GFX1232_ITERATIVE-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 2778; GFX1232_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 2779; GFX1232_ITERATIVE-NEXT: s_mov_b32 s10, -1 2780; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0 2781; GFX1232_ITERATIVE-NEXT: s_mov_b32 s8, s2 2782; GFX1232_ITERATIVE-NEXT: s_mov_b32 s9, s3 2783; GFX1232_ITERATIVE-NEXT: buffer_atomic_add_u64 v[2:3], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV 2784; GFX1232_ITERATIVE-NEXT: s_wait_loadcnt 0x0 2785; GFX1232_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV 2786; GFX1232_ITERATIVE-NEXT: .LBB5_4: 2787; GFX1232_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 2788; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0 2789; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 2790; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 2791; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 2792; GFX1232_ITERATIVE-NEXT: v_add_co_u32 v0, vcc_lo, s2, v0 2793; GFX1232_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo 2794; GFX1232_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 2795; GFX1232_ITERATIVE-NEXT: s_mov_b32 s2, -1 2796; GFX1232_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null 2797; GFX1232_ITERATIVE-NEXT: s_endpgm 2798; 2799; GFX7LESS_DPP-LABEL: add_i64_varying: 2800; GFX7LESS_DPP: ; %bb.0: ; %entry 2801; GFX7LESS_DPP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 2802; GFX7LESS_DPP-NEXT: s_mov_b32 s7, 0xf000 2803; GFX7LESS_DPP-NEXT: s_mov_b32 s6, -1 2804; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 2805; GFX7LESS_DPP-NEXT: s_mov_b32 s10, s6 2806; GFX7LESS_DPP-NEXT: s_mov_b32 s11, s7 2807; GFX7LESS_DPP-NEXT: 
s_waitcnt lgkmcnt(0) 2808; GFX7LESS_DPP-NEXT: s_mov_b32 s8, s2 2809; GFX7LESS_DPP-NEXT: s_mov_b32 s9, s3 2810; GFX7LESS_DPP-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc 2811; GFX7LESS_DPP-NEXT: s_waitcnt vmcnt(0) 2812; GFX7LESS_DPP-NEXT: buffer_wbinvl1 2813; GFX7LESS_DPP-NEXT: s_mov_b32 s4, s0 2814; GFX7LESS_DPP-NEXT: s_mov_b32 s5, s1 2815; GFX7LESS_DPP-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2816; GFX7LESS_DPP-NEXT: s_endpgm 2817; 2818; GFX8_DPP-LABEL: add_i64_varying: 2819; GFX8_DPP: ; %bb.0: ; %entry 2820; GFX8_DPP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2821; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 2822; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 2823; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] 2824; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v6, exec_lo, 0 2825; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v6, exec_hi, v6 2826; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 2827; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[4:5] 2828; GFX8_DPP-NEXT: v_mov_b32_e32 v5, 0 2829; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, 0, 0, s[4:5] 2830; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 2831; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf 2832; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 2833; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2834; GFX8_DPP-NEXT: v_mov_b32_e32 v5, 0 2835; GFX8_DPP-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc 2836; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 2837; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf 2838; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 2839; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:2 row_mask:0xf bank_mask:0xf 2840; GFX8_DPP-NEXT: v_mov_b32_e32 v5, 0 2841; GFX8_DPP-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc 2842; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 2843; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf 2844; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 2845; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf 2846; 
GFX8_DPP-NEXT: v_mov_b32_e32 v5, 0 2847; GFX8_DPP-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc 2848; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 2849; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf 2850; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 2851; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:8 row_mask:0xf bank_mask:0xf 2852; GFX8_DPP-NEXT: v_mov_b32_e32 v5, 0 2853; GFX8_DPP-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc 2854; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 2855; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf 2856; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 2857; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2858; GFX8_DPP-NEXT: v_mov_b32_e32 v5, 0 2859; GFX8_DPP-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc 2860; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 2861; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf 2862; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 2863; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2864; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v2, v4, vcc 2865; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 2866; GFX8_DPP-NEXT: v_readlane_b32 s7, v4, 63 2867; GFX8_DPP-NEXT: v_readlane_b32 s6, v3, 63 2868; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf 2869; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf 2870; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] 2871; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 2872; GFX8_DPP-NEXT: ; implicit-def: $vgpr6_vgpr7 2873; GFX8_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc 2874; GFX8_DPP-NEXT: s_cbranch_execz .LBB5_2 2875; GFX8_DPP-NEXT: ; %bb.1: 2876; GFX8_DPP-NEXT: v_mov_b32_e32 v6, s6 2877; GFX8_DPP-NEXT: s_mov_b32 s11, 0xf000 2878; GFX8_DPP-NEXT: s_mov_b32 s10, -1 2879; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) 2880; GFX8_DPP-NEXT: s_mov_b32 s8, s2 2881; GFX8_DPP-NEXT: s_mov_b32 s9, s3 2882; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s7 2883; GFX8_DPP-NEXT: buffer_atomic_add_x2 
v[6:7], off, s[8:11], 0 glc 2884; GFX8_DPP-NEXT: s_waitcnt vmcnt(0) 2885; GFX8_DPP-NEXT: buffer_wbinvl1_vol 2886; GFX8_DPP-NEXT: .LBB5_2: 2887; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[4:5] 2888; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v7 2889; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v6 2890; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v1 2891; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v2 2892; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4 2893; GFX8_DPP-NEXT: v_add_u32_e32 v6, vcc, s5, v6 2894; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) 2895; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 2896; GFX8_DPP-NEXT: s_mov_b32 s2, -1 2897; GFX8_DPP-NEXT: v_addc_u32_e32 v7, vcc, v0, v7, vcc 2898; GFX8_DPP-NEXT: buffer_store_dwordx2 v[6:7], off, s[0:3], 0 2899; GFX8_DPP-NEXT: s_endpgm 2900; 2901; GFX9_DPP-LABEL: add_i64_varying: 2902; GFX9_DPP: ; %bb.0: ; %entry 2903; GFX9_DPP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2904; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 2905; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 2906; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] 2907; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v6, exec_lo, 0 2908; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v6, exec_hi, v6 2909; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 2910; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[4:5] 2911; GFX9_DPP-NEXT: v_mov_b32_e32 v5, 0 2912; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, 0, 0, s[4:5] 2913; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 2914; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf 2915; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 2916; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2917; GFX9_DPP-NEXT: v_mov_b32_e32 v5, 0 2918; GFX9_DPP-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v4, vcc 2919; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 2920; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf 2921; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 2922; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:2 row_mask:0xf bank_mask:0xf 2923; GFX9_DPP-NEXT: v_mov_b32_e32 v5, 0 2924; GFX9_DPP-NEXT: 
v_addc_co_u32_e32 v2, vcc, v2, v4, vcc 2925; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 2926; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf 2927; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 2928; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf 2929; GFX9_DPP-NEXT: v_mov_b32_e32 v5, 0 2930; GFX9_DPP-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v4, vcc 2931; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 2932; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf 2933; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 2934; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:8 row_mask:0xf bank_mask:0xf 2935; GFX9_DPP-NEXT: v_mov_b32_e32 v5, 0 2936; GFX9_DPP-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v4, vcc 2937; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 2938; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf 2939; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 2940; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2941; GFX9_DPP-NEXT: v_mov_b32_e32 v5, 0 2942; GFX9_DPP-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v4, vcc 2943; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 2944; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf 2945; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 2946; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2947; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v2, v4, vcc 2948; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 2949; GFX9_DPP-NEXT: v_readlane_b32 s7, v4, 63 2950; GFX9_DPP-NEXT: v_readlane_b32 s6, v3, 63 2951; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf 2952; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf 2953; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] 2954; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 2955; GFX9_DPP-NEXT: ; implicit-def: $vgpr6_vgpr7 2956; GFX9_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc 2957; GFX9_DPP-NEXT: s_cbranch_execz .LBB5_2 2958; GFX9_DPP-NEXT: ; %bb.1: 2959; 
GFX9_DPP-NEXT: v_mov_b32_e32 v6, s6 2960; GFX9_DPP-NEXT: s_mov_b32 s11, 0xf000 2961; GFX9_DPP-NEXT: s_mov_b32 s10, -1 2962; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) 2963; GFX9_DPP-NEXT: s_mov_b32 s8, s2 2964; GFX9_DPP-NEXT: s_mov_b32 s9, s3 2965; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s7 2966; GFX9_DPP-NEXT: buffer_atomic_add_x2 v[6:7], off, s[8:11], 0 glc 2967; GFX9_DPP-NEXT: s_waitcnt vmcnt(0) 2968; GFX9_DPP-NEXT: buffer_wbinvl1_vol 2969; GFX9_DPP-NEXT: .LBB5_2: 2970; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[4:5] 2971; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v7 2972; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v6 2973; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v1 2974; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v2 2975; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4 2976; GFX9_DPP-NEXT: v_add_co_u32_e32 v6, vcc, s5, v6 2977; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) 2978; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 2979; GFX9_DPP-NEXT: s_mov_b32 s2, -1 2980; GFX9_DPP-NEXT: v_addc_co_u32_e32 v7, vcc, v0, v7, vcc 2981; GFX9_DPP-NEXT: buffer_store_dwordx2 v[6:7], off, s[0:3], 0 2982; GFX9_DPP-NEXT: s_endpgm 2983; 2984; GFX1064_DPP-LABEL: add_i64_varying: 2985; GFX1064_DPP: ; %bb.0: ; %entry 2986; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 2987; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 2988; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] 2989; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s[0:1] 2990; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 2991; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, 0 2992; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 2993; GFX1064_DPP-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2994; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, 0 2995; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf 2996; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v2, v1 2997; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v3, v4, vcc 2998; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 2999; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf 3000; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 3001; 
GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3002; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v6 3003; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v2, v5, vcc 3004; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, 0 3005; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:4 row_mask:0xf bank_mask:0xf 3006; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 3007; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3008; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v4 3009; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v2, v3, vcc 3010; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 3011; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf 3012; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3013; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v6 3014; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v2, v5, vcc 3015; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 3016; GFX1064_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1 3017; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v2, -1, -1 3018; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3019; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3020; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v3 3021; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v2, v5, vcc 3022; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 3023; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 31 3024; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 3025; GFX1064_DPP-NEXT: v_readlane_b32 s3, v2, 31 3026; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s2 3027; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, s3 3028; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3029; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3030; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, 0 3031; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v3 3032; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v2, v5, vcc 3033; GFX1064_DPP-NEXT: 
s_mov_b64 exec, s[0:1] 3034; GFX1064_DPP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3035; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3036; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 3037; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3038; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3039; GFX1064_DPP-NEXT: v_readlane_b32 s6, v2, 15 3040; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 15 3041; GFX1064_DPP-NEXT: v_readlane_b32 s8, v2, 31 3042; GFX1064_DPP-NEXT: v_readlane_b32 s9, v1, 31 3043; GFX1064_DPP-NEXT: v_readlane_b32 s10, v1, 47 3044; GFX1064_DPP-NEXT: v_writelane_b32 v7, s6, 16 3045; GFX1064_DPP-NEXT: v_writelane_b32 v6, s7, 16 3046; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 63 3047; GFX1064_DPP-NEXT: v_readlane_b32 s11, v2, 47 3048; GFX1064_DPP-NEXT: v_readlane_b32 s7, v2, 63 3049; GFX1064_DPP-NEXT: v_writelane_b32 v7, s8, 32 3050; GFX1064_DPP-NEXT: v_writelane_b32 v6, s9, 32 3051; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] 3052; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3053; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[8:9], -1 3054; GFX1064_DPP-NEXT: s_mov_b64 s[4:5], s[6:7] 3055; GFX1064_DPP-NEXT: v_writelane_b32 v7, s11, 48 3056; GFX1064_DPP-NEXT: v_writelane_b32 v6, s10, 48 3057; GFX1064_DPP-NEXT: s_mov_b64 exec, s[8:9] 3058; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3059; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 3060; GFX1064_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9 3061; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[8:9], vcc 3062; GFX1064_DPP-NEXT: s_cbranch_execz .LBB5_2 3063; GFX1064_DPP-NEXT: ; %bb.1: 3064; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s5 3065; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s4 3066; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 3067; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) 3068; GFX1064_DPP-NEXT: s_mov_b32 s4, s2 3069; GFX1064_DPP-NEXT: s_mov_b32 s5, s3 3070; GFX1064_DPP-NEXT: buffer_atomic_add_x2 v[8:9], off, s[4:7], 0 glc 3071; GFX1064_DPP-NEXT: s_waitcnt vmcnt(0) 3072; 
GFX1064_DPP-NEXT: buffer_gl1_inv 3073; GFX1064_DPP-NEXT: buffer_gl0_inv 3074; GFX1064_DPP-NEXT: .LBB5_2: 3075; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 3076; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[8:9] 3077; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) 3078; GFX1064_DPP-NEXT: v_readfirstlane_b32 s2, v8 3079; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, v6 3080; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, v7 3081; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v9 3082; GFX1064_DPP-NEXT: v_add_co_u32 v8, vcc, s2, v10 3083; GFX1064_DPP-NEXT: s_mov_b32 s2, s6 3084; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc, s3, v11, vcc 3085; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 3086; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[8:9], off, s[0:3], 0 3087; GFX1064_DPP-NEXT: s_endpgm 3088; 3089; GFX1032_DPP-LABEL: add_i64_varying: 3090; GFX1032_DPP: ; %bb.0: ; %entry 3091; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 3092; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 3093; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s0 3094; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s0 3095; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 3096; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, 0 3097; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0 3098; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3099; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, 0 3100; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf 3101; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, 0 3102; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v2, v1 3103; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v3, v4, vcc_lo 3104; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 3105; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf 3106; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 3107; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3108; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v6 3109; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo 3110; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, 0 3111; 
GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:4 row_mask:0xf bank_mask:0xf 3112; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0 3113; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3114; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v4 3115; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v3, vcc_lo 3116; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 3117; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf 3118; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3119; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v6 3120; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo 3121; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0 3122; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1 3123; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v2, -1, -1 3124; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3125; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3126; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 3127; GFX1032_DPP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3128; GFX1032_DPP-NEXT: s_or_saveexec_b32 s6, -1 3129; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3 3130; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo 3131; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 3132; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3133; GFX1032_DPP-NEXT: v_readlane_b32 s8, v2, 15 3134; GFX1032_DPP-NEXT: v_readlane_b32 s5, v2, 31 3135; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3136; GFX1032_DPP-NEXT: v_readlane_b32 s7, v1, 15 3137; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s6 3138; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3139; GFX1032_DPP-NEXT: s_or_saveexec_b32 s6, -1 3140; GFX1032_DPP-NEXT: v_writelane_b32 v8, s8, 16 3141; GFX1032_DPP-NEXT: v_writelane_b32 v7, s7, 16 3142; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s6 3143; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 3144; 
GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3145; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 3146; GFX1032_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo 3147; GFX1032_DPP-NEXT: s_cbranch_execz .LBB5_2 3148; GFX1032_DPP-NEXT: ; %bb.1: 3149; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s5 3150; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s4 3151; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 3152; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) 3153; GFX1032_DPP-NEXT: s_mov_b32 s4, s2 3154; GFX1032_DPP-NEXT: s_mov_b32 s5, s3 3155; GFX1032_DPP-NEXT: buffer_atomic_add_x2 v[9:10], off, s[4:7], 0 glc 3156; GFX1032_DPP-NEXT: s_waitcnt vmcnt(0) 3157; GFX1032_DPP-NEXT: buffer_gl1_inv 3158; GFX1032_DPP-NEXT: buffer_gl0_inv 3159; GFX1032_DPP-NEXT: .LBB5_2: 3160; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 3161; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 3162; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) 3163; GFX1032_DPP-NEXT: v_readfirstlane_b32 s2, v9 3164; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, v7 3165; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, v8 3166; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v10 3167; GFX1032_DPP-NEXT: v_add_co_u32 v9, vcc_lo, s2, v11 3168; GFX1032_DPP-NEXT: s_mov_b32 s2, s6 3169; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, s3, v12, vcc_lo 3170; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 3171; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0 3172; GFX1032_DPP-NEXT: s_endpgm 3173; 3174; GFX1164_DPP-LABEL: add_i64_varying: 3175; GFX1164_DPP: ; %bb.0: ; %entry 3176; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 3177; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 3178; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) 3179; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] 3180; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 3181; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] 3182; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 3183; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 3184; 
GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3185; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 3186; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 3187; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc 3188; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 3189; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 3190; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 3191; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf 3192; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc 3193; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) 3194; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 3195; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 3196; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf 3197; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) 3198; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc 3199; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 3200; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 3201; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 3202; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf 3203; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 3204; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 3205; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc 3206; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3207; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 3208; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(NEXT) | instid1(VALU_DEP_1) 3209; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1 3210; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3211; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 3212; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc 3213; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 3214; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 31 3215; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 3216; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s2 3217; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 31 3218; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3219; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) 3220; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v2, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3221; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 3222; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc 3223; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] 3224; GFX1164_DPP-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 3225; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3226; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 3227; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3228; GFX1164_DPP-NEXT: v_readlane_b32 s6, v2, 15 3229; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3230; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 15 3231; GFX1164_DPP-NEXT: v_readlane_b32 s8, v2, 31 3232; GFX1164_DPP-NEXT: v_readlane_b32 s9, v1, 31 3233; GFX1164_DPP-NEXT: v_writelane_b32 v4, s6, 16 3234; GFX1164_DPP-NEXT: v_readlane_b32 s6, v2, 63 3235; GFX1164_DPP-NEXT: v_writelane_b32 v5, s7, 16 3236; GFX1164_DPP-NEXT: v_readlane_b32 s10, v2, 47 3237; GFX1164_DPP-NEXT: v_readlane_b32 s11, v1, 47 3238; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 63 3239; GFX1164_DPP-NEXT: v_writelane_b32 v4, s8, 32 3240; GFX1164_DPP-NEXT: 
v_writelane_b32 v5, s9, 32 3241; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] 3242; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 3243; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3244; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[8:9], -1 3245; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], s[6:7] 3246; GFX1164_DPP-NEXT: v_writelane_b32 v4, s10, 48 3247; GFX1164_DPP-NEXT: v_writelane_b32 v5, s11, 48 3248; GFX1164_DPP-NEXT: s_mov_b64 exec, s[8:9] 3249; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 3250; GFX1164_DPP-NEXT: s_mov_b64 s[8:9], exec 3251; GFX1164_DPP-NEXT: ; implicit-def: $vgpr6_vgpr7 3252; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 3253; GFX1164_DPP-NEXT: s_cbranch_execz .LBB5_2 3254; GFX1164_DPP-NEXT: ; %bb.1: 3255; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s5 3256; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s4 3257; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 3258; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) 3259; GFX1164_DPP-NEXT: s_mov_b32 s4, s2 3260; GFX1164_DPP-NEXT: s_mov_b32 s5, s3 3261; GFX1164_DPP-NEXT: buffer_atomic_add_u64 v[6:7], off, s[4:7], 0 glc 3262; GFX1164_DPP-NEXT: s_waitcnt vmcnt(0) 3263; GFX1164_DPP-NEXT: buffer_gl1_inv 3264; GFX1164_DPP-NEXT: buffer_gl0_inv 3265; GFX1164_DPP-NEXT: .LBB5_2: 3266; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[8:9] 3267; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) 3268; GFX1164_DPP-NEXT: v_readfirstlane_b32 s2, v6 3269; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v4 3270; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v5 3271; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v7 3272; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) 3273; GFX1164_DPP-NEXT: v_add_co_u32 v6, vcc, s2, v8 3274; GFX1164_DPP-NEXT: s_mov_b32 s2, s6 3275; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v7, vcc, s3, v9, vcc 3276; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 3277; GFX1164_DPP-NEXT: buffer_store_b64 v[6:7], off, s[0:3], 0 3278; GFX1164_DPP-NEXT: s_endpgm 3279; 3280; GFX1132_DPP-LABEL: add_i64_varying: 3281; GFX1132_DPP: ; %bb.0: ; %entry 3282; 
GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 3283; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 3284; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) 3285; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s0 3286; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 3287; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s0 3288; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v7, 0 3289; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, 0 3290; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 3291; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3292; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 3293; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 3294; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo 3295; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 3296; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 3297; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 3298; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf 3299; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo 3300; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) 3301; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 3302; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 3303; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf 3304; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) 3305; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo 3306; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc_lo, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 3307; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 3308; 
GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 3309; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf 3310; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 3311; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 3312; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo 3313; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc_lo, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3314; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 3315; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1 3316; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3317; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 3318; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 3319; GFX1132_DPP-NEXT: s_or_saveexec_b32 s6, -1 3320; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) 3321; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo 3322; GFX1132_DPP-NEXT: v_readlane_b32 s4, v2, 31 3323; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3324; GFX1132_DPP-NEXT: v_readlane_b32 s7, v2, 15 3325; GFX1132_DPP-NEXT: v_readlane_b32 s8, v1, 15 3326; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 31 3327; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3328; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s6 3329; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 3330; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3331; GFX1132_DPP-NEXT: s_or_saveexec_b32 s6, -1 3332; GFX1132_DPP-NEXT: v_writelane_b32 v6, s7, 16 3333; GFX1132_DPP-NEXT: v_writelane_b32 v7, s8, 16 3334; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s6 3335; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 3336; GFX1132_DPP-NEXT: s_mov_b32 s8, exec_lo 3337; GFX1132_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9 3338; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 3339; 
GFX1132_DPP-NEXT: s_cbranch_execz .LBB5_2 3340; GFX1132_DPP-NEXT: ; %bb.1: 3341; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, s5 :: v_dual_mov_b32 v8, s4 3342; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 3343; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) 3344; GFX1132_DPP-NEXT: s_mov_b32 s4, s2 3345; GFX1132_DPP-NEXT: s_mov_b32 s5, s3 3346; GFX1132_DPP-NEXT: buffer_atomic_add_u64 v[8:9], off, s[4:7], 0 glc 3347; GFX1132_DPP-NEXT: s_waitcnt vmcnt(0) 3348; GFX1132_DPP-NEXT: buffer_gl1_inv 3349; GFX1132_DPP-NEXT: buffer_gl0_inv 3350; GFX1132_DPP-NEXT: .LBB5_2: 3351; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 3352; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) 3353; GFX1132_DPP-NEXT: v_readfirstlane_b32 s2, v8 3354; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v6 3355; GFX1132_DPP-NEXT: v_mov_b32_e32 v11, v7 3356; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v9 3357; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) 3358; GFX1132_DPP-NEXT: v_add_co_u32 v8, vcc_lo, s2, v10 3359; GFX1132_DPP-NEXT: s_mov_b32 s2, s6 3360; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, s3, v11, vcc_lo 3361; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 3362; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], 0 3363; GFX1132_DPP-NEXT: s_endpgm 3364; 3365; GFX1264_DPP-LABEL: add_i64_varying: 3366; GFX1264_DPP: ; %bb.0: ; %entry 3367; GFX1264_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 3368; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 3369; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) 3370; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] 3371; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, 0 3372; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] 3373; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, 0 3374; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 3375; GFX1264_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3376; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:1 
row_mask:0xf bank_mask:0xf bound_ctrl:1 3377; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 3378; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc 3379; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 3380; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, 0 3381; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 3382; GFX1264_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf 3383; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc 3384; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) 3385; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 3386; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, 0 3387; GFX1264_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf 3388; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) 3389; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc 3390; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 3391; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, 0 3392; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 3393; GFX1264_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf 3394; GFX1264_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 3395; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 3396; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc 3397; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3398; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, 0 3399; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 3400; GFX1264_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1 3401; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v4 
quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3402; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 3403; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc 3404; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, 0 3405; GFX1264_DPP-NEXT: v_readlane_b32 s2, v1, 31 3406; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 3407; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, s2 3408; GFX1264_DPP-NEXT: v_readlane_b32 s2, v2, 31 3409; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3410; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) 3411; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v2, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3412; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, 0 3413; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc 3414; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] 3415; GFX1264_DPP-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 3416; GFX1264_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3417; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 3418; GFX1264_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3419; GFX1264_DPP-NEXT: v_readlane_b32 s6, v2, 15 3420; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3421; GFX1264_DPP-NEXT: v_readlane_b32 s7, v1, 15 3422; GFX1264_DPP-NEXT: v_readlane_b32 s8, v2, 31 3423; GFX1264_DPP-NEXT: v_readlane_b32 s9, v1, 31 3424; GFX1264_DPP-NEXT: v_writelane_b32 v4, s6, 16 3425; GFX1264_DPP-NEXT: v_readlane_b32 s6, v2, 63 3426; GFX1264_DPP-NEXT: v_writelane_b32 v5, s7, 16 3427; GFX1264_DPP-NEXT: v_readlane_b32 s10, v2, 47 3428; GFX1264_DPP-NEXT: v_readlane_b32 s11, v1, 47 3429; GFX1264_DPP-NEXT: v_readlane_b32 s7, v1, 63 3430; GFX1264_DPP-NEXT: v_writelane_b32 v4, s8, 32 3431; GFX1264_DPP-NEXT: v_writelane_b32 v5, s9, 32 3432; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5] 3433; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 3434; 
GFX1264_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3435; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[8:9], -1 3436; GFX1264_DPP-NEXT: s_mov_b64 s[4:5], s[6:7] 3437; GFX1264_DPP-NEXT: v_writelane_b32 v4, s10, 48 3438; GFX1264_DPP-NEXT: v_writelane_b32 v5, s11, 48 3439; GFX1264_DPP-NEXT: s_wait_alu 0xfffe 3440; GFX1264_DPP-NEXT: s_mov_b64 exec, s[8:9] 3441; GFX1264_DPP-NEXT: s_mov_b32 s6, -1 3442; GFX1264_DPP-NEXT: s_mov_b64 s[8:9], exec 3443; GFX1264_DPP-NEXT: ; implicit-def: $vgpr6_vgpr7 3444; GFX1264_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 3445; GFX1264_DPP-NEXT: s_cbranch_execz .LBB5_2 3446; GFX1264_DPP-NEXT: ; %bb.1: 3447; GFX1264_DPP-NEXT: v_mov_b32_e32 v7, s5 3448; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, s4 3449; GFX1264_DPP-NEXT: s_mov_b32 s7, 0x31016000 3450; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0 3451; GFX1264_DPP-NEXT: s_mov_b32 s4, s2 3452; GFX1264_DPP-NEXT: s_mov_b32 s5, s3 3453; GFX1264_DPP-NEXT: buffer_atomic_add_u64 v[6:7], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV 3454; GFX1264_DPP-NEXT: s_wait_loadcnt 0x0 3455; GFX1264_DPP-NEXT: global_inv scope:SCOPE_DEV 3456; GFX1264_DPP-NEXT: .LBB5_2: 3457; GFX1264_DPP-NEXT: s_wait_alu 0xfffe 3458; GFX1264_DPP-NEXT: s_or_b64 exec, exec, s[8:9] 3459; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0 3460; GFX1264_DPP-NEXT: v_readfirstlane_b32 s2, v6 3461; GFX1264_DPP-NEXT: v_mov_b32_e32 v8, v4 3462; GFX1264_DPP-NEXT: v_mov_b32_e32 v9, v5 3463; GFX1264_DPP-NEXT: v_readfirstlane_b32 s3, v7 3464; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) 3465; GFX1264_DPP-NEXT: v_add_co_u32 v6, vcc, s2, v8 3466; GFX1264_DPP-NEXT: s_mov_b32 s2, s6 3467; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v7, vcc, s3, v9, vcc 3468; GFX1264_DPP-NEXT: s_mov_b32 s3, 0x31016000 3469; GFX1264_DPP-NEXT: buffer_store_b64 v[6:7], off, s[0:3], null 3470; GFX1264_DPP-NEXT: s_endpgm 3471; 3472; GFX1232_DPP-LABEL: add_i64_varying: 3473; GFX1232_DPP: ; %bb.0: ; %entry 3474; GFX1232_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 3475; 
GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1 3476; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) 3477; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s0 3478; GFX1232_DPP-NEXT: v_mov_b32_e32 v2, 0 3479; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s0 3480; GFX1232_DPP-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v7, 0 3481; GFX1232_DPP-NEXT: v_mov_b32_e32 v6, 0 3482; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 3483; GFX1232_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3484; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 3485; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 3486; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo 3487; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 3488; GFX1232_DPP-NEXT: v_mov_b32_e32 v2, 0 3489; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 3490; GFX1232_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf 3491; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo 3492; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) 3493; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 3494; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, 0 3495; GFX1232_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf 3496; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) 3497; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo 3498; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc_lo, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 3499; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, 0 3500; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(NEXT) | instid1(VALU_DEP_3) 3501; GFX1232_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf 3502; GFX1232_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 3503; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 3504; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo 3505; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc_lo, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3506; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 3507; GFX1232_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1 3508; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3509; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 3510; GFX1232_DPP-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 3511; GFX1232_DPP-NEXT: s_or_saveexec_b32 s6, -1 3512; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) 3513; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo 3514; GFX1232_DPP-NEXT: v_readlane_b32 s4, v2, 31 3515; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3516; GFX1232_DPP-NEXT: v_readlane_b32 s7, v2, 15 3517; GFX1232_DPP-NEXT: v_readlane_b32 s8, v1, 15 3518; GFX1232_DPP-NEXT: v_readlane_b32 s5, v1, 31 3519; GFX1232_DPP-NEXT: v_mov_b32_dpp v7, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3520; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s6 3521; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 3522; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3523; GFX1232_DPP-NEXT: s_or_saveexec_b32 s6, -1 3524; GFX1232_DPP-NEXT: v_writelane_b32 v6, s7, 16 3525; GFX1232_DPP-NEXT: v_writelane_b32 v7, s8, 16 3526; GFX1232_DPP-NEXT: s_wait_alu 0xfffe 3527; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s6 3528; GFX1232_DPP-NEXT: s_mov_b32 s6, -1 3529; GFX1232_DPP-NEXT: s_mov_b32 s8, exec_lo 3530; GFX1232_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9 3531; GFX1232_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 3532; GFX1232_DPP-NEXT: 
s_cbranch_execz .LBB5_2 3533; GFX1232_DPP-NEXT: ; %bb.1: 3534; GFX1232_DPP-NEXT: v_dual_mov_b32 v9, s5 :: v_dual_mov_b32 v8, s4 3535; GFX1232_DPP-NEXT: s_mov_b32 s7, 0x31016000 3536; GFX1232_DPP-NEXT: s_wait_kmcnt 0x0 3537; GFX1232_DPP-NEXT: s_mov_b32 s4, s2 3538; GFX1232_DPP-NEXT: s_mov_b32 s5, s3 3539; GFX1232_DPP-NEXT: buffer_atomic_add_u64 v[8:9], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV 3540; GFX1232_DPP-NEXT: s_wait_loadcnt 0x0 3541; GFX1232_DPP-NEXT: global_inv scope:SCOPE_DEV 3542; GFX1232_DPP-NEXT: .LBB5_2: 3543; GFX1232_DPP-NEXT: s_wait_alu 0xfffe 3544; GFX1232_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 3545; GFX1232_DPP-NEXT: s_wait_kmcnt 0x0 3546; GFX1232_DPP-NEXT: v_readfirstlane_b32 s2, v8 3547; GFX1232_DPP-NEXT: v_mov_b32_e32 v10, v6 3548; GFX1232_DPP-NEXT: v_mov_b32_e32 v11, v7 3549; GFX1232_DPP-NEXT: v_readfirstlane_b32 s3, v9 3550; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) 3551; GFX1232_DPP-NEXT: v_add_co_u32 v8, vcc_lo, s2, v10 3552; GFX1232_DPP-NEXT: s_mov_b32 s2, s6 3553; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, s3, v11, vcc_lo 3554; GFX1232_DPP-NEXT: s_mov_b32 s3, 0x31016000 3555; GFX1232_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], null 3556; GFX1232_DPP-NEXT: s_endpgm 3557entry: 3558 %lane = call i32 @llvm.amdgcn.workitem.id.x() 3559 %zext = zext i32 %lane to i64 3560 %old = atomicrmw add ptr addrspace(1) %inout, i64 %zext syncscope("agent") acq_rel 3561 store i64 %old, ptr addrspace(1) %out 3562 ret void 3563} 3564 3565define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace(1) %inout) { 3566; GFX7LESS-LABEL: sub_i32_constant: 3567; GFX7LESS: ; %bb.0: ; %entry 3568; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 3569; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 3570; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 3571; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 3572; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3573; GFX7LESS-NEXT: ; 
implicit-def: $vgpr1 3574; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 3575; GFX7LESS-NEXT: s_cbranch_execz .LBB6_2 3576; GFX7LESS-NEXT: ; %bb.1: 3577; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 3578; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 3579; GFX7LESS-NEXT: s_mul_i32 s6, s6, 5 3580; GFX7LESS-NEXT: s_mov_b32 s10, -1 3581; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3582; GFX7LESS-NEXT: s_mov_b32 s8, s2 3583; GFX7LESS-NEXT: s_mov_b32 s9, s3 3584; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 3585; GFX7LESS-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 3586; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 3587; GFX7LESS-NEXT: buffer_wbinvl1 3588; GFX7LESS-NEXT: .LBB6_2: 3589; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 3590; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3591; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3592; GFX7LESS-NEXT: s_mov_b32 s2, -1 3593; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 3594; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 3595; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 3596; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 3597; GFX7LESS-NEXT: s_endpgm 3598; 3599; GFX8-LABEL: sub_i32_constant: 3600; GFX8: ; %bb.0: ; %entry 3601; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3602; GFX8-NEXT: s_mov_b64 s[6:7], exec 3603; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 3604; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 3605; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3606; GFX8-NEXT: ; implicit-def: $vgpr1 3607; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 3608; GFX8-NEXT: s_cbranch_execz .LBB6_2 3609; GFX8-NEXT: ; %bb.1: 3610; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3611; GFX8-NEXT: s_mov_b32 s8, s2 3612; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[6:7] 3613; GFX8-NEXT: s_mul_i32 s2, s2, 5 3614; GFX8-NEXT: s_mov_b32 s11, 0xf000 3615; GFX8-NEXT: s_mov_b32 s10, -1 3616; GFX8-NEXT: s_mov_b32 s9, s3 3617; GFX8-NEXT: v_mov_b32_e32 v1, s2 3618; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 3619; GFX8-NEXT: s_waitcnt vmcnt(0) 3620; GFX8-NEXT: buffer_wbinvl1_vol 3621; GFX8-NEXT: .LBB6_2: 3622; GFX8-NEXT: s_or_b64 
exec, exec, s[4:5] 3623; GFX8-NEXT: v_readfirstlane_b32 s4, v1 3624; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 3625; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3626; GFX8-NEXT: s_mov_b32 s3, 0xf000 3627; GFX8-NEXT: s_mov_b32 s2, -1 3628; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 3629; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 3630; GFX8-NEXT: s_endpgm 3631; 3632; GFX9-LABEL: sub_i32_constant: 3633; GFX9: ; %bb.0: ; %entry 3634; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3635; GFX9-NEXT: s_mov_b64 s[6:7], exec 3636; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 3637; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 3638; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3639; GFX9-NEXT: ; implicit-def: $vgpr1 3640; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 3641; GFX9-NEXT: s_cbranch_execz .LBB6_2 3642; GFX9-NEXT: ; %bb.1: 3643; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3644; GFX9-NEXT: s_mov_b32 s8, s2 3645; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[6:7] 3646; GFX9-NEXT: s_mul_i32 s2, s2, 5 3647; GFX9-NEXT: s_mov_b32 s11, 0xf000 3648; GFX9-NEXT: s_mov_b32 s10, -1 3649; GFX9-NEXT: s_mov_b32 s9, s3 3650; GFX9-NEXT: v_mov_b32_e32 v1, s2 3651; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 3652; GFX9-NEXT: s_waitcnt vmcnt(0) 3653; GFX9-NEXT: buffer_wbinvl1_vol 3654; GFX9-NEXT: .LBB6_2: 3655; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 3656; GFX9-NEXT: v_readfirstlane_b32 s4, v1 3657; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 3658; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3659; GFX9-NEXT: s_mov_b32 s3, 0xf000 3660; GFX9-NEXT: s_mov_b32 s2, -1 3661; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 3662; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 3663; GFX9-NEXT: s_endpgm 3664; 3665; GFX1064-LABEL: sub_i32_constant: 3666; GFX1064: ; %bb.0: ; %entry 3667; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3668; GFX1064-NEXT: s_mov_b64 s[6:7], exec 3669; GFX1064-NEXT: ; implicit-def: $vgpr1 3670; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 3671; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 3672; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3673; 
GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 3674; GFX1064-NEXT: s_cbranch_execz .LBB6_2 3675; GFX1064-NEXT: ; %bb.1: 3676; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 3677; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 3678; GFX1064-NEXT: s_mul_i32 s6, s6, 5 3679; GFX1064-NEXT: s_mov_b32 s10, -1 3680; GFX1064-NEXT: v_mov_b32_e32 v1, s6 3681; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3682; GFX1064-NEXT: s_mov_b32 s8, s2 3683; GFX1064-NEXT: s_mov_b32 s9, s3 3684; GFX1064-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 3685; GFX1064-NEXT: s_waitcnt vmcnt(0) 3686; GFX1064-NEXT: buffer_gl1_inv 3687; GFX1064-NEXT: buffer_gl0_inv 3688; GFX1064-NEXT: .LBB6_2: 3689; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3690; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 3691; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3692; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 3693; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v0 3694; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3695; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v0 3696; GFX1064-NEXT: s_mov_b32 s2, -1 3697; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 3698; GFX1064-NEXT: s_endpgm 3699; 3700; GFX1032-LABEL: sub_i32_constant: 3701; GFX1032: ; %bb.0: ; %entry 3702; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3703; GFX1032-NEXT: s_mov_b32 s6, exec_lo 3704; GFX1032-NEXT: ; implicit-def: $vgpr1 3705; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 3706; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3707; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 3708; GFX1032-NEXT: s_cbranch_execz .LBB6_2 3709; GFX1032-NEXT: ; %bb.1: 3710; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s6 3711; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 3712; GFX1032-NEXT: s_mul_i32 s5, s5, 5 3713; GFX1032-NEXT: s_mov_b32 s10, -1 3714; GFX1032-NEXT: v_mov_b32_e32 v1, s5 3715; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3716; GFX1032-NEXT: s_mov_b32 s8, s2 3717; GFX1032-NEXT: s_mov_b32 s9, s3 3718; GFX1032-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 3719; GFX1032-NEXT: s_waitcnt vmcnt(0) 3720; GFX1032-NEXT: buffer_gl1_inv 3721; 
GFX1032-NEXT: buffer_gl0_inv 3722; GFX1032-NEXT: .LBB6_2: 3723; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3724; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 3725; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3726; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 3727; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v0 3728; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3729; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v0 3730; GFX1032-NEXT: s_mov_b32 s2, -1 3731; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 3732; GFX1032-NEXT: s_endpgm 3733; 3734; GFX1164-LABEL: sub_i32_constant: 3735; GFX1164: ; %bb.0: ; %entry 3736; GFX1164-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 3737; GFX1164-NEXT: s_mov_b64 s[6:7], exec 3738; GFX1164-NEXT: s_mov_b64 s[4:5], exec 3739; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 3740; GFX1164-NEXT: ; implicit-def: $vgpr1 3741; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3742; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 3743; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 3744; GFX1164-NEXT: s_cbranch_execz .LBB6_2 3745; GFX1164-NEXT: ; %bb.1: 3746; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 3747; GFX1164-NEXT: s_mov_b32 s11, 0x31016000 3748; GFX1164-NEXT: s_mul_i32 s6, s6, 5 3749; GFX1164-NEXT: s_mov_b32 s10, -1 3750; GFX1164-NEXT: v_mov_b32_e32 v1, s6 3751; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 3752; GFX1164-NEXT: s_mov_b32 s8, s2 3753; GFX1164-NEXT: s_mov_b32 s9, s3 3754; GFX1164-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc 3755; GFX1164-NEXT: s_waitcnt vmcnt(0) 3756; GFX1164-NEXT: buffer_gl1_inv 3757; GFX1164-NEXT: buffer_gl0_inv 3758; GFX1164-NEXT: .LBB6_2: 3759; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 3760; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 3761; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 3762; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v0 3763; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 3764; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 3765; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v0 3766; GFX1164-NEXT: s_mov_b32 s2, -1 3767; GFX1164-NEXT: 
buffer_store_b32 v0, off, s[0:3], 0 3768; GFX1164-NEXT: s_endpgm 3769; 3770; GFX1132-LABEL: sub_i32_constant: 3771; GFX1132: ; %bb.0: ; %entry 3772; GFX1132-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 3773; GFX1132-NEXT: s_mov_b32 s6, exec_lo 3774; GFX1132-NEXT: s_mov_b32 s4, exec_lo 3775; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 3776; GFX1132-NEXT: ; implicit-def: $vgpr1 3777; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 3778; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 3779; GFX1132-NEXT: s_cbranch_execz .LBB6_2 3780; GFX1132-NEXT: ; %bb.1: 3781; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s6 3782; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 3783; GFX1132-NEXT: s_mul_i32 s5, s5, 5 3784; GFX1132-NEXT: s_mov_b32 s10, -1 3785; GFX1132-NEXT: v_mov_b32_e32 v1, s5 3786; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 3787; GFX1132-NEXT: s_mov_b32 s8, s2 3788; GFX1132-NEXT: s_mov_b32 s9, s3 3789; GFX1132-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc 3790; GFX1132-NEXT: s_waitcnt vmcnt(0) 3791; GFX1132-NEXT: buffer_gl1_inv 3792; GFX1132-NEXT: buffer_gl0_inv 3793; GFX1132-NEXT: .LBB6_2: 3794; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4 3795; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 3796; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 3797; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v0 3798; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 3799; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 3800; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v0 3801; GFX1132-NEXT: s_mov_b32 s2, -1 3802; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 3803; GFX1132-NEXT: s_endpgm 3804; 3805; GFX1264-LABEL: sub_i32_constant: 3806; GFX1264: ; %bb.0: ; %entry 3807; GFX1264-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 3808; GFX1264-NEXT: s_mov_b64 s[6:7], exec 3809; GFX1264-NEXT: s_mov_b64 s[4:5], exec 3810; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 3811; GFX1264-NEXT: ; implicit-def: $vgpr1 3812; GFX1264-NEXT: s_wait_alu 0xfffe 3813; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3814; GFX1264-NEXT: v_mbcnt_hi_u32_b32 
v0, s7, v0 3815; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0 3816; GFX1264-NEXT: s_cbranch_execz .LBB6_2 3817; GFX1264-NEXT: ; %bb.1: 3818; GFX1264-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 3819; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 3820; GFX1264-NEXT: s_wait_alu 0xfffe 3821; GFX1264-NEXT: s_mul_i32 s6, s6, 5 3822; GFX1264-NEXT: s_mov_b32 s10, -1 3823; GFX1264-NEXT: s_wait_alu 0xfffe 3824; GFX1264-NEXT: v_mov_b32_e32 v1, s6 3825; GFX1264-NEXT: s_wait_kmcnt 0x0 3826; GFX1264-NEXT: s_mov_b32 s8, s2 3827; GFX1264-NEXT: s_mov_b32 s9, s3 3828; GFX1264-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV 3829; GFX1264-NEXT: s_wait_loadcnt 0x0 3830; GFX1264-NEXT: global_inv scope:SCOPE_DEV 3831; GFX1264-NEXT: .LBB6_2: 3832; GFX1264-NEXT: s_or_b64 exec, exec, s[4:5] 3833; GFX1264-NEXT: s_wait_kmcnt 0x0 3834; GFX1264-NEXT: v_readfirstlane_b32 s2, v1 3835; GFX1264-NEXT: v_mul_u32_u24_e32 v0, 5, v0 3836; GFX1264-NEXT: s_mov_b32 s3, 0x31016000 3837; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) 3838; GFX1264-NEXT: v_sub_nc_u32_e32 v0, s2, v0 3839; GFX1264-NEXT: s_mov_b32 s2, -1 3840; GFX1264-NEXT: buffer_store_b32 v0, off, s[0:3], null 3841; GFX1264-NEXT: s_endpgm 3842; 3843; GFX1232-LABEL: sub_i32_constant: 3844; GFX1232: ; %bb.0: ; %entry 3845; GFX1232-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 3846; GFX1232-NEXT: s_mov_b32 s6, exec_lo 3847; GFX1232-NEXT: s_mov_b32 s4, exec_lo 3848; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 3849; GFX1232-NEXT: ; implicit-def: $vgpr1 3850; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) 3851; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v0 3852; GFX1232-NEXT: s_cbranch_execz .LBB6_2 3853; GFX1232-NEXT: ; %bb.1: 3854; GFX1232-NEXT: s_bcnt1_i32_b32 s5, s6 3855; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 3856; GFX1232-NEXT: s_mul_i32 s5, s5, 5 3857; GFX1232-NEXT: s_mov_b32 s10, -1 3858; GFX1232-NEXT: v_mov_b32_e32 v1, s5 3859; GFX1232-NEXT: s_wait_kmcnt 0x0 3860; GFX1232-NEXT: s_mov_b32 s8, s2 3861; GFX1232-NEXT: s_mov_b32 s9, s3 3862; 
GFX1232-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV 3863; GFX1232-NEXT: s_wait_loadcnt 0x0 3864; GFX1232-NEXT: global_inv scope:SCOPE_DEV 3865; GFX1232-NEXT: .LBB6_2: 3866; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s4 3867; GFX1232-NEXT: s_wait_kmcnt 0x0 3868; GFX1232-NEXT: v_readfirstlane_b32 s2, v1 3869; GFX1232-NEXT: v_mul_u32_u24_e32 v0, 5, v0 3870; GFX1232-NEXT: s_mov_b32 s3, 0x31016000 3871; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) 3872; GFX1232-NEXT: v_sub_nc_u32_e32 v0, s2, v0 3873; GFX1232-NEXT: s_mov_b32 s2, -1 3874; GFX1232-NEXT: buffer_store_b32 v0, off, s[0:3], null 3875; GFX1232-NEXT: s_endpgm 3876entry: 3877 %old = atomicrmw sub ptr addrspace(1) %inout, i32 5 syncscope("agent") acq_rel 3878 store i32 %old, ptr addrspace(1) %out 3879 ret void 3880} 3881 3882define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(1) %inout, i32 %subitive) { 3883; GFX7LESS-LABEL: sub_i32_uniform: 3884; GFX7LESS: ; %bb.0: ; %entry 3885; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 3886; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 3887; GFX7LESS-NEXT: s_load_dword s8, s[4:5], 0xd 3888; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 3889; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 3890; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3891; GFX7LESS-NEXT: ; implicit-def: $vgpr1 3892; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 3893; GFX7LESS-NEXT: s_cbranch_execz .LBB7_2 3894; GFX7LESS-NEXT: ; %bb.1: 3895; GFX7LESS-NEXT: s_mov_b32 s15, 0xf000 3896; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 3897; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3898; GFX7LESS-NEXT: s_mul_i32 s6, s8, s6 3899; GFX7LESS-NEXT: s_mov_b32 s14, -1 3900; GFX7LESS-NEXT: s_mov_b32 s12, s2 3901; GFX7LESS-NEXT: s_mov_b32 s13, s3 3902; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 3903; GFX7LESS-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc 3904; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 3905; GFX7LESS-NEXT: buffer_wbinvl1 3906; GFX7LESS-NEXT: .LBB7_2: 3907; 
GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 3908; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3909; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3910; GFX7LESS-NEXT: s_mov_b32 s2, -1 3911; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 3912; GFX7LESS-NEXT: v_mul_lo_u32 v0, s8, v0 3913; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 3914; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 3915; GFX7LESS-NEXT: s_endpgm 3916; 3917; GFX8-LABEL: sub_i32_uniform: 3918; GFX8: ; %bb.0: ; %entry 3919; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3920; GFX8-NEXT: s_load_dword s8, s[4:5], 0x34 3921; GFX8-NEXT: s_mov_b64 s[6:7], exec 3922; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 3923; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 3924; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3925; GFX8-NEXT: ; implicit-def: $vgpr1 3926; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 3927; GFX8-NEXT: s_cbranch_execz .LBB7_2 3928; GFX8-NEXT: ; %bb.1: 3929; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3930; GFX8-NEXT: s_mov_b32 s12, s2 3931; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[6:7] 3932; GFX8-NEXT: s_mul_i32 s2, s8, s2 3933; GFX8-NEXT: s_mov_b32 s15, 0xf000 3934; GFX8-NEXT: s_mov_b32 s14, -1 3935; GFX8-NEXT: s_mov_b32 s13, s3 3936; GFX8-NEXT: v_mov_b32_e32 v1, s2 3937; GFX8-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc 3938; GFX8-NEXT: s_waitcnt vmcnt(0) 3939; GFX8-NEXT: buffer_wbinvl1_vol 3940; GFX8-NEXT: .LBB7_2: 3941; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 3942; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3943; GFX8-NEXT: v_mul_lo_u32 v0, s8, v0 3944; GFX8-NEXT: v_readfirstlane_b32 s4, v1 3945; GFX8-NEXT: s_mov_b32 s3, 0xf000 3946; GFX8-NEXT: s_mov_b32 s2, -1 3947; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 3948; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 3949; GFX8-NEXT: s_endpgm 3950; 3951; GFX9-LABEL: sub_i32_uniform: 3952; GFX9: ; %bb.0: ; %entry 3953; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3954; GFX9-NEXT: s_load_dword s8, s[4:5], 0x34 3955; GFX9-NEXT: s_mov_b64 s[6:7], exec 3956; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 3957; 
GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 3958; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3959; GFX9-NEXT: ; implicit-def: $vgpr1 3960; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 3961; GFX9-NEXT: s_cbranch_execz .LBB7_2 3962; GFX9-NEXT: ; %bb.1: 3963; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3964; GFX9-NEXT: s_mov_b32 s12, s2 3965; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[6:7] 3966; GFX9-NEXT: s_mul_i32 s2, s8, s2 3967; GFX9-NEXT: s_mov_b32 s15, 0xf000 3968; GFX9-NEXT: s_mov_b32 s14, -1 3969; GFX9-NEXT: s_mov_b32 s13, s3 3970; GFX9-NEXT: v_mov_b32_e32 v1, s2 3971; GFX9-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc 3972; GFX9-NEXT: s_waitcnt vmcnt(0) 3973; GFX9-NEXT: buffer_wbinvl1_vol 3974; GFX9-NEXT: .LBB7_2: 3975; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 3976; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3977; GFX9-NEXT: v_mul_lo_u32 v0, s8, v0 3978; GFX9-NEXT: v_readfirstlane_b32 s4, v1 3979; GFX9-NEXT: s_mov_b32 s3, 0xf000 3980; GFX9-NEXT: s_mov_b32 s2, -1 3981; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 3982; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 3983; GFX9-NEXT: s_endpgm 3984; 3985; GFX1064-LABEL: sub_i32_uniform: 3986; GFX1064: ; %bb.0: ; %entry 3987; GFX1064-NEXT: s_clause 0x1 3988; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3989; GFX1064-NEXT: s_load_dword s8, s[4:5], 0x34 3990; GFX1064-NEXT: s_mov_b64 s[6:7], exec 3991; GFX1064-NEXT: ; implicit-def: $vgpr1 3992; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 3993; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 3994; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3995; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 3996; GFX1064-NEXT: s_cbranch_execz .LBB7_2 3997; GFX1064-NEXT: ; %bb.1: 3998; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 3999; GFX1064-NEXT: s_mov_b32 s15, 0x31016000 4000; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4001; GFX1064-NEXT: s_mul_i32 s6, s8, s6 4002; GFX1064-NEXT: s_mov_b32 s14, -1 4003; GFX1064-NEXT: v_mov_b32_e32 v1, s6 4004; GFX1064-NEXT: s_mov_b32 s12, s2 4005; GFX1064-NEXT: s_mov_b32 s13, s3 4006; GFX1064-NEXT: 
buffer_atomic_sub v1, off, s[12:15], 0 glc 4007; GFX1064-NEXT: s_waitcnt vmcnt(0) 4008; GFX1064-NEXT: buffer_gl1_inv 4009; GFX1064-NEXT: buffer_gl0_inv 4010; GFX1064-NEXT: .LBB7_2: 4011; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 4012; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 4013; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4014; GFX1064-NEXT: v_mul_lo_u32 v0, s8, v0 4015; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 4016; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4017; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v0 4018; GFX1064-NEXT: s_mov_b32 s2, -1 4019; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 4020; GFX1064-NEXT: s_endpgm 4021; 4022; GFX1032-LABEL: sub_i32_uniform: 4023; GFX1032: ; %bb.0: ; %entry 4024; GFX1032-NEXT: s_clause 0x1 4025; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 4026; GFX1032-NEXT: s_load_dword s6, s[4:5], 0x34 4027; GFX1032-NEXT: s_mov_b32 s7, exec_lo 4028; GFX1032-NEXT: ; implicit-def: $vgpr1 4029; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s7, 0 4030; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4031; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 4032; GFX1032-NEXT: s_cbranch_execz .LBB7_2 4033; GFX1032-NEXT: ; %bb.1: 4034; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s7 4035; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 4036; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4037; GFX1032-NEXT: s_mul_i32 s5, s6, s5 4038; GFX1032-NEXT: s_mov_b32 s10, -1 4039; GFX1032-NEXT: v_mov_b32_e32 v1, s5 4040; GFX1032-NEXT: s_mov_b32 s8, s2 4041; GFX1032-NEXT: s_mov_b32 s9, s3 4042; GFX1032-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 4043; GFX1032-NEXT: s_waitcnt vmcnt(0) 4044; GFX1032-NEXT: buffer_gl1_inv 4045; GFX1032-NEXT: buffer_gl0_inv 4046; GFX1032-NEXT: .LBB7_2: 4047; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 4048; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 4049; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4050; GFX1032-NEXT: v_mul_lo_u32 v0, s6, v0 4051; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 4052; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 4053; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v0 4054; 
GFX1032-NEXT: s_mov_b32 s2, -1 4055; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 4056; GFX1032-NEXT: s_endpgm 4057; 4058; GFX1164-LABEL: sub_i32_uniform: 4059; GFX1164: ; %bb.0: ; %entry 4060; GFX1164-NEXT: s_clause 0x1 4061; GFX1164-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 4062; GFX1164-NEXT: s_load_b32 s8, s[4:5], 0x34 4063; GFX1164-NEXT: s_mov_b64 s[6:7], exec 4064; GFX1164-NEXT: s_mov_b64 s[4:5], exec 4065; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 4066; GFX1164-NEXT: ; implicit-def: $vgpr1 4067; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4068; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 4069; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 4070; GFX1164-NEXT: s_cbranch_execz .LBB7_2 4071; GFX1164-NEXT: ; %bb.1: 4072; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 4073; GFX1164-NEXT: s_mov_b32 s15, 0x31016000 4074; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 4075; GFX1164-NEXT: s_mul_i32 s6, s8, s6 4076; GFX1164-NEXT: s_mov_b32 s14, -1 4077; GFX1164-NEXT: v_mov_b32_e32 v1, s6 4078; GFX1164-NEXT: s_mov_b32 s12, s2 4079; GFX1164-NEXT: s_mov_b32 s13, s3 4080; GFX1164-NEXT: buffer_atomic_sub_u32 v1, off, s[12:15], 0 glc 4081; GFX1164-NEXT: s_waitcnt vmcnt(0) 4082; GFX1164-NEXT: buffer_gl1_inv 4083; GFX1164-NEXT: buffer_gl0_inv 4084; GFX1164-NEXT: .LBB7_2: 4085; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 4086; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 4087; GFX1164-NEXT: v_mul_lo_u32 v0, s8, v0 4088; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 4089; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 4090; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 4091; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v0 4092; GFX1164-NEXT: s_mov_b32 s2, -1 4093; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 4094; GFX1164-NEXT: s_endpgm 4095; 4096; GFX1132-LABEL: sub_i32_uniform: 4097; GFX1132: ; %bb.0: ; %entry 4098; GFX1132-NEXT: s_clause 0x1 4099; GFX1132-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 4100; GFX1132-NEXT: s_load_b32 s4, s[4:5], 0x34 4101; GFX1132-NEXT: s_mov_b32 s6, exec_lo 
4102; GFX1132-NEXT: s_mov_b32 s5, exec_lo 4103; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 4104; GFX1132-NEXT: ; implicit-def: $vgpr1 4105; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 4106; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 4107; GFX1132-NEXT: s_cbranch_execz .LBB7_2 4108; GFX1132-NEXT: ; %bb.1: 4109; GFX1132-NEXT: s_bcnt1_i32_b32 s6, s6 4110; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 4111; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 4112; GFX1132-NEXT: s_mul_i32 s6, s4, s6 4113; GFX1132-NEXT: s_mov_b32 s10, -1 4114; GFX1132-NEXT: v_mov_b32_e32 v1, s6 4115; GFX1132-NEXT: s_mov_b32 s8, s2 4116; GFX1132-NEXT: s_mov_b32 s9, s3 4117; GFX1132-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc 4118; GFX1132-NEXT: s_waitcnt vmcnt(0) 4119; GFX1132-NEXT: buffer_gl1_inv 4120; GFX1132-NEXT: buffer_gl0_inv 4121; GFX1132-NEXT: .LBB7_2: 4122; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s5 4123; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 4124; GFX1132-NEXT: v_mul_lo_u32 v0, s4, v0 4125; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 4126; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 4127; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 4128; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v0 4129; GFX1132-NEXT: s_mov_b32 s2, -1 4130; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 4131; GFX1132-NEXT: s_endpgm 4132; 4133; GFX1264-LABEL: sub_i32_uniform: 4134; GFX1264: ; %bb.0: ; %entry 4135; GFX1264-NEXT: s_clause 0x1 4136; GFX1264-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 4137; GFX1264-NEXT: s_load_b32 s8, s[4:5], 0x34 4138; GFX1264-NEXT: s_mov_b64 s[6:7], exec 4139; GFX1264-NEXT: s_mov_b64 s[4:5], exec 4140; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 4141; GFX1264-NEXT: ; implicit-def: $vgpr1 4142; GFX1264-NEXT: s_wait_alu 0xfffe 4143; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4144; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 4145; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0 4146; GFX1264-NEXT: s_cbranch_execz .LBB7_2 4147; GFX1264-NEXT: ; %bb.1: 4148; GFX1264-NEXT: 
s_bcnt1_i32_b64 s6, s[6:7] 4149; GFX1264-NEXT: s_mov_b32 s15, 0x31016000 4150; GFX1264-NEXT: s_wait_kmcnt 0x0 4151; GFX1264-NEXT: s_wait_alu 0xfffe 4152; GFX1264-NEXT: s_mul_i32 s6, s8, s6 4153; GFX1264-NEXT: s_mov_b32 s14, -1 4154; GFX1264-NEXT: s_wait_alu 0xfffe 4155; GFX1264-NEXT: v_mov_b32_e32 v1, s6 4156; GFX1264-NEXT: s_mov_b32 s12, s2 4157; GFX1264-NEXT: s_mov_b32 s13, s3 4158; GFX1264-NEXT: buffer_atomic_sub_u32 v1, off, s[12:15], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV 4159; GFX1264-NEXT: s_wait_loadcnt 0x0 4160; GFX1264-NEXT: global_inv scope:SCOPE_DEV 4161; GFX1264-NEXT: .LBB7_2: 4162; GFX1264-NEXT: s_or_b64 exec, exec, s[4:5] 4163; GFX1264-NEXT: s_wait_kmcnt 0x0 4164; GFX1264-NEXT: v_mul_lo_u32 v0, s8, v0 4165; GFX1264-NEXT: v_readfirstlane_b32 s2, v1 4166; GFX1264-NEXT: s_mov_b32 s3, 0x31016000 4167; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) 4168; GFX1264-NEXT: v_sub_nc_u32_e32 v0, s2, v0 4169; GFX1264-NEXT: s_mov_b32 s2, -1 4170; GFX1264-NEXT: buffer_store_b32 v0, off, s[0:3], null 4171; GFX1264-NEXT: s_endpgm 4172; 4173; GFX1232-LABEL: sub_i32_uniform: 4174; GFX1232: ; %bb.0: ; %entry 4175; GFX1232-NEXT: s_clause 0x1 4176; GFX1232-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 4177; GFX1232-NEXT: s_load_b32 s4, s[4:5], 0x34 4178; GFX1232-NEXT: s_mov_b32 s6, exec_lo 4179; GFX1232-NEXT: s_mov_b32 s5, exec_lo 4180; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 4181; GFX1232-NEXT: ; implicit-def: $vgpr1 4182; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) 4183; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v0 4184; GFX1232-NEXT: s_cbranch_execz .LBB7_2 4185; GFX1232-NEXT: ; %bb.1: 4186; GFX1232-NEXT: s_wait_alu 0xfffe 4187; GFX1232-NEXT: s_bcnt1_i32_b32 s6, s6 4188; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 4189; GFX1232-NEXT: s_wait_kmcnt 0x0 4190; GFX1232-NEXT: s_wait_alu 0xfffe 4191; GFX1232-NEXT: s_mul_i32 s6, s4, s6 4192; GFX1232-NEXT: s_mov_b32 s10, -1 4193; GFX1232-NEXT: s_wait_alu 0xfffe 4194; GFX1232-NEXT: v_mov_b32_e32 v1, s6 4195; GFX1232-NEXT: s_mov_b32 s8, s2 
4196; GFX1232-NEXT: s_mov_b32 s9, s3 4197; GFX1232-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV 4198; GFX1232-NEXT: s_wait_loadcnt 0x0 4199; GFX1232-NEXT: global_inv scope:SCOPE_DEV 4200; GFX1232-NEXT: .LBB7_2: 4201; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s5 4202; GFX1232-NEXT: s_wait_kmcnt 0x0 4203; GFX1232-NEXT: v_mul_lo_u32 v0, s4, v0 4204; GFX1232-NEXT: v_readfirstlane_b32 s2, v1 4205; GFX1232-NEXT: s_mov_b32 s3, 0x31016000 4206; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) 4207; GFX1232-NEXT: v_sub_nc_u32_e32 v0, s2, v0 4208; GFX1232-NEXT: s_mov_b32 s2, -1 4209; GFX1232-NEXT: buffer_store_b32 v0, off, s[0:3], null 4210; GFX1232-NEXT: s_endpgm 4211entry: 4212 %old = atomicrmw sub ptr addrspace(1) %inout, i32 %subitive syncscope("agent") acq_rel 4213 store i32 %old, ptr addrspace(1) %out 4214 ret void 4215} 4216 4217define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(1) %inout) { 4218; GFX7LESS_ITERATIVE-LABEL: sub_i32_varying: 4219; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry 4220; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 4221; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s6, 0 4222; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1 4223; GFX7LESS_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop 4224; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 4225; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] 4226; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s2 4227; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s2 4228; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0 4229; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 4230; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] 4231; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 4232; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[2:3] 4233; GFX7LESS_ITERATIVE-NEXT: s_add_i32 s6, s6, s7 4234; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB8_1 4235; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 4236; GFX7LESS_ITERATIVE-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x9 4237; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4238; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 4239; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4240; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr0 4241; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc 4242; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 4243; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4 4244; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: 4245; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000 4246; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s10, -1 4247; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 4248; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s8, s2 4249; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s9, s3 4250; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, s6 4251; GFX7LESS_ITERATIVE-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc 4252; GFX7LESS_ITERATIVE-NEXT: s_waitcnt vmcnt(0) 4253; GFX7LESS_ITERATIVE-NEXT: buffer_wbinvl1 4254; GFX7LESS_ITERATIVE-NEXT: .LBB8_4: 4255; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] 4256; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 4257; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 4258; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 4259; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 4260; GFX7LESS_ITERATIVE-NEXT: s_waitcnt expcnt(0) 4261; GFX7LESS_ITERATIVE-NEXT: v_sub_i32_e32 v0, vcc, s4, v1 4262; GFX7LESS_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 4263; GFX7LESS_ITERATIVE-NEXT: s_endpgm 4264; 4265; GFX8_ITERATIVE-LABEL: sub_i32_varying: 4266; GFX8_ITERATIVE: ; %bb.0: ; %entry 4267; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 4268; GFX8_ITERATIVE-NEXT: s_mov_b32 s6, 0 4269; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 4270; GFX8_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop 4271; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 4272; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] 4273; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s2 4274; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, 
v0, s2 4275; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 4276; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0 4277; GFX8_ITERATIVE-NEXT: s_add_i32 s6, s6, s7 4278; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] 4279; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 4280; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 4281; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 4282; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 4283; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4284; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4285; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4286; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr0 4287; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc 4288; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 4289; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4 4290; GFX8_ITERATIVE-NEXT: ; %bb.3: 4291; GFX8_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000 4292; GFX8_ITERATIVE-NEXT: s_mov_b32 s10, -1 4293; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 4294; GFX8_ITERATIVE-NEXT: s_mov_b32 s8, s2 4295; GFX8_ITERATIVE-NEXT: s_mov_b32 s9, s3 4296; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, s6 4297; GFX8_ITERATIVE-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc 4298; GFX8_ITERATIVE-NEXT: s_waitcnt vmcnt(0) 4299; GFX8_ITERATIVE-NEXT: buffer_wbinvl1_vol 4300; GFX8_ITERATIVE-NEXT: .LBB8_4: 4301; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] 4302; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 4303; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 4304; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 4305; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 4306; GFX8_ITERATIVE-NEXT: v_sub_u32_e32 v0, vcc, s4, v1 4307; GFX8_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 4308; GFX8_ITERATIVE-NEXT: s_endpgm 4309; 4310; GFX9_ITERATIVE-LABEL: sub_i32_varying: 4311; GFX9_ITERATIVE: ; %bb.0: ; %entry 4312; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 4313; GFX9_ITERATIVE-NEXT: s_mov_b32 s6, 0 4314; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 4315; 
GFX9_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop 4316; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 4317; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] 4318; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s2 4319; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s2 4320; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 4321; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0 4322; GFX9_ITERATIVE-NEXT: s_add_i32 s6, s6, s7 4323; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] 4324; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 4325; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 4326; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 4327; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 4328; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4329; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4330; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4331; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0 4332; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc 4333; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 4334; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4 4335; GFX9_ITERATIVE-NEXT: ; %bb.3: 4336; GFX9_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000 4337; GFX9_ITERATIVE-NEXT: s_mov_b32 s10, -1 4338; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 4339; GFX9_ITERATIVE-NEXT: s_mov_b32 s8, s2 4340; GFX9_ITERATIVE-NEXT: s_mov_b32 s9, s3 4341; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s6 4342; GFX9_ITERATIVE-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc 4343; GFX9_ITERATIVE-NEXT: s_waitcnt vmcnt(0) 4344; GFX9_ITERATIVE-NEXT: buffer_wbinvl1_vol 4345; GFX9_ITERATIVE-NEXT: .LBB8_4: 4346; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] 4347; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 4348; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 4349; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 4350; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 4351; GFX9_ITERATIVE-NEXT: v_sub_u32_e32 v0, s4, v1 4352; GFX9_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 4353; GFX9_ITERATIVE-NEXT: s_endpgm 4354; 
4355; GFX1064_ITERATIVE-LABEL: sub_i32_varying: 4356; GFX1064_ITERATIVE: ; %bb.0: ; %entry 4357; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 4358; GFX1064_ITERATIVE-NEXT: s_mov_b32 s6, 0 4359; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 4360; GFX1064_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop 4361; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 4362; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s7, s[0:1] 4363; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s7 4364; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7 4365; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s7 4366; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] 4367; GFX1064_ITERATIVE-NEXT: s_add_i32 s6, s6, s8 4368; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 4369; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 4370; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 4371; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 4372; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4373; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4374; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4375; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0 4376; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc 4377; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 4378; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4 4379; GFX1064_ITERATIVE-NEXT: ; %bb.3: 4380; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, s6 4381; GFX1064_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 4382; GFX1064_ITERATIVE-NEXT: s_mov_b32 s10, -1 4383; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 4384; GFX1064_ITERATIVE-NEXT: s_mov_b32 s8, s2 4385; GFX1064_ITERATIVE-NEXT: s_mov_b32 s9, s3 4386; GFX1064_ITERATIVE-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc 4387; GFX1064_ITERATIVE-NEXT: s_waitcnt vmcnt(0) 4388; GFX1064_ITERATIVE-NEXT: buffer_gl1_inv 4389; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv 4390; GFX1064_ITERATIVE-NEXT: .LBB8_4: 4391; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 4392; 
GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] 4393; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 4394; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 4395; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 4396; GFX1064_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v1 4397; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 4398; GFX1064_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 4399; GFX1064_ITERATIVE-NEXT: s_endpgm 4400; 4401; GFX1032_ITERATIVE-LABEL: sub_i32_varying: 4402; GFX1032_ITERATIVE: ; %bb.0: ; %entry 4403; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo 4404; GFX1032_ITERATIVE-NEXT: s_mov_b32 s6, 0 4405; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 4406; GFX1032_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop 4407; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 4408; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0 4409; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1 4410; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1 4411; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s1 4412; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s3 4413; GFX1032_ITERATIVE-NEXT: s_add_i32 s6, s6, s2 4414; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 4415; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 4416; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 4417; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 4418; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4419; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4420; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0 4421; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo 4422; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 4423; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4 4424; GFX1032_ITERATIVE-NEXT: ; %bb.3: 4425; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, s6 4426; GFX1032_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 4427; GFX1032_ITERATIVE-NEXT: s_mov_b32 s10, -1 4428; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 4429; GFX1032_ITERATIVE-NEXT: s_mov_b32 s8, s2 4430; 
GFX1032_ITERATIVE-NEXT: s_mov_b32 s9, s3 4431; GFX1032_ITERATIVE-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc 4432; GFX1032_ITERATIVE-NEXT: s_waitcnt vmcnt(0) 4433; GFX1032_ITERATIVE-NEXT: buffer_gl1_inv 4434; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv 4435; GFX1032_ITERATIVE-NEXT: .LBB8_4: 4436; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 4437; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 4438; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 4439; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 4440; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 4441; GFX1032_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v1 4442; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 4443; GFX1032_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 4444; GFX1032_ITERATIVE-NEXT: s_endpgm 4445; 4446; GFX1164_ITERATIVE-LABEL: sub_i32_varying: 4447; GFX1164_ITERATIVE: ; %bb.0: ; %entry 4448; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 4449; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 4450; GFX1164_ITERATIVE-NEXT: s_mov_b32 s6, 0 4451; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 4452; GFX1164_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop 4453; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 4454; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1] 4455; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 4456; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7 4457; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7 4458; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7 4459; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] 4460; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) 4461; GFX1164_ITERATIVE-NEXT: s_add_i32 s6, s6, s8 4462; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 4463; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 4464; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 4465; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 4466; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 4467; 
GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4468; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 4469; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 4470; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1 4471; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc 4472; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 4473; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 4474; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4 4475; GFX1164_ITERATIVE-NEXT: ; %bb.3: 4476; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, s6 4477; GFX1164_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 4478; GFX1164_ITERATIVE-NEXT: s_mov_b32 s10, -1 4479; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 4480; GFX1164_ITERATIVE-NEXT: s_mov_b32 s8, s2 4481; GFX1164_ITERATIVE-NEXT: s_mov_b32 s9, s3 4482; GFX1164_ITERATIVE-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc 4483; GFX1164_ITERATIVE-NEXT: s_waitcnt vmcnt(0) 4484; GFX1164_ITERATIVE-NEXT: buffer_gl1_inv 4485; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv 4486; GFX1164_ITERATIVE-NEXT: .LBB8_4: 4487; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] 4488; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 4489; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 4490; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 4491; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) 4492; GFX1164_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v0 4493; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 4494; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 4495; GFX1164_ITERATIVE-NEXT: s_endpgm 4496; 4497; GFX1132_ITERATIVE-LABEL: sub_i32_varying: 4498; GFX1132_ITERATIVE: ; %bb.0: ; %entry 4499; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 4500; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo 4501; GFX1132_ITERATIVE-NEXT: s_mov_b32 s6, 0 4502; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 4503; GFX1132_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop 4504; GFX1132_ITERATIVE-NEXT: ; =>This 
Inner Loop Header: Depth=1 4505; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 4506; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 4507; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v1, s1 4508; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1 4509; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1 4510; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3 4511; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) 4512; GFX1132_ITERATIVE-NEXT: s_add_i32 s6, s6, s2 4513; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 4514; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 4515; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 4516; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 4517; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 4518; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 4519; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 4520; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 4521; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo 4522; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 4523; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4 4524; GFX1132_ITERATIVE-NEXT: ; %bb.3: 4525; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v1, s6 4526; GFX1132_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 4527; GFX1132_ITERATIVE-NEXT: s_mov_b32 s10, -1 4528; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 4529; GFX1132_ITERATIVE-NEXT: s_mov_b32 s8, s2 4530; GFX1132_ITERATIVE-NEXT: s_mov_b32 s9, s3 4531; GFX1132_ITERATIVE-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc 4532; GFX1132_ITERATIVE-NEXT: s_waitcnt vmcnt(0) 4533; GFX1132_ITERATIVE-NEXT: buffer_gl1_inv 4534; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv 4535; GFX1132_ITERATIVE-NEXT: .LBB8_4: 4536; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 4537; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 4538; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 4539; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 4540; 
GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) 4541; GFX1132_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v0 4542; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 4543; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 4544; GFX1132_ITERATIVE-NEXT: s_endpgm 4545; 4546; GFX1264_ITERATIVE-LABEL: sub_i32_varying: 4547; GFX1264_ITERATIVE: ; %bb.0: ; %entry 4548; GFX1264_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 4549; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 4550; GFX1264_ITERATIVE-NEXT: s_mov_b32 s6, 0 4551; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0 4552; GFX1264_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop 4553; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 4554; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1] 4555; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe 4556; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7 4557; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7 4558; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7 4559; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] 4560; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) 4561; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s8 4562; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 4563; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 4564; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 4565; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 4566; GFX1264_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 4567; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4568; GFX1264_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 4569; GFX1264_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 4570; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr1 4571; GFX1264_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc 4572; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 4573; GFX1264_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 4574; GFX1264_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4 4575; GFX1264_ITERATIVE-NEXT: ; %bb.3: 4576; 
GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe 4577; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v1, s6 4578; GFX1264_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 4579; GFX1264_ITERATIVE-NEXT: s_mov_b32 s10, -1 4580; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0 4581; GFX1264_ITERATIVE-NEXT: s_mov_b32 s8, s2 4582; GFX1264_ITERATIVE-NEXT: s_mov_b32 s9, s3 4583; GFX1264_ITERATIVE-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV 4584; GFX1264_ITERATIVE-NEXT: s_wait_loadcnt 0x0 4585; GFX1264_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV 4586; GFX1264_ITERATIVE-NEXT: .LBB8_4: 4587; GFX1264_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] 4588; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0 4589; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 4590; GFX1264_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 4591; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) 4592; GFX1264_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v0 4593; GFX1264_ITERATIVE-NEXT: s_mov_b32 s2, -1 4594; GFX1264_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], null 4595; GFX1264_ITERATIVE-NEXT: s_endpgm 4596; 4597; GFX1232_ITERATIVE-LABEL: sub_i32_varying: 4598; GFX1232_ITERATIVE: ; %bb.0: ; %entry 4599; GFX1232_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 4600; GFX1232_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo 4601; GFX1232_ITERATIVE-NEXT: s_mov_b32 s6, 0 4602; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0 4603; GFX1232_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop 4604; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 4605; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe 4606; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 4607; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe 4608; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v1, s1 4609; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1 4610; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1 4611; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3 4612; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) 4613; GFX1232_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s2 
4614; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe 4615; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 4616; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 4617; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 4618; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 4619; GFX1232_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 4620; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 4621; GFX1232_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 4622; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr1 4623; GFX1232_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo 4624; GFX1232_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 4625; GFX1232_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4 4626; GFX1232_ITERATIVE-NEXT: ; %bb.3: 4627; GFX1232_ITERATIVE-NEXT: v_mov_b32_e32 v1, s6 4628; GFX1232_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 4629; GFX1232_ITERATIVE-NEXT: s_mov_b32 s10, -1 4630; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0 4631; GFX1232_ITERATIVE-NEXT: s_mov_b32 s8, s2 4632; GFX1232_ITERATIVE-NEXT: s_mov_b32 s9, s3 4633; GFX1232_ITERATIVE-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV 4634; GFX1232_ITERATIVE-NEXT: s_wait_loadcnt 0x0 4635; GFX1232_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV 4636; GFX1232_ITERATIVE-NEXT: .LBB8_4: 4637; GFX1232_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 4638; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0 4639; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 4640; GFX1232_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 4641; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) 4642; GFX1232_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v0 4643; GFX1232_ITERATIVE-NEXT: s_mov_b32 s2, -1 4644; GFX1232_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], null 4645; GFX1232_ITERATIVE-NEXT: s_endpgm 4646; 4647; GFX7LESS_DPP-LABEL: sub_i32_varying: 4648; GFX7LESS_DPP: ; %bb.0: ; %entry 4649; GFX7LESS_DPP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 4650; GFX7LESS_DPP-NEXT: s_mov_b32 s7, 0xf000 
4651; GFX7LESS_DPP-NEXT: s_mov_b32 s6, -1 4652; GFX7LESS_DPP-NEXT: s_mov_b32 s10, s6 4653; GFX7LESS_DPP-NEXT: s_mov_b32 s11, s7 4654; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) 4655; GFX7LESS_DPP-NEXT: s_mov_b32 s8, s2 4656; GFX7LESS_DPP-NEXT: s_mov_b32 s9, s3 4657; GFX7LESS_DPP-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc 4658; GFX7LESS_DPP-NEXT: s_waitcnt vmcnt(0) 4659; GFX7LESS_DPP-NEXT: buffer_wbinvl1 4660; GFX7LESS_DPP-NEXT: s_mov_b32 s4, s0 4661; GFX7LESS_DPP-NEXT: s_mov_b32 s5, s1 4662; GFX7LESS_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 4663; GFX7LESS_DPP-NEXT: s_endpgm 4664; 4665; GFX8_DPP-LABEL: sub_i32_varying: 4666; GFX8_DPP: ; %bb.0: ; %entry 4667; GFX8_DPP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 4668; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 4669; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 4670; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] 4671; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 4672; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 4673; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 4674; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[4:5] 4675; GFX8_DPP-NEXT: s_nop 1 4676; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4677; GFX8_DPP-NEXT: s_nop 1 4678; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 4679; GFX8_DPP-NEXT: s_nop 1 4680; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4681; GFX8_DPP-NEXT: s_nop 1 4682; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4683; GFX8_DPP-NEXT: s_nop 1 4684; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 4685; GFX8_DPP-NEXT: s_nop 1 4686; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 4687; GFX8_DPP-NEXT: v_readlane_b32 s6, v2, 63 4688; GFX8_DPP-NEXT: s_nop 0 4689; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 
4690; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] 4691; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 4692; GFX8_DPP-NEXT: ; implicit-def: $vgpr0 4693; GFX8_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc 4694; GFX8_DPP-NEXT: s_cbranch_execz .LBB8_2 4695; GFX8_DPP-NEXT: ; %bb.1: 4696; GFX8_DPP-NEXT: s_mov_b32 s11, 0xf000 4697; GFX8_DPP-NEXT: s_mov_b32 s10, -1 4698; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) 4699; GFX8_DPP-NEXT: s_mov_b32 s8, s2 4700; GFX8_DPP-NEXT: s_mov_b32 s9, s3 4701; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s6 4702; GFX8_DPP-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc 4703; GFX8_DPP-NEXT: s_waitcnt vmcnt(0) 4704; GFX8_DPP-NEXT: buffer_wbinvl1_vol 4705; GFX8_DPP-NEXT: .LBB8_2: 4706; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[4:5] 4707; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 4708; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v1 4709; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) 4710; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 4711; GFX8_DPP-NEXT: s_mov_b32 s2, -1 4712; GFX8_DPP-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 4713; GFX8_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 4714; GFX8_DPP-NEXT: s_endpgm 4715; 4716; GFX9_DPP-LABEL: sub_i32_varying: 4717; GFX9_DPP: ; %bb.0: ; %entry 4718; GFX9_DPP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 4719; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 4720; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 4721; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] 4722; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 4723; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 4724; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 4725; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[4:5] 4726; GFX9_DPP-NEXT: s_nop 1 4727; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4728; GFX9_DPP-NEXT: s_nop 1 4729; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 4730; GFX9_DPP-NEXT: s_nop 1 4731; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4732; GFX9_DPP-NEXT: s_nop 1 4733; GFX9_DPP-NEXT: 
v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4734; GFX9_DPP-NEXT: s_nop 1 4735; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 4736; GFX9_DPP-NEXT: s_nop 1 4737; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 4738; GFX9_DPP-NEXT: v_readlane_b32 s6, v2, 63 4739; GFX9_DPP-NEXT: s_nop 0 4740; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 4741; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] 4742; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 4743; GFX9_DPP-NEXT: ; implicit-def: $vgpr0 4744; GFX9_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc 4745; GFX9_DPP-NEXT: s_cbranch_execz .LBB8_2 4746; GFX9_DPP-NEXT: ; %bb.1: 4747; GFX9_DPP-NEXT: s_mov_b32 s11, 0xf000 4748; GFX9_DPP-NEXT: s_mov_b32 s10, -1 4749; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) 4750; GFX9_DPP-NEXT: s_mov_b32 s8, s2 4751; GFX9_DPP-NEXT: s_mov_b32 s9, s3 4752; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s6 4753; GFX9_DPP-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc 4754; GFX9_DPP-NEXT: s_waitcnt vmcnt(0) 4755; GFX9_DPP-NEXT: buffer_wbinvl1_vol 4756; GFX9_DPP-NEXT: .LBB8_2: 4757; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[4:5] 4758; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 4759; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v1 4760; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) 4761; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 4762; GFX9_DPP-NEXT: s_mov_b32 s2, -1 4763; GFX9_DPP-NEXT: v_sub_u32_e32 v0, s4, v0 4764; GFX9_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 4765; GFX9_DPP-NEXT: s_endpgm 4766; 4767; GFX1064_DPP-LABEL: sub_i32_varying: 4768; GFX1064_DPP: ; %bb.0: ; %entry 4769; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 4770; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] 4771; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 4772; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4773; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 4774; 
GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4775; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4776; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 4777; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4778; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 31 4779; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s2 4780; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 4781; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 15 4782; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4783; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] 4784; GFX1064_DPP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 4785; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 4786; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 31 4787; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 16 4788; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] 4789; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4790; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 4791; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 47 4792; GFX1064_DPP-NEXT: v_readlane_b32 s9, v1, 63 4793; GFX1064_DPP-NEXT: v_writelane_b32 v3, s7, 32 4794; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] 4795; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4796; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 4797; GFX1064_DPP-NEXT: s_mov_b32 s4, s9 4798; GFX1064_DPP-NEXT: v_writelane_b32 v3, s8, 48 4799; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] 4800; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4801; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 4802; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 4803; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[8:9], vcc 4804; GFX1064_DPP-NEXT: s_cbranch_execz .LBB8_2 4805; GFX1064_DPP-NEXT: ; %bb.1: 4806; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s4 4807; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 4808; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) 4809; GFX1064_DPP-NEXT: s_mov_b32 
s4, s2 4810; GFX1064_DPP-NEXT: s_mov_b32 s5, s3 4811; GFX1064_DPP-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc 4812; GFX1064_DPP-NEXT: s_waitcnt vmcnt(0) 4813; GFX1064_DPP-NEXT: buffer_gl1_inv 4814; GFX1064_DPP-NEXT: buffer_gl0_inv 4815; GFX1064_DPP-NEXT: .LBB8_2: 4816; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 4817; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[8:9] 4818; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) 4819; GFX1064_DPP-NEXT: v_readfirstlane_b32 s2, v0 4820; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 4821; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 4822; GFX1064_DPP-NEXT: v_sub_nc_u32_e32 v0, s2, v0 4823; GFX1064_DPP-NEXT: s_mov_b32 s2, s6 4824; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 4825; GFX1064_DPP-NEXT: s_endpgm 4826; 4827; GFX1032_DPP-LABEL: sub_i32_varying: 4828; GFX1032_DPP: ; %bb.0: ; %entry 4829; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 4830; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 4831; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4832; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 4833; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4834; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4835; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 4836; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 4837; GFX1032_DPP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 4838; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 4839; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4840; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 4841; GFX1032_DPP-NEXT: v_readlane_b32 s6, v1, 31 4842; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4843; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15 4844; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 4845; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4846; 
GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 4847; GFX1032_DPP-NEXT: v_writelane_b32 v3, s5, 16 4848; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 4849; GFX1032_DPP-NEXT: s_mov_b32 s4, s6 4850; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 4851; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4852; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 4853; GFX1032_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo 4854; GFX1032_DPP-NEXT: s_cbranch_execz .LBB8_2 4855; GFX1032_DPP-NEXT: ; %bb.1: 4856; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s4 4857; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 4858; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) 4859; GFX1032_DPP-NEXT: s_mov_b32 s4, s2 4860; GFX1032_DPP-NEXT: s_mov_b32 s5, s3 4861; GFX1032_DPP-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc 4862; GFX1032_DPP-NEXT: s_waitcnt vmcnt(0) 4863; GFX1032_DPP-NEXT: buffer_gl1_inv 4864; GFX1032_DPP-NEXT: buffer_gl0_inv 4865; GFX1032_DPP-NEXT: .LBB8_2: 4866; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 4867; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 4868; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) 4869; GFX1032_DPP-NEXT: v_readfirstlane_b32 s2, v0 4870; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 4871; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 4872; GFX1032_DPP-NEXT: v_sub_nc_u32_e32 v0, s2, v0 4873; GFX1032_DPP-NEXT: s_mov_b32 s2, s6 4874; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 4875; GFX1032_DPP-NEXT: s_endpgm 4876; 4877; GFX1164_DPP-LABEL: sub_i32_varying: 4878; GFX1164_DPP: ; %bb.0: ; %entry 4879; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 4880; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 4881; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 4882; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] 4883; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 4884; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 4885; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4886; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, 
v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 4887; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4888; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4889; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4890; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4891; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 4892; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4893; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4894; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 31 4895; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s2 4896; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4897; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 4898; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 15 4899; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4900; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] 4901; GFX1164_DPP-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 4902; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 4903; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 31 4904; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 16 4905; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] 4906; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 4907; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4908; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 4909; GFX1164_DPP-NEXT: v_readlane_b32 s8, v1, 47 4910; GFX1164_DPP-NEXT: v_readlane_b32 s9, v1, 63 4911; GFX1164_DPP-NEXT: v_writelane_b32 v3, s7, 32 4912; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] 4913; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 4914; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4915; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 4916; GFX1164_DPP-NEXT: s_mov_b32 s4, 
s9 4917; GFX1164_DPP-NEXT: v_writelane_b32 v3, s8, 48 4918; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] 4919; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4920; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 4921; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0 4922; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[8:9], vcc 4923; GFX1164_DPP-NEXT: s_cbranch_execz .LBB8_2 4924; GFX1164_DPP-NEXT: ; %bb.1: 4925; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s4 4926; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 4927; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) 4928; GFX1164_DPP-NEXT: s_mov_b32 s4, s2 4929; GFX1164_DPP-NEXT: s_mov_b32 s5, s3 4930; GFX1164_DPP-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], 0 glc 4931; GFX1164_DPP-NEXT: s_waitcnt vmcnt(0) 4932; GFX1164_DPP-NEXT: buffer_gl1_inv 4933; GFX1164_DPP-NEXT: buffer_gl0_inv 4934; GFX1164_DPP-NEXT: .LBB8_2: 4935; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[8:9] 4936; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) 4937; GFX1164_DPP-NEXT: v_readfirstlane_b32 s2, v0 4938; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3 4939; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 4940; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) 4941; GFX1164_DPP-NEXT: v_sub_nc_u32_e32 v0, s2, v0 4942; GFX1164_DPP-NEXT: s_mov_b32 s2, s6 4943; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 4944; GFX1164_DPP-NEXT: s_endpgm 4945; 4946; GFX1132_DPP-LABEL: sub_i32_varying: 4947; GFX1132_DPP: ; %bb.0: ; %entry 4948; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 4949; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 4950; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 4951; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 4952; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4953; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4954; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 4955; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) 
| instid1(VALU_DEP_1) 4956; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4957; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4958; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) 4959; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 4960; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 4961; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 4962; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 4963; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4964; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 4965; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 4966; GFX1132_DPP-NEXT: v_readlane_b32 s6, v1, 31 4967; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4968; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15 4969; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 4970; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 4971; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4972; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 4973; GFX1132_DPP-NEXT: v_writelane_b32 v3, s5, 16 4974; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 4975; GFX1132_DPP-NEXT: s_mov_b32 s4, s6 4976; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 4977; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4978; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 4979; GFX1132_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo 4980; GFX1132_DPP-NEXT: s_cbranch_execz .LBB8_2 4981; GFX1132_DPP-NEXT: ; %bb.1: 4982; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, s4 4983; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 4984; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) 4985; GFX1132_DPP-NEXT: s_mov_b32 s4, s2 4986; GFX1132_DPP-NEXT: s_mov_b32 s5, s3 4987; GFX1132_DPP-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], 0 glc 4988; GFX1132_DPP-NEXT: s_waitcnt vmcnt(0) 4989; GFX1132_DPP-NEXT: buffer_gl1_inv 4990; 
GFX1132_DPP-NEXT: buffer_gl0_inv 4991; GFX1132_DPP-NEXT: .LBB8_2: 4992; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 4993; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) 4994; GFX1132_DPP-NEXT: v_readfirstlane_b32 s2, v0 4995; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3 4996; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 4997; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) 4998; GFX1132_DPP-NEXT: v_sub_nc_u32_e32 v0, s2, v0 4999; GFX1132_DPP-NEXT: s_mov_b32 s2, s6 5000; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 5001; GFX1132_DPP-NEXT: s_endpgm 5002; 5003; GFX1264_DPP-LABEL: sub_i32_varying: 5004; GFX1264_DPP: ; %bb.0: ; %entry 5005; GFX1264_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 5006; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 5007; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 5008; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] 5009; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, 0 5010; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 5011; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 5012; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 5013; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5014; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 5015; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 5016; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5017; GFX1264_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 5018; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 5019; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5020; GFX1264_DPP-NEXT: v_readlane_b32 s2, v1, 31 5021; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, s2 5022; GFX1264_DPP-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5023; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 5024; GFX1264_DPP-NEXT: v_readlane_b32 s6, v1, 15 5025; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5026; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] 5027; GFX1264_DPP-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 5028; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 5029; GFX1264_DPP-NEXT: v_readlane_b32 s7, v1, 31 5030; GFX1264_DPP-NEXT: v_writelane_b32 v3, s6, 16 5031; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5] 5032; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 5033; GFX1264_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5034; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 5035; GFX1264_DPP-NEXT: v_readlane_b32 s8, v1, 47 5036; GFX1264_DPP-NEXT: v_readlane_b32 s9, v1, 63 5037; GFX1264_DPP-NEXT: v_writelane_b32 v3, s7, 32 5038; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5] 5039; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 5040; GFX1264_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 5041; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 5042; GFX1264_DPP-NEXT: s_mov_b32 s4, s9 5043; GFX1264_DPP-NEXT: v_writelane_b32 v3, s8, 48 5044; GFX1264_DPP-NEXT: s_wait_alu 0xfffe 5045; GFX1264_DPP-NEXT: s_mov_b64 exec, s[6:7] 5046; GFX1264_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5047; GFX1264_DPP-NEXT: s_mov_b32 s6, -1 5048; GFX1264_DPP-NEXT: ; implicit-def: $vgpr0 5049; GFX1264_DPP-NEXT: s_and_saveexec_b64 s[8:9], vcc 5050; GFX1264_DPP-NEXT: s_cbranch_execz .LBB8_2 5051; GFX1264_DPP-NEXT: ; %bb.1: 5052; GFX1264_DPP-NEXT: v_mov_b32_e32 v0, s4 5053; GFX1264_DPP-NEXT: s_mov_b32 s7, 0x31016000 5054; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0 5055; GFX1264_DPP-NEXT: s_mov_b32 s4, s2 5056; GFX1264_DPP-NEXT: s_mov_b32 s5, s3 5057; GFX1264_DPP-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV 5058; GFX1264_DPP-NEXT: s_wait_loadcnt 0x0 5059; GFX1264_DPP-NEXT: global_inv 
scope:SCOPE_DEV 5060; GFX1264_DPP-NEXT: .LBB8_2: 5061; GFX1264_DPP-NEXT: s_wait_alu 0xfffe 5062; GFX1264_DPP-NEXT: s_or_b64 exec, exec, s[8:9] 5063; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0 5064; GFX1264_DPP-NEXT: v_readfirstlane_b32 s2, v0 5065; GFX1264_DPP-NEXT: v_mov_b32_e32 v0, v3 5066; GFX1264_DPP-NEXT: s_mov_b32 s3, 0x31016000 5067; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) 5068; GFX1264_DPP-NEXT: v_sub_nc_u32_e32 v0, s2, v0 5069; GFX1264_DPP-NEXT: s_mov_b32 s2, s6 5070; GFX1264_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], null 5071; GFX1264_DPP-NEXT: s_endpgm 5072; 5073; GFX1232_DPP-LABEL: sub_i32_varying: 5074; GFX1232_DPP: ; %bb.0: ; %entry 5075; GFX1232_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 5076; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1 5077; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 5078; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 5079; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5080; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 5081; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 5082; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5083; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 5084; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 5085; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) 5086; GFX1232_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 5087; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 5088; GFX1232_DPP-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 5089; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1 5090; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 5091; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, 0 5092; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) 
| instskip(NEXT) | instid1(VALU_DEP_2) 5093; GFX1232_DPP-NEXT: v_readlane_b32 s6, v1, 31 5094; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5095; GFX1232_DPP-NEXT: v_readlane_b32 s5, v1, 15 5096; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 5097; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 5098; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5099; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1 5100; GFX1232_DPP-NEXT: v_writelane_b32 v3, s5, 16 5101; GFX1232_DPP-NEXT: s_wait_alu 0xfffe 5102; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 5103; GFX1232_DPP-NEXT: s_mov_b32 s4, s6 5104; GFX1232_DPP-NEXT: s_mov_b32 s6, -1 5105; GFX1232_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 5106; GFX1232_DPP-NEXT: ; implicit-def: $vgpr0 5107; GFX1232_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo 5108; GFX1232_DPP-NEXT: s_cbranch_execz .LBB8_2 5109; GFX1232_DPP-NEXT: ; %bb.1: 5110; GFX1232_DPP-NEXT: s_wait_alu 0xfffe 5111; GFX1232_DPP-NEXT: v_mov_b32_e32 v0, s4 5112; GFX1232_DPP-NEXT: s_mov_b32 s7, 0x31016000 5113; GFX1232_DPP-NEXT: s_wait_kmcnt 0x0 5114; GFX1232_DPP-NEXT: s_mov_b32 s4, s2 5115; GFX1232_DPP-NEXT: s_mov_b32 s5, s3 5116; GFX1232_DPP-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV 5117; GFX1232_DPP-NEXT: s_wait_loadcnt 0x0 5118; GFX1232_DPP-NEXT: global_inv scope:SCOPE_DEV 5119; GFX1232_DPP-NEXT: .LBB8_2: 5120; GFX1232_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 5121; GFX1232_DPP-NEXT: s_wait_kmcnt 0x0 5122; GFX1232_DPP-NEXT: v_readfirstlane_b32 s2, v0 5123; GFX1232_DPP-NEXT: v_mov_b32_e32 v0, v3 5124; GFX1232_DPP-NEXT: s_mov_b32 s3, 0x31016000 5125; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) 5126; GFX1232_DPP-NEXT: v_sub_nc_u32_e32 v0, s2, v0 5127; GFX1232_DPP-NEXT: s_mov_b32 s2, s6 5128; GFX1232_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], null 5129; GFX1232_DPP-NEXT: s_endpgm 5130entry: 5131 %lane = call i32 @llvm.amdgcn.workitem.id.x() 5132 %old = atomicrmw sub ptr 
addrspace(1) %inout, i32 %lane syncscope("agent") acq_rel 5133 store i32 %old, ptr addrspace(1) %out 5134 ret void 5135} 5136
; Test for a 64-bit atomic subtract with a constant operand: the kernel below
; performs an atomicrmw sub of the i64 constant 5 on the %inout global pointer
; (syncscope "agent", acq_rel) and stores the returned old value to %out.
; In the expected code, every target counts the lanes of the saved exec mask
; with s_bcnt1, multiplies that count by 5, and branches so that only the lane
; whose mbcnt result is 0 issues a single 64-bit buffer atomic sub. After the
; branch rejoins, the atomic's result is read with v_readfirstlane and each
; lane subtracts 5 times its own mbcnt value from it before the 64-bit store.
; The assertions that follow are autogenerated (see the NOTE on the first line
; of this file); regenerate them with utils/update_llc_test_checks.py instead
; of editing them by hand.
5137define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace(1) %inout) { 5138; GFX7LESS-LABEL: sub_i64_constant: 5139; GFX7LESS: ; %bb.0: ; %entry 5140; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 5141; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 5142; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 5143; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 5144; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 5145; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 5146; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 5147; GFX7LESS-NEXT: s_cbranch_execz .LBB9_2 5148; GFX7LESS-NEXT: ; %bb.1: 5149; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 5150; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 5151; GFX7LESS-NEXT: s_mul_i32 s6, s6, 5 5152; GFX7LESS-NEXT: s_mov_b32 s10, -1 5153; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 5154; GFX7LESS-NEXT: s_mov_b32 s8, s2 5155; GFX7LESS-NEXT: s_mov_b32 s9, s3 5156; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 5157; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 5158; GFX7LESS-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc 5159; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 5160; GFX7LESS-NEXT: buffer_wbinvl1 5161; GFX7LESS-NEXT: .LBB9_2: 5162; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 5163; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 5164; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 5165; GFX7LESS-NEXT: s_mov_b32 s2, -1 5166; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 5167; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v0 5168; GFX7LESS-NEXT: s_waitcnt expcnt(0) 5169; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 5170; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2 5171; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 5172; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s5, v0 5173; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 5174; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5175; GFX7LESS-NEXT: s_endpgm 5176; 5177; GFX8-LABEL: sub_i64_constant: 5178; GFX8: ; %bb.0: ; 
%entry 5179; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 5180; GFX8-NEXT: s_mov_b64 s[6:7], exec 5181; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 5182; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 5183; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 5184; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 5185; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 5186; GFX8-NEXT: s_cbranch_execz .LBB9_2 5187; GFX8-NEXT: ; %bb.1: 5188; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5189; GFX8-NEXT: s_mov_b32 s8, s2 5190; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[6:7] 5191; GFX8-NEXT: s_mul_i32 s2, s2, 5 5192; GFX8-NEXT: s_mov_b32 s11, 0xf000 5193; GFX8-NEXT: s_mov_b32 s10, -1 5194; GFX8-NEXT: s_mov_b32 s9, s3 5195; GFX8-NEXT: v_mov_b32_e32 v0, s2 5196; GFX8-NEXT: v_mov_b32_e32 v1, 0 5197; GFX8-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc 5198; GFX8-NEXT: s_waitcnt vmcnt(0) 5199; GFX8-NEXT: buffer_wbinvl1_vol 5200; GFX8-NEXT: .LBB9_2: 5201; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 5202; GFX8-NEXT: v_readfirstlane_b32 s4, v1 5203; GFX8-NEXT: v_readfirstlane_b32 s5, v0 5204; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v2 5205; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 5206; GFX8-NEXT: v_mov_b32_e32 v2, s4 5207; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s5, v0 5208; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5209; GFX8-NEXT: s_mov_b32 s3, 0xf000 5210; GFX8-NEXT: s_mov_b32 s2, -1 5211; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 5212; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5213; GFX8-NEXT: s_endpgm 5214; 5215; GFX9-LABEL: sub_i64_constant: 5216; GFX9: ; %bb.0: ; %entry 5217; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 5218; GFX9-NEXT: s_mov_b64 s[6:7], exec 5219; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 5220; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 5221; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 5222; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 5223; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 5224; GFX9-NEXT: s_cbranch_execz .LBB9_2 5225; GFX9-NEXT: ; %bb.1: 5226; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5227; GFX9-NEXT: s_mov_b32 s8, s2 
5228; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[6:7] 5229; GFX9-NEXT: s_mul_i32 s2, s2, 5 5230; GFX9-NEXT: s_mov_b32 s11, 0xf000 5231; GFX9-NEXT: s_mov_b32 s10, -1 5232; GFX9-NEXT: s_mov_b32 s9, s3 5233; GFX9-NEXT: v_mov_b32_e32 v0, s2 5234; GFX9-NEXT: v_mov_b32_e32 v1, 0 5235; GFX9-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc 5236; GFX9-NEXT: s_waitcnt vmcnt(0) 5237; GFX9-NEXT: buffer_wbinvl1_vol 5238; GFX9-NEXT: .LBB9_2: 5239; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 5240; GFX9-NEXT: v_readfirstlane_b32 s4, v1 5241; GFX9-NEXT: v_readfirstlane_b32 s5, v0 5242; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v2 5243; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 5244; GFX9-NEXT: v_mov_b32_e32 v2, s4 5245; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s5, v0 5246; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5247; GFX9-NEXT: s_mov_b32 s3, 0xf000 5248; GFX9-NEXT: s_mov_b32 s2, -1 5249; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc 5250; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5251; GFX9-NEXT: s_endpgm 5252; 5253; GFX1064-LABEL: sub_i64_constant: 5254; GFX1064: ; %bb.0: ; %entry 5255; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 5256; GFX1064-NEXT: s_mov_b64 s[6:7], exec 5257; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 5258; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 5259; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 5260; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 5261; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 5262; GFX1064-NEXT: s_cbranch_execz .LBB9_2 5263; GFX1064-NEXT: ; %bb.1: 5264; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 5265; GFX1064-NEXT: v_mov_b32_e32 v1, 0 5266; GFX1064-NEXT: s_mul_i32 s6, s6, 5 5267; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 5268; GFX1064-NEXT: v_mov_b32_e32 v0, s6 5269; GFX1064-NEXT: s_mov_b32 s10, -1 5270; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 5271; GFX1064-NEXT: s_mov_b32 s8, s2 5272; GFX1064-NEXT: s_mov_b32 s9, s3 5273; GFX1064-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc 5274; GFX1064-NEXT: s_waitcnt vmcnt(0) 5275; GFX1064-NEXT: 
buffer_gl1_inv 5276; GFX1064-NEXT: buffer_gl0_inv 5277; GFX1064-NEXT: .LBB9_2: 5278; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 5279; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 5280; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 5281; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 5282; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v2 5283; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 5284; GFX1064-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 5285; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v0 5286; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc 5287; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 5288; GFX1064-NEXT: s_mov_b32 s2, -1 5289; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5290; GFX1064-NEXT: s_endpgm 5291; 5292; GFX1032-LABEL: sub_i64_constant: 5293; GFX1032: ; %bb.0: ; %entry 5294; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 5295; GFX1032-NEXT: s_mov_b32 s6, exec_lo 5296; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 5297; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s6, 0 5298; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 5299; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 5300; GFX1032-NEXT: s_cbranch_execz .LBB9_2 5301; GFX1032-NEXT: ; %bb.1: 5302; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s6 5303; GFX1032-NEXT: v_mov_b32_e32 v1, 0 5304; GFX1032-NEXT: s_mul_i32 s5, s5, 5 5305; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 5306; GFX1032-NEXT: v_mov_b32_e32 v0, s5 5307; GFX1032-NEXT: s_mov_b32 s10, -1 5308; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 5309; GFX1032-NEXT: s_mov_b32 s8, s2 5310; GFX1032-NEXT: s_mov_b32 s9, s3 5311; GFX1032-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc 5312; GFX1032-NEXT: s_waitcnt vmcnt(0) 5313; GFX1032-NEXT: buffer_gl1_inv 5314; GFX1032-NEXT: buffer_gl0_inv 5315; GFX1032-NEXT: .LBB9_2: 5316; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 5317; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 5318; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 5319; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 5320; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v2 5321; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 5322; 
GFX1032-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 5323; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0 5324; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo 5325; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 5326; GFX1032-NEXT: s_mov_b32 s2, -1 5327; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5328; GFX1032-NEXT: s_endpgm 5329; 5330; GFX1164-LABEL: sub_i64_constant: 5331; GFX1164: ; %bb.0: ; %entry 5332; GFX1164-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 5333; GFX1164-NEXT: s_mov_b64 s[6:7], exec 5334; GFX1164-NEXT: s_mov_b64 s[4:5], exec 5335; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 5336; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5337; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 5338; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 5339; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 5340; GFX1164-NEXT: s_cbranch_execz .LBB9_2 5341; GFX1164-NEXT: ; %bb.1: 5342; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 5343; GFX1164-NEXT: v_mov_b32_e32 v1, 0 5344; GFX1164-NEXT: s_mul_i32 s6, s6, 5 5345; GFX1164-NEXT: s_mov_b32 s11, 0x31016000 5346; GFX1164-NEXT: v_mov_b32_e32 v0, s6 5347; GFX1164-NEXT: s_mov_b32 s10, -1 5348; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 5349; GFX1164-NEXT: s_mov_b32 s8, s2 5350; GFX1164-NEXT: s_mov_b32 s9, s3 5351; GFX1164-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], 0 glc 5352; GFX1164-NEXT: s_waitcnt vmcnt(0) 5353; GFX1164-NEXT: buffer_gl1_inv 5354; GFX1164-NEXT: buffer_gl0_inv 5355; GFX1164-NEXT: .LBB9_2: 5356; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 5357; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 5358; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 5359; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v2 5360; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 5361; GFX1164-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 5362; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 5363; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v0 5364; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc 5365; GFX1164-NEXT: s_mov_b32 
s3, 0x31016000 5366; GFX1164-NEXT: s_mov_b32 s2, -1 5367; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 5368; GFX1164-NEXT: s_endpgm 5369; 5370; GFX1132-LABEL: sub_i64_constant: 5371; GFX1132: ; %bb.0: ; %entry 5372; GFX1132-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 5373; GFX1132-NEXT: s_mov_b32 s6, exec_lo 5374; GFX1132-NEXT: s_mov_b32 s4, exec_lo 5375; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s6, 0 5376; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 5377; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 5378; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 5379; GFX1132-NEXT: s_cbranch_execz .LBB9_2 5380; GFX1132-NEXT: ; %bb.1: 5381; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s6 5382; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 5383; GFX1132-NEXT: s_mul_i32 s5, s5, 5 5384; GFX1132-NEXT: s_mov_b32 s10, -1 5385; GFX1132-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, 0 5386; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 5387; GFX1132-NEXT: s_mov_b32 s8, s2 5388; GFX1132-NEXT: s_mov_b32 s9, s3 5389; GFX1132-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], 0 glc 5390; GFX1132-NEXT: s_waitcnt vmcnt(0) 5391; GFX1132-NEXT: buffer_gl1_inv 5392; GFX1132-NEXT: buffer_gl0_inv 5393; GFX1132-NEXT: .LBB9_2: 5394; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4 5395; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 5396; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 5397; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v2 5398; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 5399; GFX1132-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 5400; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 5401; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0 5402; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo 5403; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 5404; GFX1132-NEXT: s_mov_b32 s2, -1 5405; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 5406; GFX1132-NEXT: s_endpgm 5407; 5408; GFX1264-LABEL: sub_i64_constant: 5409; GFX1264: ; %bb.0: ; %entry 5410; GFX1264-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 5411; GFX1264-NEXT: 
s_mov_b64 s[6:7], exec 5412; GFX1264-NEXT: s_mov_b32 s9, 0 5413; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 5414; GFX1264-NEXT: s_mov_b64 s[4:5], exec 5415; GFX1264-NEXT: s_wait_alu 0xfffe 5416; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5417; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 5418; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1 5419; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v2 5420; GFX1264-NEXT: s_cbranch_execz .LBB9_2 5421; GFX1264-NEXT: ; %bb.1: 5422; GFX1264-NEXT: s_bcnt1_i32_b64 s8, s[6:7] 5423; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 5424; GFX1264-NEXT: s_mul_u64 s[6:7], s[8:9], 5 5425; GFX1264-NEXT: s_mov_b32 s10, -1 5426; GFX1264-NEXT: s_wait_alu 0xfffe 5427; GFX1264-NEXT: v_mov_b32_e32 v0, s6 5428; GFX1264-NEXT: v_mov_b32_e32 v1, s7 5429; GFX1264-NEXT: s_wait_kmcnt 0x0 5430; GFX1264-NEXT: s_mov_b32 s8, s2 5431; GFX1264-NEXT: s_mov_b32 s9, s3 5432; GFX1264-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV 5433; GFX1264-NEXT: s_wait_loadcnt 0x0 5434; GFX1264-NEXT: global_inv scope:SCOPE_DEV 5435; GFX1264-NEXT: .LBB9_2: 5436; GFX1264-NEXT: s_or_b64 exec, exec, s[4:5] 5437; GFX1264-NEXT: s_wait_kmcnt 0x0 5438; GFX1264-NEXT: v_readfirstlane_b32 s2, v0 5439; GFX1264-NEXT: v_mul_u32_u24_e32 v0, 5, v2 5440; GFX1264-NEXT: v_readfirstlane_b32 s3, v1 5441; GFX1264-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 5442; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 5443; GFX1264-NEXT: v_sub_co_u32 v0, vcc, s2, v0 5444; GFX1264-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc 5445; GFX1264-NEXT: s_mov_b32 s3, 0x31016000 5446; GFX1264-NEXT: s_mov_b32 s2, -1 5447; GFX1264-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null 5448; GFX1264-NEXT: s_endpgm 5449; 5450; GFX1232-LABEL: sub_i64_constant: 5451; GFX1232: ; %bb.0: ; %entry 5452; GFX1232-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 5453; GFX1232-NEXT: s_mov_b32 s7, exec_lo 5454; GFX1232-NEXT: s_mov_b32 s5, 0 5455; 
GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s7, 0 5456; GFX1232-NEXT: s_mov_b32 s6, exec_lo 5457; GFX1232-NEXT: ; implicit-def: $vgpr0_vgpr1 5458; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) 5459; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2 5460; GFX1232-NEXT: s_cbranch_execz .LBB9_2 5461; GFX1232-NEXT: ; %bb.1: 5462; GFX1232-NEXT: s_wait_alu 0xfffe 5463; GFX1232-NEXT: s_bcnt1_i32_b32 s4, s7 5464; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 5465; GFX1232-NEXT: s_mul_u64 s[4:5], s[4:5], 5 5466; GFX1232-NEXT: s_mov_b32 s10, -1 5467; GFX1232-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 5468; GFX1232-NEXT: s_wait_kmcnt 0x0 5469; GFX1232-NEXT: s_mov_b32 s8, s2 5470; GFX1232-NEXT: s_mov_b32 s9, s3 5471; GFX1232-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV 5472; GFX1232-NEXT: s_wait_loadcnt 0x0 5473; GFX1232-NEXT: global_inv scope:SCOPE_DEV 5474; GFX1232-NEXT: .LBB9_2: 5475; GFX1232-NEXT: s_wait_alu 0xfffe 5476; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s6 5477; GFX1232-NEXT: s_wait_kmcnt 0x0 5478; GFX1232-NEXT: v_readfirstlane_b32 s2, v0 5479; GFX1232-NEXT: v_mul_u32_u24_e32 v0, 5, v2 5480; GFX1232-NEXT: v_readfirstlane_b32 s3, v1 5481; GFX1232-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 5482; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 5483; GFX1232-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0 5484; GFX1232-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo 5485; GFX1232-NEXT: s_mov_b32 s3, 0x31016000 5486; GFX1232-NEXT: s_mov_b32 s2, -1 5487; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null 5488; GFX1232-NEXT: s_endpgm 5489entry: 5490 %old = atomicrmw sub ptr addrspace(1) %inout, i64 5 syncscope("agent") acq_rel 5491 store i64 %old, ptr addrspace(1) %out 5492 ret void 5493} 5494 5495define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(1) %inout, i64 %subitive) { 5496; GFX7LESS-LABEL: sub_i64_uniform: 5497; GFX7LESS: ; %bb.0: ; %entry 5498; GFX7LESS-NEXT: s_mov_b64 
s[8:9], exec 5499; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 5500; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 5501; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 5502; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s9, v0 5503; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 5504; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 5505; GFX7LESS-NEXT: s_and_saveexec_b64 s[6:7], vcc 5506; GFX7LESS-NEXT: s_cbranch_execz .LBB10_2 5507; GFX7LESS-NEXT: ; %bb.1: 5508; GFX7LESS-NEXT: s_mov_b32 s15, 0xf000 5509; GFX7LESS-NEXT: s_mov_b32 s14, -1 5510; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 5511; GFX7LESS-NEXT: s_mov_b32 s12, s2 5512; GFX7LESS-NEXT: s_mov_b32 s13, s3 5513; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[8:9] 5514; GFX7LESS-NEXT: s_mul_i32 s3, s5, s2 5515; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2 5516; GFX7LESS-NEXT: v_mul_hi_u32 v0, s4, v0 5517; GFX7LESS-NEXT: s_mul_i32 s2, s4, s2 5518; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, s3, v0 5519; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2 5520; GFX7LESS-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[12:15], 0 glc 5521; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 5522; GFX7LESS-NEXT: buffer_wbinvl1 5523; GFX7LESS-NEXT: .LBB10_2: 5524; GFX7LESS-NEXT: s_or_b64 exec, exec, s[6:7] 5525; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 5526; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 5527; GFX7LESS-NEXT: s_mov_b32 s2, -1 5528; GFX7LESS-NEXT: v_readfirstlane_b32 s6, v1 5529; GFX7LESS-NEXT: v_readfirstlane_b32 s7, v0 5530; GFX7LESS-NEXT: s_waitcnt expcnt(0) 5531; GFX7LESS-NEXT: v_mul_lo_u32 v0, s5, v2 5532; GFX7LESS-NEXT: v_mul_hi_u32 v1, s4, v2 5533; GFX7LESS-NEXT: v_mul_lo_u32 v2, s4, v2 5534; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v1, v0 5535; GFX7LESS-NEXT: v_mov_b32_e32 v3, s6 5536; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s7, v2 5537; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc 5538; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5539; GFX7LESS-NEXT: s_endpgm 5540; 5541; GFX8-LABEL: sub_i64_uniform: 5542; GFX8: ; %bb.0: ; %entry 5543; GFX8-NEXT: s_load_dwordx4 
s[0:3], s[4:5], 0x24 5544; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 5545; GFX8-NEXT: s_mov_b64 s[8:9], exec 5546; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 5547; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 5548; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 5549; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 5550; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc 5551; GFX8-NEXT: s_cbranch_execz .LBB10_2 5552; GFX8-NEXT: ; %bb.1: 5553; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5554; GFX8-NEXT: s_mov_b32 s12, s2 5555; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[8:9] 5556; GFX8-NEXT: v_mov_b32_e32 v0, s2 5557; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[8:9], s4, v0, 0 5558; GFX8-NEXT: s_mul_i32 s2, s5, s2 5559; GFX8-NEXT: s_mov_b32 s15, 0xf000 5560; GFX8-NEXT: s_mov_b32 s14, -1 5561; GFX8-NEXT: s_mov_b32 s13, s3 5562; GFX8-NEXT: v_add_u32_e32 v1, vcc, s2, v1 5563; GFX8-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[12:15], 0 glc 5564; GFX8-NEXT: s_waitcnt vmcnt(0) 5565; GFX8-NEXT: buffer_wbinvl1_vol 5566; GFX8-NEXT: .LBB10_2: 5567; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 5568; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5569; GFX8-NEXT: v_mul_lo_u32 v4, s5, v2 5570; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s4, v2, 0 5571; GFX8-NEXT: v_readfirstlane_b32 s4, v1 5572; GFX8-NEXT: v_readfirstlane_b32 s5, v0 5573; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v4 5574; GFX8-NEXT: v_mov_b32_e32 v3, s4 5575; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s5, v2 5576; GFX8-NEXT: s_mov_b32 s3, 0xf000 5577; GFX8-NEXT: s_mov_b32 s2, -1 5578; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc 5579; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5580; GFX8-NEXT: s_endpgm 5581; 5582; GFX9-LABEL: sub_i64_uniform: 5583; GFX9: ; %bb.0: ; %entry 5584; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 5585; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 5586; GFX9-NEXT: s_mov_b64 s[8:9], exec 5587; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 5588; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 5589; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 5590; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 
5591; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 5592; GFX9-NEXT: s_cbranch_execz .LBB10_2 5593; GFX9-NEXT: ; %bb.1: 5594; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5595; GFX9-NEXT: s_mov_b32 s12, s2 5596; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[8:9] 5597; GFX9-NEXT: s_mov_b32 s13, s3 5598; GFX9-NEXT: s_mul_i32 s3, s7, s2 5599; GFX9-NEXT: s_mul_hi_u32 s8, s6, s2 5600; GFX9-NEXT: s_add_i32 s8, s8, s3 5601; GFX9-NEXT: s_mul_i32 s2, s6, s2 5602; GFX9-NEXT: s_mov_b32 s15, 0xf000 5603; GFX9-NEXT: s_mov_b32 s14, -1 5604; GFX9-NEXT: v_mov_b32_e32 v0, s2 5605; GFX9-NEXT: v_mov_b32_e32 v1, s8 5606; GFX9-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[12:15], 0 glc 5607; GFX9-NEXT: s_waitcnt vmcnt(0) 5608; GFX9-NEXT: buffer_wbinvl1_vol 5609; GFX9-NEXT: .LBB10_2: 5610; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 5611; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5612; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[2:3], s6, v2, 0 5613; GFX9-NEXT: s_mov_b32 s3, 0xf000 5614; GFX9-NEXT: s_mov_b32 s2, -1 5615; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], s7, v2, v[4:5] 5616; GFX9-NEXT: v_readfirstlane_b32 s4, v1 5617; GFX9-NEXT: v_readfirstlane_b32 s5, v0 5618; GFX9-NEXT: v_mov_b32_e32 v1, v4 5619; GFX9-NEXT: v_mov_b32_e32 v2, s4 5620; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s5, v3 5621; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc 5622; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5623; GFX9-NEXT: s_endpgm 5624; 5625; GFX1064-LABEL: sub_i64_uniform: 5626; GFX1064: ; %bb.0: ; %entry 5627; GFX1064-NEXT: s_clause 0x1 5628; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 5629; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 5630; GFX1064-NEXT: s_mov_b64 s[8:9], exec 5631; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 5632; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 5633; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 5634; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 5635; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 5636; GFX1064-NEXT: s_cbranch_execz .LBB10_2 5637; GFX1064-NEXT: ; %bb.1: 5638; GFX1064-NEXT: s_bcnt1_i32_b64 s8, 
s[8:9] 5639; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 5640; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 5641; GFX1064-NEXT: s_mul_i32 s9, s7, s8 5642; GFX1064-NEXT: s_mul_hi_u32 s10, s6, s8 5643; GFX1064-NEXT: s_mul_i32 s8, s6, s8 5644; GFX1064-NEXT: s_add_i32 s10, s10, s9 5645; GFX1064-NEXT: v_mov_b32_e32 v0, s8 5646; GFX1064-NEXT: v_mov_b32_e32 v1, s10 5647; GFX1064-NEXT: s_mov_b32 s10, -1 5648; GFX1064-NEXT: s_mov_b32 s8, s2 5649; GFX1064-NEXT: s_mov_b32 s9, s3 5650; GFX1064-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc 5651; GFX1064-NEXT: s_waitcnt vmcnt(0) 5652; GFX1064-NEXT: buffer_gl1_inv 5653; GFX1064-NEXT: buffer_gl0_inv 5654; GFX1064-NEXT: .LBB10_2: 5655; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 5656; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 5657; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 5658; GFX1064-NEXT: v_mad_u64_u32 v[3:4], s[2:3], s6, v2, 0 5659; GFX1064-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s7, v2, v[4:5] 5660; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 5661; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 5662; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v3 5663; GFX1064-NEXT: v_mov_b32_e32 v1, v4 5664; GFX1064-NEXT: s_mov_b32 s2, -1 5665; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc 5666; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 5667; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5668; GFX1064-NEXT: s_endpgm 5669; 5670; GFX1032-LABEL: sub_i64_uniform: 5671; GFX1032: ; %bb.0: ; %entry 5672; GFX1032-NEXT: s_clause 0x1 5673; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 5674; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 5675; GFX1032-NEXT: s_mov_b32 s8, exec_lo 5676; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 5677; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s8, 0 5678; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 5679; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 5680; GFX1032-NEXT: s_cbranch_execz .LBB10_2 5681; GFX1032-NEXT: ; %bb.1: 5682; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s8 5683; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 5684; GFX1032-NEXT: 
s_waitcnt lgkmcnt(0) 5685; GFX1032-NEXT: s_mul_i32 s8, s7, s5 5686; GFX1032-NEXT: s_mul_hi_u32 s9, s6, s5 5687; GFX1032-NEXT: s_mul_i32 s5, s6, s5 5688; GFX1032-NEXT: s_add_i32 s9, s9, s8 5689; GFX1032-NEXT: v_mov_b32_e32 v0, s5 5690; GFX1032-NEXT: v_mov_b32_e32 v1, s9 5691; GFX1032-NEXT: s_mov_b32 s10, -1 5692; GFX1032-NEXT: s_mov_b32 s8, s2 5693; GFX1032-NEXT: s_mov_b32 s9, s3 5694; GFX1032-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc 5695; GFX1032-NEXT: s_waitcnt vmcnt(0) 5696; GFX1032-NEXT: buffer_gl1_inv 5697; GFX1032-NEXT: buffer_gl0_inv 5698; GFX1032-NEXT: .LBB10_2: 5699; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 5700; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 5701; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 5702; GFX1032-NEXT: v_mad_u64_u32 v[3:4], s2, s6, v2, 0 5703; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 5704; GFX1032-NEXT: v_mad_u64_u32 v[4:5], s2, s7, v2, v[4:5] 5705; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 5706; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3 5707; GFX1032-NEXT: v_mov_b32_e32 v1, v4 5708; GFX1032-NEXT: s_mov_b32 s2, -1 5709; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo 5710; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 5711; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5712; GFX1032-NEXT: s_endpgm 5713; 5714; GFX1164-LABEL: sub_i64_uniform: 5715; GFX1164: ; %bb.0: ; %entry 5716; GFX1164-NEXT: s_clause 0x1 5717; GFX1164-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 5718; GFX1164-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 5719; GFX1164-NEXT: s_mov_b64 s[8:9], exec 5720; GFX1164-NEXT: s_mov_b64 s[6:7], exec 5721; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 5722; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5723; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 5724; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 5725; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 5726; GFX1164-NEXT: s_cbranch_execz .LBB10_2 5727; GFX1164-NEXT: ; %bb.1: 5728; GFX1164-NEXT: s_bcnt1_i32_b64 s8, s[8:9] 5729; GFX1164-NEXT: 
s_mov_b32 s11, 0x31016000 5730; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 5731; GFX1164-NEXT: s_mul_i32 s9, s5, s8 5732; GFX1164-NEXT: s_mul_hi_u32 s10, s4, s8 5733; GFX1164-NEXT: s_mul_i32 s8, s4, s8 5734; GFX1164-NEXT: s_add_i32 s10, s10, s9 5735; GFX1164-NEXT: v_mov_b32_e32 v0, s8 5736; GFX1164-NEXT: v_mov_b32_e32 v1, s10 5737; GFX1164-NEXT: s_mov_b32 s10, -1 5738; GFX1164-NEXT: s_mov_b32 s8, s2 5739; GFX1164-NEXT: s_mov_b32 s9, s3 5740; GFX1164-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], 0 glc 5741; GFX1164-NEXT: s_waitcnt vmcnt(0) 5742; GFX1164-NEXT: buffer_gl1_inv 5743; GFX1164-NEXT: buffer_gl0_inv 5744; GFX1164-NEXT: .LBB10_2: 5745; GFX1164-NEXT: s_or_b64 exec, exec, s[6:7] 5746; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 5747; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s4, v2, 0 5748; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 5749; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 5750; GFX1164-NEXT: s_waitcnt_depctr 0xfff 5751; GFX1164-NEXT: v_mad_u64_u32 v[5:6], null, s5, v2, v[4:5] 5752; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v3 5753; GFX1164-NEXT: s_mov_b32 s2, -1 5754; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 5755; GFX1164-NEXT: v_mov_b32_e32 v1, v5 5756; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc 5757; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 5758; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 5759; GFX1164-NEXT: s_endpgm 5760; 5761; GFX1132-LABEL: sub_i64_uniform: 5762; GFX1132: ; %bb.0: ; %entry 5763; GFX1132-NEXT: s_clause 0x1 5764; GFX1132-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 5765; GFX1132-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 5766; GFX1132-NEXT: s_mov_b32 s7, exec_lo 5767; GFX1132-NEXT: s_mov_b32 s6, exec_lo 5768; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s7, 0 5769; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 5770; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 5771; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 5772; GFX1132-NEXT: s_cbranch_execz .LBB10_2 5773; GFX1132-NEXT: ; %bb.1: 5774; GFX1132-NEXT: 
s_bcnt1_i32_b32 s7, s7 5775; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 5776; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 5777; GFX1132-NEXT: s_mul_i32 s8, s5, s7 5778; GFX1132-NEXT: s_mul_hi_u32 s9, s4, s7 5779; GFX1132-NEXT: s_mul_i32 s7, s4, s7 5780; GFX1132-NEXT: s_add_i32 s9, s9, s8 5781; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 5782; GFX1132-NEXT: v_dual_mov_b32 v0, s7 :: v_dual_mov_b32 v1, s9 5783; GFX1132-NEXT: s_mov_b32 s10, -1 5784; GFX1132-NEXT: s_mov_b32 s8, s2 5785; GFX1132-NEXT: s_mov_b32 s9, s3 5786; GFX1132-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], 0 glc 5787; GFX1132-NEXT: s_waitcnt vmcnt(0) 5788; GFX1132-NEXT: buffer_gl1_inv 5789; GFX1132-NEXT: buffer_gl0_inv 5790; GFX1132-NEXT: .LBB10_2: 5791; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s6 5792; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 5793; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s4, v2, 0 5794; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 5795; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 5796; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 5797; GFX1132-NEXT: v_mad_u64_u32 v[5:6], null, s5, v2, v[4:5] 5798; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3 5799; GFX1132-NEXT: s_mov_b32 s2, -1 5800; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 5801; GFX1132-NEXT: v_mov_b32_e32 v1, v5 5802; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo 5803; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 5804; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 5805; GFX1132-NEXT: s_endpgm 5806; 5807; GFX1264-LABEL: sub_i64_uniform: 5808; GFX1264: ; %bb.0: ; %entry 5809; GFX1264-NEXT: s_clause 0x1 5810; GFX1264-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 5811; GFX1264-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 5812; GFX1264-NEXT: s_mov_b64 s[8:9], exec 5813; GFX1264-NEXT: s_mov_b32 s11, 0 5814; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 5815; GFX1264-NEXT: s_mov_b64 s[6:7], exec 5816; GFX1264-NEXT: s_wait_alu 0xfffe 5817; GFX1264-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5818; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 5819; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1 5820; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v2 5821; GFX1264-NEXT: s_cbranch_execz .LBB10_2 5822; GFX1264-NEXT: ; %bb.1: 5823; GFX1264-NEXT: s_bcnt1_i32_b64 s10, s[8:9] 5824; GFX1264-NEXT: s_wait_kmcnt 0x0 5825; GFX1264-NEXT: s_mul_u64 s[8:9], s[4:5], s[10:11] 5826; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 5827; GFX1264-NEXT: s_wait_alu 0xfffe 5828; GFX1264-NEXT: v_mov_b32_e32 v0, s8 5829; GFX1264-NEXT: v_mov_b32_e32 v1, s9 5830; GFX1264-NEXT: s_mov_b32 s10, -1 5831; GFX1264-NEXT: s_mov_b32 s8, s2 5832; GFX1264-NEXT: s_mov_b32 s9, s3 5833; GFX1264-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV 5834; GFX1264-NEXT: s_wait_loadcnt 0x0 5835; GFX1264-NEXT: global_inv scope:SCOPE_DEV 5836; GFX1264-NEXT: .LBB10_2: 5837; GFX1264-NEXT: s_or_b64 exec, exec, s[6:7] 5838; GFX1264-NEXT: s_wait_kmcnt 0x0 5839; GFX1264-NEXT: v_mad_co_u64_u32 v[3:4], null, s4, v2, 0 5840; GFX1264-NEXT: v_readfirstlane_b32 s2, v0 5841; GFX1264-NEXT: v_readfirstlane_b32 s3, v1 5842; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 5843; GFX1264-NEXT: v_mad_co_u64_u32 v[4:5], null, s5, v2, v[4:5] 5844; GFX1264-NEXT: v_sub_co_u32 v0, vcc, s2, v3 5845; GFX1264-NEXT: s_mov_b32 s2, -1 5846; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 5847; GFX1264-NEXT: v_mov_b32_e32 v1, v4 5848; GFX1264-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc 5849; GFX1264-NEXT: s_mov_b32 s3, 0x31016000 5850; GFX1264-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null 5851; GFX1264-NEXT: s_endpgm 5852; 5853; GFX1232-LABEL: sub_i64_uniform: 5854; GFX1232: ; %bb.0: ; %entry 5855; GFX1232-NEXT: s_clause 0x1 5856; GFX1232-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 5857; GFX1232-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 5858; GFX1232-NEXT: s_mov_b32 s6, exec_lo 5859; 
GFX1232-NEXT: s_mov_b32 s7, 0 5860; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s6, 0 5861; GFX1232-NEXT: s_mov_b32 s8, exec_lo 5862; GFX1232-NEXT: ; implicit-def: $vgpr0_vgpr1 5863; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) 5864; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2 5865; GFX1232-NEXT: s_cbranch_execz .LBB10_2 5866; GFX1232-NEXT: ; %bb.1: 5867; GFX1232-NEXT: s_wait_alu 0xfffe 5868; GFX1232-NEXT: s_bcnt1_i32_b32 s6, s6 5869; GFX1232-NEXT: s_mov_b32 s15, 0x31016000 5870; GFX1232-NEXT: s_wait_kmcnt 0x0 5871; GFX1232-NEXT: s_wait_alu 0xfffe 5872; GFX1232-NEXT: s_mul_u64 s[6:7], s[4:5], s[6:7] 5873; GFX1232-NEXT: s_mov_b32 s14, -1 5874; GFX1232-NEXT: s_wait_alu 0xfffe 5875; GFX1232-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 5876; GFX1232-NEXT: s_mov_b32 s12, s2 5877; GFX1232-NEXT: s_mov_b32 s13, s3 5878; GFX1232-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[12:15], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV 5879; GFX1232-NEXT: s_wait_loadcnt 0x0 5880; GFX1232-NEXT: global_inv scope:SCOPE_DEV 5881; GFX1232-NEXT: .LBB10_2: 5882; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s8 5883; GFX1232-NEXT: s_wait_kmcnt 0x0 5884; GFX1232-NEXT: v_mad_co_u64_u32 v[3:4], null, s4, v2, 0 5885; GFX1232-NEXT: v_readfirstlane_b32 s2, v0 5886; GFX1232-NEXT: v_readfirstlane_b32 s3, v1 5887; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 5888; GFX1232-NEXT: v_mad_co_u64_u32 v[4:5], null, s5, v2, v[4:5] 5889; GFX1232-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3 5890; GFX1232-NEXT: s_mov_b32 s2, -1 5891; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 5892; GFX1232-NEXT: v_mov_b32_e32 v1, v4 5893; GFX1232-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo 5894; GFX1232-NEXT: s_mov_b32 s3, 0x31016000 5895; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null 5896; GFX1232-NEXT: s_endpgm 5897entry: 5898 %old = atomicrmw sub ptr addrspace(1) %inout, i64 %subitive syncscope("agent") acq_rel 5899 store i64 %old, ptr 
addrspace(1) %out 5900 ret void 5901} 5902 5903define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(1) %inout) { 5904; GFX7LESS_ITERATIVE-LABEL: sub_i64_varying: 5905; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry 5906; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 5907; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 5908; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 5909; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 5910; GFX7LESS_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop 5911; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 5912; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] 5913; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s2 5914; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s2 5915; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s2 5916; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s7, m0 5917; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0 5918; GFX7LESS_ITERATIVE-NEXT: s_add_u32 s6, s6, s8 5919; GFX7LESS_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3 5920; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 5921; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] 5922; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 5923; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[2:3] 5924; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB11_1 5925; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 5926; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 5927; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 5928; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 5929; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5930; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 5931; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc 5932; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 5933; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 5934; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: 5935; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000 5936; GFX7LESS_ITERATIVE-NEXT: 
s_mov_b32 s10, -1 5937; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 5938; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s8, s2 5939; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s9, s3 5940; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s6 5941; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s7 5942; GFX7LESS_ITERATIVE-NEXT: buffer_atomic_sub_x2 v[3:4], off, s[8:11], 0 glc 5943; GFX7LESS_ITERATIVE-NEXT: s_waitcnt vmcnt(0) 5944; GFX7LESS_ITERATIVE-NEXT: buffer_wbinvl1 5945; GFX7LESS_ITERATIVE-NEXT: .LBB11_4: 5946; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] 5947; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 5948; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 5949; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 5950; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 5951; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 5952; GFX7LESS_ITERATIVE-NEXT: s_waitcnt expcnt(0) 5953; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 5954; GFX7LESS_ITERATIVE-NEXT: v_sub_i32_e32 v0, vcc, s5, v1 5955; GFX7LESS_ITERATIVE-NEXT: v_subb_u32_e32 v1, vcc, v3, v2, vcc 5956; GFX7LESS_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5957; GFX7LESS_ITERATIVE-NEXT: s_endpgm 5958; 5959; GFX8_ITERATIVE-LABEL: sub_i64_varying: 5960; GFX8_ITERATIVE: ; %bb.0: ; %entry 5961; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 5962; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 5963; GFX8_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 5964; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 5965; GFX8_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop 5966; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 5967; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] 5968; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s2 5969; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s2 5970; GFX8_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s2 5971; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0 5972; GFX8_ITERATIVE-NEXT: s_add_u32 s6, s6, s8 5973; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s7, m0 5974; GFX8_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3 5975; GFX8_ITERATIVE-NEXT: 
s_lshl_b64 s[2:3], 1, s2 5976; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] 5977; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 5978; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 5979; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 5980; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 5981; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5982; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 5983; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5984; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 5985; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc 5986; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 5987; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 5988; GFX8_ITERATIVE-NEXT: ; %bb.3: 5989; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s6 5990; GFX8_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000 5991; GFX8_ITERATIVE-NEXT: s_mov_b32 s10, -1 5992; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 5993; GFX8_ITERATIVE-NEXT: s_mov_b32 s8, s2 5994; GFX8_ITERATIVE-NEXT: s_mov_b32 s9, s3 5995; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s7 5996; GFX8_ITERATIVE-NEXT: buffer_atomic_sub_x2 v[3:4], off, s[8:11], 0 glc 5997; GFX8_ITERATIVE-NEXT: s_waitcnt vmcnt(0) 5998; GFX8_ITERATIVE-NEXT: buffer_wbinvl1_vol 5999; GFX8_ITERATIVE-NEXT: .LBB11_4: 6000; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] 6001; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 6002; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 6003; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 6004; GFX8_ITERATIVE-NEXT: v_sub_u32_e32 v0, vcc, s5, v1 6005; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 6006; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 6007; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 6008; GFX8_ITERATIVE-NEXT: v_subb_u32_e32 v1, vcc, v3, v2, vcc 6009; GFX8_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 6010; GFX8_ITERATIVE-NEXT: s_endpgm 6011; 6012; GFX9_ITERATIVE-LABEL: sub_i64_varying: 6013; GFX9_ITERATIVE: ; %bb.0: ; %entry 6014; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 6015; 
GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 6016; GFX9_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 6017; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 6018; GFX9_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop 6019; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 6020; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] 6021; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s2 6022; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s2 6023; GFX9_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s2 6024; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0 6025; GFX9_ITERATIVE-NEXT: s_add_u32 s6, s6, s8 6026; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s7, m0 6027; GFX9_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3 6028; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 6029; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] 6030; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 6031; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 6032; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 6033; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 6034; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6035; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 6036; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 6037; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 6038; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc 6039; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 6040; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 6041; GFX9_ITERATIVE-NEXT: ; %bb.3: 6042; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s6 6043; GFX9_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000 6044; GFX9_ITERATIVE-NEXT: s_mov_b32 s10, -1 6045; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 6046; GFX9_ITERATIVE-NEXT: s_mov_b32 s8, s2 6047; GFX9_ITERATIVE-NEXT: s_mov_b32 s9, s3 6048; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s7 6049; GFX9_ITERATIVE-NEXT: buffer_atomic_sub_x2 v[3:4], off, s[8:11], 0 glc 6050; GFX9_ITERATIVE-NEXT: s_waitcnt vmcnt(0) 6051; GFX9_ITERATIVE-NEXT: buffer_wbinvl1_vol 6052; GFX9_ITERATIVE-NEXT: .LBB11_4: 6053; GFX9_ITERATIVE-NEXT: s_or_b64 
exec, exec, s[4:5] 6054; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 6055; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 6056; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 6057; GFX9_ITERATIVE-NEXT: v_sub_co_u32_e32 v0, vcc, s5, v1 6058; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 6059; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 6060; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 6061; GFX9_ITERATIVE-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v2, vcc 6062; GFX9_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 6063; GFX9_ITERATIVE-NEXT: s_endpgm 6064; 6065; GFX1064_ITERATIVE-LABEL: sub_i64_varying: 6066; GFX1064_ITERATIVE: ; %bb.0: ; %entry 6067; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 6068; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 6069; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 6070; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 6071; GFX1064_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop 6072; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 6073; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] 6074; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 6075; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s2 6076; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s2 6077; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s7, s2 6078; GFX1064_ITERATIVE-NEXT: s_add_u32 s6, s6, s3 6079; GFX1064_ITERATIVE-NEXT: s_addc_u32 s7, s7, s8 6080; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 6081; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] 6082; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 6083; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 6084; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 6085; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 6086; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6087; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 6088; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 6089; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 6090; GFX1064_ITERATIVE-NEXT: 
s_and_saveexec_b64 s[4:5], vcc 6091; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 6092; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 6093; GFX1064_ITERATIVE-NEXT: ; %bb.3: 6094; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s6 6095; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s7 6096; GFX1064_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 6097; GFX1064_ITERATIVE-NEXT: s_mov_b32 s10, -1 6098; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 6099; GFX1064_ITERATIVE-NEXT: s_mov_b32 s8, s2 6100; GFX1064_ITERATIVE-NEXT: s_mov_b32 s9, s3 6101; GFX1064_ITERATIVE-NEXT: buffer_atomic_sub_x2 v[3:4], off, s[8:11], 0 glc 6102; GFX1064_ITERATIVE-NEXT: s_waitcnt vmcnt(0) 6103; GFX1064_ITERATIVE-NEXT: buffer_gl1_inv 6104; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv 6105; GFX1064_ITERATIVE-NEXT: .LBB11_4: 6106; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 6107; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] 6108; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 6109; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 6110; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 6111; GFX1064_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc, s2, v1 6112; GFX1064_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc 6113; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 6114; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 6115; GFX1064_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 6116; GFX1064_ITERATIVE-NEXT: s_endpgm 6117; 6118; GFX1032_ITERATIVE-LABEL: sub_i64_varying: 6119; GFX1032_ITERATIVE: ; %bb.0: ; %entry 6120; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 6121; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo 6122; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 6123; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 6124; GFX1032_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop 6125; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 6126; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0 6127; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1 6128; GFX1032_ITERATIVE-NEXT: v_readlane_b32 
s3, v3, s1 6129; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s1 6130; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s7, s1 6131; GFX1032_ITERATIVE-NEXT: s_add_u32 s6, s6, s2 6132; GFX1032_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3 6133; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 6134; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s1 6135; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 6136; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 6137; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 6138; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 6139; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6140; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 6141; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 6142; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo 6143; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 6144; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 6145; GFX1032_ITERATIVE-NEXT: ; %bb.3: 6146; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s6 6147; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s7 6148; GFX1032_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 6149; GFX1032_ITERATIVE-NEXT: s_mov_b32 s10, -1 6150; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 6151; GFX1032_ITERATIVE-NEXT: s_mov_b32 s8, s2 6152; GFX1032_ITERATIVE-NEXT: s_mov_b32 s9, s3 6153; GFX1032_ITERATIVE-NEXT: buffer_atomic_sub_x2 v[3:4], off, s[8:11], 0 glc 6154; GFX1032_ITERATIVE-NEXT: s_waitcnt vmcnt(0) 6155; GFX1032_ITERATIVE-NEXT: buffer_gl1_inv 6156; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv 6157; GFX1032_ITERATIVE-NEXT: .LBB11_4: 6158; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 6159; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 6160; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 6161; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 6162; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 6163; GFX1032_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v1 6164; GFX1032_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo 6165; 
GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 6166; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 6167; GFX1032_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 6168; GFX1032_ITERATIVE-NEXT: s_endpgm 6169; 6170; GFX1164_ITERATIVE-LABEL: sub_i64_varying: 6171; GFX1164_ITERATIVE: ; %bb.0: ; %entry 6172; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 6173; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 6174; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 6175; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 6176; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 6177; GFX1164_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop 6178; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 6179; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1] 6180; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) 6181; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s3, v2, s2 6182; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s2 6183; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2 6184; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s2 6185; GFX1164_ITERATIVE-NEXT: s_add_u32 s6, s6, s3 6186; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 6187; GFX1164_ITERATIVE-NEXT: s_addc_u32 s7, s7, s8 6188; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 6189; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] 6190; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 6191; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 6192; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 6193; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 6194; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 6195; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 6196; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6197; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 6198; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 6199; 
GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 6200; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc 6201; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 6202; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 6203; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 6204; GFX1164_ITERATIVE-NEXT: ; %bb.3: 6205; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s6 6206; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s7 6207; GFX1164_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 6208; GFX1164_ITERATIVE-NEXT: s_mov_b32 s10, -1 6209; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 6210; GFX1164_ITERATIVE-NEXT: s_mov_b32 s8, s2 6211; GFX1164_ITERATIVE-NEXT: s_mov_b32 s9, s3 6212; GFX1164_ITERATIVE-NEXT: buffer_atomic_sub_u64 v[2:3], off, s[8:11], 0 glc 6213; GFX1164_ITERATIVE-NEXT: s_waitcnt vmcnt(0) 6214; GFX1164_ITERATIVE-NEXT: buffer_gl1_inv 6215; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv 6216; GFX1164_ITERATIVE-NEXT: .LBB11_4: 6217; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] 6218; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 6219; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 6220; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 6221; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 6222; GFX1164_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc, s2, v0 6223; GFX1164_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc 6224; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 6225; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 6226; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 6227; GFX1164_ITERATIVE-NEXT: s_endpgm 6228; 6229; GFX1132_ITERATIVE-LABEL: sub_i64_varying: 6230; GFX1132_ITERATIVE: ; %bb.0: ; %entry 6231; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 6232; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo 6233; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 6234; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 6235; GFX1132_ITERATIVE-NEXT: .LBB11_1: ; 
%ComputeLoop 6236; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 6237; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 6238; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 6239; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s1 6240; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1 6241; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1 6242; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1 6243; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) 6244; GFX1132_ITERATIVE-NEXT: s_add_u32 s6, s6, s2 6245; GFX1132_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3 6246; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 6247; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 6248; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1 6249; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 6250; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 6251; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 6252; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 6253; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 6254; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 6255; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 6256; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 6257; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo 6258; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 6259; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 6260; GFX1132_ITERATIVE-NEXT: ; %bb.3: 6261; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 6262; GFX1132_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 6263; GFX1132_ITERATIVE-NEXT: s_mov_b32 s10, -1 6264; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 6265; GFX1132_ITERATIVE-NEXT: s_mov_b32 s8, s2 6266; GFX1132_ITERATIVE-NEXT: s_mov_b32 s9, s3 6267; GFX1132_ITERATIVE-NEXT: buffer_atomic_sub_u64 v[2:3], off, s[8:11], 0 glc 6268; 
GFX1132_ITERATIVE-NEXT: s_waitcnt vmcnt(0) 6269; GFX1132_ITERATIVE-NEXT: buffer_gl1_inv 6270; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv 6271; GFX1132_ITERATIVE-NEXT: .LBB11_4: 6272; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 6273; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 6274; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 6275; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 6276; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 6277; GFX1132_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0 6278; GFX1132_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo 6279; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 6280; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 6281; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 6282; GFX1132_ITERATIVE-NEXT: s_endpgm 6283; 6284; GFX1264_ITERATIVE-LABEL: sub_i64_varying: 6285; GFX1264_ITERATIVE: ; %bb.0: ; %entry 6286; GFX1264_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 6287; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 6288; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 6289; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 6290; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 6291; GFX1264_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop 6292; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 6293; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[0:1] 6294; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe 6295; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s10 6296; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s10 6297; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 6298; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s10 6299; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s10 6300; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[8:9] 6301; GFX1264_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3] 6302; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 6303; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 6304; GFX1264_ITERATIVE-NEXT: ; 
%bb.2: ; %ComputeEnd 6305; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 6306; GFX1264_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 6307; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6308; GFX1264_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 6309; GFX1264_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 6310; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 6311; GFX1264_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc 6312; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 6313; GFX1264_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 6314; GFX1264_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 6315; GFX1264_ITERATIVE-NEXT: ; %bb.3: 6316; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe 6317; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v2, s6 6318; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v3, s7 6319; GFX1264_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 6320; GFX1264_ITERATIVE-NEXT: s_mov_b32 s10, -1 6321; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0 6322; GFX1264_ITERATIVE-NEXT: s_mov_b32 s8, s2 6323; GFX1264_ITERATIVE-NEXT: s_mov_b32 s9, s3 6324; GFX1264_ITERATIVE-NEXT: buffer_atomic_sub_u64 v[2:3], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV 6325; GFX1264_ITERATIVE-NEXT: s_wait_loadcnt 0x0 6326; GFX1264_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV 6327; GFX1264_ITERATIVE-NEXT: .LBB11_4: 6328; GFX1264_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] 6329; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0 6330; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 6331; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 6332; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 6333; GFX1264_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc, s2, v0 6334; GFX1264_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc 6335; GFX1264_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 6336; GFX1264_ITERATIVE-NEXT: s_mov_b32 s2, -1 6337; GFX1264_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null 6338; 
GFX1264_ITERATIVE-NEXT: s_endpgm 6339; 6340; GFX1232_ITERATIVE-LABEL: sub_i64_varying: 6341; GFX1232_ITERATIVE: ; %bb.0: ; %entry 6342; GFX1232_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 6343; GFX1232_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo 6344; GFX1232_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 6345; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 6346; GFX1232_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop 6347; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 6348; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe 6349; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 6350; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe 6351; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1 6352; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s1 6353; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s1 6354; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1 6355; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1 6356; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s8 6357; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3] 6358; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe 6359; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 6360; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 6361; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 6362; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 6363; GFX1232_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 6364; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 6365; GFX1232_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 6366; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 6367; GFX1232_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo 6368; GFX1232_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 6369; GFX1232_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 6370; GFX1232_ITERATIVE-NEXT: ; %bb.3: 6371; GFX1232_ITERATIVE-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 6372; GFX1232_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 6373; GFX1232_ITERATIVE-NEXT: s_mov_b32 s10, -1 6374; 
GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0 6375; GFX1232_ITERATIVE-NEXT: s_mov_b32 s8, s2 6376; GFX1232_ITERATIVE-NEXT: s_mov_b32 s9, s3 6377; GFX1232_ITERATIVE-NEXT: buffer_atomic_sub_u64 v[2:3], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV 6378; GFX1232_ITERATIVE-NEXT: s_wait_loadcnt 0x0 6379; GFX1232_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV 6380; GFX1232_ITERATIVE-NEXT: .LBB11_4: 6381; GFX1232_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 6382; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0 6383; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 6384; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 6385; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 6386; GFX1232_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0 6387; GFX1232_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo 6388; GFX1232_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 6389; GFX1232_ITERATIVE-NEXT: s_mov_b32 s2, -1 6390; GFX1232_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null 6391; GFX1232_ITERATIVE-NEXT: s_endpgm 6392; 6393; GFX7LESS_DPP-LABEL: sub_i64_varying: 6394; GFX7LESS_DPP: ; %bb.0: ; %entry 6395; GFX7LESS_DPP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 6396; GFX7LESS_DPP-NEXT: s_mov_b32 s7, 0xf000 6397; GFX7LESS_DPP-NEXT: s_mov_b32 s6, -1 6398; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 6399; GFX7LESS_DPP-NEXT: s_mov_b32 s10, s6 6400; GFX7LESS_DPP-NEXT: s_mov_b32 s11, s7 6401; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) 6402; GFX7LESS_DPP-NEXT: s_mov_b32 s8, s2 6403; GFX7LESS_DPP-NEXT: s_mov_b32 s9, s3 6404; GFX7LESS_DPP-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc 6405; GFX7LESS_DPP-NEXT: s_waitcnt vmcnt(0) 6406; GFX7LESS_DPP-NEXT: buffer_wbinvl1 6407; GFX7LESS_DPP-NEXT: s_mov_b32 s4, s0 6408; GFX7LESS_DPP-NEXT: s_mov_b32 s5, s1 6409; GFX7LESS_DPP-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 6410; GFX7LESS_DPP-NEXT: s_endpgm 6411; 6412; GFX8_DPP-LABEL: sub_i64_varying: 6413; GFX8_DPP: ; %bb.0: ; %entry 6414; 
GFX8_DPP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 6415; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 6416; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 6417; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] 6418; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v6, exec_lo, 0 6419; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v6, exec_hi, v6 6420; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 6421; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[4:5] 6422; GFX8_DPP-NEXT: v_mov_b32_e32 v5, 0 6423; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, 0, 0, s[4:5] 6424; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 6425; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf 6426; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 6427; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf 6428; GFX8_DPP-NEXT: v_mov_b32_e32 v5, 0 6429; GFX8_DPP-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc 6430; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 6431; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf 6432; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 6433; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:2 row_mask:0xf bank_mask:0xf 6434; GFX8_DPP-NEXT: v_mov_b32_e32 v5, 0 6435; GFX8_DPP-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc 6436; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 6437; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf 6438; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 6439; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf 6440; GFX8_DPP-NEXT: v_mov_b32_e32 v5, 0 6441; GFX8_DPP-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc 6442; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 6443; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf 6444; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 6445; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:8 row_mask:0xf bank_mask:0xf 6446; GFX8_DPP-NEXT: v_mov_b32_e32 v5, 0 6447; GFX8_DPP-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc 6448; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 6449; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 
row_mask:0xa bank_mask:0xf 6450; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 6451; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 6452; GFX8_DPP-NEXT: v_mov_b32_e32 v5, 0 6453; GFX8_DPP-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc 6454; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 6455; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf 6456; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 6457; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 6458; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v2, v4, vcc 6459; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 6460; GFX8_DPP-NEXT: v_readlane_b32 s7, v4, 63 6461; GFX8_DPP-NEXT: v_readlane_b32 s6, v3, 63 6462; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf 6463; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf 6464; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] 6465; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 6466; GFX8_DPP-NEXT: ; implicit-def: $vgpr6_vgpr7 6467; GFX8_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc 6468; GFX8_DPP-NEXT: s_cbranch_execz .LBB11_2 6469; GFX8_DPP-NEXT: ; %bb.1: 6470; GFX8_DPP-NEXT: v_mov_b32_e32 v6, s6 6471; GFX8_DPP-NEXT: s_mov_b32 s11, 0xf000 6472; GFX8_DPP-NEXT: s_mov_b32 s10, -1 6473; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) 6474; GFX8_DPP-NEXT: s_mov_b32 s8, s2 6475; GFX8_DPP-NEXT: s_mov_b32 s9, s3 6476; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s7 6477; GFX8_DPP-NEXT: buffer_atomic_sub_x2 v[6:7], off, s[8:11], 0 glc 6478; GFX8_DPP-NEXT: s_waitcnt vmcnt(0) 6479; GFX8_DPP-NEXT: buffer_wbinvl1_vol 6480; GFX8_DPP-NEXT: .LBB11_2: 6481; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[4:5] 6482; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v7 6483; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v6 6484; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v1 6485; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v2 6486; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4 6487; GFX8_DPP-NEXT: v_sub_u32_e32 v6, vcc, s5, v6 6488; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) 6489; GFX8_DPP-NEXT: s_mov_b32 
s3, 0xf000 6490; GFX8_DPP-NEXT: s_mov_b32 s2, -1 6491; GFX8_DPP-NEXT: v_subb_u32_e32 v7, vcc, v0, v7, vcc 6492; GFX8_DPP-NEXT: buffer_store_dwordx2 v[6:7], off, s[0:3], 0 6493; GFX8_DPP-NEXT: s_endpgm 6494; 6495; GFX9_DPP-LABEL: sub_i64_varying: 6496; GFX9_DPP: ; %bb.0: ; %entry 6497; GFX9_DPP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 6498; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 6499; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 6500; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] 6501; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v6, exec_lo, 0 6502; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v6, exec_hi, v6 6503; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 6504; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[4:5] 6505; GFX9_DPP-NEXT: v_mov_b32_e32 v5, 0 6506; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, 0, 0, s[4:5] 6507; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 6508; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf 6509; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 6510; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf 6511; GFX9_DPP-NEXT: v_mov_b32_e32 v5, 0 6512; GFX9_DPP-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v4, vcc 6513; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 6514; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf 6515; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 6516; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:2 row_mask:0xf bank_mask:0xf 6517; GFX9_DPP-NEXT: v_mov_b32_e32 v5, 0 6518; GFX9_DPP-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v4, vcc 6519; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 6520; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf 6521; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 6522; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf 6523; GFX9_DPP-NEXT: v_mov_b32_e32 v5, 0 6524; GFX9_DPP-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v4, vcc 6525; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 6526; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf 6527; GFX9_DPP-NEXT: 
v_add_co_u32_e32 v3, vcc, v3, v5 6528; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:8 row_mask:0xf bank_mask:0xf 6529; GFX9_DPP-NEXT: v_mov_b32_e32 v5, 0 6530; GFX9_DPP-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v4, vcc 6531; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 6532; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf 6533; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 6534; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 6535; GFX9_DPP-NEXT: v_mov_b32_e32 v5, 0 6536; GFX9_DPP-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v4, vcc 6537; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 6538; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf 6539; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 6540; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 6541; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v2, v4, vcc 6542; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 6543; GFX9_DPP-NEXT: v_readlane_b32 s7, v4, 63 6544; GFX9_DPP-NEXT: v_readlane_b32 s6, v3, 63 6545; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf 6546; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf 6547; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] 6548; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 6549; GFX9_DPP-NEXT: ; implicit-def: $vgpr6_vgpr7 6550; GFX9_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc 6551; GFX9_DPP-NEXT: s_cbranch_execz .LBB11_2 6552; GFX9_DPP-NEXT: ; %bb.1: 6553; GFX9_DPP-NEXT: v_mov_b32_e32 v6, s6 6554; GFX9_DPP-NEXT: s_mov_b32 s11, 0xf000 6555; GFX9_DPP-NEXT: s_mov_b32 s10, -1 6556; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) 6557; GFX9_DPP-NEXT: s_mov_b32 s8, s2 6558; GFX9_DPP-NEXT: s_mov_b32 s9, s3 6559; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s7 6560; GFX9_DPP-NEXT: buffer_atomic_sub_x2 v[6:7], off, s[8:11], 0 glc 6561; GFX9_DPP-NEXT: s_waitcnt vmcnt(0) 6562; GFX9_DPP-NEXT: buffer_wbinvl1_vol 6563; GFX9_DPP-NEXT: .LBB11_2: 6564; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[4:5] 6565; GFX9_DPP-NEXT: 
v_readfirstlane_b32 s4, v7 6566; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v6 6567; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v1 6568; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v2 6569; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4 6570; GFX9_DPP-NEXT: v_sub_co_u32_e32 v6, vcc, s5, v6 6571; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) 6572; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 6573; GFX9_DPP-NEXT: s_mov_b32 s2, -1 6574; GFX9_DPP-NEXT: v_subb_co_u32_e32 v7, vcc, v0, v7, vcc 6575; GFX9_DPP-NEXT: buffer_store_dwordx2 v[6:7], off, s[0:3], 0 6576; GFX9_DPP-NEXT: s_endpgm 6577; 6578; GFX1064_DPP-LABEL: sub_i64_varying: 6579; GFX1064_DPP: ; %bb.0: ; %entry 6580; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 6581; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 6582; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] 6583; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s[0:1] 6584; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 6585; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, 0 6586; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 6587; GFX1064_DPP-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 6588; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, 0 6589; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf 6590; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v2, v1 6591; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v3, v4, vcc 6592; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 6593; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf 6594; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 6595; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:2 row_mask:0xf bank_mask:0xf 6596; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v6 6597; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v2, v5, vcc 6598; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, 0 6599; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:4 row_mask:0xf bank_mask:0xf 6600; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 6601; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:4 row_mask:0xf bank_mask:0xf 6602; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v4 6603; GFX1064_DPP-NEXT: 
v_add_co_ci_u32_e32 v2, vcc, v2, v3, vcc 6604; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 6605; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf 6606; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:8 row_mask:0xf bank_mask:0xf 6607; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v6 6608; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v2, v5, vcc 6609; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 6610; GFX1064_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1 6611; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v2, -1, -1 6612; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 6613; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 6614; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v3 6615; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v2, v5, vcc 6616; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 6617; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 31 6618; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 6619; GFX1064_DPP-NEXT: v_readlane_b32 s3, v2, 31 6620; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s2 6621; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, s3 6622; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 6623; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 6624; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, 0 6625; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v3 6626; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v2, v5, vcc 6627; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] 6628; GFX1064_DPP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 6629; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6630; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 6631; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6632; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v2 row_shr:1 row_mask:0xf bank_mask:0xf 6633; GFX1064_DPP-NEXT: v_readlane_b32 s6, v2, 15 6634; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 15 6635; GFX1064_DPP-NEXT: v_readlane_b32 s8, v2, 31 6636; 
GFX1064_DPP-NEXT: v_readlane_b32 s9, v1, 31 6637; GFX1064_DPP-NEXT: v_readlane_b32 s10, v1, 47 6638; GFX1064_DPP-NEXT: v_writelane_b32 v7, s6, 16 6639; GFX1064_DPP-NEXT: v_writelane_b32 v6, s7, 16 6640; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 63 6641; GFX1064_DPP-NEXT: v_readlane_b32 s11, v2, 47 6642; GFX1064_DPP-NEXT: v_readlane_b32 s7, v2, 63 6643; GFX1064_DPP-NEXT: v_writelane_b32 v7, s8, 32 6644; GFX1064_DPP-NEXT: v_writelane_b32 v6, s9, 32 6645; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] 6646; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 6647; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[8:9], -1 6648; GFX1064_DPP-NEXT: s_mov_b64 s[4:5], s[6:7] 6649; GFX1064_DPP-NEXT: v_writelane_b32 v7, s11, 48 6650; GFX1064_DPP-NEXT: v_writelane_b32 v6, s10, 48 6651; GFX1064_DPP-NEXT: s_mov_b64 exec, s[8:9] 6652; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 6653; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 6654; GFX1064_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9 6655; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[8:9], vcc 6656; GFX1064_DPP-NEXT: s_cbranch_execz .LBB11_2 6657; GFX1064_DPP-NEXT: ; %bb.1: 6658; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s5 6659; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s4 6660; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 6661; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) 6662; GFX1064_DPP-NEXT: s_mov_b32 s4, s2 6663; GFX1064_DPP-NEXT: s_mov_b32 s5, s3 6664; GFX1064_DPP-NEXT: buffer_atomic_sub_x2 v[8:9], off, s[4:7], 0 glc 6665; GFX1064_DPP-NEXT: s_waitcnt vmcnt(0) 6666; GFX1064_DPP-NEXT: buffer_gl1_inv 6667; GFX1064_DPP-NEXT: buffer_gl0_inv 6668; GFX1064_DPP-NEXT: .LBB11_2: 6669; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 6670; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[8:9] 6671; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) 6672; GFX1064_DPP-NEXT: v_readfirstlane_b32 s2, v8 6673; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, v6 6674; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, v7 6675; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v9 6676; GFX1064_DPP-NEXT: v_sub_co_u32 v8, vcc, s2, v10 6677; 
GFX1064_DPP-NEXT: s_mov_b32 s2, s6 6678; GFX1064_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc, s3, v11, vcc 6679; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 6680; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[8:9], off, s[0:3], 0 6681; GFX1064_DPP-NEXT: s_endpgm 6682; 6683; GFX1032_DPP-LABEL: sub_i64_varying: 6684; GFX1032_DPP: ; %bb.0: ; %entry 6685; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 6686; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 6687; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s0 6688; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s0 6689; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 6690; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, 0 6691; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0 6692; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 6693; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, 0 6694; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf 6695; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, 0 6696; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v2, v1 6697; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v3, v4, vcc_lo 6698; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 6699; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf 6700; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 6701; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:2 row_mask:0xf bank_mask:0xf 6702; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v6 6703; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo 6704; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, 0 6705; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:4 row_mask:0xf bank_mask:0xf 6706; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0 6707; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:4 row_mask:0xf bank_mask:0xf 6708; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v4 6709; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v3, vcc_lo 6710; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 6711; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf 6712; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:8 
row_mask:0xf bank_mask:0xf 6713; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v6 6714; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo 6715; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0 6716; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1 6717; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v2, -1, -1 6718; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 6719; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 6720; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 6721; GFX1032_DPP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 6722; GFX1032_DPP-NEXT: s_or_saveexec_b32 s6, -1 6723; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3 6724; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo 6725; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 6726; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6727; GFX1032_DPP-NEXT: v_readlane_b32 s8, v2, 15 6728; GFX1032_DPP-NEXT: v_readlane_b32 s5, v2, 31 6729; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v2 row_shr:1 row_mask:0xf bank_mask:0xf 6730; GFX1032_DPP-NEXT: v_readlane_b32 s7, v1, 15 6731; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s6 6732; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6733; GFX1032_DPP-NEXT: s_or_saveexec_b32 s6, -1 6734; GFX1032_DPP-NEXT: v_writelane_b32 v8, s8, 16 6735; GFX1032_DPP-NEXT: v_writelane_b32 v7, s7, 16 6736; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s6 6737; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 6738; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 6739; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 6740; GFX1032_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo 6741; GFX1032_DPP-NEXT: s_cbranch_execz .LBB11_2 6742; GFX1032_DPP-NEXT: ; %bb.1: 6743; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s5 6744; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s4 6745; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 6746; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) 6747; GFX1032_DPP-NEXT: s_mov_b32 s4, s2 6748; GFX1032_DPP-NEXT: s_mov_b32 s5, s3 
6749; GFX1032_DPP-NEXT: buffer_atomic_sub_x2 v[9:10], off, s[4:7], 0 glc 6750; GFX1032_DPP-NEXT: s_waitcnt vmcnt(0) 6751; GFX1032_DPP-NEXT: buffer_gl1_inv 6752; GFX1032_DPP-NEXT: buffer_gl0_inv 6753; GFX1032_DPP-NEXT: .LBB11_2: 6754; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 6755; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 6756; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) 6757; GFX1032_DPP-NEXT: v_readfirstlane_b32 s2, v9 6758; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, v7 6759; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, v8 6760; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v10 6761; GFX1032_DPP-NEXT: v_sub_co_u32 v9, vcc_lo, s2, v11 6762; GFX1032_DPP-NEXT: s_mov_b32 s2, s6 6763; GFX1032_DPP-NEXT: v_sub_co_ci_u32_e32 v10, vcc_lo, s3, v12, vcc_lo 6764; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 6765; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0 6766; GFX1032_DPP-NEXT: s_endpgm 6767; 6768; GFX1164_DPP-LABEL: sub_i64_varying: 6769; GFX1164_DPP: ; %bb.0: ; %entry 6770; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 6771; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 6772; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) 6773; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] 6774; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 6775; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] 6776; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 6777; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 6778; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6779; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 6780; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 6781; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc 6782; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 6783; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 6784; 
GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 6785; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf 6786; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc 6787; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) 6788; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 6789; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 6790; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf 6791; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) 6792; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc 6793; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 6794; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 6795; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 6796; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf 6797; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 6798; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 6799; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc 6800; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 6801; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 6802; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 6803; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1 6804; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 6805; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 6806; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc 6807; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 6808; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 31 6809; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_1) | instid1(VALU_DEP_2) 6810; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s2 6811; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 31 6812; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 6813; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) 6814; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v2, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 6815; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 6816; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc 6817; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] 6818; GFX1164_DPP-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 6819; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6820; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 6821; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf 6822; GFX1164_DPP-NEXT: v_readlane_b32 s6, v2, 15 6823; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6824; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 15 6825; GFX1164_DPP-NEXT: v_readlane_b32 s8, v2, 31 6826; GFX1164_DPP-NEXT: v_readlane_b32 s9, v1, 31 6827; GFX1164_DPP-NEXT: v_writelane_b32 v4, s6, 16 6828; GFX1164_DPP-NEXT: v_readlane_b32 s6, v2, 63 6829; GFX1164_DPP-NEXT: v_writelane_b32 v5, s7, 16 6830; GFX1164_DPP-NEXT: v_readlane_b32 s10, v2, 47 6831; GFX1164_DPP-NEXT: v_readlane_b32 s11, v1, 47 6832; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 63 6833; GFX1164_DPP-NEXT: v_writelane_b32 v4, s8, 32 6834; GFX1164_DPP-NEXT: v_writelane_b32 v5, s9, 32 6835; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] 6836; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 6837; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 6838; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[8:9], -1 6839; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], s[6:7] 6840; GFX1164_DPP-NEXT: v_writelane_b32 v4, s10, 48 6841; GFX1164_DPP-NEXT: v_writelane_b32 v5, s11, 48 6842; GFX1164_DPP-NEXT: s_mov_b64 exec, s[8:9] 6843; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 6844; 
GFX1164_DPP-NEXT: s_mov_b64 s[8:9], exec 6845; GFX1164_DPP-NEXT: ; implicit-def: $vgpr6_vgpr7 6846; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 6847; GFX1164_DPP-NEXT: s_cbranch_execz .LBB11_2 6848; GFX1164_DPP-NEXT: ; %bb.1: 6849; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s5 6850; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s4 6851; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 6852; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) 6853; GFX1164_DPP-NEXT: s_mov_b32 s4, s2 6854; GFX1164_DPP-NEXT: s_mov_b32 s5, s3 6855; GFX1164_DPP-NEXT: buffer_atomic_sub_u64 v[6:7], off, s[4:7], 0 glc 6856; GFX1164_DPP-NEXT: s_waitcnt vmcnt(0) 6857; GFX1164_DPP-NEXT: buffer_gl1_inv 6858; GFX1164_DPP-NEXT: buffer_gl0_inv 6859; GFX1164_DPP-NEXT: .LBB11_2: 6860; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[8:9] 6861; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) 6862; GFX1164_DPP-NEXT: v_readfirstlane_b32 s2, v6 6863; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v4 6864; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v5 6865; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v7 6866; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) 6867; GFX1164_DPP-NEXT: v_sub_co_u32 v6, vcc, s2, v8 6868; GFX1164_DPP-NEXT: s_mov_b32 s2, s6 6869; GFX1164_DPP-NEXT: v_sub_co_ci_u32_e32 v7, vcc, s3, v9, vcc 6870; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 6871; GFX1164_DPP-NEXT: buffer_store_b64 v[6:7], off, s[0:3], 0 6872; GFX1164_DPP-NEXT: s_endpgm 6873; 6874; GFX1132_DPP-LABEL: sub_i64_varying: 6875; GFX1132_DPP: ; %bb.0: ; %entry 6876; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 6877; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 6878; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) 6879; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s0 6880; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 6881; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s0 6882; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v7, 0 6883; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, 0 6884; GFX1132_DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 6885; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6886; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 6887; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 6888; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo 6889; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 6890; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 6891; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 6892; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf 6893; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo 6894; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) 6895; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 6896; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 6897; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf 6898; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) 6899; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo 6900; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc_lo, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 6901; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 6902; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 6903; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf 6904; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 6905; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 6906; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo 6907; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc_lo, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 
6908; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 6909; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1 6910; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 6911; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 6912; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 6913; GFX1132_DPP-NEXT: s_or_saveexec_b32 s6, -1 6914; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) 6915; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo 6916; GFX1132_DPP-NEXT: v_readlane_b32 s4, v2, 31 6917; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf 6918; GFX1132_DPP-NEXT: v_readlane_b32 s7, v2, 15 6919; GFX1132_DPP-NEXT: v_readlane_b32 s8, v1, 15 6920; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 31 6921; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6922; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s6 6923; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 6924; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6925; GFX1132_DPP-NEXT: s_or_saveexec_b32 s6, -1 6926; GFX1132_DPP-NEXT: v_writelane_b32 v6, s7, 16 6927; GFX1132_DPP-NEXT: v_writelane_b32 v7, s8, 16 6928; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s6 6929; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 6930; GFX1132_DPP-NEXT: s_mov_b32 s8, exec_lo 6931; GFX1132_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9 6932; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 6933; GFX1132_DPP-NEXT: s_cbranch_execz .LBB11_2 6934; GFX1132_DPP-NEXT: ; %bb.1: 6935; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, s5 :: v_dual_mov_b32 v8, s4 6936; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 6937; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) 6938; GFX1132_DPP-NEXT: s_mov_b32 s4, s2 6939; GFX1132_DPP-NEXT: s_mov_b32 s5, s3 6940; GFX1132_DPP-NEXT: buffer_atomic_sub_u64 v[8:9], off, s[4:7], 0 glc 6941; GFX1132_DPP-NEXT: s_waitcnt vmcnt(0) 6942; GFX1132_DPP-NEXT: buffer_gl1_inv 6943; GFX1132_DPP-NEXT: buffer_gl0_inv 
6944; GFX1132_DPP-NEXT: .LBB11_2: 6945; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 6946; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) 6947; GFX1132_DPP-NEXT: v_readfirstlane_b32 s2, v8 6948; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v6 6949; GFX1132_DPP-NEXT: v_mov_b32_e32 v11, v7 6950; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v9 6951; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) 6952; GFX1132_DPP-NEXT: v_sub_co_u32 v8, vcc_lo, s2, v10 6953; GFX1132_DPP-NEXT: s_mov_b32 s2, s6 6954; GFX1132_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, s3, v11, vcc_lo 6955; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 6956; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], 0 6957; GFX1132_DPP-NEXT: s_endpgm 6958; 6959; GFX1264_DPP-LABEL: sub_i64_varying: 6960; GFX1264_DPP: ; %bb.0: ; %entry 6961; GFX1264_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 6962; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 6963; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) 6964; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] 6965; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, 0 6966; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] 6967; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, 0 6968; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 6969; GFX1264_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6970; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 6971; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 6972; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc 6973; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 6974; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, 0 6975; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 6976; GFX1264_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:2 
row_mask:0xf bank_mask:0xf 6977; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc 6978; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) 6979; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 6980; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, 0 6981; GFX1264_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf 6982; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) 6983; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc 6984; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 6985; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, 0 6986; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 6987; GFX1264_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf 6988; GFX1264_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 6989; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 6990; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc 6991; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 6992; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, 0 6993; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 6994; GFX1264_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1 6995; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 6996; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 6997; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc 6998; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, 0 6999; GFX1264_DPP-NEXT: v_readlane_b32 s2, v1, 31 7000; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 7001; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, s2 7002; GFX1264_DPP-NEXT: v_readlane_b32 s2, v2, 31 7003; 
GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 7004; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) 7005; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v2, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 7006; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, 0 7007; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc 7008; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] 7009; GFX1264_DPP-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 7010; GFX1264_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 7011; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 7012; GFX1264_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf 7013; GFX1264_DPP-NEXT: v_readlane_b32 s6, v2, 15 7014; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf 7015; GFX1264_DPP-NEXT: v_readlane_b32 s7, v1, 15 7016; GFX1264_DPP-NEXT: v_readlane_b32 s8, v2, 31 7017; GFX1264_DPP-NEXT: v_readlane_b32 s9, v1, 31 7018; GFX1264_DPP-NEXT: v_writelane_b32 v4, s6, 16 7019; GFX1264_DPP-NEXT: v_readlane_b32 s6, v2, 63 7020; GFX1264_DPP-NEXT: v_writelane_b32 v5, s7, 16 7021; GFX1264_DPP-NEXT: v_readlane_b32 s10, v2, 47 7022; GFX1264_DPP-NEXT: v_readlane_b32 s11, v1, 47 7023; GFX1264_DPP-NEXT: v_readlane_b32 s7, v1, 63 7024; GFX1264_DPP-NEXT: v_writelane_b32 v4, s8, 32 7025; GFX1264_DPP-NEXT: v_writelane_b32 v5, s9, 32 7026; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5] 7027; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 7028; GFX1264_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 7029; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[8:9], -1 7030; GFX1264_DPP-NEXT: s_mov_b64 s[4:5], s[6:7] 7031; GFX1264_DPP-NEXT: v_writelane_b32 v4, s10, 48 7032; GFX1264_DPP-NEXT: v_writelane_b32 v5, s11, 48 7033; GFX1264_DPP-NEXT: s_wait_alu 0xfffe 7034; GFX1264_DPP-NEXT: s_mov_b64 exec, s[8:9] 7035; GFX1264_DPP-NEXT: s_mov_b32 s6, -1 7036; GFX1264_DPP-NEXT: s_mov_b64 s[8:9], exec 7037; GFX1264_DPP-NEXT: ; implicit-def: $vgpr6_vgpr7 7038; 
GFX1264_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 7039; GFX1264_DPP-NEXT: s_cbranch_execz .LBB11_2 7040; GFX1264_DPP-NEXT: ; %bb.1: 7041; GFX1264_DPP-NEXT: v_mov_b32_e32 v7, s5 7042; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, s4 7043; GFX1264_DPP-NEXT: s_mov_b32 s7, 0x31016000 7044; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0 7045; GFX1264_DPP-NEXT: s_mov_b32 s4, s2 7046; GFX1264_DPP-NEXT: s_mov_b32 s5, s3 7047; GFX1264_DPP-NEXT: buffer_atomic_sub_u64 v[6:7], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV 7048; GFX1264_DPP-NEXT: s_wait_loadcnt 0x0 7049; GFX1264_DPP-NEXT: global_inv scope:SCOPE_DEV 7050; GFX1264_DPP-NEXT: .LBB11_2: 7051; GFX1264_DPP-NEXT: s_wait_alu 0xfffe 7052; GFX1264_DPP-NEXT: s_or_b64 exec, exec, s[8:9] 7053; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0 7054; GFX1264_DPP-NEXT: v_readfirstlane_b32 s2, v6 7055; GFX1264_DPP-NEXT: v_mov_b32_e32 v8, v4 7056; GFX1264_DPP-NEXT: v_mov_b32_e32 v9, v5 7057; GFX1264_DPP-NEXT: v_readfirstlane_b32 s3, v7 7058; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) 7059; GFX1264_DPP-NEXT: v_sub_co_u32 v6, vcc, s2, v8 7060; GFX1264_DPP-NEXT: s_mov_b32 s2, s6 7061; GFX1264_DPP-NEXT: v_sub_co_ci_u32_e32 v7, vcc, s3, v9, vcc 7062; GFX1264_DPP-NEXT: s_mov_b32 s3, 0x31016000 7063; GFX1264_DPP-NEXT: buffer_store_b64 v[6:7], off, s[0:3], null 7064; GFX1264_DPP-NEXT: s_endpgm 7065; 7066; GFX1232_DPP-LABEL: sub_i64_varying: 7067; GFX1232_DPP: ; %bb.0: ; %entry 7068; GFX1232_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 7069; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1 7070; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) 7071; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s0 7072; GFX1232_DPP-NEXT: v_mov_b32_e32 v2, 0 7073; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s0 7074; GFX1232_DPP-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v7, 0 7075; GFX1232_DPP-NEXT: v_mov_b32_e32 v6, 0 7076; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | 
instid1(VALU_DEP_4) 7077; GFX1232_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf 7078; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 7079; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 7080; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo 7081; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 7082; GFX1232_DPP-NEXT: v_mov_b32_e32 v2, 0 7083; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 7084; GFX1232_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf 7085; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo 7086; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) 7087; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 7088; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, 0 7089; GFX1232_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf 7090; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) 7091; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo 7092; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc_lo, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 7093; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, 0 7094; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 7095; GFX1232_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf 7096; GFX1232_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 7097; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 7098; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo 7099; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc_lo, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 7100; GFX1232_DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 7101; GFX1232_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1 7102; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 7103; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 7104; GFX1232_DPP-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 7105; GFX1232_DPP-NEXT: s_or_saveexec_b32 s6, -1 7106; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) 7107; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo 7108; GFX1232_DPP-NEXT: v_readlane_b32 s4, v2, 31 7109; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf 7110; GFX1232_DPP-NEXT: v_readlane_b32 s7, v2, 15 7111; GFX1232_DPP-NEXT: v_readlane_b32 s8, v1, 15 7112; GFX1232_DPP-NEXT: v_readlane_b32 s5, v1, 31 7113; GFX1232_DPP-NEXT: v_mov_b32_dpp v7, v1 row_shr:1 row_mask:0xf bank_mask:0xf 7114; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s6 7115; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 7116; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 7117; GFX1232_DPP-NEXT: s_or_saveexec_b32 s6, -1 7118; GFX1232_DPP-NEXT: v_writelane_b32 v6, s7, 16 7119; GFX1232_DPP-NEXT: v_writelane_b32 v7, s8, 16 7120; GFX1232_DPP-NEXT: s_wait_alu 0xfffe 7121; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s6 7122; GFX1232_DPP-NEXT: s_mov_b32 s6, -1 7123; GFX1232_DPP-NEXT: s_mov_b32 s8, exec_lo 7124; GFX1232_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9 7125; GFX1232_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 7126; GFX1232_DPP-NEXT: s_cbranch_execz .LBB11_2 7127; GFX1232_DPP-NEXT: ; %bb.1: 7128; GFX1232_DPP-NEXT: v_dual_mov_b32 v9, s5 :: v_dual_mov_b32 v8, s4 7129; GFX1232_DPP-NEXT: s_mov_b32 s7, 0x31016000 7130; GFX1232_DPP-NEXT: s_wait_kmcnt 0x0 7131; GFX1232_DPP-NEXT: s_mov_b32 s4, s2 7132; GFX1232_DPP-NEXT: s_mov_b32 s5, s3 7133; GFX1232_DPP-NEXT: buffer_atomic_sub_u64 v[8:9], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV 7134; GFX1232_DPP-NEXT: s_wait_loadcnt 0x0 7135; GFX1232_DPP-NEXT: global_inv 
scope:SCOPE_DEV 7136; GFX1232_DPP-NEXT: .LBB11_2: 7137; GFX1232_DPP-NEXT: s_wait_alu 0xfffe 7138; GFX1232_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 7139; GFX1232_DPP-NEXT: s_wait_kmcnt 0x0 7140; GFX1232_DPP-NEXT: v_readfirstlane_b32 s2, v8 7141; GFX1232_DPP-NEXT: v_mov_b32_e32 v10, v6 7142; GFX1232_DPP-NEXT: v_mov_b32_e32 v11, v7 7143; GFX1232_DPP-NEXT: v_readfirstlane_b32 s3, v9 7144; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) 7145; GFX1232_DPP-NEXT: v_sub_co_u32 v8, vcc_lo, s2, v10 7146; GFX1232_DPP-NEXT: s_mov_b32 s2, s6 7147; GFX1232_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, s3, v11, vcc_lo 7148; GFX1232_DPP-NEXT: s_mov_b32 s3, 0x31016000 7149; GFX1232_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], null 7150; GFX1232_DPP-NEXT: s_endpgm 7151entry: 7152 %lane = call i32 @llvm.amdgcn.workitem.id.x() 7153 %zext = zext i32 %lane to i64 7154 %old = atomicrmw sub ptr addrspace(1) %inout, i64 %zext syncscope("agent") acq_rel 7155 store i64 %old, ptr addrspace(1) %out 7156 ret void 7157} 7158