; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS,GFX7LESS_ITERATIVE %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX8,GFX8_ITERATIVE %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_ITERATIVE %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_ITERATIVE %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_ITERATIVE %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164_ITERATIVE %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132_ITERATIVE %s
; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS,GFX7LESS_DPP %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX8,GFX8_DPP %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_DPP %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_DPP %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_DPP %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164_DPP %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132_DPP %s

; The llc invocations above cover seven target configurations, each run once
; with the Iterative and once with the DPP atomic-optimizer strategy. Every
; invocation feeds FileCheck a prefix shared by both strategies plus a
; strategy-specific one.

declare i32 @llvm.amdgcn.workitem.id.x()

; addrspace(3) globals used as the atomicrmw targets in this file.
; @local_var64 is presumably used by 64-bit tests later in the file (not
; visible from this section -- confirm).
@local_var32 = addrspace(3) global i32 poison, align 4
@local_var64 = addrspace(3) global i64 poison, align 8

; Show what the atomic optimization pass will do for local pointers.
; add_i32_constant: performs `atomicrmw add` of the constant 5 (acq_rel) on
; the addrspace(3) global @local_var32 and stores the returned old value to
; %out. The assertions below show the same shape on every checked target: the
; population count of the saved exec mask (s_bcnt1) is multiplied by 5, one
; ds_add_rtn_u32 is issued under the "mbcnt result == 0" predicate, and the
; stored value is rebuilt as readfirstlane(old) + 5 * mbcnt with
; v_mad_u32_u24.
define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
; GFX7LESS-LABEL: add_i32_constant:
; GFX7LESS:       ; %bb.0: ; %entry
; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
; GFX7LESS-NEXT:    s_and_saveexec_b64 s[0:1], vcc
; GFX7LESS-NEXT:    s_cbranch_execz .LBB0_2
; GFX7LESS-NEXT:  ; %bb.1:
; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
; GFX7LESS-NEXT:    s_mul_i32 s2, s2, 5
; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s2
; GFX7LESS-NEXT:    s_mov_b32 m0, -1
; GFX7LESS-NEXT:    ds_add_rtn_u32 v1, v1, v2
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:  .LBB0_2:
; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT:    s_mov_b32 s2, -1
; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v1
; GFX7LESS-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT:    s_endpgm
;
; GFX8-LABEL: add_i32_constant:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_mov_b64 s[2:3], exec
; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT:    ; implicit-def: $vgpr1
; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], vcc
; GFX8-NEXT:    s_cbranch_execz .LBB0_2
; GFX8-NEXT:  ; %bb.1:
; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
; GFX8-NEXT:    s_mul_i32 s2, s2, 5
; GFX8-NEXT:    v_mov_b32_e32 v1, 0
; GFX8-NEXT:    v_mov_b32_e32 v2, s2
; GFX8-NEXT:    s_mov_b32 m0, -1
; GFX8-NEXT:    ds_add_rtn_u32 v1, v1, v2
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:  .LBB0_2:
; GFX8-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8-NEXT:    v_readfirstlane_b32 s4, v1
; GFX8-NEXT:    s_mov_b32 s3, 0xf000
; GFX8-NEXT:    s_mov_b32 s2, -1
; GFX8-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: add_i32_constant:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_mov_b64 s[2:3], exec
; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT:    ; implicit-def: $vgpr1
; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
; GFX9-NEXT:    s_cbranch_execz .LBB0_2
; GFX9-NEXT:  ; %bb.1:
; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
; GFX9-NEXT:    s_mul_i32 s2, s2, 5
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    ds_add_rtn_u32 v1, v1, v2
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:  .LBB0_2:
; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; GFX1064-LABEL: add_i32_constant:
; GFX1064:       ; %bb.0: ; %entry
; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
; GFX1064-NEXT:    ; implicit-def: $vgpr1
; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
; GFX1064-NEXT:    s_cbranch_execz .LBB0_2
; GFX1064-NEXT:  ; %bb.1:
; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
; GFX1064-NEXT:    s_mul_i32 s2, s2, 5
; GFX1064-NEXT:    v_mov_b32_e32 v2, s2
; GFX1064-NEXT:    ds_add_rtn_u32 v1, v1, v2
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    buffer_gl0_inv
; GFX1064-NEXT:  .LBB0_2:
; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1064-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
; GFX1064-NEXT:    s_mov_b32 s2, -1
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT:    s_endpgm
;
; GFX1032-LABEL: add_i32_constant:
; GFX1032:       ; %bb.0: ; %entry
; GFX1032-NEXT:    s_mov_b32 s1, exec_lo
; GFX1032-NEXT:    ; implicit-def: $vgpr1
; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s1, 0
; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT:    s_cbranch_execz .LBB0_2
; GFX1032-NEXT:  ; %bb.1:
; GFX1032-NEXT:    s_bcnt1_i32_b32 s1, s1
; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
; GFX1032-NEXT:    s_mul_i32 s1, s1, 5
; GFX1032-NEXT:    v_mov_b32_e32 v2, s1
; GFX1032-NEXT:    ds_add_rtn_u32 v1, v1, v2
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    buffer_gl0_inv
; GFX1032-NEXT:  .LBB0_2:
; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s0
; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1032-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
; GFX1032-NEXT:    s_mov_b32 s2, -1
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT:    s_endpgm
;
; GFX1164-LABEL: add_i32_constant:
; GFX1164:       ; %bb.0: ; %entry
; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1164-NEXT:    ; implicit-def: $vgpr1
; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT:    s_cbranch_execz .LBB0_2
; GFX1164-NEXT:  ; %bb.1:
; GFX1164-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
; GFX1164-NEXT:    s_mul_i32 s2, s2, 5
; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-NEXT:    v_mov_b32_e32 v2, s2
; GFX1164-NEXT:    ds_add_rtn_u32 v1, v1, v2
; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1164-NEXT:    buffer_gl0_inv
; GFX1164-NEXT:  .LBB0_2:
; GFX1164-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX1164-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX1164-NEXT:    v_readfirstlane_b32 s2, v1
; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
; GFX1164-NEXT:    s_mov_b32 s2, -1
; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX1164-NEXT:    s_endpgm
;
; GFX1132-LABEL: add_i32_constant:
; GFX1132:       ; %bb.0: ; %entry
; GFX1132-NEXT:    s_mov_b32 s1, exec_lo
; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, s1, 0
; GFX1132-NEXT:    ; implicit-def: $vgpr1
; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT:    s_cbranch_execz .LBB0_2
; GFX1132-NEXT:  ; %bb.1:
; GFX1132-NEXT:    s_bcnt1_i32_b32 s1, s1
; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT:    s_mul_i32 s1, s1, 5
; GFX1132-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s1
; GFX1132-NEXT:    ds_add_rtn_u32 v1, v1, v2
; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1132-NEXT:    buffer_gl0_inv
; GFX1132-NEXT:  .LBB0_2:
; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s0
; GFX1132-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX1132-NEXT:    v_readfirstlane_b32 s2, v1
; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
; GFX1132-NEXT:    s_mov_b32 s2, -1
; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX1132-NEXT:    s_endpgm
entry:
  %old = atomicrmw add ptr addrspace(3) @local_var32, i32 5 acq_rel
  store i32 %old, ptr addrspace(1) %out
  ret void
}

; add_i32_uniform: same operation as add_i32_constant, but the addend is the
; kernel argument %additive instead of a literal. The assertions below show
; the argument being loaded with a scalar load, multiplied by the population
; count of the saved exec mask (s_mul_i32) to feed a single ds_add_rtn_u32,
; and the stored value rebuilt as readfirstlane(old) + additive * mbcnt:
; v_mul_lo_u32 followed by an add in the GFX7LESS/GFX8/GFX9 groups, and a
; single v_mad_u64_u32 in the GFX10xx/GFX11xx groups.
define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) {
; GFX7LESS-LABEL: add_i32_uniform:
; GFX7LESS:       ; %bb.0: ; %entry
; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
; GFX7LESS-NEXT:    s_load_dword s6, s[4:5], 0xb
; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
; GFX7LESS-NEXT:    s_and_saveexec_b64 s[0:1], vcc
; GFX7LESS-NEXT:    s_cbranch_execz .LBB1_2
; GFX7LESS-NEXT:  ; %bb.1:
; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:    s_mul_i32 s2, s6, s2
; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s2
; GFX7LESS-NEXT:    s_mov_b32 m0, -1
; GFX7LESS-NEXT:    ds_add_rtn_u32 v1, v1, v2
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:  .LBB1_2:
; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT:    s_mov_b32 s2, -1
; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v1
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s6, v0
; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT:    s_endpgm
;
; GFX8-LABEL: add_i32_uniform:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dword s6, s[4:5], 0x2c
; GFX8-NEXT:    s_mov_b64 s[2:3], exec
; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT:    ; implicit-def: $vgpr1
; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], vcc
; GFX8-NEXT:    s_cbranch_execz .LBB1_2
; GFX8-NEXT:  ; %bb.1:
; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_mul_i32 s2, s6, s2
; GFX8-NEXT:    v_mov_b32_e32 v1, 0
; GFX8-NEXT:    v_mov_b32_e32 v2, s2
; GFX8-NEXT:    s_mov_b32 m0, -1
; GFX8-NEXT:    ds_add_rtn_u32 v1, v1, v2
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:  .LBB1_2:
; GFX8-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mul_lo_u32 v0, s6, v0
; GFX8-NEXT:    v_readfirstlane_b32 s4, v1
; GFX8-NEXT:    s_mov_b32 s3, 0xf000
; GFX8-NEXT:    s_mov_b32 s2, -1
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v0
; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: add_i32_uniform:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
; GFX9-NEXT:    s_mov_b64 s[2:3], exec
; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT:    ; implicit-def: $vgpr1
; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
; GFX9-NEXT:    s_cbranch_execz .LBB1_2
; GFX9-NEXT:  ; %bb.1:
; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mul_i32 s2, s6, s2
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    ds_add_rtn_u32 v1, v1, v2
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:  .LBB1_2:
; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mul_lo_u32 v0, s6, v0
; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    v_add_u32_e32 v0, s4, v0
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; GFX1064-LABEL: add_i32_uniform:
; GFX1064:       ; %bb.0: ; %entry
; GFX1064-NEXT:    s_load_dword s6, s[4:5], 0x2c
; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
; GFX1064-NEXT:    ; implicit-def: $vgpr1
; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
; GFX1064-NEXT:    s_cbranch_execz .LBB1_2
; GFX1064-NEXT:  ; %bb.1:
; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    s_mul_i32 s2, s6, s2
; GFX1064-NEXT:    v_mov_b32_e32 v2, s2
; GFX1064-NEXT:    ds_add_rtn_u32 v1, v1, v2
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    buffer_gl0_inv
; GFX1064-NEXT:  .LBB1_2:
; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], s6, v0, s[2:3]
; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1064-NEXT:    s_mov_b32 s2, -1
; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT:    s_endpgm
;
; GFX1032-LABEL: add_i32_uniform:
; GFX1032:       ; %bb.0: ; %entry
; GFX1032-NEXT:    s_load_dword s0, s[4:5], 0x2c
; GFX1032-NEXT:    s_mov_b32 s2, exec_lo
; GFX1032-NEXT:    ; implicit-def: $vgpr1
; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT:    s_and_saveexec_b32 s1, vcc_lo
; GFX1032-NEXT:    s_cbranch_execz .LBB1_2
; GFX1032-NEXT:  ; %bb.1:
; GFX1032-NEXT:    s_bcnt1_i32_b32 s2, s2
; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    s_mul_i32 s2, s0, s2
; GFX1032-NEXT:    v_mov_b32_e32 v2, s2
; GFX1032-NEXT:    ds_add_rtn_u32 v1, v1, v2
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    buffer_gl0_inv
; GFX1032-NEXT:  .LBB1_2:
; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s1
; GFX1032-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x24
; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
; GFX1032-NEXT:    s_mov_b32 s11, 0x31016000
; GFX1032-NEXT:    s_mov_b32 s10, -1
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], s0, s0, v0, s[2:3]
; GFX1032-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; GFX1032-NEXT:    s_endpgm
;
; GFX1164-LABEL: add_i32_uniform:
; GFX1164:       ; %bb.0: ; %entry
; GFX1164-NEXT:    s_load_b32 s6, s[4:5], 0x2c
; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1164-NEXT:    ; implicit-def: $vgpr1
; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT:    s_cbranch_execz .LBB1_2
; GFX1164-NEXT:  ; %bb.1:
; GFX1164-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1164-NEXT:    s_mul_i32 s2, s6, s2
; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-NEXT:    v_mov_b32_e32 v2, s2
; GFX1164-NEXT:    ds_add_rtn_u32 v1, v1, v2
; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1164-NEXT:    buffer_gl0_inv
; GFX1164-NEXT:  .LBB1_2:
; GFX1164-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX1164-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX1164-NEXT:    v_readfirstlane_b32 s2, v1
; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT:    v_mad_u64_u32 v[1:2], null, s6, v0, s[2:3]
; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1164-NEXT:    s_mov_b32 s2, -1
; GFX1164-NEXT:    buffer_store_b32 v1, off, s[0:3], 0
; GFX1164-NEXT:    s_endpgm
;
; GFX1132-LABEL: add_i32_uniform:
; GFX1132:       ; %bb.0: ; %entry
; GFX1132-NEXT:    s_load_b32 s0, s[4:5], 0x2c
; GFX1132-NEXT:    s_mov_b32 s2, exec_lo
; GFX1132-NEXT:    s_mov_b32 s1, exec_lo
; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1132-NEXT:    ; implicit-def: $vgpr1
; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT:    s_cbranch_execz .LBB1_2
; GFX1132-NEXT:  ; %bb.1:
; GFX1132-NEXT:    s_bcnt1_i32_b32 s2, s2
; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1132-NEXT:    s_mul_i32 s2, s0, s2
; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2
; GFX1132-NEXT:    ds_add_rtn_u32 v1, v1, v2
; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1132-NEXT:    buffer_gl0_inv
; GFX1132-NEXT:  .LBB1_2:
; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s1
; GFX1132-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
; GFX1132-NEXT:    v_readfirstlane_b32 s2, v1
; GFX1132-NEXT:    s_mov_b32 s7, 0x31016000
; GFX1132-NEXT:    s_mov_b32 s6, -1
; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT:    v_mad_u64_u32 v[1:2], null, s0, v0, s[2:3]
; GFX1132-NEXT:    buffer_store_b32 v1, off, s[4:7], 0
; GFX1132-NEXT:    s_endpgm
entry:
  %old = atomicrmw add ptr addrspace(3) @local_var32, i32 %additive acq_rel
  store i32 %old, ptr addrspace(1) %out
  ret void
}

453define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { 454; GFX7LESS_ITERATIVE-LABEL: add_i32_varying: 455; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry 456; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 457; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, 0 458; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1 459; GFX7LESS_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop 460; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 461; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] 462; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s3 463; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 464; 
GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 465; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 466; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] 467; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 468; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] 469; GFX7LESS_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 470; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB2_1 471; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 472; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 473; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 474; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 475; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr0 476; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc 477; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 478; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 479; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: 480; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 481; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 482; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 483; GFX7LESS_ITERATIVE-NEXT: ds_add_rtn_u32 v0, v0, v2 484; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 485; GFX7LESS_ITERATIVE-NEXT: .LBB2_4: 486; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] 487; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 488; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 489; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 490; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 491; GFX7LESS_ITERATIVE-NEXT: v_add_i32_e32 v0, vcc, s4, v1 492; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 493; GFX7LESS_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 494; GFX7LESS_ITERATIVE-NEXT: s_endpgm 495; 496; GFX8_ITERATIVE-LABEL: add_i32_varying: 497; GFX8_ITERATIVE: ; %bb.0: ; %entry 498; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 499; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, 0 500; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 501; GFX8_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop 502; 
GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 503; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] 504; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 505; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 506; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 507; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 508; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 509; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] 510; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 511; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 512; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 513; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 514; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 515; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 516; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr0 517; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc 518; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 519; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 520; GFX8_ITERATIVE-NEXT: ; %bb.3: 521; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 522; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 523; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 524; GFX8_ITERATIVE-NEXT: ds_add_rtn_u32 v0, v0, v2 525; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 526; GFX8_ITERATIVE-NEXT: .LBB2_4: 527; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] 528; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 529; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 530; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 531; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 532; GFX8_ITERATIVE-NEXT: v_add_u32_e32 v0, vcc, s4, v1 533; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 534; GFX8_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 535; GFX8_ITERATIVE-NEXT: s_endpgm 536; 537; GFX9_ITERATIVE-LABEL: add_i32_varying: 538; GFX9_ITERATIVE: ; %bb.0: ; %entry 539; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 540; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, 0 541; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 542; GFX9_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop 543; 
GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 544; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] 545; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 546; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 547; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 548; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 549; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 550; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] 551; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 552; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 553; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 554; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 555; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 556; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 557; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0 558; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc 559; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 560; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 561; GFX9_ITERATIVE-NEXT: ; %bb.3: 562; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 563; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 564; GFX9_ITERATIVE-NEXT: ds_add_rtn_u32 v0, v0, v2 565; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 566; GFX9_ITERATIVE-NEXT: .LBB2_4: 567; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] 568; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 569; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 570; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 571; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 572; GFX9_ITERATIVE-NEXT: v_add_u32_e32 v0, s4, v1 573; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 574; GFX9_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 575; GFX9_ITERATIVE-NEXT: s_endpgm 576; 577; GFX1064_ITERATIVE-LABEL: add_i32_varying: 578; GFX1064_ITERATIVE: ; %bb.0: ; %entry 579; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 580; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, 0 581; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 582; GFX1064_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop 583; GFX1064_ITERATIVE-NEXT: ; 
=>This Inner Loop Header: Depth=1 584; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] 585; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 586; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 587; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 588; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] 589; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 590; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 591; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 592; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 593; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 594; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 595; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 596; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0 597; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc 598; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 599; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 600; GFX1064_ITERATIVE-NEXT: ; %bb.3: 601; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 602; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 603; GFX1064_ITERATIVE-NEXT: ds_add_rtn_u32 v0, v0, v2 604; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 605; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv 606; GFX1064_ITERATIVE-NEXT: .LBB2_4: 607; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 608; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] 609; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 610; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 611; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 612; GFX1064_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v1 613; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 614; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 615; GFX1064_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 616; GFX1064_ITERATIVE-NEXT: s_endpgm 617; 618; GFX1032_ITERATIVE-LABEL: add_i32_varying: 619; GFX1032_ITERATIVE: ; %bb.0: ; %entry 620; GFX1032_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo 621; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, 0 622; 
GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 623; GFX1032_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop 624; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 625; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 626; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 627; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 628; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 629; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 630; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 631; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 632; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 633; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 634; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 635; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 636; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0 637; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo 638; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 639; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 640; GFX1032_ITERATIVE-NEXT: ; %bb.3: 641; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 642; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 643; GFX1032_ITERATIVE-NEXT: ds_add_rtn_u32 v0, v0, v2 644; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 645; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv 646; GFX1032_ITERATIVE-NEXT: .LBB2_4: 647; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 648; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 649; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 650; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 651; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 652; GFX1032_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v1 653; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 654; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 655; GFX1032_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 656; GFX1032_ITERATIVE-NEXT: s_endpgm 657; 658; GFX1164_ITERATIVE-LABEL: add_i32_varying: 659; GFX1164_ITERATIVE: ; %bb.0: ; %entry 660; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 661; 
GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 662; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, 0 663; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 664; GFX1164_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop 665; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 666; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] 667; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 668; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 669; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 670; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 671; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] 672; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) 673; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 674; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 675; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 676; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 677; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 678; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 679; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 680; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 681; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1 682; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc 683; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 684; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 685; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 686; GFX1164_ITERATIVE-NEXT: ; %bb.3: 687; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 688; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 689; GFX1164_ITERATIVE-NEXT: ds_add_rtn_u32 v1, v1, v2 690; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 691; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv 692; GFX1164_ITERATIVE-NEXT: .LBB2_4: 693; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] 694; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 695; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 696; GFX1164_ITERATIVE-NEXT: 
s_mov_b32 s3, 0x31016000 697; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) 698; GFX1164_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v0 699; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 700; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 701; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 702; GFX1164_ITERATIVE-NEXT: s_endpgm 703; 704; GFX1132_ITERATIVE-LABEL: add_i32_varying: 705; GFX1132_ITERATIVE: ; %bb.0: ; %entry 706; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 707; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo 708; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, 0 709; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 710; GFX1132_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop 711; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 712; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 713; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 714; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 715; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 716; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 717; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 718; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) 719; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 720; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 721; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 722; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 723; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 724; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 725; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 726; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 727; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo 728; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 729; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 730; GFX1132_ITERATIVE-NEXT: ; %bb.3: 731; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 732; GFX1132_ITERATIVE-NEXT: ds_add_rtn_u32 v1, v1, v2 733; 
GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 734; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv 735; GFX1132_ITERATIVE-NEXT: .LBB2_4: 736; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 737; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 738; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 739; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 740; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) 741; GFX1132_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v0 742; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 743; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 744; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 745; GFX1132_ITERATIVE-NEXT: s_endpgm 746; 747; GFX7LESS_DPP-LABEL: add_i32_varying: 748; GFX7LESS_DPP: ; %bb.0: ; %entry 749; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 750; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 751; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 752; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) 753; GFX7LESS_DPP-NEXT: ds_add_rtn_u32 v0, v1, v0 754; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) 755; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000 756; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1 757; GFX7LESS_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 758; GFX7LESS_DPP-NEXT: s_endpgm 759; 760; GFX8_DPP-LABEL: add_i32_varying: 761; GFX8_DPP: ; %bb.0: ; %entry 762; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 763; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 764; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 765; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 766; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] 767; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 768; GFX8_DPP-NEXT: s_nop 0 769; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 770; GFX8_DPP-NEXT: s_nop 1 771; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 772; GFX8_DPP-NEXT: s_nop 1 773; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 774; GFX8_DPP-NEXT: 
s_nop 1 775; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 776; GFX8_DPP-NEXT: s_nop 1 777; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 778; GFX8_DPP-NEXT: s_nop 1 779; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 780; GFX8_DPP-NEXT: v_readlane_b32 s2, v1, 63 781; GFX8_DPP-NEXT: s_nop 0 782; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf 783; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] 784; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 785; GFX8_DPP-NEXT: ; implicit-def: $vgpr0 786; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc 787; GFX8_DPP-NEXT: s_cbranch_execz .LBB2_2 788; GFX8_DPP-NEXT: ; %bb.1: 789; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s2 790; GFX8_DPP-NEXT: s_mov_b32 m0, -1 791; GFX8_DPP-NEXT: ds_add_rtn_u32 v0, v3, v0 792; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) 793; GFX8_DPP-NEXT: .LBB2_2: 794; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] 795; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 796; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 797; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v2 798; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 799; GFX8_DPP-NEXT: s_mov_b32 s2, -1 800; GFX8_DPP-NEXT: v_add_u32_e32 v0, vcc, s4, v0 801; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) 802; GFX8_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 803; GFX8_DPP-NEXT: s_endpgm 804; 805; GFX9_DPP-LABEL: add_i32_varying: 806; GFX9_DPP: ; %bb.0: ; %entry 807; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 808; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 809; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 810; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 811; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] 812; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 813; GFX9_DPP-NEXT: s_nop 0 814; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 815; GFX9_DPP-NEXT: s_nop 1 816; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 
bound_ctrl:1 817; GFX9_DPP-NEXT: s_nop 1 818; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 819; GFX9_DPP-NEXT: s_nop 1 820; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 821; GFX9_DPP-NEXT: s_nop 1 822; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 823; GFX9_DPP-NEXT: s_nop 1 824; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 825; GFX9_DPP-NEXT: v_readlane_b32 s2, v1, 63 826; GFX9_DPP-NEXT: s_nop 0 827; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf 828; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] 829; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 830; GFX9_DPP-NEXT: ; implicit-def: $vgpr0 831; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc 832; GFX9_DPP-NEXT: s_cbranch_execz .LBB2_2 833; GFX9_DPP-NEXT: ; %bb.1: 834; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s2 835; GFX9_DPP-NEXT: ds_add_rtn_u32 v0, v3, v0 836; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) 837; GFX9_DPP-NEXT: .LBB2_2: 838; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] 839; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 840; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 841; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v2 842; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 843; GFX9_DPP-NEXT: s_mov_b32 s2, -1 844; GFX9_DPP-NEXT: v_add_u32_e32 v0, s4, v0 845; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) 846; GFX9_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 847; GFX9_DPP-NEXT: s_endpgm 848; 849; GFX1064_DPP-LABEL: add_i32_varying: 850; GFX1064_DPP: ; %bb.0: ; %entry 851; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 852; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] 853; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 854; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 855; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 856; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 
row_mask:0xf bank_mask:0xf bound_ctrl:1 857; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 858; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 859; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 860; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 31 861; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s2 862; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 863; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 864; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 15 865; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 31 866; GFX1064_DPP-NEXT: v_writelane_b32 v3, s2, 16 867; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] 868; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 869; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 870; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 47 871; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 63 872; GFX1064_DPP-NEXT: v_writelane_b32 v3, s3, 32 873; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] 874; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 875; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 876; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 877; GFX1064_DPP-NEXT: v_writelane_b32 v3, s2, 48 878; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] 879; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 880; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 881; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 882; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc 883; GFX1064_DPP-NEXT: s_cbranch_execz .LBB2_2 884; GFX1064_DPP-NEXT: ; %bb.1: 885; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s6 886; GFX1064_DPP-NEXT: s_mov_b32 s3, s6 887; GFX1064_DPP-NEXT: ds_add_rtn_u32 v0, v4, v0 888; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) 889; GFX1064_DPP-NEXT: buffer_gl0_inv 890; GFX1064_DPP-NEXT: .LBB2_2: 891; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 892; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[0:1] 893; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 894; 
GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v0 895; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 896; GFX1064_DPP-NEXT: v_add_nc_u32_e32 v0, s3, v0 897; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 898; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) 899; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 900; GFX1064_DPP-NEXT: s_endpgm 901; 902; GFX1032_DPP-LABEL: add_i32_varying: 903; GFX1032_DPP: ; %bb.0: ; %entry 904; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 905; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 906; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 907; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 908; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 909; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 910; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 911; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 912; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 913; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 914; GFX1032_DPP-NEXT: v_readlane_b32 s2, v1, 31 915; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 916; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 917; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 918; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 919; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 920; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 921; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 922; GFX1032_DPP-NEXT: s_mov_b32 s0, s2 923; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 924; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 925; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 926; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo 927; GFX1032_DPP-NEXT: s_cbranch_execz .LBB2_2 928; GFX1032_DPP-NEXT: ; %bb.1: 929; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s0 930; GFX1032_DPP-NEXT: ds_add_rtn_u32 v0, v4, v0 931; 
GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) 932; GFX1032_DPP-NEXT: buffer_gl0_inv 933; GFX1032_DPP-NEXT: .LBB2_2: 934; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 935; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 936; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 937; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v0 938; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 939; GFX1032_DPP-NEXT: v_add_nc_u32_e32 v0, s3, v0 940; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 941; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) 942; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 943; GFX1032_DPP-NEXT: s_endpgm 944; 945; GFX1164_DPP-LABEL: add_i32_varying: 946; GFX1164_DPP: ; %bb.0: ; %entry 947; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 948; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 949; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 950; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] 951; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 952; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 953; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 954; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 955; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 956; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 957; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 958; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 959; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 960; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 961; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 962; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 31 963; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s2 964; 
GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 965; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 966; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 967; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 15 968; GFX1164_DPP-NEXT: v_readlane_b32 s3, v1, 31 969; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 970; GFX1164_DPP-NEXT: v_writelane_b32 v3, s2, 16 971; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] 972; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 973; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 974; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 47 975; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 63 976; GFX1164_DPP-NEXT: v_writelane_b32 v3, s3, 32 977; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] 978; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 979; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 980; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 981; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 982; GFX1164_DPP-NEXT: v_writelane_b32 v3, s2, 48 983; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] 984; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 985; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 986; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0 987; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc 988; GFX1164_DPP-NEXT: s_cbranch_execz .LBB2_2 989; GFX1164_DPP-NEXT: ; %bb.1: 990; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s6 991; GFX1164_DPP-NEXT: s_mov_b32 s3, s6 992; GFX1164_DPP-NEXT: ds_add_rtn_u32 v0, v4, v0 993; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) 994; GFX1164_DPP-NEXT: buffer_gl0_inv 995; GFX1164_DPP-NEXT: .LBB2_2: 996; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[0:1] 997; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 998; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v0 999; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3 1000; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) 1001; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v0, s3, v0 1002; 
GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 1003; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) 1004; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 1005; GFX1164_DPP-NEXT: s_endpgm 1006; 1007; GFX1132_DPP-LABEL: add_i32_varying: 1008; GFX1132_DPP: ; %bb.0: ; %entry 1009; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1010; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 1011; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 1012; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 1013; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 1014; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 1015; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1016; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1017; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1018; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1019; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1020; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1021; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 1022; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1023; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) 1024; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 1025; GFX1132_DPP-NEXT: v_readlane_b32 s2, v1, 31 1026; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 1027; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 1028; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1029; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 1030; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 1031; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 1032; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 1033; GFX1132_DPP-NEXT: 
s_mov_b32 s0, s2 1034; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 1035; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1036; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 1037; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo 1038; GFX1132_DPP-NEXT: s_cbranch_execz .LBB2_2 1039; GFX1132_DPP-NEXT: ; %bb.1: 1040; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, s0 1041; GFX1132_DPP-NEXT: ds_add_rtn_u32 v0, v4, v0 1042; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) 1043; GFX1132_DPP-NEXT: buffer_gl0_inv 1044; GFX1132_DPP-NEXT: .LBB2_2: 1045; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 1046; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1047; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v0 1048; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3 1049; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) 1050; GFX1132_DPP-NEXT: v_add_nc_u32_e32 v0, s3, v0 1051; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 1052; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) 1053; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 1054; GFX1132_DPP-NEXT: s_endpgm 1055entry: 1056 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1057 %old = atomicrmw add ptr addrspace(3) @local_var32, i32 %lane acq_rel 1058 store i32 %old, ptr addrspace(1) %out 1059 ret void 1060} 1061 1062define amdgpu_kernel void @add_i32_varying_nouse() { 1063; GFX7LESS_ITERATIVE-LABEL: add_i32_varying_nouse: 1064; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry 1065; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 1066; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, 0 1067; GFX7LESS_ITERATIVE-NEXT: .LBB3_1: ; %ComputeLoop 1068; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 1069; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] 1070; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 1071; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 1072; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] 1073; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 1074; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[4:5] 1075; GFX7LESS_ITERATIVE-NEXT: s_add_i32 s2, 
s2, s6 1076; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB3_1 1077; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 1078; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 1079; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 1080; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1081; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc 1082; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 1083; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB3_4 1084; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: 1085; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 1086; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v1, s2 1087; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 1088; GFX7LESS_ITERATIVE-NEXT: ds_add_u32 v0, v1 1089; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 1090; GFX7LESS_ITERATIVE-NEXT: .LBB3_4: 1091; GFX7LESS_ITERATIVE-NEXT: s_endpgm 1092; 1093; GFX8_ITERATIVE-LABEL: add_i32_varying_nouse: 1094; GFX8_ITERATIVE: ; %bb.0: ; %entry 1095; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 1096; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, 0 1097; GFX8_ITERATIVE-NEXT: .LBB3_1: ; %ComputeLoop 1098; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 1099; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] 1100; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 1101; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 1102; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 1103; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] 1104; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 1105; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1 1106; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 1107; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1108; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1109; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1110; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc 1111; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 1112; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB3_4 1113; GFX8_ITERATIVE-NEXT: ; %bb.3: 1114; GFX8_ITERATIVE-NEXT: 
v_mov_b32_e32 v0, 0 1115; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v1, s2 1116; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 1117; GFX8_ITERATIVE-NEXT: ds_add_u32 v0, v1 1118; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 1119; GFX8_ITERATIVE-NEXT: .LBB3_4: 1120; GFX8_ITERATIVE-NEXT: s_endpgm 1121; 1122; GFX9_ITERATIVE-LABEL: add_i32_varying_nouse: 1123; GFX9_ITERATIVE: ; %bb.0: ; %entry 1124; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 1125; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, 0 1126; GFX9_ITERATIVE-NEXT: .LBB3_1: ; %ComputeLoop 1127; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 1128; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] 1129; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 1130; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 1131; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 1132; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] 1133; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 1134; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1 1135; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 1136; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1137; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1138; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1139; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc 1140; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 1141; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB3_4 1142; GFX9_ITERATIVE-NEXT: ; %bb.3: 1143; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 1144; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v1, s2 1145; GFX9_ITERATIVE-NEXT: ds_add_u32 v0, v1 1146; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 1147; GFX9_ITERATIVE-NEXT: .LBB3_4: 1148; GFX9_ITERATIVE-NEXT: s_endpgm 1149; 1150; GFX1064_ITERATIVE-LABEL: add_i32_varying_nouse: 1151; GFX1064_ITERATIVE: ; %bb.0: ; %entry 1152; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 1153; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, 0 1154; GFX1064_ITERATIVE-NEXT: .LBB3_1: ; %ComputeLoop 1155; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 1156; GFX1064_ITERATIVE-NEXT: 
s_ff1_i32_b64 s3, s[0:1] 1157; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 1158; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 1159; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] 1160; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 1161; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 1162; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1 1163; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 1164; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1165; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1166; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1167; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc 1168; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 1169; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB3_4 1170; GFX1064_ITERATIVE-NEXT: ; %bb.3: 1171; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 1172; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v1, s2 1173; GFX1064_ITERATIVE-NEXT: ds_add_u32 v0, v1 1174; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 1175; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv 1176; GFX1064_ITERATIVE-NEXT: .LBB3_4: 1177; GFX1064_ITERATIVE-NEXT: s_endpgm 1178; 1179; GFX1032_ITERATIVE-LABEL: add_i32_varying_nouse: 1180; GFX1032_ITERATIVE: ; %bb.0: ; %entry 1181; GFX1032_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo 1182; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, 0 1183; GFX1032_ITERATIVE-NEXT: .LBB3_1: ; %ComputeLoop 1184; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 1185; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 1186; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 1187; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 1188; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 1189; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 1190; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 1191; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1 1192; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 1193; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1194; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 
1195; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo 1196; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 1197; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB3_4 1198; GFX1032_ITERATIVE-NEXT: ; %bb.3: 1199; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 1200; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v1, s0 1201; GFX1032_ITERATIVE-NEXT: ds_add_u32 v0, v1 1202; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 1203; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv 1204; GFX1032_ITERATIVE-NEXT: .LBB3_4: 1205; GFX1032_ITERATIVE-NEXT: s_endpgm 1206; 1207; GFX1164_ITERATIVE-LABEL: add_i32_varying_nouse: 1208; GFX1164_ITERATIVE: ; %bb.0: ; %entry 1209; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1210; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 1211; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, 0 1212; GFX1164_ITERATIVE-NEXT: .LBB3_1: ; %ComputeLoop 1213; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 1214; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] 1215; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 1216; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 1217; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 1218; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1219; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] 1220; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 1221; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 1222; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1 1223; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 1224; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1225; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 1226; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1227; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1228; GFX1164_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0 1229; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 1230; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB3_4 
1231; GFX1164_ITERATIVE-NEXT: ; %bb.3: 1232; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 1233; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, s2 1234; GFX1164_ITERATIVE-NEXT: ds_add_u32 v0, v1 1235; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 1236; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv 1237; GFX1164_ITERATIVE-NEXT: .LBB3_4: 1238; GFX1164_ITERATIVE-NEXT: s_endpgm 1239; 1240; GFX1132_ITERATIVE-LABEL: add_i32_varying_nouse: 1241; GFX1132_ITERATIVE: ; %bb.0: ; %entry 1242; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1243; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo 1244; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, 0 1245; GFX1132_ITERATIVE-NEXT: .LBB3_1: ; %ComputeLoop 1246; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 1247; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 1248; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 1249; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 1250; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 1251; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1252; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 1253; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 1254; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 1255; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1 1256; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 1257; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1258; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo 1259; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) 1260; GFX1132_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0 1261; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 1262; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB3_4 1263; GFX1132_ITERATIVE-NEXT: ; %bb.3: 1264; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 1265; GFX1132_ITERATIVE-NEXT: ds_add_u32 v0, v1 1266; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 1267; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv 1268; GFX1132_ITERATIVE-NEXT: .LBB3_4: 1269; 
GFX1132_ITERATIVE-NEXT: s_endpgm 1270; 1271; GFX7LESS_DPP-LABEL: add_i32_varying_nouse: 1272; GFX7LESS_DPP: ; %bb.0: ; %entry 1273; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 1274; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 1275; GFX7LESS_DPP-NEXT: ds_add_u32 v1, v0 1276; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) 1277; GFX7LESS_DPP-NEXT: s_endpgm 1278; 1279; GFX8_DPP-LABEL: add_i32_varying_nouse: 1280; GFX8_DPP: ; %bb.0: ; %entry 1281; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 1282; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 1283; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 1284; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 1285; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] 1286; GFX8_DPP-NEXT: s_nop 1 1287; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1288; GFX8_DPP-NEXT: s_nop 1 1289; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1290; GFX8_DPP-NEXT: s_nop 1 1291; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1292; GFX8_DPP-NEXT: s_nop 1 1293; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1294; GFX8_DPP-NEXT: s_nop 1 1295; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 1296; GFX8_DPP-NEXT: s_nop 1 1297; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 1298; GFX8_DPP-NEXT: v_readlane_b32 s2, v1, 63 1299; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] 1300; GFX8_DPP-NEXT: s_mov_b32 s0, s2 1301; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 1302; GFX8_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc 1303; GFX8_DPP-NEXT: s_cbranch_execz .LBB3_2 1304; GFX8_DPP-NEXT: ; %bb.1: 1305; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s0 1306; GFX8_DPP-NEXT: s_mov_b32 m0, -1 1307; GFX8_DPP-NEXT: ds_add_u32 v2, v0 1308; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) 1309; GFX8_DPP-NEXT: .LBB3_2: 1310; GFX8_DPP-NEXT: s_endpgm 1311; 1312; GFX9_DPP-LABEL: 
add_i32_varying_nouse: 1313; GFX9_DPP: ; %bb.0: ; %entry 1314; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 1315; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 1316; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 1317; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 1318; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] 1319; GFX9_DPP-NEXT: s_nop 1 1320; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1321; GFX9_DPP-NEXT: s_nop 1 1322; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1323; GFX9_DPP-NEXT: s_nop 1 1324; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1325; GFX9_DPP-NEXT: s_nop 1 1326; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1327; GFX9_DPP-NEXT: s_nop 1 1328; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 1329; GFX9_DPP-NEXT: s_nop 1 1330; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 1331; GFX9_DPP-NEXT: v_readlane_b32 s2, v1, 63 1332; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] 1333; GFX9_DPP-NEXT: s_mov_b32 s0, s2 1334; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 1335; GFX9_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc 1336; GFX9_DPP-NEXT: s_cbranch_execz .LBB3_2 1337; GFX9_DPP-NEXT: ; %bb.1: 1338; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s0 1339; GFX9_DPP-NEXT: ds_add_u32 v2, v0 1340; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) 1341; GFX9_DPP-NEXT: .LBB3_2: 1342; GFX9_DPP-NEXT: s_endpgm 1343; 1344; GFX1064_DPP-LABEL: add_i32_varying_nouse: 1345; GFX1064_DPP: ; %bb.0: ; %entry 1346; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 1347; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] 1348; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1349; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1350; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 
row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1351; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1352; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 1353; GFX1064_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 1354; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] 1355; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1356; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 1357; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 0 1358; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 32 1359; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] 1360; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0 1361; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 1362; GFX1064_DPP-NEXT: s_add_i32 s0, s2, s3 1363; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 1364; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc 1365; GFX1064_DPP-NEXT: s_cbranch_execz .LBB3_2 1366; GFX1064_DPP-NEXT: ; %bb.1: 1367; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, s0 1368; GFX1064_DPP-NEXT: ds_add_u32 v0, v3 1369; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) 1370; GFX1064_DPP-NEXT: buffer_gl0_inv 1371; GFX1064_DPP-NEXT: .LBB3_2: 1372; GFX1064_DPP-NEXT: s_endpgm 1373; 1374; GFX1032_DPP-LABEL: add_i32_varying_nouse: 1375; GFX1032_DPP: ; %bb.0: ; %entry 1376; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 1377; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 1378; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1379; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1380; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1381; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1382; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 1383; GFX1032_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 1384; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 1385; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 1386; GFX1032_DPP-NEXT: 
v_mov_b32_e32 v0, 0 1387; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v1 1388; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 1389; GFX1032_DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo 1390; GFX1032_DPP-NEXT: s_cbranch_execz .LBB3_2 1391; GFX1032_DPP-NEXT: ; %bb.1: 1392; GFX1032_DPP-NEXT: ds_add_u32 v0, v3 1393; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) 1394; GFX1032_DPP-NEXT: buffer_gl0_inv 1395; GFX1032_DPP-NEXT: .LBB3_2: 1396; GFX1032_DPP-NEXT: s_endpgm 1397; 1398; GFX1164_DPP-LABEL: add_i32_varying_nouse: 1399; GFX1164_DPP: ; %bb.0: ; %entry 1400; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1401; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 1402; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 1403; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] 1404; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1405; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1406; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1407; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1408; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1409; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1410; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1411; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 1412; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 1413; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 1414; GFX1164_DPP-NEXT: v_permlane64_b32 v2, v1 1415; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] 1416; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1417; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 1418; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe 1419; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 
1420; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] 1421; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) 1422; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 1423; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 1424; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1 1425; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], exec 1426; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) 1427; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 1428; GFX1164_DPP-NEXT: s_cbranch_execz .LBB3_2 1429; GFX1164_DPP-NEXT: ; %bb.1: 1430; GFX1164_DPP-NEXT: ds_add_u32 v0, v3 1431; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) 1432; GFX1164_DPP-NEXT: buffer_gl0_inv 1433; GFX1164_DPP-NEXT: .LBB3_2: 1434; GFX1164_DPP-NEXT: s_endpgm 1435; 1436; GFX1132_DPP-LABEL: add_i32_varying_nouse: 1437; GFX1132_DPP: ; %bb.0: ; %entry 1438; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1439; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 1440; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 1441; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 1442; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1443; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1444; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1445; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1446; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1447; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1448; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1449; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 1450; GFX1132_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 1451; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 1452; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) 
1453; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 1454; GFX1132_DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v1 1455; GFX1132_DPP-NEXT: s_mov_b32 s0, exec_lo 1456; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) 1457; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 1458; GFX1132_DPP-NEXT: s_cbranch_execz .LBB3_2 1459; GFX1132_DPP-NEXT: ; %bb.1: 1460; GFX1132_DPP-NEXT: ds_add_u32 v0, v3 1461; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) 1462; GFX1132_DPP-NEXT: buffer_gl0_inv 1463; GFX1132_DPP-NEXT: .LBB3_2: 1464; GFX1132_DPP-NEXT: s_endpgm 1465entry: 1466 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1467 %old = atomicrmw add ptr addrspace(3) @local_var32, i32 %lane acq_rel 1468 ret void 1469} 1470 1471define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { 1472; GFX7LESS-LABEL: add_i64_constant: 1473; GFX7LESS: ; %bb.0: ; %entry 1474; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 1475; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1476; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s3, v0 1477; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1478; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 1479; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc 1480; GFX7LESS-NEXT: s_cbranch_execz .LBB4_2 1481; GFX7LESS-NEXT: ; %bb.1: 1482; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1483; GFX7LESS-NEXT: s_mul_i32 s2, s2, 5 1484; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 1485; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2 1486; GFX7LESS-NEXT: s_mov_b32 m0, -1 1487; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1488; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1489; GFX7LESS-NEXT: .LBB4_2: 1490; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] 1491; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1492; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1493; GFX7LESS-NEXT: s_mov_b32 s2, -1 1494; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 1495; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v0 1496; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 1497; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2 1498; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 
1499; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s5, v0 1500; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc 1501; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1502; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1503; GFX7LESS-NEXT: s_endpgm 1504; 1505; GFX8-LABEL: add_i64_constant: 1506; GFX8: ; %bb.0: ; %entry 1507; GFX8-NEXT: s_mov_b64 s[2:3], exec 1508; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1509; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0 1510; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1511; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 1512; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc 1513; GFX8-NEXT: s_cbranch_execz .LBB4_2 1514; GFX8-NEXT: ; %bb.1: 1515; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1516; GFX8-NEXT: s_mul_i32 s2, s2, 5 1517; GFX8-NEXT: v_mov_b32_e32 v0, s2 1518; GFX8-NEXT: v_mov_b32_e32 v1, 0 1519; GFX8-NEXT: s_mov_b32 m0, -1 1520; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1521; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1522; GFX8-NEXT: .LBB4_2: 1523; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] 1524; GFX8-NEXT: v_readfirstlane_b32 s2, v1 1525; GFX8-NEXT: v_readfirstlane_b32 s3, v0 1526; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1527; GFX8-NEXT: v_mov_b32_e32 v0, s3 1528; GFX8-NEXT: v_mov_b32_e32 v1, s2 1529; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1] 1530; GFX8-NEXT: s_mov_b32 s3, 0xf000 1531; GFX8-NEXT: s_mov_b32 s2, -1 1532; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1533; GFX8-NEXT: s_nop 1 1534; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1535; GFX8-NEXT: s_endpgm 1536; 1537; GFX9-LABEL: add_i64_constant: 1538; GFX9: ; %bb.0: ; %entry 1539; GFX9-NEXT: s_mov_b64 s[2:3], exec 1540; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1541; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0 1542; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1543; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 1544; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 1545; GFX9-NEXT: s_cbranch_execz .LBB4_2 1546; GFX9-NEXT: ; %bb.1: 1547; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1548; GFX9-NEXT: s_mul_i32 s2, s2, 5 
1549; GFX9-NEXT: v_mov_b32_e32 v0, s2 1550; GFX9-NEXT: v_mov_b32_e32 v1, 0 1551; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1552; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1553; GFX9-NEXT: .LBB4_2: 1554; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 1555; GFX9-NEXT: v_readfirstlane_b32 s2, v1 1556; GFX9-NEXT: v_readfirstlane_b32 s3, v0 1557; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1558; GFX9-NEXT: v_mov_b32_e32 v0, s3 1559; GFX9-NEXT: v_mov_b32_e32 v1, s2 1560; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1] 1561; GFX9-NEXT: s_mov_b32 s3, 0xf000 1562; GFX9-NEXT: s_mov_b32 s2, -1 1563; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1564; GFX9-NEXT: s_nop 1 1565; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1566; GFX9-NEXT: s_endpgm 1567; 1568; GFX1064-LABEL: add_i64_constant: 1569; GFX1064: ; %bb.0: ; %entry 1570; GFX1064-NEXT: s_mov_b64 s[2:3], exec 1571; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1572; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0 1573; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 1574; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1575; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc 1576; GFX1064-NEXT: s_cbranch_execz .LBB4_2 1577; GFX1064-NEXT: ; %bb.1: 1578; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1579; GFX1064-NEXT: v_mov_b32_e32 v1, 0 1580; GFX1064-NEXT: s_mul_i32 s2, s2, 5 1581; GFX1064-NEXT: v_mov_b32_e32 v0, s2 1582; GFX1064-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1583; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1584; GFX1064-NEXT: buffer_gl0_inv 1585; GFX1064-NEXT: .LBB4_2: 1586; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1587; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] 1588; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1589; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 1590; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 1591; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, s[2:3] 1592; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1593; GFX1064-NEXT: s_mov_b32 s2, -1 1594; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1595; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 
1596; GFX1064-NEXT: s_endpgm 1597; 1598; GFX1032-LABEL: add_i64_constant: 1599; GFX1032: ; %bb.0: ; %entry 1600; GFX1032-NEXT: s_mov_b32 s1, exec_lo 1601; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 1602; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s1, 0 1603; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 1604; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 1605; GFX1032-NEXT: s_cbranch_execz .LBB4_2 1606; GFX1032-NEXT: ; %bb.1: 1607; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s1 1608; GFX1032-NEXT: v_mov_b32_e32 v1, 0 1609; GFX1032-NEXT: s_mul_i32 s1, s1, 5 1610; GFX1032-NEXT: v_mov_b32_e32 v0, s1 1611; GFX1032-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1612; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1613; GFX1032-NEXT: buffer_gl0_inv 1614; GFX1032-NEXT: .LBB4_2: 1615; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1616; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 1617; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1618; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 1619; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 1620; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v2, 5, s[2:3] 1621; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1622; GFX1032-NEXT: s_mov_b32 s2, -1 1623; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1624; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1625; GFX1032-NEXT: s_endpgm 1626; 1627; GFX1164-LABEL: add_i64_constant: 1628; GFX1164: ; %bb.0: ; %entry 1629; GFX1164-NEXT: s_mov_b64 s[2:3], exec 1630; GFX1164-NEXT: s_mov_b64 s[0:1], exec 1631; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1632; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1633; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0 1634; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 1635; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 1636; GFX1164-NEXT: s_cbranch_execz .LBB4_2 1637; GFX1164-NEXT: ; %bb.1: 1638; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1639; GFX1164-NEXT: v_mov_b32_e32 v1, 0 1640; GFX1164-NEXT: s_mul_i32 s2, s2, 5 1641; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1642; GFX1164-NEXT: v_mov_b32_e32 
v0, s2 1643; GFX1164-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1644; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 1645; GFX1164-NEXT: buffer_gl0_inv 1646; GFX1164-NEXT: .LBB4_2: 1647; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] 1648; GFX1164-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1649; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 1650; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 1651; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 1652; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3] 1653; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 1654; GFX1164-NEXT: s_mov_b32 s2, -1 1655; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 1656; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 1657; GFX1164-NEXT: s_endpgm 1658; 1659; GFX1132-LABEL: add_i64_constant: 1660; GFX1132: ; %bb.0: ; %entry 1661; GFX1132-NEXT: s_mov_b32 s1, exec_lo 1662; GFX1132-NEXT: s_mov_b32 s0, exec_lo 1663; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s1, 0 1664; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 1665; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 1666; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 1667; GFX1132-NEXT: s_cbranch_execz .LBB4_2 1668; GFX1132-NEXT: ; %bb.1: 1669; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s1 1670; GFX1132-NEXT: v_mov_b32_e32 v1, 0 1671; GFX1132-NEXT: s_mul_i32 s1, s1, 5 1672; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1673; GFX1132-NEXT: v_mov_b32_e32 v0, s1 1674; GFX1132-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1675; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 1676; GFX1132-NEXT: buffer_gl0_inv 1677; GFX1132-NEXT: .LBB4_2: 1678; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 1679; GFX1132-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1680; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 1681; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 1682; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 1683; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3] 1684; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 1685; GFX1132-NEXT: s_mov_b32 s2, -1 1686; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 1687; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 1688; 
GFX1132-NEXT: s_endpgm 1689entry: 1690 %old = atomicrmw add ptr addrspace(3) @local_var64, i64 5 acq_rel 1691 store i64 %old, ptr addrspace(1) %out 1692 ret void 1693} 1694 1695define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) { 1696; GFX7LESS-LABEL: add_i64_uniform: 1697; GFX7LESS: ; %bb.0: ; %entry 1698; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 1699; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1700; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 1701; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 1702; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1703; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 1704; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 1705; GFX7LESS-NEXT: s_cbranch_execz .LBB5_2 1706; GFX7LESS-NEXT: ; %bb.1: 1707; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1708; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0 1709; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1710; GFX7LESS-NEXT: s_mul_i32 s7, s3, s6 1711; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 1712; GFX7LESS-NEXT: v_mul_hi_u32 v0, s2, v0 1713; GFX7LESS-NEXT: s_mul_i32 s6, s2, s6 1714; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, s7, v0 1715; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 1716; GFX7LESS-NEXT: s_mov_b32 m0, -1 1717; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] 1718; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1719; GFX7LESS-NEXT: .LBB5_2: 1720; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 1721; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 1722; GFX7LESS-NEXT: s_mov_b32 s6, -1 1723; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1724; GFX7LESS-NEXT: s_mov_b32 s4, s0 1725; GFX7LESS-NEXT: s_mov_b32 s5, s1 1726; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 1727; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v0 1728; GFX7LESS-NEXT: v_mul_lo_u32 v0, s3, v2 1729; GFX7LESS-NEXT: v_mul_hi_u32 v1, s2, v2 1730; GFX7LESS-NEXT: v_mul_lo_u32 v2, s2, v2 1731; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v1, v0 1732; GFX7LESS-NEXT: v_mov_b32_e32 v3, s0 1733; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s1, v2 1734; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v3, 
v1, vcc 1735; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1736; GFX7LESS-NEXT: s_endpgm 1737; 1738; GFX8-LABEL: add_i64_uniform: 1739; GFX8: ; %bb.0: ; %entry 1740; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1741; GFX8-NEXT: s_mov_b64 s[6:7], exec 1742; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1743; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 1744; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1745; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 1746; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 1747; GFX8-NEXT: s_cbranch_execz .LBB5_2 1748; GFX8-NEXT: ; %bb.1: 1749; GFX8-NEXT: s_bcnt1_i32_b64 s8, s[6:7] 1750; GFX8-NEXT: v_mov_b32_e32 v0, s8 1751; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1752; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s2, v0, 0 1753; GFX8-NEXT: s_mul_i32 s6, s3, s8 1754; GFX8-NEXT: v_mov_b32_e32 v3, 0 1755; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1 1756; GFX8-NEXT: s_mov_b32 m0, -1 1757; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] 1758; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1759; GFX8-NEXT: .LBB5_2: 1760; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1761; GFX8-NEXT: v_readfirstlane_b32 s4, v1 1762; GFX8-NEXT: v_readfirstlane_b32 s5, v0 1763; GFX8-NEXT: v_mov_b32_e32 v0, s5 1764; GFX8-NEXT: v_mov_b32_e32 v1, s4 1765; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1766; GFX8-NEXT: v_mul_lo_u32 v3, s3, v2 1767; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s2, v2, v[0:1] 1768; GFX8-NEXT: s_mov_b32 s7, 0xf000 1769; GFX8-NEXT: s_mov_b32 s6, -1 1770; GFX8-NEXT: s_mov_b32 s4, s0 1771; GFX8-NEXT: s_mov_b32 s5, s1 1772; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 1773; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1774; GFX8-NEXT: s_endpgm 1775; 1776; GFX9-LABEL: add_i64_uniform: 1777; GFX9: ; %bb.0: ; %entry 1778; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1779; GFX9-NEXT: s_mov_b64 s[6:7], exec 1780; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1781; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 1782; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1783; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 1784; 
GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 1785; GFX9-NEXT: s_cbranch_execz .LBB5_2 1786; GFX9-NEXT: ; %bb.1: 1787; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1788; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1789; GFX9-NEXT: s_mul_i32 s7, s3, s6 1790; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 1791; GFX9-NEXT: s_add_i32 s8, s8, s7 1792; GFX9-NEXT: s_mul_i32 s6, s2, s6 1793; GFX9-NEXT: v_mov_b32_e32 v0, s6 1794; GFX9-NEXT: v_mov_b32_e32 v1, s8 1795; GFX9-NEXT: v_mov_b32_e32 v3, 0 1796; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] 1797; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1798; GFX9-NEXT: .LBB5_2: 1799; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 1800; GFX9-NEXT: v_readfirstlane_b32 s4, v1 1801; GFX9-NEXT: v_readfirstlane_b32 s5, v0 1802; GFX9-NEXT: v_mov_b32_e32 v0, s5 1803; GFX9-NEXT: v_mov_b32_e32 v1, s4 1804; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1805; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s2, v2, v[0:1] 1806; GFX9-NEXT: s_mov_b32 s7, 0xf000 1807; GFX9-NEXT: s_mov_b32 s6, -1 1808; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[2:3], s3, v2, v[1:2] 1809; GFX9-NEXT: s_mov_b32 s4, s0 1810; GFX9-NEXT: s_mov_b32 s5, s1 1811; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1812; GFX9-NEXT: s_endpgm 1813; 1814; GFX1064-LABEL: add_i64_uniform: 1815; GFX1064: ; %bb.0: ; %entry 1816; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1817; GFX1064-NEXT: s_mov_b64 s[6:7], exec 1818; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1819; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 1820; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 1821; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1822; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 1823; GFX1064-NEXT: s_cbranch_execz .LBB5_2 1824; GFX1064-NEXT: ; %bb.1: 1825; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1826; GFX1064-NEXT: v_mov_b32_e32 v3, 0 1827; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1828; GFX1064-NEXT: s_mul_i32 s7, s3, s6 1829; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6 1830; GFX1064-NEXT: s_mul_i32 s6, s2, s6 1831; GFX1064-NEXT: s_add_i32 s8, s8, s7 1832; GFX1064-NEXT: v_mov_b32_e32 
v0, s6 1833; GFX1064-NEXT: v_mov_b32_e32 v1, s8 1834; GFX1064-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] 1835; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1836; GFX1064-NEXT: buffer_gl0_inv 1837; GFX1064-NEXT: .LBB5_2: 1838; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1839; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 1840; GFX1064-NEXT: v_readfirstlane_b32 s5, v1 1841; GFX1064-NEXT: v_readfirstlane_b32 s4, v0 1842; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1843; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s2, v2, s[4:5] 1844; GFX1064-NEXT: v_mad_u64_u32 v[1:2], s[2:3], s3, v2, v[1:2] 1845; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1846; GFX1064-NEXT: s_mov_b32 s2, -1 1847; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1848; GFX1064-NEXT: s_endpgm 1849; 1850; GFX1032-LABEL: add_i64_uniform: 1851; GFX1032: ; %bb.0: ; %entry 1852; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1853; GFX1032-NEXT: s_mov_b32 s6, exec_lo 1854; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 1855; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s6, 0 1856; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 1857; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 1858; GFX1032-NEXT: s_cbranch_execz .LBB5_2 1859; GFX1032-NEXT: ; %bb.1: 1860; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s6 1861; GFX1032-NEXT: v_mov_b32_e32 v3, 0 1862; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1863; GFX1032-NEXT: s_mul_i32 s6, s3, s5 1864; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5 1865; GFX1032-NEXT: s_mul_i32 s5, s2, s5 1866; GFX1032-NEXT: s_add_i32 s7, s7, s6 1867; GFX1032-NEXT: v_mov_b32_e32 v0, s5 1868; GFX1032-NEXT: v_mov_b32_e32 v1, s7 1869; GFX1032-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] 1870; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1871; GFX1032-NEXT: buffer_gl0_inv 1872; GFX1032-NEXT: .LBB5_2: 1873; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1874; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 1875; GFX1032-NEXT: v_readfirstlane_b32 s5, v1 1876; GFX1032-NEXT: v_readfirstlane_b32 s4, v0 1877; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1878; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, s2, 
v2, s[4:5] 1879; GFX1032-NEXT: v_mad_u64_u32 v[1:2], s2, s3, v2, v[1:2] 1880; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1881; GFX1032-NEXT: s_mov_b32 s2, -1 1882; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1883; GFX1032-NEXT: s_endpgm 1884; 1885; GFX1164-LABEL: add_i64_uniform: 1886; GFX1164: ; %bb.0: ; %entry 1887; GFX1164-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1888; GFX1164-NEXT: s_mov_b64 s[6:7], exec 1889; GFX1164-NEXT: s_mov_b64 s[4:5], exec 1890; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1891; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1892; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 1893; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 1894; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 1895; GFX1164-NEXT: s_cbranch_execz .LBB5_2 1896; GFX1164-NEXT: ; %bb.1: 1897; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1898; GFX1164-NEXT: v_mov_b32_e32 v3, 0 1899; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 1900; GFX1164-NEXT: s_mul_i32 s7, s3, s6 1901; GFX1164-NEXT: s_mul_hi_u32 s8, s2, s6 1902; GFX1164-NEXT: s_mul_i32 s6, s2, s6 1903; GFX1164-NEXT: s_add_i32 s8, s8, s7 1904; GFX1164-NEXT: v_mov_b32_e32 v0, s6 1905; GFX1164-NEXT: v_mov_b32_e32 v1, s8 1906; GFX1164-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] 1907; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 1908; GFX1164-NEXT: buffer_gl0_inv 1909; GFX1164-NEXT: .LBB5_2: 1910; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 1911; GFX1164-NEXT: v_readfirstlane_b32 s5, v1 1912; GFX1164-NEXT: v_readfirstlane_b32 s4, v0 1913; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 1914; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) 1915; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5] 1916; GFX1164-NEXT: s_mov_b32 s2, -1 1917; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2] 1918; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 1919; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 1920; GFX1164-NEXT: v_mov_b32_e32 v1, v3 1921; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 1922; 
GFX1164-NEXT: s_endpgm 1923; 1924; GFX1132-LABEL: add_i64_uniform: 1925; GFX1132: ; %bb.0: ; %entry 1926; GFX1132-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1927; GFX1132-NEXT: s_mov_b32 s6, exec_lo 1928; GFX1132-NEXT: s_mov_b32 s4, exec_lo 1929; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s6, 0 1930; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 1931; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 1932; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 1933; GFX1132-NEXT: s_cbranch_execz .LBB5_2 1934; GFX1132-NEXT: ; %bb.1: 1935; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s6 1936; GFX1132-NEXT: v_mov_b32_e32 v3, 0 1937; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 1938; GFX1132-NEXT: s_mul_i32 s6, s3, s5 1939; GFX1132-NEXT: s_mul_hi_u32 s7, s2, s5 1940; GFX1132-NEXT: s_mul_i32 s5, s2, s5 1941; GFX1132-NEXT: s_add_i32 s7, s7, s6 1942; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1943; GFX1132-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s7 1944; GFX1132-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] 1945; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 1946; GFX1132-NEXT: buffer_gl0_inv 1947; GFX1132-NEXT: .LBB5_2: 1948; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4 1949; GFX1132-NEXT: v_readfirstlane_b32 s5, v1 1950; GFX1132-NEXT: v_readfirstlane_b32 s4, v0 1951; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 1952; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) 1953; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5] 1954; GFX1132-NEXT: s_mov_b32 s2, -1 1955; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2] 1956; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 1957; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 1958; GFX1132-NEXT: v_mov_b32_e32 v1, v3 1959; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 1960; GFX1132-NEXT: s_endpgm 1961entry: 1962 %old = atomicrmw add ptr addrspace(3) @local_var64, i64 %additive acq_rel 1963 store i64 %old, ptr addrspace(1) %out 1964 ret void 1965} 1966 1967define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { 1968; 
GFX7LESS_ITERATIVE-LABEL: add_i64_varying: 1969; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry 1970; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec 1971; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 1972; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 1973; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 1974; GFX7LESS_ITERATIVE-NEXT: .LBB6_1: ; %ComputeLoop 1975; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 1976; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3] 1977; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s6 1978; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6 1979; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6 1980; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 1981; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 1982; GFX7LESS_ITERATIVE-NEXT: s_add_u32 s0, s0, s8 1983; GFX7LESS_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 1984; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 1985; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] 1986; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0 1987; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] 1988; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB6_1 1989; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 1990; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 1991; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 1992; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1993; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 1994; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc 1995; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] 1996; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB6_4 1997; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: 1998; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 1999; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 2000; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 2001; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 2002; GFX7LESS_ITERATIVE-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4] 2003; GFX7LESS_ITERATIVE-NEXT: 
s_waitcnt lgkmcnt(0) 2004; GFX7LESS_ITERATIVE-NEXT: .LBB6_4: 2005; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] 2006; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 2007; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 2008; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 2009; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 2010; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 2011; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 2012; GFX7LESS_ITERATIVE-NEXT: v_add_i32_e32 v0, vcc, s5, v1 2013; GFX7LESS_ITERATIVE-NEXT: v_addc_u32_e32 v1, vcc, v3, v2, vcc 2014; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 2015; GFX7LESS_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2016; GFX7LESS_ITERATIVE-NEXT: s_endpgm 2017; 2018; GFX8_ITERATIVE-LABEL: add_i64_varying: 2019; GFX8_ITERATIVE: ; %bb.0: ; %entry 2020; GFX8_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec 2021; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 2022; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 2023; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 2024; GFX8_ITERATIVE-NEXT: .LBB6_1: ; %ComputeLoop 2025; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 2026; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3] 2027; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s6 2028; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6 2029; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6 2030; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 2031; GFX8_ITERATIVE-NEXT: s_add_u32 s0, s0, s8 2032; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 2033; GFX8_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 2034; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 2035; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] 2036; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 2037; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1 2038; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 2039; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2040; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2041; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, 
v0 2042; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 2043; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc 2044; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] 2045; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB6_4 2046; GFX8_ITERATIVE-NEXT: ; %bb.3: 2047; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 2048; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 2049; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 2050; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 2051; GFX8_ITERATIVE-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4] 2052; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 2053; GFX8_ITERATIVE-NEXT: .LBB6_4: 2054; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] 2055; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 2056; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 2057; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 2058; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 2059; GFX8_ITERATIVE-NEXT: v_add_u32_e32 v0, vcc, s5, v1 2060; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 2061; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 2062; GFX8_ITERATIVE-NEXT: v_addc_u32_e32 v1, vcc, v3, v2, vcc 2063; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 2064; GFX8_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2065; GFX8_ITERATIVE-NEXT: s_endpgm 2066; 2067; GFX9_ITERATIVE-LABEL: add_i64_varying: 2068; GFX9_ITERATIVE: ; %bb.0: ; %entry 2069; GFX9_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec 2070; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 2071; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 2072; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 2073; GFX9_ITERATIVE-NEXT: .LBB6_1: ; %ComputeLoop 2074; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 2075; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3] 2076; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s6 2077; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6 2078; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6 2079; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 2080; GFX9_ITERATIVE-NEXT: s_add_u32 s0, s0, s8 2081; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 
2082; GFX9_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 2083; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 2084; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] 2085; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 2086; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1 2087; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 2088; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2089; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2090; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2091; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 2092; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc 2093; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] 2094; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB6_4 2095; GFX9_ITERATIVE-NEXT: ; %bb.3: 2096; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 2097; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 2098; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 2099; GFX9_ITERATIVE-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4] 2100; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 2101; GFX9_ITERATIVE-NEXT: .LBB6_4: 2102; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] 2103; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 2104; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 2105; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 2106; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 2107; GFX9_ITERATIVE-NEXT: v_add_co_u32_e32 v0, vcc, s5, v1 2108; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 2109; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 2110; GFX9_ITERATIVE-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v2, vcc 2111; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 2112; GFX9_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2113; GFX9_ITERATIVE-NEXT: s_endpgm 2114; 2115; GFX1064_ITERATIVE-LABEL: add_i64_varying: 2116; GFX1064_ITERATIVE: ; %bb.0: ; %entry 2117; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 2118; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec 2119; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 2120; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 2121; 
GFX1064_ITERATIVE-NEXT: .LBB6_1: ; %ComputeLoop 2122; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 2123; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3] 2124; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6 2125; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 2126; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s6 2127; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s6 2128; GFX1064_ITERATIVE-NEXT: s_add_u32 s0, s0, s7 2129; GFX1064_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8 2130; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 2131; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] 2132; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 2133; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1 2134; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 2135; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2136; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 2137; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2138; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2139; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc 2140; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] 2141; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB6_4 2142; GFX1064_ITERATIVE-NEXT: ; %bb.3: 2143; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 2144; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 2145; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 2146; GFX1064_ITERATIVE-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4] 2147; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 2148; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv 2149; GFX1064_ITERATIVE-NEXT: .LBB6_4: 2150; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 2151; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] 2152; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 2153; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 2154; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 2155; GFX1064_ITERATIVE-NEXT: v_add_co_u32 v0, vcc, s2, v1 2156; GFX1064_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc, 
s3, v2, vcc 2157; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 2158; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 2159; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 2160; GFX1064_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2161; GFX1064_ITERATIVE-NEXT: s_endpgm 2162; 2163; GFX1032_ITERATIVE-LABEL: add_i64_varying: 2164; GFX1032_ITERATIVE: ; %bb.0: ; %entry 2165; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 2166; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo 2167; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 2168; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 2169; GFX1032_ITERATIVE-NEXT: .LBB6_1: ; %ComputeLoop 2170; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 2171; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 2172; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 2173; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 2174; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3 2175; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3 2176; GFX1032_ITERATIVE-NEXT: s_add_u32 s0, s0, s6 2177; GFX1032_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 2178; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 2179; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 2180; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 2181; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1 2182; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 2183; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2184; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 2185; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2186; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s2, vcc_lo 2187; GFX1032_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 2188; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB6_4 2189; GFX1032_ITERATIVE-NEXT: ; %bb.3: 2190; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 2191; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 2192; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 2193; GFX1032_ITERATIVE-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4] 2194; GFX1032_ITERATIVE-NEXT: s_waitcnt 
lgkmcnt(0) 2195; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv 2196; GFX1032_ITERATIVE-NEXT: .LBB6_4: 2197; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 2198; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s2 2199; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 2200; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 2201; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 2202; GFX1032_ITERATIVE-NEXT: v_add_co_u32 v0, vcc_lo, s2, v1 2203; GFX1032_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo 2204; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 2205; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 2206; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 2207; GFX1032_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2208; GFX1032_ITERATIVE-NEXT: s_endpgm 2209; 2210; GFX1164_ITERATIVE-LABEL: add_i64_varying: 2211; GFX1164_ITERATIVE: ; %bb.0: ; %entry 2212; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 2213; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 2214; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec 2215; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 2216; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 2217; GFX1164_ITERATIVE-NEXT: .LBB6_1: ; %ComputeLoop 2218; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 2219; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s6, s[2:3] 2220; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) 2221; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s6 2222; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 2223; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6 2224; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s6 2225; GFX1164_ITERATIVE-NEXT: s_add_u32 s0, s0, s7 2226; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 2227; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8 2228; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 2229; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] 
2230; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2231; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 2232; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1 2233; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 2234; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 2235; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2236; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 2237; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2238; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 2239; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc 2240; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2241; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] 2242; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB6_4 2243; GFX1164_ITERATIVE-NEXT: ; %bb.3: 2244; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 2245; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0 2246; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 2247; GFX1164_ITERATIVE-NEXT: ds_add_rtn_u64 v[2:3], v4, v[2:3] 2248; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 2249; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv 2250; GFX1164_ITERATIVE-NEXT: .LBB6_4: 2251; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] 2252; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 2253; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 2254; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 2255; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 2256; GFX1164_ITERATIVE-NEXT: v_add_co_u32 v0, vcc, s2, v0 2257; GFX1164_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc 2258; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 2259; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 2260; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 2261; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 2262; GFX1164_ITERATIVE-NEXT: s_endpgm 2263; 2264; GFX1132_ITERATIVE-LABEL: add_i64_varying: 2265; 
GFX1132_ITERATIVE: ; %bb.0: ; %entry 2266; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 2267; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo 2268; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 2269; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 2270; GFX1132_ITERATIVE-NEXT: .LBB6_1: ; %ComputeLoop 2271; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 2272; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2 2273; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 2274; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 2275; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 2276; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 2277; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 2278; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) 2279; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s6 2280; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 2281; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 2282; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 2283; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 2284; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 2285; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1 2286; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 2287; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 2288; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 2289; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 2290; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 2291; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s2, vcc_lo 2292; GFX1132_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 2293; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB6_4 2294; GFX1132_ITERATIVE-NEXT: ; %bb.3: 2295; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1 2296; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 2297; GFX1132_ITERATIVE-NEXT: 
ds_add_rtn_u64 v[2:3], v4, v[2:3] 2298; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 2299; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv 2300; GFX1132_ITERATIVE-NEXT: .LBB6_4: 2301; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s2 2302; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 2303; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 2304; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 2305; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 2306; GFX1132_ITERATIVE-NEXT: v_add_co_u32 v0, vcc_lo, s2, v0 2307; GFX1132_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo 2308; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 2309; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 2310; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 2311; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 2312; GFX1132_ITERATIVE-NEXT: s_endpgm 2313; 2314; GFX7LESS_DPP-LABEL: add_i64_varying: 2315; GFX7LESS_DPP: ; %bb.0: ; %entry 2316; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 2317; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 2318; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 2319; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) 2320; GFX7LESS_DPP-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 2321; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) 2322; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000 2323; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1 2324; GFX7LESS_DPP-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2325; GFX7LESS_DPP-NEXT: s_endpgm 2326; 2327; GFX8_DPP-LABEL: add_i64_varying: 2328; GFX8_DPP: ; %bb.0: ; %entry 2329; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v5, exec_lo, 0 2330; GFX8_DPP-NEXT: v_mov_b32_e32 v7, 0 2331; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v5, exec_hi, v5 2332; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 2333; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] 2334; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 2335; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] 2336; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 2337; GFX8_DPP-NEXT: v_mov_b32_dpp v4, 
v2 row_shr:1 row_mask:0xf bank_mask:0xf 2338; GFX8_DPP-NEXT: v_add_u32_e32 v2, vcc, v2, v4 2339; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2340; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 2341; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 2342; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 2343; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:2 row_mask:0xf bank_mask:0xf 2344; GFX8_DPP-NEXT: v_add_u32_e32 v2, vcc, v2, v4 2345; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:2 row_mask:0xf bank_mask:0xf 2346; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 2347; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 2348; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 2349; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf 2350; GFX8_DPP-NEXT: v_add_u32_e32 v2, vcc, v2, v4 2351; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf 2352; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 2353; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 2354; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 2355; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:8 row_mask:0xf bank_mask:0xf 2356; GFX8_DPP-NEXT: v_add_u32_e32 v2, vcc, v2, v4 2357; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:8 row_mask:0xf bank_mask:0xf 2358; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 2359; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 2360; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 2361; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2362; GFX8_DPP-NEXT: v_add_u32_e32 v2, vcc, v2, v4 2363; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 2364; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 2365; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 2366; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 2367; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2368; GFX8_DPP-NEXT: v_add_u32_e32 v2, vcc, v2, v4 2369; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 2370; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 2371; GFX8_DPP-NEXT: 
v_mov_b32_e32 v4, 0 2372; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 2373; GFX8_DPP-NEXT: v_readlane_b32 s3, v1, 63 2374; GFX8_DPP-NEXT: v_readlane_b32 s2, v2, 63 2375; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v1 wave_shr:1 row_mask:0xf bank_mask:0xf 2376; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2377; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] 2378; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 2379; GFX8_DPP-NEXT: ; implicit-def: $vgpr5_vgpr6 2380; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc 2381; GFX8_DPP-NEXT: s_cbranch_execz .LBB6_2 2382; GFX8_DPP-NEXT: ; %bb.1: 2383; GFX8_DPP-NEXT: v_mov_b32_e32 v6, s3 2384; GFX8_DPP-NEXT: v_mov_b32_e32 v5, s2 2385; GFX8_DPP-NEXT: s_mov_b32 m0, -1 2386; GFX8_DPP-NEXT: ds_add_rtn_u64 v[5:6], v7, v[5:6] 2387; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) 2388; GFX8_DPP-NEXT: .LBB6_2: 2389; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] 2390; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 2391; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v6 2392; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v5 2393; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 2394; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 2395; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4 2396; GFX8_DPP-NEXT: v_add_u32_e32 v5, vcc, s5, v5 2397; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 2398; GFX8_DPP-NEXT: s_mov_b32 s2, -1 2399; GFX8_DPP-NEXT: v_addc_u32_e32 v6, vcc, v0, v6, vcc 2400; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) 2401; GFX8_DPP-NEXT: buffer_store_dwordx2 v[5:6], off, s[0:3], 0 2402; GFX8_DPP-NEXT: s_endpgm 2403; 2404; GFX9_DPP-LABEL: add_i64_varying: 2405; GFX9_DPP: ; %bb.0: ; %entry 2406; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v5, exec_lo, 0 2407; GFX9_DPP-NEXT: v_mov_b32_e32 v7, 0 2408; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v5, exec_hi, v5 2409; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 2410; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] 2411; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 2412; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] 2413; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 2414; GFX9_DPP-NEXT: v_mov_b32_dpp 
v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2415; GFX9_DPP-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 2416; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2417; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 2418; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 2419; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 2420; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:2 row_mask:0xf bank_mask:0xf 2421; GFX9_DPP-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 2422; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:2 row_mask:0xf bank_mask:0xf 2423; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 2424; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 2425; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 2426; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf 2427; GFX9_DPP-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 2428; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf 2429; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 2430; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 2431; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 2432; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:8 row_mask:0xf bank_mask:0xf 2433; GFX9_DPP-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 2434; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:8 row_mask:0xf bank_mask:0xf 2435; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 2436; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 2437; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 2438; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2439; GFX9_DPP-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 2440; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 2441; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 2442; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 2443; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 2444; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2445; GFX9_DPP-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 2446; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 2447; GFX9_DPP-NEXT: v_addc_co_u32_e32 
v1, vcc, v1, v3, vcc 2448; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 2449; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 2450; GFX9_DPP-NEXT: v_readlane_b32 s3, v1, 63 2451; GFX9_DPP-NEXT: v_readlane_b32 s2, v2, 63 2452; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v1 wave_shr:1 row_mask:0xf bank_mask:0xf 2453; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2454; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] 2455; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 2456; GFX9_DPP-NEXT: ; implicit-def: $vgpr5_vgpr6 2457; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc 2458; GFX9_DPP-NEXT: s_cbranch_execz .LBB6_2 2459; GFX9_DPP-NEXT: ; %bb.1: 2460; GFX9_DPP-NEXT: v_mov_b32_e32 v6, s3 2461; GFX9_DPP-NEXT: v_mov_b32_e32 v5, s2 2462; GFX9_DPP-NEXT: ds_add_rtn_u64 v[5:6], v7, v[5:6] 2463; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) 2464; GFX9_DPP-NEXT: .LBB6_2: 2465; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] 2466; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 2467; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v6 2468; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v5 2469; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 2470; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 2471; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4 2472; GFX9_DPP-NEXT: v_add_co_u32_e32 v5, vcc, s5, v5 2473; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 2474; GFX9_DPP-NEXT: s_mov_b32 s2, -1 2475; GFX9_DPP-NEXT: v_addc_co_u32_e32 v6, vcc, v0, v6, vcc 2476; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) 2477; GFX9_DPP-NEXT: buffer_store_dwordx2 v[5:6], off, s[0:3], 0 2478; GFX9_DPP-NEXT: s_endpgm 2479; 2480; GFX1064_DPP-LABEL: add_i64_varying: 2481; GFX1064_DPP: ; %bb.0: ; %entry 2482; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 2483; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 2484; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] 2485; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s[0:1] 2486; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 2487; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, 0 2488; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 2489; GFX1064_DPP-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf 
bank_mask:0xf 2490; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, 0 2491; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf 2492; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, 0 2493; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v2, v1 2494; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v3, v4, vcc 2495; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 2496; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf 2497; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 2498; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:2 row_mask:0xf bank_mask:0xf 2499; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v6 2500; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v2, v5, vcc 2501; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, 0 2502; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:4 row_mask:0xf bank_mask:0xf 2503; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 2504; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:4 row_mask:0xf bank_mask:0xf 2505; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v4 2506; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v2, v3, vcc 2507; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 2508; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf 2509; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:8 row_mask:0xf bank_mask:0xf 2510; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v6 2511; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v2, v5, vcc 2512; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 2513; GFX1064_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1 2514; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v2, -1, -1 2515; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2516; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2517; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v3 2518; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v2, v5, vcc 2519; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 2520; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 31 2521; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 2522; GFX1064_DPP-NEXT: 
v_readlane_b32 s3, v2, 31 2523; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s2 2524; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, s3 2525; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 2526; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 2527; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v3 2528; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v2, v5, vcc 2529; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] 2530; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2531; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 2532; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2533; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2534; GFX1064_DPP-NEXT: v_readlane_b32 s2, v2, 15 2535; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 15 2536; GFX1064_DPP-NEXT: v_readlane_b32 s6, v2, 31 2537; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 31 2538; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 47 2539; GFX1064_DPP-NEXT: v_writelane_b32 v8, s2, 16 2540; GFX1064_DPP-NEXT: v_writelane_b32 v7, s3, 16 2541; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 63 2542; GFX1064_DPP-NEXT: v_readlane_b32 s9, v2, 47 2543; GFX1064_DPP-NEXT: v_readlane_b32 s3, v2, 63 2544; GFX1064_DPP-NEXT: v_writelane_b32 v8, s6, 32 2545; GFX1064_DPP-NEXT: v_writelane_b32 v7, s7, 32 2546; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] 2547; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v9, exec_hi, v0 2548; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 2549; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 2550; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] 2551; GFX1064_DPP-NEXT: v_writelane_b32 v8, s9, 48 2552; GFX1064_DPP-NEXT: v_writelane_b32 v7, s8, 48 2553; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] 2554; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 2555; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 2556; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 2557; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc 2558; GFX1064_DPP-NEXT: s_cbranch_execz .LBB6_2 
2559; GFX1064_DPP-NEXT: ; %bb.1: 2560; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1 2561; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s0 2562; GFX1064_DPP-NEXT: ds_add_rtn_u64 v[9:10], v0, v[9:10] 2563; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) 2564; GFX1064_DPP-NEXT: buffer_gl0_inv 2565; GFX1064_DPP-NEXT: .LBB6_2: 2566; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 2567; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[6:7] 2568; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 2569; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v9 2570; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, v7 2571; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, v8 2572; GFX1064_DPP-NEXT: s_mov_b32 null, 0 2573; GFX1064_DPP-NEXT: v_readfirstlane_b32 s4, v10 2574; GFX1064_DPP-NEXT: v_add_co_u32 v9, vcc, s3, v11 2575; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 2576; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v10, vcc, s4, v12, vcc 2577; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) 2578; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0 2579; GFX1064_DPP-NEXT: s_endpgm 2580; 2581; GFX1032_DPP-LABEL: add_i64_varying: 2582; GFX1032_DPP: ; %bb.0: ; %entry 2583; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 2584; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 2585; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s2 2586; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s2 2587; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 2588; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, 0 2589; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0 2590; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2591; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, 0 2592; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf 2593; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, 0 2594; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v2, v1 2595; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v3, v4, vcc_lo 2596; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 2597; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf 2598; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 2599; 
GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:2 row_mask:0xf bank_mask:0xf 2600; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v6 2601; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo 2602; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, 0 2603; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:4 row_mask:0xf bank_mask:0xf 2604; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0 2605; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:4 row_mask:0xf bank_mask:0xf 2606; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v4 2607; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v3, vcc_lo 2608; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 2609; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf 2610; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:8 row_mask:0xf bank_mask:0xf 2611; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v6 2612; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo 2613; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0 2614; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1 2615; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v2, -1, -1 2616; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2617; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2618; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3 2619; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo 2620; GFX1032_DPP-NEXT: v_readlane_b32 s3, v1, 15 2621; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31 2622; GFX1032_DPP-NEXT: v_readlane_b32 s1, v2, 31 2623; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2624; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2625; GFX1032_DPP-NEXT: v_readlane_b32 s6, v2, 15 2626; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 2627; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v9, exec_lo, 0 2628; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 2629; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 2630; GFX1032_DPP-NEXT: v_writelane_b32 v8, s6, 16 
2631; GFX1032_DPP-NEXT: v_writelane_b32 v7, s3, 16 2632; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 2633; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 2634; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9 2635; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 2636; GFX1032_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo 2637; GFX1032_DPP-NEXT: s_cbranch_execz .LBB6_2 2638; GFX1032_DPP-NEXT: ; %bb.1: 2639; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s1 2640; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s0 2641; GFX1032_DPP-NEXT: ds_add_rtn_u64 v[9:10], v0, v[9:10] 2642; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) 2643; GFX1032_DPP-NEXT: buffer_gl0_inv 2644; GFX1032_DPP-NEXT: .LBB6_2: 2645; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 2646; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 2647; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 2648; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v9 2649; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, v7 2650; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, v8 2651; GFX1032_DPP-NEXT: s_mov_b32 null, 0 2652; GFX1032_DPP-NEXT: v_readfirstlane_b32 s4, v10 2653; GFX1032_DPP-NEXT: v_add_co_u32 v9, vcc_lo, s3, v11 2654; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 2655; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, s4, v12, vcc_lo 2656; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) 2657; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0 2658; GFX1032_DPP-NEXT: s_endpgm 2659; 2660; GFX1164_DPP-LABEL: add_i64_varying: 2661; GFX1164_DPP: ; %bb.0: ; %entry 2662; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 2663; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 2664; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) 2665; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] 2666; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 2667; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] 2668; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 2669; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, 0 2670; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | 
instid1(VALU_DEP_4) 2671; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2672; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2673; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 2674; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc 2675; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2676; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 2677; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 2678; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf 2679; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc 2680; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) 2681; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2682; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 2683; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf 2684; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) 2685; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc 2686; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2687; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 2688; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 2689; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf 2690; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 2691; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 2692; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc 2693; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2694; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 2695; GFX1164_DPP-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 2696; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1 2697; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2698; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 2699; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc 2700; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 2701; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 31 2702; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 2703; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s2 2704; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 31 2705; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 2706; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 2707; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v2, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 2708; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc 2709; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] 2710; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2711; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2712; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 2713; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2714; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 15 2715; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2716; GFX1164_DPP-NEXT: v_readlane_b32 s3, v1, 15 2717; GFX1164_DPP-NEXT: v_readlane_b32 s6, v2, 31 2718; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 31 2719; GFX1164_DPP-NEXT: v_writelane_b32 v5, s2, 16 2720; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 63 2721; GFX1164_DPP-NEXT: v_writelane_b32 v6, s3, 16 2722; GFX1164_DPP-NEXT: v_readlane_b32 s8, v2, 47 2723; GFX1164_DPP-NEXT: v_readlane_b32 s9, v1, 47 2724; GFX1164_DPP-NEXT: v_readlane_b32 s3, v1, 63 2725; GFX1164_DPP-NEXT: v_writelane_b32 v5, s6, 32 2726; GFX1164_DPP-NEXT: 
v_writelane_b32 v6, s7, 32 2727; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] 2728; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2729; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 2730; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 2731; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 2732; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] 2733; GFX1164_DPP-NEXT: v_writelane_b32 v5, s8, 48 2734; GFX1164_DPP-NEXT: v_writelane_b32 v6, s9, 48 2735; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] 2736; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 2737; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 2738; GFX1164_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 2739; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc 2740; GFX1164_DPP-NEXT: s_cbranch_execz .LBB6_2 2741; GFX1164_DPP-NEXT: ; %bb.1: 2742; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s1 2743; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s0 2744; GFX1164_DPP-NEXT: ds_add_rtn_u64 v[7:8], v0, v[7:8] 2745; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) 2746; GFX1164_DPP-NEXT: buffer_gl0_inv 2747; GFX1164_DPP-NEXT: .LBB6_2: 2748; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[6:7] 2749; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 2750; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v7 2751; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v5 2752; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v6 2753; GFX1164_DPP-NEXT: v_readfirstlane_b32 s4, v8 2754; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) 2755; GFX1164_DPP-NEXT: v_add_co_u32 v7, vcc, s3, v9 2756; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 2757; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v8, vcc, s4, v10, vcc 2758; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) 2759; GFX1164_DPP-NEXT: buffer_store_b64 v[7:8], off, s[0:3], 0 2760; GFX1164_DPP-NEXT: s_endpgm 2761; 2762; GFX1132_DPP-LABEL: add_i64_varying: 2763; GFX1132_DPP: ; %bb.0: ; %entry 2764; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 2765; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 2766; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | 
instskip(SKIP_1) | instid1(VALU_DEP_3) 2767; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s2 2768; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 2769; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s2 2770; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v7, 0 2771; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, 0 2772; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 2773; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2774; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2775; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 2776; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo 2777; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2778; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 2779; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 2780; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf 2781; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo 2782; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) 2783; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2784; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 2785; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf 2786; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) 2787; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo 2788; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc_lo, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2789; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 2790; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 2791; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf 
bank_mask:0xf 2792; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 2793; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 2794; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo 2795; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc_lo, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2796; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 2797; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1 2798; GFX1132_DPP-NEXT: v_readlane_b32 s3, v2, 15 2799; GFX1132_DPP-NEXT: v_readlane_b32 s0, v2, 31 2800; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2801; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) 2802; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2803; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo 2804; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) 2805; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 31 2806; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2807; GFX1132_DPP-NEXT: v_readlane_b32 s6, v1, 15 2808; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 2809; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 2810; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 2811; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 2812; GFX1132_DPP-NEXT: v_writelane_b32 v6, s3, 16 2813; GFX1132_DPP-NEXT: v_writelane_b32 v7, s6, 16 2814; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 2815; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 2816; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v8 2817; GFX1132_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9 2818; GFX1132_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo 2819; GFX1132_DPP-NEXT: s_cbranch_execz .LBB6_2 2820; GFX1132_DPP-NEXT: ; %bb.1: 2821; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, s1 :: v_dual_mov_b32 v8, s0 2822; GFX1132_DPP-NEXT: ds_add_rtn_u64 v[8:9], v0, v[8:9] 2823; 
GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) 2824; GFX1132_DPP-NEXT: buffer_gl0_inv 2825; GFX1132_DPP-NEXT: .LBB6_2: 2826; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 2827; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 2828; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v8 2829; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v6 2830; GFX1132_DPP-NEXT: v_mov_b32_e32 v11, v7 2831; GFX1132_DPP-NEXT: v_readfirstlane_b32 s4, v9 2832; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) 2833; GFX1132_DPP-NEXT: v_add_co_u32 v8, vcc_lo, s3, v10 2834; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 2835; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, s4, v11, vcc_lo 2836; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) 2837; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], 0 2838; GFX1132_DPP-NEXT: s_endpgm 2839entry: 2840 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2841 %zext = zext i32 %lane to i64 2842 %old = atomicrmw add ptr addrspace(3) @local_var64, i64 %zext acq_rel 2843 store i64 %old, ptr addrspace(1) %out 2844 ret void 2845} 2846 2847define amdgpu_kernel void @add_i64_varying_nouse() { 2848; GFX7LESS_ITERATIVE-LABEL: add_i64_varying_nouse: 2849; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry 2850; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec 2851; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 2852; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 2853; GFX7LESS_ITERATIVE-NEXT: .LBB7_1: ; %ComputeLoop 2854; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 2855; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s4, s[2:3] 2856; GFX7LESS_ITERATIVE-NEXT: s_nop 0 2857; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s4 2858; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s4 2859; GFX7LESS_ITERATIVE-NEXT: s_add_u32 s0, s0, s6 2860; GFX7LESS_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5 2861; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 2862; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] 2863; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[4:5], 
s[2:3], 0 2864; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[4:5] 2865; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB7_1 2866; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 2867; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 2868; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 2869; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2870; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc 2871; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] 2872; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB7_4 2873; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: 2874; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0 2875; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0 2876; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v1, s1 2877; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 2878; GFX7LESS_ITERATIVE-NEXT: ds_add_u64 v2, v[0:1] 2879; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 2880; GFX7LESS_ITERATIVE-NEXT: .LBB7_4: 2881; GFX7LESS_ITERATIVE-NEXT: s_endpgm 2882; 2883; GFX8_ITERATIVE-LABEL: add_i64_varying_nouse: 2884; GFX8_ITERATIVE: ; %bb.0: ; %entry 2885; GFX8_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec 2886; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 2887; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 2888; GFX8_ITERATIVE-NEXT: .LBB7_1: ; %ComputeLoop 2889; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 2890; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s4, s[2:3] 2891; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s4 2892; GFX8_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s4 2893; GFX8_ITERATIVE-NEXT: s_add_u32 s0, s0, s6 2894; GFX8_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5 2895; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 2896; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] 2897; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 2898; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1 2899; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 2900; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2901; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2902; 
GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2903; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc 2904; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] 2905; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB7_4 2906; GFX8_ITERATIVE-NEXT: ; %bb.3: 2907; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0 2908; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0 2909; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v1, s1 2910; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 2911; GFX8_ITERATIVE-NEXT: ds_add_u64 v2, v[0:1] 2912; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 2913; GFX8_ITERATIVE-NEXT: .LBB7_4: 2914; GFX8_ITERATIVE-NEXT: s_endpgm 2915; 2916; GFX9_ITERATIVE-LABEL: add_i64_varying_nouse: 2917; GFX9_ITERATIVE: ; %bb.0: ; %entry 2918; GFX9_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec 2919; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 2920; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 2921; GFX9_ITERATIVE-NEXT: .LBB7_1: ; %ComputeLoop 2922; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 2923; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s4, s[2:3] 2924; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s4 2925; GFX9_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s4 2926; GFX9_ITERATIVE-NEXT: s_add_u32 s0, s0, s6 2927; GFX9_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5 2928; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 2929; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] 2930; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 2931; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1 2932; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 2933; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2934; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2935; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2936; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc 2937; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] 2938; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB7_4 2939; GFX9_ITERATIVE-NEXT: ; %bb.3: 2940; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0 2941; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0 2942; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 
v1, s1 2943; GFX9_ITERATIVE-NEXT: ds_add_u64 v2, v[0:1] 2944; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 2945; GFX9_ITERATIVE-NEXT: .LBB7_4: 2946; GFX9_ITERATIVE-NEXT: s_endpgm 2947; 2948; GFX1064_ITERATIVE-LABEL: add_i64_varying_nouse: 2949; GFX1064_ITERATIVE: ; %bb.0: ; %entry 2950; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 2951; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec 2952; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 2953; GFX1064_ITERATIVE-NEXT: .LBB7_1: ; %ComputeLoop 2954; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 2955; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s4, s[2:3] 2956; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4 2957; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v1, s4 2958; GFX1064_ITERATIVE-NEXT: s_add_u32 s0, s0, s5 2959; GFX1064_ITERATIVE-NEXT: s_addc_u32 s1, s1, s6 2960; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 2961; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] 2962; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 2963; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1 2964; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 2965; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2966; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2967; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2968; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc 2969; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] 2970; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB7_4 2971; GFX1064_ITERATIVE-NEXT: ; %bb.3: 2972; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0 2973; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0 2974; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v1, s1 2975; GFX1064_ITERATIVE-NEXT: ds_add_u64 v2, v[0:1] 2976; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 2977; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv 2978; GFX1064_ITERATIVE-NEXT: .LBB7_4: 2979; GFX1064_ITERATIVE-NEXT: s_endpgm 2980; 2981; GFX1032_ITERATIVE-LABEL: add_i64_varying_nouse: 2982; GFX1032_ITERATIVE: ; %bb.0: ; %entry 2983; 
GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 2984; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo 2985; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 2986; GFX1032_ITERATIVE-NEXT: .LBB7_1: ; %ComputeLoop 2987; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 2988; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 2989; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3 2990; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s3 2991; GFX1032_ITERATIVE-NEXT: s_add_u32 s0, s0, s4 2992; GFX1032_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5 2993; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 2994; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 2995; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 2996; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1 2997; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 2998; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2999; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3000; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s2, vcc_lo 3001; GFX1032_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 3002; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB7_4 3003; GFX1032_ITERATIVE-NEXT: ; %bb.3: 3004; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0 3005; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0 3006; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v1, s1 3007; GFX1032_ITERATIVE-NEXT: ds_add_u64 v2, v[0:1] 3008; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 3009; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv 3010; GFX1032_ITERATIVE-NEXT: .LBB7_4: 3011; GFX1032_ITERATIVE-NEXT: s_endpgm 3012; 3013; GFX1164_ITERATIVE-LABEL: add_i64_varying_nouse: 3014; GFX1164_ITERATIVE: ; %bb.0: ; %entry 3015; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v0, 0x3ff, v0 3016; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 3017; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec 3018; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 3019; GFX1164_ITERATIVE-NEXT: .LBB7_1: ; %ComputeLoop 3020; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 3021; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s4, s[2:3] 3022; 
GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 3023; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4 3024; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v1, s4 3025; GFX1164_ITERATIVE-NEXT: s_add_u32 s0, s0, s5 3026; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 3027; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s6 3028; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 3029; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5] 3030; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 3031; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 3032; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1 3033; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 3034; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3035; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec 3036; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3037; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3038; GFX1164_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0 3039; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] 3040; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB7_4 3041; GFX1164_ITERATIVE-NEXT: ; %bb.3: 3042; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0 3043; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0 3044; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, s1 3045; GFX1164_ITERATIVE-NEXT: ds_add_u64 v2, v[0:1] 3046; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 3047; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv 3048; GFX1164_ITERATIVE-NEXT: .LBB7_4: 3049; GFX1164_ITERATIVE-NEXT: s_endpgm 3050; 3051; GFX1132_ITERATIVE-LABEL: add_i64_varying_nouse: 3052; GFX1132_ITERATIVE: ; %bb.0: ; %entry 3053; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 3054; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo 3055; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 3056; GFX1132_ITERATIVE-NEXT: .LBB7_1: ; %ComputeLoop 3057; 
GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 3058; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2 3059; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 3060; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3 3061; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s3 3062; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 3063; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s4 3064; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5 3065; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 3066; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 3067; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 3068; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 3069; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1 3070; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 3071; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3072; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo 3073; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) 3074; GFX1132_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0 3075; GFX1132_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 3076; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB7_4 3077; GFX1132_ITERATIVE-NEXT: ; %bb.3: 3078; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0 3079; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 3080; GFX1132_ITERATIVE-NEXT: ds_add_u64 v2, v[0:1] 3081; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 3082; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv 3083; GFX1132_ITERATIVE-NEXT: .LBB7_4: 3084; GFX1132_ITERATIVE-NEXT: s_endpgm 3085; 3086; GFX7LESS_DPP-LABEL: add_i64_varying_nouse: 3087; GFX7LESS_DPP: ; %bb.0: ; %entry 3088; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 3089; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 3090; GFX7LESS_DPP-NEXT: ds_add_u64 v1, v[0:1] 3091; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) 3092; GFX7LESS_DPP-NEXT: s_endpgm 3093; 3094; GFX8_DPP-LABEL: add_i64_varying_nouse: 3095; GFX8_DPP: ; %bb.0: ; 
%entry 3096; GFX8_DPP-NEXT: v_mov_b32_e32 v5, 0 3097; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v6, exec_lo, 0 3098; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v6, exec_hi, v6 3099; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 3100; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] 3101; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 3102; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] 3103; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 3104; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3105; GFX8_DPP-NEXT: v_add_u32_e32 v2, vcc, v2, v4 3106; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3107; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 3108; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 3109; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 3110; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3111; GFX8_DPP-NEXT: v_add_u32_e32 v2, vcc, v2, v4 3112; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:2 row_mask:0xf bank_mask:0xf 3113; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 3114; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 3115; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 3116; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3117; GFX8_DPP-NEXT: v_add_u32_e32 v2, vcc, v2, v4 3118; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf 3119; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 3120; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 3121; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 3122; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3123; GFX8_DPP-NEXT: v_add_u32_e32 v2, vcc, v2, v4 3124; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:8 row_mask:0xf bank_mask:0xf 3125; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 3126; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 3127; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 3128; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3129; GFX8_DPP-NEXT: v_add_u32_e32 v2, vcc, v2, v4 3130; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 row_bcast:15 row_mask:0xa 
bank_mask:0xf 3131; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 3132; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 3133; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 3134; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3135; GFX8_DPP-NEXT: v_add_u32_e32 v2, vcc, v2, v4 3136; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 3137; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 3138; GFX8_DPP-NEXT: v_readlane_b32 s3, v1, 63 3139; GFX8_DPP-NEXT: v_readlane_b32 s2, v2, 63 3140; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] 3141; GFX8_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] 3142; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 3143; GFX8_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc 3144; GFX8_DPP-NEXT: s_cbranch_execz .LBB7_2 3145; GFX8_DPP-NEXT: ; %bb.1: 3146; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s1 3147; GFX8_DPP-NEXT: v_mov_b32_e32 v6, s0 3148; GFX8_DPP-NEXT: s_mov_b32 m0, -1 3149; GFX8_DPP-NEXT: ds_add_u64 v5, v[6:7] 3150; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) 3151; GFX8_DPP-NEXT: .LBB7_2: 3152; GFX8_DPP-NEXT: s_endpgm 3153; 3154; GFX9_DPP-LABEL: add_i64_varying_nouse: 3155; GFX9_DPP: ; %bb.0: ; %entry 3156; GFX9_DPP-NEXT: v_mov_b32_e32 v5, 0 3157; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v6, exec_lo, 0 3158; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v6, exec_hi, v6 3159; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 3160; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] 3161; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 3162; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] 3163; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 3164; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3165; GFX9_DPP-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 3166; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3167; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 3168; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 3169; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 3170; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3171; GFX9_DPP-NEXT: 
v_add_co_u32_e32 v2, vcc, v2, v4 3172; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:2 row_mask:0xf bank_mask:0xf 3173; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 3174; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 3175; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 3176; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3177; GFX9_DPP-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 3178; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf 3179; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 3180; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 3181; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 3182; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3183; GFX9_DPP-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 3184; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:8 row_mask:0xf bank_mask:0xf 3185; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 3186; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 3187; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 3188; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3189; GFX9_DPP-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 3190; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 3191; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 3192; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 3193; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 3194; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3195; GFX9_DPP-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 3196; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 3197; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 3198; GFX9_DPP-NEXT: v_readlane_b32 s3, v1, 63 3199; GFX9_DPP-NEXT: v_readlane_b32 s2, v2, 63 3200; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] 3201; GFX9_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] 3202; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 3203; GFX9_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc 3204; GFX9_DPP-NEXT: s_cbranch_execz .LBB7_2 3205; GFX9_DPP-NEXT: ; %bb.1: 3206; GFX9_DPP-NEXT: v_mov_b32_e32 v7, 
s1 3207; GFX9_DPP-NEXT: v_mov_b32_e32 v6, s0 3208; GFX9_DPP-NEXT: ds_add_u64 v5, v[6:7] 3209; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) 3210; GFX9_DPP-NEXT: .LBB7_2: 3211; GFX9_DPP-NEXT: s_endpgm 3212; 3213; GFX1064_DPP-LABEL: add_i64_varying_nouse: 3214; GFX1064_DPP: ; %bb.0: ; %entry 3215; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 3216; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 3217; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] 3218; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s[0:1] 3219; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 3220; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, 0 3221; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 3222; GFX1064_DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf 3223; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:1 row_mask:0xf bank_mask:0xf 3224; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v2, v1 3225; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v3, v4, vcc 3226; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 3227; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v1 row_xmask:2 row_mask:0xf bank_mask:0xf 3228; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 3229; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v2 row_xmask:2 row_mask:0xf bank_mask:0xf 3230; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v6 3231; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v2, v5, vcc 3232; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, 0 3233; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v1 row_xmask:4 row_mask:0xf bank_mask:0xf 3234; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 3235; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v2 row_xmask:4 row_mask:0xf bank_mask:0xf 3236; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v4 3237; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v2, v3, vcc 3238; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v1 row_xmask:8 row_mask:0xf bank_mask:0xf 3239; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v2 row_xmask:8 row_mask:0xf bank_mask:0xf 3240; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v6 3241; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v2, v5, vcc 3242; GFX1064_DPP-NEXT: v_permlanex16_b32 v3, v1, 
0, 0 3243; GFX1064_DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 3244; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v3 3245; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v2, v4, vcc 3246; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] 3247; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3248; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 3249; GFX1064_DPP-NEXT: v_readlane_b32 s2, v2, 0 3250; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 0 3251; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 32 3252; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 32 3253; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] 3254; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 3255; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 3256; GFX1064_DPP-NEXT: s_add_u32 s0, s3, s4 3257; GFX1064_DPP-NEXT: s_addc_u32 s1, s2, s5 3258; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 3259; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc 3260; GFX1064_DPP-NEXT: s_cbranch_execz .LBB7_2 3261; GFX1064_DPP-NEXT: ; %bb.1: 3262; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s1 3263; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, s0 3264; GFX1064_DPP-NEXT: ds_add_u64 v0, v[7:8] 3265; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) 3266; GFX1064_DPP-NEXT: buffer_gl0_inv 3267; GFX1064_DPP-NEXT: .LBB7_2: 3268; GFX1064_DPP-NEXT: s_endpgm 3269; 3270; GFX1032_DPP-LABEL: add_i64_varying_nouse: 3271; GFX1032_DPP: ; %bb.0: ; %entry 3272; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 3273; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 3274; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s0 3275; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s0 3276; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 3277; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, 0 3278; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0 3279; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf 3280; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:1 row_mask:0xf bank_mask:0xf 3281; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v2, v1 3282; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v3, v4, vcc_lo 3283; GFX1032_DPP-NEXT: 
v_mov_b32_e32 v4, 0 3284; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v1 row_xmask:2 row_mask:0xf bank_mask:0xf 3285; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 3286; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v2 row_xmask:2 row_mask:0xf bank_mask:0xf 3287; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v6 3288; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo 3289; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, 0 3290; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v1 row_xmask:4 row_mask:0xf bank_mask:0xf 3291; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0 3292; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v2 row_xmask:4 row_mask:0xf bank_mask:0xf 3293; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v4 3294; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v3, vcc_lo 3295; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v1 row_xmask:8 row_mask:0xf bank_mask:0xf 3296; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v2 row_xmask:8 row_mask:0xf bank_mask:0xf 3297; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v6 3298; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo 3299; GFX1032_DPP-NEXT: v_permlanex16_b32 v3, v1, 0, 0 3300; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 3301; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3 3302; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v4, vcc_lo 3303; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 3304; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 3305; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v9, exec_lo, 0 3306; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 3307; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 3308; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9 3309; GFX1032_DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo 3310; GFX1032_DPP-NEXT: s_cbranch_execz .LBB7_2 3311; GFX1032_DPP-NEXT: ; %bb.1: 3312; GFX1032_DPP-NEXT: ds_add_u64 v0, v[7:8] 3313; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) 3314; GFX1032_DPP-NEXT: buffer_gl0_inv 3315; GFX1032_DPP-NEXT: .LBB7_2: 3316; GFX1032_DPP-NEXT: s_endpgm 3317; 3318; GFX1164_DPP-LABEL: add_i64_varying_nouse: 3319; GFX1164_DPP: ; %bb.0: ; %entry 3320; 
GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 3321; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 3322; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) 3323; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] 3324; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 3325; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] 3326; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 3327; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 3328; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:1 row_mask:0xf bank_mask:0xf 3329; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 3330; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 3331; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc 3332; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 3333; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 3334; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 3335; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_xmask:2 row_mask:0xf bank_mask:0xf 3336; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc 3337; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) 3338; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 3339; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 3340; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf 3341; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) 3342; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc 3343; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 3344; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 3345; GFX1164_DPP-NEXT: 
v_mov_b32_dpp v4, v1 row_xmask:8 row_mask:0xf bank_mask:0xf 3346; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v2, 0, 0 3347; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 3348; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc 3349; GFX1164_DPP-NEXT: v_add_co_u32 v2, vcc, v2, v3 3350; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 3351; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v1, 0, 0 3352; GFX1164_DPP-NEXT: v_permlane64_b32 v3, v2 3353; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 3354; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc 3355; GFX1164_DPP-NEXT: v_permlane64_b32 v4, v1 3356; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] 3357; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) 3358; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3359; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 3360; GFX1164_DPP-NEXT: v_add_co_u32 v2, vcc, v2, v3 3361; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v1, v4, vcc 3362; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] 3363; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 3364; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 3365; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 3366; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v3 3367; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], exec 3368; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) 3369; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v7 3370; GFX1164_DPP-NEXT: s_cbranch_execz .LBB7_2 3371; GFX1164_DPP-NEXT: ; %bb.1: 3372; GFX1164_DPP-NEXT: ds_add_u64 v0, v[5:6] 3373; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) 3374; GFX1164_DPP-NEXT: buffer_gl0_inv 3375; GFX1164_DPP-NEXT: .LBB7_2: 3376; GFX1164_DPP-NEXT: s_endpgm 3377; 3378; GFX1132_DPP-LABEL: add_i64_varying_nouse: 3379; GFX1132_DPP: ; %bb.0: ; %entry 3380; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 3381; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 3382; GFX1132_DPP-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) 3383; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s0 3384; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 3385; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s0 3386; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 3387; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 3388; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:1 row_mask:0xf bank_mask:0xf 3389; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 3390; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 3391; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo 3392; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 3393; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 3394; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 3395; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v1 row_xmask:2 row_mask:0xf bank_mask:0xf 3396; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo 3397; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) 3398; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 3399; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 3400; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf 3401; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) 3402; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo 3403; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc_lo, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 3404; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 3405; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v1 row_xmask:8 row_mask:0xf bank_mask:0xf 3406; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v2, 0, 
0 3407; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 3408; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo 3409; GFX1132_DPP-NEXT: v_add_co_u32 v2, vcc_lo, v2, v3 3410; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 3411; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v1, 0, 0 3412; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v1, v4, vcc_lo 3413; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 3414; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) 3415; GFX1132_DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v5, v2 3416; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 3417; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v3 3418; GFX1132_DPP-NEXT: s_mov_b32 s0, exec_lo 3419; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) 3420; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v7 3421; GFX1132_DPP-NEXT: s_cbranch_execz .LBB7_2 3422; GFX1132_DPP-NEXT: ; %bb.1: 3423; GFX1132_DPP-NEXT: ds_add_u64 v0, v[5:6] 3424; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) 3425; GFX1132_DPP-NEXT: buffer_gl0_inv 3426; GFX1132_DPP-NEXT: .LBB7_2: 3427; GFX1132_DPP-NEXT: s_endpgm 3428entry: 3429 %lane = call i32 @llvm.amdgcn.workitem.id.x() 3430 %zext = zext i32 %lane to i64 3431 %old = atomicrmw add ptr addrspace(3) @local_var64, i64 %zext acq_rel 3432 ret void 3433} 3434
; sub_i32_constant: atomicrmw sub of the constant 5 on @local_var32 (addrspace 3),
; with the returned old value stored to %out. The expected code issues a single
; ds_sub_rtn_u32 from the lane whose mbcnt is 0, using 5 * popcount(exec)
; (s_bcnt1 + s_mul_i32) as the operand; afterwards each lane computes
; readfirstlane(old) - 5 * mbcnt and stores it.
3435define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { 3436; GFX7LESS-LABEL: sub_i32_constant: 3437; GFX7LESS: ; %bb.0: ; %entry 3438; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 3439; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 3440; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 3441; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3442; GFX7LESS-NEXT: ; implicit-def: $vgpr1 3443; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc 3444; GFX7LESS-NEXT: s_cbranch_execz .LBB8_2 3445; GFX7LESS-NEXT: ; %bb.1: 3446; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 3447; GFX7LESS-NEXT: s_mul_i32 s2, s2, 5 3448; 
GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 3449; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 3450; GFX7LESS-NEXT: s_mov_b32 m0, -1 3451; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 3452; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3453; GFX7LESS-NEXT: .LBB8_2: 3454; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] 3455; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 3456; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3457; GFX7LESS-NEXT: s_mov_b32 s2, -1 3458; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 3459; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 3460; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 3461; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3462; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 3463; GFX7LESS-NEXT: s_endpgm 3464; 3465; GFX8-LABEL: sub_i32_constant: 3466; GFX8: ; %bb.0: ; %entry 3467; GFX8-NEXT: s_mov_b64 s[2:3], exec 3468; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 3469; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 3470; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3471; GFX8-NEXT: ; implicit-def: $vgpr1 3472; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc 3473; GFX8-NEXT: s_cbranch_execz .LBB8_2 3474; GFX8-NEXT: ; %bb.1: 3475; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 3476; GFX8-NEXT: s_mul_i32 s2, s2, 5 3477; GFX8-NEXT: v_mov_b32_e32 v1, 0 3478; GFX8-NEXT: v_mov_b32_e32 v2, s2 3479; GFX8-NEXT: s_mov_b32 m0, -1 3480; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 3481; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3482; GFX8-NEXT: .LBB8_2: 3483; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] 3484; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 3485; GFX8-NEXT: v_readfirstlane_b32 s4, v1 3486; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 3487; GFX8-NEXT: s_mov_b32 s3, 0xf000 3488; GFX8-NEXT: s_mov_b32 s2, -1 3489; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 3490; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3491; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 3492; GFX8-NEXT: s_endpgm 3493; 3494; GFX9-LABEL: sub_i32_constant: 3495; GFX9: ; %bb.0: ; %entry 3496; GFX9-NEXT: s_mov_b64 s[2:3], exec 3497; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 3498; GFX9-NEXT: 
v_mbcnt_hi_u32_b32 v0, s3, v0 3499; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3500; GFX9-NEXT: ; implicit-def: $vgpr1 3501; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 3502; GFX9-NEXT: s_cbranch_execz .LBB8_2 3503; GFX9-NEXT: ; %bb.1: 3504; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 3505; GFX9-NEXT: s_mul_i32 s2, s2, 5 3506; GFX9-NEXT: v_mov_b32_e32 v1, 0 3507; GFX9-NEXT: v_mov_b32_e32 v2, s2 3508; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 3509; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3510; GFX9-NEXT: .LBB8_2: 3511; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 3512; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 3513; GFX9-NEXT: v_readfirstlane_b32 s4, v1 3514; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 3515; GFX9-NEXT: s_mov_b32 s3, 0xf000 3516; GFX9-NEXT: s_mov_b32 s2, -1 3517; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 3518; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3519; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 3520; GFX9-NEXT: s_endpgm 3521; 3522; GFX1064-LABEL: sub_i32_constant: 3523; GFX1064: ; %bb.0: ; %entry 3524; GFX1064-NEXT: s_mov_b64 s[2:3], exec 3525; GFX1064-NEXT: ; implicit-def: $vgpr1 3526; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 3527; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 3528; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3529; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc 3530; GFX1064-NEXT: s_cbranch_execz .LBB8_2 3531; GFX1064-NEXT: ; %bb.1: 3532; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 3533; GFX1064-NEXT: v_mov_b32_e32 v1, 0 3534; GFX1064-NEXT: s_mul_i32 s2, s2, 5 3535; GFX1064-NEXT: v_mov_b32_e32 v2, s2 3536; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2 3537; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3538; GFX1064-NEXT: buffer_gl0_inv 3539; GFX1064-NEXT: .LBB8_2: 3540; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3541; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] 3542; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 3543; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 3544; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v0 3545; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3546; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, 
v0 3547; GFX1064-NEXT: s_mov_b32 s2, -1 3548; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3549; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 3550; GFX1064-NEXT: s_endpgm 3551; 3552; GFX1032-LABEL: sub_i32_constant: 3553; GFX1032: ; %bb.0: ; %entry 3554; GFX1032-NEXT: s_mov_b32 s1, exec_lo 3555; GFX1032-NEXT: ; implicit-def: $vgpr1 3556; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 3557; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3558; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 3559; GFX1032-NEXT: s_cbranch_execz .LBB8_2 3560; GFX1032-NEXT: ; %bb.1: 3561; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s1 3562; GFX1032-NEXT: v_mov_b32_e32 v1, 0 3563; GFX1032-NEXT: s_mul_i32 s1, s1, 5 3564; GFX1032-NEXT: v_mov_b32_e32 v2, s1 3565; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2 3566; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3567; GFX1032-NEXT: buffer_gl0_inv 3568; GFX1032-NEXT: .LBB8_2: 3569; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3570; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 3571; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 3572; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 3573; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v0 3574; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3575; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v0 3576; GFX1032-NEXT: s_mov_b32 s2, -1 3577; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3578; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 3579; GFX1032-NEXT: s_endpgm 3580; 3581; GFX1164-LABEL: sub_i32_constant: 3582; GFX1164: ; %bb.0: ; %entry 3583; GFX1164-NEXT: s_mov_b64 s[2:3], exec 3584; GFX1164-NEXT: s_mov_b64 s[0:1], exec 3585; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 3586; GFX1164-NEXT: ; implicit-def: $vgpr1 3587; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3588; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 3589; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 3590; GFX1164-NEXT: s_cbranch_execz .LBB8_2 3591; GFX1164-NEXT: ; %bb.1: 3592; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 3593; GFX1164-NEXT: v_mov_b32_e32 v1, 0 3594; GFX1164-NEXT: 
s_mul_i32 s2, s2, 5 3595; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 3596; GFX1164-NEXT: v_mov_b32_e32 v2, s2 3597; GFX1164-NEXT: ds_sub_rtn_u32 v1, v1, v2 3598; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 3599; GFX1164-NEXT: buffer_gl0_inv 3600; GFX1164-NEXT: .LBB8_2: 3601; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] 3602; GFX1164-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 3603; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 3604; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v0 3605; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 3606; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 3607; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v0 3608; GFX1164-NEXT: s_mov_b32 s2, -1 3609; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 3610; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 3611; GFX1164-NEXT: s_endpgm 3612; 3613; GFX1132-LABEL: sub_i32_constant: 3614; GFX1132: ; %bb.0: ; %entry 3615; GFX1132-NEXT: s_mov_b32 s1, exec_lo 3616; GFX1132-NEXT: s_mov_b32 s0, exec_lo 3617; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 3618; GFX1132-NEXT: ; implicit-def: $vgpr1 3619; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 3620; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 3621; GFX1132-NEXT: s_cbranch_execz .LBB8_2 3622; GFX1132-NEXT: ; %bb.1: 3623; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s1 3624; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 3625; GFX1132-NEXT: s_mul_i32 s1, s1, 5 3626; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s1 3627; GFX1132-NEXT: ds_sub_rtn_u32 v1, v1, v2 3628; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 3629; GFX1132-NEXT: buffer_gl0_inv 3630; GFX1132-NEXT: .LBB8_2: 3631; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 3632; GFX1132-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 3633; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 3634; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v0 3635; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 3636; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 3637; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v0 3638; GFX1132-NEXT: s_mov_b32 s2, -1 3639; GFX1132-NEXT: s_waitcnt 
lgkmcnt(0) 3640; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 3641; GFX1132-NEXT: s_endpgm 3642entry: 3643 %old = atomicrmw sub ptr addrspace(3) @local_var32, i32 5 acq_rel 3644 store i32 %old, ptr addrspace(1) %out 3645 ret void 3646} 3647
; sub_i32_uniform: atomicrmw sub on @local_var32 (addrspace 3) where the operand
; is the kernel argument %subitive, with the returned old value stored to %out.
; The expected code issues a single ds_sub_rtn_u32 of %subitive * popcount(exec)
; from the lane whose mbcnt is 0; afterwards each lane computes
; readfirstlane(old) - %subitive * mbcnt (v_mul_lo_u32) and stores it.
3648define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) { 3649; GFX7LESS-LABEL: sub_i32_uniform: 3650; GFX7LESS: ; %bb.0: ; %entry 3651; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 3652; GFX7LESS-NEXT: s_load_dword s6, s[4:5], 0xb 3653; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 3654; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 3655; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3656; GFX7LESS-NEXT: ; implicit-def: $vgpr1 3657; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc 3658; GFX7LESS-NEXT: s_cbranch_execz .LBB9_2 3659; GFX7LESS-NEXT: ; %bb.1: 3660; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 3661; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3662; GFX7LESS-NEXT: s_mul_i32 s2, s6, s2 3663; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 3664; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 3665; GFX7LESS-NEXT: s_mov_b32 m0, -1 3666; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 3667; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3668; GFX7LESS-NEXT: .LBB9_2: 3669; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] 3670; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 3671; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3672; GFX7LESS-NEXT: s_mov_b32 s2, -1 3673; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 3674; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3675; GFX7LESS-NEXT: v_mul_lo_u32 v0, s6, v0 3676; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 3677; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 3678; GFX7LESS-NEXT: s_endpgm 3679; 3680; GFX8-LABEL: sub_i32_uniform: 3681; GFX8: ; %bb.0: ; %entry 3682; GFX8-NEXT: s_load_dword s6, s[4:5], 0x2c 3683; GFX8-NEXT: s_mov_b64 s[2:3], exec 3684; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 3685; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 3686; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3687; GFX8-NEXT: ; implicit-def: $vgpr1 
3688; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc 3689; GFX8-NEXT: s_cbranch_execz .LBB9_2 3690; GFX8-NEXT: ; %bb.1: 3691; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 3692; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3693; GFX8-NEXT: s_mul_i32 s2, s6, s2 3694; GFX8-NEXT: v_mov_b32_e32 v1, 0 3695; GFX8-NEXT: v_mov_b32_e32 v2, s2 3696; GFX8-NEXT: s_mov_b32 m0, -1 3697; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 3698; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3699; GFX8-NEXT: .LBB9_2: 3700; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] 3701; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 3702; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3703; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 3704; GFX8-NEXT: v_readfirstlane_b32 s4, v1 3705; GFX8-NEXT: s_mov_b32 s3, 0xf000 3706; GFX8-NEXT: s_mov_b32 s2, -1 3707; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 3708; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 3709; GFX8-NEXT: s_endpgm 3710; 3711; GFX9-LABEL: sub_i32_uniform: 3712; GFX9: ; %bb.0: ; %entry 3713; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c 3714; GFX9-NEXT: s_mov_b64 s[2:3], exec 3715; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 3716; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 3717; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3718; GFX9-NEXT: ; implicit-def: $vgpr1 3719; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 3720; GFX9-NEXT: s_cbranch_execz .LBB9_2 3721; GFX9-NEXT: ; %bb.1: 3722; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 3723; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3724; GFX9-NEXT: s_mul_i32 s2, s6, s2 3725; GFX9-NEXT: v_mov_b32_e32 v1, 0 3726; GFX9-NEXT: v_mov_b32_e32 v2, s2 3727; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 3728; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3729; GFX9-NEXT: .LBB9_2: 3730; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 3731; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 3732; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3733; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 3734; GFX9-NEXT: v_readfirstlane_b32 s4, v1 3735; GFX9-NEXT: s_mov_b32 s3, 0xf000 3736; GFX9-NEXT: s_mov_b32 s2, -1 3737; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 3738; GFX9-NEXT: buffer_store_dword v0, 
off, s[0:3], 0 3739; GFX9-NEXT: s_endpgm 3740; 3741; GFX1064-LABEL: sub_i32_uniform: 3742; GFX1064: ; %bb.0: ; %entry 3743; GFX1064-NEXT: s_load_dword s6, s[4:5], 0x2c 3744; GFX1064-NEXT: s_mov_b64 s[2:3], exec 3745; GFX1064-NEXT: ; implicit-def: $vgpr1 3746; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 3747; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 3748; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3749; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc 3750; GFX1064-NEXT: s_cbranch_execz .LBB9_2 3751; GFX1064-NEXT: ; %bb.1: 3752; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 3753; GFX1064-NEXT: v_mov_b32_e32 v1, 0 3754; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3755; GFX1064-NEXT: s_mul_i32 s2, s6, s2 3756; GFX1064-NEXT: v_mov_b32_e32 v2, s2 3757; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2 3758; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3759; GFX1064-NEXT: buffer_gl0_inv 3760; GFX1064-NEXT: .LBB9_2: 3761; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3762; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] 3763; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 3764; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3765; GFX1064-NEXT: v_mul_lo_u32 v0, s6, v0 3766; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 3767; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3768; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v0 3769; GFX1064-NEXT: s_mov_b32 s2, -1 3770; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 3771; GFX1064-NEXT: s_endpgm 3772; 3773; GFX1032-LABEL: sub_i32_uniform: 3774; GFX1032: ; %bb.0: ; %entry 3775; GFX1032-NEXT: s_load_dword s0, s[4:5], 0x2c 3776; GFX1032-NEXT: s_mov_b32 s2, exec_lo 3777; GFX1032-NEXT: ; implicit-def: $vgpr1 3778; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 3779; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3780; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo 3781; GFX1032-NEXT: s_cbranch_execz .LBB9_2 3782; GFX1032-NEXT: ; %bb.1: 3783; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s2 3784; GFX1032-NEXT: v_mov_b32_e32 v1, 0 3785; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3786; GFX1032-NEXT: s_mul_i32 s2, s0, s2 3787; 
GFX1032-NEXT: v_mov_b32_e32 v2, s2 3788; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2 3789; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3790; GFX1032-NEXT: buffer_gl0_inv 3791; GFX1032-NEXT: .LBB9_2: 3792; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3793; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 3794; GFX1032-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x24 3795; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3796; GFX1032-NEXT: v_mul_lo_u32 v0, s0, v0 3797; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 3798; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 3799; GFX1032-NEXT: s_mov_b32 s10, -1 3800; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s0, v0 3801; GFX1032-NEXT: buffer_store_dword v0, off, s[8:11], 0 3802; GFX1032-NEXT: s_endpgm 3803; 3804; GFX1164-LABEL: sub_i32_uniform: 3805; GFX1164: ; %bb.0: ; %entry 3806; GFX1164-NEXT: s_load_b32 s6, s[4:5], 0x2c 3807; GFX1164-NEXT: s_mov_b64 s[2:3], exec 3808; GFX1164-NEXT: s_mov_b64 s[0:1], exec 3809; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 3810; GFX1164-NEXT: ; implicit-def: $vgpr1 3811; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3812; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 3813; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 3814; GFX1164-NEXT: s_cbranch_execz .LBB9_2 3815; GFX1164-NEXT: ; %bb.1: 3816; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 3817; GFX1164-NEXT: v_mov_b32_e32 v1, 0 3818; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 3819; GFX1164-NEXT: s_mul_i32 s2, s6, s2 3820; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 3821; GFX1164-NEXT: v_mov_b32_e32 v2, s2 3822; GFX1164-NEXT: ds_sub_rtn_u32 v1, v1, v2 3823; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 3824; GFX1164-NEXT: buffer_gl0_inv 3825; GFX1164-NEXT: .LBB9_2: 3826; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] 3827; GFX1164-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 3828; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 3829; GFX1164-NEXT: v_mul_lo_u32 v0, s6, v0 3830; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 3831; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 3832; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 3833; 
GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v0 3834; GFX1164-NEXT: s_mov_b32 s2, -1 3835; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 3836; GFX1164-NEXT: s_endpgm 3837; 3838; GFX1132-LABEL: sub_i32_uniform: 3839; GFX1132: ; %bb.0: ; %entry 3840; GFX1132-NEXT: s_load_b32 s0, s[4:5], 0x2c 3841; GFX1132-NEXT: s_mov_b32 s2, exec_lo 3842; GFX1132-NEXT: s_mov_b32 s1, exec_lo 3843; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 3844; GFX1132-NEXT: ; implicit-def: $vgpr1 3845; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 3846; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 3847; GFX1132-NEXT: s_cbranch_execz .LBB9_2 3848; GFX1132-NEXT: ; %bb.1: 3849; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s2 3850; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 3851; GFX1132-NEXT: s_mul_i32 s2, s0, s2 3852; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 3853; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2 3854; GFX1132-NEXT: ds_sub_rtn_u32 v1, v1, v2 3855; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 3856; GFX1132-NEXT: buffer_gl0_inv 3857; GFX1132-NEXT: .LBB9_2: 3858; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 3859; GFX1132-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 3860; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 3861; GFX1132-NEXT: v_mul_lo_u32 v0, s0, v0 3862; GFX1132-NEXT: v_readfirstlane_b32 s0, v1 3863; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 3864; GFX1132-NEXT: s_mov_b32 s6, -1 3865; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 3866; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s0, v0 3867; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0 3868; GFX1132-NEXT: s_endpgm 3869entry: 3870 %old = atomicrmw sub ptr addrspace(3) @local_var32, i32 %subitive acq_rel 3871 store i32 %old, ptr addrspace(1) %out 3872 ret void 3873} 3874 3875define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { 3876; GFX7LESS_ITERATIVE-LABEL: sub_i32_varying: 3877; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry 3878; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 3879; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, 0 3880; GFX7LESS_ITERATIVE-NEXT: ; 
implicit-def: $vgpr1 3881; GFX7LESS_ITERATIVE-NEXT: .LBB10_1: ; %ComputeLoop 3882; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 3883; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] 3884; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s3 3885; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 3886; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 3887; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 3888; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] 3889; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 3890; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] 3891; GFX7LESS_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 3892; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB10_1 3893; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 3894; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3895; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 3896; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3897; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr0 3898; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc 3899; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 3900; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB10_4 3901; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: 3902; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 3903; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 3904; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 3905; GFX7LESS_ITERATIVE-NEXT: ds_sub_rtn_u32 v0, v0, v2 3906; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 3907; GFX7LESS_ITERATIVE-NEXT: .LBB10_4: 3908; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] 3909; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 3910; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 3911; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 3912; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 3913; GFX7LESS_ITERATIVE-NEXT: v_sub_i32_e32 v0, vcc, s4, v1 3914; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 3915; GFX7LESS_ITERATIVE-NEXT: buffer_store_dword v0, off, 
s[0:3], 0 3916; GFX7LESS_ITERATIVE-NEXT: s_endpgm 3917; 3918; GFX8_ITERATIVE-LABEL: sub_i32_varying: 3919; GFX8_ITERATIVE: ; %bb.0: ; %entry 3920; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 3921; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, 0 3922; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 3923; GFX8_ITERATIVE-NEXT: .LBB10_1: ; %ComputeLoop 3924; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 3925; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] 3926; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 3927; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 3928; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 3929; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 3930; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 3931; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] 3932; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 3933; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 3934; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 3935; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3936; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3937; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3938; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr0 3939; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc 3940; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 3941; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB10_4 3942; GFX8_ITERATIVE-NEXT: ; %bb.3: 3943; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 3944; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 3945; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 3946; GFX8_ITERATIVE-NEXT: ds_sub_rtn_u32 v0, v0, v2 3947; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 3948; GFX8_ITERATIVE-NEXT: .LBB10_4: 3949; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] 3950; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 3951; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 3952; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 3953; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 3954; GFX8_ITERATIVE-NEXT: v_sub_u32_e32 v0, vcc, s4, v1 3955; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 3956; 
GFX8_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 3957; GFX8_ITERATIVE-NEXT: s_endpgm 3958; 3959; GFX9_ITERATIVE-LABEL: sub_i32_varying: 3960; GFX9_ITERATIVE: ; %bb.0: ; %entry 3961; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 3962; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, 0 3963; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 3964; GFX9_ITERATIVE-NEXT: .LBB10_1: ; %ComputeLoop 3965; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 3966; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] 3967; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 3968; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 3969; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 3970; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 3971; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 3972; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] 3973; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 3974; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 3975; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 3976; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3977; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3978; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3979; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0 3980; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc 3981; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 3982; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB10_4 3983; GFX9_ITERATIVE-NEXT: ; %bb.3: 3984; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 3985; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 3986; GFX9_ITERATIVE-NEXT: ds_sub_rtn_u32 v0, v0, v2 3987; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 3988; GFX9_ITERATIVE-NEXT: .LBB10_4: 3989; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] 3990; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 3991; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 3992; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 3993; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 3994; GFX9_ITERATIVE-NEXT: v_sub_u32_e32 v0, s4, v1 3995; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 3996; 
GFX9_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 3997; GFX9_ITERATIVE-NEXT: s_endpgm 3998; 3999; GFX1064_ITERATIVE-LABEL: sub_i32_varying: 4000; GFX1064_ITERATIVE: ; %bb.0: ; %entry 4001; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 4002; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, 0 4003; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 4004; GFX1064_ITERATIVE-NEXT: .LBB10_1: ; %ComputeLoop 4005; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 4006; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] 4007; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 4008; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 4009; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 4010; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] 4011; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 4012; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 4013; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 4014; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 4015; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4016; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4017; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4018; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0 4019; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc 4020; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 4021; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB10_4 4022; GFX1064_ITERATIVE-NEXT: ; %bb.3: 4023; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 4024; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 4025; GFX1064_ITERATIVE-NEXT: ds_sub_rtn_u32 v0, v0, v2 4026; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 4027; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv 4028; GFX1064_ITERATIVE-NEXT: .LBB10_4: 4029; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 4030; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] 4031; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 4032; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 4033; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 
0x31016000 4034; GFX1064_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v1 4035; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 4036; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 4037; GFX1064_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 4038; GFX1064_ITERATIVE-NEXT: s_endpgm 4039; 4040; GFX1032_ITERATIVE-LABEL: sub_i32_varying: 4041; GFX1032_ITERATIVE: ; %bb.0: ; %entry 4042; GFX1032_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo 4043; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, 0 4044; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 4045; GFX1032_ITERATIVE-NEXT: .LBB10_1: ; %ComputeLoop 4046; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 4047; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 4048; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 4049; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 4050; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 4051; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 4052; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 4053; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 4054; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 4055; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 4056; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4057; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4058; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0 4059; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo 4060; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 4061; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB10_4 4062; GFX1032_ITERATIVE-NEXT: ; %bb.3: 4063; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 4064; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 4065; GFX1032_ITERATIVE-NEXT: ds_sub_rtn_u32 v0, v0, v2 4066; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 4067; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv 4068; GFX1032_ITERATIVE-NEXT: .LBB10_4: 4069; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 4070; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 4071; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 4072; 
GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 4073; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 4074; GFX1032_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v1 4075; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 4076; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 4077; GFX1032_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 4078; GFX1032_ITERATIVE-NEXT: s_endpgm 4079; 4080; GFX1164_ITERATIVE-LABEL: sub_i32_varying: 4081; GFX1164_ITERATIVE: ; %bb.0: ; %entry 4082; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 4083; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 4084; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, 0 4085; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 4086; GFX1164_ITERATIVE-NEXT: .LBB10_1: ; %ComputeLoop 4087; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 4088; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] 4089; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 4090; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 4091; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 4092; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 4093; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] 4094; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) 4095; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 4096; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 4097; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 4098; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 4099; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 4100; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4101; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 4102; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 4103; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1 4104; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc 4105; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 4106; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 4107; 
GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB10_4 4108; GFX1164_ITERATIVE-NEXT: ; %bb.3: 4109; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 4110; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 4111; GFX1164_ITERATIVE-NEXT: ds_sub_rtn_u32 v1, v1, v2 4112; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 4113; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv 4114; GFX1164_ITERATIVE-NEXT: .LBB10_4: 4115; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] 4116; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 4117; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 4118; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 4119; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) 4120; GFX1164_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v0 4121; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 4122; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 4123; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 4124; GFX1164_ITERATIVE-NEXT: s_endpgm 4125; 4126; GFX1132_ITERATIVE-LABEL: sub_i32_varying: 4127; GFX1132_ITERATIVE: ; %bb.0: ; %entry 4128; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 4129; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo 4130; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, 0 4131; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 4132; GFX1132_ITERATIVE-NEXT: .LBB10_1: ; %ComputeLoop 4133; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 4134; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 4135; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 4136; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 4137; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 4138; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 4139; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 4140; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) 4141; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 4142; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 4143; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 4144; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 4145; 
GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 4146; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 4147; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 4148; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 4149; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo 4150; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 4151; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB10_4 4152; GFX1132_ITERATIVE-NEXT: ; %bb.3: 4153; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 4154; GFX1132_ITERATIVE-NEXT: ds_sub_rtn_u32 v1, v1, v2 4155; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 4156; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv 4157; GFX1132_ITERATIVE-NEXT: .LBB10_4: 4158; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 4159; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 4160; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 4161; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 4162; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) 4163; GFX1132_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v0 4164; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 4165; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 4166; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 4167; GFX1132_ITERATIVE-NEXT: s_endpgm 4168; 4169; GFX7LESS_DPP-LABEL: sub_i32_varying: 4170; GFX7LESS_DPP: ; %bb.0: ; %entry 4171; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 4172; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 4173; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 4174; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) 4175; GFX7LESS_DPP-NEXT: ds_sub_rtn_u32 v0, v1, v0 4176; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) 4177; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000 4178; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1 4179; GFX7LESS_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 4180; GFX7LESS_DPP-NEXT: s_endpgm 4181; 4182; GFX8_DPP-LABEL: sub_i32_varying: 4183; GFX8_DPP: ; %bb.0: ; %entry 4184; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 4185; 
GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 4186; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 4187; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 4188; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] 4189; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 4190; GFX8_DPP-NEXT: s_nop 0 4191; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4192; GFX8_DPP-NEXT: s_nop 1 4193; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 4194; GFX8_DPP-NEXT: s_nop 1 4195; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4196; GFX8_DPP-NEXT: s_nop 1 4197; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4198; GFX8_DPP-NEXT: s_nop 1 4199; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 4200; GFX8_DPP-NEXT: s_nop 1 4201; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 4202; GFX8_DPP-NEXT: v_readlane_b32 s2, v1, 63 4203; GFX8_DPP-NEXT: s_nop 0 4204; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf 4205; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] 4206; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 4207; GFX8_DPP-NEXT: ; implicit-def: $vgpr0 4208; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc 4209; GFX8_DPP-NEXT: s_cbranch_execz .LBB10_2 4210; GFX8_DPP-NEXT: ; %bb.1: 4211; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s2 4212; GFX8_DPP-NEXT: s_mov_b32 m0, -1 4213; GFX8_DPP-NEXT: ds_sub_rtn_u32 v0, v3, v0 4214; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) 4215; GFX8_DPP-NEXT: .LBB10_2: 4216; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] 4217; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 4218; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 4219; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v2 4220; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 4221; GFX8_DPP-NEXT: s_mov_b32 s2, -1 4222; GFX8_DPP-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 4223; GFX8_DPP-NEXT: s_waitcnt 
lgkmcnt(0) 4224; GFX8_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 4225; GFX8_DPP-NEXT: s_endpgm 4226; 4227; GFX9_DPP-LABEL: sub_i32_varying: 4228; GFX9_DPP: ; %bb.0: ; %entry 4229; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 4230; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 4231; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 4232; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 4233; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] 4234; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 4235; GFX9_DPP-NEXT: s_nop 0 4236; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4237; GFX9_DPP-NEXT: s_nop 1 4238; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 4239; GFX9_DPP-NEXT: s_nop 1 4240; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4241; GFX9_DPP-NEXT: s_nop 1 4242; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4243; GFX9_DPP-NEXT: s_nop 1 4244; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 4245; GFX9_DPP-NEXT: s_nop 1 4246; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 4247; GFX9_DPP-NEXT: v_readlane_b32 s2, v1, 63 4248; GFX9_DPP-NEXT: s_nop 0 4249; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf 4250; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] 4251; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 4252; GFX9_DPP-NEXT: ; implicit-def: $vgpr0 4253; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc 4254; GFX9_DPP-NEXT: s_cbranch_execz .LBB10_2 4255; GFX9_DPP-NEXT: ; %bb.1: 4256; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s2 4257; GFX9_DPP-NEXT: ds_sub_rtn_u32 v0, v3, v0 4258; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) 4259; GFX9_DPP-NEXT: .LBB10_2: 4260; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] 4261; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 4262; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 4263; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v2 4264; 
GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 4265; GFX9_DPP-NEXT: s_mov_b32 s2, -1 4266; GFX9_DPP-NEXT: v_sub_u32_e32 v0, s4, v0 4267; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) 4268; GFX9_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 4269; GFX9_DPP-NEXT: s_endpgm 4270; 4271; GFX1064_DPP-LABEL: sub_i32_varying: 4272; GFX1064_DPP: ; %bb.0: ; %entry 4273; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 4274; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] 4275; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 4276; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4277; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 4278; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4279; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4280; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 4281; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4282; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 31 4283; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s2 4284; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 4285; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4286; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 15 4287; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 31 4288; GFX1064_DPP-NEXT: v_writelane_b32 v3, s2, 16 4289; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] 4290; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4291; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 4292; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 47 4293; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 63 4294; GFX1064_DPP-NEXT: v_writelane_b32 v3, s3, 32 4295; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] 4296; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4297; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 4298; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 
4299; GFX1064_DPP-NEXT: v_writelane_b32 v3, s2, 48 4300; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] 4301; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4302; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 4303; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 4304; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc 4305; GFX1064_DPP-NEXT: s_cbranch_execz .LBB10_2 4306; GFX1064_DPP-NEXT: ; %bb.1: 4307; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s6 4308; GFX1064_DPP-NEXT: s_mov_b32 s3, s6 4309; GFX1064_DPP-NEXT: ds_sub_rtn_u32 v0, v4, v0 4310; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) 4311; GFX1064_DPP-NEXT: buffer_gl0_inv 4312; GFX1064_DPP-NEXT: .LBB10_2: 4313; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 4314; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[0:1] 4315; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 4316; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v0 4317; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 4318; GFX1064_DPP-NEXT: v_sub_nc_u32_e32 v0, s3, v0 4319; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 4320; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) 4321; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 4322; GFX1064_DPP-NEXT: s_endpgm 4323; 4324; GFX1032_DPP-LABEL: sub_i32_varying: 4325; GFX1032_DPP: ; %bb.0: ; %entry 4326; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 4327; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 4328; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 4329; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4330; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 4331; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4332; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4333; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 4334; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4335; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 4336; GFX1032_DPP-NEXT: 
v_readlane_b32 s2, v1, 31 4337; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4338; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 4339; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4340; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 4341; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 4342; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 4343; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 4344; GFX1032_DPP-NEXT: s_mov_b32 s0, s2 4345; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 4346; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4347; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 4348; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo 4349; GFX1032_DPP-NEXT: s_cbranch_execz .LBB10_2 4350; GFX1032_DPP-NEXT: ; %bb.1: 4351; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s0 4352; GFX1032_DPP-NEXT: ds_sub_rtn_u32 v0, v4, v0 4353; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) 4354; GFX1032_DPP-NEXT: buffer_gl0_inv 4355; GFX1032_DPP-NEXT: .LBB10_2: 4356; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 4357; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 4358; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 4359; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v0 4360; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 4361; GFX1032_DPP-NEXT: v_sub_nc_u32_e32 v0, s3, v0 4362; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 4363; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) 4364; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 4365; GFX1032_DPP-NEXT: s_endpgm 4366; 4367; GFX1164_DPP-LABEL: sub_i32_varying: 4368; GFX1164_DPP: ; %bb.0: ; %entry 4369; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 4370; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 4371; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 4372; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] 4373; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 4374; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 4375; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 
bound_ctrl:1 4376; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 4377; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4378; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4379; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4380; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4381; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 4382; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4383; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4384; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 31 4385; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s2 4386; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4387; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 4388; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4389; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 15 4390; GFX1164_DPP-NEXT: v_readlane_b32 s3, v1, 31 4391; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 4392; GFX1164_DPP-NEXT: v_writelane_b32 v3, s2, 16 4393; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] 4394; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4395; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 4396; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 47 4397; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 63 4398; GFX1164_DPP-NEXT: v_writelane_b32 v3, s3, 32 4399; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] 4400; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 4401; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4402; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 4403; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 4404; GFX1164_DPP-NEXT: v_writelane_b32 v3, s2, 
48 4405; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] 4406; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4407; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 4408; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0 4409; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc 4410; GFX1164_DPP-NEXT: s_cbranch_execz .LBB10_2 4411; GFX1164_DPP-NEXT: ; %bb.1: 4412; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s6 4413; GFX1164_DPP-NEXT: s_mov_b32 s3, s6 4414; GFX1164_DPP-NEXT: ds_sub_rtn_u32 v0, v4, v0 4415; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) 4416; GFX1164_DPP-NEXT: buffer_gl0_inv 4417; GFX1164_DPP-NEXT: .LBB10_2: 4418; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[0:1] 4419; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 4420; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v0 4421; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3 4422; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) 4423; GFX1164_DPP-NEXT: v_sub_nc_u32_e32 v0, s3, v0 4424; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 4425; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) 4426; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 4427; GFX1164_DPP-NEXT: s_endpgm 4428; 4429; GFX1132_DPP-LABEL: sub_i32_varying: 4430; GFX1132_DPP: ; %bb.0: ; %entry 4431; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 4432; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 4433; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 4434; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 4435; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 4436; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 4437; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4438; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 4439; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4440; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4441; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 
row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4442; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4443; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 4444; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4445; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) 4446; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 4447; GFX1132_DPP-NEXT: v_readlane_b32 s2, v1, 31 4448; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4449; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 4450; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4451; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 4452; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 4453; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 4454; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 4455; GFX1132_DPP-NEXT: s_mov_b32 s0, s2 4456; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 4457; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4458; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 4459; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo 4460; GFX1132_DPP-NEXT: s_cbranch_execz .LBB10_2 4461; GFX1132_DPP-NEXT: ; %bb.1: 4462; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, s0 4463; GFX1132_DPP-NEXT: ds_sub_rtn_u32 v0, v4, v0 4464; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) 4465; GFX1132_DPP-NEXT: buffer_gl0_inv 4466; GFX1132_DPP-NEXT: .LBB10_2: 4467; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 4468; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 4469; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v0 4470; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3 4471; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) 4472; GFX1132_DPP-NEXT: v_sub_nc_u32_e32 v0, s3, v0 4473; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 4474; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) 4475; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 4476; GFX1132_DPP-NEXT: s_endpgm 4477entry: 4478 %lane = call i32 @llvm.amdgcn.workitem.id.x() 4479 %old = atomicrmw sub 
ptr addrspace(3) @local_var32, i32 %lane acq_rel 4480 store i32 %old, ptr addrspace(1) %out 4481 ret void 4482} 4483 4484define amdgpu_kernel void @sub_i32_varying_nouse() { 4485; GFX7LESS_ITERATIVE-LABEL: sub_i32_varying_nouse: 4486; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry 4487; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 4488; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, 0 4489; GFX7LESS_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop 4490; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 4491; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] 4492; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 4493; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 4494; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] 4495; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 4496; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[4:5] 4497; GFX7LESS_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 4498; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB11_1 4499; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 4500; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4501; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 4502; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4503; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc 4504; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 4505; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 4506; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: 4507; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 4508; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v1, s2 4509; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 4510; GFX7LESS_ITERATIVE-NEXT: ds_sub_u32 v0, v1 4511; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 4512; GFX7LESS_ITERATIVE-NEXT: .LBB11_4: 4513; GFX7LESS_ITERATIVE-NEXT: s_endpgm 4514; 4515; GFX8_ITERATIVE-LABEL: sub_i32_varying_nouse: 4516; GFX8_ITERATIVE: ; %bb.0: ; %entry 4517; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 4518; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, 0 4519; GFX8_ITERATIVE-NEXT: .LBB11_1: ; 
%ComputeLoop 4520; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 4521; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] 4522; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 4523; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 4524; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 4525; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] 4526; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 4527; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 4528; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 4529; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4530; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4531; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4532; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc 4533; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 4534; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 4535; GFX8_ITERATIVE-NEXT: ; %bb.3: 4536; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 4537; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v1, s2 4538; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 4539; GFX8_ITERATIVE-NEXT: ds_sub_u32 v0, v1 4540; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 4541; GFX8_ITERATIVE-NEXT: .LBB11_4: 4542; GFX8_ITERATIVE-NEXT: s_endpgm 4543; 4544; GFX9_ITERATIVE-LABEL: sub_i32_varying_nouse: 4545; GFX9_ITERATIVE: ; %bb.0: ; %entry 4546; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 4547; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, 0 4548; GFX9_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop 4549; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 4550; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] 4551; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 4552; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 4553; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 4554; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] 4555; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 4556; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 4557; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 4558; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4559; GFX9_ITERATIVE-NEXT: 
v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4560; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4561; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc 4562; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 4563; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 4564; GFX9_ITERATIVE-NEXT: ; %bb.3: 4565; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 4566; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v1, s2 4567; GFX9_ITERATIVE-NEXT: ds_sub_u32 v0, v1 4568; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 4569; GFX9_ITERATIVE-NEXT: .LBB11_4: 4570; GFX9_ITERATIVE-NEXT: s_endpgm 4571; 4572; GFX1064_ITERATIVE-LABEL: sub_i32_varying_nouse: 4573; GFX1064_ITERATIVE: ; %bb.0: ; %entry 4574; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 4575; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, 0 4576; GFX1064_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop 4577; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 4578; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] 4579; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 4580; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 4581; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] 4582; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 4583; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 4584; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 4585; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 4586; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4587; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4588; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4589; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc 4590; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 4591; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 4592; GFX1064_ITERATIVE-NEXT: ; %bb.3: 4593; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 4594; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v1, s2 4595; GFX1064_ITERATIVE-NEXT: ds_sub_u32 v0, v1 4596; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 4597; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv 4598; GFX1064_ITERATIVE-NEXT: 
.LBB11_4: 4599; GFX1064_ITERATIVE-NEXT: s_endpgm 4600; 4601; GFX1032_ITERATIVE-LABEL: sub_i32_varying_nouse: 4602; GFX1032_ITERATIVE: ; %bb.0: ; %entry 4603; GFX1032_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo 4604; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, 0 4605; GFX1032_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop 4606; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 4607; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 4608; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 4609; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 4610; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 4611; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 4612; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 4613; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 4614; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 4615; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4616; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4617; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo 4618; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 4619; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 4620; GFX1032_ITERATIVE-NEXT: ; %bb.3: 4621; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 4622; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v1, s0 4623; GFX1032_ITERATIVE-NEXT: ds_sub_u32 v0, v1 4624; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 4625; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv 4626; GFX1032_ITERATIVE-NEXT: .LBB11_4: 4627; GFX1032_ITERATIVE-NEXT: s_endpgm 4628; 4629; GFX1164_ITERATIVE-LABEL: sub_i32_varying_nouse: 4630; GFX1164_ITERATIVE: ; %bb.0: ; %entry 4631; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v0, 0x3ff, v0 4632; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 4633; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, 0 4634; GFX1164_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop 4635; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 4636; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] 4637; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 4638; GFX1164_ITERATIVE-NEXT: 
v_readlane_b32 s6, v0, s3 4639; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 4640; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4641; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] 4642; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 4643; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 4644; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 4645; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 4646; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4647; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 4648; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4649; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4650; GFX1164_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0 4651; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 4652; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 4653; GFX1164_ITERATIVE-NEXT: ; %bb.3: 4654; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 4655; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, s2 4656; GFX1164_ITERATIVE-NEXT: ds_sub_u32 v0, v1 4657; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 4658; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv 4659; GFX1164_ITERATIVE-NEXT: .LBB11_4: 4660; GFX1164_ITERATIVE-NEXT: s_endpgm 4661; 4662; GFX1132_ITERATIVE-LABEL: sub_i32_varying_nouse: 4663; GFX1132_ITERATIVE: ; %bb.0: ; %entry 4664; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v0, 0x3ff, v0 4665; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo 4666; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, 0 4667; GFX1132_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop 4668; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 4669; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 4670; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 4671; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 4672; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 4673; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) 4674; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 4675; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 4676; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 4677; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 4678; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 4679; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4680; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo 4681; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) 4682; GFX1132_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0 4683; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 4684; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 4685; GFX1132_ITERATIVE-NEXT: ; %bb.3: 4686; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 4687; GFX1132_ITERATIVE-NEXT: ds_sub_u32 v0, v1 4688; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 4689; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv 4690; GFX1132_ITERATIVE-NEXT: .LBB11_4: 4691; GFX1132_ITERATIVE-NEXT: s_endpgm 4692; 4693; GFX7LESS_DPP-LABEL: sub_i32_varying_nouse: 4694; GFX7LESS_DPP: ; %bb.0: ; %entry 4695; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 4696; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 4697; GFX7LESS_DPP-NEXT: ds_sub_u32 v1, v0 4698; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) 4699; GFX7LESS_DPP-NEXT: s_endpgm 4700; 4701; GFX8_DPP-LABEL: sub_i32_varying_nouse: 4702; GFX8_DPP: ; %bb.0: ; %entry 4703; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 4704; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 4705; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 4706; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 4707; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] 4708; GFX8_DPP-NEXT: s_nop 1 4709; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4710; GFX8_DPP-NEXT: s_nop 1 4711; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 4712; GFX8_DPP-NEXT: s_nop 1 4713; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 
bound_ctrl:1 4714; GFX8_DPP-NEXT: s_nop 1 4715; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4716; GFX8_DPP-NEXT: s_nop 1 4717; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 4718; GFX8_DPP-NEXT: s_nop 1 4719; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 4720; GFX8_DPP-NEXT: v_readlane_b32 s2, v1, 63 4721; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] 4722; GFX8_DPP-NEXT: s_mov_b32 s0, s2 4723; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 4724; GFX8_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc 4725; GFX8_DPP-NEXT: s_cbranch_execz .LBB11_2 4726; GFX8_DPP-NEXT: ; %bb.1: 4727; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s0 4728; GFX8_DPP-NEXT: s_mov_b32 m0, -1 4729; GFX8_DPP-NEXT: ds_sub_u32 v2, v0 4730; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) 4731; GFX8_DPP-NEXT: .LBB11_2: 4732; GFX8_DPP-NEXT: s_endpgm 4733; 4734; GFX9_DPP-LABEL: sub_i32_varying_nouse: 4735; GFX9_DPP: ; %bb.0: ; %entry 4736; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 4737; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 4738; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 4739; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 4740; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] 4741; GFX9_DPP-NEXT: s_nop 1 4742; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4743; GFX9_DPP-NEXT: s_nop 1 4744; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 4745; GFX9_DPP-NEXT: s_nop 1 4746; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4747; GFX9_DPP-NEXT: s_nop 1 4748; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4749; GFX9_DPP-NEXT: s_nop 1 4750; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 4751; GFX9_DPP-NEXT: s_nop 1 4752; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 4753; 
GFX9_DPP-NEXT: v_readlane_b32 s2, v1, 63 4754; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] 4755; GFX9_DPP-NEXT: s_mov_b32 s0, s2 4756; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 4757; GFX9_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc 4758; GFX9_DPP-NEXT: s_cbranch_execz .LBB11_2 4759; GFX9_DPP-NEXT: ; %bb.1: 4760; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s0 4761; GFX9_DPP-NEXT: ds_sub_u32 v2, v0 4762; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) 4763; GFX9_DPP-NEXT: .LBB11_2: 4764; GFX9_DPP-NEXT: s_endpgm 4765; 4766; GFX1064_DPP-LABEL: sub_i32_varying_nouse: 4767; GFX1064_DPP: ; %bb.0: ; %entry 4768; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 4769; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] 4770; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4771; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 4772; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4773; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4774; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 4775; GFX1064_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 4776; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] 4777; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4778; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 4779; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 0 4780; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 32 4781; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] 4782; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0 4783; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 4784; GFX1064_DPP-NEXT: s_add_i32 s0, s2, s3 4785; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 4786; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc 4787; GFX1064_DPP-NEXT: s_cbranch_execz .LBB11_2 4788; GFX1064_DPP-NEXT: ; %bb.1: 4789; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, s0 4790; GFX1064_DPP-NEXT: ds_sub_u32 v0, v3 4791; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) 4792; 
GFX1064_DPP-NEXT: buffer_gl0_inv 4793; GFX1064_DPP-NEXT: .LBB11_2: 4794; GFX1064_DPP-NEXT: s_endpgm 4795; 4796; GFX1032_DPP-LABEL: sub_i32_varying_nouse: 4797; GFX1032_DPP: ; %bb.0: ; %entry 4798; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 4799; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 4800; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4801; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 4802; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4803; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4804; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 4805; GFX1032_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 4806; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 4807; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 4808; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 4809; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v1 4810; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 4811; GFX1032_DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo 4812; GFX1032_DPP-NEXT: s_cbranch_execz .LBB11_2 4813; GFX1032_DPP-NEXT: ; %bb.1: 4814; GFX1032_DPP-NEXT: ds_sub_u32 v0, v3 4815; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) 4816; GFX1032_DPP-NEXT: buffer_gl0_inv 4817; GFX1032_DPP-NEXT: .LBB11_2: 4818; GFX1032_DPP-NEXT: s_endpgm 4819; 4820; GFX1164_DPP-LABEL: sub_i32_varying_nouse: 4821; GFX1164_DPP: ; %bb.0: ; %entry 4822; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 4823; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 4824; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 4825; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] 4826; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4827; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4828; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 
row_mask:0xf bank_mask:0xf bound_ctrl:1 4829; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4830; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4831; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4832; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4833; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 4834; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 4835; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 4836; GFX1164_DPP-NEXT: v_permlane64_b32 v2, v1 4837; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] 4838; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4839; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 4840; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe 4841; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 4842; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] 4843; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) 4844; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 4845; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 4846; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1 4847; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], exec 4848; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) 4849; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 4850; GFX1164_DPP-NEXT: s_cbranch_execz .LBB11_2 4851; GFX1164_DPP-NEXT: ; %bb.1: 4852; GFX1164_DPP-NEXT: ds_sub_u32 v0, v3 4853; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) 4854; GFX1164_DPP-NEXT: buffer_gl0_inv 4855; GFX1164_DPP-NEXT: .LBB11_2: 4856; GFX1164_DPP-NEXT: s_endpgm 4857; 4858; GFX1132_DPP-LABEL: sub_i32_varying_nouse: 4859; GFX1132_DPP: ; %bb.0: ; %entry 4860; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 4861; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 4862; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 4863; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 
4864; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4865; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4866; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 4867; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4868; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4869; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4870; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4871; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 4872; GFX1132_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 4873; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 4874; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) 4875; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 4876; GFX1132_DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v1 4877; GFX1132_DPP-NEXT: s_mov_b32 s0, exec_lo 4878; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) 4879; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 4880; GFX1132_DPP-NEXT: s_cbranch_execz .LBB11_2 4881; GFX1132_DPP-NEXT: ; %bb.1: 4882; GFX1132_DPP-NEXT: ds_sub_u32 v0, v3 4883; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) 4884; GFX1132_DPP-NEXT: buffer_gl0_inv 4885; GFX1132_DPP-NEXT: .LBB11_2: 4886; GFX1132_DPP-NEXT: s_endpgm 4887entry: 4888 %lane = call i32 @llvm.amdgcn.workitem.id.x() 4889 %old = atomicrmw sub ptr addrspace(3) @local_var32, i32 %lane acq_rel 4890 ret void 4891} 4892 4893define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { 4894; GFX7LESS-LABEL: sub_i64_constant: 4895; GFX7LESS: ; %bb.0: ; %entry 4896; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 4897; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 4898; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s3, v0 4899; GFX7LESS-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v2 4900; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 4901; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc 4902; GFX7LESS-NEXT: s_cbranch_execz .LBB12_2 4903; GFX7LESS-NEXT: ; %bb.1: 4904; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 4905; GFX7LESS-NEXT: s_mul_i32 s2, s2, 5 4906; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 4907; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2 4908; GFX7LESS-NEXT: s_mov_b32 m0, -1 4909; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 4910; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4911; GFX7LESS-NEXT: .LBB12_2: 4912; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] 4913; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 4914; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 4915; GFX7LESS-NEXT: s_mov_b32 s2, -1 4916; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 4917; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v0 4918; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 4919; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2 4920; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 4921; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s5, v0 4922; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 4923; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4924; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4925; GFX7LESS-NEXT: s_endpgm 4926; 4927; GFX8-LABEL: sub_i64_constant: 4928; GFX8: ; %bb.0: ; %entry 4929; GFX8-NEXT: s_mov_b64 s[2:3], exec 4930; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 4931; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0 4932; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 4933; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 4934; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc 4935; GFX8-NEXT: s_cbranch_execz .LBB12_2 4936; GFX8-NEXT: ; %bb.1: 4937; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 4938; GFX8-NEXT: s_mul_i32 s2, s2, 5 4939; GFX8-NEXT: v_mov_b32_e32 v0, s2 4940; GFX8-NEXT: v_mov_b32_e32 v1, 0 4941; GFX8-NEXT: s_mov_b32 m0, -1 4942; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 4943; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4944; GFX8-NEXT: .LBB12_2: 4945; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] 4946; GFX8-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x24 4947; GFX8-NEXT: v_readfirstlane_b32 s4, v1 4948; GFX8-NEXT: v_readfirstlane_b32 s5, v0 4949; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v2 4950; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 4951; GFX8-NEXT: v_mov_b32_e32 v2, s4 4952; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s5, v0 4953; GFX8-NEXT: s_mov_b32 s3, 0xf000 4954; GFX8-NEXT: s_mov_b32 s2, -1 4955; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 4956; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4957; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4958; GFX8-NEXT: s_endpgm 4959; 4960; GFX9-LABEL: sub_i64_constant: 4961; GFX9: ; %bb.0: ; %entry 4962; GFX9-NEXT: s_mov_b64 s[2:3], exec 4963; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 4964; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0 4965; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 4966; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 4967; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 4968; GFX9-NEXT: s_cbranch_execz .LBB12_2 4969; GFX9-NEXT: ; %bb.1: 4970; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 4971; GFX9-NEXT: s_mul_i32 s2, s2, 5 4972; GFX9-NEXT: v_mov_b32_e32 v0, s2 4973; GFX9-NEXT: v_mov_b32_e32 v1, 0 4974; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 4975; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4976; GFX9-NEXT: .LBB12_2: 4977; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 4978; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 4979; GFX9-NEXT: v_readfirstlane_b32 s4, v1 4980; GFX9-NEXT: v_readfirstlane_b32 s5, v0 4981; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v2 4982; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 4983; GFX9-NEXT: v_mov_b32_e32 v2, s4 4984; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s5, v0 4985; GFX9-NEXT: s_mov_b32 s3, 0xf000 4986; GFX9-NEXT: s_mov_b32 s2, -1 4987; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc 4988; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4989; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4990; GFX9-NEXT: s_endpgm 4991; 4992; GFX1064-LABEL: sub_i64_constant: 4993; GFX1064: ; %bb.0: ; %entry 4994; GFX1064-NEXT: s_mov_b64 s[2:3], exec 4995; GFX1064-NEXT: 
v_mbcnt_lo_u32_b32 v0, s2, 0 4996; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0 4997; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 4998; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 4999; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc 5000; GFX1064-NEXT: s_cbranch_execz .LBB12_2 5001; GFX1064-NEXT: ; %bb.1: 5002; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 5003; GFX1064-NEXT: v_mov_b32_e32 v1, 0 5004; GFX1064-NEXT: s_mul_i32 s2, s2, 5 5005; GFX1064-NEXT: v_mov_b32_e32 v0, s2 5006; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 5007; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 5008; GFX1064-NEXT: buffer_gl0_inv 5009; GFX1064-NEXT: .LBB12_2: 5010; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 5011; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] 5012; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 5013; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 5014; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v2 5015; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 5016; GFX1064-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 5017; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v0 5018; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc 5019; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 5020; GFX1064-NEXT: s_mov_b32 s2, -1 5021; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 5022; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5023; GFX1064-NEXT: s_endpgm 5024; 5025; GFX1032-LABEL: sub_i64_constant: 5026; GFX1032: ; %bb.0: ; %entry 5027; GFX1032-NEXT: s_mov_b32 s1, exec_lo 5028; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 5029; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s1, 0 5030; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 5031; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 5032; GFX1032-NEXT: s_cbranch_execz .LBB12_2 5033; GFX1032-NEXT: ; %bb.1: 5034; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s1 5035; GFX1032-NEXT: v_mov_b32_e32 v1, 0 5036; GFX1032-NEXT: s_mul_i32 s1, s1, 5 5037; GFX1032-NEXT: v_mov_b32_e32 v0, s1 5038; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 5039; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 5040; GFX1032-NEXT: buffer_gl0_inv 5041; 
GFX1032-NEXT: .LBB12_2: 5042; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 5043; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 5044; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 5045; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 5046; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v2 5047; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 5048; GFX1032-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 5049; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0 5050; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo 5051; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 5052; GFX1032-NEXT: s_mov_b32 s2, -1 5053; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 5054; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5055; GFX1032-NEXT: s_endpgm 5056; 5057; GFX1164-LABEL: sub_i64_constant: 5058; GFX1164: ; %bb.0: ; %entry 5059; GFX1164-NEXT: s_mov_b64 s[2:3], exec 5060; GFX1164-NEXT: s_mov_b64 s[0:1], exec 5061; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 5062; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5063; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0 5064; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 5065; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 5066; GFX1164-NEXT: s_cbranch_execz .LBB12_2 5067; GFX1164-NEXT: ; %bb.1: 5068; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 5069; GFX1164-NEXT: v_mov_b32_e32 v1, 0 5070; GFX1164-NEXT: s_mul_i32 s2, s2, 5 5071; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 5072; GFX1164-NEXT: v_mov_b32_e32 v0, s2 5073; GFX1164-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 5074; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 5075; GFX1164-NEXT: buffer_gl0_inv 5076; GFX1164-NEXT: .LBB12_2: 5077; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] 5078; GFX1164-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 5079; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 5080; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v2 5081; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 5082; GFX1164-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 5083; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 5084; 
GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v0 5085; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc 5086; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 5087; GFX1164-NEXT: s_mov_b32 s2, -1 5088; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 5089; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 5090; GFX1164-NEXT: s_endpgm 5091; 5092; GFX1132-LABEL: sub_i64_constant: 5093; GFX1132: ; %bb.0: ; %entry 5094; GFX1132-NEXT: s_mov_b32 s1, exec_lo 5095; GFX1132-NEXT: s_mov_b32 s0, exec_lo 5096; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s1, 0 5097; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 5098; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 5099; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 5100; GFX1132-NEXT: s_cbranch_execz .LBB12_2 5101; GFX1132-NEXT: ; %bb.1: 5102; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s1 5103; GFX1132-NEXT: v_mov_b32_e32 v1, 0 5104; GFX1132-NEXT: s_mul_i32 s1, s1, 5 5105; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 5106; GFX1132-NEXT: v_mov_b32_e32 v0, s1 5107; GFX1132-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 5108; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 5109; GFX1132-NEXT: buffer_gl0_inv 5110; GFX1132-NEXT: .LBB12_2: 5111; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 5112; GFX1132-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 5113; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 5114; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v2 5115; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 5116; GFX1132-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 5117; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 5118; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0 5119; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo 5120; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 5121; GFX1132-NEXT: s_mov_b32 s2, -1 5122; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 5123; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 5124; GFX1132-NEXT: s_endpgm 5125entry: 5126 %old = atomicrmw sub ptr addrspace(3) @local_var64, i64 5 acq_rel 5127 store i64 %old, ptr addrspace(1) %out 5128 ret void 5129} 5130 
5131define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) { 5132; GFX7LESS-LABEL: sub_i64_uniform: 5133; GFX7LESS: ; %bb.0: ; %entry 5134; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 5135; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 5136; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 5137; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 5138; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 5139; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 5140; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 5141; GFX7LESS-NEXT: s_cbranch_execz .LBB13_2 5142; GFX7LESS-NEXT: ; %bb.1: 5143; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 5144; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0 5145; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 5146; GFX7LESS-NEXT: s_mul_i32 s7, s3, s6 5147; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 5148; GFX7LESS-NEXT: v_mul_hi_u32 v0, s2, v0 5149; GFX7LESS-NEXT: s_mul_i32 s6, s2, s6 5150; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, s7, v0 5151; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 5152; GFX7LESS-NEXT: s_mov_b32 m0, -1 5153; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] 5154; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 5155; GFX7LESS-NEXT: .LBB13_2: 5156; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 5157; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 5158; GFX7LESS-NEXT: s_mov_b32 s6, -1 5159; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 5160; GFX7LESS-NEXT: s_mov_b32 s4, s0 5161; GFX7LESS-NEXT: s_mov_b32 s5, s1 5162; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 5163; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v0 5164; GFX7LESS-NEXT: v_mul_lo_u32 v0, s3, v2 5165; GFX7LESS-NEXT: v_mul_hi_u32 v1, s2, v2 5166; GFX7LESS-NEXT: v_mul_lo_u32 v2, s2, v2 5167; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v1, v0 5168; GFX7LESS-NEXT: v_mov_b32_e32 v3, s0 5169; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s1, v2 5170; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc 5171; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 5172; GFX7LESS-NEXT: s_endpgm 5173; 5174; GFX8-LABEL: sub_i64_uniform: 5175; GFX8: ; %bb.0: ; 
%entry 5176; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 5177; GFX8-NEXT: s_mov_b64 s[6:7], exec 5178; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 5179; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 5180; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 5181; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 5182; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 5183; GFX8-NEXT: s_cbranch_execz .LBB13_2 5184; GFX8-NEXT: ; %bb.1: 5185; GFX8-NEXT: s_bcnt1_i32_b64 s8, s[6:7] 5186; GFX8-NEXT: v_mov_b32_e32 v0, s8 5187; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5188; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s2, v0, 0 5189; GFX8-NEXT: s_mul_i32 s6, s3, s8 5190; GFX8-NEXT: v_mov_b32_e32 v3, 0 5191; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1 5192; GFX8-NEXT: s_mov_b32 m0, -1 5193; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] 5194; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5195; GFX8-NEXT: .LBB13_2: 5196; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 5197; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5198; GFX8-NEXT: s_mov_b32 s4, s0 5199; GFX8-NEXT: s_mov_b32 s5, s1 5200; GFX8-NEXT: v_mul_lo_u32 v4, s3, v2 5201; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 0 5202; GFX8-NEXT: v_readfirstlane_b32 s0, v1 5203; GFX8-NEXT: v_readfirstlane_b32 s1, v0 5204; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v4 5205; GFX8-NEXT: v_mov_b32_e32 v3, s0 5206; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s1, v2 5207; GFX8-NEXT: s_mov_b32 s7, 0xf000 5208; GFX8-NEXT: s_mov_b32 s6, -1 5209; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc 5210; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 5211; GFX8-NEXT: s_endpgm 5212; 5213; GFX9-LABEL: sub_i64_uniform: 5214; GFX9: ; %bb.0: ; %entry 5215; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 5216; GFX9-NEXT: s_mov_b64 s[6:7], exec 5217; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 5218; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 5219; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 5220; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 5221; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 5222; GFX9-NEXT: s_cbranch_execz .LBB13_2 5223; GFX9-NEXT: ; %bb.1: 5224; 
GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 5225; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5226; GFX9-NEXT: s_mul_i32 s7, s3, s6 5227; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 5228; GFX9-NEXT: s_add_i32 s8, s8, s7 5229; GFX9-NEXT: s_mul_i32 s6, s2, s6 5230; GFX9-NEXT: v_mov_b32_e32 v0, s6 5231; GFX9-NEXT: v_mov_b32_e32 v1, s8 5232; GFX9-NEXT: v_mov_b32_e32 v3, 0 5233; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] 5234; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5235; GFX9-NEXT: .LBB13_2: 5236; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 5237; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5238; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0 5239; GFX9-NEXT: s_mov_b32 s4, s0 5240; GFX9-NEXT: s_mov_b32 s5, s1 5241; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5] 5242; GFX9-NEXT: v_readfirstlane_b32 s0, v1 5243; GFX9-NEXT: v_readfirstlane_b32 s1, v0 5244; GFX9-NEXT: v_mov_b32_e32 v1, v4 5245; GFX9-NEXT: v_mov_b32_e32 v2, s0 5246; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s1, v3 5247; GFX9-NEXT: s_mov_b32 s7, 0xf000 5248; GFX9-NEXT: s_mov_b32 s6, -1 5249; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc 5250; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 5251; GFX9-NEXT: s_endpgm 5252; 5253; GFX1064-LABEL: sub_i64_uniform: 5254; GFX1064: ; %bb.0: ; %entry 5255; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 5256; GFX1064-NEXT: s_mov_b64 s[6:7], exec 5257; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 5258; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 5259; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 5260; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 5261; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 5262; GFX1064-NEXT: s_cbranch_execz .LBB13_2 5263; GFX1064-NEXT: ; %bb.1: 5264; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 5265; GFX1064-NEXT: v_mov_b32_e32 v3, 0 5266; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 5267; GFX1064-NEXT: s_mul_i32 s7, s3, s6 5268; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6 5269; GFX1064-NEXT: s_mul_i32 s6, s2, s6 5270; GFX1064-NEXT: s_add_i32 s8, s8, s7 5271; GFX1064-NEXT: v_mov_b32_e32 v0, s6 5272; 
GFX1064-NEXT: v_mov_b32_e32 v1, s8 5273; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] 5274; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 5275; GFX1064-NEXT: buffer_gl0_inv 5276; GFX1064-NEXT: .LBB13_2: 5277; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 5278; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 5279; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 5280; GFX1064-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0 5281; GFX1064-NEXT: v_readfirstlane_b32 s4, v1 5282; GFX1064-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5] 5283; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 5284; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 5285; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v3 5286; GFX1064-NEXT: v_mov_b32_e32 v1, v4 5287; GFX1064-NEXT: s_mov_b32 s2, -1 5288; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc 5289; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5290; GFX1064-NEXT: s_endpgm 5291; 5292; GFX1032-LABEL: sub_i64_uniform: 5293; GFX1032: ; %bb.0: ; %entry 5294; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 5295; GFX1032-NEXT: s_mov_b32 s6, exec_lo 5296; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 5297; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s6, 0 5298; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 5299; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 5300; GFX1032-NEXT: s_cbranch_execz .LBB13_2 5301; GFX1032-NEXT: ; %bb.1: 5302; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s6 5303; GFX1032-NEXT: v_mov_b32_e32 v3, 0 5304; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 5305; GFX1032-NEXT: s_mul_i32 s6, s3, s5 5306; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5 5307; GFX1032-NEXT: s_mul_i32 s5, s2, s5 5308; GFX1032-NEXT: s_add_i32 s7, s7, s6 5309; GFX1032-NEXT: v_mov_b32_e32 v0, s5 5310; GFX1032-NEXT: v_mov_b32_e32 v1, s7 5311; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] 5312; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 5313; GFX1032-NEXT: buffer_gl0_inv 5314; GFX1032-NEXT: .LBB13_2: 5315; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 5316; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 5317; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 5318; 
GFX1032-NEXT: v_mad_u64_u32 v[3:4], s2, s2, v2, 0 5319; GFX1032-NEXT: v_readfirstlane_b32 s4, v1 5320; GFX1032-NEXT: v_mad_u64_u32 v[4:5], s2, s3, v2, v[4:5] 5321; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 5322; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 5323; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3 5324; GFX1032-NEXT: v_mov_b32_e32 v1, v4 5325; GFX1032-NEXT: s_mov_b32 s2, -1 5326; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo 5327; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5328; GFX1032-NEXT: s_endpgm 5329; 5330; GFX1164-LABEL: sub_i64_uniform: 5331; GFX1164: ; %bb.0: ; %entry 5332; GFX1164-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 5333; GFX1164-NEXT: s_mov_b64 s[6:7], exec 5334; GFX1164-NEXT: s_mov_b64 s[4:5], exec 5335; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 5336; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5337; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 5338; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 5339; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 5340; GFX1164-NEXT: s_cbranch_execz .LBB13_2 5341; GFX1164-NEXT: ; %bb.1: 5342; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 5343; GFX1164-NEXT: v_mov_b32_e32 v3, 0 5344; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 5345; GFX1164-NEXT: s_mul_i32 s7, s3, s6 5346; GFX1164-NEXT: s_mul_hi_u32 s8, s2, s6 5347; GFX1164-NEXT: s_mul_i32 s6, s2, s6 5348; GFX1164-NEXT: s_add_i32 s8, s8, s7 5349; GFX1164-NEXT: v_mov_b32_e32 v0, s6 5350; GFX1164-NEXT: v_mov_b32_e32 v1, s8 5351; GFX1164-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] 5352; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 5353; GFX1164-NEXT: buffer_gl0_inv 5354; GFX1164-NEXT: .LBB13_2: 5355; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 5356; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 5357; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, 0 5358; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 5359; GFX1164-NEXT: v_readfirstlane_b32 s4, v1 5360; GFX1164-NEXT: s_waitcnt_depctr 0xfff 5361; GFX1164-NEXT: v_mad_u64_u32 v[5:6], null, s3, v2, v[4:5] 
5362; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v3 5363; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 5364; GFX1164-NEXT: s_mov_b32 s2, -1 5365; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 5366; GFX1164-NEXT: v_mov_b32_e32 v1, v5 5367; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc 5368; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 5369; GFX1164-NEXT: s_endpgm 5370; 5371; GFX1132-LABEL: sub_i64_uniform: 5372; GFX1132: ; %bb.0: ; %entry 5373; GFX1132-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 5374; GFX1132-NEXT: s_mov_b32 s6, exec_lo 5375; GFX1132-NEXT: s_mov_b32 s4, exec_lo 5376; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s6, 0 5377; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 5378; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 5379; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 5380; GFX1132-NEXT: s_cbranch_execz .LBB13_2 5381; GFX1132-NEXT: ; %bb.1: 5382; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s6 5383; GFX1132-NEXT: v_mov_b32_e32 v3, 0 5384; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 5385; GFX1132-NEXT: s_mul_i32 s6, s3, s5 5386; GFX1132-NEXT: s_mul_hi_u32 s7, s2, s5 5387; GFX1132-NEXT: s_mul_i32 s5, s2, s5 5388; GFX1132-NEXT: s_add_i32 s7, s7, s6 5389; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 5390; GFX1132-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s7 5391; GFX1132-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] 5392; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 5393; GFX1132-NEXT: buffer_gl0_inv 5394; GFX1132-NEXT: .LBB13_2: 5395; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4 5396; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 5397; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, 0 5398; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 5399; GFX1132-NEXT: v_readfirstlane_b32 s4, v1 5400; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 5401; GFX1132-NEXT: v_mad_u64_u32 v[5:6], null, s3, v2, v[4:5] 5402; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3 5403; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 5404; GFX1132-NEXT: s_mov_b32 s2, -1 
5405; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 5406; GFX1132-NEXT: v_mov_b32_e32 v1, v5 5407; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo 5408; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 5409; GFX1132-NEXT: s_endpgm 5410entry: 5411 %old = atomicrmw sub ptr addrspace(3) @local_var64, i64 %subitive acq_rel 5412 store i64 %old, ptr addrspace(1) %out 5413 ret void 5414} 5415 5416define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { 5417; GFX7LESS_ITERATIVE-LABEL: sub_i64_varying: 5418; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry 5419; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec 5420; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 5421; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 5422; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 5423; GFX7LESS_ITERATIVE-NEXT: .LBB14_1: ; %ComputeLoop 5424; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 5425; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3] 5426; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s6 5427; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6 5428; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6 5429; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 5430; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 5431; GFX7LESS_ITERATIVE-NEXT: s_add_u32 s0, s0, s8 5432; GFX7LESS_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 5433; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 5434; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] 5435; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0 5436; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] 5437; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB14_1 5438; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 5439; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 5440; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 5441; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5442; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: 
$vgpr3_vgpr4 5443; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc 5444; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] 5445; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB14_4 5446; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: 5447; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 5448; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 5449; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 5450; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 5451; GFX7LESS_ITERATIVE-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4] 5452; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 5453; GFX7LESS_ITERATIVE-NEXT: .LBB14_4: 5454; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] 5455; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 5456; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 5457; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 5458; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 5459; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 5460; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 5461; GFX7LESS_ITERATIVE-NEXT: v_sub_i32_e32 v0, vcc, s5, v1 5462; GFX7LESS_ITERATIVE-NEXT: v_subb_u32_e32 v1, vcc, v3, v2, vcc 5463; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 5464; GFX7LESS_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5465; GFX7LESS_ITERATIVE-NEXT: s_endpgm 5466; 5467; GFX8_ITERATIVE-LABEL: sub_i64_varying: 5468; GFX8_ITERATIVE: ; %bb.0: ; %entry 5469; GFX8_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec 5470; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 5471; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 5472; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 5473; GFX8_ITERATIVE-NEXT: .LBB14_1: ; %ComputeLoop 5474; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 5475; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3] 5476; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s6 5477; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6 5478; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6 5479; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 5480; GFX8_ITERATIVE-NEXT: s_add_u32 s0, s0, s8 5481; 
GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 5482; GFX8_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 5483; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 5484; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] 5485; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 5486; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1 5487; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 5488; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5489; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 5490; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5491; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 5492; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc 5493; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] 5494; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB14_4 5495; GFX8_ITERATIVE-NEXT: ; %bb.3: 5496; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 5497; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 5498; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 5499; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 5500; GFX8_ITERATIVE-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4] 5501; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 5502; GFX8_ITERATIVE-NEXT: .LBB14_4: 5503; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] 5504; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 5505; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 5506; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 5507; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 5508; GFX8_ITERATIVE-NEXT: v_sub_u32_e32 v0, vcc, s5, v1 5509; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 5510; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 5511; GFX8_ITERATIVE-NEXT: v_subb_u32_e32 v1, vcc, v3, v2, vcc 5512; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 5513; GFX8_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5514; GFX8_ITERATIVE-NEXT: s_endpgm 5515; 5516; GFX9_ITERATIVE-LABEL: sub_i64_varying: 5517; GFX9_ITERATIVE: ; %bb.0: ; %entry 5518; GFX9_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec 5519; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 5520; GFX9_ITERATIVE-NEXT: s_mov_b64 
s[0:1], 0 5521; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 5522; GFX9_ITERATIVE-NEXT: .LBB14_1: ; %ComputeLoop 5523; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 5524; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3] 5525; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s6 5526; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6 5527; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6 5528; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 5529; GFX9_ITERATIVE-NEXT: s_add_u32 s0, s0, s8 5530; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 5531; GFX9_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 5532; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 5533; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] 5534; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 5535; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1 5536; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 5537; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5538; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 5539; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5540; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 5541; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc 5542; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] 5543; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB14_4 5544; GFX9_ITERATIVE-NEXT: ; %bb.3: 5545; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 5546; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 5547; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 5548; GFX9_ITERATIVE-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4] 5549; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 5550; GFX9_ITERATIVE-NEXT: .LBB14_4: 5551; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] 5552; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 5553; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 5554; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 5555; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 5556; GFX9_ITERATIVE-NEXT: v_sub_co_u32_e32 v0, vcc, s5, v1 5557; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 5558; GFX9_ITERATIVE-NEXT: s_mov_b32 
s2, -1 5559; GFX9_ITERATIVE-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v2, vcc 5560; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 5561; GFX9_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5562; GFX9_ITERATIVE-NEXT: s_endpgm 5563; 5564; GFX1064_ITERATIVE-LABEL: sub_i64_varying: 5565; GFX1064_ITERATIVE: ; %bb.0: ; %entry 5566; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 5567; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec 5568; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 5569; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 5570; GFX1064_ITERATIVE-NEXT: .LBB14_1: ; %ComputeLoop 5571; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 5572; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3] 5573; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6 5574; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 5575; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s6 5576; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s6 5577; GFX1064_ITERATIVE-NEXT: s_add_u32 s0, s0, s7 5578; GFX1064_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8 5579; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 5580; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] 5581; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 5582; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1 5583; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 5584; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5585; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 5586; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 5587; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5588; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc 5589; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] 5590; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB14_4 5591; GFX1064_ITERATIVE-NEXT: ; %bb.3: 5592; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 5593; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 5594; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 5595; GFX1064_ITERATIVE-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4] 
5596; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 5597; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv 5598; GFX1064_ITERATIVE-NEXT: .LBB14_4: 5599; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 5600; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] 5601; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 5602; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 5603; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 5604; GFX1064_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc, s2, v1 5605; GFX1064_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc 5606; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 5607; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 5608; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 5609; GFX1064_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5610; GFX1064_ITERATIVE-NEXT: s_endpgm 5611; 5612; GFX1032_ITERATIVE-LABEL: sub_i64_varying: 5613; GFX1032_ITERATIVE: ; %bb.0: ; %entry 5614; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 5615; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo 5616; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 5617; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 5618; GFX1032_ITERATIVE-NEXT: .LBB14_1: ; %ComputeLoop 5619; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 5620; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 5621; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 5622; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 5623; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3 5624; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3 5625; GFX1032_ITERATIVE-NEXT: s_add_u32 s0, s0, s6 5626; GFX1032_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 5627; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 5628; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 5629; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 5630; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1 5631; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 5632; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5633; GFX1032_ITERATIVE-NEXT: ; 
implicit-def: $vgpr3_vgpr4 5634; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 5635; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s2, vcc_lo 5636; GFX1032_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 5637; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB14_4 5638; GFX1032_ITERATIVE-NEXT: ; %bb.3: 5639; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 5640; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 5641; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 5642; GFX1032_ITERATIVE-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4] 5643; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 5644; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv 5645; GFX1032_ITERATIVE-NEXT: .LBB14_4: 5646; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 5647; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s2 5648; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 5649; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 5650; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 5651; GFX1032_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v1 5652; GFX1032_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo 5653; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 5654; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 5655; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 5656; GFX1032_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5657; GFX1032_ITERATIVE-NEXT: s_endpgm 5658; 5659; GFX1164_ITERATIVE-LABEL: sub_i64_varying: 5660; GFX1164_ITERATIVE: ; %bb.0: ; %entry 5661; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 5662; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 5663; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec 5664; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 5665; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 5666; GFX1164_ITERATIVE-NEXT: .LBB14_1: ; %ComputeLoop 5667; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 5668; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s6, s[2:3] 5669; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) 
5670; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s6 5671; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 5672; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6 5673; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s6 5674; GFX1164_ITERATIVE-NEXT: s_add_u32 s0, s0, s7 5675; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 5676; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8 5677; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 5678; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] 5679; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 5680; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 5681; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1 5682; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 5683; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 5684; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5685; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 5686; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 5687; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 5688; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc 5689; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 5690; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] 5691; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB14_4 5692; GFX1164_ITERATIVE-NEXT: ; %bb.3: 5693; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 5694; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0 5695; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 5696; GFX1164_ITERATIVE-NEXT: ds_sub_rtn_u64 v[2:3], v4, v[2:3] 5697; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 5698; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv 5699; GFX1164_ITERATIVE-NEXT: .LBB14_4: 5700; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] 5701; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 5702; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 5703; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 5704; 
GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 5705; GFX1164_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc, s2, v0 5706; GFX1164_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc 5707; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 5708; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 5709; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 5710; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 5711; GFX1164_ITERATIVE-NEXT: s_endpgm 5712; 5713; GFX1132_ITERATIVE-LABEL: sub_i64_varying: 5714; GFX1132_ITERATIVE: ; %bb.0: ; %entry 5715; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 5716; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo 5717; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 5718; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 5719; GFX1132_ITERATIVE-NEXT: .LBB14_1: ; %ComputeLoop 5720; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 5721; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2 5722; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 5723; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 5724; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 5725; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 5726; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 5727; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) 5728; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s6 5729; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 5730; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 5731; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 5732; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 5733; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 5734; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1 5735; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 5736; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 5737; GFX1132_ITERATIVE-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 5738; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 5739; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 5740; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s2, vcc_lo 5741; GFX1132_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 5742; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB14_4 5743; GFX1132_ITERATIVE-NEXT: ; %bb.3: 5744; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1 5745; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 5746; GFX1132_ITERATIVE-NEXT: ds_sub_rtn_u64 v[2:3], v4, v[2:3] 5747; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 5748; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv 5749; GFX1132_ITERATIVE-NEXT: .LBB14_4: 5750; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s2 5751; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 5752; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 5753; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 5754; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 5755; GFX1132_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0 5756; GFX1132_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo 5757; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 5758; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 5759; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 5760; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 5761; GFX1132_ITERATIVE-NEXT: s_endpgm 5762; 5763; GFX7LESS_DPP-LABEL: sub_i64_varying: 5764; GFX7LESS_DPP: ; %bb.0: ; %entry 5765; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 5766; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 5767; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 5768; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) 5769; GFX7LESS_DPP-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 5770; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) 5771; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000 5772; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1 5773; GFX7LESS_DPP-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 
5774; GFX7LESS_DPP-NEXT: s_endpgm 5775; 5776; GFX8_DPP-LABEL: sub_i64_varying: 5777; GFX8_DPP: ; %bb.0: ; %entry 5778; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v5, exec_lo, 0 5779; GFX8_DPP-NEXT: v_mov_b32_e32 v7, 0 5780; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v5, exec_hi, v5 5781; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 5782; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] 5783; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 5784; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] 5785; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 5786; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf 5787; GFX8_DPP-NEXT: v_add_u32_e32 v2, vcc, v2, v4 5788; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5789; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 5790; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 5791; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 5792; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:2 row_mask:0xf bank_mask:0xf 5793; GFX8_DPP-NEXT: v_add_u32_e32 v2, vcc, v2, v4 5794; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:2 row_mask:0xf bank_mask:0xf 5795; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 5796; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 5797; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 5798; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf 5799; GFX8_DPP-NEXT: v_add_u32_e32 v2, vcc, v2, v4 5800; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf 5801; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 5802; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 5803; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 5804; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:8 row_mask:0xf bank_mask:0xf 5805; GFX8_DPP-NEXT: v_add_u32_e32 v2, vcc, v2, v4 5806; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:8 row_mask:0xf bank_mask:0xf 5807; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 5808; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 5809; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 5810; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 5811; 
GFX8_DPP-NEXT: v_add_u32_e32 v2, vcc, v2, v4 5812; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 5813; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 5814; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 5815; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 5816; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 5817; GFX8_DPP-NEXT: v_add_u32_e32 v2, vcc, v2, v4 5818; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 5819; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 5820; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 5821; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 5822; GFX8_DPP-NEXT: v_readlane_b32 s3, v1, 63 5823; GFX8_DPP-NEXT: v_readlane_b32 s2, v2, 63 5824; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v1 wave_shr:1 row_mask:0xf bank_mask:0xf 5825; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 5826; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] 5827; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 5828; GFX8_DPP-NEXT: ; implicit-def: $vgpr5_vgpr6 5829; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc 5830; GFX8_DPP-NEXT: s_cbranch_execz .LBB14_2 5831; GFX8_DPP-NEXT: ; %bb.1: 5832; GFX8_DPP-NEXT: v_mov_b32_e32 v6, s3 5833; GFX8_DPP-NEXT: v_mov_b32_e32 v5, s2 5834; GFX8_DPP-NEXT: s_mov_b32 m0, -1 5835; GFX8_DPP-NEXT: ds_sub_rtn_u64 v[5:6], v7, v[5:6] 5836; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) 5837; GFX8_DPP-NEXT: .LBB14_2: 5838; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] 5839; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 5840; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v6 5841; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v5 5842; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 5843; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 5844; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4 5845; GFX8_DPP-NEXT: v_sub_u32_e32 v5, vcc, s5, v5 5846; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 5847; GFX8_DPP-NEXT: s_mov_b32 s2, -1 5848; GFX8_DPP-NEXT: v_subb_u32_e32 v6, vcc, v0, v6, vcc 5849; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) 5850; GFX8_DPP-NEXT: buffer_store_dwordx2 
v[5:6], off, s[0:3], 0 5851; GFX8_DPP-NEXT: s_endpgm 5852; 5853; GFX9_DPP-LABEL: sub_i64_varying: 5854; GFX9_DPP: ; %bb.0: ; %entry 5855; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v5, exec_lo, 0 5856; GFX9_DPP-NEXT: v_mov_b32_e32 v7, 0 5857; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v5, exec_hi, v5 5858; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 5859; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] 5860; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 5861; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] 5862; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 5863; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf 5864; GFX9_DPP-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 5865; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5866; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 5867; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 5868; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 5869; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:2 row_mask:0xf bank_mask:0xf 5870; GFX9_DPP-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 5871; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:2 row_mask:0xf bank_mask:0xf 5872; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 5873; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 5874; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 5875; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf 5876; GFX9_DPP-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 5877; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf 5878; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 5879; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 5880; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 5881; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:8 row_mask:0xf bank_mask:0xf 5882; GFX9_DPP-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 5883; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:8 row_mask:0xf bank_mask:0xf 5884; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 5885; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 5886; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 5887; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_bcast:15 
row_mask:0xa bank_mask:0xf 5888; GFX9_DPP-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 5889; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 5890; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 5891; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 5892; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 5893; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 5894; GFX9_DPP-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 5895; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 5896; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 5897; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 5898; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 5899; GFX9_DPP-NEXT: v_readlane_b32 s3, v1, 63 5900; GFX9_DPP-NEXT: v_readlane_b32 s2, v2, 63 5901; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v1 wave_shr:1 row_mask:0xf bank_mask:0xf 5902; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 5903; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] 5904; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 5905; GFX9_DPP-NEXT: ; implicit-def: $vgpr5_vgpr6 5906; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc 5907; GFX9_DPP-NEXT: s_cbranch_execz .LBB14_2 5908; GFX9_DPP-NEXT: ; %bb.1: 5909; GFX9_DPP-NEXT: v_mov_b32_e32 v6, s3 5910; GFX9_DPP-NEXT: v_mov_b32_e32 v5, s2 5911; GFX9_DPP-NEXT: ds_sub_rtn_u64 v[5:6], v7, v[5:6] 5912; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) 5913; GFX9_DPP-NEXT: .LBB14_2: 5914; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] 5915; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 5916; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v6 5917; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v5 5918; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 5919; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 5920; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4 5921; GFX9_DPP-NEXT: v_sub_co_u32_e32 v5, vcc, s5, v5 5922; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 5923; GFX9_DPP-NEXT: s_mov_b32 s2, -1 5924; GFX9_DPP-NEXT: v_subb_co_u32_e32 v6, vcc, v0, v6, vcc 5925; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) 5926; GFX9_DPP-NEXT: 
buffer_store_dwordx2 v[5:6], off, s[0:3], 0 5927; GFX9_DPP-NEXT: s_endpgm 5928; 5929; GFX1064_DPP-LABEL: sub_i64_varying: 5930; GFX1064_DPP: ; %bb.0: ; %entry 5931; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 5932; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 5933; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] 5934; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s[0:1] 5935; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 5936; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, 0 5937; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 5938; GFX1064_DPP-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 5939; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, 0 5940; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf 5941; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, 0 5942; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v2, v1 5943; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v3, v4, vcc 5944; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 5945; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf 5946; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 5947; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:2 row_mask:0xf bank_mask:0xf 5948; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v6 5949; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v2, v5, vcc 5950; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, 0 5951; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:4 row_mask:0xf bank_mask:0xf 5952; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 5953; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:4 row_mask:0xf bank_mask:0xf 5954; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v4 5955; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v2, v3, vcc 5956; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 5957; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf 5958; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:8 row_mask:0xf bank_mask:0xf 5959; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v6 5960; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v2, v5, vcc 5961; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 5962; 
GFX1064_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1 5963; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v2, -1, -1 5964; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 5965; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 5966; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v3 5967; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v2, v5, vcc 5968; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 5969; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 31 5970; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 5971; GFX1064_DPP-NEXT: v_readlane_b32 s3, v2, 31 5972; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s2 5973; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, s3 5974; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 5975; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 5976; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v3 5977; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v2, v5, vcc 5978; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] 5979; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5980; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 5981; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5982; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v2 row_shr:1 row_mask:0xf bank_mask:0xf 5983; GFX1064_DPP-NEXT: v_readlane_b32 s2, v2, 15 5984; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 15 5985; GFX1064_DPP-NEXT: v_readlane_b32 s6, v2, 31 5986; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 31 5987; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 47 5988; GFX1064_DPP-NEXT: v_writelane_b32 v8, s2, 16 5989; GFX1064_DPP-NEXT: v_writelane_b32 v7, s3, 16 5990; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 63 5991; GFX1064_DPP-NEXT: v_readlane_b32 s9, v2, 47 5992; GFX1064_DPP-NEXT: v_readlane_b32 s3, v2, 63 5993; GFX1064_DPP-NEXT: v_writelane_b32 v8, s6, 32 5994; GFX1064_DPP-NEXT: v_writelane_b32 v7, s7, 32 5995; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] 5996; GFX1064_DPP-NEXT: 
v_mbcnt_hi_u32_b32 v9, exec_hi, v0 5997; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 5998; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 5999; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] 6000; GFX1064_DPP-NEXT: v_writelane_b32 v8, s9, 48 6001; GFX1064_DPP-NEXT: v_writelane_b32 v7, s8, 48 6002; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] 6003; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 6004; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 6005; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 6006; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc 6007; GFX1064_DPP-NEXT: s_cbranch_execz .LBB14_2 6008; GFX1064_DPP-NEXT: ; %bb.1: 6009; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1 6010; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s0 6011; GFX1064_DPP-NEXT: ds_sub_rtn_u64 v[9:10], v0, v[9:10] 6012; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) 6013; GFX1064_DPP-NEXT: buffer_gl0_inv 6014; GFX1064_DPP-NEXT: .LBB14_2: 6015; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 6016; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[6:7] 6017; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 6018; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v9 6019; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, v7 6020; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, v8 6021; GFX1064_DPP-NEXT: s_mov_b32 null, 0 6022; GFX1064_DPP-NEXT: v_readfirstlane_b32 s4, v10 6023; GFX1064_DPP-NEXT: v_sub_co_u32 v9, vcc, s3, v11 6024; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 6025; GFX1064_DPP-NEXT: v_sub_co_ci_u32_e32 v10, vcc, s4, v12, vcc 6026; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) 6027; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0 6028; GFX1064_DPP-NEXT: s_endpgm 6029; 6030; GFX1032_DPP-LABEL: sub_i64_varying: 6031; GFX1032_DPP: ; %bb.0: ; %entry 6032; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 6033; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 6034; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s2 6035; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s2 6036; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 6037; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, 0 6038; GFX1032_DPP-NEXT: 
v_mov_b32_e32 v5, 0 6039; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 6040; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, 0 6041; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf 6042; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, 0 6043; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v2, v1 6044; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v3, v4, vcc_lo 6045; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 6046; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf 6047; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 6048; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:2 row_mask:0xf bank_mask:0xf 6049; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v6 6050; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo 6051; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, 0 6052; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:4 row_mask:0xf bank_mask:0xf 6053; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0 6054; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:4 row_mask:0xf bank_mask:0xf 6055; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v4 6056; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v3, vcc_lo 6057; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 6058; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf 6059; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:8 row_mask:0xf bank_mask:0xf 6060; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v6 6061; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo 6062; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0 6063; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1 6064; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v2, -1, -1 6065; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 6066; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 6067; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3 6068; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo 6069; GFX1032_DPP-NEXT: v_readlane_b32 
s3, v1, 15 6070; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31 6071; GFX1032_DPP-NEXT: v_readlane_b32 s1, v2, 31 6072; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6073; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v2 row_shr:1 row_mask:0xf bank_mask:0xf 6074; GFX1032_DPP-NEXT: v_readlane_b32 s6, v2, 15 6075; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 6076; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v9, exec_lo, 0 6077; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 6078; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 6079; GFX1032_DPP-NEXT: v_writelane_b32 v8, s6, 16 6080; GFX1032_DPP-NEXT: v_writelane_b32 v7, s3, 16 6081; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 6082; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 6083; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9 6084; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 6085; GFX1032_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo 6086; GFX1032_DPP-NEXT: s_cbranch_execz .LBB14_2 6087; GFX1032_DPP-NEXT: ; %bb.1: 6088; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s1 6089; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s0 6090; GFX1032_DPP-NEXT: ds_sub_rtn_u64 v[9:10], v0, v[9:10] 6091; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) 6092; GFX1032_DPP-NEXT: buffer_gl0_inv 6093; GFX1032_DPP-NEXT: .LBB14_2: 6094; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 6095; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 6096; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 6097; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v9 6098; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, v7 6099; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, v8 6100; GFX1032_DPP-NEXT: s_mov_b32 null, 0 6101; GFX1032_DPP-NEXT: v_readfirstlane_b32 s4, v10 6102; GFX1032_DPP-NEXT: v_sub_co_u32 v9, vcc_lo, s3, v11 6103; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 6104; GFX1032_DPP-NEXT: v_sub_co_ci_u32_e32 v10, vcc_lo, s4, v12, vcc_lo 6105; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) 6106; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0 6107; GFX1032_DPP-NEXT: s_endpgm 6108; 6109; GFX1164_DPP-LABEL: sub_i64_varying: 
6110; GFX1164_DPP: ; %bb.0: ; %entry 6111; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 6112; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 6113; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) 6114; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] 6115; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 6116; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] 6117; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 6118; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, 0 6119; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 6120; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6121; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 6122; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 6123; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc 6124; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 6125; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 6126; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 6127; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf 6128; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc 6129; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) 6130; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 6131; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 6132; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf 6133; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) 6134; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc 6135; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 6136; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 6137; 
GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 6138; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf 6139; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 6140; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 6141; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc 6142; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 6143; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 6144; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 6145; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1 6146; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 6147; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 6148; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc 6149; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 6150; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 31 6151; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 6152; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s2 6153; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 31 6154; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 6155; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 6156; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v2, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 6157; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc 6158; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] 6159; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 6160; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6161; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 6162; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf 6163; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 15 6164; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 
row_mask:0xf bank_mask:0xf 6165; GFX1164_DPP-NEXT: v_readlane_b32 s3, v1, 15 6166; GFX1164_DPP-NEXT: v_readlane_b32 s6, v2, 31 6167; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 31 6168; GFX1164_DPP-NEXT: v_writelane_b32 v5, s2, 16 6169; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 63 6170; GFX1164_DPP-NEXT: v_writelane_b32 v6, s3, 16 6171; GFX1164_DPP-NEXT: v_readlane_b32 s8, v2, 47 6172; GFX1164_DPP-NEXT: v_readlane_b32 s9, v1, 47 6173; GFX1164_DPP-NEXT: v_readlane_b32 s3, v1, 63 6174; GFX1164_DPP-NEXT: v_writelane_b32 v5, s6, 32 6175; GFX1164_DPP-NEXT: v_writelane_b32 v6, s7, 32 6176; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] 6177; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 6178; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 6179; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 6180; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 6181; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] 6182; GFX1164_DPP-NEXT: v_writelane_b32 v5, s8, 48 6183; GFX1164_DPP-NEXT: v_writelane_b32 v6, s9, 48 6184; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] 6185; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 6186; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 6187; GFX1164_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 6188; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc 6189; GFX1164_DPP-NEXT: s_cbranch_execz .LBB14_2 6190; GFX1164_DPP-NEXT: ; %bb.1: 6191; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s1 6192; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s0 6193; GFX1164_DPP-NEXT: ds_sub_rtn_u64 v[7:8], v0, v[7:8] 6194; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) 6195; GFX1164_DPP-NEXT: buffer_gl0_inv 6196; GFX1164_DPP-NEXT: .LBB14_2: 6197; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[6:7] 6198; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 6199; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v7 6200; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v5 6201; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v6 6202; GFX1164_DPP-NEXT: v_readfirstlane_b32 s4, v8 6203; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) 6204; 
GFX1164_DPP-NEXT: v_sub_co_u32 v7, vcc, s3, v9 6205; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 6206; GFX1164_DPP-NEXT: v_sub_co_ci_u32_e32 v8, vcc, s4, v10, vcc 6207; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) 6208; GFX1164_DPP-NEXT: buffer_store_b64 v[7:8], off, s[0:3], 0 6209; GFX1164_DPP-NEXT: s_endpgm 6210; 6211; GFX1132_DPP-LABEL: sub_i64_varying: 6212; GFX1132_DPP: ; %bb.0: ; %entry 6213; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 6214; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 6215; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) 6216; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s2 6217; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 6218; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s2 6219; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v7, 0 6220; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, 0 6221; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 6222; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6223; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 6224; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 6225; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo 6226; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 6227; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 6228; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 6229; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf 6230; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo 6231; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) 6232; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 6233; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 6234; GFX1132_DPP-NEXT: 
v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf 6235; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) 6236; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo 6237; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc_lo, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 6238; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 6239; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 6240; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf 6241; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 6242; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 6243; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo 6244; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc_lo, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 6245; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 6246; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1 6247; GFX1132_DPP-NEXT: v_readlane_b32 s3, v2, 15 6248; GFX1132_DPP-NEXT: v_readlane_b32 s0, v2, 31 6249; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf 6250; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) 6251; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 6252; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo 6253; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) 6254; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 31 6255; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6256; GFX1132_DPP-NEXT: v_readlane_b32 s6, v1, 15 6257; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 6258; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 6259; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 6260; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 6261; GFX1132_DPP-NEXT: 
v_writelane_b32 v6, s3, 16 6262; GFX1132_DPP-NEXT: v_writelane_b32 v7, s6, 16 6263; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 6264; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 6265; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v8 6266; GFX1132_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9 6267; GFX1132_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo 6268; GFX1132_DPP-NEXT: s_cbranch_execz .LBB14_2 6269; GFX1132_DPP-NEXT: ; %bb.1: 6270; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, s1 :: v_dual_mov_b32 v8, s0 6271; GFX1132_DPP-NEXT: ds_sub_rtn_u64 v[8:9], v0, v[8:9] 6272; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) 6273; GFX1132_DPP-NEXT: buffer_gl0_inv 6274; GFX1132_DPP-NEXT: .LBB14_2: 6275; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 6276; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 6277; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v8 6278; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v6 6279; GFX1132_DPP-NEXT: v_mov_b32_e32 v11, v7 6280; GFX1132_DPP-NEXT: v_readfirstlane_b32 s4, v9 6281; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) 6282; GFX1132_DPP-NEXT: v_sub_co_u32 v8, vcc_lo, s3, v10 6283; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 6284; GFX1132_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, s4, v11, vcc_lo 6285; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) 6286; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], 0 6287; GFX1132_DPP-NEXT: s_endpgm 6288entry: 6289 %lane = call i32 @llvm.amdgcn.workitem.id.x() 6290 %zext = zext i32 %lane to i64 6291 %old = atomicrmw sub ptr addrspace(3) @local_var64, i64 %zext acq_rel 6292 store i64 %old, ptr addrspace(1) %out 6293 ret void 6294} 6295 6296define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { 6297; GFX7LESS_ITERATIVE-LABEL: and_i32_varying: 6298; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry 6299; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 6300; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 6301; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1 6302; GFX7LESS_ITERATIVE-NEXT: .LBB15_1: ; %ComputeLoop 6303; 
GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 6304; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] 6305; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s3 6306; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 6307; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 6308; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 6309; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] 6310; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 6311; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] 6312; GFX7LESS_ITERATIVE-NEXT: s_and_b32 s2, s2, s8 6313; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB15_1 6314; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 6315; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 6316; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 6317; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 6318; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr0 6319; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc 6320; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 6321; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB15_4 6322; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: 6323; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 6324; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 6325; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 6326; GFX7LESS_ITERATIVE-NEXT: ds_and_rtn_b32 v0, v0, v2 6327; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 6328; GFX7LESS_ITERATIVE-NEXT: .LBB15_4: 6329; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] 6330; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 6331; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 6332; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 6333; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 6334; GFX7LESS_ITERATIVE-NEXT: v_and_b32_e32 v0, s4, v1 6335; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 6336; GFX7LESS_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 6337; GFX7LESS_ITERATIVE-NEXT: s_endpgm 6338; 6339; GFX8_ITERATIVE-LABEL: 
and_i32_varying: 6340; GFX8_ITERATIVE: ; %bb.0: ; %entry 6341; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 6342; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 6343; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 6344; GFX8_ITERATIVE-NEXT: .LBB15_1: ; %ComputeLoop 6345; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 6346; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] 6347; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 6348; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 6349; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 6350; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 6351; GFX8_ITERATIVE-NEXT: s_and_b32 s2, s2, s8 6352; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] 6353; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 6354; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 6355; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 6356; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6357; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 6358; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 6359; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr0 6360; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc 6361; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 6362; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB15_4 6363; GFX8_ITERATIVE-NEXT: ; %bb.3: 6364; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 6365; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 6366; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 6367; GFX8_ITERATIVE-NEXT: ds_and_rtn_b32 v0, v0, v2 6368; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 6369; GFX8_ITERATIVE-NEXT: .LBB15_4: 6370; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] 6371; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 6372; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 6373; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 6374; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 6375; GFX8_ITERATIVE-NEXT: v_and_b32_e32 v0, s4, v1 6376; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 6377; GFX8_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 6378; GFX8_ITERATIVE-NEXT: 
s_endpgm 6379; 6380; GFX9_ITERATIVE-LABEL: and_i32_varying: 6381; GFX9_ITERATIVE: ; %bb.0: ; %entry 6382; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 6383; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 6384; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 6385; GFX9_ITERATIVE-NEXT: .LBB15_1: ; %ComputeLoop 6386; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 6387; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] 6388; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 6389; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 6390; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 6391; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 6392; GFX9_ITERATIVE-NEXT: s_and_b32 s2, s2, s8 6393; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] 6394; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 6395; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 6396; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 6397; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6398; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 6399; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 6400; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0 6401; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc 6402; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 6403; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB15_4 6404; GFX9_ITERATIVE-NEXT: ; %bb.3: 6405; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 6406; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 6407; GFX9_ITERATIVE-NEXT: ds_and_rtn_b32 v0, v0, v2 6408; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 6409; GFX9_ITERATIVE-NEXT: .LBB15_4: 6410; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] 6411; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 6412; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 6413; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 6414; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 6415; GFX9_ITERATIVE-NEXT: v_and_b32_e32 v0, s4, v1 6416; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 6417; GFX9_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 6418; GFX9_ITERATIVE-NEXT: 
s_endpgm 6419; 6420; GFX1064_ITERATIVE-LABEL: and_i32_varying: 6421; GFX1064_ITERATIVE: ; %bb.0: ; %entry 6422; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 6423; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 6424; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 6425; GFX1064_ITERATIVE-NEXT: .LBB15_1: ; %ComputeLoop 6426; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 6427; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] 6428; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 6429; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 6430; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 6431; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] 6432; GFX1064_ITERATIVE-NEXT: s_and_b32 s2, s2, s8 6433; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 6434; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 6435; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 6436; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6437; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 6438; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 6439; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0 6440; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc 6441; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 6442; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB15_4 6443; GFX1064_ITERATIVE-NEXT: ; %bb.3: 6444; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 6445; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 6446; GFX1064_ITERATIVE-NEXT: ds_and_rtn_b32 v0, v0, v2 6447; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 6448; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv 6449; GFX1064_ITERATIVE-NEXT: .LBB15_4: 6450; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 6451; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] 6452; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 6453; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 6454; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 6455; GFX1064_ITERATIVE-NEXT: v_and_b32_e32 v0, s2, v1 6456; 
GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 6457; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 6458; GFX1064_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 6459; GFX1064_ITERATIVE-NEXT: s_endpgm 6460; 6461; GFX1032_ITERATIVE-LABEL: and_i32_varying: 6462; GFX1032_ITERATIVE: ; %bb.0: ; %entry 6463; GFX1032_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo 6464; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, -1 6465; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 6466; GFX1032_ITERATIVE-NEXT: .LBB15_1: ; %ComputeLoop 6467; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 6468; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 6469; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 6470; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 6471; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 6472; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 6473; GFX1032_ITERATIVE-NEXT: s_and_b32 s0, s0, s3 6474; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 6475; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 6476; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 6477; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6478; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 6479; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0 6480; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo 6481; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 6482; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB15_4 6483; GFX1032_ITERATIVE-NEXT: ; %bb.3: 6484; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 6485; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 6486; GFX1032_ITERATIVE-NEXT: ds_and_rtn_b32 v0, v0, v2 6487; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 6488; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv 6489; GFX1032_ITERATIVE-NEXT: .LBB15_4: 6490; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 6491; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 6492; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 6493; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 6494; GFX1032_ITERATIVE-NEXT: 
s_mov_b32 s3, 0x31016000 6495; GFX1032_ITERATIVE-NEXT: v_and_b32_e32 v0, s2, v1 6496; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 6497; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 6498; GFX1032_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 6499; GFX1032_ITERATIVE-NEXT: s_endpgm 6500; 6501; GFX1164_ITERATIVE-LABEL: and_i32_varying: 6502; GFX1164_ITERATIVE: ; %bb.0: ; %entry 6503; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 6504; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 6505; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 6506; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 6507; GFX1164_ITERATIVE-NEXT: .LBB15_1: ; %ComputeLoop 6508; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 6509; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] 6510; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 6511; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 6512; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 6513; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 6514; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] 6515; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) 6516; GFX1164_ITERATIVE-NEXT: s_and_b32 s2, s2, s8 6517; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 6518; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 6519; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 6520; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 6521; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6522; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 6523; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 6524; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1 6525; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc 6526; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 6527; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 6528; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB15_4 6529; GFX1164_ITERATIVE-NEXT: ; %bb.3: 6530; 
GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 6531; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 6532; GFX1164_ITERATIVE-NEXT: ds_and_rtn_b32 v1, v1, v2 6533; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 6534; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv 6535; GFX1164_ITERATIVE-NEXT: .LBB15_4: 6536; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] 6537; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 6538; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 6539; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 6540; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) 6541; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v0, s2, v0 6542; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 6543; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 6544; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 6545; GFX1164_ITERATIVE-NEXT: s_endpgm 6546; 6547; GFX1132_ITERATIVE-LABEL: and_i32_varying: 6548; GFX1132_ITERATIVE: ; %bb.0: ; %entry 6549; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 6550; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo 6551; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, -1 6552; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 6553; GFX1132_ITERATIVE-NEXT: .LBB15_1: ; %ComputeLoop 6554; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 6555; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 6556; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 6557; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 6558; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 6559; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 6560; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 6561; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) 6562; GFX1132_ITERATIVE-NEXT: s_and_b32 s0, s0, s3 6563; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 6564; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 6565; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 6566; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 6567; GFX1132_ITERATIVE-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 6568; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 6569; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 6570; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo 6571; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 6572; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB15_4 6573; GFX1132_ITERATIVE-NEXT: ; %bb.3: 6574; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 6575; GFX1132_ITERATIVE-NEXT: ds_and_rtn_b32 v1, v1, v2 6576; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 6577; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv 6578; GFX1132_ITERATIVE-NEXT: .LBB15_4: 6579; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 6580; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 6581; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 6582; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 6583; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) 6584; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v0, s2, v0 6585; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 6586; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 6587; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 6588; GFX1132_ITERATIVE-NEXT: s_endpgm 6589; 6590; GFX7LESS_DPP-LABEL: and_i32_varying: 6591; GFX7LESS_DPP: ; %bb.0: ; %entry 6592; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 6593; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 6594; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 6595; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) 6596; GFX7LESS_DPP-NEXT: ds_and_rtn_b32 v0, v1, v0 6597; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) 6598; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000 6599; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1 6600; GFX7LESS_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 6601; GFX7LESS_DPP-NEXT: s_endpgm 6602; 6603; GFX8_DPP-LABEL: and_i32_varying: 6604; GFX8_DPP: ; %bb.0: ; %entry 6605; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 6606; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 6607; GFX8_DPP-NEXT: s_or_saveexec_b64 
s[0:1], -1 6608; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v0, s[0:1] 6609; GFX8_DPP-NEXT: v_mov_b32_e32 v1, -1 6610; GFX8_DPP-NEXT: s_nop 0 6611; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 6612; GFX8_DPP-NEXT: s_nop 1 6613; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 6614; GFX8_DPP-NEXT: s_nop 1 6615; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 6616; GFX8_DPP-NEXT: s_nop 1 6617; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 6618; GFX8_DPP-NEXT: s_nop 1 6619; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 6620; GFX8_DPP-NEXT: s_nop 1 6621; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 6622; GFX8_DPP-NEXT: v_readlane_b32 s2, v2, 63 6623; GFX8_DPP-NEXT: s_nop 0 6624; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 6625; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] 6626; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 6627; GFX8_DPP-NEXT: ; implicit-def: $vgpr0 6628; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc 6629; GFX8_DPP-NEXT: s_cbranch_execz .LBB15_2 6630; GFX8_DPP-NEXT: ; %bb.1: 6631; GFX8_DPP-NEXT: v_mov_b32_e32 v0, 0 6632; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s2 6633; GFX8_DPP-NEXT: s_mov_b32 m0, -1 6634; GFX8_DPP-NEXT: ds_and_rtn_b32 v0, v0, v3 6635; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) 6636; GFX8_DPP-NEXT: .LBB15_2: 6637; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] 6638; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 6639; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 6640; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v1 6641; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 6642; GFX8_DPP-NEXT: s_mov_b32 s2, -1 6643; GFX8_DPP-NEXT: v_and_b32_e32 v0, s4, v0 6644; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) 6645; GFX8_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 6646; GFX8_DPP-NEXT: s_endpgm 6647; 6648; GFX9_DPP-LABEL: and_i32_varying: 6649; GFX9_DPP: ; %bb.0: ; %entry 6650; 
GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 6651; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 6652; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 6653; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v0, s[0:1] 6654; GFX9_DPP-NEXT: v_mov_b32_e32 v1, -1 6655; GFX9_DPP-NEXT: s_nop 0 6656; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 6657; GFX9_DPP-NEXT: s_nop 1 6658; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 6659; GFX9_DPP-NEXT: s_nop 1 6660; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 6661; GFX9_DPP-NEXT: s_nop 1 6662; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 6663; GFX9_DPP-NEXT: s_nop 1 6664; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 6665; GFX9_DPP-NEXT: s_nop 1 6666; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 6667; GFX9_DPP-NEXT: v_readlane_b32 s2, v2, 63 6668; GFX9_DPP-NEXT: s_nop 0 6669; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 6670; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] 6671; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 6672; GFX9_DPP-NEXT: ; implicit-def: $vgpr0 6673; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc 6674; GFX9_DPP-NEXT: s_cbranch_execz .LBB15_2 6675; GFX9_DPP-NEXT: ; %bb.1: 6676; GFX9_DPP-NEXT: v_mov_b32_e32 v0, 0 6677; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s2 6678; GFX9_DPP-NEXT: ds_and_rtn_b32 v0, v0, v3 6679; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) 6680; GFX9_DPP-NEXT: .LBB15_2: 6681; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] 6682; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 6683; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 6684; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v1 6685; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 6686; GFX9_DPP-NEXT: s_mov_b32 s2, -1 6687; GFX9_DPP-NEXT: v_and_b32_e32 v0, s4, v0 6688; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) 6689; GFX9_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 6690; 
GFX9_DPP-NEXT: s_endpgm 6691; 6692; GFX1064_DPP-LABEL: and_i32_varying: 6693; GFX1064_DPP: ; %bb.0: ; %entry 6694; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 6695; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[0:1] 6696; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, -1 6697; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6698; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 6699; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 6700; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 6701; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 6702; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 6703; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 31 6704; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s2 6705; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 6706; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6707; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 15 6708; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 31 6709; GFX1064_DPP-NEXT: v_writelane_b32 v3, s2, 16 6710; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] 6711; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6712; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 6713; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 47 6714; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 63 6715; GFX1064_DPP-NEXT: v_writelane_b32 v3, s3, 32 6716; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] 6717; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 6718; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 6719; GFX1064_DPP-NEXT: v_writelane_b32 v3, s2, 48 6720; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] 6721; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 6722; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 6723; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 6724; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc 6725; GFX1064_DPP-NEXT: s_cbranch_execz .LBB15_2 
6726; GFX1064_DPP-NEXT: ; %bb.1: 6727; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 6728; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s6 6729; GFX1064_DPP-NEXT: s_mov_b32 s3, s6 6730; GFX1064_DPP-NEXT: ds_and_rtn_b32 v0, v0, v4 6731; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) 6732; GFX1064_DPP-NEXT: buffer_gl0_inv 6733; GFX1064_DPP-NEXT: .LBB15_2: 6734; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 6735; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[0:1] 6736; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 6737; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v0 6738; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 6739; GFX1064_DPP-NEXT: v_and_b32_e32 v0, s3, v0 6740; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 6741; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) 6742; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 6743; GFX1064_DPP-NEXT: s_endpgm 6744; 6745; GFX1032_DPP-LABEL: and_i32_varying: 6746; GFX1032_DPP: ; %bb.0: ; %entry 6747; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 6748; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s0 6749; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, -1 6750; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6751; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 6752; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 6753; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 6754; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 6755; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 6756; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 6757; GFX1032_DPP-NEXT: v_readlane_b32 s2, v1, 31 6758; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6759; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 6760; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6761; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 6762; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 6763; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 6764; 
GFX1032_DPP-NEXT: s_mov_b32 s0, s2 6765; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 6766; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 6767; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 6768; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo 6769; GFX1032_DPP-NEXT: s_cbranch_execz .LBB15_2 6770; GFX1032_DPP-NEXT: ; %bb.1: 6771; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 6772; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s0 6773; GFX1032_DPP-NEXT: ds_and_rtn_b32 v0, v0, v4 6774; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) 6775; GFX1032_DPP-NEXT: buffer_gl0_inv 6776; GFX1032_DPP-NEXT: .LBB15_2: 6777; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 6778; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 6779; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 6780; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v0 6781; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 6782; GFX1032_DPP-NEXT: v_and_b32_e32 v0, s3, v0 6783; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 6784; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) 6785; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 6786; GFX1032_DPP-NEXT: s_endpgm 6787; 6788; GFX1164_DPP-LABEL: and_i32_varying: 6789; GFX1164_DPP: ; %bb.0: ; %entry 6790; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 6791; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 6792; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 6793; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[0:1] 6794; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, -1 6795; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 6796; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6797; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 6798; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6799; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 6800; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 6801; 
GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6802; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 6803; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 6804; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6805; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 31 6806; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s2 6807; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6808; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 6809; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6810; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 15 6811; GFX1164_DPP-NEXT: v_readlane_b32 s3, v1, 31 6812; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 6813; GFX1164_DPP-NEXT: v_writelane_b32 v3, s2, 16 6814; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] 6815; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6816; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 6817; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 47 6818; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 63 6819; GFX1164_DPP-NEXT: v_writelane_b32 v3, s3, 32 6820; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] 6821; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) 6822; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 6823; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 6824; GFX1164_DPP-NEXT: v_writelane_b32 v3, s2, 48 6825; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] 6826; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 6827; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 6828; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0 6829; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc 6830; GFX1164_DPP-NEXT: s_cbranch_execz .LBB15_2 6831; GFX1164_DPP-NEXT: ; %bb.1: 6832; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 6833; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s6 6834; 
GFX1164_DPP-NEXT: s_mov_b32 s3, s6 6835; GFX1164_DPP-NEXT: ds_and_rtn_b32 v0, v0, v4 6836; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) 6837; GFX1164_DPP-NEXT: buffer_gl0_inv 6838; GFX1164_DPP-NEXT: .LBB15_2: 6839; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[0:1] 6840; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 6841; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v0 6842; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3 6843; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) 6844; GFX1164_DPP-NEXT: v_and_b32_e32 v0, s3, v0 6845; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 6846; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) 6847; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 6848; GFX1164_DPP-NEXT: s_endpgm 6849; 6850; GFX1132_DPP-LABEL: and_i32_varying: 6851; GFX1132_DPP: ; %bb.0: ; %entry 6852; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 6853; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 6854; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 6855; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s0 6856; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, -1 6857; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 6858; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6859; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 6860; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6861; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 6862; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 6863; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6864; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 6865; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 6866; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) 6867; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 
6868; GFX1132_DPP-NEXT: v_readlane_b32 s2, v1, 31 6869; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6870; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 6871; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6872; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 6873; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 6874; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 6875; GFX1132_DPP-NEXT: s_mov_b32 s0, s2 6876; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 6877; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 6878; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 6879; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo 6880; GFX1132_DPP-NEXT: s_cbranch_execz .LBB15_2 6881; GFX1132_DPP-NEXT: ; %bb.1: 6882; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 6883; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, s0 6884; GFX1132_DPP-NEXT: ds_and_rtn_b32 v0, v0, v4 6885; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) 6886; GFX1132_DPP-NEXT: buffer_gl0_inv 6887; GFX1132_DPP-NEXT: .LBB15_2: 6888; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 6889; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 6890; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v0 6891; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3 6892; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) 6893; GFX1132_DPP-NEXT: v_and_b32_e32 v0, s3, v0 6894; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 6895; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) 6896; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 6897; GFX1132_DPP-NEXT: s_endpgm 6898entry: 6899 %lane = call i32 @llvm.amdgcn.workitem.id.x() 6900 %old = atomicrmw and ptr addrspace(3) @local_var32, i32 %lane acq_rel 6901 store i32 %old, ptr addrspace(1) %out 6902 ret void 6903} 6904 6905define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { 6906; GFX7LESS_ITERATIVE-LABEL: and_i64_varying: 6907; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry 6908; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec 6909; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 6910; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 6911; 
GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 6912; GFX7LESS_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop 6913; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 6914; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] 6915; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s8 6916; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 6917; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 6918; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 6919; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 6920; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 6921; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] 6922; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[8:9], s[2:3], 0 6923; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[8:9] 6924; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] 6925; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB16_1 6926; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 6927; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 6928; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 6929; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 6930; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 6931; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc 6932; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] 6933; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB16_4 6934; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: 6935; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 6936; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 6937; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 6938; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 6939; GFX7LESS_ITERATIVE-NEXT: ds_and_rtn_b64 v[3:4], v0, v[3:4] 6940; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 6941; GFX7LESS_ITERATIVE-NEXT: .LBB16_4: 6942; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] 6943; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 6944; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 6945; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 
6946; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 6947; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 6948; GFX7LESS_ITERATIVE-NEXT: v_and_b32_e32 v2, s4, v2 6949; GFX7LESS_ITERATIVE-NEXT: v_and_b32_e32 v1, s5, v1 6950; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 6951; GFX7LESS_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 6952; GFX7LESS_ITERATIVE-NEXT: s_endpgm 6953; 6954; GFX8_ITERATIVE-LABEL: and_i64_varying: 6955; GFX8_ITERATIVE: ; %bb.0: ; %entry 6956; GFX8_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec 6957; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 6958; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 6959; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 6960; GFX8_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop 6961; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 6962; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] 6963; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8 6964; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 6965; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 6966; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 6967; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 6968; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 6969; GFX8_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] 6970; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] 6971; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 6972; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 6973; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 6974; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6975; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 6976; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 6977; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 6978; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc 6979; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] 6980; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB16_4 6981; GFX8_ITERATIVE-NEXT: ; %bb.3: 6982; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 6983; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 6984; GFX8_ITERATIVE-NEXT: 
v_mov_b32_e32 v3, s0 6985; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 6986; GFX8_ITERATIVE-NEXT: ds_and_rtn_b64 v[3:4], v0, v[3:4] 6987; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 6988; GFX8_ITERATIVE-NEXT: .LBB16_4: 6989; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] 6990; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 6991; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 6992; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 6993; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 6994; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 6995; GFX8_ITERATIVE-NEXT: v_and_b32_e32 v2, s4, v2 6996; GFX8_ITERATIVE-NEXT: v_and_b32_e32 v1, s5, v1 6997; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 6998; GFX8_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 6999; GFX8_ITERATIVE-NEXT: s_endpgm 7000; 7001; GFX9_ITERATIVE-LABEL: and_i64_varying: 7002; GFX9_ITERATIVE: ; %bb.0: ; %entry 7003; GFX9_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec 7004; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 7005; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 7006; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 7007; GFX9_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop 7008; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 7009; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] 7010; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8 7011; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 7012; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 7013; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 7014; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 7015; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 7016; GFX9_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] 7017; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] 7018; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 7019; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 7020; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 7021; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 7022; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 7023; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 
vcc, 0, v0 7024; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 7025; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc 7026; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] 7027; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB16_4 7028; GFX9_ITERATIVE-NEXT: ; %bb.3: 7029; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 7030; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 7031; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 7032; GFX9_ITERATIVE-NEXT: ds_and_rtn_b64 v[3:4], v0, v[3:4] 7033; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 7034; GFX9_ITERATIVE-NEXT: .LBB16_4: 7035; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] 7036; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 7037; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 7038; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 7039; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 7040; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 7041; GFX9_ITERATIVE-NEXT: v_and_b32_e32 v2, s4, v2 7042; GFX9_ITERATIVE-NEXT: v_and_b32_e32 v1, s5, v1 7043; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 7044; GFX9_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 7045; GFX9_ITERATIVE-NEXT: s_endpgm 7046; 7047; GFX1064_ITERATIVE-LABEL: and_i64_varying: 7048; GFX1064_ITERATIVE: ; %bb.0: ; %entry 7049; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 7050; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec 7051; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 7052; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 7053; GFX1064_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop 7054; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 7055; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3] 7056; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 7057; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 7058; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 7059; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10 7060; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10 7061; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] 7062; 
GFX1064_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] 7063; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 7064; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 7065; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 7066; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 7067; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 7068; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 7069; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 7070; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc 7071; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] 7072; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB16_4 7073; GFX1064_ITERATIVE-NEXT: ; %bb.3: 7074; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 7075; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 7076; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 7077; GFX1064_ITERATIVE-NEXT: ds_and_rtn_b64 v[3:4], v0, v[3:4] 7078; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 7079; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv 7080; GFX1064_ITERATIVE-NEXT: .LBB16_4: 7081; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 7082; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] 7083; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 7084; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v4 7085; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 7086; GFX1064_ITERATIVE-NEXT: v_and_b32_e32 v2, s2, v2 7087; GFX1064_ITERATIVE-NEXT: v_and_b32_e32 v1, s3, v1 7088; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 7089; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 7090; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 7091; GFX1064_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 7092; GFX1064_ITERATIVE-NEXT: s_endpgm 7093; 7094; GFX1032_ITERATIVE-LABEL: and_i64_varying: 7095; GFX1032_ITERATIVE: ; %bb.0: ; %entry 7096; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 7097; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo 7098; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 7099; GFX1032_ITERATIVE-NEXT: ; implicit-def: 
$vgpr1_vgpr2 7100; GFX1032_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop 7101; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 7102; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 7103; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 7104; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 7105; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3 7106; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3 7107; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3 7108; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s8 7109; GFX1032_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] 7110; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 7111; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 7112; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 7113; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 7114; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 7115; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 7116; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s2, vcc_lo 7117; GFX1032_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 7118; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB16_4 7119; GFX1032_ITERATIVE-NEXT: ; %bb.3: 7120; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 7121; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 7122; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 7123; GFX1032_ITERATIVE-NEXT: ds_and_rtn_b64 v[3:4], v0, v[3:4] 7124; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 7125; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv 7126; GFX1032_ITERATIVE-NEXT: .LBB16_4: 7127; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 7128; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s2 7129; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 7130; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v4 7131; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 7132; GFX1032_ITERATIVE-NEXT: v_and_b32_e32 v2, s2, v2 7133; GFX1032_ITERATIVE-NEXT: v_and_b32_e32 v1, s3, v1 7134; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 7135; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 7136; 
GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 7137; GFX1032_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 7138; GFX1032_ITERATIVE-NEXT: s_endpgm 7139; 7140; GFX1164_ITERATIVE-LABEL: and_i64_varying: 7141; GFX1164_ITERATIVE: ; %bb.0: ; %entry 7142; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 7143; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 7144; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec 7145; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 7146; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 7147; GFX1164_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop 7148; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 7149; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3] 7150; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 7151; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 7152; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 7153; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 7154; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10 7155; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10 7156; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9] 7157; GFX1164_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] 7158; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 7159; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 7160; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 7161; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 7162; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 7163; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 7164; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 7165; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 7166; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc 7167; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 7168; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] 7169; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB16_4 7170; GFX1164_ITERATIVE-NEXT: ; %bb.3: 7171; 
GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 7172; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0 7173; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 7174; GFX1164_ITERATIVE-NEXT: ds_and_rtn_b64 v[2:3], v4, v[2:3] 7175; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 7176; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv 7177; GFX1164_ITERATIVE-NEXT: .LBB16_4: 7178; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] 7179; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 7180; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 7181; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v2 7182; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 7183; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v1, s2, v1 7184; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v0, s3, v0 7185; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 7186; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 7187; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 7188; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 7189; GFX1164_ITERATIVE-NEXT: s_endpgm 7190; 7191; GFX1132_ITERATIVE-LABEL: and_i64_varying: 7192; GFX1132_ITERATIVE: ; %bb.0: ; %entry 7193; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 7194; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo 7195; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 7196; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 7197; GFX1132_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop 7198; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 7199; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2 7200; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 7201; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 7202; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 7203; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3 7204; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 7205; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 7206; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s8 7207; 
GFX1132_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] 7208; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 7209; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 7210; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 7211; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 7212; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 7213; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 7214; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 7215; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s2, vcc_lo 7216; GFX1132_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 7217; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB16_4 7218; GFX1132_ITERATIVE-NEXT: ; %bb.3: 7219; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1 7220; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 7221; GFX1132_ITERATIVE-NEXT: ds_and_rtn_b64 v[2:3], v4, v[2:3] 7222; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 7223; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv 7224; GFX1132_ITERATIVE-NEXT: .LBB16_4: 7225; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s2 7226; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 7227; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 7228; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v2 7229; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 7230; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v1, s2, v1 7231; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v0, s3, v0 7232; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 7233; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 7234; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 7235; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 7236; GFX1132_ITERATIVE-NEXT: s_endpgm 7237; 7238; GFX7LESS_DPP-LABEL: and_i64_varying: 7239; GFX7LESS_DPP: ; %bb.0: ; %entry 7240; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 7241; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 7242; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 7243; 
GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) 7244; GFX7LESS_DPP-NEXT: ds_and_rtn_b64 v[0:1], v1, v[0:1] 7245; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) 7246; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000 7247; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1 7248; GFX7LESS_DPP-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 7249; GFX7LESS_DPP-NEXT: s_endpgm 7250; 7251; GFX8_DPP-LABEL: and_i64_varying: 7252; GFX8_DPP: ; %bb.0: ; %entry 7253; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v5, exec_lo, 0 7254; GFX8_DPP-NEXT: v_mov_b32_e32 v7, 0 7255; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v5, exec_hi, v5 7256; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 7257; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, -1, 0, s[0:1] 7258; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, -1, v0, s[0:1] 7259; GFX8_DPP-NEXT: v_mov_b32_e32 v1, -1 7260; GFX8_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf 7261; GFX8_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf 7262; GFX8_DPP-NEXT: v_mov_b32_e32 v2, -1 7263; GFX8_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf 7264; GFX8_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf 7265; GFX8_DPP-NEXT: s_nop 0 7266; GFX8_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf 7267; GFX8_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf 7268; GFX8_DPP-NEXT: s_nop 0 7269; GFX8_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf 7270; GFX8_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf 7271; GFX8_DPP-NEXT: s_nop 0 7272; GFX8_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf 7273; GFX8_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_bcast:15 row_mask:0xa bank_mask:0xf 7274; GFX8_DPP-NEXT: s_nop 0 7275; GFX8_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf 7276; GFX8_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_bcast:31 row_mask:0xc bank_mask:0xf 7277; GFX8_DPP-NEXT: v_readlane_b32 s3, v3, 63 7278; 
GFX8_DPP-NEXT: v_readlane_b32 s2, v4, 63 7279; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v3 wave_shr:1 row_mask:0xf bank_mask:0xf 7280; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v4 wave_shr:1 row_mask:0xf bank_mask:0xf 7281; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] 7282; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 7283; GFX8_DPP-NEXT: ; implicit-def: $vgpr5_vgpr6 7284; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc 7285; GFX8_DPP-NEXT: s_cbranch_execz .LBB16_2 7286; GFX8_DPP-NEXT: ; %bb.1: 7287; GFX8_DPP-NEXT: v_mov_b32_e32 v6, s3 7288; GFX8_DPP-NEXT: v_mov_b32_e32 v5, s2 7289; GFX8_DPP-NEXT: s_mov_b32 m0, -1 7290; GFX8_DPP-NEXT: ds_and_rtn_b64 v[5:6], v7, v[5:6] 7291; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) 7292; GFX8_DPP-NEXT: .LBB16_2: 7293; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] 7294; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 7295; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v6 7296; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v5 7297; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 7298; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 7299; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 7300; GFX8_DPP-NEXT: s_mov_b32 s2, -1 7301; GFX8_DPP-NEXT: v_and_b32_e32 v6, s4, v6 7302; GFX8_DPP-NEXT: v_and_b32_e32 v5, s5, v5 7303; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) 7304; GFX8_DPP-NEXT: buffer_store_dwordx2 v[5:6], off, s[0:3], 0 7305; GFX8_DPP-NEXT: s_endpgm 7306; 7307; GFX9_DPP-LABEL: and_i64_varying: 7308; GFX9_DPP: ; %bb.0: ; %entry 7309; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v5, exec_lo, 0 7310; GFX9_DPP-NEXT: v_mov_b32_e32 v7, 0 7311; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v5, exec_hi, v5 7312; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 7313; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, -1, 0, s[0:1] 7314; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, -1, v0, s[0:1] 7315; GFX9_DPP-NEXT: v_mov_b32_e32 v1, -1 7316; GFX9_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf 7317; GFX9_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf 7318; GFX9_DPP-NEXT: v_mov_b32_e32 v2, -1 7319; GFX9_DPP-NEXT: v_and_b32_dpp 
v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf 7320; GFX9_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf 7321; GFX9_DPP-NEXT: s_nop 0 7322; GFX9_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf 7323; GFX9_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf 7324; GFX9_DPP-NEXT: s_nop 0 7325; GFX9_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf 7326; GFX9_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf 7327; GFX9_DPP-NEXT: s_nop 0 7328; GFX9_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf 7329; GFX9_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_bcast:15 row_mask:0xa bank_mask:0xf 7330; GFX9_DPP-NEXT: s_nop 0 7331; GFX9_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf 7332; GFX9_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_bcast:31 row_mask:0xc bank_mask:0xf 7333; GFX9_DPP-NEXT: v_readlane_b32 s3, v3, 63 7334; GFX9_DPP-NEXT: v_readlane_b32 s2, v4, 63 7335; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v3 wave_shr:1 row_mask:0xf bank_mask:0xf 7336; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v4 wave_shr:1 row_mask:0xf bank_mask:0xf 7337; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] 7338; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 7339; GFX9_DPP-NEXT: ; implicit-def: $vgpr5_vgpr6 7340; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc 7341; GFX9_DPP-NEXT: s_cbranch_execz .LBB16_2 7342; GFX9_DPP-NEXT: ; %bb.1: 7343; GFX9_DPP-NEXT: v_mov_b32_e32 v6, s3 7344; GFX9_DPP-NEXT: v_mov_b32_e32 v5, s2 7345; GFX9_DPP-NEXT: ds_and_rtn_b64 v[5:6], v7, v[5:6] 7346; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) 7347; GFX9_DPP-NEXT: .LBB16_2: 7348; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] 7349; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 7350; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v6 7351; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v5 7352; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 7353; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 7354; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 7355; 
GFX9_DPP-NEXT: s_mov_b32 s2, -1 7356; GFX9_DPP-NEXT: v_and_b32_e32 v6, s4, v6 7357; GFX9_DPP-NEXT: v_and_b32_e32 v5, s5, v5 7358; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) 7359; GFX9_DPP-NEXT: buffer_store_dwordx2 v[5:6], off, s[0:3], 0 7360; GFX9_DPP-NEXT: s_endpgm 7361; 7362; GFX1064_DPP-LABEL: and_i64_varying: 7363; GFX1064_DPP: ; %bb.0: ; %entry 7364; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 7365; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, -1, 0, s[0:1] 7366; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v0, s[0:1] 7367; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, -1 7368; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, -1 7369; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 7370; GFX1064_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 7371; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 7372; GFX1064_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 7373; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 7374; GFX1064_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 7375; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 7376; GFX1064_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 7377; GFX1064_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 7378; GFX1064_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 7379; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 7380; GFX1064_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 7381; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 31 7382; GFX1064_DPP-NEXT: v_readlane_b32 s3, v2, 31 7383; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, s2 7384; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s3 7385; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 7386; GFX1064_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 7387; 
GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] 7388; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 7389; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 7390; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf 7391; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 15 7392; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf 7393; GFX1064_DPP-NEXT: v_readlane_b32 s3, v2, 15 7394; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 31 7395; GFX1064_DPP-NEXT: v_readlane_b32 s7, v2, 31 7396; GFX1064_DPP-NEXT: v_writelane_b32 v6, s2, 16 7397; GFX1064_DPP-NEXT: v_readlane_b32 s2, v2, 63 7398; GFX1064_DPP-NEXT: v_writelane_b32 v5, s3, 16 7399; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 47 7400; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 63 7401; GFX1064_DPP-NEXT: v_readlane_b32 s9, v2, 47 7402; GFX1064_DPP-NEXT: v_writelane_b32 v6, s6, 32 7403; GFX1064_DPP-NEXT: v_writelane_b32 v5, s7, 32 7404; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] 7405; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 7406; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 7407; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 7408; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] 7409; GFX1064_DPP-NEXT: v_writelane_b32 v6, s8, 48 7410; GFX1064_DPP-NEXT: v_writelane_b32 v5, s9, 48 7411; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] 7412; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 7413; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 7414; GFX1064_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 7415; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc 7416; GFX1064_DPP-NEXT: s_cbranch_execz .LBB16_2 7417; GFX1064_DPP-NEXT: ; %bb.1: 7418; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s1 7419; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, s0 7420; GFX1064_DPP-NEXT: ds_and_rtn_b64 v[7:8], v0, v[7:8] 7421; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) 7422; GFX1064_DPP-NEXT: buffer_gl0_inv 7423; GFX1064_DPP-NEXT: .LBB16_2: 7424; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 7425; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[6:7] 7426; GFX1064_DPP-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x24 7427; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v8 7428; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v5 7429; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v6 7430; GFX1064_DPP-NEXT: s_mov_b32 null, 0 7431; GFX1064_DPP-NEXT: v_readfirstlane_b32 s4, v7 7432; GFX1064_DPP-NEXT: v_and_b32_e32 v9, s3, v9 7433; GFX1064_DPP-NEXT: v_and_b32_e32 v8, s4, v8 7434; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 7435; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) 7436; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[8:9], off, s[0:3], 0 7437; GFX1064_DPP-NEXT: s_endpgm 7438; 7439; GFX1032_DPP-LABEL: and_i64_varying: 7440; GFX1032_DPP: ; %bb.0: ; %entry 7441; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 7442; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, -1, 0, s2 7443; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v0, s2 7444; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, -1 7445; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, -1 7446; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 7447; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 7448; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 7449; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 7450; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 7451; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 7452; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 7453; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 7454; GFX1032_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 7455; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 7456; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 7457; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 7458; GFX1032_DPP-NEXT: v_readlane_b32 s3, v1, 15 7459; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 31 7460; 
GFX1032_DPP-NEXT: v_readlane_b32 s0, v2, 31 7461; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf 7462; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf 7463; GFX1032_DPP-NEXT: v_readlane_b32 s6, v2, 15 7464; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 7465; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 7466; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 7467; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 7468; GFX1032_DPP-NEXT: v_writelane_b32 v6, s3, 16 7469; GFX1032_DPP-NEXT: v_writelane_b32 v5, s6, 16 7470; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 7471; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 7472; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 7473; GFX1032_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 7474; GFX1032_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo 7475; GFX1032_DPP-NEXT: s_cbranch_execz .LBB16_2 7476; GFX1032_DPP-NEXT: ; %bb.1: 7477; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, s1 7478; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, s0 7479; GFX1032_DPP-NEXT: ds_and_rtn_b64 v[7:8], v0, v[7:8] 7480; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) 7481; GFX1032_DPP-NEXT: buffer_gl0_inv 7482; GFX1032_DPP-NEXT: .LBB16_2: 7483; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 7484; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 7485; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 7486; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v8 7487; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v5 7488; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v6 7489; GFX1032_DPP-NEXT: s_mov_b32 null, 0 7490; GFX1032_DPP-NEXT: v_readfirstlane_b32 s4, v7 7491; GFX1032_DPP-NEXT: v_and_b32_e32 v9, s3, v9 7492; GFX1032_DPP-NEXT: v_and_b32_e32 v8, s4, v8 7493; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 7494; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) 7495; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[8:9], off, s[0:3], 0 7496; GFX1032_DPP-NEXT: s_endpgm 7497; 7498; GFX1164_DPP-LABEL: and_i64_varying: 7499; GFX1164_DPP: ; %bb.0: ; %entry 7500; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 7501; 
GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 7502; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) 7503; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, -1, 0, s[0:1] 7504; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v0, s[0:1] 7505; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, -1 7506; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, -1 7507; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 7508; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 7509; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 7510; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 7511; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 7512; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 7513; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 7514; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 7515; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 7516; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 7517; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 7518; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 7519; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 7520; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 7521; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 7522; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 7523; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 7524; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 7525; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2) 7526; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 31 7527; GFX1164_DPP-NEXT: v_readlane_b32 s3, v2, 31 7528; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 7529; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s2 7530; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s3 7531; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 7532; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 7533; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 7534; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) 7535; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 15 7536; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf 7537; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] 7538; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 7539; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 7540; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf 7541; GFX1164_DPP-NEXT: v_readlane_b32 s3, v2, 15 7542; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 31 7543; GFX1164_DPP-NEXT: v_writelane_b32 v6, s2, 16 7544; GFX1164_DPP-NEXT: v_readlane_b32 s7, v2, 31 7545; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 63 7546; GFX1164_DPP-NEXT: v_writelane_b32 v5, s3, 16 7547; GFX1164_DPP-NEXT: v_readlane_b32 s8, v1, 47 7548; GFX1164_DPP-NEXT: v_readlane_b32 s3, v1, 63 7549; GFX1164_DPP-NEXT: v_writelane_b32 v6, s6, 32 7550; GFX1164_DPP-NEXT: v_readlane_b32 s9, v2, 47 7551; GFX1164_DPP-NEXT: v_writelane_b32 v5, s7, 32 7552; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] 7553; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 7554; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 7555; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 7556; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 7557; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] 7558; GFX1164_DPP-NEXT: v_writelane_b32 v6, s8, 48 7559; 
GFX1164_DPP-NEXT: v_writelane_b32 v5, s9, 48 7560; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] 7561; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 7562; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 7563; GFX1164_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 7564; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc 7565; GFX1164_DPP-NEXT: s_cbranch_execz .LBB16_2 7566; GFX1164_DPP-NEXT: ; %bb.1: 7567; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s1 7568; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s0 7569; GFX1164_DPP-NEXT: ds_and_rtn_b64 v[7:8], v0, v[7:8] 7570; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) 7571; GFX1164_DPP-NEXT: buffer_gl0_inv 7572; GFX1164_DPP-NEXT: .LBB16_2: 7573; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[6:7] 7574; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 7575; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v8 7576; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v5 7577; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v6 7578; GFX1164_DPP-NEXT: v_readfirstlane_b32 s4, v7 7579; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 7580; GFX1164_DPP-NEXT: v_and_b32_e32 v9, s3, v9 7581; GFX1164_DPP-NEXT: v_and_b32_e32 v8, s4, v8 7582; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 7583; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) 7584; GFX1164_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], 0 7585; GFX1164_DPP-NEXT: s_endpgm 7586; 7587; GFX1132_DPP-LABEL: and_i64_varying: 7588; GFX1132_DPP: ; %bb.0: ; %entry 7589; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 7590; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 7591; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) 7592; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, -1, 0, s2 7593; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v0, s2 7594; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, -1 :: v_dual_mov_b32 v5, -1 7595; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 7596; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 7597; GFX1132_DPP-NEXT: 
v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 7598; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 7599; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 7600; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 7601; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 7602; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 7603; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 7604; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 7605; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 7606; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 7607; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 7608; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 7609; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 7610; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 7611; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 7612; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 7613; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) 7614; GFX1132_DPP-NEXT: v_readlane_b32 s3, v1, 15 7615; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 31 7616; GFX1132_DPP-NEXT: v_readlane_b32 s0, v2, 31 7617; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf 7618; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf 7619; GFX1132_DPP-NEXT: v_readlane_b32 s6, v2, 15 7620; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 7621; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 7622; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 7623; 
GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 7624; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 7625; GFX1132_DPP-NEXT: v_writelane_b32 v6, s3, 16 7626; GFX1132_DPP-NEXT: v_writelane_b32 v5, s6, 16 7627; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 7628; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 7629; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 7630; GFX1132_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 7631; GFX1132_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo 7632; GFX1132_DPP-NEXT: s_cbranch_execz .LBB16_2 7633; GFX1132_DPP-NEXT: ; %bb.1: 7634; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0 7635; GFX1132_DPP-NEXT: ds_and_rtn_b64 v[7:8], v0, v[7:8] 7636; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) 7637; GFX1132_DPP-NEXT: buffer_gl0_inv 7638; GFX1132_DPP-NEXT: .LBB16_2: 7639; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 7640; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 7641; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v8 7642; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v5 7643; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v6 7644; GFX1132_DPP-NEXT: v_readfirstlane_b32 s4, v7 7645; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 7646; GFX1132_DPP-NEXT: v_and_b32_e32 v9, s3, v9 7647; GFX1132_DPP-NEXT: v_and_b32_e32 v8, s4, v8 7648; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 7649; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) 7650; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], 0 7651; GFX1132_DPP-NEXT: s_endpgm 7652entry: 7653 %lane = call i32 @llvm.amdgcn.workitem.id.x() 7654 %lane_ext = zext i32 %lane to i64 7655 %old = atomicrmw and ptr addrspace(3) @local_var32, i64 %lane_ext acq_rel 7656 store i64 %old, ptr addrspace(1) %out 7657 ret void 7658} 7659 7660define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { 7661; GFX7LESS_ITERATIVE-LABEL: or_i32_varying: 7662; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry 7663; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 7664; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, 0 7665; GFX7LESS_ITERATIVE-NEXT: ; 
implicit-def: $vgpr1 7666; GFX7LESS_ITERATIVE-NEXT: .LBB17_1: ; %ComputeLoop 7667; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 7668; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] 7669; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s3 7670; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 7671; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 7672; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 7673; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] 7674; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 7675; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] 7676; GFX7LESS_ITERATIVE-NEXT: s_or_b32 s2, s2, s8 7677; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB17_1 7678; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 7679; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 7680; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 7681; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 7682; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr0 7683; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc 7684; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 7685; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB17_4 7686; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: 7687; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 7688; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 7689; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 7690; GFX7LESS_ITERATIVE-NEXT: ds_or_rtn_b32 v0, v0, v2 7691; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 7692; GFX7LESS_ITERATIVE-NEXT: .LBB17_4: 7693; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] 7694; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 7695; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 7696; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 7697; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 7698; GFX7LESS_ITERATIVE-NEXT: v_or_b32_e32 v0, s4, v1 7699; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 7700; GFX7LESS_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 
0 7701; GFX7LESS_ITERATIVE-NEXT: s_endpgm 7702; 7703; GFX8_ITERATIVE-LABEL: or_i32_varying: 7704; GFX8_ITERATIVE: ; %bb.0: ; %entry 7705; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 7706; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, 0 7707; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 7708; GFX8_ITERATIVE-NEXT: .LBB17_1: ; %ComputeLoop 7709; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 7710; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] 7711; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 7712; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 7713; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 7714; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 7715; GFX8_ITERATIVE-NEXT: s_or_b32 s2, s2, s8 7716; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] 7717; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 7718; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 7719; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 7720; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 7721; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 7722; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 7723; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr0 7724; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc 7725; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 7726; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB17_4 7727; GFX8_ITERATIVE-NEXT: ; %bb.3: 7728; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 7729; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 7730; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 7731; GFX8_ITERATIVE-NEXT: ds_or_rtn_b32 v0, v0, v2 7732; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 7733; GFX8_ITERATIVE-NEXT: .LBB17_4: 7734; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] 7735; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 7736; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 7737; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 7738; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 7739; GFX8_ITERATIVE-NEXT: v_or_b32_e32 v0, s4, v1 7740; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 7741; 
GFX8_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 7742; GFX8_ITERATIVE-NEXT: s_endpgm 7743; 7744; GFX9_ITERATIVE-LABEL: or_i32_varying: 7745; GFX9_ITERATIVE: ; %bb.0: ; %entry 7746; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 7747; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, 0 7748; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 7749; GFX9_ITERATIVE-NEXT: .LBB17_1: ; %ComputeLoop 7750; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 7751; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] 7752; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 7753; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 7754; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 7755; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 7756; GFX9_ITERATIVE-NEXT: s_or_b32 s2, s2, s8 7757; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] 7758; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 7759; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 7760; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 7761; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 7762; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 7763; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 7764; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0 7765; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc 7766; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 7767; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB17_4 7768; GFX9_ITERATIVE-NEXT: ; %bb.3: 7769; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 7770; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 7771; GFX9_ITERATIVE-NEXT: ds_or_rtn_b32 v0, v0, v2 7772; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 7773; GFX9_ITERATIVE-NEXT: .LBB17_4: 7774; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] 7775; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 7776; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 7777; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 7778; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 7779; GFX9_ITERATIVE-NEXT: v_or_b32_e32 v0, s4, v1 7780; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 7781; 
GFX9_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 7782; GFX9_ITERATIVE-NEXT: s_endpgm 7783; 7784; GFX1064_ITERATIVE-LABEL: or_i32_varying: 7785; GFX1064_ITERATIVE: ; %bb.0: ; %entry 7786; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 7787; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, 0 7788; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 7789; GFX1064_ITERATIVE-NEXT: .LBB17_1: ; %ComputeLoop 7790; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 7791; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] 7792; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 7793; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 7794; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 7795; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] 7796; GFX1064_ITERATIVE-NEXT: s_or_b32 s2, s2, s8 7797; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 7798; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 7799; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 7800; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 7801; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 7802; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 7803; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0 7804; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc 7805; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 7806; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB17_4 7807; GFX1064_ITERATIVE-NEXT: ; %bb.3: 7808; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 7809; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 7810; GFX1064_ITERATIVE-NEXT: ds_or_rtn_b32 v0, v0, v2 7811; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 7812; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv 7813; GFX1064_ITERATIVE-NEXT: .LBB17_4: 7814; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 7815; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] 7816; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 7817; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 7818; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 
7819; GFX1064_ITERATIVE-NEXT: v_or_b32_e32 v0, s2, v1 7820; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 7821; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 7822; GFX1064_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 7823; GFX1064_ITERATIVE-NEXT: s_endpgm 7824; 7825; GFX1032_ITERATIVE-LABEL: or_i32_varying: 7826; GFX1032_ITERATIVE: ; %bb.0: ; %entry 7827; GFX1032_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo 7828; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, 0 7829; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 7830; GFX1032_ITERATIVE-NEXT: .LBB17_1: ; %ComputeLoop 7831; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 7832; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 7833; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 7834; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 7835; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 7836; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 7837; GFX1032_ITERATIVE-NEXT: s_or_b32 s0, s0, s3 7838; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 7839; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 7840; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 7841; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 7842; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 7843; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0 7844; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo 7845; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 7846; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB17_4 7847; GFX1032_ITERATIVE-NEXT: ; %bb.3: 7848; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 7849; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 7850; GFX1032_ITERATIVE-NEXT: ds_or_rtn_b32 v0, v0, v2 7851; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 7852; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv 7853; GFX1032_ITERATIVE-NEXT: .LBB17_4: 7854; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 7855; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 7856; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 7857; GFX1032_ITERATIVE-NEXT: 
v_readfirstlane_b32 s2, v0 7858; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 7859; GFX1032_ITERATIVE-NEXT: v_or_b32_e32 v0, s2, v1 7860; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 7861; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 7862; GFX1032_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 7863; GFX1032_ITERATIVE-NEXT: s_endpgm 7864; 7865; GFX1164_ITERATIVE-LABEL: or_i32_varying: 7866; GFX1164_ITERATIVE: ; %bb.0: ; %entry 7867; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 7868; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 7869; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, 0 7870; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 7871; GFX1164_ITERATIVE-NEXT: .LBB17_1: ; %ComputeLoop 7872; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 7873; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] 7874; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 7875; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 7876; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 7877; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 7878; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] 7879; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) 7880; GFX1164_ITERATIVE-NEXT: s_or_b32 s2, s2, s8 7881; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 7882; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 7883; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 7884; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 7885; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 7886; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 7887; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 7888; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1 7889; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc 7890; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 7891; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 7892; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB17_4 
7893; GFX1164_ITERATIVE-NEXT: ; %bb.3: 7894; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 7895; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 7896; GFX1164_ITERATIVE-NEXT: ds_or_rtn_b32 v1, v1, v2 7897; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 7898; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv 7899; GFX1164_ITERATIVE-NEXT: .LBB17_4: 7900; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] 7901; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 7902; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 7903; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 7904; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) 7905; GFX1164_ITERATIVE-NEXT: v_or_b32_e32 v0, s2, v0 7906; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 7907; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 7908; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 7909; GFX1164_ITERATIVE-NEXT: s_endpgm 7910; 7911; GFX1132_ITERATIVE-LABEL: or_i32_varying: 7912; GFX1132_ITERATIVE: ; %bb.0: ; %entry 7913; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 7914; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo 7915; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, 0 7916; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 7917; GFX1132_ITERATIVE-NEXT: .LBB17_1: ; %ComputeLoop 7918; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 7919; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 7920; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 7921; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 7922; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 7923; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 7924; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 7925; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) 7926; GFX1132_ITERATIVE-NEXT: s_or_b32 s0, s0, s3 7927; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 7928; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 7929; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 7930; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 
7931; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 7932; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 7933; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 7934; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo 7935; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 7936; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB17_4 7937; GFX1132_ITERATIVE-NEXT: ; %bb.3: 7938; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 7939; GFX1132_ITERATIVE-NEXT: ds_or_rtn_b32 v1, v1, v2 7940; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 7941; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv 7942; GFX1132_ITERATIVE-NEXT: .LBB17_4: 7943; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 7944; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 7945; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 7946; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 7947; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) 7948; GFX1132_ITERATIVE-NEXT: v_or_b32_e32 v0, s2, v0 7949; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 7950; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 7951; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 7952; GFX1132_ITERATIVE-NEXT: s_endpgm 7953; 7954; GFX7LESS_DPP-LABEL: or_i32_varying: 7955; GFX7LESS_DPP: ; %bb.0: ; %entry 7956; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 7957; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 7958; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 7959; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) 7960; GFX7LESS_DPP-NEXT: ds_or_rtn_b32 v0, v1, v0 7961; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) 7962; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000 7963; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1 7964; GFX7LESS_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 7965; GFX7LESS_DPP-NEXT: s_endpgm 7966; 7967; GFX8_DPP-LABEL: or_i32_varying: 7968; GFX8_DPP: ; %bb.0: ; %entry 7969; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 7970; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 7971; 
GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 7972; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 7973; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] 7974; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 7975; GFX8_DPP-NEXT: s_nop 0 7976; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 7977; GFX8_DPP-NEXT: s_nop 1 7978; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 7979; GFX8_DPP-NEXT: s_nop 1 7980; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 7981; GFX8_DPP-NEXT: s_nop 1 7982; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 7983; GFX8_DPP-NEXT: s_nop 1 7984; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 7985; GFX8_DPP-NEXT: s_nop 1 7986; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 7987; GFX8_DPP-NEXT: v_readlane_b32 s2, v1, 63 7988; GFX8_DPP-NEXT: s_nop 0 7989; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf 7990; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] 7991; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 7992; GFX8_DPP-NEXT: ; implicit-def: $vgpr0 7993; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc 7994; GFX8_DPP-NEXT: s_cbranch_execz .LBB17_2 7995; GFX8_DPP-NEXT: ; %bb.1: 7996; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s2 7997; GFX8_DPP-NEXT: s_mov_b32 m0, -1 7998; GFX8_DPP-NEXT: ds_or_rtn_b32 v0, v3, v0 7999; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) 8000; GFX8_DPP-NEXT: .LBB17_2: 8001; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] 8002; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 8003; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 8004; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v2 8005; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 8006; GFX8_DPP-NEXT: s_mov_b32 s2, -1 8007; GFX8_DPP-NEXT: v_or_b32_e32 v0, s4, v0 8008; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) 8009; GFX8_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 8010; GFX8_DPP-NEXT: s_endpgm 8011; 
8012; GFX9_DPP-LABEL: or_i32_varying: 8013; GFX9_DPP: ; %bb.0: ; %entry 8014; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 8015; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 8016; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 8017; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 8018; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] 8019; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 8020; GFX9_DPP-NEXT: s_nop 0 8021; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 8022; GFX9_DPP-NEXT: s_nop 1 8023; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 8024; GFX9_DPP-NEXT: s_nop 1 8025; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 8026; GFX9_DPP-NEXT: s_nop 1 8027; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 8028; GFX9_DPP-NEXT: s_nop 1 8029; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 8030; GFX9_DPP-NEXT: s_nop 1 8031; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 8032; GFX9_DPP-NEXT: v_readlane_b32 s2, v1, 63 8033; GFX9_DPP-NEXT: s_nop 0 8034; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf 8035; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] 8036; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 8037; GFX9_DPP-NEXT: ; implicit-def: $vgpr0 8038; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc 8039; GFX9_DPP-NEXT: s_cbranch_execz .LBB17_2 8040; GFX9_DPP-NEXT: ; %bb.1: 8041; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s2 8042; GFX9_DPP-NEXT: ds_or_rtn_b32 v0, v3, v0 8043; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) 8044; GFX9_DPP-NEXT: .LBB17_2: 8045; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] 8046; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 8047; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 8048; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v2 8049; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 8050; GFX9_DPP-NEXT: s_mov_b32 s2, -1 8051; GFX9_DPP-NEXT: v_or_b32_e32 v0, s4, v0 
8052; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) 8053; GFX9_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 8054; GFX9_DPP-NEXT: s_endpgm 8055; 8056; GFX1064_DPP-LABEL: or_i32_varying: 8057; GFX1064_DPP: ; %bb.0: ; %entry 8058; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 8059; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] 8060; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 8061; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 8062; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 8063; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 8064; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 8065; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 8066; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 8067; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 31 8068; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s2 8069; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 8070; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 8071; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 15 8072; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 31 8073; GFX1064_DPP-NEXT: v_writelane_b32 v3, s2, 16 8074; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] 8075; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 8076; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 8077; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 47 8078; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 63 8079; GFX1064_DPP-NEXT: v_writelane_b32 v3, s3, 32 8080; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] 8081; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 8082; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 8083; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 8084; GFX1064_DPP-NEXT: v_writelane_b32 v3, s2, 48 8085; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] 8086; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, 
v0 8087; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 8088; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 8089; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc 8090; GFX1064_DPP-NEXT: s_cbranch_execz .LBB17_2 8091; GFX1064_DPP-NEXT: ; %bb.1: 8092; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s6 8093; GFX1064_DPP-NEXT: s_mov_b32 s3, s6 8094; GFX1064_DPP-NEXT: ds_or_rtn_b32 v0, v4, v0 8095; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) 8096; GFX1064_DPP-NEXT: buffer_gl0_inv 8097; GFX1064_DPP-NEXT: .LBB17_2: 8098; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 8099; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[0:1] 8100; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 8101; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v0 8102; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 8103; GFX1064_DPP-NEXT: v_or_b32_e32 v0, s3, v0 8104; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 8105; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) 8106; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 8107; GFX1064_DPP-NEXT: s_endpgm 8108; 8109; GFX1032_DPP-LABEL: or_i32_varying: 8110; GFX1032_DPP: ; %bb.0: ; %entry 8111; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 8112; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 8113; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 8114; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 8115; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 8116; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 8117; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 8118; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 8119; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 8120; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 8121; GFX1032_DPP-NEXT: v_readlane_b32 s2, v1, 31 8122; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 8123; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 8124; 
GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 8125; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 8126; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 8127; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 8128; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 8129; GFX1032_DPP-NEXT: s_mov_b32 s0, s2 8130; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 8131; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 8132; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 8133; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo 8134; GFX1032_DPP-NEXT: s_cbranch_execz .LBB17_2 8135; GFX1032_DPP-NEXT: ; %bb.1: 8136; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s0 8137; GFX1032_DPP-NEXT: ds_or_rtn_b32 v0, v4, v0 8138; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) 8139; GFX1032_DPP-NEXT: buffer_gl0_inv 8140; GFX1032_DPP-NEXT: .LBB17_2: 8141; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 8142; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 8143; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 8144; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v0 8145; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 8146; GFX1032_DPP-NEXT: v_or_b32_e32 v0, s3, v0 8147; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 8148; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) 8149; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 8150; GFX1032_DPP-NEXT: s_endpgm 8151; 8152; GFX1164_DPP-LABEL: or_i32_varying: 8153; GFX1164_DPP: ; %bb.0: ; %entry 8154; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 8155; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 8156; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 8157; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] 8158; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 8159; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 8160; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 8161; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 8162; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) 8163; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 8164; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 8165; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 8166; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 8167; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 8168; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 8169; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 31 8170; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s2 8171; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 8172; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 8173; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 8174; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 15 8175; GFX1164_DPP-NEXT: v_readlane_b32 s3, v1, 31 8176; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 8177; GFX1164_DPP-NEXT: v_writelane_b32 v3, s2, 16 8178; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] 8179; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 8180; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 8181; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 47 8182; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 63 8183; GFX1164_DPP-NEXT: v_writelane_b32 v3, s3, 32 8184; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] 8185; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 8186; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 8187; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 8188; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 8189; GFX1164_DPP-NEXT: v_writelane_b32 v3, s2, 48 8190; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] 8191; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 8192; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 8193; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0 
8194; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc 8195; GFX1164_DPP-NEXT: s_cbranch_execz .LBB17_2 8196; GFX1164_DPP-NEXT: ; %bb.1: 8197; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s6 8198; GFX1164_DPP-NEXT: s_mov_b32 s3, s6 8199; GFX1164_DPP-NEXT: ds_or_rtn_b32 v0, v4, v0 8200; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) 8201; GFX1164_DPP-NEXT: buffer_gl0_inv 8202; GFX1164_DPP-NEXT: .LBB17_2: 8203; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[0:1] 8204; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 8205; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v0 8206; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3 8207; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) 8208; GFX1164_DPP-NEXT: v_or_b32_e32 v0, s3, v0 8209; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 8210; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) 8211; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 8212; GFX1164_DPP-NEXT: s_endpgm 8213; 8214; GFX1132_DPP-LABEL: or_i32_varying: 8215; GFX1132_DPP: ; %bb.0: ; %entry 8216; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 8217; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 8218; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 8219; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 8220; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 8221; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 8222; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 8223; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 8224; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 8225; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 8226; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 8227; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 8228; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 8229; 
GFX1132_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 8230; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) 8231; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 8232; GFX1132_DPP-NEXT: v_readlane_b32 s2, v1, 31 8233; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 8234; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 8235; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 8236; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 8237; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 8238; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 8239; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 8240; GFX1132_DPP-NEXT: s_mov_b32 s0, s2 8241; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 8242; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 8243; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 8244; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo 8245; GFX1132_DPP-NEXT: s_cbranch_execz .LBB17_2 8246; GFX1132_DPP-NEXT: ; %bb.1: 8247; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, s0 8248; GFX1132_DPP-NEXT: ds_or_rtn_b32 v0, v4, v0 8249; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) 8250; GFX1132_DPP-NEXT: buffer_gl0_inv 8251; GFX1132_DPP-NEXT: .LBB17_2: 8252; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 8253; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 8254; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v0 8255; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3 8256; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) 8257; GFX1132_DPP-NEXT: v_or_b32_e32 v0, s3, v0 8258; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 8259; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) 8260; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 8261; GFX1132_DPP-NEXT: s_endpgm 8262entry: 8263 %lane = call i32 @llvm.amdgcn.workitem.id.x() 8264 %old = atomicrmw or ptr addrspace(3) @local_var32, i32 %lane acq_rel 8265 store i32 %old, ptr addrspace(1) %out 8266 ret void 8267} 8268 8269define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { 8270; GFX7LESS_ITERATIVE-LABEL: 
or_i64_varying: 8271; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry 8272; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec 8273; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 8274; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 8275; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 8276; GFX7LESS_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop 8277; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 8278; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] 8279; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s8 8280; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 8281; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 8282; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 8283; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 8284; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 8285; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] 8286; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[8:9], s[2:3], 0 8287; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[8:9] 8288; GFX7LESS_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] 8289; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB18_1 8290; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 8291; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 8292; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 8293; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 8294; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 8295; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc 8296; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] 8297; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB18_4 8298; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: 8299; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 8300; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 8301; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 8302; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 8303; GFX7LESS_ITERATIVE-NEXT: ds_or_rtn_b64 v[3:4], v0, v[3:4] 8304; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 8305; GFX7LESS_ITERATIVE-NEXT: .LBB18_4: 8306; 
GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] 8307; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 8308; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 8309; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 8310; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 8311; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 8312; GFX7LESS_ITERATIVE-NEXT: v_or_b32_e32 v2, s4, v2 8313; GFX7LESS_ITERATIVE-NEXT: v_or_b32_e32 v1, s5, v1 8314; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 8315; GFX7LESS_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 8316; GFX7LESS_ITERATIVE-NEXT: s_endpgm 8317; 8318; GFX8_ITERATIVE-LABEL: or_i64_varying: 8319; GFX8_ITERATIVE: ; %bb.0: ; %entry 8320; GFX8_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec 8321; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 8322; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 8323; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 8324; GFX8_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop 8325; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 8326; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] 8327; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8 8328; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 8329; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 8330; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 8331; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 8332; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 8333; GFX8_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] 8334; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] 8335; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 8336; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 8337; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 8338; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 8339; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 8340; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 8341; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 8342; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc 8343; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, 
s[2:3] 8344; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB18_4 8345; GFX8_ITERATIVE-NEXT: ; %bb.3: 8346; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 8347; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 8348; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 8349; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 8350; GFX8_ITERATIVE-NEXT: ds_or_rtn_b64 v[3:4], v0, v[3:4] 8351; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 8352; GFX8_ITERATIVE-NEXT: .LBB18_4: 8353; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] 8354; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 8355; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 8356; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 8357; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 8358; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 8359; GFX8_ITERATIVE-NEXT: v_or_b32_e32 v2, s4, v2 8360; GFX8_ITERATIVE-NEXT: v_or_b32_e32 v1, s5, v1 8361; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 8362; GFX8_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 8363; GFX8_ITERATIVE-NEXT: s_endpgm 8364; 8365; GFX9_ITERATIVE-LABEL: or_i64_varying: 8366; GFX9_ITERATIVE: ; %bb.0: ; %entry 8367; GFX9_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec 8368; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 8369; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 8370; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 8371; GFX9_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop 8372; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 8373; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] 8374; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8 8375; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 8376; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 8377; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 8378; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 8379; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 8380; GFX9_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] 8381; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] 8382; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 8383; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 8384; 
GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 8385; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 8386; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 8387; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 8388; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 8389; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc 8390; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] 8391; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB18_4 8392; GFX9_ITERATIVE-NEXT: ; %bb.3: 8393; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 8394; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 8395; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 8396; GFX9_ITERATIVE-NEXT: ds_or_rtn_b64 v[3:4], v0, v[3:4] 8397; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 8398; GFX9_ITERATIVE-NEXT: .LBB18_4: 8399; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] 8400; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 8401; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 8402; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 8403; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 8404; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 8405; GFX9_ITERATIVE-NEXT: v_or_b32_e32 v2, s4, v2 8406; GFX9_ITERATIVE-NEXT: v_or_b32_e32 v1, s5, v1 8407; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 8408; GFX9_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 8409; GFX9_ITERATIVE-NEXT: s_endpgm 8410; 8411; GFX1064_ITERATIVE-LABEL: or_i64_varying: 8412; GFX1064_ITERATIVE: ; %bb.0: ; %entry 8413; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 8414; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec 8415; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 8416; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 8417; GFX1064_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop 8418; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 8419; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3] 8420; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 8421; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 8422; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 
8423; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10 8424; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10 8425; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] 8426; GFX1064_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] 8427; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 8428; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 8429; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 8430; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 8431; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 8432; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 8433; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 8434; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc 8435; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] 8436; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB18_4 8437; GFX1064_ITERATIVE-NEXT: ; %bb.3: 8438; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 8439; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 8440; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 8441; GFX1064_ITERATIVE-NEXT: ds_or_rtn_b64 v[3:4], v0, v[3:4] 8442; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 8443; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv 8444; GFX1064_ITERATIVE-NEXT: .LBB18_4: 8445; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 8446; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] 8447; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 8448; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v4 8449; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 8450; GFX1064_ITERATIVE-NEXT: v_or_b32_e32 v2, s2, v2 8451; GFX1064_ITERATIVE-NEXT: v_or_b32_e32 v1, s3, v1 8452; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 8453; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 8454; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 8455; GFX1064_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 8456; GFX1064_ITERATIVE-NEXT: s_endpgm 8457; 8458; GFX1032_ITERATIVE-LABEL: or_i64_varying: 8459; GFX1032_ITERATIVE: ; %bb.0: ; %entry 8460; 
GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 8461; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo 8462; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 8463; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 8464; GFX1032_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop 8465; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 8466; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 8467; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 8468; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 8469; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3 8470; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3 8471; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3 8472; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s8 8473; GFX1032_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] 8474; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 8475; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 8476; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 8477; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 8478; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 8479; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 8480; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s2, vcc_lo 8481; GFX1032_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 8482; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB18_4 8483; GFX1032_ITERATIVE-NEXT: ; %bb.3: 8484; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 8485; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 8486; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 8487; GFX1032_ITERATIVE-NEXT: ds_or_rtn_b64 v[3:4], v0, v[3:4] 8488; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 8489; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv 8490; GFX1032_ITERATIVE-NEXT: .LBB18_4: 8491; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 8492; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s2 8493; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 8494; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v4 8495; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 8496; GFX1032_ITERATIVE-NEXT: 
v_or_b32_e32 v2, s2, v2 8497; GFX1032_ITERATIVE-NEXT: v_or_b32_e32 v1, s3, v1 8498; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 8499; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 8500; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 8501; GFX1032_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 8502; GFX1032_ITERATIVE-NEXT: s_endpgm 8503; 8504; GFX1164_ITERATIVE-LABEL: or_i64_varying: 8505; GFX1164_ITERATIVE: ; %bb.0: ; %entry 8506; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 8507; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 8508; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec 8509; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 8510; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 8511; GFX1164_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop 8512; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 8513; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3] 8514; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 8515; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 8516; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 8517; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 8518; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10 8519; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10 8520; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9] 8521; GFX1164_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] 8522; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 8523; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 8524; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 8525; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 8526; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 8527; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 8528; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 8529; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 8530; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc 8531; GFX1164_ITERATIVE-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) 8532; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] 8533; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB18_4 8534; GFX1164_ITERATIVE-NEXT: ; %bb.3: 8535; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 8536; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0 8537; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 8538; GFX1164_ITERATIVE-NEXT: ds_or_rtn_b64 v[2:3], v4, v[2:3] 8539; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 8540; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv 8541; GFX1164_ITERATIVE-NEXT: .LBB18_4: 8542; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] 8543; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 8544; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 8545; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v2 8546; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 8547; GFX1164_ITERATIVE-NEXT: v_or_b32_e32 v1, s2, v1 8548; GFX1164_ITERATIVE-NEXT: v_or_b32_e32 v0, s3, v0 8549; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 8550; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 8551; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 8552; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 8553; GFX1164_ITERATIVE-NEXT: s_endpgm 8554; 8555; GFX1132_ITERATIVE-LABEL: or_i64_varying: 8556; GFX1132_ITERATIVE: ; %bb.0: ; %entry 8557; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 8558; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo 8559; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 8560; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 8561; GFX1132_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop 8562; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 8563; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2 8564; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 8565; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 8566; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 8567; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3 8568; 
GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 8569; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 8570; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s8 8571; GFX1132_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] 8572; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 8573; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 8574; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 8575; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 8576; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 8577; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 8578; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 8579; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s2, vcc_lo 8580; GFX1132_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 8581; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB18_4 8582; GFX1132_ITERATIVE-NEXT: ; %bb.3: 8583; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1 8584; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 8585; GFX1132_ITERATIVE-NEXT: ds_or_rtn_b64 v[2:3], v4, v[2:3] 8586; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 8587; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv 8588; GFX1132_ITERATIVE-NEXT: .LBB18_4: 8589; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s2 8590; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 8591; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 8592; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v2 8593; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 8594; GFX1132_ITERATIVE-NEXT: v_or_b32_e32 v1, s2, v1 8595; GFX1132_ITERATIVE-NEXT: v_or_b32_e32 v0, s3, v0 8596; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 8597; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 8598; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 8599; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 8600; GFX1132_ITERATIVE-NEXT: s_endpgm 8601; 8602; GFX7LESS_DPP-LABEL: or_i64_varying: 8603; GFX7LESS_DPP: ; %bb.0: ; %entry 
8604; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 8605; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 8606; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 8607; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) 8608; GFX7LESS_DPP-NEXT: ds_or_rtn_b64 v[0:1], v1, v[0:1] 8609; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) 8610; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000 8611; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1 8612; GFX7LESS_DPP-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 8613; GFX7LESS_DPP-NEXT: s_endpgm 8614; 8615; GFX8_DPP-LABEL: or_i64_varying: 8616; GFX8_DPP: ; %bb.0: ; %entry 8617; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v5, exec_lo, 0 8618; GFX8_DPP-NEXT: v_mov_b32_e32 v7, 0 8619; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v5, exec_hi, v5 8620; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 8621; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] 8622; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] 8623; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 8624; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 8625; GFX8_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 8626; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 8627; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 8628; GFX8_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 8629; GFX8_DPP-NEXT: s_nop 0 8630; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 8631; GFX8_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 8632; GFX8_DPP-NEXT: s_nop 0 8633; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 8634; GFX8_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 8635; GFX8_DPP-NEXT: s_nop 0 8636; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 8637; GFX8_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 8638; 
GFX8_DPP-NEXT: s_nop 0 8639; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 8640; GFX8_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 8641; GFX8_DPP-NEXT: v_readlane_b32 s3, v1, 63 8642; GFX8_DPP-NEXT: v_readlane_b32 s2, v2, 63 8643; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v1 wave_shr:1 row_mask:0xf bank_mask:0xf 8644; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 8645; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] 8646; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 8647; GFX8_DPP-NEXT: ; implicit-def: $vgpr5_vgpr6 8648; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc 8649; GFX8_DPP-NEXT: s_cbranch_execz .LBB18_2 8650; GFX8_DPP-NEXT: ; %bb.1: 8651; GFX8_DPP-NEXT: v_mov_b32_e32 v6, s3 8652; GFX8_DPP-NEXT: v_mov_b32_e32 v5, s2 8653; GFX8_DPP-NEXT: s_mov_b32 m0, -1 8654; GFX8_DPP-NEXT: ds_or_rtn_b64 v[5:6], v7, v[5:6] 8655; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) 8656; GFX8_DPP-NEXT: .LBB18_2: 8657; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] 8658; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 8659; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v6 8660; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v5 8661; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 8662; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 8663; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 8664; GFX8_DPP-NEXT: s_mov_b32 s2, -1 8665; GFX8_DPP-NEXT: v_or_b32_e32 v6, s4, v6 8666; GFX8_DPP-NEXT: v_or_b32_e32 v5, s5, v5 8667; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) 8668; GFX8_DPP-NEXT: buffer_store_dwordx2 v[5:6], off, s[0:3], 0 8669; GFX8_DPP-NEXT: s_endpgm 8670; 8671; GFX9_DPP-LABEL: or_i64_varying: 8672; GFX9_DPP: ; %bb.0: ; %entry 8673; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v5, exec_lo, 0 8674; GFX9_DPP-NEXT: v_mov_b32_e32 v7, 0 8675; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v5, exec_hi, v5 8676; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 8677; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] 8678; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] 8679; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 8680; 
GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 8681; GFX9_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 8682; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 8683; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 8684; GFX9_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 8685; GFX9_DPP-NEXT: s_nop 0 8686; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 8687; GFX9_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 8688; GFX9_DPP-NEXT: s_nop 0 8689; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 8690; GFX9_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 8691; GFX9_DPP-NEXT: s_nop 0 8692; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 8693; GFX9_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 8694; GFX9_DPP-NEXT: s_nop 0 8695; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 8696; GFX9_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 8697; GFX9_DPP-NEXT: v_readlane_b32 s3, v1, 63 8698; GFX9_DPP-NEXT: v_readlane_b32 s2, v2, 63 8699; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v1 wave_shr:1 row_mask:0xf bank_mask:0xf 8700; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 8701; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] 8702; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 8703; GFX9_DPP-NEXT: ; implicit-def: $vgpr5_vgpr6 8704; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc 8705; GFX9_DPP-NEXT: s_cbranch_execz .LBB18_2 8706; GFX9_DPP-NEXT: ; %bb.1: 8707; GFX9_DPP-NEXT: v_mov_b32_e32 v6, s3 8708; GFX9_DPP-NEXT: v_mov_b32_e32 v5, s2 8709; GFX9_DPP-NEXT: ds_or_rtn_b64 v[5:6], v7, v[5:6] 8710; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) 8711; GFX9_DPP-NEXT: .LBB18_2: 8712; 
GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] 8713; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 8714; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v6 8715; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v5 8716; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 8717; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 8718; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 8719; GFX9_DPP-NEXT: s_mov_b32 s2, -1 8720; GFX9_DPP-NEXT: v_or_b32_e32 v6, s4, v6 8721; GFX9_DPP-NEXT: v_or_b32_e32 v5, s5, v5 8722; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) 8723; GFX9_DPP-NEXT: buffer_store_dwordx2 v[5:6], off, s[0:3], 0 8724; GFX9_DPP-NEXT: s_endpgm 8725; 8726; GFX1064_DPP-LABEL: or_i64_varying: 8727; GFX1064_DPP: ; %bb.0: ; %entry 8728; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 8729; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] 8730; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] 8731; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, 0 8732; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 8733; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 8734; GFX1064_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 8735; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 8736; GFX1064_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 8737; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 8738; GFX1064_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 8739; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 8740; GFX1064_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 8741; GFX1064_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 8742; GFX1064_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 8743; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 8744; GFX1064_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] 
row_mask:0xa bank_mask:0xf 8745; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 31 8746; GFX1064_DPP-NEXT: v_readlane_b32 s3, v2, 31 8747; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, s2 8748; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s3 8749; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 8750; GFX1064_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 8751; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] 8752; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 8753; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 8754; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf 8755; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 15 8756; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf 8757; GFX1064_DPP-NEXT: v_readlane_b32 s3, v2, 15 8758; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 31 8759; GFX1064_DPP-NEXT: v_readlane_b32 s7, v2, 31 8760; GFX1064_DPP-NEXT: v_writelane_b32 v6, s2, 16 8761; GFX1064_DPP-NEXT: v_readlane_b32 s2, v2, 63 8762; GFX1064_DPP-NEXT: v_writelane_b32 v5, s3, 16 8763; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 47 8764; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 63 8765; GFX1064_DPP-NEXT: v_readlane_b32 s9, v2, 47 8766; GFX1064_DPP-NEXT: v_writelane_b32 v6, s6, 32 8767; GFX1064_DPP-NEXT: v_writelane_b32 v5, s7, 32 8768; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] 8769; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 8770; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 8771; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 8772; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] 8773; GFX1064_DPP-NEXT: v_writelane_b32 v6, s8, 48 8774; GFX1064_DPP-NEXT: v_writelane_b32 v5, s9, 48 8775; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] 8776; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 8777; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 8778; GFX1064_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 8779; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc 8780; GFX1064_DPP-NEXT: s_cbranch_execz .LBB18_2 8781; 
GFX1064_DPP-NEXT: ; %bb.1: 8782; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s1 8783; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, s0 8784; GFX1064_DPP-NEXT: ds_or_rtn_b64 v[7:8], v0, v[7:8] 8785; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) 8786; GFX1064_DPP-NEXT: buffer_gl0_inv 8787; GFX1064_DPP-NEXT: .LBB18_2: 8788; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 8789; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[6:7] 8790; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 8791; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v8 8792; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v5 8793; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v6 8794; GFX1064_DPP-NEXT: s_mov_b32 null, 0 8795; GFX1064_DPP-NEXT: v_readfirstlane_b32 s4, v7 8796; GFX1064_DPP-NEXT: v_or_b32_e32 v9, s3, v9 8797; GFX1064_DPP-NEXT: v_or_b32_e32 v8, s4, v8 8798; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 8799; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) 8800; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[8:9], off, s[0:3], 0 8801; GFX1064_DPP-NEXT: s_endpgm 8802; 8803; GFX1032_DPP-LABEL: or_i64_varying: 8804; GFX1032_DPP: ; %bb.0: ; %entry 8805; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 8806; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s2 8807; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s2 8808; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, 0 8809; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0 8810; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 8811; GFX1032_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 8812; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 8813; GFX1032_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 8814; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 8815; GFX1032_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 8816; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 
8817; GFX1032_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 8818; GFX1032_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 8819; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 8820; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 8821; GFX1032_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 8822; GFX1032_DPP-NEXT: v_readlane_b32 s3, v1, 15 8823; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 31 8824; GFX1032_DPP-NEXT: v_readlane_b32 s0, v2, 31 8825; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf 8826; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf 8827; GFX1032_DPP-NEXT: v_readlane_b32 s6, v2, 15 8828; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 8829; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 8830; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 8831; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 8832; GFX1032_DPP-NEXT: v_writelane_b32 v6, s3, 16 8833; GFX1032_DPP-NEXT: v_writelane_b32 v5, s6, 16 8834; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 8835; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 8836; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 8837; GFX1032_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 8838; GFX1032_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo 8839; GFX1032_DPP-NEXT: s_cbranch_execz .LBB18_2 8840; GFX1032_DPP-NEXT: ; %bb.1: 8841; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, s1 8842; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, s0 8843; GFX1032_DPP-NEXT: ds_or_rtn_b64 v[7:8], v0, v[7:8] 8844; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) 8845; GFX1032_DPP-NEXT: buffer_gl0_inv 8846; GFX1032_DPP-NEXT: .LBB18_2: 8847; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 8848; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 8849; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 8850; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v8 8851; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v5 8852; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v6 8853; GFX1032_DPP-NEXT: s_mov_b32 
null, 0 8854; GFX1032_DPP-NEXT: v_readfirstlane_b32 s4, v7 8855; GFX1032_DPP-NEXT: v_or_b32_e32 v9, s3, v9 8856; GFX1032_DPP-NEXT: v_or_b32_e32 v8, s4, v8 8857; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 8858; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) 8859; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[8:9], off, s[0:3], 0 8860; GFX1032_DPP-NEXT: s_endpgm 8861; 8862; GFX1164_DPP-LABEL: or_i64_varying: 8863; GFX1164_DPP: ; %bb.0: ; %entry 8864; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 8865; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 8866; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) 8867; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] 8868; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] 8869; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, 0 8870; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 8871; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 8872; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 8873; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 8874; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 8875; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 8876; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 8877; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 8878; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 8879; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 8880; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 8881; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 8882; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 
bound_ctrl:1 8883; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 8884; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 8885; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 8886; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 8887; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 8888; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 8889; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 8890; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 31 8891; GFX1164_DPP-NEXT: v_readlane_b32 s3, v2, 31 8892; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 8893; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s2 8894; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s3 8895; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 8896; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 8897; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 8898; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) 8899; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 15 8900; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf 8901; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] 8902; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 8903; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 8904; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf 8905; GFX1164_DPP-NEXT: v_readlane_b32 s3, v2, 15 8906; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 31 8907; GFX1164_DPP-NEXT: v_writelane_b32 v6, s2, 16 8908; GFX1164_DPP-NEXT: v_readlane_b32 s7, v2, 31 8909; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 63 8910; GFX1164_DPP-NEXT: v_writelane_b32 v5, s3, 16 8911; GFX1164_DPP-NEXT: v_readlane_b32 s8, v1, 47 
8912; GFX1164_DPP-NEXT: v_readlane_b32 s3, v1, 63 8913; GFX1164_DPP-NEXT: v_writelane_b32 v6, s6, 32 8914; GFX1164_DPP-NEXT: v_readlane_b32 s9, v2, 47 8915; GFX1164_DPP-NEXT: v_writelane_b32 v5, s7, 32 8916; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] 8917; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 8918; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 8919; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 8920; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 8921; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] 8922; GFX1164_DPP-NEXT: v_writelane_b32 v6, s8, 48 8923; GFX1164_DPP-NEXT: v_writelane_b32 v5, s9, 48 8924; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] 8925; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 8926; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 8927; GFX1164_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 8928; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc 8929; GFX1164_DPP-NEXT: s_cbranch_execz .LBB18_2 8930; GFX1164_DPP-NEXT: ; %bb.1: 8931; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s1 8932; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s0 8933; GFX1164_DPP-NEXT: ds_or_rtn_b64 v[7:8], v0, v[7:8] 8934; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) 8935; GFX1164_DPP-NEXT: buffer_gl0_inv 8936; GFX1164_DPP-NEXT: .LBB18_2: 8937; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[6:7] 8938; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 8939; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v8 8940; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v5 8941; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v6 8942; GFX1164_DPP-NEXT: v_readfirstlane_b32 s4, v7 8943; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 8944; GFX1164_DPP-NEXT: v_or_b32_e32 v9, s3, v9 8945; GFX1164_DPP-NEXT: v_or_b32_e32 v8, s4, v8 8946; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 8947; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) 8948; GFX1164_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], 0 8949; GFX1164_DPP-NEXT: s_endpgm 8950; 8951; GFX1132_DPP-LABEL: or_i64_varying: 8952; GFX1132_DPP: ; %bb.0: ; %entry 8953; GFX1132_DPP-NEXT: 
v_and_b32_e32 v0, 0x3ff, v0 8954; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 8955; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) 8956; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s2 8957; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s2 8958; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v5, 0 8959; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 8960; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 8961; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 8962; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 8963; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 8964; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 8965; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 8966; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 8967; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 8968; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 8969; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 8970; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 8971; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 8972; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 8973; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 8974; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 8975; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 8976; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa 
bank_mask:0xf 8977; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) 8978; GFX1132_DPP-NEXT: v_readlane_b32 s3, v1, 15 8979; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 31 8980; GFX1132_DPP-NEXT: v_readlane_b32 s0, v2, 31 8981; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf 8982; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf 8983; GFX1132_DPP-NEXT: v_readlane_b32 s6, v2, 15 8984; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 8985; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 8986; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 8987; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 8988; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 8989; GFX1132_DPP-NEXT: v_writelane_b32 v6, s3, 16 8990; GFX1132_DPP-NEXT: v_writelane_b32 v5, s6, 16 8991; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 8992; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 8993; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 8994; GFX1132_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 8995; GFX1132_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo 8996; GFX1132_DPP-NEXT: s_cbranch_execz .LBB18_2 8997; GFX1132_DPP-NEXT: ; %bb.1: 8998; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0 8999; GFX1132_DPP-NEXT: ds_or_rtn_b64 v[7:8], v0, v[7:8] 9000; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) 9001; GFX1132_DPP-NEXT: buffer_gl0_inv 9002; GFX1132_DPP-NEXT: .LBB18_2: 9003; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 9004; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 9005; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v8 9006; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v5 9007; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v6 9008; GFX1132_DPP-NEXT: v_readfirstlane_b32 s4, v7 9009; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 9010; GFX1132_DPP-NEXT: v_or_b32_e32 v9, s3, v9 9011; GFX1132_DPP-NEXT: v_or_b32_e32 v8, s4, v8 9012; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 9013; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) 
9014; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], 0 9015; GFX1132_DPP-NEXT: s_endpgm 9016entry: 9017 %lane = call i32 @llvm.amdgcn.workitem.id.x() 9018 %lane_ext = zext i32 %lane to i64 9019 %old = atomicrmw or ptr addrspace(3) @local_var32, i64 %lane_ext acq_rel 9020 store i64 %old, ptr addrspace(1) %out 9021 ret void 9022} 9023 9024define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { 9025; GFX7LESS_ITERATIVE-LABEL: xor_i32_varying: 9026; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry 9027; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 9028; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, 0 9029; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1 9030; GFX7LESS_ITERATIVE-NEXT: .LBB19_1: ; %ComputeLoop 9031; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 9032; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] 9033; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s3 9034; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 9035; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 9036; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 9037; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] 9038; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 9039; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] 9040; GFX7LESS_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8 9041; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB19_1 9042; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 9043; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 9044; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 9045; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 9046; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr0 9047; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc 9048; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 9049; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB19_4 9050; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: 9051; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 9052; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 9053; 
GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 9054; GFX7LESS_ITERATIVE-NEXT: ds_xor_rtn_b32 v0, v0, v2 9055; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 9056; GFX7LESS_ITERATIVE-NEXT: .LBB19_4: 9057; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] 9058; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 9059; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 9060; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 9061; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 9062; GFX7LESS_ITERATIVE-NEXT: v_xor_b32_e32 v0, s4, v1 9063; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 9064; GFX7LESS_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 9065; GFX7LESS_ITERATIVE-NEXT: s_endpgm 9066; 9067; GFX8_ITERATIVE-LABEL: xor_i32_varying: 9068; GFX8_ITERATIVE: ; %bb.0: ; %entry 9069; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 9070; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, 0 9071; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 9072; GFX8_ITERATIVE-NEXT: .LBB19_1: ; %ComputeLoop 9073; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 9074; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] 9075; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 9076; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 9077; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 9078; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 9079; GFX8_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8 9080; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] 9081; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 9082; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 9083; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 9084; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 9085; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 9086; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 9087; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr0 9088; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc 9089; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 9090; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB19_4 9091; GFX8_ITERATIVE-NEXT: ; %bb.3: 9092; 
GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 9093; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 9094; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 9095; GFX8_ITERATIVE-NEXT: ds_xor_rtn_b32 v0, v0, v2 9096; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 9097; GFX8_ITERATIVE-NEXT: .LBB19_4: 9098; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] 9099; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 9100; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 9101; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 9102; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 9103; GFX8_ITERATIVE-NEXT: v_xor_b32_e32 v0, s4, v1 9104; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 9105; GFX8_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 9106; GFX8_ITERATIVE-NEXT: s_endpgm 9107; 9108; GFX9_ITERATIVE-LABEL: xor_i32_varying: 9109; GFX9_ITERATIVE: ; %bb.0: ; %entry 9110; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 9111; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, 0 9112; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 9113; GFX9_ITERATIVE-NEXT: .LBB19_1: ; %ComputeLoop 9114; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 9115; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] 9116; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 9117; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 9118; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 9119; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 9120; GFX9_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8 9121; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] 9122; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 9123; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 9124; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 9125; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 9126; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 9127; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 9128; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0 9129; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc 9130; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 9131; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB19_4 
9132; GFX9_ITERATIVE-NEXT: ; %bb.3: 9133; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 9134; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 9135; GFX9_ITERATIVE-NEXT: ds_xor_rtn_b32 v0, v0, v2 9136; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 9137; GFX9_ITERATIVE-NEXT: .LBB19_4: 9138; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] 9139; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 9140; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 9141; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 9142; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 9143; GFX9_ITERATIVE-NEXT: v_xor_b32_e32 v0, s4, v1 9144; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 9145; GFX9_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 9146; GFX9_ITERATIVE-NEXT: s_endpgm 9147; 9148; GFX1064_ITERATIVE-LABEL: xor_i32_varying: 9149; GFX1064_ITERATIVE: ; %bb.0: ; %entry 9150; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 9151; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, 0 9152; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 9153; GFX1064_ITERATIVE-NEXT: .LBB19_1: ; %ComputeLoop 9154; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 9155; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] 9156; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 9157; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 9158; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 9159; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] 9160; GFX1064_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8 9161; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 9162; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 9163; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 9164; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 9165; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 9166; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 9167; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0 9168; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc 9169; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 9170; GFX1064_ITERATIVE-NEXT: 
s_cbranch_execz .LBB19_4 9171; GFX1064_ITERATIVE-NEXT: ; %bb.3: 9172; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 9173; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 9174; GFX1064_ITERATIVE-NEXT: ds_xor_rtn_b32 v0, v0, v2 9175; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 9176; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv 9177; GFX1064_ITERATIVE-NEXT: .LBB19_4: 9178; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 9179; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] 9180; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 9181; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 9182; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 9183; GFX1064_ITERATIVE-NEXT: v_xor_b32_e32 v0, s2, v1 9184; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 9185; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 9186; GFX1064_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 9187; GFX1064_ITERATIVE-NEXT: s_endpgm 9188; 9189; GFX1032_ITERATIVE-LABEL: xor_i32_varying: 9190; GFX1032_ITERATIVE: ; %bb.0: ; %entry 9191; GFX1032_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo 9192; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, 0 9193; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 9194; GFX1032_ITERATIVE-NEXT: .LBB19_1: ; %ComputeLoop 9195; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 9196; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 9197; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 9198; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 9199; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 9200; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 9201; GFX1032_ITERATIVE-NEXT: s_xor_b32 s0, s0, s3 9202; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 9203; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 9204; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 9205; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 9206; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 9207; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0 9208; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo 9209; 
GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 9210; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB19_4 9211; GFX1032_ITERATIVE-NEXT: ; %bb.3: 9212; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 9213; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 9214; GFX1032_ITERATIVE-NEXT: ds_xor_rtn_b32 v0, v0, v2 9215; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 9216; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv 9217; GFX1032_ITERATIVE-NEXT: .LBB19_4: 9218; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 9219; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 9220; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 9221; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 9222; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 9223; GFX1032_ITERATIVE-NEXT: v_xor_b32_e32 v0, s2, v1 9224; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 9225; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 9226; GFX1032_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 9227; GFX1032_ITERATIVE-NEXT: s_endpgm 9228; 9229; GFX1164_ITERATIVE-LABEL: xor_i32_varying: 9230; GFX1164_ITERATIVE: ; %bb.0: ; %entry 9231; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 9232; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 9233; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, 0 9234; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 9235; GFX1164_ITERATIVE-NEXT: .LBB19_1: ; %ComputeLoop 9236; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 9237; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] 9238; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 9239; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 9240; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 9241; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 9242; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] 9243; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) 9244; GFX1164_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8 9245; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 9246; GFX1164_ITERATIVE-NEXT: 
s_cbranch_scc1 .LBB19_1 9247; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 9248; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 9249; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 9250; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 9251; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 9252; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1 9253; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc 9254; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 9255; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 9256; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB19_4 9257; GFX1164_ITERATIVE-NEXT: ; %bb.3: 9258; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 9259; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 9260; GFX1164_ITERATIVE-NEXT: ds_xor_rtn_b32 v1, v1, v2 9261; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 9262; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv 9263; GFX1164_ITERATIVE-NEXT: .LBB19_4: 9264; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] 9265; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 9266; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 9267; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 9268; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) 9269; GFX1164_ITERATIVE-NEXT: v_xor_b32_e32 v0, s2, v0 9270; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 9271; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 9272; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 9273; GFX1164_ITERATIVE-NEXT: s_endpgm 9274; 9275; GFX1132_ITERATIVE-LABEL: xor_i32_varying: 9276; GFX1132_ITERATIVE: ; %bb.0: ; %entry 9277; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 9278; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo 9279; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, 0 9280; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 9281; GFX1132_ITERATIVE-NEXT: .LBB19_1: ; %ComputeLoop 9282; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 9283; GFX1132_ITERATIVE-NEXT: 
s_ctz_i32_b32 s2, s1 9284; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 9285; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 9286; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 9287; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 9288; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 9289; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) 9290; GFX1132_ITERATIVE-NEXT: s_xor_b32 s0, s0, s3 9291; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 9292; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 9293; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 9294; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 9295; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 9296; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 9297; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 9298; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo 9299; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 9300; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB19_4 9301; GFX1132_ITERATIVE-NEXT: ; %bb.3: 9302; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 9303; GFX1132_ITERATIVE-NEXT: ds_xor_rtn_b32 v1, v1, v2 9304; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 9305; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv 9306; GFX1132_ITERATIVE-NEXT: .LBB19_4: 9307; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 9308; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 9309; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 9310; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 9311; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) 9312; GFX1132_ITERATIVE-NEXT: v_xor_b32_e32 v0, s2, v0 9313; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 9314; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 9315; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 9316; GFX1132_ITERATIVE-NEXT: s_endpgm 9317; 9318; GFX7LESS_DPP-LABEL: xor_i32_varying: 9319; GFX7LESS_DPP: ; %bb.0: ; %entry 
9320; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 9321; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 9322; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 9323; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) 9324; GFX7LESS_DPP-NEXT: ds_xor_rtn_b32 v0, v1, v0 9325; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) 9326; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000 9327; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1 9328; GFX7LESS_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 9329; GFX7LESS_DPP-NEXT: s_endpgm 9330; 9331; GFX8_DPP-LABEL: xor_i32_varying: 9332; GFX8_DPP: ; %bb.0: ; %entry 9333; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 9334; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 9335; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 9336; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 9337; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] 9338; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 9339; GFX8_DPP-NEXT: s_nop 0 9340; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 9341; GFX8_DPP-NEXT: s_nop 1 9342; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 9343; GFX8_DPP-NEXT: s_nop 1 9344; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 9345; GFX8_DPP-NEXT: s_nop 1 9346; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 9347; GFX8_DPP-NEXT: s_nop 1 9348; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 9349; GFX8_DPP-NEXT: s_nop 1 9350; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 9351; GFX8_DPP-NEXT: v_readlane_b32 s2, v1, 63 9352; GFX8_DPP-NEXT: s_nop 0 9353; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf 9354; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] 9355; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 9356; GFX8_DPP-NEXT: ; implicit-def: $vgpr0 9357; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc 9358; GFX8_DPP-NEXT: s_cbranch_execz .LBB19_2 9359; GFX8_DPP-NEXT: ; 
%bb.1: 9360; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s2 9361; GFX8_DPP-NEXT: s_mov_b32 m0, -1 9362; GFX8_DPP-NEXT: ds_xor_rtn_b32 v0, v3, v0 9363; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) 9364; GFX8_DPP-NEXT: .LBB19_2: 9365; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] 9366; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 9367; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 9368; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v2 9369; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 9370; GFX8_DPP-NEXT: s_mov_b32 s2, -1 9371; GFX8_DPP-NEXT: v_xor_b32_e32 v0, s4, v0 9372; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) 9373; GFX8_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 9374; GFX8_DPP-NEXT: s_endpgm 9375; 9376; GFX9_DPP-LABEL: xor_i32_varying: 9377; GFX9_DPP: ; %bb.0: ; %entry 9378; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 9379; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 9380; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 9381; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 9382; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] 9383; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 9384; GFX9_DPP-NEXT: s_nop 0 9385; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 9386; GFX9_DPP-NEXT: s_nop 1 9387; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 9388; GFX9_DPP-NEXT: s_nop 1 9389; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 9390; GFX9_DPP-NEXT: s_nop 1 9391; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 9392; GFX9_DPP-NEXT: s_nop 1 9393; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 9394; GFX9_DPP-NEXT: s_nop 1 9395; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 9396; GFX9_DPP-NEXT: v_readlane_b32 s2, v1, 63 9397; GFX9_DPP-NEXT: s_nop 0 9398; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf 9399; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] 9400; GFX9_DPP-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v4 9401; GFX9_DPP-NEXT: ; implicit-def: $vgpr0 9402; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc 9403; GFX9_DPP-NEXT: s_cbranch_execz .LBB19_2 9404; GFX9_DPP-NEXT: ; %bb.1: 9405; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s2 9406; GFX9_DPP-NEXT: ds_xor_rtn_b32 v0, v3, v0 9407; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) 9408; GFX9_DPP-NEXT: .LBB19_2: 9409; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] 9410; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 9411; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 9412; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v2 9413; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 9414; GFX9_DPP-NEXT: s_mov_b32 s2, -1 9415; GFX9_DPP-NEXT: v_xor_b32_e32 v0, s4, v0 9416; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) 9417; GFX9_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 9418; GFX9_DPP-NEXT: s_endpgm 9419; 9420; GFX1064_DPP-LABEL: xor_i32_varying: 9421; GFX1064_DPP: ; %bb.0: ; %entry 9422; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 9423; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] 9424; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 9425; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 9426; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 9427; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 9428; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 9429; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 9430; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 9431; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 31 9432; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s2 9433; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 9434; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 9435; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 15 9436; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 31 9437; 
GFX1064_DPP-NEXT: v_writelane_b32 v3, s2, 16 9438; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] 9439; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 9440; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 9441; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 47 9442; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 63 9443; GFX1064_DPP-NEXT: v_writelane_b32 v3, s3, 32 9444; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] 9445; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 9446; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 9447; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 9448; GFX1064_DPP-NEXT: v_writelane_b32 v3, s2, 48 9449; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] 9450; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 9451; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 9452; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 9453; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc 9454; GFX1064_DPP-NEXT: s_cbranch_execz .LBB19_2 9455; GFX1064_DPP-NEXT: ; %bb.1: 9456; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s6 9457; GFX1064_DPP-NEXT: s_mov_b32 s3, s6 9458; GFX1064_DPP-NEXT: ds_xor_rtn_b32 v0, v4, v0 9459; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) 9460; GFX1064_DPP-NEXT: buffer_gl0_inv 9461; GFX1064_DPP-NEXT: .LBB19_2: 9462; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 9463; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[0:1] 9464; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 9465; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v0 9466; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 9467; GFX1064_DPP-NEXT: v_xor_b32_e32 v0, s3, v0 9468; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 9469; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) 9470; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 9471; GFX1064_DPP-NEXT: s_endpgm 9472; 9473; GFX1032_DPP-LABEL: xor_i32_varying: 9474; GFX1032_DPP: ; %bb.0: ; %entry 9475; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 9476; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 9477; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 9478; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 
bound_ctrl:1 9479; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 9480; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 9481; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 9482; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 9483; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 9484; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 9485; GFX1032_DPP-NEXT: v_readlane_b32 s2, v1, 31 9486; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 9487; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 9488; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 9489; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 9490; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 9491; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 9492; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 9493; GFX1032_DPP-NEXT: s_mov_b32 s0, s2 9494; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 9495; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 9496; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 9497; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo 9498; GFX1032_DPP-NEXT: s_cbranch_execz .LBB19_2 9499; GFX1032_DPP-NEXT: ; %bb.1: 9500; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s0 9501; GFX1032_DPP-NEXT: ds_xor_rtn_b32 v0, v4, v0 9502; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) 9503; GFX1032_DPP-NEXT: buffer_gl0_inv 9504; GFX1032_DPP-NEXT: .LBB19_2: 9505; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 9506; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 9507; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 9508; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v0 9509; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 9510; GFX1032_DPP-NEXT: v_xor_b32_e32 v0, s3, v0 9511; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 9512; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) 9513; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 9514; GFX1032_DPP-NEXT: s_endpgm 9515; 9516; GFX1164_DPP-LABEL: 
xor_i32_varying: 9517; GFX1164_DPP: ; %bb.0: ; %entry 9518; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 9519; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 9520; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 9521; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] 9522; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 9523; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 9524; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 9525; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 9526; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 9527; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 9528; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 9529; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 9530; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 9531; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 9532; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 9533; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 31 9534; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s2 9535; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 9536; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 9537; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 9538; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 15 9539; GFX1164_DPP-NEXT: v_readlane_b32 s3, v1, 31 9540; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 9541; GFX1164_DPP-NEXT: v_writelane_b32 v3, s2, 16 9542; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] 9543; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 
9544; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 9545; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 47 9546; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 63 9547; GFX1164_DPP-NEXT: v_writelane_b32 v3, s3, 32 9548; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] 9549; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 9550; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 9551; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 9552; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 9553; GFX1164_DPP-NEXT: v_writelane_b32 v3, s2, 48 9554; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] 9555; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 9556; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 9557; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0 9558; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc 9559; GFX1164_DPP-NEXT: s_cbranch_execz .LBB19_2 9560; GFX1164_DPP-NEXT: ; %bb.1: 9561; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s6 9562; GFX1164_DPP-NEXT: s_mov_b32 s3, s6 9563; GFX1164_DPP-NEXT: ds_xor_rtn_b32 v0, v4, v0 9564; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) 9565; GFX1164_DPP-NEXT: buffer_gl0_inv 9566; GFX1164_DPP-NEXT: .LBB19_2: 9567; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[0:1] 9568; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 9569; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v0 9570; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3 9571; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) 9572; GFX1164_DPP-NEXT: v_xor_b32_e32 v0, s3, v0 9573; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 9574; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) 9575; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 9576; GFX1164_DPP-NEXT: s_endpgm 9577; 9578; GFX1132_DPP-LABEL: xor_i32_varying: 9579; GFX1132_DPP: ; %bb.0: ; %entry 9580; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 9581; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 9582; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 9583; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 9584; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 9585; GFX1132_DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 9586; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 9587; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 9588; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 9589; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 9590; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 9591; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 9592; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 9593; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 9594; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) 9595; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 9596; GFX1132_DPP-NEXT: v_readlane_b32 s2, v1, 31 9597; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 9598; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 9599; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 9600; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 9601; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 9602; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 9603; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 9604; GFX1132_DPP-NEXT: s_mov_b32 s0, s2 9605; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 9606; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 9607; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 9608; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo 9609; GFX1132_DPP-NEXT: s_cbranch_execz .LBB19_2 9610; GFX1132_DPP-NEXT: ; %bb.1: 9611; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, s0 9612; GFX1132_DPP-NEXT: ds_xor_rtn_b32 v0, v4, v0 9613; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) 9614; GFX1132_DPP-NEXT: buffer_gl0_inv 9615; GFX1132_DPP-NEXT: .LBB19_2: 9616; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 9617; GFX1132_DPP-NEXT: s_load_b64 
s[0:1], s[4:5], 0x24 9618; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v0 9619; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3 9620; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) 9621; GFX1132_DPP-NEXT: v_xor_b32_e32 v0, s3, v0 9622; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 9623; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) 9624; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 9625; GFX1132_DPP-NEXT: s_endpgm 9626entry: 9627 %lane = call i32 @llvm.amdgcn.workitem.id.x() 9628 %old = atomicrmw xor ptr addrspace(3) @local_var32, i32 %lane acq_rel 9629 store i32 %old, ptr addrspace(1) %out 9630 ret void 9631} 9632 9633define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { 9634; GFX7LESS_ITERATIVE-LABEL: xor_i64_varying: 9635; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry 9636; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec 9637; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 9638; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 9639; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 9640; GFX7LESS_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop 9641; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 9642; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] 9643; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s8 9644; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 9645; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 9646; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 9647; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 9648; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 9649; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] 9650; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[8:9], s[2:3], 0 9651; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[8:9] 9652; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] 9653; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB20_1 9654; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 9655; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 9656; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, 
exec_hi, v0 9657; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 9658; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 9659; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc 9660; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] 9661; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB20_4 9662; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: 9663; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 9664; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 9665; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 9666; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 9667; GFX7LESS_ITERATIVE-NEXT: ds_xor_rtn_b64 v[3:4], v0, v[3:4] 9668; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 9669; GFX7LESS_ITERATIVE-NEXT: .LBB20_4: 9670; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] 9671; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 9672; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 9673; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 9674; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 9675; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 9676; GFX7LESS_ITERATIVE-NEXT: v_xor_b32_e32 v2, s4, v2 9677; GFX7LESS_ITERATIVE-NEXT: v_xor_b32_e32 v1, s5, v1 9678; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 9679; GFX7LESS_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 9680; GFX7LESS_ITERATIVE-NEXT: s_endpgm 9681; 9682; GFX8_ITERATIVE-LABEL: xor_i64_varying: 9683; GFX8_ITERATIVE: ; %bb.0: ; %entry 9684; GFX8_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec 9685; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 9686; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 9687; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 9688; GFX8_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop 9689; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 9690; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] 9691; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8 9692; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 9693; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 9694; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 9695; 
GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 9696; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 9697; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] 9698; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] 9699; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 9700; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 9701; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 9702; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 9703; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 9704; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 9705; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 9706; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc 9707; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] 9708; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB20_4 9709; GFX8_ITERATIVE-NEXT: ; %bb.3: 9710; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 9711; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 9712; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 9713; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 9714; GFX8_ITERATIVE-NEXT: ds_xor_rtn_b64 v[3:4], v0, v[3:4] 9715; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 9716; GFX8_ITERATIVE-NEXT: .LBB20_4: 9717; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] 9718; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 9719; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 9720; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 9721; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 9722; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 9723; GFX8_ITERATIVE-NEXT: v_xor_b32_e32 v2, s4, v2 9724; GFX8_ITERATIVE-NEXT: v_xor_b32_e32 v1, s5, v1 9725; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 9726; GFX8_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 9727; GFX8_ITERATIVE-NEXT: s_endpgm 9728; 9729; GFX9_ITERATIVE-LABEL: xor_i64_varying: 9730; GFX9_ITERATIVE: ; %bb.0: ; %entry 9731; GFX9_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec 9732; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 9733; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 9734; GFX9_ITERATIVE-NEXT: ; 
implicit-def: $vgpr1_vgpr2 9735; GFX9_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop 9736; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 9737; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] 9738; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8 9739; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 9740; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 9741; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 9742; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 9743; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 9744; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] 9745; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] 9746; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 9747; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 9748; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 9749; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 9750; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 9751; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 9752; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 9753; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc 9754; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] 9755; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB20_4 9756; GFX9_ITERATIVE-NEXT: ; %bb.3: 9757; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 9758; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 9759; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 9760; GFX9_ITERATIVE-NEXT: ds_xor_rtn_b64 v[3:4], v0, v[3:4] 9761; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 9762; GFX9_ITERATIVE-NEXT: .LBB20_4: 9763; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] 9764; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 9765; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 9766; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 9767; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 9768; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 9769; GFX9_ITERATIVE-NEXT: v_xor_b32_e32 v2, s4, v2 9770; GFX9_ITERATIVE-NEXT: v_xor_b32_e32 v1, s5, v1 9771; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 9772; 
GFX9_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 9773; GFX9_ITERATIVE-NEXT: s_endpgm 9774; 9775; GFX1064_ITERATIVE-LABEL: xor_i64_varying: 9776; GFX1064_ITERATIVE: ; %bb.0: ; %entry 9777; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 9778; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec 9779; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 9780; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 9781; GFX1064_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop 9782; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 9783; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3] 9784; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 9785; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 9786; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 9787; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10 9788; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10 9789; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] 9790; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] 9791; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 9792; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 9793; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 9794; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 9795; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 9796; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 9797; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 9798; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc 9799; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] 9800; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB20_4 9801; GFX1064_ITERATIVE-NEXT: ; %bb.3: 9802; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 9803; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 9804; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 9805; GFX1064_ITERATIVE-NEXT: ds_xor_rtn_b64 v[3:4], v0, v[3:4] 9806; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 9807; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv 9808; GFX1064_ITERATIVE-NEXT: .LBB20_4: 9809; 
GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 9810; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] 9811; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 9812; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v4 9813; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 9814; GFX1064_ITERATIVE-NEXT: v_xor_b32_e32 v2, s2, v2 9815; GFX1064_ITERATIVE-NEXT: v_xor_b32_e32 v1, s3, v1 9816; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 9817; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 9818; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 9819; GFX1064_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 9820; GFX1064_ITERATIVE-NEXT: s_endpgm 9821; 9822; GFX1032_ITERATIVE-LABEL: xor_i64_varying: 9823; GFX1032_ITERATIVE: ; %bb.0: ; %entry 9824; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 9825; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo 9826; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 9827; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 9828; GFX1032_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop 9829; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 9830; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 9831; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 9832; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 9833; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3 9834; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3 9835; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3 9836; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s8 9837; GFX1032_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] 9838; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 9839; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 9840; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 9841; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 9842; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 9843; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 9844; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s2, vcc_lo 9845; GFX1032_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 9846; 
GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB20_4 9847; GFX1032_ITERATIVE-NEXT: ; %bb.3: 9848; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 9849; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 9850; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 9851; GFX1032_ITERATIVE-NEXT: ds_xor_rtn_b64 v[3:4], v0, v[3:4] 9852; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 9853; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv 9854; GFX1032_ITERATIVE-NEXT: .LBB20_4: 9855; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 9856; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s2 9857; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 9858; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v4 9859; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 9860; GFX1032_ITERATIVE-NEXT: v_xor_b32_e32 v2, s2, v2 9861; GFX1032_ITERATIVE-NEXT: v_xor_b32_e32 v1, s3, v1 9862; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 9863; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 9864; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 9865; GFX1032_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 9866; GFX1032_ITERATIVE-NEXT: s_endpgm 9867; 9868; GFX1164_ITERATIVE-LABEL: xor_i64_varying: 9869; GFX1164_ITERATIVE: ; %bb.0: ; %entry 9870; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 9871; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 9872; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec 9873; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 9874; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 9875; GFX1164_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop 9876; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 9877; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3] 9878; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 9879; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 9880; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 9881; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 9882; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10 9883; GFX1164_ITERATIVE-NEXT: 
v_writelane_b32 v0, s0, s10 9884; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9] 9885; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] 9886; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 9887; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 9888; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 9889; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 9890; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 9891; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 9892; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 9893; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 9894; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc 9895; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 9896; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] 9897; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB20_4 9898; GFX1164_ITERATIVE-NEXT: ; %bb.3: 9899; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 9900; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0 9901; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 9902; GFX1164_ITERATIVE-NEXT: ds_xor_rtn_b64 v[2:3], v4, v[2:3] 9903; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 9904; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv 9905; GFX1164_ITERATIVE-NEXT: .LBB20_4: 9906; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] 9907; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 9908; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 9909; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v2 9910; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 9911; GFX1164_ITERATIVE-NEXT: v_xor_b32_e32 v1, s2, v1 9912; GFX1164_ITERATIVE-NEXT: v_xor_b32_e32 v0, s3, v0 9913; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 9914; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 9915; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 9916; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 9917; GFX1164_ITERATIVE-NEXT: 
s_endpgm 9918; 9919; GFX1132_ITERATIVE-LABEL: xor_i64_varying: 9920; GFX1132_ITERATIVE: ; %bb.0: ; %entry 9921; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 9922; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo 9923; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 9924; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 9925; GFX1132_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop 9926; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 9927; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2 9928; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 9929; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 9930; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 9931; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3 9932; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 9933; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 9934; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s8 9935; GFX1132_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] 9936; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 9937; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 9938; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 9939; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 9940; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 9941; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 9942; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 9943; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s2, vcc_lo 9944; GFX1132_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 9945; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB20_4 9946; GFX1132_ITERATIVE-NEXT: ; %bb.3: 9947; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1 9948; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 9949; GFX1132_ITERATIVE-NEXT: ds_xor_rtn_b64 v[2:3], v4, v[2:3] 9950; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 9951; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv 9952; GFX1132_ITERATIVE-NEXT: .LBB20_4: 9953; 
GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s2 9954; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 9955; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 9956; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v2 9957; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 9958; GFX1132_ITERATIVE-NEXT: v_xor_b32_e32 v1, s2, v1 9959; GFX1132_ITERATIVE-NEXT: v_xor_b32_e32 v0, s3, v0 9960; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 9961; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 9962; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 9963; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 9964; GFX1132_ITERATIVE-NEXT: s_endpgm 9965; 9966; GFX7LESS_DPP-LABEL: xor_i64_varying: 9967; GFX7LESS_DPP: ; %bb.0: ; %entry 9968; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 9969; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 9970; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 9971; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) 9972; GFX7LESS_DPP-NEXT: ds_xor_rtn_b64 v[0:1], v1, v[0:1] 9973; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) 9974; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000 9975; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1 9976; GFX7LESS_DPP-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 9977; GFX7LESS_DPP-NEXT: s_endpgm 9978; 9979; GFX8_DPP-LABEL: xor_i64_varying: 9980; GFX8_DPP: ; %bb.0: ; %entry 9981; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v5, exec_lo, 0 9982; GFX8_DPP-NEXT: v_mov_b32_e32 v7, 0 9983; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v5, exec_hi, v5 9984; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 9985; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] 9986; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] 9987; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 9988; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 9989; GFX8_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 9990; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 9991; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 
row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 9992; GFX8_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 9993; GFX8_DPP-NEXT: s_nop 0 9994; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 9995; GFX8_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 9996; GFX8_DPP-NEXT: s_nop 0 9997; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 9998; GFX8_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 9999; GFX8_DPP-NEXT: s_nop 0 10000; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 10001; GFX8_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 10002; GFX8_DPP-NEXT: s_nop 0 10003; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 10004; GFX8_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 10005; GFX8_DPP-NEXT: v_readlane_b32 s3, v1, 63 10006; GFX8_DPP-NEXT: v_readlane_b32 s2, v2, 63 10007; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v1 wave_shr:1 row_mask:0xf bank_mask:0xf 10008; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 10009; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] 10010; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 10011; GFX8_DPP-NEXT: ; implicit-def: $vgpr5_vgpr6 10012; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc 10013; GFX8_DPP-NEXT: s_cbranch_execz .LBB20_2 10014; GFX8_DPP-NEXT: ; %bb.1: 10015; GFX8_DPP-NEXT: v_mov_b32_e32 v6, s3 10016; GFX8_DPP-NEXT: v_mov_b32_e32 v5, s2 10017; GFX8_DPP-NEXT: s_mov_b32 m0, -1 10018; GFX8_DPP-NEXT: ds_xor_rtn_b64 v[5:6], v7, v[5:6] 10019; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) 10020; GFX8_DPP-NEXT: .LBB20_2: 10021; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] 10022; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 10023; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v6 10024; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v5 10025; 
GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 10026; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 10027; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 10028; GFX8_DPP-NEXT: s_mov_b32 s2, -1 10029; GFX8_DPP-NEXT: v_xor_b32_e32 v6, s4, v6 10030; GFX8_DPP-NEXT: v_xor_b32_e32 v5, s5, v5 10031; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) 10032; GFX8_DPP-NEXT: buffer_store_dwordx2 v[5:6], off, s[0:3], 0 10033; GFX8_DPP-NEXT: s_endpgm 10034; 10035; GFX9_DPP-LABEL: xor_i64_varying: 10036; GFX9_DPP: ; %bb.0: ; %entry 10037; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v5, exec_lo, 0 10038; GFX9_DPP-NEXT: v_mov_b32_e32 v7, 0 10039; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v5, exec_hi, v5 10040; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 10041; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] 10042; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] 10043; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 10044; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 10045; GFX9_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 10046; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 10047; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 10048; GFX9_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 10049; GFX9_DPP-NEXT: s_nop 0 10050; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 10051; GFX9_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 10052; GFX9_DPP-NEXT: s_nop 0 10053; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 10054; GFX9_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 10055; GFX9_DPP-NEXT: s_nop 0 10056; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 10057; GFX9_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 10058; GFX9_DPP-NEXT: s_nop 0 10059; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 
row_bcast:31 row_mask:0xc bank_mask:0xf 10060; GFX9_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 10061; GFX9_DPP-NEXT: v_readlane_b32 s3, v1, 63 10062; GFX9_DPP-NEXT: v_readlane_b32 s2, v2, 63 10063; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v1 wave_shr:1 row_mask:0xf bank_mask:0xf 10064; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 10065; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] 10066; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 10067; GFX9_DPP-NEXT: ; implicit-def: $vgpr5_vgpr6 10068; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc 10069; GFX9_DPP-NEXT: s_cbranch_execz .LBB20_2 10070; GFX9_DPP-NEXT: ; %bb.1: 10071; GFX9_DPP-NEXT: v_mov_b32_e32 v6, s3 10072; GFX9_DPP-NEXT: v_mov_b32_e32 v5, s2 10073; GFX9_DPP-NEXT: ds_xor_rtn_b64 v[5:6], v7, v[5:6] 10074; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) 10075; GFX9_DPP-NEXT: .LBB20_2: 10076; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] 10077; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 10078; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v6 10079; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v5 10080; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 10081; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 10082; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 10083; GFX9_DPP-NEXT: s_mov_b32 s2, -1 10084; GFX9_DPP-NEXT: v_xor_b32_e32 v6, s4, v6 10085; GFX9_DPP-NEXT: v_xor_b32_e32 v5, s5, v5 10086; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) 10087; GFX9_DPP-NEXT: buffer_store_dwordx2 v[5:6], off, s[0:3], 0 10088; GFX9_DPP-NEXT: s_endpgm 10089; 10090; GFX1064_DPP-LABEL: xor_i64_varying: 10091; GFX1064_DPP: ; %bb.0: ; %entry 10092; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 10093; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] 10094; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] 10095; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, 0 10096; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 10097; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 10098; GFX1064_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 
row_mask:0xf bank_mask:0xf bound_ctrl:1 10099; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 10100; GFX1064_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 10101; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 10102; GFX1064_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 10103; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 10104; GFX1064_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 10105; GFX1064_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 10106; GFX1064_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 10107; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 10108; GFX1064_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 10109; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 31 10110; GFX1064_DPP-NEXT: v_readlane_b32 s3, v2, 31 10111; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, s2 10112; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s3 10113; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 10114; GFX1064_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 10115; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] 10116; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 10117; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 10118; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf 10119; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 15 10120; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf 10121; GFX1064_DPP-NEXT: v_readlane_b32 s3, v2, 15 10122; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 31 10123; GFX1064_DPP-NEXT: v_readlane_b32 s7, v2, 31 10124; GFX1064_DPP-NEXT: v_writelane_b32 v6, s2, 16 10125; GFX1064_DPP-NEXT: v_readlane_b32 s2, v2, 63 10126; GFX1064_DPP-NEXT: 
v_writelane_b32 v5, s3, 16 10127; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 47 10128; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 63 10129; GFX1064_DPP-NEXT: v_readlane_b32 s9, v2, 47 10130; GFX1064_DPP-NEXT: v_writelane_b32 v6, s6, 32 10131; GFX1064_DPP-NEXT: v_writelane_b32 v5, s7, 32 10132; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] 10133; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 10134; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 10135; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 10136; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] 10137; GFX1064_DPP-NEXT: v_writelane_b32 v6, s8, 48 10138; GFX1064_DPP-NEXT: v_writelane_b32 v5, s9, 48 10139; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] 10140; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 10141; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 10142; GFX1064_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 10143; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc 10144; GFX1064_DPP-NEXT: s_cbranch_execz .LBB20_2 10145; GFX1064_DPP-NEXT: ; %bb.1: 10146; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s1 10147; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, s0 10148; GFX1064_DPP-NEXT: ds_xor_rtn_b64 v[7:8], v0, v[7:8] 10149; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) 10150; GFX1064_DPP-NEXT: buffer_gl0_inv 10151; GFX1064_DPP-NEXT: .LBB20_2: 10152; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 10153; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[6:7] 10154; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 10155; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v8 10156; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v5 10157; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v6 10158; GFX1064_DPP-NEXT: s_mov_b32 null, 0 10159; GFX1064_DPP-NEXT: v_readfirstlane_b32 s4, v7 10160; GFX1064_DPP-NEXT: v_xor_b32_e32 v9, s3, v9 10161; GFX1064_DPP-NEXT: v_xor_b32_e32 v8, s4, v8 10162; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 10163; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) 10164; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[8:9], off, s[0:3], 0 10165; GFX1064_DPP-NEXT: s_endpgm 10166; 10167; GFX1032_DPP-LABEL: 
xor_i64_varying: 10168; GFX1032_DPP: ; %bb.0: ; %entry 10169; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 10170; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s2 10171; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s2 10172; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, 0 10173; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0 10174; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 10175; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 10176; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 10177; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 10178; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 10179; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 10180; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 10181; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 10182; GFX1032_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 10183; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 10184; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 10185; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 10186; GFX1032_DPP-NEXT: v_readlane_b32 s3, v1, 15 10187; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 31 10188; GFX1032_DPP-NEXT: v_readlane_b32 s0, v2, 31 10189; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf 10190; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf 10191; GFX1032_DPP-NEXT: v_readlane_b32 s6, v2, 15 10192; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 10193; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 10194; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 10195; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 10196; 
GFX1032_DPP-NEXT: v_writelane_b32 v6, s3, 16 10197; GFX1032_DPP-NEXT: v_writelane_b32 v5, s6, 16 10198; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 10199; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 10200; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 10201; GFX1032_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 10202; GFX1032_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo 10203; GFX1032_DPP-NEXT: s_cbranch_execz .LBB20_2 10204; GFX1032_DPP-NEXT: ; %bb.1: 10205; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, s1 10206; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, s0 10207; GFX1032_DPP-NEXT: ds_xor_rtn_b64 v[7:8], v0, v[7:8] 10208; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) 10209; GFX1032_DPP-NEXT: buffer_gl0_inv 10210; GFX1032_DPP-NEXT: .LBB20_2: 10211; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 10212; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 10213; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 10214; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v8 10215; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v5 10216; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v6 10217; GFX1032_DPP-NEXT: s_mov_b32 null, 0 10218; GFX1032_DPP-NEXT: v_readfirstlane_b32 s4, v7 10219; GFX1032_DPP-NEXT: v_xor_b32_e32 v9, s3, v9 10220; GFX1032_DPP-NEXT: v_xor_b32_e32 v8, s4, v8 10221; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 10222; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) 10223; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[8:9], off, s[0:3], 0 10224; GFX1032_DPP-NEXT: s_endpgm 10225; 10226; GFX1164_DPP-LABEL: xor_i64_varying: 10227; GFX1164_DPP: ; %bb.0: ; %entry 10228; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 10229; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 10230; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) 10231; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] 10232; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] 10233; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, 0 10234; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 10235; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | 
instid1(VALU_DEP_4) 10236; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 10237; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 10238; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 10239; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 10240; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 10241; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 10242; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 10243; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 10244; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 10245; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 10246; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 10247; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 10248; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 10249; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 10250; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 10251; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 10252; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 10253; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 10254; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 31 10255; GFX1164_DPP-NEXT: v_readlane_b32 s3, v2, 31 10256; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 10257; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s2 10258; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s3 
10259; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 10260; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 10261; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 10262; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) 10263; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 15 10264; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf 10265; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] 10266; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 10267; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 10268; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf 10269; GFX1164_DPP-NEXT: v_readlane_b32 s3, v2, 15 10270; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 31 10271; GFX1164_DPP-NEXT: v_writelane_b32 v6, s2, 16 10272; GFX1164_DPP-NEXT: v_readlane_b32 s7, v2, 31 10273; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 63 10274; GFX1164_DPP-NEXT: v_writelane_b32 v5, s3, 16 10275; GFX1164_DPP-NEXT: v_readlane_b32 s8, v1, 47 10276; GFX1164_DPP-NEXT: v_readlane_b32 s3, v1, 63 10277; GFX1164_DPP-NEXT: v_writelane_b32 v6, s6, 32 10278; GFX1164_DPP-NEXT: v_readlane_b32 s9, v2, 47 10279; GFX1164_DPP-NEXT: v_writelane_b32 v5, s7, 32 10280; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] 10281; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 10282; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 10283; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 10284; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 10285; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] 10286; GFX1164_DPP-NEXT: v_writelane_b32 v6, s8, 48 10287; GFX1164_DPP-NEXT: v_writelane_b32 v5, s9, 48 10288; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] 10289; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 10290; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 10291; GFX1164_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 10292; GFX1164_DPP-NEXT: s_and_saveexec_b64 
s[6:7], vcc 10293; GFX1164_DPP-NEXT: s_cbranch_execz .LBB20_2 10294; GFX1164_DPP-NEXT: ; %bb.1: 10295; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s1 10296; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s0 10297; GFX1164_DPP-NEXT: ds_xor_rtn_b64 v[7:8], v0, v[7:8] 10298; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) 10299; GFX1164_DPP-NEXT: buffer_gl0_inv 10300; GFX1164_DPP-NEXT: .LBB20_2: 10301; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[6:7] 10302; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 10303; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v8 10304; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v5 10305; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v6 10306; GFX1164_DPP-NEXT: v_readfirstlane_b32 s4, v7 10307; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 10308; GFX1164_DPP-NEXT: v_xor_b32_e32 v9, s3, v9 10309; GFX1164_DPP-NEXT: v_xor_b32_e32 v8, s4, v8 10310; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 10311; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) 10312; GFX1164_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], 0 10313; GFX1164_DPP-NEXT: s_endpgm 10314; 10315; GFX1132_DPP-LABEL: xor_i64_varying: 10316; GFX1132_DPP: ; %bb.0: ; %entry 10317; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 10318; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 10319; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) 10320; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s2 10321; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s2 10322; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v5, 0 10323; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 10324; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 10325; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 10326; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 10327; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 
row_mask:0xf bank_mask:0xf bound_ctrl:1 10328; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 10329; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 10330; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 10331; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 10332; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 10333; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 10334; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 10335; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 10336; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 10337; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 10338; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 10339; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 10340; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 10341; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) 10342; GFX1132_DPP-NEXT: v_readlane_b32 s3, v1, 15 10343; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 31 10344; GFX1132_DPP-NEXT: v_readlane_b32 s0, v2, 31 10345; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf 10346; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf 10347; GFX1132_DPP-NEXT: v_readlane_b32 s6, v2, 15 10348; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 10349; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 10350; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 10351; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 10352; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 10353; GFX1132_DPP-NEXT: 
v_writelane_b32 v6, s3, 16 10354; GFX1132_DPP-NEXT: v_writelane_b32 v5, s6, 16 10355; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 10356; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 10357; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 10358; GFX1132_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 10359; GFX1132_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo 10360; GFX1132_DPP-NEXT: s_cbranch_execz .LBB20_2 10361; GFX1132_DPP-NEXT: ; %bb.1: 10362; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0 10363; GFX1132_DPP-NEXT: ds_xor_rtn_b64 v[7:8], v0, v[7:8] 10364; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) 10365; GFX1132_DPP-NEXT: buffer_gl0_inv 10366; GFX1132_DPP-NEXT: .LBB20_2: 10367; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 10368; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 10369; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v8 10370; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v5 10371; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v6 10372; GFX1132_DPP-NEXT: v_readfirstlane_b32 s4, v7 10373; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 10374; GFX1132_DPP-NEXT: v_xor_b32_e32 v9, s3, v9 10375; GFX1132_DPP-NEXT: v_xor_b32_e32 v8, s4, v8 10376; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 10377; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) 10378; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], 0 10379; GFX1132_DPP-NEXT: s_endpgm 10380entry: 10381 %lane = call i32 @llvm.amdgcn.workitem.id.x() 10382 %lane_ext = zext i32 %lane to i64 10383 %old = atomicrmw xor ptr addrspace(3) @local_var32, i64 %lane_ext acq_rel 10384 store i64 %old, ptr addrspace(1) %out 10385 ret void 10386} 10387 10388define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { 10389; GFX7LESS_ITERATIVE-LABEL: max_i32_varying: 10390; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry 10391; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 10392; GFX7LESS_ITERATIVE-NEXT: s_brev_b32 s2, 1 10393; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1 10394; GFX7LESS_ITERATIVE-NEXT: .LBB21_1: ; 
%ComputeLoop 10395; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 10396; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] 10397; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s3 10398; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 10399; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 10400; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 10401; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] 10402; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 10403; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] 10404; GFX7LESS_ITERATIVE-NEXT: s_max_i32 s2, s2, s8 10405; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB21_1 10406; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 10407; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 10408; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 10409; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 10410; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr0 10411; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc 10412; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 10413; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB21_4 10414; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: 10415; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 10416; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 10417; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 10418; GFX7LESS_ITERATIVE-NEXT: ds_max_rtn_i32 v0, v0, v2 10419; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 10420; GFX7LESS_ITERATIVE-NEXT: .LBB21_4: 10421; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] 10422; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 10423; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 10424; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 10425; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 10426; GFX7LESS_ITERATIVE-NEXT: v_max_i32_e32 v0, s4, v1 10427; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 10428; GFX7LESS_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 10429; 
GFX7LESS_ITERATIVE-NEXT: s_endpgm 10430; 10431; GFX8_ITERATIVE-LABEL: max_i32_varying: 10432; GFX8_ITERATIVE: ; %bb.0: ; %entry 10433; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 10434; GFX8_ITERATIVE-NEXT: s_brev_b32 s2, 1 10435; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 10436; GFX8_ITERATIVE-NEXT: .LBB21_1: ; %ComputeLoop 10437; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 10438; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] 10439; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 10440; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 10441; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 10442; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 10443; GFX8_ITERATIVE-NEXT: s_max_i32 s2, s2, s8 10444; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] 10445; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 10446; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 10447; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 10448; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 10449; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 10450; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 10451; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr0 10452; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc 10453; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 10454; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB21_4 10455; GFX8_ITERATIVE-NEXT: ; %bb.3: 10456; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 10457; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 10458; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 10459; GFX8_ITERATIVE-NEXT: ds_max_rtn_i32 v0, v0, v2 10460; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 10461; GFX8_ITERATIVE-NEXT: .LBB21_4: 10462; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] 10463; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 10464; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 10465; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 10466; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 10467; GFX8_ITERATIVE-NEXT: v_max_i32_e32 v0, s4, v1 10468; GFX8_ITERATIVE-NEXT: 
s_waitcnt lgkmcnt(0) 10469; GFX8_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 10470; GFX8_ITERATIVE-NEXT: s_endpgm 10471; 10472; GFX9_ITERATIVE-LABEL: max_i32_varying: 10473; GFX9_ITERATIVE: ; %bb.0: ; %entry 10474; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 10475; GFX9_ITERATIVE-NEXT: s_brev_b32 s2, 1 10476; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 10477; GFX9_ITERATIVE-NEXT: .LBB21_1: ; %ComputeLoop 10478; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 10479; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] 10480; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 10481; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 10482; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 10483; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 10484; GFX9_ITERATIVE-NEXT: s_max_i32 s2, s2, s8 10485; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] 10486; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 10487; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 10488; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 10489; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 10490; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 10491; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 10492; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0 10493; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc 10494; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 10495; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB21_4 10496; GFX9_ITERATIVE-NEXT: ; %bb.3: 10497; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 10498; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 10499; GFX9_ITERATIVE-NEXT: ds_max_rtn_i32 v0, v0, v2 10500; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 10501; GFX9_ITERATIVE-NEXT: .LBB21_4: 10502; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] 10503; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 10504; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 10505; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 10506; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 10507; GFX9_ITERATIVE-NEXT: v_max_i32_e32 
v0, s4, v1 10508; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 10509; GFX9_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 10510; GFX9_ITERATIVE-NEXT: s_endpgm 10511; 10512; GFX1064_ITERATIVE-LABEL: max_i32_varying: 10513; GFX1064_ITERATIVE: ; %bb.0: ; %entry 10514; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 10515; GFX1064_ITERATIVE-NEXT: s_brev_b32 s2, 1 10516; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 10517; GFX1064_ITERATIVE-NEXT: .LBB21_1: ; %ComputeLoop 10518; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 10519; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] 10520; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 10521; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 10522; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 10523; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] 10524; GFX1064_ITERATIVE-NEXT: s_max_i32 s2, s2, s8 10525; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 10526; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 10527; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 10528; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 10529; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 10530; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 10531; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0 10532; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc 10533; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 10534; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB21_4 10535; GFX1064_ITERATIVE-NEXT: ; %bb.3: 10536; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 10537; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 10538; GFX1064_ITERATIVE-NEXT: ds_max_rtn_i32 v0, v0, v2 10539; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 10540; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv 10541; GFX1064_ITERATIVE-NEXT: .LBB21_4: 10542; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 10543; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] 10544; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 
10545; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 10546; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 10547; GFX1064_ITERATIVE-NEXT: v_max_i32_e32 v0, s2, v1 10548; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 10549; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 10550; GFX1064_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 10551; GFX1064_ITERATIVE-NEXT: s_endpgm 10552; 10553; GFX1032_ITERATIVE-LABEL: max_i32_varying: 10554; GFX1032_ITERATIVE: ; %bb.0: ; %entry 10555; GFX1032_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo 10556; GFX1032_ITERATIVE-NEXT: s_brev_b32 s0, 1 10557; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 10558; GFX1032_ITERATIVE-NEXT: .LBB21_1: ; %ComputeLoop 10559; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 10560; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 10561; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 10562; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 10563; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 10564; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 10565; GFX1032_ITERATIVE-NEXT: s_max_i32 s0, s0, s3 10566; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 10567; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 10568; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 10569; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 10570; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 10571; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0 10572; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo 10573; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 10574; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB21_4 10575; GFX1032_ITERATIVE-NEXT: ; %bb.3: 10576; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 10577; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 10578; GFX1032_ITERATIVE-NEXT: ds_max_rtn_i32 v0, v0, v2 10579; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 10580; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv 10581; GFX1032_ITERATIVE-NEXT: .LBB21_4: 10582; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 10583; 
GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 10584; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 10585; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 10586; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 10587; GFX1032_ITERATIVE-NEXT: v_max_i32_e32 v0, s2, v1 10588; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 10589; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 10590; GFX1032_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 10591; GFX1032_ITERATIVE-NEXT: s_endpgm 10592; 10593; GFX1164_ITERATIVE-LABEL: max_i32_varying: 10594; GFX1164_ITERATIVE: ; %bb.0: ; %entry 10595; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 10596; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 10597; GFX1164_ITERATIVE-NEXT: s_brev_b32 s2, 1 10598; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 10599; GFX1164_ITERATIVE-NEXT: .LBB21_1: ; %ComputeLoop 10600; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 10601; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] 10602; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 10603; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 10604; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 10605; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 10606; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] 10607; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) 10608; GFX1164_ITERATIVE-NEXT: s_max_i32 s2, s2, s8 10609; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 10610; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 10611; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 10612; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 10613; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 10614; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 10615; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 10616; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1 10617; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 
s[0:1], vcc 10618; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 10619; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 10620; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB21_4 10621; GFX1164_ITERATIVE-NEXT: ; %bb.3: 10622; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 10623; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 10624; GFX1164_ITERATIVE-NEXT: ds_max_rtn_i32 v1, v1, v2 10625; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 10626; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv 10627; GFX1164_ITERATIVE-NEXT: .LBB21_4: 10628; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] 10629; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 10630; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 10631; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 10632; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) 10633; GFX1164_ITERATIVE-NEXT: v_max_i32_e32 v0, s2, v0 10634; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 10635; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 10636; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 10637; GFX1164_ITERATIVE-NEXT: s_endpgm 10638; 10639; GFX1132_ITERATIVE-LABEL: max_i32_varying: 10640; GFX1132_ITERATIVE: ; %bb.0: ; %entry 10641; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 10642; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo 10643; GFX1132_ITERATIVE-NEXT: s_brev_b32 s0, 1 10644; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 10645; GFX1132_ITERATIVE-NEXT: .LBB21_1: ; %ComputeLoop 10646; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 10647; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 10648; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 10649; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 10650; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 10651; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 10652; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 10653; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) 10654; GFX1132_ITERATIVE-NEXT: s_max_i32 
s0, s0, s3 10655; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 10656; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 10657; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 10658; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 10659; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 10660; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 10661; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 10662; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo 10663; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 10664; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB21_4 10665; GFX1132_ITERATIVE-NEXT: ; %bb.3: 10666; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 10667; GFX1132_ITERATIVE-NEXT: ds_max_rtn_i32 v1, v1, v2 10668; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 10669; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv 10670; GFX1132_ITERATIVE-NEXT: .LBB21_4: 10671; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 10672; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 10673; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 10674; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 10675; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) 10676; GFX1132_ITERATIVE-NEXT: v_max_i32_e32 v0, s2, v0 10677; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 10678; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 10679; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 10680; GFX1132_ITERATIVE-NEXT: s_endpgm 10681; 10682; GFX7LESS_DPP-LABEL: max_i32_varying: 10683; GFX7LESS_DPP: ; %bb.0: ; %entry 10684; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 10685; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 10686; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 10687; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) 10688; GFX7LESS_DPP-NEXT: ds_max_rtn_i32 v0, v1, v0 10689; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) 10690; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000 10691; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1 10692; 
GFX7LESS_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 10693; GFX7LESS_DPP-NEXT: s_endpgm 10694; 10695; GFX8_DPP-LABEL: max_i32_varying: 10696; GFX8_DPP: ; %bb.0: ; %entry 10697; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 10698; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 10699; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 10700; GFX8_DPP-NEXT: v_bfrev_b32_e32 v1, 1 10701; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[0:1] 10702; GFX8_DPP-NEXT: s_nop 1 10703; GFX8_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 10704; GFX8_DPP-NEXT: s_nop 1 10705; GFX8_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 10706; GFX8_DPP-NEXT: s_nop 1 10707; GFX8_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 10708; GFX8_DPP-NEXT: s_nop 1 10709; GFX8_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 10710; GFX8_DPP-NEXT: s_nop 1 10711; GFX8_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 10712; GFX8_DPP-NEXT: s_nop 1 10713; GFX8_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 10714; GFX8_DPP-NEXT: v_readlane_b32 s2, v2, 63 10715; GFX8_DPP-NEXT: s_nop 0 10716; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 10717; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] 10718; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 10719; GFX8_DPP-NEXT: ; implicit-def: $vgpr0 10720; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc 10721; GFX8_DPP-NEXT: s_cbranch_execz .LBB21_2 10722; GFX8_DPP-NEXT: ; %bb.1: 10723; GFX8_DPP-NEXT: v_mov_b32_e32 v0, 0 10724; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s2 10725; GFX8_DPP-NEXT: s_mov_b32 m0, -1 10726; GFX8_DPP-NEXT: ds_max_rtn_i32 v0, v0, v3 10727; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) 10728; GFX8_DPP-NEXT: .LBB21_2: 10729; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] 10730; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 10731; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 10732; GFX8_DPP-NEXT: 
v_mov_b32_e32 v0, v1 10733; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 10734; GFX8_DPP-NEXT: s_mov_b32 s2, -1 10735; GFX8_DPP-NEXT: v_max_i32_e32 v0, s4, v0 10736; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) 10737; GFX8_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 10738; GFX8_DPP-NEXT: s_endpgm 10739; 10740; GFX9_DPP-LABEL: max_i32_varying: 10741; GFX9_DPP: ; %bb.0: ; %entry 10742; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 10743; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 10744; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 10745; GFX9_DPP-NEXT: v_bfrev_b32_e32 v1, 1 10746; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[0:1] 10747; GFX9_DPP-NEXT: s_nop 1 10748; GFX9_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 10749; GFX9_DPP-NEXT: s_nop 1 10750; GFX9_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 10751; GFX9_DPP-NEXT: s_nop 1 10752; GFX9_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 10753; GFX9_DPP-NEXT: s_nop 1 10754; GFX9_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 10755; GFX9_DPP-NEXT: s_nop 1 10756; GFX9_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 10757; GFX9_DPP-NEXT: s_nop 1 10758; GFX9_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 10759; GFX9_DPP-NEXT: v_readlane_b32 s2, v2, 63 10760; GFX9_DPP-NEXT: s_nop 0 10761; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 10762; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] 10763; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 10764; GFX9_DPP-NEXT: ; implicit-def: $vgpr0 10765; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc 10766; GFX9_DPP-NEXT: s_cbranch_execz .LBB21_2 10767; GFX9_DPP-NEXT: ; %bb.1: 10768; GFX9_DPP-NEXT: v_mov_b32_e32 v0, 0 10769; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s2 10770; GFX9_DPP-NEXT: ds_max_rtn_i32 v0, v0, v3 10771; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) 10772; GFX9_DPP-NEXT: .LBB21_2: 10773; GFX9_DPP-NEXT: s_or_b64 exec, 
exec, s[0:1] 10774; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 10775; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 10776; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v1 10777; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 10778; GFX9_DPP-NEXT: s_mov_b32 s2, -1 10779; GFX9_DPP-NEXT: v_max_i32_e32 v0, s4, v0 10780; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) 10781; GFX9_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 10782; GFX9_DPP-NEXT: s_endpgm 10783; 10784; GFX1064_DPP-LABEL: max_i32_varying: 10785; GFX1064_DPP: ; %bb.0: ; %entry 10786; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 10787; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0x80000000, v0, s[0:1] 10788; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v3, 1 10789; GFX1064_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 10790; GFX1064_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 10791; GFX1064_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 10792; GFX1064_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 10793; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 10794; GFX1064_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 10795; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 31 10796; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s2 10797; GFX1064_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 10798; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 10799; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 15 10800; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 31 10801; GFX1064_DPP-NEXT: v_writelane_b32 v3, s2, 16 10802; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] 10803; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 10804; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 10805; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 47 10806; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 63 10807; GFX1064_DPP-NEXT: v_writelane_b32 v3, s3, 32 10808; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] 10809; 
GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 10810; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 10811; GFX1064_DPP-NEXT: v_writelane_b32 v3, s2, 48 10812; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] 10813; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 10814; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 10815; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 10816; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc 10817; GFX1064_DPP-NEXT: s_cbranch_execz .LBB21_2 10818; GFX1064_DPP-NEXT: ; %bb.1: 10819; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 10820; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s6 10821; GFX1064_DPP-NEXT: s_mov_b32 s3, s6 10822; GFX1064_DPP-NEXT: ds_max_rtn_i32 v0, v0, v4 10823; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) 10824; GFX1064_DPP-NEXT: buffer_gl0_inv 10825; GFX1064_DPP-NEXT: .LBB21_2: 10826; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 10827; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[0:1] 10828; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 10829; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v0 10830; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 10831; GFX1064_DPP-NEXT: v_max_i32_e32 v0, s3, v0 10832; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 10833; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) 10834; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 10835; GFX1064_DPP-NEXT: s_endpgm 10836; 10837; GFX1032_DPP-LABEL: max_i32_varying: 10838; GFX1032_DPP: ; %bb.0: ; %entry 10839; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 10840; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0x80000000, v0, s0 10841; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v3, 1 10842; GFX1032_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 10843; GFX1032_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 10844; GFX1032_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 10845; GFX1032_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 10846; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 10847; GFX1032_DPP-NEXT: v_max_i32_dpp v1, 
v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 10848; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 10849; GFX1032_DPP-NEXT: v_readlane_b32 s2, v1, 31 10850; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 10851; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 10852; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 10853; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 10854; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 10855; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 10856; GFX1032_DPP-NEXT: s_mov_b32 s0, s2 10857; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 10858; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 10859; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 10860; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo 10861; GFX1032_DPP-NEXT: s_cbranch_execz .LBB21_2 10862; GFX1032_DPP-NEXT: ; %bb.1: 10863; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 10864; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s0 10865; GFX1032_DPP-NEXT: ds_max_rtn_i32 v0, v0, v4 10866; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) 10867; GFX1032_DPP-NEXT: buffer_gl0_inv 10868; GFX1032_DPP-NEXT: .LBB21_2: 10869; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 10870; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 10871; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 10872; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v0 10873; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 10874; GFX1032_DPP-NEXT: v_max_i32_e32 v0, s3, v0 10875; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 10876; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) 10877; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 10878; GFX1032_DPP-NEXT: s_endpgm 10879; 10880; GFX1164_DPP-LABEL: max_i32_varying: 10881; GFX1164_DPP: ; %bb.0: ; %entry 10882; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 10883; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 10884; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 10885; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0x80000000, v0, s[0:1] 10886; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v3, 1 10887; 
GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 10888; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 10889; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 10890; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 10891; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 10892; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 10893; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 10894; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 10895; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 10896; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 10897; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 31 10898; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s2 10899; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 10900; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 10901; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 10902; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 15 10903; GFX1164_DPP-NEXT: v_readlane_b32 s3, v1, 31 10904; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 10905; GFX1164_DPP-NEXT: v_writelane_b32 v3, s2, 16 10906; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] 10907; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 10908; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 10909; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 47 10910; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 63 10911; GFX1164_DPP-NEXT: v_writelane_b32 v3, s3, 32 10912; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] 10913; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) 10914; GFX1164_DPP-NEXT: 
v_mbcnt_hi_u32_b32 v0, exec_hi, v0 10915; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 10916; GFX1164_DPP-NEXT: v_writelane_b32 v3, s2, 48 10917; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] 10918; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 10919; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 10920; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0 10921; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc 10922; GFX1164_DPP-NEXT: s_cbranch_execz .LBB21_2 10923; GFX1164_DPP-NEXT: ; %bb.1: 10924; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 10925; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s6 10926; GFX1164_DPP-NEXT: s_mov_b32 s3, s6 10927; GFX1164_DPP-NEXT: ds_max_rtn_i32 v0, v0, v4 10928; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) 10929; GFX1164_DPP-NEXT: buffer_gl0_inv 10930; GFX1164_DPP-NEXT: .LBB21_2: 10931; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[0:1] 10932; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 10933; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v0 10934; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3 10935; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) 10936; GFX1164_DPP-NEXT: v_max_i32_e32 v0, s3, v0 10937; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 10938; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) 10939; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 10940; GFX1164_DPP-NEXT: s_endpgm 10941; 10942; GFX1132_DPP-LABEL: max_i32_varying: 10943; GFX1132_DPP: ; %bb.0: ; %entry 10944; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 10945; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 10946; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 10947; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0x80000000, v0, s0 10948; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v3, 1 10949; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 10950; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 10951; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 10952; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) 10953; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 10954; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 10955; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 10956; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 10957; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 10958; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) 10959; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 10960; GFX1132_DPP-NEXT: v_readlane_b32 s2, v1, 31 10961; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 10962; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 10963; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 10964; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 10965; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 10966; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 10967; GFX1132_DPP-NEXT: s_mov_b32 s0, s2 10968; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 10969; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 10970; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 10971; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo 10972; GFX1132_DPP-NEXT: s_cbranch_execz .LBB21_2 10973; GFX1132_DPP-NEXT: ; %bb.1: 10974; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 10975; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, s0 10976; GFX1132_DPP-NEXT: ds_max_rtn_i32 v0, v0, v4 10977; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) 10978; GFX1132_DPP-NEXT: buffer_gl0_inv 10979; GFX1132_DPP-NEXT: .LBB21_2: 10980; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 10981; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 10982; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v0 10983; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3 10984; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) 10985; GFX1132_DPP-NEXT: v_max_i32_e32 v0, s3, v0 10986; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 10987; GFX1132_DPP-NEXT: s_waitcnt 
lgkmcnt(0) 10988; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 10989; GFX1132_DPP-NEXT: s_endpgm 10990entry: 10991 %lane = call i32 @llvm.amdgcn.workitem.id.x() 10992 %old = atomicrmw max ptr addrspace(3) @local_var32, i32 %lane acq_rel 10993 store i32 %old, ptr addrspace(1) %out 10994 ret void 10995} 10996 10997define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { 10998; GFX7LESS-LABEL: max_i64_constant: 10999; GFX7LESS: ; %bb.0: ; %entry 11000; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 11001; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 11002; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 11003; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 11004; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc 11005; GFX7LESS-NEXT: s_cbranch_execz .LBB22_2 11006; GFX7LESS-NEXT: ; %bb.1: 11007; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 11008; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 11009; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 11010; GFX7LESS-NEXT: s_mov_b32 m0, -1 11011; GFX7LESS-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 11012; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 11013; GFX7LESS-NEXT: .LBB22_2: 11014; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] 11015; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 11016; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 11017; GFX7LESS-NEXT: s_mov_b32 s2, -1 11018; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 11019; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 11020; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, 1 11021; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 11022; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 11023; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 11024; GFX7LESS-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1] 11025; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 11026; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 11027; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 11028; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 11029; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 11030; GFX7LESS-NEXT: s_endpgm 11031; 11032; GFX8-LABEL: max_i64_constant: 
11033; GFX8: ; %bb.0: ; %entry 11034; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 11035; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 11036; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 11037; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 11038; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc 11039; GFX8-NEXT: s_cbranch_execz .LBB22_2 11040; GFX8-NEXT: ; %bb.1: 11041; GFX8-NEXT: v_mov_b32_e32 v0, 5 11042; GFX8-NEXT: v_mov_b32_e32 v1, 0 11043; GFX8-NEXT: v_mov_b32_e32 v2, 0 11044; GFX8-NEXT: s_mov_b32 m0, -1 11045; GFX8-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 11046; GFX8-NEXT: s_waitcnt lgkmcnt(0) 11047; GFX8-NEXT: .LBB22_2: 11048; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] 11049; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 11050; GFX8-NEXT: v_readfirstlane_b32 s4, v0 11051; GFX8-NEXT: v_bfrev_b32_e32 v0, 1 11052; GFX8-NEXT: v_readfirstlane_b32 s5, v1 11053; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 11054; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 11055; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1] 11056; GFX8-NEXT: v_mov_b32_e32 v2, s5 11057; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 11058; GFX8-NEXT: v_mov_b32_e32 v2, s4 11059; GFX8-NEXT: s_mov_b32 s3, 0xf000 11060; GFX8-NEXT: s_mov_b32 s2, -1 11061; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 11062; GFX8-NEXT: s_waitcnt lgkmcnt(0) 11063; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 11064; GFX8-NEXT: s_endpgm 11065; 11066; GFX9-LABEL: max_i64_constant: 11067; GFX9: ; %bb.0: ; %entry 11068; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 11069; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 11070; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 11071; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 11072; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 11073; GFX9-NEXT: s_cbranch_execz .LBB22_2 11074; GFX9-NEXT: ; %bb.1: 11075; GFX9-NEXT: v_mov_b32_e32 v0, 5 11076; GFX9-NEXT: v_mov_b32_e32 v1, 0 11077; GFX9-NEXT: v_mov_b32_e32 v2, 0 11078; GFX9-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 11079; GFX9-NEXT: s_waitcnt lgkmcnt(0) 11080; 
GFX9-NEXT: .LBB22_2: 11081; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 11082; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 11083; GFX9-NEXT: v_readfirstlane_b32 s4, v0 11084; GFX9-NEXT: v_bfrev_b32_e32 v0, 1 11085; GFX9-NEXT: v_readfirstlane_b32 s5, v1 11086; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 11087; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 11088; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1] 11089; GFX9-NEXT: v_mov_b32_e32 v2, s5 11090; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 11091; GFX9-NEXT: v_mov_b32_e32 v2, s4 11092; GFX9-NEXT: s_mov_b32 s3, 0xf000 11093; GFX9-NEXT: s_mov_b32 s2, -1 11094; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 11095; GFX9-NEXT: s_waitcnt lgkmcnt(0) 11096; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 11097; GFX9-NEXT: s_endpgm 11098; 11099; GFX1064-LABEL: max_i64_constant: 11100; GFX1064: ; %bb.0: ; %entry 11101; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 11102; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 11103; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 11104; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 11105; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc 11106; GFX1064-NEXT: s_cbranch_execz .LBB22_2 11107; GFX1064-NEXT: ; %bb.1: 11108; GFX1064-NEXT: v_mov_b32_e32 v0, 5 11109; GFX1064-NEXT: v_mov_b32_e32 v1, 0 11110; GFX1064-NEXT: v_mov_b32_e32 v2, 0 11111; GFX1064-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 11112; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 11113; GFX1064-NEXT: buffer_gl0_inv 11114; GFX1064-NEXT: .LBB22_2: 11115; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 11116; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] 11117; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 11118; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 11119; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 11120; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc 11121; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 11122; GFX1064-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] 11123; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 11124; GFX1064-NEXT: 
v_cndmask_b32_e64 v0, v0, s2, vcc 11125; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 11126; GFX1064-NEXT: s_mov_b32 s2, -1 11127; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 11128; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 11129; GFX1064-NEXT: s_endpgm 11130; 11131; GFX1032-LABEL: max_i64_constant: 11132; GFX1032: ; %bb.0: ; %entry 11133; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 11134; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 11135; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 11136; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 11137; GFX1032-NEXT: s_cbranch_execz .LBB22_2 11138; GFX1032-NEXT: ; %bb.1: 11139; GFX1032-NEXT: v_mov_b32_e32 v0, 5 11140; GFX1032-NEXT: v_mov_b32_e32 v1, 0 11141; GFX1032-NEXT: v_mov_b32_e32 v2, 0 11142; GFX1032-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 11143; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 11144; GFX1032-NEXT: buffer_gl0_inv 11145; GFX1032-NEXT: .LBB22_2: 11146; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 11147; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 11148; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 11149; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 11150; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 11151; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo 11152; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo 11153; GFX1032-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1] 11154; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 11155; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 11156; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 11157; GFX1032-NEXT: s_mov_b32 s2, -1 11158; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 11159; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 11160; GFX1032-NEXT: s_endpgm 11161; 11162; GFX1164-LABEL: max_i64_constant: 11163; GFX1164: ; %bb.0: ; %entry 11164; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 11165; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 11166; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 11167; GFX1164-NEXT: v_cmp_eq_u32_e32 
vcc, 0, v0 11168; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 11169; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc 11170; GFX1164-NEXT: s_cbranch_execz .LBB22_2 11171; GFX1164-NEXT: ; %bb.1: 11172; GFX1164-NEXT: v_mov_b32_e32 v0, 5 11173; GFX1164-NEXT: v_mov_b32_e32 v1, 0 11174; GFX1164-NEXT: v_mov_b32_e32 v2, 0 11175; GFX1164-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 11176; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 11177; GFX1164-NEXT: buffer_gl0_inv 11178; GFX1164-NEXT: .LBB22_2: 11179; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] 11180; GFX1164-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 11181; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 11182; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 11183; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc 11184; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 11185; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 11186; GFX1164-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] 11187; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 11188; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 11189; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 11190; GFX1164-NEXT: s_mov_b32 s2, -1 11191; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 11192; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 11193; GFX1164-NEXT: s_endpgm 11194; 11195; GFX1132-LABEL: max_i64_constant: 11196; GFX1132: ; %bb.0: ; %entry 11197; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 11198; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 11199; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 11200; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 11201; GFX1132-NEXT: s_and_saveexec_b32 s0, vcc_lo 11202; GFX1132-NEXT: s_cbranch_execz .LBB22_2 11203; GFX1132-NEXT: ; %bb.1: 11204; GFX1132-NEXT: v_mov_b32_e32 v0, 5 11205; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 11206; GFX1132-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 11207; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 11208; GFX1132-NEXT: buffer_gl0_inv 11209; GFX1132-NEXT: .LBB22_2: 11210; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 11211; GFX1132-NEXT: s_load_b64 
s[0:1], s[4:5], 0x24 11212; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 11213; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 11214; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo 11215; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo 11216; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 11217; GFX1132-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1] 11218; GFX1132-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 11219; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 11220; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 11221; GFX1132-NEXT: s_mov_b32 s2, -1 11222; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 11223; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 11224; GFX1132-NEXT: s_endpgm 11225entry: 11226 %old = atomicrmw max ptr addrspace(3) @local_var64, i64 5 acq_rel 11227 store i64 %old, ptr addrspace(1) %out 11228 ret void 11229} 11230 11231define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { 11232; GFX7LESS_ITERATIVE-LABEL: max_i64_varying: 11233; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry 11234; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec 11235; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 11236; GFX7LESS_ITERATIVE-NEXT: s_brev_b32 s1, 1 11237; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s0, 0 11238; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 11239; GFX7LESS_ITERATIVE-NEXT: .LBB23_1: ; %ComputeLoop 11240; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 11241; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] 11242; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s8 11243; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8 11244; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 11245; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 11246; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 11247; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 11248; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9 11249; GFX7LESS_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[4:5] 11250; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec 11251; 
GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 11252; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 11253; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 11254; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] 11255; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0 11256; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] 11257; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB23_1 11258; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 11259; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 11260; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 11261; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 11262; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 11263; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc 11264; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] 11265; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB23_4 11266; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: 11267; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 11268; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 11269; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 11270; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 11271; GFX7LESS_ITERATIVE-NEXT: ds_max_rtn_i64 v[3:4], v0, v[3:4] 11272; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 11273; GFX7LESS_ITERATIVE-NEXT: .LBB23_4: 11274; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] 11275; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 11276; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 11277; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 11278; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 11279; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3 11280; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, s5 11281; GFX7LESS_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[1:2] 11282; GFX7LESS_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc 11283; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4 11284; GFX7LESS_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc 11285; 
GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 11286; GFX7LESS_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 11287; GFX7LESS_ITERATIVE-NEXT: s_endpgm 11288; 11289; GFX8_ITERATIVE-LABEL: max_i64_varying: 11290; GFX8_ITERATIVE: ; %bb.0: ; %entry 11291; GFX8_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec 11292; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 11293; GFX8_ITERATIVE-NEXT: s_brev_b32 s1, 1 11294; GFX8_ITERATIVE-NEXT: s_mov_b32 s0, 0 11295; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 11296; GFX8_ITERATIVE-NEXT: .LBB23_1: ; %ComputeLoop 11297; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 11298; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] 11299; GFX8_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8 11300; GFX8_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 11301; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 11302; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9 11303; GFX8_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[4:5] 11304; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8 11305; GFX8_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec 11306; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 11307; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 11308; GFX8_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 11309; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 11310; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 11311; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] 11312; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 11313; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 11314; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 11315; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 11316; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 11317; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 11318; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 11319; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc 11320; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] 11321; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB23_4 11322; GFX8_ITERATIVE-NEXT: ; %bb.3: 11323; 
GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 11324; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 11325; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 11326; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 11327; GFX8_ITERATIVE-NEXT: ds_max_rtn_i64 v[3:4], v0, v[3:4] 11328; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 11329; GFX8_ITERATIVE-NEXT: .LBB23_4: 11330; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] 11331; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 11332; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 11333; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3 11334; GFX8_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[1:2] 11335; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, s5 11336; GFX8_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc 11337; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4 11338; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 11339; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 11340; GFX8_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc 11341; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 11342; GFX8_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 11343; GFX8_ITERATIVE-NEXT: s_endpgm 11344; 11345; GFX9_ITERATIVE-LABEL: max_i64_varying: 11346; GFX9_ITERATIVE: ; %bb.0: ; %entry 11347; GFX9_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec 11348; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 11349; GFX9_ITERATIVE-NEXT: s_brev_b32 s1, 1 11350; GFX9_ITERATIVE-NEXT: s_mov_b32 s0, 0 11351; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 11352; GFX9_ITERATIVE-NEXT: .LBB23_1: ; %ComputeLoop 11353; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 11354; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] 11355; GFX9_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8 11356; GFX9_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 11357; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 11358; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9 11359; GFX9_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[4:5] 11360; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8 11361; GFX9_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec 11362; 
GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 11363; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 11364; GFX9_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 11365; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 11366; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 11367; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] 11368; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 11369; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 11370; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 11371; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 11372; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 11373; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 11374; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 11375; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc 11376; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] 11377; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB23_4 11378; GFX9_ITERATIVE-NEXT: ; %bb.3: 11379; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 11380; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 11381; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 11382; GFX9_ITERATIVE-NEXT: ds_max_rtn_i64 v[3:4], v0, v[3:4] 11383; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 11384; GFX9_ITERATIVE-NEXT: .LBB23_4: 11385; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] 11386; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 11387; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 11388; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3 11389; GFX9_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[1:2] 11390; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s5 11391; GFX9_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc 11392; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4 11393; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 11394; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 11395; GFX9_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc 11396; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 11397; GFX9_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 11398; GFX9_ITERATIVE-NEXT: s_endpgm 11399; 
11400; GFX1064_ITERATIVE-LABEL: max_i64_varying: 11401; GFX1064_ITERATIVE: ; %bb.0: ; %entry 11402; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 11403; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec 11404; GFX1064_ITERATIVE-NEXT: s_brev_b32 s1, 1 11405; GFX1064_ITERATIVE-NEXT: s_mov_b32 s0, 0 11406; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 11407; GFX1064_ITERATIVE-NEXT: .LBB23_1: ; %ComputeLoop 11408; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 11409; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3] 11410; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 11411; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 11412; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10 11413; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10 11414; GFX1064_ITERATIVE-NEXT: v_cmp_gt_i64_e64 s[8:9], s[0:1], s[6:7] 11415; GFX1064_ITERATIVE-NEXT: s_and_b64 s[8:9], s[8:9], exec 11416; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 11417; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 11418; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 11419; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] 11420; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 11421; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 11422; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 11423; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 11424; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 11425; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 11426; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 11427; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc 11428; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] 11429; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB23_4 11430; GFX1064_ITERATIVE-NEXT: ; %bb.3: 11431; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 11432; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 11433; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 11434; GFX1064_ITERATIVE-NEXT: ds_max_rtn_i64 v[3:4], v0, v[3:4] 11435; 
GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 11436; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv 11437; GFX1064_ITERATIVE-NEXT: .LBB23_4: 11438; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 11439; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] 11440; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 11441; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 11442; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 11443; GFX1064_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[1:2] 11444; GFX1064_ITERATIVE-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc 11445; GFX1064_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc 11446; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 11447; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 11448; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 11449; GFX1064_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 11450; GFX1064_ITERATIVE-NEXT: s_endpgm 11451; 11452; GFX1032_ITERATIVE-LABEL: max_i64_varying: 11453; GFX1032_ITERATIVE: ; %bb.0: ; %entry 11454; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 11455; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo 11456; GFX1032_ITERATIVE-NEXT: s_brev_b32 s1, 1 11457; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, 0 11458; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 11459; GFX1032_ITERATIVE-NEXT: .LBB23_1: ; %ComputeLoop 11460; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 11461; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 11462; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 11463; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 11464; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3 11465; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3 11466; GFX1032_ITERATIVE-NEXT: v_cmp_gt_i64_e64 s8, s[0:1], s[6:7] 11467; GFX1032_ITERATIVE-NEXT: s_and_b32 s8, s8, exec_lo 11468; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 11469; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 11470; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 11471; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, 
s2, s3 11472; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 11473; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 11474; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 11475; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 11476; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 11477; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 11478; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s2, vcc_lo 11479; GFX1032_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 11480; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB23_4 11481; GFX1032_ITERATIVE-NEXT: ; %bb.3: 11482; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 11483; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 11484; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 11485; GFX1032_ITERATIVE-NEXT: ds_max_rtn_i64 v[3:4], v0, v[3:4] 11486; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 11487; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv 11488; GFX1032_ITERATIVE-NEXT: .LBB23_4: 11489; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 11490; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s2 11491; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 11492; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 11493; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 11494; GFX1032_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[1:2] 11495; GFX1032_ITERATIVE-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc_lo 11496; GFX1032_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo 11497; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 11498; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 11499; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 11500; GFX1032_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 11501; GFX1032_ITERATIVE-NEXT: s_endpgm 11502; 11503; GFX1164_ITERATIVE-LABEL: max_i64_varying: 11504; GFX1164_ITERATIVE: ; %bb.0: ; %entry 11505; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 11506; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 11507; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec 11508; GFX1164_ITERATIVE-NEXT: 
s_brev_b32 s1, 1 11509; GFX1164_ITERATIVE-NEXT: s_mov_b32 s0, 0 11510; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 11511; GFX1164_ITERATIVE-NEXT: .p2align 6 11512; GFX1164_ITERATIVE-NEXT: .LBB23_1: ; %ComputeLoop 11513; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 11514; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3] 11515; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) 11516; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 11517; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 11518; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10 11519; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10 11520; GFX1164_ITERATIVE-NEXT: v_cmp_gt_i64_e64 s[8:9], s[0:1], s[6:7] 11521; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) 11522; GFX1164_ITERATIVE-NEXT: s_and_b64 s[8:9], s[8:9], exec 11523; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 11524; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 11525; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 11526; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] 11527; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 11528; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 11529; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 11530; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 11531; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 11532; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 11533; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 11534; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 11535; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 11536; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc 11537; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 11538; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] 11539; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB23_4 11540; 
GFX1164_ITERATIVE-NEXT: ; %bb.3: 11541; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 11542; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0 11543; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 11544; GFX1164_ITERATIVE-NEXT: ds_max_rtn_i64 v[2:3], v4, v[2:3] 11545; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 11546; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv 11547; GFX1164_ITERATIVE-NEXT: .LBB23_4: 11548; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] 11549; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 11550; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 11551; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 11552; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) 11553; GFX1164_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] 11554; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 11555; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 11556; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 11557; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 11558; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 11559; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 11560; GFX1164_ITERATIVE-NEXT: s_endpgm 11561; 11562; GFX1132_ITERATIVE-LABEL: max_i64_varying: 11563; GFX1132_ITERATIVE: ; %bb.0: ; %entry 11564; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 11565; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo 11566; GFX1132_ITERATIVE-NEXT: s_brev_b32 s1, 1 11567; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, 0 11568; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 11569; GFX1132_ITERATIVE-NEXT: .p2align 6 11570; GFX1132_ITERATIVE-NEXT: .LBB23_1: ; %ComputeLoop 11571; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 11572; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2 11573; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) 11574; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 11575; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 11576; 
GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 11577; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 11578; GFX1132_ITERATIVE-NEXT: v_cmp_gt_i64_e64 s8, s[0:1], s[6:7] 11579; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) 11580; GFX1132_ITERATIVE-NEXT: s_and_b32 s8, s8, exec_lo 11581; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 11582; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 11583; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 11584; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 11585; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 11586; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 11587; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 11588; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 11589; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 11590; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 11591; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 11592; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 11593; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s2, vcc_lo 11594; GFX1132_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 11595; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB23_4 11596; GFX1132_ITERATIVE-NEXT: ; %bb.3: 11597; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1 11598; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 11599; GFX1132_ITERATIVE-NEXT: ds_max_rtn_i64 v[2:3], v4, v[2:3] 11600; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 11601; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv 11602; GFX1132_ITERATIVE-NEXT: .LBB23_4: 11603; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s2 11604; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 11605; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 11606; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 11607; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) 11608; GFX1132_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1] 
11609; GFX1132_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 11610; GFX1132_ITERATIVE-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 11611; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 11612; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 11613; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 11614; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 11615; GFX1132_ITERATIVE-NEXT: s_endpgm 11616; 11617; GFX7LESS_DPP-LABEL: max_i64_varying: 11618; GFX7LESS_DPP: ; %bb.0: ; %entry 11619; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 11620; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 11621; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 11622; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) 11623; GFX7LESS_DPP-NEXT: ds_max_rtn_i64 v[0:1], v1, v[0:1] 11624; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) 11625; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000 11626; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1 11627; GFX7LESS_DPP-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 11628; GFX7LESS_DPP-NEXT: s_endpgm 11629; 11630; GFX8_DPP-LABEL: max_i64_varying: 11631; GFX8_DPP: ; %bb.0: ; %entry 11632; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 11633; GFX8_DPP-NEXT: v_mov_b32_e32 v9, 0 11634; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7 11635; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 11636; GFX8_DPP-NEXT: v_bfrev_b32_e32 v2, 1 11637; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, v2, 0, s[0:1] 11638; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] 11639; GFX8_DPP-NEXT: v_bfrev_b32_e32 v6, 1 11640; GFX8_DPP-NEXT: v_mov_b32_e32 v5, 0 11641; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 11642; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf 11643; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf 11644; GFX8_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] 11645; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc 11646; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc 11647; GFX8_DPP-NEXT: v_bfrev_b32_e32 v6, 1 11648; GFX8_DPP-NEXT: v_mov_b32_e32 v5, 0 11649; 
GFX8_DPP-NEXT: s_nop 0 11650; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf 11651; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf 11652; GFX8_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] 11653; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc 11654; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc 11655; GFX8_DPP-NEXT: v_bfrev_b32_e32 v6, 1 11656; GFX8_DPP-NEXT: v_mov_b32_e32 v5, 0 11657; GFX8_DPP-NEXT: s_nop 0 11658; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf 11659; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf 11660; GFX8_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] 11661; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc 11662; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc 11663; GFX8_DPP-NEXT: v_bfrev_b32_e32 v6, 1 11664; GFX8_DPP-NEXT: v_mov_b32_e32 v5, 0 11665; GFX8_DPP-NEXT: s_nop 0 11666; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf 11667; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf 11668; GFX8_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] 11669; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc 11670; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc 11671; GFX8_DPP-NEXT: v_bfrev_b32_e32 v6, 1 11672; GFX8_DPP-NEXT: v_mov_b32_e32 v5, 0 11673; GFX8_DPP-NEXT: s_nop 0 11674; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf 11675; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf 11676; GFX8_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] 11677; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc 11678; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc 11679; GFX8_DPP-NEXT: v_bfrev_b32_e32 v6, 1 11680; GFX8_DPP-NEXT: v_mov_b32_e32 v5, 0 11681; GFX8_DPP-NEXT: s_nop 0 11682; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf 11683; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf 11684; 
GFX8_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] 11685; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc 11686; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc 11687; GFX8_DPP-NEXT: v_readlane_b32 s3, v4, 63 11688; GFX8_DPP-NEXT: v_readlane_b32 s2, v3, 63 11689; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf 11690; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf 11691; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] 11692; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 11693; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 11694; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc 11695; GFX8_DPP-NEXT: s_cbranch_execz .LBB23_2 11696; GFX8_DPP-NEXT: ; %bb.1: 11697; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s3 11698; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s2 11699; GFX8_DPP-NEXT: s_mov_b32 m0, -1 11700; GFX8_DPP-NEXT: ds_max_rtn_i64 v[7:8], v9, v[7:8] 11701; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) 11702; GFX8_DPP-NEXT: .LBB23_2: 11703; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] 11704; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 11705; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v8 11706; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v7 11707; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v1 11708; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v2 11709; GFX8_DPP-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[7:8] 11710; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s5 11711; GFX8_DPP-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc 11712; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4 11713; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 11714; GFX8_DPP-NEXT: s_mov_b32 s2, -1 11715; GFX8_DPP-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc 11716; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) 11717; GFX8_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 11718; GFX8_DPP-NEXT: s_endpgm 11719; 11720; GFX9_DPP-LABEL: max_i64_varying: 11721; GFX9_DPP: ; %bb.0: ; %entry 11722; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 11723; GFX9_DPP-NEXT: v_mov_b32_e32 v9, 0 11724; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7 11725; GFX9_DPP-NEXT: 
s_or_saveexec_b64 s[0:1], -1 11726; GFX9_DPP-NEXT: v_bfrev_b32_e32 v2, 1 11727; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, v2, 0, s[0:1] 11728; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] 11729; GFX9_DPP-NEXT: v_bfrev_b32_e32 v6, 1 11730; GFX9_DPP-NEXT: v_mov_b32_e32 v5, 0 11731; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 11732; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf 11733; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf 11734; GFX9_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] 11735; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc 11736; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc 11737; GFX9_DPP-NEXT: v_bfrev_b32_e32 v6, 1 11738; GFX9_DPP-NEXT: v_mov_b32_e32 v5, 0 11739; GFX9_DPP-NEXT: s_nop 0 11740; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf 11741; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf 11742; GFX9_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] 11743; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc 11744; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc 11745; GFX9_DPP-NEXT: v_bfrev_b32_e32 v6, 1 11746; GFX9_DPP-NEXT: v_mov_b32_e32 v5, 0 11747; GFX9_DPP-NEXT: s_nop 0 11748; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf 11749; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf 11750; GFX9_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] 11751; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc 11752; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc 11753; GFX9_DPP-NEXT: v_bfrev_b32_e32 v6, 1 11754; GFX9_DPP-NEXT: v_mov_b32_e32 v5, 0 11755; GFX9_DPP-NEXT: s_nop 0 11756; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf 11757; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf 11758; GFX9_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] 11759; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc 11760; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc 11761; 
GFX9_DPP-NEXT: v_bfrev_b32_e32 v6, 1 11762; GFX9_DPP-NEXT: v_mov_b32_e32 v5, 0 11763; GFX9_DPP-NEXT: s_nop 0 11764; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf 11765; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf 11766; GFX9_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] 11767; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc 11768; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc 11769; GFX9_DPP-NEXT: v_bfrev_b32_e32 v6, 1 11770; GFX9_DPP-NEXT: v_mov_b32_e32 v5, 0 11771; GFX9_DPP-NEXT: s_nop 0 11772; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf 11773; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf 11774; GFX9_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] 11775; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc 11776; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc 11777; GFX9_DPP-NEXT: v_readlane_b32 s3, v4, 63 11778; GFX9_DPP-NEXT: v_readlane_b32 s2, v3, 63 11779; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf 11780; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf 11781; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] 11782; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 11783; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 11784; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc 11785; GFX9_DPP-NEXT: s_cbranch_execz .LBB23_2 11786; GFX9_DPP-NEXT: ; %bb.1: 11787; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s3 11788; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s2 11789; GFX9_DPP-NEXT: ds_max_rtn_i64 v[7:8], v9, v[7:8] 11790; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) 11791; GFX9_DPP-NEXT: .LBB23_2: 11792; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] 11793; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 11794; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v8 11795; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v7 11796; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v1 11797; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v2 11798; GFX9_DPP-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], 
v[7:8] 11799; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s5 11800; GFX9_DPP-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc 11801; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4 11802; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 11803; GFX9_DPP-NEXT: s_mov_b32 s2, -1 11804; GFX9_DPP-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc 11805; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) 11806; GFX9_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 11807; GFX9_DPP-NEXT: s_endpgm 11808; 11809; GFX1064_DPP-LABEL: max_i64_varying: 11810; GFX1064_DPP: ; %bb.0: ; %entry 11811; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 11812; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, 0, s[0:1] 11813; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v4, 1 11814; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 11815; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] 11816; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v6, 1 11817; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 11818; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf 11819; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 11820; GFX1064_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[3:4] 11821; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 11822; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 11823; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v4, 1 11824; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 11825; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf 11826; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf 11827; GFX1064_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[5:6] 11828; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc 11829; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc 11830; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v6, 1 11831; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 11832; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf 11833; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf 11834; GFX1064_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[3:4] 11835; 
GFX1064_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 11836; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 11837; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v4, 1 11838; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 11839; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf 11840; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf 11841; GFX1064_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[5:6] 11842; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc 11843; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc 11844; GFX1064_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 11845; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v1, -1, -1 11846; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 11847; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 11848; GFX1064_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[3:4] 11849; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 11850; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 11851; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v4, 1 11852; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 11853; GFX1064_DPP-NEXT: v_readlane_b32 s2, v2, 31 11854; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 31 11855; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, s2 11856; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, s3 11857; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 11858; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 11859; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v5, 1 11860; GFX1064_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[3:4] 11861; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 11862; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 11863; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 11864; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] 11865; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 11866; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 11867; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 
row_mask:0xf bank_mask:0xf 11868; GFX1064_DPP-NEXT: v_readlane_b32 s2, v2, 15 11869; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf 11870; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 15 11871; GFX1064_DPP-NEXT: v_readlane_b32 s6, v2, 31 11872; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 31 11873; GFX1064_DPP-NEXT: v_writelane_b32 v5, s2, 16 11874; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 63 11875; GFX1064_DPP-NEXT: v_writelane_b32 v4, s3, 16 11876; GFX1064_DPP-NEXT: v_readlane_b32 s8, v2, 47 11877; GFX1064_DPP-NEXT: v_readlane_b32 s3, v2, 63 11878; GFX1064_DPP-NEXT: v_readlane_b32 s9, v1, 47 11879; GFX1064_DPP-NEXT: v_writelane_b32 v5, s6, 32 11880; GFX1064_DPP-NEXT: v_writelane_b32 v4, s7, 32 11881; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] 11882; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 11883; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 11884; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 11885; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] 11886; GFX1064_DPP-NEXT: v_writelane_b32 v5, s8, 48 11887; GFX1064_DPP-NEXT: v_writelane_b32 v4, s9, 48 11888; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] 11889; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 11890; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 11891; GFX1064_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 11892; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc 11893; GFX1064_DPP-NEXT: s_cbranch_execz .LBB23_2 11894; GFX1064_DPP-NEXT: ; %bb.1: 11895; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s1 11896; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, s0 11897; GFX1064_DPP-NEXT: ds_max_rtn_i64 v[7:8], v0, v[7:8] 11898; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) 11899; GFX1064_DPP-NEXT: buffer_gl0_inv 11900; GFX1064_DPP-NEXT: .LBB23_2: 11901; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 11902; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[6:7] 11903; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 11904; GFX1064_DPP-NEXT: s_mov_b32 null, 0 11905; GFX1064_DPP-NEXT: v_readfirstlane_b32 s5, v8 11906; GFX1064_DPP-NEXT: v_readfirstlane_b32 
s4, v7 11907; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v4 11908; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v5 11909; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 11910; GFX1064_DPP-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[7:8] 11911; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s5, vcc 11912; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s4, vcc 11913; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) 11914; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 11915; GFX1064_DPP-NEXT: s_endpgm 11916; 11917; GFX1032_DPP-LABEL: max_i64_varying: 11918; GFX1032_DPP: ; %bb.0: ; %entry 11919; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 11920; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, 0, s2 11921; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v4, 1 11922; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 11923; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s2 11924; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v6, 1 11925; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0 11926; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf 11927; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 11928; GFX1032_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[1:2], v[3:4] 11929; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo 11930; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo 11931; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v4, 1 11932; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 11933; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf 11934; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf 11935; GFX1032_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[1:2], v[5:6] 11936; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo 11937; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo 11938; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v6, 1 11939; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0 11940; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf 11941; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf 11942; 
GFX1032_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[1:2], v[3:4] 11943; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo 11944; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo 11945; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v4, 1 11946; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 11947; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf 11948; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf 11949; GFX1032_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[1:2], v[5:6] 11950; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo 11951; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo 11952; GFX1032_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 11953; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v1, -1, -1 11954; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 11955; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 11956; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v5, 1 11957; GFX1032_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[1:2], v[3:4] 11958; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo 11959; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo 11960; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 11961; GFX1032_DPP-NEXT: v_readlane_b32 s3, v2, 15 11962; GFX1032_DPP-NEXT: v_readlane_b32 s1, v2, 31 11963; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31 11964; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf 11965; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf 11966; GFX1032_DPP-NEXT: v_readlane_b32 s6, v1, 15 11967; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 11968; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 11969; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 11970; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 11971; GFX1032_DPP-NEXT: v_writelane_b32 v5, s3, 16 11972; GFX1032_DPP-NEXT: v_writelane_b32 v4, s6, 16 11973; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 11974; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 11975; 
GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 11976; GFX1032_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 11977; GFX1032_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo 11978; GFX1032_DPP-NEXT: s_cbranch_execz .LBB23_2 11979; GFX1032_DPP-NEXT: ; %bb.1: 11980; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, s1 11981; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, s0 11982; GFX1032_DPP-NEXT: ds_max_rtn_i64 v[7:8], v0, v[7:8] 11983; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) 11984; GFX1032_DPP-NEXT: buffer_gl0_inv 11985; GFX1032_DPP-NEXT: .LBB23_2: 11986; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 11987; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 11988; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 11989; GFX1032_DPP-NEXT: s_mov_b32 null, 0 11990; GFX1032_DPP-NEXT: v_readfirstlane_b32 s5, v8 11991; GFX1032_DPP-NEXT: v_readfirstlane_b32 s4, v7 11992; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v4 11993; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v5 11994; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 11995; GFX1032_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[4:5], v[7:8] 11996; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s5, vcc_lo 11997; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s4, vcc_lo 11998; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) 11999; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 12000; GFX1032_DPP-NEXT: s_endpgm 12001; 12002; GFX1164_DPP-LABEL: max_i64_varying: 12003; GFX1164_DPP: ; %bb.0: ; %entry 12004; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 12005; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 12006; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 12007; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, 0, s[0:1] 12008; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v4, 1 12009; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 12010; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] 12011; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v6, 1 12012; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 12013; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf 12014; GFX1164_DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) 12015; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 12016; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[3:4] 12017; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 12018; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 12019; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v4, 1 12020; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 12021; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 12022; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf 12023; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf 12024; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) 12025; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[5:6] 12026; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc 12027; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc 12028; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v6, 1 12029; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 12030; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf 12031; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) 12032; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf 12033; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[3:4] 12034; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 12035; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 12036; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v4, 1 12037; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 12038; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 12039; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf 12040; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf 12041; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) 12042; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], 
v[5:6] 12043; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc 12044; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc 12045; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 12046; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 12047; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v1, -1, -1 12048; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 12049; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 12050; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 12051; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[3:4] 12052; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 12053; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 12054; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v4, 1 12055; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 12056; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 12057; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 31 12058; GFX1164_DPP-NEXT: v_readlane_b32 s3, v1, 31 12059; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 12060; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, s2 12061; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s3 12062; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 12063; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 12064; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 12065; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v5, 1 12066; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) 12067; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[3:4] 12068; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 12069; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 12070; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 12071; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] 12072; 
GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 12073; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 12074; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf 12075; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 15 12076; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf 12077; GFX1164_DPP-NEXT: v_readlane_b32 s3, v1, 15 12078; GFX1164_DPP-NEXT: v_readlane_b32 s6, v2, 31 12079; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 31 12080; GFX1164_DPP-NEXT: v_writelane_b32 v5, s2, 16 12081; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 63 12082; GFX1164_DPP-NEXT: v_writelane_b32 v4, s3, 16 12083; GFX1164_DPP-NEXT: v_readlane_b32 s8, v2, 47 12084; GFX1164_DPP-NEXT: v_readlane_b32 s3, v2, 63 12085; GFX1164_DPP-NEXT: v_readlane_b32 s9, v1, 47 12086; GFX1164_DPP-NEXT: v_writelane_b32 v5, s6, 32 12087; GFX1164_DPP-NEXT: v_writelane_b32 v4, s7, 32 12088; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] 12089; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 12090; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 12091; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 12092; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 12093; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] 12094; GFX1164_DPP-NEXT: v_writelane_b32 v5, s8, 48 12095; GFX1164_DPP-NEXT: v_writelane_b32 v4, s9, 48 12096; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] 12097; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 12098; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 12099; GFX1164_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 12100; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc 12101; GFX1164_DPP-NEXT: s_cbranch_execz .LBB23_2 12102; GFX1164_DPP-NEXT: ; %bb.1: 12103; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s1 12104; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s0 12105; GFX1164_DPP-NEXT: ds_max_rtn_i64 v[7:8], v0, v[7:8] 12106; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) 12107; GFX1164_DPP-NEXT: buffer_gl0_inv 12108; GFX1164_DPP-NEXT: .LBB23_2: 12109; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[6:7] 12110; GFX1164_DPP-NEXT: 
s_load_b64 s[0:1], s[4:5], 0x24 12111; GFX1164_DPP-NEXT: v_readfirstlane_b32 s5, v8 12112; GFX1164_DPP-NEXT: v_readfirstlane_b32 s4, v7 12113; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v4 12114; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v5 12115; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 12116; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) 12117; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[7:8] 12118; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s5, vcc 12119; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s4, vcc 12120; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) 12121; GFX1164_DPP-NEXT: buffer_store_b64 v[7:8], off, s[0:3], 0 12122; GFX1164_DPP-NEXT: s_endpgm 12123; 12124; GFX1132_DPP-LABEL: max_i64_varying: 12125; GFX1132_DPP: ; %bb.0: ; %entry 12126; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 12127; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 12128; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) 12129; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, 0, s2 12130; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v4, 1 12131; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s2 12132; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v6, 1 12133; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) 12134; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf 12135; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 12136; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 12137; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) 12138; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[1:2], v[3:4] 12139; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v2, v4, v2 :: v_dual_cndmask_b32 v1, v3, v1 12140; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v4, 1 12141; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 12142; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf 12143; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0 12144; GFX1132_DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 12145; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf 12146; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[1:2], v[5:6] 12147; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v2, v6, v2 :: v_dual_cndmask_b32 v1, v5, v1 12148; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v6, 1 12149; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0 12150; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) 12151; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf 12152; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf 12153; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) 12154; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[1:2], v[3:4] 12155; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v2, v4, v2 :: v_dual_cndmask_b32 v1, v3, v1 12156; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v4, 1 12157; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 12158; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf 12159; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) 12160; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf 12161; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[1:2], v[5:6] 12162; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v2, v6, v2 :: v_dual_cndmask_b32 v1, v5, v1 12163; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) 12164; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 12165; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v1, -1, -1 12166; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 12167; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 12168; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 12169; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v5, 1 12170; GFX1132_DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) 12171; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[1:2], v[3:4] 12172; GFX1132_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo 12173; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_cndmask_b32 v1, v3, v1 12174; GFX1132_DPP-NEXT: v_readlane_b32 s3, v2, 15 12175; GFX1132_DPP-NEXT: v_readlane_b32 s1, v2, 31 12176; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) 12177; GFX1132_DPP-NEXT: v_readlane_b32 s0, v1, 31 12178; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf 12179; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf 12180; GFX1132_DPP-NEXT: v_readlane_b32 s6, v1, 15 12181; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 12182; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 12183; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 12184; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 12185; GFX1132_DPP-NEXT: v_writelane_b32 v5, s3, 16 12186; GFX1132_DPP-NEXT: v_writelane_b32 v4, s6, 16 12187; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 12188; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 12189; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 12190; GFX1132_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 12191; GFX1132_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo 12192; GFX1132_DPP-NEXT: s_cbranch_execz .LBB23_2 12193; GFX1132_DPP-NEXT: ; %bb.1: 12194; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0 12195; GFX1132_DPP-NEXT: ds_max_rtn_i64 v[7:8], v0, v[7:8] 12196; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) 12197; GFX1132_DPP-NEXT: buffer_gl0_inv 12198; GFX1132_DPP-NEXT: .LBB23_2: 12199; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 12200; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 12201; GFX1132_DPP-NEXT: v_readfirstlane_b32 s5, v8 12202; GFX1132_DPP-NEXT: v_readfirstlane_b32 s4, v7 12203; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v4 12204; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v5 12205; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 12206; 
GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) 12207; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[4:5], v[7:8] 12208; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s5, vcc_lo 12209; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s4, vcc_lo 12210; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) 12211; GFX1132_DPP-NEXT: buffer_store_b64 v[7:8], off, s[0:3], 0 12212; GFX1132_DPP-NEXT: s_endpgm 12213entry: 12214 %lane = call i32 @llvm.amdgcn.workitem.id.x() 12215 %lane_ext = zext i32 %lane to i64 12216 %old = atomicrmw max ptr addrspace(3) @local_var32, i64 %lane_ext acq_rel 12217 store i64 %old, ptr addrspace(1) %out 12218 ret void 12219} 12220; min_i32_varying: signed 32-bit atomicrmw min on the addrspace(3) global @local_var32 with a per-lane operand (the workitem id), storing the returned old value to %out. In the checks below, the GFX7LESS DPP-strategy run issues ds_min_rtn_i32 directly with no cross-lane reduction; every other run reduces across lanes first. 12221define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { 12222; GFX7LESS_ITERATIVE-LABEL: min_i32_varying: 12223; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry 12224; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 12225; GFX7LESS_ITERATIVE-NEXT: s_brev_b32 s2, -2 12226; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1 12227; GFX7LESS_ITERATIVE-NEXT: .LBB24_1: ; %ComputeLoop 12228; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 12229; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] 12230; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s3 12231; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 12232; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 12233; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 12234; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] 12235; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 12236; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] 12237; GFX7LESS_ITERATIVE-NEXT: s_min_i32 s2, s2, s8 12238; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB24_1 12239; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 12240; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 12241; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 12242; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 12243; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr0 12244; 
GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc 12245; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 12246; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB24_4 12247; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: 12248; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 12249; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 12250; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 12251; GFX7LESS_ITERATIVE-NEXT: ds_min_rtn_i32 v0, v0, v2 12252; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 12253; GFX7LESS_ITERATIVE-NEXT: .LBB24_4: 12254; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] 12255; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 12256; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 12257; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 12258; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 12259; GFX7LESS_ITERATIVE-NEXT: v_min_i32_e32 v0, s4, v1 12260; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 12261; GFX7LESS_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 12262; GFX7LESS_ITERATIVE-NEXT: s_endpgm 12263; 12264; GFX8_ITERATIVE-LABEL: min_i32_varying: 12265; GFX8_ITERATIVE: ; %bb.0: ; %entry 12266; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 12267; GFX8_ITERATIVE-NEXT: s_brev_b32 s2, -2 12268; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 12269; GFX8_ITERATIVE-NEXT: .LBB24_1: ; %ComputeLoop 12270; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 12271; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] 12272; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 12273; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 12274; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 12275; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 12276; GFX8_ITERATIVE-NEXT: s_min_i32 s2, s2, s8 12277; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] 12278; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 12279; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 12280; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 12281; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 12282; 
GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 12283; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 12284; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr0 12285; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc 12286; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 12287; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB24_4 12288; GFX8_ITERATIVE-NEXT: ; %bb.3: 12289; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 12290; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 12291; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 12292; GFX8_ITERATIVE-NEXT: ds_min_rtn_i32 v0, v0, v2 12293; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 12294; GFX8_ITERATIVE-NEXT: .LBB24_4: 12295; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] 12296; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 12297; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 12298; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 12299; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 12300; GFX8_ITERATIVE-NEXT: v_min_i32_e32 v0, s4, v1 12301; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 12302; GFX8_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 12303; GFX8_ITERATIVE-NEXT: s_endpgm 12304; 12305; GFX9_ITERATIVE-LABEL: min_i32_varying: 12306; GFX9_ITERATIVE: ; %bb.0: ; %entry 12307; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 12308; GFX9_ITERATIVE-NEXT: s_brev_b32 s2, -2 12309; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 12310; GFX9_ITERATIVE-NEXT: .LBB24_1: ; %ComputeLoop 12311; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 12312; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] 12313; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 12314; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 12315; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 12316; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 12317; GFX9_ITERATIVE-NEXT: s_min_i32 s2, s2, s8 12318; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] 12319; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 12320; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 12321; GFX9_ITERATIVE-NEXT: ; %bb.2: 
; %ComputeEnd 12322; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 12323; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 12324; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 12325; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0 12326; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc 12327; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 12328; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB24_4 12329; GFX9_ITERATIVE-NEXT: ; %bb.3: 12330; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 12331; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 12332; GFX9_ITERATIVE-NEXT: ds_min_rtn_i32 v0, v0, v2 12333; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 12334; GFX9_ITERATIVE-NEXT: .LBB24_4: 12335; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] 12336; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 12337; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 12338; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 12339; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 12340; GFX9_ITERATIVE-NEXT: v_min_i32_e32 v0, s4, v1 12341; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 12342; GFX9_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 12343; GFX9_ITERATIVE-NEXT: s_endpgm 12344; 12345; GFX1064_ITERATIVE-LABEL: min_i32_varying: 12346; GFX1064_ITERATIVE: ; %bb.0: ; %entry 12347; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 12348; GFX1064_ITERATIVE-NEXT: s_brev_b32 s2, -2 12349; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 12350; GFX1064_ITERATIVE-NEXT: .LBB24_1: ; %ComputeLoop 12351; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 12352; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] 12353; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 12354; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 12355; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 12356; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] 12357; GFX1064_ITERATIVE-NEXT: s_min_i32 s2, s2, s8 12358; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 12359; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 
12360; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 12361; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 12362; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 12363; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 12364; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0 12365; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc 12366; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 12367; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB24_4 12368; GFX1064_ITERATIVE-NEXT: ; %bb.3: 12369; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 12370; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 12371; GFX1064_ITERATIVE-NEXT: ds_min_rtn_i32 v0, v0, v2 12372; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 12373; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv 12374; GFX1064_ITERATIVE-NEXT: .LBB24_4: 12375; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 12376; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] 12377; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 12378; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 12379; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 12380; GFX1064_ITERATIVE-NEXT: v_min_i32_e32 v0, s2, v1 12381; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 12382; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 12383; GFX1064_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 12384; GFX1064_ITERATIVE-NEXT: s_endpgm 12385; 12386; GFX1032_ITERATIVE-LABEL: min_i32_varying: 12387; GFX1032_ITERATIVE: ; %bb.0: ; %entry 12388; GFX1032_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo 12389; GFX1032_ITERATIVE-NEXT: s_brev_b32 s0, -2 12390; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 12391; GFX1032_ITERATIVE-NEXT: .LBB24_1: ; %ComputeLoop 12392; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 12393; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 12394; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 12395; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 12396; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 12397; 
GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 12398; GFX1032_ITERATIVE-NEXT: s_min_i32 s0, s0, s3 12399; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 12400; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 12401; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 12402; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 12403; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 12404; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0 12405; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo 12406; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 12407; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB24_4 12408; GFX1032_ITERATIVE-NEXT: ; %bb.3: 12409; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 12410; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 12411; GFX1032_ITERATIVE-NEXT: ds_min_rtn_i32 v0, v0, v2 12412; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 12413; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv 12414; GFX1032_ITERATIVE-NEXT: .LBB24_4: 12415; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 12416; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 12417; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 12418; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 12419; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 12420; GFX1032_ITERATIVE-NEXT: v_min_i32_e32 v0, s2, v1 12421; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 12422; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 12423; GFX1032_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 12424; GFX1032_ITERATIVE-NEXT: s_endpgm 12425; 12426; GFX1164_ITERATIVE-LABEL: min_i32_varying: 12427; GFX1164_ITERATIVE: ; %bb.0: ; %entry 12428; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 12429; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 12430; GFX1164_ITERATIVE-NEXT: s_brev_b32 s2, -2 12431; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 12432; GFX1164_ITERATIVE-NEXT: .LBB24_1: ; %ComputeLoop 12433; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 12434; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 
s3, s[0:1] 12435; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 12436; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 12437; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 12438; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 12439; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] 12440; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) 12441; GFX1164_ITERATIVE-NEXT: s_min_i32 s2, s2, s8 12442; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 12443; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 12444; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 12445; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 12446; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 12447; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 12448; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 12449; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1 12450; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc 12451; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 12452; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 12453; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB24_4 12454; GFX1164_ITERATIVE-NEXT: ; %bb.3: 12455; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 12456; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 12457; GFX1164_ITERATIVE-NEXT: ds_min_rtn_i32 v1, v1, v2 12458; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 12459; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv 12460; GFX1164_ITERATIVE-NEXT: .LBB24_4: 12461; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] 12462; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 12463; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 12464; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 12465; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) 12466; GFX1164_ITERATIVE-NEXT: v_min_i32_e32 v0, s2, v0 12467; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 12468; GFX1164_ITERATIVE-NEXT: s_waitcnt 
lgkmcnt(0) 12469; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 12470; GFX1164_ITERATIVE-NEXT: s_endpgm 12471; 12472; GFX1132_ITERATIVE-LABEL: min_i32_varying: 12473; GFX1132_ITERATIVE: ; %bb.0: ; %entry 12474; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 12475; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo 12476; GFX1132_ITERATIVE-NEXT: s_brev_b32 s0, -2 12477; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 12478; GFX1132_ITERATIVE-NEXT: .LBB24_1: ; %ComputeLoop 12479; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 12480; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 12481; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 12482; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 12483; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 12484; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 12485; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 12486; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) 12487; GFX1132_ITERATIVE-NEXT: s_min_i32 s0, s0, s3 12488; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 12489; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 12490; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 12491; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 12492; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 12493; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 12494; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 12495; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo 12496; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 12497; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB24_4 12498; GFX1132_ITERATIVE-NEXT: ; %bb.3: 12499; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 12500; GFX1132_ITERATIVE-NEXT: ds_min_rtn_i32 v1, v1, v2 12501; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 12502; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv 12503; GFX1132_ITERATIVE-NEXT: .LBB24_4: 12504; GFX1132_ITERATIVE-NEXT: 
s_or_b32 exec_lo, exec_lo, s1 12505; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 12506; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 12507; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 12508; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) 12509; GFX1132_ITERATIVE-NEXT: v_min_i32_e32 v0, s2, v0 12510; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 12511; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 12512; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 12513; GFX1132_ITERATIVE-NEXT: s_endpgm 12514; 12515; GFX7LESS_DPP-LABEL: min_i32_varying: 12516; GFX7LESS_DPP: ; %bb.0: ; %entry 12517; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 12518; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 12519; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 12520; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) 12521; GFX7LESS_DPP-NEXT: ds_min_rtn_i32 v0, v1, v0 12522; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) 12523; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000 12524; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1 12525; GFX7LESS_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 12526; GFX7LESS_DPP-NEXT: s_endpgm 12527; 12528; GFX8_DPP-LABEL: min_i32_varying: 12529; GFX8_DPP: ; %bb.0: ; %entry 12530; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 12531; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 12532; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 12533; GFX8_DPP-NEXT: v_bfrev_b32_e32 v1, -2 12534; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[0:1] 12535; GFX8_DPP-NEXT: s_nop 1 12536; GFX8_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 12537; GFX8_DPP-NEXT: s_nop 1 12538; GFX8_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 12539; GFX8_DPP-NEXT: s_nop 1 12540; GFX8_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 12541; GFX8_DPP-NEXT: s_nop 1 12542; GFX8_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 12543; GFX8_DPP-NEXT: s_nop 1 12544; GFX8_DPP-NEXT: v_min_i32_dpp v2, v2, v2 
row_bcast:15 row_mask:0xa bank_mask:0xf 12545; GFX8_DPP-NEXT: s_nop 1 12546; GFX8_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 12547; GFX8_DPP-NEXT: v_readlane_b32 s2, v2, 63 12548; GFX8_DPP-NEXT: s_nop 0 12549; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 12550; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] 12551; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 12552; GFX8_DPP-NEXT: ; implicit-def: $vgpr0 12553; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc 12554; GFX8_DPP-NEXT: s_cbranch_execz .LBB24_2 12555; GFX8_DPP-NEXT: ; %bb.1: 12556; GFX8_DPP-NEXT: v_mov_b32_e32 v0, 0 12557; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s2 12558; GFX8_DPP-NEXT: s_mov_b32 m0, -1 12559; GFX8_DPP-NEXT: ds_min_rtn_i32 v0, v0, v3 12560; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) 12561; GFX8_DPP-NEXT: .LBB24_2: 12562; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] 12563; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 12564; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 12565; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v1 12566; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 12567; GFX8_DPP-NEXT: s_mov_b32 s2, -1 12568; GFX8_DPP-NEXT: v_min_i32_e32 v0, s4, v0 12569; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) 12570; GFX8_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 12571; GFX8_DPP-NEXT: s_endpgm 12572; 12573; GFX9_DPP-LABEL: min_i32_varying: 12574; GFX9_DPP: ; %bb.0: ; %entry 12575; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 12576; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 12577; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 12578; GFX9_DPP-NEXT: v_bfrev_b32_e32 v1, -2 12579; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[0:1] 12580; GFX9_DPP-NEXT: s_nop 1 12581; GFX9_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 12582; GFX9_DPP-NEXT: s_nop 1 12583; GFX9_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 12584; GFX9_DPP-NEXT: s_nop 1 12585; GFX9_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 12586; 
GFX9_DPP-NEXT: s_nop 1 12587; GFX9_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 12588; GFX9_DPP-NEXT: s_nop 1 12589; GFX9_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 12590; GFX9_DPP-NEXT: s_nop 1 12591; GFX9_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 12592; GFX9_DPP-NEXT: v_readlane_b32 s2, v2, 63 12593; GFX9_DPP-NEXT: s_nop 0 12594; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 12595; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] 12596; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 12597; GFX9_DPP-NEXT: ; implicit-def: $vgpr0 12598; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc 12599; GFX9_DPP-NEXT: s_cbranch_execz .LBB24_2 12600; GFX9_DPP-NEXT: ; %bb.1: 12601; GFX9_DPP-NEXT: v_mov_b32_e32 v0, 0 12602; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s2 12603; GFX9_DPP-NEXT: ds_min_rtn_i32 v0, v0, v3 12604; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) 12605; GFX9_DPP-NEXT: .LBB24_2: 12606; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] 12607; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 12608; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 12609; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v1 12610; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 12611; GFX9_DPP-NEXT: s_mov_b32 s2, -1 12612; GFX9_DPP-NEXT: v_min_i32_e32 v0, s4, v0 12613; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) 12614; GFX9_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 12615; GFX9_DPP-NEXT: s_endpgm 12616; 12617; GFX1064_DPP-LABEL: min_i32_varying: 12618; GFX1064_DPP: ; %bb.0: ; %entry 12619; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 12620; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0x7fffffff, v0, s[0:1] 12621; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v3, -2 12622; GFX1064_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 12623; GFX1064_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 12624; GFX1064_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 12625; GFX1064_DPP-NEXT: 
v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 12626; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 12627; GFX1064_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 12628; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 31 12629; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s2 12630; GFX1064_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 12631; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 12632; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 15 12633; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 31 12634; GFX1064_DPP-NEXT: v_writelane_b32 v3, s2, 16 12635; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] 12636; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 12637; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 12638; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 47 12639; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 63 12640; GFX1064_DPP-NEXT: v_writelane_b32 v3, s3, 32 12641; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] 12642; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 12643; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 12644; GFX1064_DPP-NEXT: v_writelane_b32 v3, s2, 48 12645; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] 12646; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 12647; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 12648; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 12649; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc 12650; GFX1064_DPP-NEXT: s_cbranch_execz .LBB24_2 12651; GFX1064_DPP-NEXT: ; %bb.1: 12652; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 12653; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s6 12654; GFX1064_DPP-NEXT: s_mov_b32 s3, s6 12655; GFX1064_DPP-NEXT: ds_min_rtn_i32 v0, v0, v4 12656; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) 12657; GFX1064_DPP-NEXT: buffer_gl0_inv 12658; GFX1064_DPP-NEXT: .LBB24_2: 12659; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 12660; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[0:1] 12661; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 12662; 
GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v0 12663; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 12664; GFX1064_DPP-NEXT: v_min_i32_e32 v0, s3, v0 12665; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 12666; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) 12667; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 12668; GFX1064_DPP-NEXT: s_endpgm 12669; 12670; GFX1032_DPP-LABEL: min_i32_varying: 12671; GFX1032_DPP: ; %bb.0: ; %entry 12672; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 12673; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0x7fffffff, v0, s0 12674; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v3, -2 12675; GFX1032_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 12676; GFX1032_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 12677; GFX1032_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 12678; GFX1032_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 12679; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 12680; GFX1032_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 12681; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 12682; GFX1032_DPP-NEXT: v_readlane_b32 s2, v1, 31 12683; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 12684; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 12685; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 12686; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 12687; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 12688; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 12689; GFX1032_DPP-NEXT: s_mov_b32 s0, s2 12690; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 12691; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 12692; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 12693; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo 12694; GFX1032_DPP-NEXT: s_cbranch_execz .LBB24_2 12695; GFX1032_DPP-NEXT: ; %bb.1: 12696; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 12697; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s0 12698; GFX1032_DPP-NEXT: ds_min_rtn_i32 v0, v0, v4 
12699; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) 12700; GFX1032_DPP-NEXT: buffer_gl0_inv 12701; GFX1032_DPP-NEXT: .LBB24_2: 12702; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 12703; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 12704; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 12705; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v0 12706; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 12707; GFX1032_DPP-NEXT: v_min_i32_e32 v0, s3, v0 12708; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 12709; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) 12710; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 12711; GFX1032_DPP-NEXT: s_endpgm 12712; 12713; GFX1164_DPP-LABEL: min_i32_varying: 12714; GFX1164_DPP: ; %bb.0: ; %entry 12715; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 12716; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 12717; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 12718; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0x7fffffff, v0, s[0:1] 12719; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v3, -2 12720; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 12721; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 12722; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 12723; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 12724; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 12725; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 12726; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 12727; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 12728; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 12729; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 12730; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 31 12731; GFX1164_DPP-NEXT: v_mov_b32_e32 
v2, s2 12732; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 12733; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 12734; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 12735; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 15 12736; GFX1164_DPP-NEXT: v_readlane_b32 s3, v1, 31 12737; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 12738; GFX1164_DPP-NEXT: v_writelane_b32 v3, s2, 16 12739; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] 12740; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 12741; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 12742; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 47 12743; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 63 12744; GFX1164_DPP-NEXT: v_writelane_b32 v3, s3, 32 12745; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] 12746; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) 12747; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 12748; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 12749; GFX1164_DPP-NEXT: v_writelane_b32 v3, s2, 48 12750; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] 12751; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 12752; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 12753; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0 12754; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc 12755; GFX1164_DPP-NEXT: s_cbranch_execz .LBB24_2 12756; GFX1164_DPP-NEXT: ; %bb.1: 12757; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 12758; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s6 12759; GFX1164_DPP-NEXT: s_mov_b32 s3, s6 12760; GFX1164_DPP-NEXT: ds_min_rtn_i32 v0, v0, v4 12761; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) 12762; GFX1164_DPP-NEXT: buffer_gl0_inv 12763; GFX1164_DPP-NEXT: .LBB24_2: 12764; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[0:1] 12765; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 12766; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v0 12767; GFX1164_DPP-NEXT: 
v_mov_b32_e32 v0, v3 12768; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) 12769; GFX1164_DPP-NEXT: v_min_i32_e32 v0, s3, v0 12770; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 12771; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) 12772; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 12773; GFX1164_DPP-NEXT: s_endpgm 12774; 12775; GFX1132_DPP-LABEL: min_i32_varying: 12776; GFX1132_DPP: ; %bb.0: ; %entry 12777; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 12778; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 12779; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 12780; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0x7fffffff, v0, s0 12781; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v3, -2 12782; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 12783; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 12784; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 12785; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 12786; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 12787; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 12788; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 12789; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 12790; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 12791; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) 12792; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 12793; GFX1132_DPP-NEXT: v_readlane_b32 s2, v1, 31 12794; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 12795; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 12796; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 12797; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 12798; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 12799; 
GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 12800; GFX1132_DPP-NEXT: s_mov_b32 s0, s2 12801; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 12802; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 12803; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 12804; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo 12805; GFX1132_DPP-NEXT: s_cbranch_execz .LBB24_2 12806; GFX1132_DPP-NEXT: ; %bb.1: 12807; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 12808; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, s0 12809; GFX1132_DPP-NEXT: ds_min_rtn_i32 v0, v0, v4 12810; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) 12811; GFX1132_DPP-NEXT: buffer_gl0_inv 12812; GFX1132_DPP-NEXT: .LBB24_2: 12813; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 12814; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 12815; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v0 12816; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3 12817; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) 12818; GFX1132_DPP-NEXT: v_min_i32_e32 v0, s3, v0 12819; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 12820; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) 12821; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 12822; GFX1132_DPP-NEXT: s_endpgm 12823entry: 12824 %lane = call i32 @llvm.amdgcn.workitem.id.x() ; per-lane operand: the workitem id in x 12825 %old = atomicrmw min ptr addrspace(3) @local_var32, i32 %lane acq_rel ; signed min; %old is the value the atomic returns 12826 store i32 %old, ptr addrspace(1) %out 12827 ret void 12828} 12829 12830define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { 12831; GFX7LESS-LABEL: min_i64_constant: 12832; GFX7LESS: ; %bb.0: ; %entry 12833; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 12834; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 12835; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 12836; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 12837; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc 12838; GFX7LESS-NEXT: s_cbranch_execz .LBB25_2 12839; GFX7LESS-NEXT: ; %bb.1: 12840; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 12841; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 12842; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 12843; 
GFX7LESS-NEXT: s_mov_b32 m0, -1 12844; GFX7LESS-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 12845; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 12846; GFX7LESS-NEXT: .LBB25_2: 12847; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] 12848; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 12849; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 12850; GFX7LESS-NEXT: s_mov_b32 s2, -1 12851; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 12852; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 12853; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, -2 12854; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 12855; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 12856; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 12857; GFX7LESS-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 12858; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 12859; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 12860; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 12861; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 12862; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 12863; GFX7LESS-NEXT: s_endpgm 12864; 12865; GFX8-LABEL: min_i64_constant: 12866; GFX8: ; %bb.0: ; %entry 12867; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 12868; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 12869; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 12870; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 12871; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc 12872; GFX8-NEXT: s_cbranch_execz .LBB25_2 12873; GFX8-NEXT: ; %bb.1: 12874; GFX8-NEXT: v_mov_b32_e32 v0, 5 12875; GFX8-NEXT: v_mov_b32_e32 v1, 0 12876; GFX8-NEXT: v_mov_b32_e32 v2, 0 12877; GFX8-NEXT: s_mov_b32 m0, -1 12878; GFX8-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 12879; GFX8-NEXT: s_waitcnt lgkmcnt(0) 12880; GFX8-NEXT: .LBB25_2: 12881; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] 12882; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 12883; GFX8-NEXT: v_readfirstlane_b32 s4, v0 12884; GFX8-NEXT: v_bfrev_b32_e32 v0, -2 12885; GFX8-NEXT: v_readfirstlane_b32 s5, v1 12886; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 12887; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 
12888; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 12889; GFX8-NEXT: v_mov_b32_e32 v2, s5 12890; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 12891; GFX8-NEXT: v_mov_b32_e32 v2, s4 12892; GFX8-NEXT: s_mov_b32 s3, 0xf000 12893; GFX8-NEXT: s_mov_b32 s2, -1 12894; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 12895; GFX8-NEXT: s_waitcnt lgkmcnt(0) 12896; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 12897; GFX8-NEXT: s_endpgm 12898; 12899; GFX9-LABEL: min_i64_constant: 12900; GFX9: ; %bb.0: ; %entry 12901; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 12902; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 12903; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 12904; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 12905; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 12906; GFX9-NEXT: s_cbranch_execz .LBB25_2 12907; GFX9-NEXT: ; %bb.1: 12908; GFX9-NEXT: v_mov_b32_e32 v0, 5 12909; GFX9-NEXT: v_mov_b32_e32 v1, 0 12910; GFX9-NEXT: v_mov_b32_e32 v2, 0 12911; GFX9-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 12912; GFX9-NEXT: s_waitcnt lgkmcnt(0) 12913; GFX9-NEXT: .LBB25_2: 12914; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 12915; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 12916; GFX9-NEXT: v_readfirstlane_b32 s4, v0 12917; GFX9-NEXT: v_bfrev_b32_e32 v0, -2 12918; GFX9-NEXT: v_readfirstlane_b32 s5, v1 12919; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 12920; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 12921; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 12922; GFX9-NEXT: v_mov_b32_e32 v2, s5 12923; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 12924; GFX9-NEXT: v_mov_b32_e32 v2, s4 12925; GFX9-NEXT: s_mov_b32 s3, 0xf000 12926; GFX9-NEXT: s_mov_b32 s2, -1 12927; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 12928; GFX9-NEXT: s_waitcnt lgkmcnt(0) 12929; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 12930; GFX9-NEXT: s_endpgm 12931; 12932; GFX1064-LABEL: min_i64_constant: 12933; GFX1064: ; %bb.0: ; %entry 12934; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 12935; GFX1064-NEXT: 
v_mbcnt_hi_u32_b32 v0, exec_hi, v0 12936; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 12937; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 12938; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc 12939; GFX1064-NEXT: s_cbranch_execz .LBB25_2 12940; GFX1064-NEXT: ; %bb.1: 12941; GFX1064-NEXT: v_mov_b32_e32 v0, 5 12942; GFX1064-NEXT: v_mov_b32_e32 v1, 0 12943; GFX1064-NEXT: v_mov_b32_e32 v2, 0 12944; GFX1064-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 12945; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 12946; GFX1064-NEXT: buffer_gl0_inv 12947; GFX1064-NEXT: .LBB25_2: 12948; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 12949; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] 12950; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 12951; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 12952; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 12953; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc 12954; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 12955; GFX1064-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] 12956; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 12957; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 12958; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 12959; GFX1064-NEXT: s_mov_b32 s2, -1 12960; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 12961; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 12962; GFX1064-NEXT: s_endpgm 12963; 12964; GFX1032-LABEL: min_i64_constant: 12965; GFX1032: ; %bb.0: ; %entry 12966; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 12967; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 12968; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 12969; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 12970; GFX1032-NEXT: s_cbranch_execz .LBB25_2 12971; GFX1032-NEXT: ; %bb.1: 12972; GFX1032-NEXT: v_mov_b32_e32 v0, 5 12973; GFX1032-NEXT: v_mov_b32_e32 v1, 0 12974; GFX1032-NEXT: v_mov_b32_e32 v2, 0 12975; GFX1032-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 12976; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 12977; GFX1032-NEXT: buffer_gl0_inv 12978; GFX1032-NEXT: .LBB25_2: 12979; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 
12980; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 12981; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 12982; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 12983; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 12984; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo 12985; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo 12986; GFX1032-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1] 12987; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 12988; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 12989; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 12990; GFX1032-NEXT: s_mov_b32 s2, -1 12991; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 12992; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 12993; GFX1032-NEXT: s_endpgm 12994; 12995; GFX1164-LABEL: min_i64_constant: 12996; GFX1164: ; %bb.0: ; %entry 12997; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 12998; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 12999; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 13000; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 13001; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 13002; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc 13003; GFX1164-NEXT: s_cbranch_execz .LBB25_2 13004; GFX1164-NEXT: ; %bb.1: 13005; GFX1164-NEXT: v_mov_b32_e32 v0, 5 13006; GFX1164-NEXT: v_mov_b32_e32 v1, 0 13007; GFX1164-NEXT: v_mov_b32_e32 v2, 0 13008; GFX1164-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 13009; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 13010; GFX1164-NEXT: buffer_gl0_inv 13011; GFX1164-NEXT: .LBB25_2: 13012; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] 13013; GFX1164-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 13014; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 13015; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 13016; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc 13017; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 13018; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 13019; GFX1164-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] 13020; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s3, 
vcc 13021; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 13022; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 13023; GFX1164-NEXT: s_mov_b32 s2, -1 13024; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 13025; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 13026; GFX1164-NEXT: s_endpgm 13027; 13028; GFX1132-LABEL: min_i64_constant: 13029; GFX1132: ; %bb.0: ; %entry 13030; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 13031; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 13032; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 13033; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 13034; GFX1132-NEXT: s_and_saveexec_b32 s0, vcc_lo 13035; GFX1132-NEXT: s_cbranch_execz .LBB25_2 13036; GFX1132-NEXT: ; %bb.1: 13037; GFX1132-NEXT: v_mov_b32_e32 v0, 5 13038; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 13039; GFX1132-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 13040; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 13041; GFX1132-NEXT: buffer_gl0_inv 13042; GFX1132-NEXT: .LBB25_2: 13043; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 13044; GFX1132-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 13045; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 13046; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 13047; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo 13048; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo 13049; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 13050; GFX1132-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1] 13051; GFX1132-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 13052; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 13053; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 13054; GFX1132-NEXT: s_mov_b32 s2, -1 13055; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 13056; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 13057; GFX1132-NEXT: s_endpgm 13058entry: 13059 %old = atomicrmw min ptr addrspace(3) @local_var64, i64 5 acq_rel 13060 store i64 %old, ptr addrspace(1) %out 13061 ret void 13062} 13063 13064define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { 13065; GFX7LESS_ITERATIVE-LABEL: 
min_i64_varying: 13066; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry 13067; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec 13068; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 13069; GFX7LESS_ITERATIVE-NEXT: s_brev_b32 s1, -2 13070; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s0, -1 13071; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 13072; GFX7LESS_ITERATIVE-NEXT: .LBB26_1: ; %ComputeLoop 13073; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 13074; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] 13075; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s8 13076; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8 13077; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 13078; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 13079; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 13080; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 13081; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9 13082; GFX7LESS_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[4:5] 13083; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec 13084; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 13085; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 13086; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 13087; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] 13088; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0 13089; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] 13090; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB26_1 13091; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 13092; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 13093; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 13094; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 13095; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 13096; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc 13097; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] 13098; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB26_4 13099; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: 
13100; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 13101; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 13102; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 13103; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 13104; GFX7LESS_ITERATIVE-NEXT: ds_min_rtn_i64 v[3:4], v0, v[3:4] 13105; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 13106; GFX7LESS_ITERATIVE-NEXT: .LBB26_4: 13107; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] 13108; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 13109; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 13110; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 13111; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 13112; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3 13113; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, s5 13114; GFX7LESS_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[1:2] 13115; GFX7LESS_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc 13116; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4 13117; GFX7LESS_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc 13118; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 13119; GFX7LESS_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 13120; GFX7LESS_ITERATIVE-NEXT: s_endpgm 13121; 13122; GFX8_ITERATIVE-LABEL: min_i64_varying: 13123; GFX8_ITERATIVE: ; %bb.0: ; %entry 13124; GFX8_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec 13125; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 13126; GFX8_ITERATIVE-NEXT: s_brev_b32 s1, -2 13127; GFX8_ITERATIVE-NEXT: s_mov_b32 s0, -1 13128; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 13129; GFX8_ITERATIVE-NEXT: .LBB26_1: ; %ComputeLoop 13130; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 13131; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] 13132; GFX8_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8 13133; GFX8_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 13134; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 13135; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9 13136; GFX8_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[4:5] 13137; 
GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8 13138; GFX8_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec 13139; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 13140; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 13141; GFX8_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 13142; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 13143; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 13144; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] 13145; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 13146; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 13147; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 13148; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 13149; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 13150; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 13151; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 13152; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc 13153; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] 13154; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB26_4 13155; GFX8_ITERATIVE-NEXT: ; %bb.3: 13156; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 13157; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 13158; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 13159; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 13160; GFX8_ITERATIVE-NEXT: ds_min_rtn_i64 v[3:4], v0, v[3:4] 13161; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 13162; GFX8_ITERATIVE-NEXT: .LBB26_4: 13163; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] 13164; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 13165; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 13166; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3 13167; GFX8_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[1:2] 13168; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, s5 13169; GFX8_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc 13170; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4 13171; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 13172; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 13173; GFX8_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc 13174; 
GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 13175; GFX8_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 13176; GFX8_ITERATIVE-NEXT: s_endpgm 13177; 13178; GFX9_ITERATIVE-LABEL: min_i64_varying: 13179; GFX9_ITERATIVE: ; %bb.0: ; %entry 13180; GFX9_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec 13181; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 13182; GFX9_ITERATIVE-NEXT: s_brev_b32 s1, -2 13183; GFX9_ITERATIVE-NEXT: s_mov_b32 s0, -1 13184; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 13185; GFX9_ITERATIVE-NEXT: .LBB26_1: ; %ComputeLoop 13186; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 13187; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] 13188; GFX9_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8 13189; GFX9_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 13190; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 13191; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9 13192; GFX9_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[4:5] 13193; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8 13194; GFX9_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec 13195; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 13196; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 13197; GFX9_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 13198; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 13199; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 13200; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] 13201; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 13202; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 13203; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 13204; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 13205; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 13206; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 13207; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 13208; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc 13209; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] 13210; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB26_4 13211; GFX9_ITERATIVE-NEXT: ; %bb.3: 13212; GFX9_ITERATIVE-NEXT: 
v_mov_b32_e32 v4, s1 13213; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 13214; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 13215; GFX9_ITERATIVE-NEXT: ds_min_rtn_i64 v[3:4], v0, v[3:4] 13216; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 13217; GFX9_ITERATIVE-NEXT: .LBB26_4: 13218; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] 13219; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 13220; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 13221; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3 13222; GFX9_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[1:2] 13223; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s5 13224; GFX9_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc 13225; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4 13226; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 13227; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 13228; GFX9_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc 13229; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 13230; GFX9_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 13231; GFX9_ITERATIVE-NEXT: s_endpgm 13232; 13233; GFX1064_ITERATIVE-LABEL: min_i64_varying: 13234; GFX1064_ITERATIVE: ; %bb.0: ; %entry 13235; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 13236; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec 13237; GFX1064_ITERATIVE-NEXT: s_brev_b32 s1, -2 13238; GFX1064_ITERATIVE-NEXT: s_mov_b32 s0, -1 13239; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 13240; GFX1064_ITERATIVE-NEXT: .LBB26_1: ; %ComputeLoop 13241; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 13242; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3] 13243; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 13244; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 13245; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10 13246; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10 13247; GFX1064_ITERATIVE-NEXT: v_cmp_lt_i64_e64 s[8:9], s[0:1], s[6:7] 13248; GFX1064_ITERATIVE-NEXT: s_and_b64 s[8:9], s[8:9], exec 13249; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s1, s1, 
s7 13250; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 13251; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 13252; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] 13253; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 13254; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 13255; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 13256; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 13257; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 13258; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 13259; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 13260; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc 13261; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] 13262; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB26_4 13263; GFX1064_ITERATIVE-NEXT: ; %bb.3: 13264; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 13265; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 13266; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 13267; GFX1064_ITERATIVE-NEXT: ds_min_rtn_i64 v[3:4], v0, v[3:4] 13268; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 13269; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv 13270; GFX1064_ITERATIVE-NEXT: .LBB26_4: 13271; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 13272; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] 13273; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 13274; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 13275; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 13276; GFX1064_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[1:2] 13277; GFX1064_ITERATIVE-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc 13278; GFX1064_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc 13279; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 13280; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 13281; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 13282; GFX1064_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 13283; GFX1064_ITERATIVE-NEXT: s_endpgm 13284; 13285; GFX1032_ITERATIVE-LABEL: min_i64_varying: 
13286; GFX1032_ITERATIVE: ; %bb.0: ; %entry 13287; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 13288; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo 13289; GFX1032_ITERATIVE-NEXT: s_brev_b32 s1, -2 13290; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, -1 13291; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 13292; GFX1032_ITERATIVE-NEXT: .LBB26_1: ; %ComputeLoop 13293; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 13294; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 13295; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 13296; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 13297; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3 13298; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3 13299; GFX1032_ITERATIVE-NEXT: v_cmp_lt_i64_e64 s8, s[0:1], s[6:7] 13300; GFX1032_ITERATIVE-NEXT: s_and_b32 s8, s8, exec_lo 13301; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 13302; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 13303; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 13304; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 13305; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 13306; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 13307; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 13308; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 13309; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 13310; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 13311; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s2, vcc_lo 13312; GFX1032_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 13313; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB26_4 13314; GFX1032_ITERATIVE-NEXT: ; %bb.3: 13315; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 13316; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 13317; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 13318; GFX1032_ITERATIVE-NEXT: ds_min_rtn_i64 v[3:4], v0, v[3:4] 13319; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 13320; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv 13321; GFX1032_ITERATIVE-NEXT: .LBB26_4: 13322; 
GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 13323; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s2 13324; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 13325; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 13326; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 13327; GFX1032_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[1:2] 13328; GFX1032_ITERATIVE-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc_lo 13329; GFX1032_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo 13330; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 13331; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 13332; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 13333; GFX1032_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 13334; GFX1032_ITERATIVE-NEXT: s_endpgm 13335; 13336; GFX1164_ITERATIVE-LABEL: min_i64_varying: 13337; GFX1164_ITERATIVE: ; %bb.0: ; %entry 13338; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 13339; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 13340; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec 13341; GFX1164_ITERATIVE-NEXT: s_brev_b32 s1, -2 13342; GFX1164_ITERATIVE-NEXT: s_mov_b32 s0, -1 13343; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 13344; GFX1164_ITERATIVE-NEXT: .p2align 6 13345; GFX1164_ITERATIVE-NEXT: .LBB26_1: ; %ComputeLoop 13346; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 13347; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3] 13348; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) 13349; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 13350; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 13351; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10 13352; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10 13353; GFX1164_ITERATIVE-NEXT: v_cmp_lt_i64_e64 s[8:9], s[0:1], s[6:7] 13354; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) 13355; GFX1164_ITERATIVE-NEXT: s_and_b64 s[8:9], s[8:9], exec 13356; 
GFX1164_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 13357; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 13358; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 13359; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] 13360; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 13361; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 13362; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 13363; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 13364; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 13365; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 13366; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 13367; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 13368; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 13369; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc 13370; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 13371; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] 13372; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB26_4 13373; GFX1164_ITERATIVE-NEXT: ; %bb.3: 13374; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 13375; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0 13376; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 13377; GFX1164_ITERATIVE-NEXT: ds_min_rtn_i64 v[2:3], v4, v[2:3] 13378; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 13379; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv 13380; GFX1164_ITERATIVE-NEXT: .LBB26_4: 13381; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] 13382; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 13383; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 13384; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 13385; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) 13386; GFX1164_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] 13387; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 13388; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 13389; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 
0x31016000 13390; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 13391; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 13392; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 13393; GFX1164_ITERATIVE-NEXT: s_endpgm 13394; 13395; GFX1132_ITERATIVE-LABEL: min_i64_varying: 13396; GFX1132_ITERATIVE: ; %bb.0: ; %entry 13397; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 13398; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo 13399; GFX1132_ITERATIVE-NEXT: s_brev_b32 s1, -2 13400; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, -1 13401; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 13402; GFX1132_ITERATIVE-NEXT: .p2align 6 13403; GFX1132_ITERATIVE-NEXT: .LBB26_1: ; %ComputeLoop 13404; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 13405; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2 13406; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) 13407; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 13408; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 13409; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 13410; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 13411; GFX1132_ITERATIVE-NEXT: v_cmp_lt_i64_e64 s8, s[0:1], s[6:7] 13412; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) 13413; GFX1132_ITERATIVE-NEXT: s_and_b32 s8, s8, exec_lo 13414; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 13415; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 13416; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 13417; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 13418; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 13419; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 13420; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 13421; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 13422; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 13423; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | 
instid1(SALU_CYCLE_1) 13424; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 13425; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 13426; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s2, vcc_lo 13427; GFX1132_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 13428; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB26_4 13429; GFX1132_ITERATIVE-NEXT: ; %bb.3: 13430; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1 13431; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 13432; GFX1132_ITERATIVE-NEXT: ds_min_rtn_i64 v[2:3], v4, v[2:3] 13433; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 13434; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv 13435; GFX1132_ITERATIVE-NEXT: .LBB26_4: 13436; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s2 13437; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 13438; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 13439; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 13440; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) 13441; GFX1132_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1] 13442; GFX1132_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 13443; GFX1132_ITERATIVE-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 13444; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 13445; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 13446; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 13447; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 13448; GFX1132_ITERATIVE-NEXT: s_endpgm 13449; 13450; GFX7LESS_DPP-LABEL: min_i64_varying: 13451; GFX7LESS_DPP: ; %bb.0: ; %entry 13452; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 13453; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 13454; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 13455; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) 13456; GFX7LESS_DPP-NEXT: ds_min_rtn_i64 v[0:1], v1, v[0:1] 13457; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) 13458; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000 13459; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1 13460; GFX7LESS_DPP-NEXT: buffer_store_dwordx2 
v[0:1], off, s[0:3], 0 13461; GFX7LESS_DPP-NEXT: s_endpgm 13462; 13463; GFX8_DPP-LABEL: min_i64_varying: 13464; GFX8_DPP: ; %bb.0: ; %entry 13465; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 13466; GFX8_DPP-NEXT: v_mov_b32_e32 v9, 0 13467; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7 13468; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 13469; GFX8_DPP-NEXT: v_bfrev_b32_e32 v2, -2 13470; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, v2, 0, s[0:1] 13471; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v0, s[0:1] 13472; GFX8_DPP-NEXT: v_bfrev_b32_e32 v6, -2 13473; GFX8_DPP-NEXT: v_mov_b32_e32 v5, -1 13474; GFX8_DPP-NEXT: v_mov_b32_e32 v1, -1 13475; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf 13476; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf 13477; GFX8_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] 13478; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc 13479; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc 13480; GFX8_DPP-NEXT: v_bfrev_b32_e32 v6, -2 13481; GFX8_DPP-NEXT: v_mov_b32_e32 v5, -1 13482; GFX8_DPP-NEXT: s_nop 0 13483; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf 13484; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf 13485; GFX8_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] 13486; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc 13487; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc 13488; GFX8_DPP-NEXT: v_bfrev_b32_e32 v6, -2 13489; GFX8_DPP-NEXT: v_mov_b32_e32 v5, -1 13490; GFX8_DPP-NEXT: s_nop 0 13491; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf 13492; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf 13493; GFX8_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] 13494; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc 13495; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc 13496; GFX8_DPP-NEXT: v_bfrev_b32_e32 v6, -2 13497; GFX8_DPP-NEXT: v_mov_b32_e32 v5, -1 13498; GFX8_DPP-NEXT: s_nop 0 13499; 
GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf 13500; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf 13501; GFX8_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] 13502; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc 13503; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc 13504; GFX8_DPP-NEXT: v_bfrev_b32_e32 v6, -2 13505; GFX8_DPP-NEXT: v_mov_b32_e32 v5, -1 13506; GFX8_DPP-NEXT: s_nop 0 13507; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf 13508; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf 13509; GFX8_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] 13510; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc 13511; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc 13512; GFX8_DPP-NEXT: v_bfrev_b32_e32 v6, -2 13513; GFX8_DPP-NEXT: v_mov_b32_e32 v5, -1 13514; GFX8_DPP-NEXT: s_nop 0 13515; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf 13516; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf 13517; GFX8_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] 13518; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc 13519; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc 13520; GFX8_DPP-NEXT: v_readlane_b32 s3, v4, 63 13521; GFX8_DPP-NEXT: v_readlane_b32 s2, v3, 63 13522; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf 13523; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf 13524; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] 13525; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 13526; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 13527; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc 13528; GFX8_DPP-NEXT: s_cbranch_execz .LBB26_2 13529; GFX8_DPP-NEXT: ; %bb.1: 13530; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s3 13531; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s2 13532; GFX8_DPP-NEXT: s_mov_b32 m0, -1 13533; GFX8_DPP-NEXT: ds_min_rtn_i64 v[7:8], v9, v[7:8] 13534; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) 13535; 
GFX8_DPP-NEXT: .LBB26_2: 13536; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] 13537; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 13538; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v8 13539; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v7 13540; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v1 13541; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v2 13542; GFX8_DPP-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[7:8] 13543; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s5 13544; GFX8_DPP-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc 13545; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4 13546; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 13547; GFX8_DPP-NEXT: s_mov_b32 s2, -1 13548; GFX8_DPP-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc 13549; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) 13550; GFX8_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 13551; GFX8_DPP-NEXT: s_endpgm 13552; 13553; GFX9_DPP-LABEL: min_i64_varying: 13554; GFX9_DPP: ; %bb.0: ; %entry 13555; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 13556; GFX9_DPP-NEXT: v_mov_b32_e32 v9, 0 13557; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7 13558; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 13559; GFX9_DPP-NEXT: v_bfrev_b32_e32 v2, -2 13560; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, v2, 0, s[0:1] 13561; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v0, s[0:1] 13562; GFX9_DPP-NEXT: v_bfrev_b32_e32 v6, -2 13563; GFX9_DPP-NEXT: v_mov_b32_e32 v5, -1 13564; GFX9_DPP-NEXT: v_mov_b32_e32 v1, -1 13565; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf 13566; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf 13567; GFX9_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] 13568; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc 13569; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc 13570; GFX9_DPP-NEXT: v_bfrev_b32_e32 v6, -2 13571; GFX9_DPP-NEXT: v_mov_b32_e32 v5, -1 13572; GFX9_DPP-NEXT: s_nop 0 13573; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf 13574; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf 13575; 
GFX9_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] 13576; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc 13577; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc 13578; GFX9_DPP-NEXT: v_bfrev_b32_e32 v6, -2 13579; GFX9_DPP-NEXT: v_mov_b32_e32 v5, -1 13580; GFX9_DPP-NEXT: s_nop 0 13581; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf 13582; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf 13583; GFX9_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] 13584; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc 13585; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc 13586; GFX9_DPP-NEXT: v_bfrev_b32_e32 v6, -2 13587; GFX9_DPP-NEXT: v_mov_b32_e32 v5, -1 13588; GFX9_DPP-NEXT: s_nop 0 13589; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf 13590; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf 13591; GFX9_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] 13592; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc 13593; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc 13594; GFX9_DPP-NEXT: v_bfrev_b32_e32 v6, -2 13595; GFX9_DPP-NEXT: v_mov_b32_e32 v5, -1 13596; GFX9_DPP-NEXT: s_nop 0 13597; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf 13598; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf 13599; GFX9_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] 13600; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc 13601; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc 13602; GFX9_DPP-NEXT: v_bfrev_b32_e32 v6, -2 13603; GFX9_DPP-NEXT: v_mov_b32_e32 v5, -1 13604; GFX9_DPP-NEXT: s_nop 0 13605; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf 13606; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf 13607; GFX9_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] 13608; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc 13609; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc 13610; GFX9_DPP-NEXT: 
v_readlane_b32 s3, v4, 63 13611; GFX9_DPP-NEXT: v_readlane_b32 s2, v3, 63 13612; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf 13613; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf 13614; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] 13615; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 13616; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 13617; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc 13618; GFX9_DPP-NEXT: s_cbranch_execz .LBB26_2 13619; GFX9_DPP-NEXT: ; %bb.1: 13620; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s3 13621; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s2 13622; GFX9_DPP-NEXT: ds_min_rtn_i64 v[7:8], v9, v[7:8] 13623; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) 13624; GFX9_DPP-NEXT: .LBB26_2: 13625; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] 13626; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 13627; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v8 13628; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v7 13629; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v1 13630; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v2 13631; GFX9_DPP-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[7:8] 13632; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s5 13633; GFX9_DPP-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc 13634; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4 13635; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 13636; GFX9_DPP-NEXT: s_mov_b32 s2, -1 13637; GFX9_DPP-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc 13638; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) 13639; GFX9_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 13640; GFX9_DPP-NEXT: s_endpgm 13641; 13642; GFX1064_DPP-LABEL: min_i64_varying: 13643; GFX1064_DPP: ; %bb.0: ; %entry 13644; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 13645; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fffffff, 0, s[0:1] 13646; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v4, -2 13647; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, -1 13648; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[0:1] 13649; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v6, -2 13650; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, -1 13651; GFX1064_DPP-NEXT: 
v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf 13652; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 13653; GFX1064_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[3:4] 13654; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 13655; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 13656; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v4, -2 13657; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, -1 13658; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf 13659; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf 13660; GFX1064_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[5:6] 13661; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc 13662; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc 13663; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v6, -2 13664; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, -1 13665; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf 13666; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf 13667; GFX1064_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[3:4] 13668; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 13669; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 13670; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v4, -2 13671; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, -1 13672; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf 13673; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf 13674; GFX1064_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[5:6] 13675; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc 13676; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc 13677; GFX1064_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 13678; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v1, -1, -1 13679; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 13680; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 13681; GFX1064_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], 
v[3:4] 13682; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 13683; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 13684; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v4, -2 13685; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, -1 13686; GFX1064_DPP-NEXT: v_readlane_b32 s2, v2, 31 13687; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 31 13688; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, s2 13689; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, s3 13690; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 13691; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 13692; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v5, -2 13693; GFX1064_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[3:4] 13694; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 13695; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, -1 13696; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 13697; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] 13698; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 13699; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 13700; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf 13701; GFX1064_DPP-NEXT: v_readlane_b32 s2, v2, 15 13702; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf 13703; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 15 13704; GFX1064_DPP-NEXT: v_readlane_b32 s6, v2, 31 13705; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 31 13706; GFX1064_DPP-NEXT: v_writelane_b32 v5, s2, 16 13707; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 63 13708; GFX1064_DPP-NEXT: v_writelane_b32 v4, s3, 16 13709; GFX1064_DPP-NEXT: v_readlane_b32 s8, v2, 47 13710; GFX1064_DPP-NEXT: v_readlane_b32 s3, v2, 63 13711; GFX1064_DPP-NEXT: v_readlane_b32 s9, v1, 47 13712; GFX1064_DPP-NEXT: v_writelane_b32 v5, s6, 32 13713; GFX1064_DPP-NEXT: v_writelane_b32 v4, s7, 32 13714; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] 13715; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 13716; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 13717; GFX1064_DPP-NEXT: 
s_or_saveexec_b64 s[6:7], -1 13718; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] 13719; GFX1064_DPP-NEXT: v_writelane_b32 v5, s8, 48 13720; GFX1064_DPP-NEXT: v_writelane_b32 v4, s9, 48 13721; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] 13722; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 13723; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 13724; GFX1064_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 13725; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc 13726; GFX1064_DPP-NEXT: s_cbranch_execz .LBB26_2 13727; GFX1064_DPP-NEXT: ; %bb.1: 13728; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s1 13729; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, s0 13730; GFX1064_DPP-NEXT: ds_min_rtn_i64 v[7:8], v0, v[7:8] 13731; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) 13732; GFX1064_DPP-NEXT: buffer_gl0_inv 13733; GFX1064_DPP-NEXT: .LBB26_2: 13734; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 13735; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[6:7] 13736; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 13737; GFX1064_DPP-NEXT: s_mov_b32 null, 0 13738; GFX1064_DPP-NEXT: v_readfirstlane_b32 s5, v8 13739; GFX1064_DPP-NEXT: v_readfirstlane_b32 s4, v7 13740; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v4 13741; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v5 13742; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 13743; GFX1064_DPP-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[7:8] 13744; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s5, vcc 13745; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s4, vcc 13746; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) 13747; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 13748; GFX1064_DPP-NEXT: s_endpgm 13749; 13750; GFX1032_DPP-LABEL: min_i64_varying: 13751; GFX1032_DPP: ; %bb.0: ; %entry 13752; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 13753; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fffffff, 0, s2 13754; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v4, -2 13755; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, -1 13756; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s2 13757; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v6, -2 13758; 
GFX1032_DPP-NEXT: v_mov_b32_e32 v5, -1 13759; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf 13760; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 13761; GFX1032_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[1:2], v[3:4] 13762; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo 13763; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo 13764; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v4, -2 13765; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, -1 13766; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf 13767; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf 13768; GFX1032_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[1:2], v[5:6] 13769; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo 13770; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo 13771; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v6, -2 13772; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, -1 13773; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf 13774; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf 13775; GFX1032_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[1:2], v[3:4] 13776; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo 13777; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo 13778; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v4, -2 13779; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, -1 13780; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf 13781; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf 13782; GFX1032_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[1:2], v[5:6] 13783; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo 13784; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo 13785; GFX1032_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 13786; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v1, -1, -1 13787; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 13788; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v6 
quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 13789; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v5, -2 13790; GFX1032_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[1:2], v[3:4] 13791; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo 13792; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo 13793; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, -1 13794; GFX1032_DPP-NEXT: v_readlane_b32 s3, v2, 15 13795; GFX1032_DPP-NEXT: v_readlane_b32 s1, v2, 31 13796; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31 13797; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf 13798; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf 13799; GFX1032_DPP-NEXT: v_readlane_b32 s6, v1, 15 13800; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 13801; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 13802; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 13803; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 13804; GFX1032_DPP-NEXT: v_writelane_b32 v5, s3, 16 13805; GFX1032_DPP-NEXT: v_writelane_b32 v4, s6, 16 13806; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 13807; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 13808; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 13809; GFX1032_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 13810; GFX1032_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo 13811; GFX1032_DPP-NEXT: s_cbranch_execz .LBB26_2 13812; GFX1032_DPP-NEXT: ; %bb.1: 13813; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, s1 13814; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, s0 13815; GFX1032_DPP-NEXT: ds_min_rtn_i64 v[7:8], v0, v[7:8] 13816; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) 13817; GFX1032_DPP-NEXT: buffer_gl0_inv 13818; GFX1032_DPP-NEXT: .LBB26_2: 13819; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 13820; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 13821; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 13822; GFX1032_DPP-NEXT: s_mov_b32 null, 0 13823; GFX1032_DPP-NEXT: v_readfirstlane_b32 s5, v8 13824; GFX1032_DPP-NEXT: v_readfirstlane_b32 s4, v7 13825; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v4 13826; GFX1032_DPP-NEXT: 
v_mov_b32_e32 v8, v5 13827; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 13828; GFX1032_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[4:5], v[7:8] 13829; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s5, vcc_lo 13830; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s4, vcc_lo 13831; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) 13832; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 13833; GFX1032_DPP-NEXT: s_endpgm 13834; 13835; GFX1164_DPP-LABEL: min_i64_varying: 13836; GFX1164_DPP: ; %bb.0: ; %entry 13837; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 13838; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 13839; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 13840; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fffffff, 0, s[0:1] 13841; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v4, -2 13842; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, -1 13843; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[0:1] 13844; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v6, -2 13845; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, -1 13846; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf 13847; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) 13848; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 13849; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[3:4] 13850; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 13851; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 13852; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v4, -2 13853; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, -1 13854; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 13855; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf 13856; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf 13857; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) 13858; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[5:6] 13859; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc 
13860; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc 13861; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v6, -2 13862; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, -1 13863; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf 13864; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) 13865; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf 13866; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[3:4] 13867; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 13868; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 13869; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v4, -2 13870; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, -1 13871; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 13872; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf 13873; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf 13874; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) 13875; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[5:6] 13876; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc 13877; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc 13878; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 13879; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 13880; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v1, -1, -1 13881; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 13882; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 13883; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 13884; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[3:4] 13885; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 13886; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 13887; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v4, -2 13888; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, -1 
13889; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 13890; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 31 13891; GFX1164_DPP-NEXT: v_readlane_b32 s3, v1, 31 13892; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 13893; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, s2 13894; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s3 13895; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 13896; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 13897; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 13898; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v5, -2 13899; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) 13900; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[3:4] 13901; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 13902; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, -1 13903; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 13904; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] 13905; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 13906; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 13907; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf 13908; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 15 13909; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf 13910; GFX1164_DPP-NEXT: v_readlane_b32 s3, v1, 15 13911; GFX1164_DPP-NEXT: v_readlane_b32 s6, v2, 31 13912; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 31 13913; GFX1164_DPP-NEXT: v_writelane_b32 v5, s2, 16 13914; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 63 13915; GFX1164_DPP-NEXT: v_writelane_b32 v4, s3, 16 13916; GFX1164_DPP-NEXT: v_readlane_b32 s8, v2, 47 13917; GFX1164_DPP-NEXT: v_readlane_b32 s3, v2, 63 13918; GFX1164_DPP-NEXT: v_readlane_b32 s9, v1, 47 13919; GFX1164_DPP-NEXT: v_writelane_b32 v5, s6, 32 13920; GFX1164_DPP-NEXT: v_writelane_b32 v4, s7, 32 13921; 
GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] 13922; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 13923; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 13924; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 13925; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 13926; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] 13927; GFX1164_DPP-NEXT: v_writelane_b32 v5, s8, 48 13928; GFX1164_DPP-NEXT: v_writelane_b32 v4, s9, 48 13929; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] 13930; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 13931; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 13932; GFX1164_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 13933; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc 13934; GFX1164_DPP-NEXT: s_cbranch_execz .LBB26_2 13935; GFX1164_DPP-NEXT: ; %bb.1: 13936; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s1 13937; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s0 13938; GFX1164_DPP-NEXT: ds_min_rtn_i64 v[7:8], v0, v[7:8] 13939; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) 13940; GFX1164_DPP-NEXT: buffer_gl0_inv 13941; GFX1164_DPP-NEXT: .LBB26_2: 13942; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[6:7] 13943; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 13944; GFX1164_DPP-NEXT: v_readfirstlane_b32 s5, v8 13945; GFX1164_DPP-NEXT: v_readfirstlane_b32 s4, v7 13946; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v4 13947; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v5 13948; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 13949; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) 13950; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[7:8] 13951; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s5, vcc 13952; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s4, vcc 13953; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) 13954; GFX1164_DPP-NEXT: buffer_store_b64 v[7:8], off, s[0:3], 0 13955; GFX1164_DPP-NEXT: s_endpgm 13956; 13957; GFX1132_DPP-LABEL: min_i64_varying: 13958; GFX1132_DPP: ; %bb.0: ; %entry 13959; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 13960; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 13961; GFX1132_DPP-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) 13962; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fffffff, 0, s2 13963; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v4, -2 13964; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s2 13965; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v6, -2 13966; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) 13967; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf 13968; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, -1 13969; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 13970; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) 13971; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[1:2], v[3:4] 13972; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v2, v4, v2 :: v_dual_cndmask_b32 v1, v3, v1 13973; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v4, -2 13974; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, -1 13975; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf 13976; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, -1 13977; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 13978; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf 13979; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[1:2], v[5:6] 13980; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v2, v6, v2 :: v_dual_cndmask_b32 v1, v5, v1 13981; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v6, -2 13982; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, -1 13983; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) 13984; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf 13985; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf 13986; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) 13987; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[1:2], v[3:4] 13988; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v2, v4, v2 :: v_dual_cndmask_b32 v1, 
v3, v1 13989; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v4, -2 13990; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, -1 13991; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf 13992; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) 13993; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf 13994; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[1:2], v[5:6] 13995; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v2, v6, v2 :: v_dual_cndmask_b32 v1, v5, v1 13996; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) 13997; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 13998; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v1, -1, -1 13999; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 14000; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 14001; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 14002; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v5, -2 14003; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) 14004; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[1:2], v[3:4] 14005; GFX1132_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo 14006; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, -1 :: v_dual_cndmask_b32 v1, v3, v1 14007; GFX1132_DPP-NEXT: v_readlane_b32 s3, v2, 15 14008; GFX1132_DPP-NEXT: v_readlane_b32 s1, v2, 31 14009; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) 14010; GFX1132_DPP-NEXT: v_readlane_b32 s0, v1, 31 14011; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf 14012; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf 14013; GFX1132_DPP-NEXT: v_readlane_b32 s6, v1, 15 14014; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 14015; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 14016; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 14017; 
GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 14018; GFX1132_DPP-NEXT: v_writelane_b32 v5, s3, 16 14019; GFX1132_DPP-NEXT: v_writelane_b32 v4, s6, 16 14020; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 14021; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 14022; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 14023; GFX1132_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 14024; GFX1132_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo 14025; GFX1132_DPP-NEXT: s_cbranch_execz .LBB26_2 14026; GFX1132_DPP-NEXT: ; %bb.1: 14027; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0 14028; GFX1132_DPP-NEXT: ds_min_rtn_i64 v[7:8], v0, v[7:8] 14029; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) 14030; GFX1132_DPP-NEXT: buffer_gl0_inv 14031; GFX1132_DPP-NEXT: .LBB26_2: 14032; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 14033; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 14034; GFX1132_DPP-NEXT: v_readfirstlane_b32 s5, v8 14035; GFX1132_DPP-NEXT: v_readfirstlane_b32 s4, v7 14036; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v4 14037; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v5 14038; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 14039; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) 14040; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[4:5], v[7:8] 14041; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s5, vcc_lo 14042; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s4, vcc_lo 14043; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) 14044; GFX1132_DPP-NEXT: buffer_store_b64 v[7:8], off, s[0:3], 0 14045; GFX1132_DPP-NEXT: s_endpgm 14046entry: 14047 %lane = call i32 @llvm.amdgcn.workitem.id.x() 14048 %lane_ext = zext i32 %lane to i64 14049 %old = atomicrmw min ptr addrspace(3) @local_var32, i64 %lane_ext acq_rel 14050 store i64 %old, ptr addrspace(1) %out 14051 ret void 14052} 14053 14054define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { 14055; GFX7LESS_ITERATIVE-LABEL: umax_i32_varying: 14056; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry 14057; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 14058; GFX7LESS_ITERATIVE-NEXT: 
s_mov_b32 s2, 0 14059; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1 14060; GFX7LESS_ITERATIVE-NEXT: .LBB27_1: ; %ComputeLoop 14061; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 14062; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] 14063; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s3 14064; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 14065; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 14066; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 14067; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] 14068; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 14069; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] 14070; GFX7LESS_ITERATIVE-NEXT: s_max_u32 s2, s2, s8 14071; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB27_1 14072; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 14073; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 14074; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 14075; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 14076; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr0 14077; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc 14078; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 14079; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB27_4 14080; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: 14081; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 14082; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 14083; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 14084; GFX7LESS_ITERATIVE-NEXT: ds_max_rtn_u32 v0, v0, v2 14085; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 14086; GFX7LESS_ITERATIVE-NEXT: .LBB27_4: 14087; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] 14088; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 14089; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 14090; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 14091; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 14092; GFX7LESS_ITERATIVE-NEXT: v_max_u32_e32 v0, s4, v1 14093; GFX7LESS_ITERATIVE-NEXT: 
s_waitcnt lgkmcnt(0) 14094; GFX7LESS_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 14095; GFX7LESS_ITERATIVE-NEXT: s_endpgm 14096; 14097; GFX8_ITERATIVE-LABEL: umax_i32_varying: 14098; GFX8_ITERATIVE: ; %bb.0: ; %entry 14099; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 14100; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, 0 14101; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 14102; GFX8_ITERATIVE-NEXT: .LBB27_1: ; %ComputeLoop 14103; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 14104; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] 14105; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 14106; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 14107; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 14108; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 14109; GFX8_ITERATIVE-NEXT: s_max_u32 s2, s2, s8 14110; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] 14111; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 14112; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 14113; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 14114; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 14115; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 14116; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 14117; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr0 14118; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc 14119; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 14120; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB27_4 14121; GFX8_ITERATIVE-NEXT: ; %bb.3: 14122; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 14123; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 14124; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 14125; GFX8_ITERATIVE-NEXT: ds_max_rtn_u32 v0, v0, v2 14126; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 14127; GFX8_ITERATIVE-NEXT: .LBB27_4: 14128; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] 14129; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 14130; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 14131; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 14132; GFX8_ITERATIVE-NEXT: 
s_mov_b32 s2, -1 14133; GFX8_ITERATIVE-NEXT: v_max_u32_e32 v0, s4, v1 14134; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 14135; GFX8_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 14136; GFX8_ITERATIVE-NEXT: s_endpgm 14137; 14138; GFX9_ITERATIVE-LABEL: umax_i32_varying: 14139; GFX9_ITERATIVE: ; %bb.0: ; %entry 14140; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 14141; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, 0 14142; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 14143; GFX9_ITERATIVE-NEXT: .LBB27_1: ; %ComputeLoop 14144; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 14145; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] 14146; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 14147; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 14148; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 14149; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 14150; GFX9_ITERATIVE-NEXT: s_max_u32 s2, s2, s8 14151; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] 14152; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 14153; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 14154; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 14155; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 14156; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 14157; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 14158; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0 14159; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc 14160; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 14161; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB27_4 14162; GFX9_ITERATIVE-NEXT: ; %bb.3: 14163; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 14164; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 14165; GFX9_ITERATIVE-NEXT: ds_max_rtn_u32 v0, v0, v2 14166; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 14167; GFX9_ITERATIVE-NEXT: .LBB27_4: 14168; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] 14169; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 14170; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 14171; GFX9_ITERATIVE-NEXT: s_mov_b32 
s3, 0xf000 14172; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 14173; GFX9_ITERATIVE-NEXT: v_max_u32_e32 v0, s4, v1 14174; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 14175; GFX9_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 14176; GFX9_ITERATIVE-NEXT: s_endpgm 14177; 14178; GFX1064_ITERATIVE-LABEL: umax_i32_varying: 14179; GFX1064_ITERATIVE: ; %bb.0: ; %entry 14180; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 14181; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, 0 14182; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 14183; GFX1064_ITERATIVE-NEXT: .LBB27_1: ; %ComputeLoop 14184; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 14185; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] 14186; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 14187; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 14188; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 14189; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] 14190; GFX1064_ITERATIVE-NEXT: s_max_u32 s2, s2, s8 14191; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 14192; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 14193; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 14194; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 14195; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 14196; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 14197; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0 14198; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc 14199; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 14200; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB27_4 14201; GFX1064_ITERATIVE-NEXT: ; %bb.3: 14202; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 14203; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 14204; GFX1064_ITERATIVE-NEXT: ds_max_rtn_u32 v0, v0, v2 14205; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 14206; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv 14207; GFX1064_ITERATIVE-NEXT: .LBB27_4: 14208; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 14209; GFX1064_ITERATIVE-NEXT: 
s_or_b64 exec, exec, s[0:1] 14210; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 14211; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 14212; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 14213; GFX1064_ITERATIVE-NEXT: v_max_u32_e32 v0, s2, v1 14214; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 14215; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 14216; GFX1064_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 14217; GFX1064_ITERATIVE-NEXT: s_endpgm 14218; 14219; GFX1032_ITERATIVE-LABEL: umax_i32_varying: 14220; GFX1032_ITERATIVE: ; %bb.0: ; %entry 14221; GFX1032_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo 14222; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, 0 14223; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 14224; GFX1032_ITERATIVE-NEXT: .LBB27_1: ; %ComputeLoop 14225; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 14226; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 14227; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 14228; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 14229; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 14230; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 14231; GFX1032_ITERATIVE-NEXT: s_max_u32 s0, s0, s3 14232; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 14233; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 14234; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 14235; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 14236; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 14237; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0 14238; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo 14239; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 14240; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB27_4 14241; GFX1032_ITERATIVE-NEXT: ; %bb.3: 14242; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 14243; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 14244; GFX1032_ITERATIVE-NEXT: ds_max_rtn_u32 v0, v0, v2 14245; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 14246; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv 14247; 
GFX1032_ITERATIVE-NEXT: .LBB27_4: 14248; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 14249; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 14250; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 14251; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 14252; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 14253; GFX1032_ITERATIVE-NEXT: v_max_u32_e32 v0, s2, v1 14254; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 14255; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 14256; GFX1032_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 14257; GFX1032_ITERATIVE-NEXT: s_endpgm 14258; 14259; GFX1164_ITERATIVE-LABEL: umax_i32_varying: 14260; GFX1164_ITERATIVE: ; %bb.0: ; %entry 14261; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 14262; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 14263; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, 0 14264; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 14265; GFX1164_ITERATIVE-NEXT: .LBB27_1: ; %ComputeLoop 14266; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 14267; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] 14268; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 14269; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 14270; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 14271; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 14272; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] 14273; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) 14274; GFX1164_ITERATIVE-NEXT: s_max_u32 s2, s2, s8 14275; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 14276; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 14277; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 14278; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 14279; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 14280; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 14281; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 14282; 
GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1 14283; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc 14284; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 14285; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 14286; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB27_4 14287; GFX1164_ITERATIVE-NEXT: ; %bb.3: 14288; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 14289; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 14290; GFX1164_ITERATIVE-NEXT: ds_max_rtn_u32 v1, v1, v2 14291; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 14292; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv 14293; GFX1164_ITERATIVE-NEXT: .LBB27_4: 14294; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] 14295; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 14296; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 14297; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 14298; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) 14299; GFX1164_ITERATIVE-NEXT: v_max_u32_e32 v0, s2, v0 14300; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 14301; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 14302; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 14303; GFX1164_ITERATIVE-NEXT: s_endpgm 14304; 14305; GFX1132_ITERATIVE-LABEL: umax_i32_varying: 14306; GFX1132_ITERATIVE: ; %bb.0: ; %entry 14307; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 14308; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo 14309; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, 0 14310; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 14311; GFX1132_ITERATIVE-NEXT: .LBB27_1: ; %ComputeLoop 14312; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 14313; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 14314; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 14315; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 14316; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 14317; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 14318; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 14319; 
GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) 14320; GFX1132_ITERATIVE-NEXT: s_max_u32 s0, s0, s3 14321; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 14322; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 14323; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 14324; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 14325; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 14326; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 14327; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 14328; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo 14329; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 14330; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB27_4 14331; GFX1132_ITERATIVE-NEXT: ; %bb.3: 14332; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 14333; GFX1132_ITERATIVE-NEXT: ds_max_rtn_u32 v1, v1, v2 14334; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 14335; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv 14336; GFX1132_ITERATIVE-NEXT: .LBB27_4: 14337; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 14338; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 14339; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 14340; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 14341; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) 14342; GFX1132_ITERATIVE-NEXT: v_max_u32_e32 v0, s2, v0 14343; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 14344; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 14345; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 14346; GFX1132_ITERATIVE-NEXT: s_endpgm 14347; 14348; GFX7LESS_DPP-LABEL: umax_i32_varying: 14349; GFX7LESS_DPP: ; %bb.0: ; %entry 14350; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 14351; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 14352; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 14353; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) 14354; GFX7LESS_DPP-NEXT: ds_max_rtn_u32 v0, v1, v0 14355; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) 14356; 
GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000 14357; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1 14358; GFX7LESS_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 14359; GFX7LESS_DPP-NEXT: s_endpgm 14360; 14361; GFX8_DPP-LABEL: umax_i32_varying: 14362; GFX8_DPP: ; %bb.0: ; %entry 14363; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 14364; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 14365; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 14366; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 14367; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] 14368; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 14369; GFX8_DPP-NEXT: s_nop 0 14370; GFX8_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 14371; GFX8_DPP-NEXT: s_nop 1 14372; GFX8_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 14373; GFX8_DPP-NEXT: s_nop 1 14374; GFX8_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 14375; GFX8_DPP-NEXT: s_nop 1 14376; GFX8_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 14377; GFX8_DPP-NEXT: s_nop 1 14378; GFX8_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 14379; GFX8_DPP-NEXT: s_nop 1 14380; GFX8_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 14381; GFX8_DPP-NEXT: v_readlane_b32 s2, v1, 63 14382; GFX8_DPP-NEXT: s_nop 0 14383; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf 14384; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] 14385; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 14386; GFX8_DPP-NEXT: ; implicit-def: $vgpr0 14387; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc 14388; GFX8_DPP-NEXT: s_cbranch_execz .LBB27_2 14389; GFX8_DPP-NEXT: ; %bb.1: 14390; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s2 14391; GFX8_DPP-NEXT: s_mov_b32 m0, -1 14392; GFX8_DPP-NEXT: ds_max_rtn_u32 v0, v3, v0 14393; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) 14394; GFX8_DPP-NEXT: .LBB27_2: 14395; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] 
14396; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 14397; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 14398; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v2 14399; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 14400; GFX8_DPP-NEXT: s_mov_b32 s2, -1 14401; GFX8_DPP-NEXT: v_max_u32_e32 v0, s4, v0 14402; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) 14403; GFX8_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 14404; GFX8_DPP-NEXT: s_endpgm 14405; 14406; GFX9_DPP-LABEL: umax_i32_varying: 14407; GFX9_DPP: ; %bb.0: ; %entry 14408; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 14409; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 14410; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 14411; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 14412; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] 14413; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 14414; GFX9_DPP-NEXT: s_nop 0 14415; GFX9_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 14416; GFX9_DPP-NEXT: s_nop 1 14417; GFX9_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 14418; GFX9_DPP-NEXT: s_nop 1 14419; GFX9_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 14420; GFX9_DPP-NEXT: s_nop 1 14421; GFX9_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 14422; GFX9_DPP-NEXT: s_nop 1 14423; GFX9_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 14424; GFX9_DPP-NEXT: s_nop 1 14425; GFX9_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 14426; GFX9_DPP-NEXT: v_readlane_b32 s2, v1, 63 14427; GFX9_DPP-NEXT: s_nop 0 14428; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf 14429; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] 14430; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 14431; GFX9_DPP-NEXT: ; implicit-def: $vgpr0 14432; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc 14433; GFX9_DPP-NEXT: s_cbranch_execz .LBB27_2 14434; GFX9_DPP-NEXT: ; %bb.1: 14435; GFX9_DPP-NEXT: 
v_mov_b32_e32 v0, s2 14436; GFX9_DPP-NEXT: ds_max_rtn_u32 v0, v3, v0 14437; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) 14438; GFX9_DPP-NEXT: .LBB27_2: 14439; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] 14440; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 14441; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 14442; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v2 14443; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 14444; GFX9_DPP-NEXT: s_mov_b32 s2, -1 14445; GFX9_DPP-NEXT: v_max_u32_e32 v0, s4, v0 14446; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) 14447; GFX9_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 14448; GFX9_DPP-NEXT: s_endpgm 14449; 14450; GFX1064_DPP-LABEL: umax_i32_varying: 14451; GFX1064_DPP: ; %bb.0: ; %entry 14452; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 14453; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] 14454; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 14455; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 14456; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 14457; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 14458; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 14459; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 14460; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 14461; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 31 14462; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s2 14463; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 14464; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 14465; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 15 14466; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 31 14467; GFX1064_DPP-NEXT: v_writelane_b32 v3, s2, 16 14468; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] 14469; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 14470; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], 
-1 14471; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 47 14472; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 63 14473; GFX1064_DPP-NEXT: v_writelane_b32 v3, s3, 32 14474; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] 14475; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 14476; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 14477; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 14478; GFX1064_DPP-NEXT: v_writelane_b32 v3, s2, 48 14479; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] 14480; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 14481; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 14482; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 14483; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc 14484; GFX1064_DPP-NEXT: s_cbranch_execz .LBB27_2 14485; GFX1064_DPP-NEXT: ; %bb.1: 14486; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s6 14487; GFX1064_DPP-NEXT: s_mov_b32 s3, s6 14488; GFX1064_DPP-NEXT: ds_max_rtn_u32 v0, v4, v0 14489; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) 14490; GFX1064_DPP-NEXT: buffer_gl0_inv 14491; GFX1064_DPP-NEXT: .LBB27_2: 14492; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 14493; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[0:1] 14494; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 14495; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v0 14496; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 14497; GFX1064_DPP-NEXT: v_max_u32_e32 v0, s3, v0 14498; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 14499; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) 14500; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 14501; GFX1064_DPP-NEXT: s_endpgm 14502; 14503; GFX1032_DPP-LABEL: umax_i32_varying: 14504; GFX1032_DPP: ; %bb.0: ; %entry 14505; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 14506; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 14507; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 14508; GFX1032_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 14509; GFX1032_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 14510; GFX1032_DPP-NEXT: v_max_u32_dpp v1, v1, 
v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 14511; GFX1032_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 14512; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 14513; GFX1032_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 14514; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 14515; GFX1032_DPP-NEXT: v_readlane_b32 s2, v1, 31 14516; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 14517; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 14518; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 14519; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 14520; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 14521; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 14522; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 14523; GFX1032_DPP-NEXT: s_mov_b32 s0, s2 14524; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 14525; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 14526; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 14527; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo 14528; GFX1032_DPP-NEXT: s_cbranch_execz .LBB27_2 14529; GFX1032_DPP-NEXT: ; %bb.1: 14530; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s0 14531; GFX1032_DPP-NEXT: ds_max_rtn_u32 v0, v4, v0 14532; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) 14533; GFX1032_DPP-NEXT: buffer_gl0_inv 14534; GFX1032_DPP-NEXT: .LBB27_2: 14535; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 14536; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 14537; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 14538; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v0 14539; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 14540; GFX1032_DPP-NEXT: v_max_u32_e32 v0, s3, v0 14541; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 14542; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) 14543; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 14544; GFX1032_DPP-NEXT: s_endpgm 14545; 14546; GFX1164_DPP-LABEL: umax_i32_varying: 14547; GFX1164_DPP: ; %bb.0: ; %entry 14548; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 14549; 
GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 14550; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 14551; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] 14552; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 14553; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 14554; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 14555; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 14556; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 14557; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 14558; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 14559; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 14560; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 14561; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 14562; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 14563; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 31 14564; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s2 14565; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 14566; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 14567; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 14568; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 15 14569; GFX1164_DPP-NEXT: v_readlane_b32 s3, v1, 31 14570; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 14571; GFX1164_DPP-NEXT: v_writelane_b32 v3, s2, 16 14572; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] 14573; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 14574; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 14575; GFX1164_DPP-NEXT: 
v_readlane_b32 s2, v1, 47 14576; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 63 14577; GFX1164_DPP-NEXT: v_writelane_b32 v3, s3, 32 14578; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] 14579; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 14580; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 14581; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 14582; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 14583; GFX1164_DPP-NEXT: v_writelane_b32 v3, s2, 48 14584; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] 14585; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 14586; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 14587; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0 14588; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc 14589; GFX1164_DPP-NEXT: s_cbranch_execz .LBB27_2 14590; GFX1164_DPP-NEXT: ; %bb.1: 14591; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s6 14592; GFX1164_DPP-NEXT: s_mov_b32 s3, s6 14593; GFX1164_DPP-NEXT: ds_max_rtn_u32 v0, v4, v0 14594; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) 14595; GFX1164_DPP-NEXT: buffer_gl0_inv 14596; GFX1164_DPP-NEXT: .LBB27_2: 14597; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[0:1] 14598; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 14599; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v0 14600; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3 14601; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) 14602; GFX1164_DPP-NEXT: v_max_u32_e32 v0, s3, v0 14603; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 14604; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) 14605; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 14606; GFX1164_DPP-NEXT: s_endpgm 14607; 14608; GFX1132_DPP-LABEL: umax_i32_varying: 14609; GFX1132_DPP: ; %bb.0: ; %entry 14610; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 14611; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 14612; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 14613; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 14614; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 14615; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_1) 14616; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 14617; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 14618; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 14619; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 14620; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 14621; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 14622; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 14623; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 14624; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) 14625; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 14626; GFX1132_DPP-NEXT: v_readlane_b32 s2, v1, 31 14627; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 14628; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 14629; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 14630; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 14631; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 14632; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 14633; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 14634; GFX1132_DPP-NEXT: s_mov_b32 s0, s2 14635; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 14636; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 14637; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 14638; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo 14639; GFX1132_DPP-NEXT: s_cbranch_execz .LBB27_2 14640; GFX1132_DPP-NEXT: ; %bb.1: 14641; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, s0 14642; GFX1132_DPP-NEXT: ds_max_rtn_u32 v0, v4, v0 14643; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) 14644; GFX1132_DPP-NEXT: buffer_gl0_inv 14645; GFX1132_DPP-NEXT: .LBB27_2: 14646; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 14647; GFX1132_DPP-NEXT: s_load_b64 s[0:1], 
s[4:5], 0x24 14648; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v0 14649; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3 14650; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) 14651; GFX1132_DPP-NEXT: v_max_u32_e32 v0, s3, v0 14652; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 14653; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) 14654; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 14655; GFX1132_DPP-NEXT: s_endpgm 14656entry: 14657 %lane = call i32 @llvm.amdgcn.workitem.id.x() 14658 %old = atomicrmw umax ptr addrspace(3) @local_var32, i32 %lane acq_rel 14659 store i32 %old, ptr addrspace(1) %out 14660 ret void 14661} 14662 14663define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { 14664; GFX7LESS-LABEL: umax_i64_constant: 14665; GFX7LESS: ; %bb.0: ; %entry 14666; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 14667; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 14668; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 14669; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 14670; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc 14671; GFX7LESS-NEXT: s_cbranch_execz .LBB28_2 14672; GFX7LESS-NEXT: ; %bb.1: 14673; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 14674; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 14675; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 14676; GFX7LESS-NEXT: s_mov_b32 m0, -1 14677; GFX7LESS-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 14678; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 14679; GFX7LESS-NEXT: .LBB28_2: 14680; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] 14681; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 14682; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 14683; GFX7LESS-NEXT: s_mov_b32 s2, -1 14684; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 14685; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 14686; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 14687; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 14688; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 14689; GFX7LESS-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1] 14690; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 14691; GFX7LESS-NEXT: v_mov_b32_e32 
v1, s5 14692; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 14693; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 14694; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 14695; GFX7LESS-NEXT: s_endpgm 14696; 14697; GFX8-LABEL: umax_i64_constant: 14698; GFX8: ; %bb.0: ; %entry 14699; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 14700; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 14701; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 14702; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 14703; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc 14704; GFX8-NEXT: s_cbranch_execz .LBB28_2 14705; GFX8-NEXT: ; %bb.1: 14706; GFX8-NEXT: v_mov_b32_e32 v0, 5 14707; GFX8-NEXT: v_mov_b32_e32 v1, 0 14708; GFX8-NEXT: v_mov_b32_e32 v2, 0 14709; GFX8-NEXT: s_mov_b32 m0, -1 14710; GFX8-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 14711; GFX8-NEXT: s_waitcnt lgkmcnt(0) 14712; GFX8-NEXT: .LBB28_2: 14713; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] 14714; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 14715; GFX8-NEXT: v_readfirstlane_b32 s5, v1 14716; GFX8-NEXT: v_readfirstlane_b32 s4, v0 14717; GFX8-NEXT: v_mov_b32_e32 v1, 0 14718; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 14719; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1] 14720; GFX8-NEXT: v_mov_b32_e32 v2, s4 14721; GFX8-NEXT: v_mov_b32_e32 v1, s5 14722; GFX8-NEXT: s_mov_b32 s3, 0xf000 14723; GFX8-NEXT: s_mov_b32 s2, -1 14724; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 14725; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 14726; GFX8-NEXT: s_waitcnt lgkmcnt(0) 14727; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 14728; GFX8-NEXT: s_endpgm 14729; 14730; GFX9-LABEL: umax_i64_constant: 14731; GFX9: ; %bb.0: ; %entry 14732; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 14733; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 14734; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 14735; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 14736; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 14737; GFX9-NEXT: s_cbranch_execz .LBB28_2 14738; GFX9-NEXT: ; %bb.1: 14739; GFX9-NEXT: 
v_mov_b32_e32 v0, 5 14740; GFX9-NEXT: v_mov_b32_e32 v1, 0 14741; GFX9-NEXT: v_mov_b32_e32 v2, 0 14742; GFX9-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 14743; GFX9-NEXT: s_waitcnt lgkmcnt(0) 14744; GFX9-NEXT: .LBB28_2: 14745; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 14746; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 14747; GFX9-NEXT: v_readfirstlane_b32 s5, v1 14748; GFX9-NEXT: v_readfirstlane_b32 s4, v0 14749; GFX9-NEXT: v_mov_b32_e32 v1, 0 14750; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 14751; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1] 14752; GFX9-NEXT: v_mov_b32_e32 v2, s4 14753; GFX9-NEXT: v_mov_b32_e32 v1, s5 14754; GFX9-NEXT: s_mov_b32 s3, 0xf000 14755; GFX9-NEXT: s_mov_b32 s2, -1 14756; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 14757; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 14758; GFX9-NEXT: s_waitcnt lgkmcnt(0) 14759; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 14760; GFX9-NEXT: s_endpgm 14761; 14762; GFX1064-LABEL: umax_i64_constant: 14763; GFX1064: ; %bb.0: ; %entry 14764; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 14765; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 14766; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 14767; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 14768; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc 14769; GFX1064-NEXT: s_cbranch_execz .LBB28_2 14770; GFX1064-NEXT: ; %bb.1: 14771; GFX1064-NEXT: v_mov_b32_e32 v0, 5 14772; GFX1064-NEXT: v_mov_b32_e32 v1, 0 14773; GFX1064-NEXT: v_mov_b32_e32 v2, 0 14774; GFX1064-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 14775; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 14776; GFX1064-NEXT: buffer_gl0_inv 14777; GFX1064-NEXT: .LBB28_2: 14778; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 14779; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] 14780; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 14781; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 14782; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 14783; GFX1064-NEXT: v_mov_b32_e32 v1, 0 14784; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 14785; GFX1064-NEXT: 
v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] 14786; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 14787; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc 14788; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 14789; GFX1064-NEXT: s_mov_b32 s2, -1 14790; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 14791; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 14792; GFX1064-NEXT: s_endpgm 14793; 14794; GFX1032-LABEL: umax_i64_constant: 14795; GFX1032: ; %bb.0: ; %entry 14796; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 14797; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 14798; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 14799; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 14800; GFX1032-NEXT: s_cbranch_execz .LBB28_2 14801; GFX1032-NEXT: ; %bb.1: 14802; GFX1032-NEXT: v_mov_b32_e32 v0, 5 14803; GFX1032-NEXT: v_mov_b32_e32 v1, 0 14804; GFX1032-NEXT: v_mov_b32_e32 v2, 0 14805; GFX1032-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 14806; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 14807; GFX1032-NEXT: buffer_gl0_inv 14808; GFX1032-NEXT: .LBB28_2: 14809; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 14810; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 14811; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 14812; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 14813; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 14814; GFX1032-NEXT: v_mov_b32_e32 v1, 0 14815; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo 14816; GFX1032-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1] 14817; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 14818; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc_lo 14819; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 14820; GFX1032-NEXT: s_mov_b32 s2, -1 14821; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 14822; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 14823; GFX1032-NEXT: s_endpgm 14824; 14825; GFX1164-LABEL: umax_i64_constant: 14826; GFX1164: ; %bb.0: ; %entry 14827; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 14828; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 14829; 
GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 14830; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 14831; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 14832; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc 14833; GFX1164-NEXT: s_cbranch_execz .LBB28_2 14834; GFX1164-NEXT: ; %bb.1: 14835; GFX1164-NEXT: v_mov_b32_e32 v0, 5 14836; GFX1164-NEXT: v_mov_b32_e32 v1, 0 14837; GFX1164-NEXT: v_mov_b32_e32 v2, 0 14838; GFX1164-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 14839; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 14840; GFX1164-NEXT: buffer_gl0_inv 14841; GFX1164-NEXT: .LBB28_2: 14842; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] 14843; GFX1164-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 14844; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 14845; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 14846; GFX1164-NEXT: v_mov_b32_e32 v1, 0 14847; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 14848; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 14849; GFX1164-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] 14850; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 14851; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc 14852; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 14853; GFX1164-NEXT: s_mov_b32 s2, -1 14854; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 14855; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 14856; GFX1164-NEXT: s_endpgm 14857; 14858; GFX1132-LABEL: umax_i64_constant: 14859; GFX1132: ; %bb.0: ; %entry 14860; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 14861; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 14862; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 14863; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 14864; GFX1132-NEXT: s_and_saveexec_b32 s0, vcc_lo 14865; GFX1132-NEXT: s_cbranch_execz .LBB28_2 14866; GFX1132-NEXT: ; %bb.1: 14867; GFX1132-NEXT: v_mov_b32_e32 v0, 5 14868; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 14869; GFX1132-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 14870; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 14871; GFX1132-NEXT: buffer_gl0_inv 14872; GFX1132-NEXT: .LBB28_2: 14873; 
GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 14874; GFX1132-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 14875; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 14876; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 14877; GFX1132-NEXT: v_mov_b32_e32 v1, 0 14878; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo 14879; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 14880; GFX1132-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1] 14881; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 14882; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc_lo 14883; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 14884; GFX1132-NEXT: s_mov_b32 s2, -1 14885; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 14886; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 14887; GFX1132-NEXT: s_endpgm 14888entry: 14889 %old = atomicrmw umax ptr addrspace(3) @local_var64, i64 5 acq_rel 14890 store i64 %old, ptr addrspace(1) %out 14891 ret void 14892} 14893 14894define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { 14895; GFX7LESS_ITERATIVE-LABEL: umax_i64_varying: 14896; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry 14897; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec 14898; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 14899; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 14900; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 14901; GFX7LESS_ITERATIVE-NEXT: .LBB29_1: ; %ComputeLoop 14902; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 14903; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] 14904; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s8 14905; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8 14906; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 14907; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 14908; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 14909; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 14910; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9 14911; GFX7LESS_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5] 14912; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec 14913; 
GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 14914; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 14915; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 14916; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] 14917; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0 14918; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] 14919; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB29_1 14920; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 14921; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 14922; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 14923; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 14924; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 14925; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc 14926; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] 14927; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB29_4 14928; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: 14929; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 14930; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 14931; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 14932; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 14933; GFX7LESS_ITERATIVE-NEXT: ds_max_rtn_u64 v[3:4], v0, v[3:4] 14934; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 14935; GFX7LESS_ITERATIVE-NEXT: .LBB29_4: 14936; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] 14937; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 14938; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 14939; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 14940; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 14941; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3 14942; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, s5 14943; GFX7LESS_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[1:2] 14944; GFX7LESS_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc 14945; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4 14946; GFX7LESS_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc 14947; 
GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 14948; GFX7LESS_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 14949; GFX7LESS_ITERATIVE-NEXT: s_endpgm 14950; 14951; GFX8_ITERATIVE-LABEL: umax_i64_varying: 14952; GFX8_ITERATIVE: ; %bb.0: ; %entry 14953; GFX8_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec 14954; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 14955; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 14956; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 14957; GFX8_ITERATIVE-NEXT: .LBB29_1: ; %ComputeLoop 14958; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 14959; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] 14960; GFX8_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8 14961; GFX8_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 14962; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 14963; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9 14964; GFX8_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5] 14965; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8 14966; GFX8_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec 14967; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 14968; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 14969; GFX8_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 14970; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 14971; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 14972; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] 14973; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 14974; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 14975; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 14976; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 14977; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 14978; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 14979; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 14980; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc 14981; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] 14982; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB29_4 14983; GFX8_ITERATIVE-NEXT: ; %bb.3: 14984; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 14985; 
GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 14986; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 14987; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 14988; GFX8_ITERATIVE-NEXT: ds_max_rtn_u64 v[3:4], v0, v[3:4] 14989; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 14990; GFX8_ITERATIVE-NEXT: .LBB29_4: 14991; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] 14992; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 14993; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 14994; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3 14995; GFX8_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[1:2] 14996; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, s5 14997; GFX8_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc 14998; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4 14999; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 15000; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 15001; GFX8_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc 15002; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 15003; GFX8_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 15004; GFX8_ITERATIVE-NEXT: s_endpgm 15005; 15006; GFX9_ITERATIVE-LABEL: umax_i64_varying: 15007; GFX9_ITERATIVE: ; %bb.0: ; %entry 15008; GFX9_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec 15009; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 15010; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 15011; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 15012; GFX9_ITERATIVE-NEXT: .LBB29_1: ; %ComputeLoop 15013; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 15014; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] 15015; GFX9_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8 15016; GFX9_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 15017; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 15018; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9 15019; GFX9_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5] 15020; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8 15021; GFX9_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec 15022; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 15023; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, 
s0, m0 15024; GFX9_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 15025; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 15026; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 15027; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] 15028; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 15029; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 15030; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 15031; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 15032; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 15033; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 15034; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 15035; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc 15036; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] 15037; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB29_4 15038; GFX9_ITERATIVE-NEXT: ; %bb.3: 15039; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 15040; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 15041; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 15042; GFX9_ITERATIVE-NEXT: ds_max_rtn_u64 v[3:4], v0, v[3:4] 15043; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 15044; GFX9_ITERATIVE-NEXT: .LBB29_4: 15045; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] 15046; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 15047; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 15048; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3 15049; GFX9_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[1:2] 15050; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s5 15051; GFX9_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc 15052; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4 15053; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 15054; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 15055; GFX9_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc 15056; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 15057; GFX9_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 15058; GFX9_ITERATIVE-NEXT: s_endpgm 15059; 15060; GFX1064_ITERATIVE-LABEL: umax_i64_varying: 15061; GFX1064_ITERATIVE: ; %bb.0: ; %entry 
15062; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 15063; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec 15064; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 15065; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 15066; GFX1064_ITERATIVE-NEXT: .LBB29_1: ; %ComputeLoop 15067; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 15068; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3] 15069; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 15070; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 15071; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10 15072; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10 15073; GFX1064_ITERATIVE-NEXT: v_cmp_gt_u64_e64 s[8:9], s[0:1], s[6:7] 15074; GFX1064_ITERATIVE-NEXT: s_and_b64 s[8:9], s[8:9], exec 15075; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 15076; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 15077; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 15078; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] 15079; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 15080; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 15081; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 15082; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 15083; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 15084; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 15085; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 15086; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc 15087; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] 15088; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB29_4 15089; GFX1064_ITERATIVE-NEXT: ; %bb.3: 15090; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 15091; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 15092; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 15093; GFX1064_ITERATIVE-NEXT: ds_max_rtn_u64 v[3:4], v0, v[3:4] 15094; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 15095; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv 15096; GFX1064_ITERATIVE-NEXT: .LBB29_4: 15097; 
GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 15098; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] 15099; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 15100; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 15101; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 15102; GFX1064_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[1:2] 15103; GFX1064_ITERATIVE-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc 15104; GFX1064_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc 15105; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 15106; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 15107; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 15108; GFX1064_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 15109; GFX1064_ITERATIVE-NEXT: s_endpgm 15110; 15111; GFX1032_ITERATIVE-LABEL: umax_i64_varying: 15112; GFX1032_ITERATIVE: ; %bb.0: ; %entry 15113; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 15114; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo 15115; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 15116; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 15117; GFX1032_ITERATIVE-NEXT: .LBB29_1: ; %ComputeLoop 15118; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 15119; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 15120; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 15121; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 15122; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3 15123; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3 15124; GFX1032_ITERATIVE-NEXT: v_cmp_gt_u64_e64 s8, s[0:1], s[6:7] 15125; GFX1032_ITERATIVE-NEXT: s_and_b32 s8, s8, exec_lo 15126; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 15127; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 15128; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 15129; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 15130; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 15131; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 15132; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 15133; 
GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 15134; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 15135; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 15136; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s2, vcc_lo 15137; GFX1032_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 15138; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB29_4 15139; GFX1032_ITERATIVE-NEXT: ; %bb.3: 15140; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 15141; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 15142; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 15143; GFX1032_ITERATIVE-NEXT: ds_max_rtn_u64 v[3:4], v0, v[3:4] 15144; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 15145; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv 15146; GFX1032_ITERATIVE-NEXT: .LBB29_4: 15147; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 15148; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s2 15149; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 15150; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 15151; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 15152; GFX1032_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[1:2] 15153; GFX1032_ITERATIVE-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc_lo 15154; GFX1032_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo 15155; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 15156; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 15157; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 15158; GFX1032_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 15159; GFX1032_ITERATIVE-NEXT: s_endpgm 15160; 15161; GFX1164_ITERATIVE-LABEL: umax_i64_varying: 15162; GFX1164_ITERATIVE: ; %bb.0: ; %entry 15163; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 15164; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 15165; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec 15166; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 15167; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 15168; GFX1164_ITERATIVE-NEXT: .p2align 6 15169; GFX1164_ITERATIVE-NEXT: .LBB29_1: ; 
%ComputeLoop 15170; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 15171; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3] 15172; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 15173; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 15174; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 15175; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10 15176; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10 15177; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 15178; GFX1164_ITERATIVE-NEXT: v_cmp_gt_u64_e64 s[8:9], s[0:1], s[6:7] 15179; GFX1164_ITERATIVE-NEXT: s_and_b64 s[8:9], s[8:9], exec 15180; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 15181; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 15182; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 15183; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 15184; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] 15185; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 15186; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 15187; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 15188; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 15189; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 15190; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 15191; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 15192; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 15193; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc 15194; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 15195; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] 15196; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB29_4 15197; GFX1164_ITERATIVE-NEXT: ; %bb.3: 15198; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 15199; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0 15200; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 15201; 
GFX1164_ITERATIVE-NEXT: ds_max_rtn_u64 v[2:3], v4, v[2:3] 15202; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 15203; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv 15204; GFX1164_ITERATIVE-NEXT: .LBB29_4: 15205; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] 15206; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 15207; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 15208; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 15209; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) 15210; GFX1164_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] 15211; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 15212; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 15213; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 15214; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 15215; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 15216; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 15217; GFX1164_ITERATIVE-NEXT: s_endpgm 15218; 15219; GFX1132_ITERATIVE-LABEL: umax_i64_varying: 15220; GFX1132_ITERATIVE: ; %bb.0: ; %entry 15221; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 15222; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo 15223; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 15224; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 15225; GFX1132_ITERATIVE-NEXT: .p2align 6 15226; GFX1132_ITERATIVE-NEXT: .LBB29_1: ; %ComputeLoop 15227; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 15228; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2 15229; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 15230; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 15231; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 15232; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 15233; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 15234; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 15235; GFX1132_ITERATIVE-NEXT: v_cmp_gt_u64_e64 s8, 
s[0:1], s[6:7] 15236; GFX1132_ITERATIVE-NEXT: s_and_b32 s8, s8, exec_lo 15237; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 15238; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 15239; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 15240; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 15241; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 15242; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 15243; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 15244; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 15245; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 15246; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 15247; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 15248; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 15249; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s2, vcc_lo 15250; GFX1132_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 15251; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB29_4 15252; GFX1132_ITERATIVE-NEXT: ; %bb.3: 15253; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1 15254; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 15255; GFX1132_ITERATIVE-NEXT: ds_max_rtn_u64 v[2:3], v4, v[2:3] 15256; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 15257; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv 15258; GFX1132_ITERATIVE-NEXT: .LBB29_4: 15259; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s2 15260; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 15261; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 15262; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 15263; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) 15264; GFX1132_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1] 15265; GFX1132_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 15266; GFX1132_ITERATIVE-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 15267; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 15268; GFX1132_ITERATIVE-NEXT: 
s_mov_b32 s2, -1 15269; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 15270; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 15271; GFX1132_ITERATIVE-NEXT: s_endpgm 15272; 15273; GFX7LESS_DPP-LABEL: umax_i64_varying: 15274; GFX7LESS_DPP: ; %bb.0: ; %entry 15275; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 15276; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 15277; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 15278; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) 15279; GFX7LESS_DPP-NEXT: ds_max_rtn_u64 v[0:1], v1, v[0:1] 15280; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) 15281; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000 15282; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1 15283; GFX7LESS_DPP-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 15284; GFX7LESS_DPP-NEXT: s_endpgm 15285; 15286; GFX8_DPP-LABEL: umax_i64_varying: 15287; GFX8_DPP: ; %bb.0: ; %entry 15288; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v5, exec_lo, 0 15289; GFX8_DPP-NEXT: v_mov_b32_e32 v7, 0 15290; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v5, exec_hi, v5 15291; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 15292; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, 0, 0, s[0:1] 15293; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] 15294; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 15295; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 15296; GFX8_DPP-NEXT: s_nop 0 15297; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf 15298; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 15299; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4] 15300; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 15301; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 15302; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 15303; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 15304; GFX8_DPP-NEXT: s_nop 0 15305; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:2 row_mask:0xf bank_mask:0xf 15306; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:2 row_mask:0xf bank_mask:0xf 15307; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4] 15308; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v4, 
v2, vcc 15309; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 15310; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 15311; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 15312; GFX8_DPP-NEXT: s_nop 0 15313; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf 15314; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf 15315; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4] 15316; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 15317; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 15318; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 15319; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 15320; GFX8_DPP-NEXT: s_nop 0 15321; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:8 row_mask:0xf bank_mask:0xf 15322; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:8 row_mask:0xf bank_mask:0xf 15323; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4] 15324; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 15325; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 15326; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 15327; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 15328; GFX8_DPP-NEXT: s_nop 0 15329; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 15330; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 15331; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4] 15332; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 15333; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 15334; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 15335; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 15336; GFX8_DPP-NEXT: s_nop 0 15337; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 15338; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 15339; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4] 15340; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 15341; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 15342; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 15343; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 15344; GFX8_DPP-NEXT: v_readlane_b32 s3, v2, 63 15345; 
GFX8_DPP-NEXT: v_readlane_b32 s2, v1, 63 15346; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 15347; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf 15348; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] 15349; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 15350; GFX8_DPP-NEXT: ; implicit-def: $vgpr5_vgpr6 15351; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc 15352; GFX8_DPP-NEXT: s_cbranch_execz .LBB29_2 15353; GFX8_DPP-NEXT: ; %bb.1: 15354; GFX8_DPP-NEXT: v_mov_b32_e32 v6, s3 15355; GFX8_DPP-NEXT: v_mov_b32_e32 v5, s2 15356; GFX8_DPP-NEXT: s_mov_b32 m0, -1 15357; GFX8_DPP-NEXT: ds_max_rtn_u64 v[5:6], v7, v[5:6] 15358; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) 15359; GFX8_DPP-NEXT: .LBB29_2: 15360; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] 15361; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 15362; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v6 15363; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v5 15364; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 15365; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 15366; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[5:6] 15367; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s5 15368; GFX8_DPP-NEXT: v_cndmask_b32_e32 v6, v6, v0, vcc 15369; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4 15370; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 15371; GFX8_DPP-NEXT: s_mov_b32 s2, -1 15372; GFX8_DPP-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc 15373; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) 15374; GFX8_DPP-NEXT: buffer_store_dwordx2 v[5:6], off, s[0:3], 0 15375; GFX8_DPP-NEXT: s_endpgm 15376; 15377; GFX9_DPP-LABEL: umax_i64_varying: 15378; GFX9_DPP: ; %bb.0: ; %entry 15379; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v5, exec_lo, 0 15380; GFX9_DPP-NEXT: v_mov_b32_e32 v7, 0 15381; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v5, exec_hi, v5 15382; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 15383; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, 0, 0, s[0:1] 15384; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] 15385; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 15386; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 
15387; GFX9_DPP-NEXT: s_nop 0 15388; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf 15389; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 15390; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4] 15391; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 15392; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 15393; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 15394; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 15395; GFX9_DPP-NEXT: s_nop 0 15396; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:2 row_mask:0xf bank_mask:0xf 15397; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:2 row_mask:0xf bank_mask:0xf 15398; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4] 15399; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 15400; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 15401; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 15402; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 15403; GFX9_DPP-NEXT: s_nop 0 15404; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf 15405; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf 15406; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4] 15407; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 15408; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 15409; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 15410; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 15411; GFX9_DPP-NEXT: s_nop 0 15412; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:8 row_mask:0xf bank_mask:0xf 15413; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:8 row_mask:0xf bank_mask:0xf 15414; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4] 15415; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 15416; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 15417; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 15418; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 15419; GFX9_DPP-NEXT: s_nop 0 15420; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 15421; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 15422; GFX9_DPP-NEXT: 
v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4] 15423; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 15424; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 15425; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 15426; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 15427; GFX9_DPP-NEXT: s_nop 0 15428; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 15429; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 15430; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4] 15431; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 15432; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 15433; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 15434; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 15435; GFX9_DPP-NEXT: v_readlane_b32 s3, v2, 63 15436; GFX9_DPP-NEXT: v_readlane_b32 s2, v1, 63 15437; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 15438; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf 15439; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] 15440; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 15441; GFX9_DPP-NEXT: ; implicit-def: $vgpr5_vgpr6 15442; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc 15443; GFX9_DPP-NEXT: s_cbranch_execz .LBB29_2 15444; GFX9_DPP-NEXT: ; %bb.1: 15445; GFX9_DPP-NEXT: v_mov_b32_e32 v6, s3 15446; GFX9_DPP-NEXT: v_mov_b32_e32 v5, s2 15447; GFX9_DPP-NEXT: ds_max_rtn_u64 v[5:6], v7, v[5:6] 15448; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) 15449; GFX9_DPP-NEXT: .LBB29_2: 15450; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] 15451; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 15452; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v6 15453; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v5 15454; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 15455; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 15456; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[5:6] 15457; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s5 15458; GFX9_DPP-NEXT: v_cndmask_b32_e32 v6, v6, v0, vcc 15459; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4 15460; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 15461; GFX9_DPP-NEXT: 
s_mov_b32 s2, -1 15462; GFX9_DPP-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc 15463; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) 15464; GFX9_DPP-NEXT: buffer_store_dwordx2 v[5:6], off, s[0:3], 0 15465; GFX9_DPP-NEXT: s_endpgm 15466; 15467; GFX1064_DPP-LABEL: umax_i64_varying: 15468; GFX1064_DPP: ; %bb.0: ; %entry 15469; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 15470; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v2, 0, 0, s[0:1] 15471; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 15472; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 15473; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] 15474; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, 0 15475; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 15476; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf 15477; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 15478; GFX1064_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4] 15479; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 15480; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 15481; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 15482; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 15483; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf 15484; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf 15485; GFX1064_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6] 15486; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc 15487; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc 15488; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, 0 15489; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 15490; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf 15491; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf 15492; GFX1064_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4] 15493; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 15494; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 15495; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 15496; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 15497; GFX1064_DPP-NEXT: 
v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf 15498; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf 15499; GFX1064_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6] 15500; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc 15501; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc 15502; GFX1064_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 15503; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v1, -1, -1 15504; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 15505; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 15506; GFX1064_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4] 15507; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 15508; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 15509; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 15510; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 15511; GFX1064_DPP-NEXT: v_readlane_b32 s2, v2, 31 15512; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 31 15513; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, s2 15514; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, s3 15515; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 15516; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 15517; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 15518; GFX1064_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4] 15519; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 15520; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 15521; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 15522; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] 15523; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 15524; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 15525; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf 15526; GFX1064_DPP-NEXT: v_readlane_b32 s2, v2, 15 15527; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf 15528; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 15 15529; GFX1064_DPP-NEXT: 
v_readlane_b32 s6, v2, 31 15530; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 31 15531; GFX1064_DPP-NEXT: v_writelane_b32 v5, s2, 16 15532; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 63 15533; GFX1064_DPP-NEXT: v_writelane_b32 v4, s3, 16 15534; GFX1064_DPP-NEXT: v_readlane_b32 s8, v2, 47 15535; GFX1064_DPP-NEXT: v_readlane_b32 s3, v2, 63 15536; GFX1064_DPP-NEXT: v_readlane_b32 s9, v1, 47 15537; GFX1064_DPP-NEXT: v_writelane_b32 v5, s6, 32 15538; GFX1064_DPP-NEXT: v_writelane_b32 v4, s7, 32 15539; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] 15540; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 15541; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 15542; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 15543; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] 15544; GFX1064_DPP-NEXT: v_writelane_b32 v5, s8, 48 15545; GFX1064_DPP-NEXT: v_writelane_b32 v4, s9, 48 15546; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] 15547; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 15548; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 15549; GFX1064_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 15550; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc 15551; GFX1064_DPP-NEXT: s_cbranch_execz .LBB29_2 15552; GFX1064_DPP-NEXT: ; %bb.1: 15553; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s1 15554; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, s0 15555; GFX1064_DPP-NEXT: ds_max_rtn_u64 v[7:8], v0, v[7:8] 15556; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) 15557; GFX1064_DPP-NEXT: buffer_gl0_inv 15558; GFX1064_DPP-NEXT: .LBB29_2: 15559; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 15560; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[6:7] 15561; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 15562; GFX1064_DPP-NEXT: s_mov_b32 null, 0 15563; GFX1064_DPP-NEXT: v_readfirstlane_b32 s5, v8 15564; GFX1064_DPP-NEXT: v_readfirstlane_b32 s4, v7 15565; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v4 15566; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v5 15567; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 15568; GFX1064_DPP-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[7:8] 15569; GFX1064_DPP-NEXT: 
v_cndmask_b32_e64 v8, v8, s5, vcc 15570; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s4, vcc 15571; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) 15572; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 15573; GFX1064_DPP-NEXT: s_endpgm 15574; 15575; GFX1032_DPP-LABEL: umax_i64_varying: 15576; GFX1032_DPP: ; %bb.0: ; %entry 15577; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 15578; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, 0, 0, s2 15579; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 15580; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 15581; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s2 15582; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, 0 15583; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0 15584; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf 15585; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 15586; GFX1032_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[1:2], v[3:4] 15587; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo 15588; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo 15589; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 15590; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 15591; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf 15592; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf 15593; GFX1032_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[1:2], v[5:6] 15594; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo 15595; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo 15596; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, 0 15597; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0 15598; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf 15599; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf 15600; GFX1032_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[1:2], v[3:4] 15601; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo 15602; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo 15603; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 15604; GFX1032_DPP-NEXT: 
v_mov_b32_e32 v3, 0 15605; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf 15606; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf 15607; GFX1032_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[1:2], v[5:6] 15608; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo 15609; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo 15610; GFX1032_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 15611; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v1, -1, -1 15612; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 15613; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 15614; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0 15615; GFX1032_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[1:2], v[3:4] 15616; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo 15617; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo 15618; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 15619; GFX1032_DPP-NEXT: v_readlane_b32 s3, v2, 15 15620; GFX1032_DPP-NEXT: v_readlane_b32 s1, v2, 31 15621; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31 15622; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf 15623; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf 15624; GFX1032_DPP-NEXT: v_readlane_b32 s6, v1, 15 15625; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 15626; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 15627; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 15628; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 15629; GFX1032_DPP-NEXT: v_writelane_b32 v5, s3, 16 15630; GFX1032_DPP-NEXT: v_writelane_b32 v4, s6, 16 15631; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 15632; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 15633; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 15634; GFX1032_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 15635; GFX1032_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo 15636; GFX1032_DPP-NEXT: s_cbranch_execz .LBB29_2 15637; GFX1032_DPP-NEXT: ; %bb.1: 15638; GFX1032_DPP-NEXT: 
v_mov_b32_e32 v8, s1 15639; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, s0 15640; GFX1032_DPP-NEXT: ds_max_rtn_u64 v[7:8], v0, v[7:8] 15641; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) 15642; GFX1032_DPP-NEXT: buffer_gl0_inv 15643; GFX1032_DPP-NEXT: .LBB29_2: 15644; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 15645; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 15646; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 15647; GFX1032_DPP-NEXT: s_mov_b32 null, 0 15648; GFX1032_DPP-NEXT: v_readfirstlane_b32 s5, v8 15649; GFX1032_DPP-NEXT: v_readfirstlane_b32 s4, v7 15650; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v4 15651; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v5 15652; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 15653; GFX1032_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[4:5], v[7:8] 15654; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s5, vcc_lo 15655; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s4, vcc_lo 15656; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) 15657; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 15658; GFX1032_DPP-NEXT: s_endpgm 15659; 15660; GFX1164_DPP-LABEL: umax_i64_varying: 15661; GFX1164_DPP: ; %bb.0: ; %entry 15662; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 15663; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 15664; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 15665; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v2, 0, 0, s[0:1] 15666; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 15667; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 15668; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] 15669; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, 0 15670; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 15671; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf 15672; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) 15673; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 15674; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4] 15675; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 15676; 
GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 15677; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 15678; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 15679; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 15680; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf 15681; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf 15682; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) 15683; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6] 15684; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc 15685; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc 15686; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, 0 15687; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 15688; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf 15689; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) 15690; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf 15691; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4] 15692; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 15693; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 15694; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 15695; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 15696; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 15697; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf 15698; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf 15699; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) 15700; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6] 15701; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc 15702; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc 15703; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 15704; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 
15705; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v1, -1, -1 15706; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 15707; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 15708; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 15709; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4] 15710; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 15711; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 15712; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 15713; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 15714; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 15715; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 31 15716; GFX1164_DPP-NEXT: v_readlane_b32 s3, v1, 31 15717; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 15718; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, s2 15719; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s3 15720; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 15721; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 15722; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 15723; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 15724; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) 15725; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4] 15726; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 15727; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 15728; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 15729; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] 15730; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 15731; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 15732; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf 15733; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 15 15734; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 
row_mask:0xf bank_mask:0xf 15735; GFX1164_DPP-NEXT: v_readlane_b32 s3, v1, 15 15736; GFX1164_DPP-NEXT: v_readlane_b32 s6, v2, 31 15737; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 31 15738; GFX1164_DPP-NEXT: v_writelane_b32 v5, s2, 16 15739; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 63 15740; GFX1164_DPP-NEXT: v_writelane_b32 v4, s3, 16 15741; GFX1164_DPP-NEXT: v_readlane_b32 s8, v2, 47 15742; GFX1164_DPP-NEXT: v_readlane_b32 s3, v2, 63 15743; GFX1164_DPP-NEXT: v_readlane_b32 s9, v1, 47 15744; GFX1164_DPP-NEXT: v_writelane_b32 v5, s6, 32 15745; GFX1164_DPP-NEXT: v_writelane_b32 v4, s7, 32 15746; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] 15747; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 15748; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 15749; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 15750; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 15751; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] 15752; GFX1164_DPP-NEXT: v_writelane_b32 v5, s8, 48 15753; GFX1164_DPP-NEXT: v_writelane_b32 v4, s9, 48 15754; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] 15755; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 15756; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 15757; GFX1164_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 15758; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc 15759; GFX1164_DPP-NEXT: s_cbranch_execz .LBB29_2 15760; GFX1164_DPP-NEXT: ; %bb.1: 15761; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s1 15762; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s0 15763; GFX1164_DPP-NEXT: ds_max_rtn_u64 v[7:8], v0, v[7:8] 15764; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) 15765; GFX1164_DPP-NEXT: buffer_gl0_inv 15766; GFX1164_DPP-NEXT: .LBB29_2: 15767; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[6:7] 15768; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 15769; GFX1164_DPP-NEXT: v_readfirstlane_b32 s5, v8 15770; GFX1164_DPP-NEXT: v_readfirstlane_b32 s4, v7 15771; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v4 15772; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v5 15773; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 15774; 
GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) 15775; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[7:8] 15776; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s5, vcc 15777; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s4, vcc 15778; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) 15779; GFX1164_DPP-NEXT: buffer_store_b64 v[7:8], off, s[0:3], 0 15780; GFX1164_DPP-NEXT: s_endpgm 15781; 15782; GFX1132_DPP-LABEL: umax_i64_varying: 15783; GFX1132_DPP: ; %bb.0: ; %entry 15784; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 15785; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 15786; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) 15787; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, 0, 0, s2 15788; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, 0 15789; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s2 15790; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v5, 0 15791; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 15792; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf 15793; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 15794; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) 15795; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[1:2], v[3:4] 15796; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v2, v4, v2 :: v_dual_cndmask_b32 v1, v3, v1 15797; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, 0 15798; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf 15799; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 15800; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf 15801; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[1:2], v[5:6] 15802; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v2, v6, v2 :: v_dual_cndmask_b32 v1, v5, v1 15803; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v5, 0 15804; 
GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) 15805; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf 15806; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf 15807; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) 15808; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[1:2], v[3:4] 15809; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v2, v4, v2 :: v_dual_cndmask_b32 v1, v3, v1 15810; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, 0 15811; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf 15812; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 15813; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf 15814; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[1:2], v[5:6] 15815; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v2, v6, v2 :: v_dual_cndmask_b32 v1, v5, v1 15816; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) 15817; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 15818; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v1, -1, -1 15819; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 15820; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 15821; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 15822; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) 15823; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[1:2], v[3:4] 15824; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_cndmask_b32 v2, v4, v2 15825; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_cndmask_b32 v1, v3, v1 15826; GFX1132_DPP-NEXT: v_readlane_b32 s3, v2, 15 15827; GFX1132_DPP-NEXT: v_readlane_b32 s1, v2, 31 15828; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | 
instid1(SALU_CYCLE_1) 15829; GFX1132_DPP-NEXT: v_readlane_b32 s0, v1, 31 15830; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf 15831; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf 15832; GFX1132_DPP-NEXT: v_readlane_b32 s6, v1, 15 15833; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 15834; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 15835; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 15836; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 15837; GFX1132_DPP-NEXT: v_writelane_b32 v5, s3, 16 15838; GFX1132_DPP-NEXT: v_writelane_b32 v4, s6, 16 15839; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 15840; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 15841; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 15842; GFX1132_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 15843; GFX1132_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo 15844; GFX1132_DPP-NEXT: s_cbranch_execz .LBB29_2 15845; GFX1132_DPP-NEXT: ; %bb.1: 15846; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0 15847; GFX1132_DPP-NEXT: ds_max_rtn_u64 v[7:8], v0, v[7:8] 15848; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) 15849; GFX1132_DPP-NEXT: buffer_gl0_inv 15850; GFX1132_DPP-NEXT: .LBB29_2: 15851; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 15852; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 15853; GFX1132_DPP-NEXT: v_readfirstlane_b32 s5, v8 15854; GFX1132_DPP-NEXT: v_readfirstlane_b32 s4, v7 15855; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v4 15856; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v5 15857; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 15858; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) 15859; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[4:5], v[7:8] 15860; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s5, vcc_lo 15861; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s4, vcc_lo 15862; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) 15863; GFX1132_DPP-NEXT: buffer_store_b64 v[7:8], off, s[0:3], 0 15864; GFX1132_DPP-NEXT: s_endpgm 15865entry: 15866 %lane = call i32 @llvm.amdgcn.workitem.id.x() 
15867 %lane_ext = zext i32 %lane to i64 15868 %old = atomicrmw umax ptr addrspace(3) @local_var32, i64 %lane_ext acq_rel 15869 store i64 %old, ptr addrspace(1) %out 15870 ret void 15871} 15872 15873define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { 15874; GFX7LESS_ITERATIVE-LABEL: umin_i32_varying: 15875; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry 15876; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 15877; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 15878; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1 15879; GFX7LESS_ITERATIVE-NEXT: .LBB30_1: ; %ComputeLoop 15880; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 15881; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] 15882; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s3 15883; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 15884; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 15885; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 15886; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] 15887; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 15888; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] 15889; GFX7LESS_ITERATIVE-NEXT: s_min_u32 s2, s2, s8 15890; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB30_1 15891; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 15892; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 15893; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 15894; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 15895; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr0 15896; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc 15897; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 15898; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB30_4 15899; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: 15900; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 15901; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 15902; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 15903; GFX7LESS_ITERATIVE-NEXT: ds_min_rtn_u32 v0, v0, v2 15904; 
GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 15905; GFX7LESS_ITERATIVE-NEXT: .LBB30_4: 15906; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] 15907; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 15908; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 15909; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 15910; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 15911; GFX7LESS_ITERATIVE-NEXT: v_min_u32_e32 v0, s4, v1 15912; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 15913; GFX7LESS_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 15914; GFX7LESS_ITERATIVE-NEXT: s_endpgm 15915; 15916; GFX8_ITERATIVE-LABEL: umin_i32_varying: 15917; GFX8_ITERATIVE: ; %bb.0: ; %entry 15918; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 15919; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 15920; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 15921; GFX8_ITERATIVE-NEXT: .LBB30_1: ; %ComputeLoop 15922; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 15923; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] 15924; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 15925; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 15926; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 15927; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 15928; GFX8_ITERATIVE-NEXT: s_min_u32 s2, s2, s8 15929; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] 15930; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 15931; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 15932; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 15933; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 15934; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 15935; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 15936; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr0 15937; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc 15938; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 15939; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB30_4 15940; GFX8_ITERATIVE-NEXT: ; %bb.3: 15941; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 15942; GFX8_ITERATIVE-NEXT: 
v_mov_b32_e32 v2, s2 15943; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 15944; GFX8_ITERATIVE-NEXT: ds_min_rtn_u32 v0, v0, v2 15945; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 15946; GFX8_ITERATIVE-NEXT: .LBB30_4: 15947; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] 15948; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 15949; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 15950; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 15951; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 15952; GFX8_ITERATIVE-NEXT: v_min_u32_e32 v0, s4, v1 15953; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 15954; GFX8_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 15955; GFX8_ITERATIVE-NEXT: s_endpgm 15956; 15957; GFX9_ITERATIVE-LABEL: umin_i32_varying: 15958; GFX9_ITERATIVE: ; %bb.0: ; %entry 15959; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 15960; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 15961; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 15962; GFX9_ITERATIVE-NEXT: .LBB30_1: ; %ComputeLoop 15963; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 15964; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] 15965; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 15966; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 15967; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 15968; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 15969; GFX9_ITERATIVE-NEXT: s_min_u32 s2, s2, s8 15970; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] 15971; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 15972; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 15973; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 15974; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 15975; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 15976; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 15977; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0 15978; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc 15979; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 15980; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB30_4 15981; GFX9_ITERATIVE-NEXT: ; 
%bb.3: 15982; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 15983; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 15984; GFX9_ITERATIVE-NEXT: ds_min_rtn_u32 v0, v0, v2 15985; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 15986; GFX9_ITERATIVE-NEXT: .LBB30_4: 15987; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] 15988; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 15989; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 15990; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 15991; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 15992; GFX9_ITERATIVE-NEXT: v_min_u32_e32 v0, s4, v1 15993; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 15994; GFX9_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 15995; GFX9_ITERATIVE-NEXT: s_endpgm 15996; 15997; GFX1064_ITERATIVE-LABEL: umin_i32_varying: 15998; GFX1064_ITERATIVE: ; %bb.0: ; %entry 15999; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 16000; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 16001; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 16002; GFX1064_ITERATIVE-NEXT: .LBB30_1: ; %ComputeLoop 16003; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 16004; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] 16005; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 16006; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 16007; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 16008; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] 16009; GFX1064_ITERATIVE-NEXT: s_min_u32 s2, s2, s8 16010; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 16011; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 16012; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 16013; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 16014; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 16015; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 16016; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0 16017; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc 16018; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 16019; 
GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB30_4 16020; GFX1064_ITERATIVE-NEXT: ; %bb.3: 16021; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 16022; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 16023; GFX1064_ITERATIVE-NEXT: ds_min_rtn_u32 v0, v0, v2 16024; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 16025; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv 16026; GFX1064_ITERATIVE-NEXT: .LBB30_4: 16027; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 16028; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] 16029; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 16030; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 16031; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 16032; GFX1064_ITERATIVE-NEXT: v_min_u32_e32 v0, s2, v1 16033; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 16034; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 16035; GFX1064_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 16036; GFX1064_ITERATIVE-NEXT: s_endpgm 16037; 16038; GFX1032_ITERATIVE-LABEL: umin_i32_varying: 16039; GFX1032_ITERATIVE: ; %bb.0: ; %entry 16040; GFX1032_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo 16041; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, -1 16042; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 16043; GFX1032_ITERATIVE-NEXT: .LBB30_1: ; %ComputeLoop 16044; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 16045; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 16046; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 16047; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 16048; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 16049; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 16050; GFX1032_ITERATIVE-NEXT: s_min_u32 s0, s0, s3 16051; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 16052; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 16053; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 16054; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 16055; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 16056; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0 16057; 
GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo 16058; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 16059; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB30_4 16060; GFX1032_ITERATIVE-NEXT: ; %bb.3: 16061; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 16062; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 16063; GFX1032_ITERATIVE-NEXT: ds_min_rtn_u32 v0, v0, v2 16064; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 16065; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv 16066; GFX1032_ITERATIVE-NEXT: .LBB30_4: 16067; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 16068; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 16069; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 16070; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 16071; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 16072; GFX1032_ITERATIVE-NEXT: v_min_u32_e32 v0, s2, v1 16073; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 16074; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 16075; GFX1032_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 16076; GFX1032_ITERATIVE-NEXT: s_endpgm 16077; 16078; GFX1164_ITERATIVE-LABEL: umin_i32_varying: 16079; GFX1164_ITERATIVE: ; %bb.0: ; %entry 16080; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 16081; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec 16082; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 16083; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 16084; GFX1164_ITERATIVE-NEXT: .LBB30_1: ; %ComputeLoop 16085; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 16086; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] 16087; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 16088; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 16089; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 16090; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 16091; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] 16092; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) 16093; GFX1164_ITERATIVE-NEXT: s_min_u32 s2, 
s2, s8 16094; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 16095; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 16096; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 16097; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 16098; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 16099; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 16100; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 16101; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1 16102; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc 16103; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 16104; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 16105; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB30_4 16106; GFX1164_ITERATIVE-NEXT: ; %bb.3: 16107; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 16108; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 16109; GFX1164_ITERATIVE-NEXT: ds_min_rtn_u32 v1, v1, v2 16110; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 16111; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv 16112; GFX1164_ITERATIVE-NEXT: .LBB30_4: 16113; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] 16114; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 16115; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 16116; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 16117; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) 16118; GFX1164_ITERATIVE-NEXT: v_min_u32_e32 v0, s2, v0 16119; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 16120; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 16121; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 16122; GFX1164_ITERATIVE-NEXT: s_endpgm 16123; 16124; GFX1132_ITERATIVE-LABEL: umin_i32_varying: 16125; GFX1132_ITERATIVE: ; %bb.0: ; %entry 16126; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 16127; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo 16128; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, -1 16129; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 16130; 
GFX1132_ITERATIVE-NEXT: .LBB30_1: ; %ComputeLoop 16131; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 16132; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 16133; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 16134; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 16135; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 16136; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 16137; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 16138; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) 16139; GFX1132_ITERATIVE-NEXT: s_min_u32 s0, s0, s3 16140; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 16141; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 16142; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 16143; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 16144; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 16145; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 16146; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 16147; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo 16148; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 16149; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB30_4 16150; GFX1132_ITERATIVE-NEXT: ; %bb.3: 16151; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 16152; GFX1132_ITERATIVE-NEXT: ds_min_rtn_u32 v1, v1, v2 16153; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 16154; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv 16155; GFX1132_ITERATIVE-NEXT: .LBB30_4: 16156; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 16157; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 16158; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 16159; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 16160; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) 16161; GFX1132_ITERATIVE-NEXT: v_min_u32_e32 v0, s2, v0 16162; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 16163; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 16164; 
GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 16165; GFX1132_ITERATIVE-NEXT: s_endpgm 16166; 16167; GFX7LESS_DPP-LABEL: umin_i32_varying: 16168; GFX7LESS_DPP: ; %bb.0: ; %entry 16169; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 16170; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 16171; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 16172; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) 16173; GFX7LESS_DPP-NEXT: ds_min_rtn_u32 v0, v1, v0 16174; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) 16175; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000 16176; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1 16177; GFX7LESS_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 16178; GFX7LESS_DPP-NEXT: s_endpgm 16179; 16180; GFX8_DPP-LABEL: umin_i32_varying: 16181; GFX8_DPP: ; %bb.0: ; %entry 16182; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 16183; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 16184; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 16185; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v0, s[0:1] 16186; GFX8_DPP-NEXT: v_mov_b32_e32 v1, -1 16187; GFX8_DPP-NEXT: s_nop 0 16188; GFX8_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 16189; GFX8_DPP-NEXT: s_nop 1 16190; GFX8_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 16191; GFX8_DPP-NEXT: s_nop 1 16192; GFX8_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 16193; GFX8_DPP-NEXT: s_nop 1 16194; GFX8_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 16195; GFX8_DPP-NEXT: s_nop 1 16196; GFX8_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 16197; GFX8_DPP-NEXT: s_nop 1 16198; GFX8_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 16199; GFX8_DPP-NEXT: v_readlane_b32 s2, v2, 63 16200; GFX8_DPP-NEXT: s_nop 0 16201; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 16202; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] 16203; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 16204; GFX8_DPP-NEXT: ; 
implicit-def: $vgpr0 16205; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc 16206; GFX8_DPP-NEXT: s_cbranch_execz .LBB30_2 16207; GFX8_DPP-NEXT: ; %bb.1: 16208; GFX8_DPP-NEXT: v_mov_b32_e32 v0, 0 16209; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s2 16210; GFX8_DPP-NEXT: s_mov_b32 m0, -1 16211; GFX8_DPP-NEXT: ds_min_rtn_u32 v0, v0, v3 16212; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) 16213; GFX8_DPP-NEXT: .LBB30_2: 16214; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] 16215; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 16216; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 16217; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v1 16218; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 16219; GFX8_DPP-NEXT: s_mov_b32 s2, -1 16220; GFX8_DPP-NEXT: v_min_u32_e32 v0, s4, v0 16221; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) 16222; GFX8_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 16223; GFX8_DPP-NEXT: s_endpgm 16224; 16225; GFX9_DPP-LABEL: umin_i32_varying: 16226; GFX9_DPP: ; %bb.0: ; %entry 16227; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 16228; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 16229; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 16230; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v0, s[0:1] 16231; GFX9_DPP-NEXT: v_mov_b32_e32 v1, -1 16232; GFX9_DPP-NEXT: s_nop 0 16233; GFX9_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 16234; GFX9_DPP-NEXT: s_nop 1 16235; GFX9_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 16236; GFX9_DPP-NEXT: s_nop 1 16237; GFX9_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 16238; GFX9_DPP-NEXT: s_nop 1 16239; GFX9_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 16240; GFX9_DPP-NEXT: s_nop 1 16241; GFX9_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 16242; GFX9_DPP-NEXT: s_nop 1 16243; GFX9_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 16244; GFX9_DPP-NEXT: v_readlane_b32 s2, v2, 63 16245; GFX9_DPP-NEXT: s_nop 0 16246; 
GFX9_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 16247; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] 16248; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 16249; GFX9_DPP-NEXT: ; implicit-def: $vgpr0 16250; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc 16251; GFX9_DPP-NEXT: s_cbranch_execz .LBB30_2 16252; GFX9_DPP-NEXT: ; %bb.1: 16253; GFX9_DPP-NEXT: v_mov_b32_e32 v0, 0 16254; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s2 16255; GFX9_DPP-NEXT: ds_min_rtn_u32 v0, v0, v3 16256; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) 16257; GFX9_DPP-NEXT: .LBB30_2: 16258; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] 16259; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 16260; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 16261; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v1 16262; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 16263; GFX9_DPP-NEXT: s_mov_b32 s2, -1 16264; GFX9_DPP-NEXT: v_min_u32_e32 v0, s4, v0 16265; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) 16266; GFX9_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 16267; GFX9_DPP-NEXT: s_endpgm 16268; 16269; GFX1064_DPP-LABEL: umin_i32_varying: 16270; GFX1064_DPP: ; %bb.0: ; %entry 16271; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 16272; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[0:1] 16273; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, -1 16274; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 16275; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 16276; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 16277; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 16278; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 16279; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 16280; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 31 16281; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s2 16282; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 16283; GFX1064_DPP-NEXT: 
v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 16284; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 15 16285; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 31 16286; GFX1064_DPP-NEXT: v_writelane_b32 v3, s2, 16 16287; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] 16288; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 16289; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 16290; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 47 16291; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 63 16292; GFX1064_DPP-NEXT: v_writelane_b32 v3, s3, 32 16293; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] 16294; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 16295; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 16296; GFX1064_DPP-NEXT: v_writelane_b32 v3, s2, 48 16297; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] 16298; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 16299; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 16300; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 16301; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc 16302; GFX1064_DPP-NEXT: s_cbranch_execz .LBB30_2 16303; GFX1064_DPP-NEXT: ; %bb.1: 16304; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 16305; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s6 16306; GFX1064_DPP-NEXT: s_mov_b32 s3, s6 16307; GFX1064_DPP-NEXT: ds_min_rtn_u32 v0, v0, v4 16308; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) 16309; GFX1064_DPP-NEXT: buffer_gl0_inv 16310; GFX1064_DPP-NEXT: .LBB30_2: 16311; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 16312; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[0:1] 16313; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 16314; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v0 16315; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 16316; GFX1064_DPP-NEXT: v_min_u32_e32 v0, s3, v0 16317; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 16318; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) 16319; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 16320; GFX1064_DPP-NEXT: s_endpgm 16321; 16322; GFX1032_DPP-LABEL: umin_i32_varying: 16323; GFX1032_DPP: ; %bb.0: ; %entry 16324; GFX1032_DPP-NEXT: 
s_or_saveexec_b32 s0, -1 16325; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s0 16326; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, -1 16327; GFX1032_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 16328; GFX1032_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 16329; GFX1032_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 16330; GFX1032_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 16331; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 16332; GFX1032_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 16333; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 16334; GFX1032_DPP-NEXT: v_readlane_b32 s2, v1, 31 16335; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 16336; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 16337; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 16338; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 16339; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 16340; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 16341; GFX1032_DPP-NEXT: s_mov_b32 s0, s2 16342; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 16343; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 16344; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 16345; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo 16346; GFX1032_DPP-NEXT: s_cbranch_execz .LBB30_2 16347; GFX1032_DPP-NEXT: ; %bb.1: 16348; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 16349; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s0 16350; GFX1032_DPP-NEXT: ds_min_rtn_u32 v0, v0, v4 16351; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) 16352; GFX1032_DPP-NEXT: buffer_gl0_inv 16353; GFX1032_DPP-NEXT: .LBB30_2: 16354; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 16355; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 16356; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 16357; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v0 16358; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 16359; GFX1032_DPP-NEXT: v_min_u32_e32 v0, s3, v0 16360; GFX1032_DPP-NEXT: 
s_mov_b32 s3, 0x31016000 16361; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) 16362; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 16363; GFX1032_DPP-NEXT: s_endpgm 16364; 16365; GFX1164_DPP-LABEL: umin_i32_varying: 16366; GFX1164_DPP: ; %bb.0: ; %entry 16367; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 16368; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 16369; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 16370; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[0:1] 16371; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, -1 16372; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 16373; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 16374; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 16375; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 16376; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 16377; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 16378; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 16379; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 16380; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 16381; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 16382; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 31 16383; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s2 16384; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 16385; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 16386; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 16387; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 15 16388; GFX1164_DPP-NEXT: v_readlane_b32 s3, v1, 31 16389; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | 
instid1(SALU_CYCLE_1) 16390; GFX1164_DPP-NEXT: v_writelane_b32 v3, s2, 16 16391; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] 16392; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 16393; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 16394; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 47 16395; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 63 16396; GFX1164_DPP-NEXT: v_writelane_b32 v3, s3, 32 16397; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] 16398; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) 16399; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 16400; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 16401; GFX1164_DPP-NEXT: v_writelane_b32 v3, s2, 48 16402; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] 16403; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 16404; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 16405; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0 16406; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc 16407; GFX1164_DPP-NEXT: s_cbranch_execz .LBB30_2 16408; GFX1164_DPP-NEXT: ; %bb.1: 16409; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 16410; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s6 16411; GFX1164_DPP-NEXT: s_mov_b32 s3, s6 16412; GFX1164_DPP-NEXT: ds_min_rtn_u32 v0, v0, v4 16413; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) 16414; GFX1164_DPP-NEXT: buffer_gl0_inv 16415; GFX1164_DPP-NEXT: .LBB30_2: 16416; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[0:1] 16417; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 16418; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v0 16419; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3 16420; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) 16421; GFX1164_DPP-NEXT: v_min_u32_e32 v0, s3, v0 16422; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 16423; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) 16424; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 16425; GFX1164_DPP-NEXT: s_endpgm 16426; 16427; GFX1132_DPP-LABEL: umin_i32_varying: 16428; GFX1132_DPP: ; %bb.0: ; %entry 16429; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 16430; 
GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 16431; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 16432; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s0 16433; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, -1 16434; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 16435; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 16436; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 16437; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 16438; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 16439; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 16440; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 16441; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 16442; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 16443; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) 16444; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 16445; GFX1132_DPP-NEXT: v_readlane_b32 s2, v1, 31 16446; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 16447; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 16448; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 16449; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 16450; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 16451; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 16452; GFX1132_DPP-NEXT: s_mov_b32 s0, s2 16453; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 16454; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 16455; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 16456; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo 16457; GFX1132_DPP-NEXT: s_cbranch_execz .LBB30_2 16458; GFX1132_DPP-NEXT: ; %bb.1: 16459; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 16460; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, s0 16461; 
GFX1132_DPP-NEXT: ds_min_rtn_u32 v0, v0, v4 16462; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) 16463; GFX1132_DPP-NEXT: buffer_gl0_inv 16464; GFX1132_DPP-NEXT: .LBB30_2: 16465; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 16466; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 16467; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v0 16468; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3 16469; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) 16470; GFX1132_DPP-NEXT: v_min_u32_e32 v0, s3, v0 16471; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 16472; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) 16473; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 16474; GFX1132_DPP-NEXT: s_endpgm 16475entry: 16476 %lane = call i32 @llvm.amdgcn.workitem.id.x() 16477 %old = atomicrmw umin ptr addrspace(3) @local_var32, i32 %lane acq_rel 16478 store i32 %old, ptr addrspace(1) %out 16479 ret void 16480} 16481 ; Uniform 64-bit umin of the constant 5 on the addrspace(3) global @local_var64, with the old value stored to %out. The checks below expect one ds_min_rtn_u64 guarded by an mbcnt == 0 lane test, and they use only the strategy-independent prefixes, i.e. the same expected output for the Iterative and DPP runs. 16482define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { 16483; GFX7LESS-LABEL: umin_i64_constant: 16484; GFX7LESS: ; %bb.0: ; %entry 16485; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 16486; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 16487; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 16488; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 16489; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc 16490; GFX7LESS-NEXT: s_cbranch_execz .LBB31_2 16491; GFX7LESS-NEXT: ; %bb.1: 16492; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 16493; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 16494; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 16495; GFX7LESS-NEXT: s_mov_b32 m0, -1 16496; GFX7LESS-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 16497; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 16498; GFX7LESS-NEXT: .LBB31_2: 16499; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] 16500; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 16501; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 16502; GFX7LESS-NEXT: s_mov_b32 s2, -1 16503; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 16504; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 16505; GFX7LESS-NEXT: 
v_cndmask_b32_e64 v1, 0, -1, vcc 16506; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 16507; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 16508; GFX7LESS-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] 16509; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 16510; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 16511; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 16512; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 16513; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 16514; GFX7LESS-NEXT: s_endpgm 16515; 16516; GFX8-LABEL: umin_i64_constant: 16517; GFX8: ; %bb.0: ; %entry 16518; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 16519; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 16520; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 16521; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 16522; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc 16523; GFX8-NEXT: s_cbranch_execz .LBB31_2 16524; GFX8-NEXT: ; %bb.1: 16525; GFX8-NEXT: v_mov_b32_e32 v0, 5 16526; GFX8-NEXT: v_mov_b32_e32 v1, 0 16527; GFX8-NEXT: v_mov_b32_e32 v2, 0 16528; GFX8-NEXT: s_mov_b32 m0, -1 16529; GFX8-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 16530; GFX8-NEXT: s_waitcnt lgkmcnt(0) 16531; GFX8-NEXT: .LBB31_2: 16532; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] 16533; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 16534; GFX8-NEXT: v_readfirstlane_b32 s5, v1 16535; GFX8-NEXT: v_readfirstlane_b32 s4, v0 16536; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 16537; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 16538; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] 16539; GFX8-NEXT: v_mov_b32_e32 v2, s5 16540; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 16541; GFX8-NEXT: v_mov_b32_e32 v2, s4 16542; GFX8-NEXT: s_mov_b32 s3, 0xf000 16543; GFX8-NEXT: s_mov_b32 s2, -1 16544; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 16545; GFX8-NEXT: s_waitcnt lgkmcnt(0) 16546; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 16547; GFX8-NEXT: s_endpgm 16548; 16549; GFX9-LABEL: umin_i64_constant: 16550; GFX9: ; %bb.0: ; %entry 16551; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, 
exec_lo, 0 16552; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 16553; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 16554; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 16555; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 16556; GFX9-NEXT: s_cbranch_execz .LBB31_2 16557; GFX9-NEXT: ; %bb.1: 16558; GFX9-NEXT: v_mov_b32_e32 v0, 5 16559; GFX9-NEXT: v_mov_b32_e32 v1, 0 16560; GFX9-NEXT: v_mov_b32_e32 v2, 0 16561; GFX9-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 16562; GFX9-NEXT: s_waitcnt lgkmcnt(0) 16563; GFX9-NEXT: .LBB31_2: 16564; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 16565; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 16566; GFX9-NEXT: v_readfirstlane_b32 s5, v1 16567; GFX9-NEXT: v_readfirstlane_b32 s4, v0 16568; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 16569; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 16570; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] 16571; GFX9-NEXT: v_mov_b32_e32 v2, s5 16572; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 16573; GFX9-NEXT: v_mov_b32_e32 v2, s4 16574; GFX9-NEXT: s_mov_b32 s3, 0xf000 16575; GFX9-NEXT: s_mov_b32 s2, -1 16576; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 16577; GFX9-NEXT: s_waitcnt lgkmcnt(0) 16578; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 16579; GFX9-NEXT: s_endpgm 16580; 16581; GFX1064-LABEL: umin_i64_constant: 16582; GFX1064: ; %bb.0: ; %entry 16583; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 16584; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 16585; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 16586; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 16587; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc 16588; GFX1064-NEXT: s_cbranch_execz .LBB31_2 16589; GFX1064-NEXT: ; %bb.1: 16590; GFX1064-NEXT: v_mov_b32_e32 v0, 5 16591; GFX1064-NEXT: v_mov_b32_e32 v1, 0 16592; GFX1064-NEXT: v_mov_b32_e32 v2, 0 16593; GFX1064-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 16594; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 16595; GFX1064-NEXT: buffer_gl0_inv 16596; GFX1064-NEXT: .LBB31_2: 16597; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 16598; 
GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] 16599; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 16600; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 16601; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 16602; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 16603; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 16604; GFX1064-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] 16605; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 16606; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 16607; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 16608; GFX1064-NEXT: s_mov_b32 s2, -1 16609; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 16610; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 16611; GFX1064-NEXT: s_endpgm 16612; 16613; GFX1032-LABEL: umin_i64_constant: 16614; GFX1032: ; %bb.0: ; %entry 16615; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 16616; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 16617; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 16618; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 16619; GFX1032-NEXT: s_cbranch_execz .LBB31_2 16620; GFX1032-NEXT: ; %bb.1: 16621; GFX1032-NEXT: v_mov_b32_e32 v0, 5 16622; GFX1032-NEXT: v_mov_b32_e32 v1, 0 16623; GFX1032-NEXT: v_mov_b32_e32 v2, 0 16624; GFX1032-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 16625; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 16626; GFX1032-NEXT: buffer_gl0_inv 16627; GFX1032-NEXT: .LBB31_2: 16628; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 16629; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 16630; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 16631; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 16632; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 16633; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo 16634; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo 16635; GFX1032-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1] 16636; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 16637; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 16638; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 16639; GFX1032-NEXT: s_mov_b32 s2, -1 16640; GFX1032-NEXT: s_waitcnt 
lgkmcnt(0) 16641; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 16642; GFX1032-NEXT: s_endpgm 16643; 16644; GFX1164-LABEL: umin_i64_constant: 16645; GFX1164: ; %bb.0: ; %entry 16646; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 16647; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 16648; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 16649; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 16650; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 16651; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc 16652; GFX1164-NEXT: s_cbranch_execz .LBB31_2 16653; GFX1164-NEXT: ; %bb.1: 16654; GFX1164-NEXT: v_mov_b32_e32 v0, 5 16655; GFX1164-NEXT: v_mov_b32_e32 v1, 0 16656; GFX1164-NEXT: v_mov_b32_e32 v2, 0 16657; GFX1164-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 16658; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 16659; GFX1164-NEXT: buffer_gl0_inv 16660; GFX1164-NEXT: .LBB31_2: 16661; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] 16662; GFX1164-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 16663; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 16664; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 16665; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 16666; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 16667; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 16668; GFX1164-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] 16669; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 16670; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 16671; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 16672; GFX1164-NEXT: s_mov_b32 s2, -1 16673; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 16674; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 16675; GFX1164-NEXT: s_endpgm 16676; 16677; GFX1132-LABEL: umin_i64_constant: 16678; GFX1132: ; %bb.0: ; %entry 16679; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 16680; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 16681; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 16682; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 16683; GFX1132-NEXT: s_and_saveexec_b32 s0, vcc_lo 16684; 
GFX1132-NEXT: s_cbranch_execz .LBB31_2 16685; GFX1132-NEXT: ; %bb.1: 16686; GFX1132-NEXT: v_mov_b32_e32 v0, 5 16687; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 16688; GFX1132-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 16689; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 16690; GFX1132-NEXT: buffer_gl0_inv 16691; GFX1132-NEXT: .LBB31_2: 16692; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 16693; GFX1132-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 16694; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 16695; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 16696; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo 16697; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo 16698; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 16699; GFX1132-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1] 16700; GFX1132-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 16701; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 16702; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 16703; GFX1132-NEXT: s_mov_b32 s2, -1 16704; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 16705; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 16706; GFX1132-NEXT: s_endpgm 16707entry: 16708 %old = atomicrmw umin ptr addrspace(3) @local_var64, i64 5 acq_rel 16709 store i64 %old, ptr addrspace(1) %out 16710 ret void 16711} 16712 16713define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { 16714; GFX7LESS_ITERATIVE-LABEL: umin_i64_varying: 16715; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry 16716; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec 16717; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 16718; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 16719; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 16720; GFX7LESS_ITERATIVE-NEXT: .LBB32_1: ; %ComputeLoop 16721; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 16722; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] 16723; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s8 16724; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8 16725; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 
16726; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 16727; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 16728; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 16729; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9 16730; GFX7LESS_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[4:5] 16731; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec 16732; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 16733; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 16734; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 16735; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] 16736; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0 16737; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] 16738; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB32_1 16739; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 16740; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 16741; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 16742; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 16743; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 16744; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc 16745; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] 16746; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB32_4 16747; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: 16748; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 16749; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 16750; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 16751; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 16752; GFX7LESS_ITERATIVE-NEXT: ds_min_rtn_u64 v[3:4], v0, v[3:4] 16753; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 16754; GFX7LESS_ITERATIVE-NEXT: .LBB32_4: 16755; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] 16756; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 16757; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 16758; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 16759; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 16760; 
GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3 16761; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, s5 16762; GFX7LESS_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[1:2] 16763; GFX7LESS_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc 16764; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4 16765; GFX7LESS_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc 16766; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 16767; GFX7LESS_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 16768; GFX7LESS_ITERATIVE-NEXT: s_endpgm 16769; 16770; GFX8_ITERATIVE-LABEL: umin_i64_varying: 16771; GFX8_ITERATIVE: ; %bb.0: ; %entry 16772; GFX8_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec 16773; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 16774; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 16775; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 16776; GFX8_ITERATIVE-NEXT: .LBB32_1: ; %ComputeLoop 16777; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 16778; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] 16779; GFX8_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8 16780; GFX8_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 16781; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 16782; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9 16783; GFX8_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[4:5] 16784; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8 16785; GFX8_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec 16786; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 16787; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 16788; GFX8_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 16789; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 16790; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 16791; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] 16792; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 16793; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 16794; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 16795; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 16796; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 16797; 
GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 16798; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 16799; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc 16800; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] 16801; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB32_4 16802; GFX8_ITERATIVE-NEXT: ; %bb.3: 16803; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 16804; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 16805; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 16806; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 16807; GFX8_ITERATIVE-NEXT: ds_min_rtn_u64 v[3:4], v0, v[3:4] 16808; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 16809; GFX8_ITERATIVE-NEXT: .LBB32_4: 16810; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] 16811; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 16812; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 16813; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3 16814; GFX8_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[1:2] 16815; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, s5 16816; GFX8_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc 16817; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4 16818; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 16819; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 16820; GFX8_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc 16821; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 16822; GFX8_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 16823; GFX8_ITERATIVE-NEXT: s_endpgm 16824; 16825; GFX9_ITERATIVE-LABEL: umin_i64_varying: 16826; GFX9_ITERATIVE: ; %bb.0: ; %entry 16827; GFX9_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec 16828; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 16829; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 16830; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 16831; GFX9_ITERATIVE-NEXT: .LBB32_1: ; %ComputeLoop 16832; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 16833; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] 16834; GFX9_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8 16835; GFX9_ITERATIVE-NEXT: v_readlane_b32 s10, v0, 
s8 16836; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 16837; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9 16838; GFX9_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[4:5] 16839; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8 16840; GFX9_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec 16841; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 16842; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 16843; GFX9_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 16844; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 16845; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 16846; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] 16847; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 16848; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 16849; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 16850; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 16851; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 16852; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 16853; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 16854; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc 16855; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] 16856; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB32_4 16857; GFX9_ITERATIVE-NEXT: ; %bb.3: 16858; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 16859; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 16860; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 16861; GFX9_ITERATIVE-NEXT: ds_min_rtn_u64 v[3:4], v0, v[3:4] 16862; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 16863; GFX9_ITERATIVE-NEXT: .LBB32_4: 16864; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] 16865; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 16866; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 16867; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3 16868; GFX9_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[1:2] 16869; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s5 16870; GFX9_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc 16871; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4 16872; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 
16873; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 16874; GFX9_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc 16875; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 16876; GFX9_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 16877; GFX9_ITERATIVE-NEXT: s_endpgm 16878; 16879; GFX1064_ITERATIVE-LABEL: umin_i64_varying: 16880; GFX1064_ITERATIVE: ; %bb.0: ; %entry 16881; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 16882; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec 16883; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 16884; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 16885; GFX1064_ITERATIVE-NEXT: .LBB32_1: ; %ComputeLoop 16886; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 16887; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3] 16888; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 16889; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 16890; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10 16891; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10 16892; GFX1064_ITERATIVE-NEXT: v_cmp_lt_u64_e64 s[8:9], s[0:1], s[6:7] 16893; GFX1064_ITERATIVE-NEXT: s_and_b64 s[8:9], s[8:9], exec 16894; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 16895; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 16896; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 16897; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] 16898; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 16899; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 16900; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 16901; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 16902; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 16903; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 16904; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 16905; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc 16906; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] 16907; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB32_4 16908; GFX1064_ITERATIVE-NEXT: ; %bb.3: 
16909; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 16910; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 16911; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 16912; GFX1064_ITERATIVE-NEXT: ds_min_rtn_u64 v[3:4], v0, v[3:4] 16913; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 16914; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv 16915; GFX1064_ITERATIVE-NEXT: .LBB32_4: 16916; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 16917; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] 16918; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 16919; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 16920; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 16921; GFX1064_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[1:2] 16922; GFX1064_ITERATIVE-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc 16923; GFX1064_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc 16924; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 16925; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 16926; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 16927; GFX1064_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 16928; GFX1064_ITERATIVE-NEXT: s_endpgm 16929; 16930; GFX1032_ITERATIVE-LABEL: umin_i64_varying: 16931; GFX1032_ITERATIVE: ; %bb.0: ; %entry 16932; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 16933; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo 16934; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 16935; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 16936; GFX1032_ITERATIVE-NEXT: .LBB32_1: ; %ComputeLoop 16937; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 16938; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 16939; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 16940; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 16941; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3 16942; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3 16943; GFX1032_ITERATIVE-NEXT: v_cmp_lt_u64_e64 s8, s[0:1], s[6:7] 16944; GFX1032_ITERATIVE-NEXT: s_and_b32 s8, s8, exec_lo 16945; 
GFX1032_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 16946; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 16947; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 16948; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 16949; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 16950; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 16951; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 16952; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 16953; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 16954; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 16955; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s2, vcc_lo 16956; GFX1032_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 16957; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB32_4 16958; GFX1032_ITERATIVE-NEXT: ; %bb.3: 16959; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 16960; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 16961; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 16962; GFX1032_ITERATIVE-NEXT: ds_min_rtn_u64 v[3:4], v0, v[3:4] 16963; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 16964; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv 16965; GFX1032_ITERATIVE-NEXT: .LBB32_4: 16966; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 16967; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s2 16968; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 16969; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 16970; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 16971; GFX1032_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[1:2] 16972; GFX1032_ITERATIVE-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc_lo 16973; GFX1032_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo 16974; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 16975; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 16976; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 16977; GFX1032_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 16978; GFX1032_ITERATIVE-NEXT: s_endpgm 16979; 16980; GFX1164_ITERATIVE-LABEL: umin_i64_varying: 16981; GFX1164_ITERATIVE: ; %bb.0: 
; %entry 16982; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 16983; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 16984; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec 16985; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 16986; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 16987; GFX1164_ITERATIVE-NEXT: .p2align 6 16988; GFX1164_ITERATIVE-NEXT: .LBB32_1: ; %ComputeLoop 16989; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 16990; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3] 16991; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 16992; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 16993; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 16994; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10 16995; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10 16996; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 16997; GFX1164_ITERATIVE-NEXT: v_cmp_lt_u64_e64 s[8:9], s[0:1], s[6:7] 16998; GFX1164_ITERATIVE-NEXT: s_and_b64 s[8:9], s[8:9], exec 16999; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 17000; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 17001; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 17002; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 17003; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] 17004; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 17005; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 17006; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 17007; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 17008; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 17009; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 17010; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 17011; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 17012; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc 17013; 
GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 17014; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] 17015; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB32_4 17016; GFX1164_ITERATIVE-NEXT: ; %bb.3: 17017; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 17018; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0 17019; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 17020; GFX1164_ITERATIVE-NEXT: ds_min_rtn_u64 v[2:3], v4, v[2:3] 17021; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 17022; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv 17023; GFX1164_ITERATIVE-NEXT: .LBB32_4: 17024; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] 17025; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 17026; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 17027; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 17028; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) 17029; GFX1164_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] 17030; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 17031; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 17032; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 17033; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 17034; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 17035; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 17036; GFX1164_ITERATIVE-NEXT: s_endpgm 17037; 17038; GFX1132_ITERATIVE-LABEL: umin_i64_varying: 17039; GFX1132_ITERATIVE: ; %bb.0: ; %entry 17040; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 17041; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo 17042; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 17043; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 17044; GFX1132_ITERATIVE-NEXT: .p2align 6 17045; GFX1132_ITERATIVE-NEXT: .LBB32_1: ; %ComputeLoop 17046; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 17047; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2 17048; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 
17049; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 17050; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 17051; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 17052; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 17053; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 17054; GFX1132_ITERATIVE-NEXT: v_cmp_lt_u64_e64 s8, s[0:1], s[6:7] 17055; GFX1132_ITERATIVE-NEXT: s_and_b32 s8, s8, exec_lo 17056; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 17057; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 17058; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 17059; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 17060; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 17061; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 17062; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 17063; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd 17064; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 17065; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 17066; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 17067; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 17068; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s2, vcc_lo 17069; GFX1132_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 17070; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB32_4 17071; GFX1132_ITERATIVE-NEXT: ; %bb.3: 17072; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1 17073; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 17074; GFX1132_ITERATIVE-NEXT: ds_min_rtn_u64 v[2:3], v4, v[2:3] 17075; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 17076; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv 17077; GFX1132_ITERATIVE-NEXT: .LBB32_4: 17078; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s2 17079; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 17080; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 17081; GFX1132_ITERATIVE-NEXT: 
v_readfirstlane_b32 s2, v2 17082; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) 17083; GFX1132_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1] 17084; GFX1132_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 17085; GFX1132_ITERATIVE-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 17086; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 17087; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 17088; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) 17089; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 17090; GFX1132_ITERATIVE-NEXT: s_endpgm 17091; 17092; GFX7LESS_DPP-LABEL: umin_i64_varying: 17093; GFX7LESS_DPP: ; %bb.0: ; %entry 17094; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 17095; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 17096; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 17097; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) 17098; GFX7LESS_DPP-NEXT: ds_min_rtn_u64 v[0:1], v1, v[0:1] 17099; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) 17100; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000 17101; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1 17102; GFX7LESS_DPP-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 17103; GFX7LESS_DPP-NEXT: s_endpgm 17104; 17105; GFX8_DPP-LABEL: umin_i64_varying: 17106; GFX8_DPP: ; %bb.0: ; %entry 17107; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v6, exec_lo, 0 17108; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0 17109; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v6, exec_hi, v6 17110; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 17111; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, -1, 0, s[0:1] 17112; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v0, s[0:1] 17113; GFX8_DPP-NEXT: v_mov_b32_e32 v5, -1 17114; GFX8_DPP-NEXT: v_mov_b32_e32 v4, -1 17115; GFX8_DPP-NEXT: v_mov_b32_e32 v1, -1 17116; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf 17117; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf 17118; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[4:5] 17119; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc 17120; GFX8_DPP-NEXT: 
v_cndmask_b32_e32 v2, v4, v2, vcc 17121; GFX8_DPP-NEXT: v_mov_b32_e32 v5, -1 17122; GFX8_DPP-NEXT: v_mov_b32_e32 v4, -1 17123; GFX8_DPP-NEXT: s_nop 0 17124; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf 17125; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:2 row_mask:0xf bank_mask:0xf 17126; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[4:5] 17127; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc 17128; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 17129; GFX8_DPP-NEXT: v_mov_b32_e32 v5, -1 17130; GFX8_DPP-NEXT: v_mov_b32_e32 v4, -1 17131; GFX8_DPP-NEXT: s_nop 0 17132; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf 17133; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf 17134; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[4:5] 17135; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc 17136; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 17137; GFX8_DPP-NEXT: v_mov_b32_e32 v5, -1 17138; GFX8_DPP-NEXT: v_mov_b32_e32 v4, -1 17139; GFX8_DPP-NEXT: s_nop 0 17140; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf 17141; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:8 row_mask:0xf bank_mask:0xf 17142; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[4:5] 17143; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc 17144; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 17145; GFX8_DPP-NEXT: v_mov_b32_e32 v5, -1 17146; GFX8_DPP-NEXT: v_mov_b32_e32 v4, -1 17147; GFX8_DPP-NEXT: s_nop 0 17148; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf 17149; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 17150; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[4:5] 17151; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc 17152; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 17153; GFX8_DPP-NEXT: v_mov_b32_e32 v5, -1 17154; GFX8_DPP-NEXT: v_mov_b32_e32 v4, -1 17155; GFX8_DPP-NEXT: s_nop 0 17156; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 
row_mask:0xc bank_mask:0xf 17157; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 17158; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[4:5] 17159; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc 17160; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc 17161; GFX8_DPP-NEXT: v_mov_b32_e32 v2, -1 17162; GFX8_DPP-NEXT: v_readlane_b32 s3, v3, 63 17163; GFX8_DPP-NEXT: v_readlane_b32 s2, v4, 63 17164; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v3 wave_shr:1 row_mask:0xf bank_mask:0xf 17165; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v4 wave_shr:1 row_mask:0xf bank_mask:0xf 17166; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] 17167; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 17168; GFX8_DPP-NEXT: ; implicit-def: $vgpr6_vgpr7 17169; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc 17170; GFX8_DPP-NEXT: s_cbranch_execz .LBB32_2 17171; GFX8_DPP-NEXT: ; %bb.1: 17172; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s3 17173; GFX8_DPP-NEXT: v_mov_b32_e32 v6, s2 17174; GFX8_DPP-NEXT: s_mov_b32 m0, -1 17175; GFX8_DPP-NEXT: ds_min_rtn_u64 v[6:7], v8, v[6:7] 17176; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) 17177; GFX8_DPP-NEXT: .LBB32_2: 17178; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] 17179; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 17180; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v7 17181; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v6 17182; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v1 17183; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v2 17184; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[6:7] 17185; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s5 17186; GFX8_DPP-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc 17187; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4 17188; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 17189; GFX8_DPP-NEXT: s_mov_b32 s2, -1 17190; GFX8_DPP-NEXT: v_cndmask_b32_e32 v6, v6, v0, vcc 17191; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) 17192; GFX8_DPP-NEXT: buffer_store_dwordx2 v[6:7], off, s[0:3], 0 17193; GFX8_DPP-NEXT: s_endpgm 17194; 17195; GFX9_DPP-LABEL: umin_i64_varying: 17196; GFX9_DPP: ; %bb.0: ; %entry 17197; GFX9_DPP-NEXT: 
v_mbcnt_lo_u32_b32 v6, exec_lo, 0 17198; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0 17199; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v6, exec_hi, v6 17200; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 17201; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, -1, 0, s[0:1] 17202; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v0, s[0:1] 17203; GFX9_DPP-NEXT: v_mov_b32_e32 v5, -1 17204; GFX9_DPP-NEXT: v_mov_b32_e32 v4, -1 17205; GFX9_DPP-NEXT: v_mov_b32_e32 v1, -1 17206; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf 17207; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf 17208; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[4:5] 17209; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc 17210; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 17211; GFX9_DPP-NEXT: v_mov_b32_e32 v5, -1 17212; GFX9_DPP-NEXT: v_mov_b32_e32 v4, -1 17213; GFX9_DPP-NEXT: s_nop 0 17214; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf 17215; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:2 row_mask:0xf bank_mask:0xf 17216; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[4:5] 17217; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc 17218; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 17219; GFX9_DPP-NEXT: v_mov_b32_e32 v5, -1 17220; GFX9_DPP-NEXT: v_mov_b32_e32 v4, -1 17221; GFX9_DPP-NEXT: s_nop 0 17222; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf 17223; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf 17224; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[4:5] 17225; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc 17226; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 17227; GFX9_DPP-NEXT: v_mov_b32_e32 v5, -1 17228; GFX9_DPP-NEXT: v_mov_b32_e32 v4, -1 17229; GFX9_DPP-NEXT: s_nop 0 17230; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf 17231; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:8 row_mask:0xf bank_mask:0xf 17232; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[4:5] 17233; 
GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc 17234; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 17235; GFX9_DPP-NEXT: v_mov_b32_e32 v5, -1 17236; GFX9_DPP-NEXT: v_mov_b32_e32 v4, -1 17237; GFX9_DPP-NEXT: s_nop 0 17238; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf 17239; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 17240; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[4:5] 17241; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc 17242; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 17243; GFX9_DPP-NEXT: v_mov_b32_e32 v5, -1 17244; GFX9_DPP-NEXT: v_mov_b32_e32 v4, -1 17245; GFX9_DPP-NEXT: s_nop 0 17246; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf 17247; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 17248; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[4:5] 17249; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc 17250; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc 17251; GFX9_DPP-NEXT: v_mov_b32_e32 v2, -1 17252; GFX9_DPP-NEXT: v_readlane_b32 s3, v3, 63 17253; GFX9_DPP-NEXT: v_readlane_b32 s2, v4, 63 17254; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v3 wave_shr:1 row_mask:0xf bank_mask:0xf 17255; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v4 wave_shr:1 row_mask:0xf bank_mask:0xf 17256; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] 17257; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 17258; GFX9_DPP-NEXT: ; implicit-def: $vgpr6_vgpr7 17259; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc 17260; GFX9_DPP-NEXT: s_cbranch_execz .LBB32_2 17261; GFX9_DPP-NEXT: ; %bb.1: 17262; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s3 17263; GFX9_DPP-NEXT: v_mov_b32_e32 v6, s2 17264; GFX9_DPP-NEXT: ds_min_rtn_u64 v[6:7], v8, v[6:7] 17265; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) 17266; GFX9_DPP-NEXT: .LBB32_2: 17267; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] 17268; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 17269; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v7 17270; GFX9_DPP-NEXT: 
v_readfirstlane_b32 s4, v6 17271; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v1 17272; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v2 17273; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[6:7] 17274; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s5 17275; GFX9_DPP-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc 17276; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4 17277; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 17278; GFX9_DPP-NEXT: s_mov_b32 s2, -1 17279; GFX9_DPP-NEXT: v_cndmask_b32_e32 v6, v6, v0, vcc 17280; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) 17281; GFX9_DPP-NEXT: buffer_store_dwordx2 v[6:7], off, s[0:3], 0 17282; GFX9_DPP-NEXT: s_endpgm 17283; 17284; GFX1064_DPP-LABEL: umin_i64_varying: 17285; GFX1064_DPP: ; %bb.0: ; %entry 17286; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 17287; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v2, -1, 0, s[0:1] 17288; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, -1 17289; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, -1 17290; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[0:1] 17291; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, -1 17292; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, -1 17293; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf 17294; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 17295; GFX1064_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[3:4] 17296; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 17297; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 17298; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, -1 17299; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, -1 17300; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf 17301; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf 17302; GFX1064_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6] 17303; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc 17304; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc 17305; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, -1 17306; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, -1 17307; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf 
bank_mask:0xf 17308; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf 17309; GFX1064_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[3:4] 17310; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 17311; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 17312; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, -1 17313; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, -1 17314; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf 17315; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf 17316; GFX1064_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6] 17317; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc 17318; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc 17319; GFX1064_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 17320; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v1, -1, -1 17321; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 17322; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 17323; GFX1064_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[3:4] 17324; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 17325; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 17326; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, -1 17327; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, -1 17328; GFX1064_DPP-NEXT: v_readlane_b32 s2, v2, 31 17329; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 31 17330; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, s2 17331; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, s3 17332; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 17333; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 17334; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, -1 17335; GFX1064_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[3:4] 17336; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 17337; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, -1 17338; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 17339; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] 17340; 
GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 17341; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 17342; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf 17343; GFX1064_DPP-NEXT: v_readlane_b32 s2, v2, 15 17344; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf 17345; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 15 17346; GFX1064_DPP-NEXT: v_readlane_b32 s6, v2, 31 17347; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 31 17348; GFX1064_DPP-NEXT: v_writelane_b32 v5, s2, 16 17349; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 63 17350; GFX1064_DPP-NEXT: v_writelane_b32 v4, s3, 16 17351; GFX1064_DPP-NEXT: v_readlane_b32 s8, v2, 47 17352; GFX1064_DPP-NEXT: v_readlane_b32 s3, v2, 63 17353; GFX1064_DPP-NEXT: v_readlane_b32 s9, v1, 47 17354; GFX1064_DPP-NEXT: v_writelane_b32 v5, s6, 32 17355; GFX1064_DPP-NEXT: v_writelane_b32 v4, s7, 32 17356; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] 17357; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 17358; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 17359; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 17360; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] 17361; GFX1064_DPP-NEXT: v_writelane_b32 v5, s8, 48 17362; GFX1064_DPP-NEXT: v_writelane_b32 v4, s9, 48 17363; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] 17364; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 17365; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 17366; GFX1064_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 17367; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc 17368; GFX1064_DPP-NEXT: s_cbranch_execz .LBB32_2 17369; GFX1064_DPP-NEXT: ; %bb.1: 17370; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s1 17371; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, s0 17372; GFX1064_DPP-NEXT: ds_min_rtn_u64 v[7:8], v0, v[7:8] 17373; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) 17374; GFX1064_DPP-NEXT: buffer_gl0_inv 17375; GFX1064_DPP-NEXT: .LBB32_2: 17376; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 17377; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[6:7] 17378; GFX1064_DPP-NEXT: s_load_dwordx2 
s[0:1], s[4:5], 0x24 17379; GFX1064_DPP-NEXT: s_mov_b32 null, 0 17380; GFX1064_DPP-NEXT: v_readfirstlane_b32 s5, v8 17381; GFX1064_DPP-NEXT: v_readfirstlane_b32 s4, v7 17382; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v4 17383; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v5 17384; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 17385; GFX1064_DPP-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[7:8] 17386; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s5, vcc 17387; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s4, vcc 17388; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) 17389; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 17390; GFX1064_DPP-NEXT: s_endpgm 17391; 17392; GFX1032_DPP-LABEL: umin_i64_varying: 17393; GFX1032_DPP: ; %bb.0: ; %entry 17394; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 17395; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, -1, 0, s2 17396; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, -1 17397; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, -1 17398; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s2 17399; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, -1 17400; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, -1 17401; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf 17402; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 17403; GFX1032_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[1:2], v[3:4] 17404; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo 17405; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo 17406; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, -1 17407; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, -1 17408; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf 17409; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf 17410; GFX1032_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[1:2], v[5:6] 17411; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo 17412; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo 17413; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, -1 17414; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, -1 17415; GFX1032_DPP-NEXT: 
v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf 17416; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf 17417; GFX1032_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[1:2], v[3:4] 17418; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo 17419; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo 17420; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, -1 17421; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, -1 17422; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf 17423; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf 17424; GFX1032_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[1:2], v[5:6] 17425; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo 17426; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo 17427; GFX1032_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 17428; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v1, -1, -1 17429; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 17430; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 17431; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, -1 17432; GFX1032_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[1:2], v[3:4] 17433; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo 17434; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo 17435; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, -1 17436; GFX1032_DPP-NEXT: v_readlane_b32 s3, v2, 15 17437; GFX1032_DPP-NEXT: v_readlane_b32 s1, v2, 31 17438; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31 17439; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf 17440; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf 17441; GFX1032_DPP-NEXT: v_readlane_b32 s6, v1, 15 17442; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 17443; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 17444; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 17445; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 17446; GFX1032_DPP-NEXT: v_writelane_b32 v5, s3, 16 17447; 
GFX1032_DPP-NEXT: v_writelane_b32 v4, s6, 16 17448; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 17449; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 17450; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 17451; GFX1032_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 17452; GFX1032_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo 17453; GFX1032_DPP-NEXT: s_cbranch_execz .LBB32_2 17454; GFX1032_DPP-NEXT: ; %bb.1: 17455; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, s1 17456; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, s0 17457; GFX1032_DPP-NEXT: ds_min_rtn_u64 v[7:8], v0, v[7:8] 17458; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) 17459; GFX1032_DPP-NEXT: buffer_gl0_inv 17460; GFX1032_DPP-NEXT: .LBB32_2: 17461; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 17462; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 17463; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 17464; GFX1032_DPP-NEXT: s_mov_b32 null, 0 17465; GFX1032_DPP-NEXT: v_readfirstlane_b32 s5, v8 17466; GFX1032_DPP-NEXT: v_readfirstlane_b32 s4, v7 17467; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v4 17468; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v5 17469; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 17470; GFX1032_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[4:5], v[7:8] 17471; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s5, vcc_lo 17472; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s4, vcc_lo 17473; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) 17474; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 17475; GFX1032_DPP-NEXT: s_endpgm 17476; 17477; GFX1164_DPP-LABEL: umin_i64_varying: 17478; GFX1164_DPP: ; %bb.0: ; %entry 17479; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 17480; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 17481; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 17482; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v2, -1, 0, s[0:1] 17483; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, -1 17484; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, -1 17485; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[0:1] 17486; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, -1 17487; GFX1164_DPP-NEXT: 
v_mov_b32_e32 v5, -1 17488; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf 17489; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) 17490; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 17491; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[3:4] 17492; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 17493; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 17494; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, -1 17495; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, -1 17496; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 17497; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf 17498; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf 17499; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) 17500; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6] 17501; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc 17502; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc 17503; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, -1 17504; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, -1 17505; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf 17506; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) 17507; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf 17508; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[3:4] 17509; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 17510; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 17511; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, -1 17512; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, -1 17513; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 17514; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf 17515; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf 17516; 
GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) 17517; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6] 17518; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc 17519; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc 17520; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 17521; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 17522; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v1, -1, -1 17523; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 17524; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 17525; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 17526; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[3:4] 17527; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 17528; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 17529; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, -1 17530; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, -1 17531; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 17532; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 31 17533; GFX1164_DPP-NEXT: v_readlane_b32 s3, v1, 31 17534; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 17535; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, s2 17536; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s3 17537; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 17538; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 17539; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 17540; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, -1 17541; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) 17542; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[3:4] 17543; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 17544; GFX1164_DPP-NEXT: 
v_mov_b32_e32 v4, -1 17545; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 17546; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] 17547; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 17548; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 17549; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf 17550; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 15 17551; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf 17552; GFX1164_DPP-NEXT: v_readlane_b32 s3, v1, 15 17553; GFX1164_DPP-NEXT: v_readlane_b32 s6, v2, 31 17554; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 31 17555; GFX1164_DPP-NEXT: v_writelane_b32 v5, s2, 16 17556; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 63 17557; GFX1164_DPP-NEXT: v_writelane_b32 v4, s3, 16 17558; GFX1164_DPP-NEXT: v_readlane_b32 s8, v2, 47 17559; GFX1164_DPP-NEXT: v_readlane_b32 s3, v2, 63 17560; GFX1164_DPP-NEXT: v_readlane_b32 s9, v1, 47 17561; GFX1164_DPP-NEXT: v_writelane_b32 v5, s6, 32 17562; GFX1164_DPP-NEXT: v_writelane_b32 v4, s7, 32 17563; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] 17564; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 17565; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 17566; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 17567; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 17568; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] 17569; GFX1164_DPP-NEXT: v_writelane_b32 v5, s8, 48 17570; GFX1164_DPP-NEXT: v_writelane_b32 v4, s9, 48 17571; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] 17572; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 17573; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 17574; GFX1164_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 17575; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc 17576; GFX1164_DPP-NEXT: s_cbranch_execz .LBB32_2 17577; GFX1164_DPP-NEXT: ; %bb.1: 17578; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s1 17579; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s0 17580; GFX1164_DPP-NEXT: ds_min_rtn_u64 v[7:8], v0, v[7:8] 17581; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) 17582; 
GFX1164_DPP-NEXT: buffer_gl0_inv 17583; GFX1164_DPP-NEXT: .LBB32_2: 17584; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[6:7] 17585; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 17586; GFX1164_DPP-NEXT: v_readfirstlane_b32 s5, v8 17587; GFX1164_DPP-NEXT: v_readfirstlane_b32 s4, v7 17588; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v4 17589; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v5 17590; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 17591; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) 17592; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[7:8] 17593; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s5, vcc 17594; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s4, vcc 17595; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) 17596; GFX1164_DPP-NEXT: buffer_store_b64 v[7:8], off, s[0:3], 0 17597; GFX1164_DPP-NEXT: s_endpgm 17598; 17599; GFX1132_DPP-LABEL: umin_i64_varying: 17600; GFX1132_DPP: ; %bb.0: ; %entry 17601; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 17602; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 17603; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) 17604; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, -1, 0, s2 17605; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, -1 :: v_dual_mov_b32 v3, -1 17606; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s2 17607; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, -1 :: v_dual_mov_b32 v5, -1 17608; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 17609; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf 17610; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 17611; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) 17612; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[1:2], v[3:4] 17613; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v2, v4, v2 :: v_dual_cndmask_b32 v1, v3, v1 17614; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, -1 :: v_dual_mov_b32 v3, -1 17615; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 
row_mask:0xf bank_mask:0xf 17616; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 17617; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf 17618; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[1:2], v[5:6] 17619; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v2, v6, v2 :: v_dual_cndmask_b32 v1, v5, v1 17620; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, -1 :: v_dual_mov_b32 v5, -1 17621; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) 17622; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf 17623; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf 17624; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) 17625; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[1:2], v[3:4] 17626; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v2, v4, v2 :: v_dual_cndmask_b32 v1, v3, v1 17627; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, -1 :: v_dual_mov_b32 v3, -1 17628; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf 17629; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 17630; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf 17631; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[1:2], v[5:6] 17632; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v2, v6, v2 :: v_dual_cndmask_b32 v1, v5, v1 17633; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) 17634; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 17635; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v1, -1, -1 17636; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 17637; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 17638; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 17639; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_2) | instid1(VALU_DEP_2) 17640; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[1:2], v[3:4] 17641; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, -1 :: v_dual_cndmask_b32 v2, v4, v2 17642; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, -1 :: v_dual_cndmask_b32 v1, v3, v1 17643; GFX1132_DPP-NEXT: v_readlane_b32 s3, v2, 15 17644; GFX1132_DPP-NEXT: v_readlane_b32 s1, v2, 31 17645; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) 17646; GFX1132_DPP-NEXT: v_readlane_b32 s0, v1, 31 17647; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf 17648; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf 17649; GFX1132_DPP-NEXT: v_readlane_b32 s6, v1, 15 17650; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 17651; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 17652; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 17653; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 17654; GFX1132_DPP-NEXT: v_writelane_b32 v5, s3, 16 17655; GFX1132_DPP-NEXT: v_writelane_b32 v4, s6, 16 17656; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 17657; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 17658; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 17659; GFX1132_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 17660; GFX1132_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo 17661; GFX1132_DPP-NEXT: s_cbranch_execz .LBB32_2 17662; GFX1132_DPP-NEXT: ; %bb.1: 17663; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0 17664; GFX1132_DPP-NEXT: ds_min_rtn_u64 v[7:8], v0, v[7:8] 17665; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) 17666; GFX1132_DPP-NEXT: buffer_gl0_inv 17667; GFX1132_DPP-NEXT: .LBB32_2: 17668; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 17669; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 17670; GFX1132_DPP-NEXT: v_readfirstlane_b32 s5, v8 17671; GFX1132_DPP-NEXT: v_readfirstlane_b32 s4, v7 17672; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v4 17673; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v5 17674; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 17675; 
GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) 17676; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[4:5], v[7:8] 17677; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s5, vcc_lo 17678; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s4, vcc_lo 17679; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) 17680; GFX1132_DPP-NEXT: buffer_store_b64 v[7:8], off, s[0:3], 0 17681; GFX1132_DPP-NEXT: s_endpgm 17682entry: 17683 %lane = call i32 @llvm.amdgcn.workitem.id.x() 17684 %lane_ext = zext i32 %lane to i64 17685 %old = atomicrmw umin ptr addrspace(3) @local_var32, i64 %lane_ext acq_rel 17686 store i64 %old, ptr addrspace(1) %out 17687 ret void 17688} 17689