; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tonga -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s

; GlobalISel selection of llvm.amdgcn.update.dpp for 32-bit and 64-bit
; integer, floating-point, vector and pointer types. Every call below passes
; the immediates (i32 1, i32 1, i32 1, i1 false), which show up in the expected
; output as "quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1".
; The DPP combiner is disabled on every llc invocation above.

; 32-bit integer case: both operands are kernel arguments, and a single
; v_mov_b32_dpp is expected.
define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) {
; GFX8-LABEL: dpp_test:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v2, s2
; GFX8-NEXT:    v_mov_b32_e32 v0, s3
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX10-LABEL: dpp_test:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-NEXT:    v_mov_b32_e32 v1, s3
; GFX10-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: dpp_test:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT:    v_mov_b32_e32 v1, 0
; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT:    s_endpgm
  %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 1, i32 1, i32 1, i1 false)
  store i32 %tmp0, ptr addrspace(1) %out
  ret void
}

; 64-bit integer case: the source value is loaded from memory, and the expected
; output contains two v_mov_b32_dpp instructions, one per 32-bit half.
define amdgpu_kernel void @update_dppi64_test(ptr addrspace(1) %arg, i64 %in1, i64 %in2) {
; GFX8-LABEL: update_dppi64_test:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v4, s2
; GFX8-NEXT:    v_mov_b32_e32 v5, s3
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NEXT:    v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
; GFX8-NEXT:    s_endpgm
;
; GFX10-LABEL: update_dppi64_test:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1]
; GFX10-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: update_dppi64_test:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_dual_mov_b32 v3, s3 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
; GFX11-NEXT:    v_mov_b32_e32 v2, s2
; GFX11-NEXT:    global_load_b64 v[0:1], v4, s[0:1]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT:    global_store_b64 v4, v[2:3], s[0:1]
; GFX11-NEXT:    s_endpgm
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %id
  %load = load i64, ptr addrspace(1) %gep
  %tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 %in1, i64 %load, i32 1, i32 1, i32 1, i1 false) #1
  store i64 %tmp0, ptr addrspace(1) %gep
  ret void
}

; double case: expected output is identical in shape to the i64 test.
define amdgpu_kernel void @update_dppf64_test(ptr addrspace(1) %arg, double %in1, double %in2) {
; GFX8-LABEL: update_dppf64_test:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v4, s2
; GFX8-NEXT:    v_mov_b32_e32 v5, s3
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NEXT:    v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
; GFX8-NEXT:    s_endpgm
;
; GFX10-LABEL: update_dppf64_test:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1]
; GFX10-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: update_dppf64_test:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_dual_mov_b32 v3, s3 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
; GFX11-NEXT:    v_mov_b32_e32 v2, s2
; GFX11-NEXT:    global_load_b64 v[0:1], v4, s[0:1]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT:    global_store_b64 v4, v[2:3], s[0:1]
; GFX11-NEXT:    s_endpgm
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds double, ptr addrspace(1) %arg, i32 %id
  %load = load double, ptr addrspace(1) %gep
  %tmp0 = call double @llvm.amdgcn.update.dpp.f64(double %in1, double %load, i32 1, i32 1, i32 1, i1 false) #1
  store double %tmp0, ptr addrspace(1) %gep
  ret void
}

; <2 x i32> case: one v_mov_b32_dpp per vector element is expected.
define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32> %in1, <2 x i32> %in2) {
; GFX8-LABEL: update_dppv2i32_test:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v4, s2
; GFX8-NEXT:    v_mov_b32_e32 v5, s3
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NEXT:    v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
; GFX8-NEXT:    s_endpgm
;
; GFX10-LABEL: update_dppv2i32_test:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1]
; GFX10-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: update_dppv2i32_test:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_dual_mov_b32 v3, s3 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
; GFX11-NEXT:    v_mov_b32_e32 v2, s2
; GFX11-NEXT:    global_load_b64 v[0:1], v4, s[0:1]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT:    global_store_b64 v4, v[2:3], s[0:1]
; GFX11-NEXT:    s_endpgm
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x i32>, ptr addrspace(1) %arg, i32 %id
  %load = load <2 x i32>, ptr addrspace(1) %gep
  %tmp0 = call <2 x i32> @llvm.amdgcn.update.dpp.v2i32(<2 x i32> %in1, <2 x i32> %load, i32 1, i32 1, i32 1, i1 false) #1
  store <2 x i32> %tmp0, ptr addrspace(1) %gep
  ret void
}

; <2 x float> case: one v_mov_b32_dpp per vector element is expected.
define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x float> %in1, <2 x float> %in2) {
; GFX8-LABEL: update_dppv2f32_test:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v4, s2
; GFX8-NEXT:    v_mov_b32_e32 v5, s3
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NEXT:    v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
; GFX8-NEXT:    s_endpgm
;
; GFX10-LABEL: update_dppv2f32_test:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1]
; GFX10-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: update_dppv2f32_test:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_dual_mov_b32 v3, s3 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
; GFX11-NEXT:    v_mov_b32_e32 v2, s2
; GFX11-NEXT:    global_load_b64 v[0:1], v4, s[0:1]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT:    global_store_b64 v4, v[2:3], s[0:1]
; GFX11-NEXT:    s_endpgm
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %arg, i32 %id
  %load = load <2 x float>, ptr addrspace(1) %gep
  %tmp0 = call <2 x float> @llvm.amdgcn.update.dpp.v2f32(<2 x float> %in1, <2 x float> %load, i32 1, i32 1, i32 1, i1 false) #1
  store <2 x float> %tmp0, ptr addrspace(1) %gep
  ret void
}

; Default address space pointer case: expected output has the same shape as
; the i64 test (two v_mov_b32_dpp instructions). The intrinsic is spelled with
; the .p0 suffix so the overload name matches the ptr operand type, in line
; with the .p3 and .p5 tests below.
define amdgpu_kernel void @update_dpp_p0_test(ptr addrspace(1) %arg, ptr %in1, ptr %in2) {
; GFX8-LABEL: update_dpp_p0_test:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v4, s2
; GFX8-NEXT:    v_mov_b32_e32 v5, s3
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NEXT:    v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
; GFX8-NEXT:    s_endpgm
;
; GFX10-LABEL: update_dpp_p0_test:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1]
; GFX10-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: update_dpp_p0_test:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_dual_mov_b32 v3, s3 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
; GFX11-NEXT:    v_mov_b32_e32 v2, s2
; GFX11-NEXT:    global_load_b64 v[0:1], v4, s[0:1]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT:    global_store_b64 v4, v[2:3], s[0:1]
; GFX11-NEXT:    s_endpgm
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds ptr, ptr addrspace(1) %arg, i32 %id
  %load = load ptr, ptr addrspace(1) %gep
  %tmp0 = call ptr @llvm.amdgcn.update.dpp.p0(ptr %in1, ptr %load, i32 1, i32 1, i32 1, i1 false) #1
  store ptr %tmp0, ptr addrspace(1) %gep
  ret void
}

; addrspace(3) pointer case: the value is read and written with DS
; instructions and a single v_mov_b32_dpp is expected.
define amdgpu_kernel void @update_dpp_p3_test(ptr addrspace(3) %arg, ptr addrspace(3) %in1, ptr %in2) {
; GFX8-LABEL: update_dpp_p3_test:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX8-NEXT:    s_mov_b32 m0, -1
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; GFX8-NEXT:    ds_read_b32 v1, v0
; GFX8-NEXT:    v_mov_b32_e32 v2, s1
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_nop 0
; GFX8-NEXT:    v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NEXT:    ds_write_b32 v0, v2
; GFX8-NEXT:    s_endpgm
;
; GFX10-LABEL: update_dpp_p3_test:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_add_nc_u32_e32 v0, s0, v0
; GFX10-NEXT:    v_mov_b32_e32 v2, s1
; GFX10-NEXT:    ds_read_b32 v1, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT:    ds_write_b32 v0, v2
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: update_dpp_p3_test:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_add_nc_u32_e32 v0, s0, v0
; GFX11-NEXT:    v_mov_b32_e32 v2, s1
; GFX11-NEXT:    ds_load_b32 v1, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT:    ds_store_b32 v0, v2
; GFX11-NEXT:    s_endpgm
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) %arg, i32 %id
  %load = load ptr addrspace(3), ptr addrspace(3) %gep
  %tmp0 = call ptr addrspace(3) @llvm.amdgcn.update.dpp.p3(ptr addrspace(3) %in1, ptr addrspace(3) %load, i32 1, i32 1, i32 1, i1 false) #1
  store ptr addrspace(3) %tmp0, ptr addrspace(3) %gep
  ret void
}

; addrspace(5) pointer case: the value is read and written with buffer
; (tonga, gfx1010) or scratch (gfx1100) instructions and a single
; v_mov_b32_dpp is expected.
define amdgpu_kernel void @update_dpp_p5_test(ptr addrspace(5) %arg, ptr addrspace(5) %in1, ptr %in2) {
; GFX8-LABEL: update_dpp_p5_test:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8-NEXT:    s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GFX8-NEXT:    s_mov_b32 s90, -1
; GFX8-NEXT:    s_mov_b32 s91, 0xe80000
; GFX8-NEXT:    s_add_u32 s88, s88, s11
; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX8-NEXT:    s_addc_u32 s89, s89, 0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; GFX8-NEXT:    buffer_load_dword v1, v0, s[88:91], 0 offen
; GFX8-NEXT:    v_mov_b32_e32 v2, s1
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    s_nop 0
; GFX8-NEXT:    v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NEXT:    buffer_store_dword v2, v0, s[88:91], 0 offen
; GFX8-NEXT:    s_endpgm
;
; GFX10-LABEL: update_dpp_p5_test:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX10-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX10-NEXT:    s_mov_b32 s14, -1
; GFX10-NEXT:    s_mov_b32 s15, 0x31c16000
; GFX10-NEXT:    s_add_u32 s12, s12, s11
; GFX10-NEXT:    s_addc_u32 s13, s13, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_add_nc_u32_e32 v0, s0, v0
; GFX10-NEXT:    v_mov_b32_e32 v2, s1
; GFX10-NEXT:    buffer_load_dword v1, v0, s[12:15], 0 offen
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT:    buffer_store_dword v2, v0, s[12:15], 0 offen
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: update_dpp_p5_test:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_add_nc_u32_e32 v0, s0, v0
; GFX11-NEXT:    v_mov_b32_e32 v2, s1
; GFX11-NEXT:    scratch_load_b32 v1, v0, off
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT:    scratch_store_b32 v0, v2, off
; GFX11-NEXT:    s_endpgm
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds ptr addrspace(5), ptr addrspace(5) %arg, i32 %id
  %load = load ptr addrspace(5), ptr addrspace(5) %gep
  %tmp0 = call ptr addrspace(5) @llvm.amdgcn.update.dpp.p5(ptr addrspace(5) %in1, ptr addrspace(5) %load, i32 1, i32 1, i32 1, i1 false) #1
  store ptr addrspace(5) %tmp0, ptr addrspace(5) %gep
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #0
declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32 immarg, i32 immarg, i32 immarg, i1 immarg) #1
declare i64 @llvm.amdgcn.update.dpp.i64(i64, i64, i32 immarg, i32 immarg, i32 immarg, i1 immarg) #1

attributes #0 = { nounwind readnone speculatable }
attributes #1 = { convergent nounwind readnone }