; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN
; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN

; Each kernel below loads one i32 per work-item, passes it through
; llvm.amdgcn.update.dpp.i32 (dpp_ctrl 1, row_mask 15, bank_mask 15,
; bound_ctrl set) and feeds the result to a single consumer. The expected
; assembly shows whether the DPP move was folded into that consumer.

; Integer add: the expected output is a single DPP-form add whose operands are
; all the loaded register.
; GCN-LABEL: {{^}}dpp_add:
; GCN: global_load_{{dword|b32}} [[V:v[0-9]+]],
; GCN: v_add_{{(nc_)?}}u32_dpp [[V]], [[V]], [[V]] quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
define amdgpu_kernel void @dpp_add(ptr addrspace(1) %arg) {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %slot = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tid
  %val = load i32, ptr addrspace(1) %slot
  %swz = call i32 @llvm.amdgcn.update.dpp.i32(i32 %val, i32 %val, i32 1, i32 15, i32 15, i1 1) #0
  %sum = add i32 %swz, %val
  store i32 %sum, ptr addrspace(1) %slot
  ret void
}

; Unary float op: the DPP result is reinterpreted as float and rounded up with
; llvm.ceil.f32; the expected output is a DPP-form v_ceil_f32.
; GCN-LABEL: {{^}}dpp_ceil:
; GCN: global_load_{{dword|b32}} [[V:v[0-9]+]],
; GCN: v_ceil_f32_dpp [[V]], [[V]] quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
define amdgpu_kernel void @dpp_ceil(ptr addrspace(1) %arg) {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %slot = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tid
  %val = load i32, ptr addrspace(1) %slot
  %swz = call i32 @llvm.amdgcn.update.dpp.i32(i32 %val, i32 %val, i32 1, i32 15, i32 15, i1 1) #0
  %swz.f = bitcast i32 %swz to float
  %ceil = tail call float @llvm.ceil.f32(float %swz.f)
  %bits = bitcast float %ceil to i32
  store i32 %bits, ptr addrspace(1) %slot
  ret void
}

; Binary float op: the DPP result is added to the loaded value, both
; reinterpreted as float; the expected output is a DPP-form v_add_f32.
; GCN-LABEL: {{^}}dpp_fadd:
; GCN: global_load_{{dword|b32}} [[V:v[0-9]+]],
; GCN: v_add_f32_dpp [[V]], [[V]], [[V]] quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
define amdgpu_kernel void @dpp_fadd(ptr addrspace(1) %arg) {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %slot = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tid
  %val = load i32, ptr addrspace(1) %slot
  %swz = call i32 @llvm.amdgcn.update.dpp.i32(i32 %val, i32 %val, i32 1, i32 15, i32 15, i1 1) #0
  %swz.f = bitcast i32 %swz to float
  %val.f = bitcast i32 %val to float
  %sum = fadd float %swz.f, %val.f
  %bits = bitcast float %sum to i32
  store i32 %bits, ptr addrspace(1) %slot
  ret void
}

; Negative case: no fold is expected here because v_mul_lo_u32 has no e32 or
; dpp form, so a separate v_mov_b32_dpp must remain ahead of the multiply.
; GCN-LABEL: {{^}}dpp_mul:
; GCN: global_load_{{dword|b32}} [[V:v[0-9]+]],
; GCN: v_mov_b32_e32 [[V2:v[0-9]+]], [[V]]
; GCN: v_mov_b32_dpp [[V2]], [[V2]] quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
; GCN: v_mul_lo_u32 [[V]], [[V2]], [[V]]{{$}}
define amdgpu_kernel void @dpp_mul(ptr addrspace(1) %arg) {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %slot = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tid
  %val = load i32, ptr addrspace(1) %slot
  %swz = call i32 @llvm.amdgcn.update.dpp.i32(i32 %val, i32 %val, i32 1, i32 15, i32 15, i1 1)
  %prod = mul i32 %swz, %val
  store i32 %prod, ptr addrspace(1) %slot
  ret void
}

; Intrinsics used by the kernels above.
declare i32 @llvm.amdgcn.workitem.id.x()
declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) #0
declare float @llvm.ceil.f32(float)

attributes #0 = { nounwind readnone convergent }