; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX9,GFX906
; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX9,GFX940-SDAG
; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -global-isel -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX9,GFX940-GISEL
; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10
; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10

; Codegen test for the llvm.amdgcn.udot2 intrinsic. Every kernel below calls
; the intrinsic once and the checks verify the operands and modifiers of the
; v_dot2_u32_u16 instruction that is emitted for it. The file is compiled for
; gfx906, gfx940 (once with the default selector and once with -global-isel),
; gfx1011 and gfx1012.

declare i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> %b, i32 %c, i1 %clamp)
declare i32 @llvm.amdgcn.workitem.id.x()

; The last intrinsic argument (%clamp in the declaration) is 1 here, and the
; emitted instruction is expected to end with the "clamp" modifier. The two
; target families differ only in the register class expected for the second
; source operand (VGPR in the GFX9 checks, SGPR in the GFX10 checks).
; GCN-LABEL: {{^}}test_llvm_amdgcn_udot2_clamp:
; GFX9:   v_dot2_u32_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}
; GFX10:  v_dot2_u32_u16 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}
define amdgpu_kernel void @test_llvm_amdgcn_udot2_clamp(
    ptr addrspace(1) %r,
    ptr addrspace(1) %a,
    ptr addrspace(1) %b,
    ptr addrspace(1) %c) {
entry:
  %a.val = load <2 x i16>, ptr addrspace(1) %a
  %b.val = load <2 x i16>, ptr addrspace(1) %b
  %c.val = load i32, ptr addrspace(1) %c
  %r.val = call i32 @llvm.amdgcn.udot2(<2 x i16> %a.val, <2 x i16> %b.val, i32 %c.val, i1 1)
  store i32 %r.val, ptr addrspace(1) %r
  ret void
}

; Same kernel with the last intrinsic argument set to 0. The {{$}} anchor
; directly after the operand list requires that no modifier (in particular no
; "clamp") follows the operands.
; GCN-LABEL: {{^}}test_llvm_amdgcn_udot2_no_clamp:
; GFX9:   v_dot2_u32_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX10:  v_dot2_u32_u16 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}{{$}}
define amdgpu_kernel void @test_llvm_amdgcn_udot2_no_clamp(
    ptr addrspace(1) %r,
    ptr addrspace(1) %a,
    ptr addrspace(1) %b,
    ptr addrspace(1) %c) {
entry:
  %a.val = load <2 x i16>, ptr addrspace(1) %a
  %b.val = load <2 x i16>, ptr addrspace(1) %b
  %c.val = load i32, ptr addrspace(1) %c
  %r.val = call i32 @llvm.amdgcn.udot2(<2 x i16> %a.val, <2 x i16> %b.val, i32 %c.val, i1 0)
  store i32 %r.val, ptr addrspace(1) %r
  ret void
}

; The first vector operand is the constant <1, 1>; the second is a per-workitem
; load of %b whose two lanes are swapped before the call.
;  * gfx906 and the GFX10 targets are expected to use the inline constant 1 and
;    to carry op_sel:[0,1,0] op_sel_hi:[0,0,1] on the instruction (presumably
;    encoding the lane swap and the splat of the constant; confirm against the
;    ISA documentation).
;  * Both gfx940 runs are expected to materialize 0x10001 in a register instead
;    (an SGPR with the default selector, a VGPR with -global-isel) and to emit
;    no op_sel modifiers at all.
; GCN-LABEL: {{^}}test_llvm_amdgcn_udot2_op_sel:
; GFX906: v_dot2_u32_u16 v{{[0-9]+}}, 1, v{{[0-9]+}}, s{{[0-9]+}} op_sel:[0,1,0] op_sel_hi:[0,0,1]{{$}}
; GFX940-SDAG: s_mov_b32 [[K:s[0-9]+]], 0x10001
; GFX940-SDAG: v_dot2_u32_u16 v{{[0-9]+}}, [[K]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX940-GISEL: v_mov_b32_e32 [[K:v[0-9]+]], 0x10001
; GFX940-GISEL: v_dot2_u32_u16 v{{[0-9]+}}, [[K]], v{{[0-9]+}}, s{{[0-9]+}}{{$}}
; GFX10:  v_dot2_u32_u16 v{{[0-9]+}}, 1, v{{[0-9]+}}, s{{[0-9]+}} op_sel:[0,1,0] op_sel_hi:[0,0,1]{{$}}
define amdgpu_kernel void @test_llvm_amdgcn_udot2_op_sel(
    ptr addrspace(1) %r,
    ptr addrspace(1) %b,
    i32 %c) {
entry:
  ; Index %b by the workitem id so the loaded vector differs per lane.
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %b.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %b, i32 %id
  %b.val = load <2 x i16>, ptr addrspace(1) %b.gep
  ; Rebuild the vector with its lanes exchanged: element 1 goes to lane 0 and
  ; element 0 goes to lane 1.
  %b.elt0 = extractelement <2 x i16> %b.val, i32 0
  %b.elt1 = extractelement <2 x i16> %b.val, i32 1
  ; poison (rather than the deprecated undef) is only a placeholder here;
  ; both lanes are overwritten by the two inserts.
  %b0 = insertelement <2 x i16> poison, i16 %b.elt1, i32 0
  %b1 = insertelement <2 x i16> %b0, i16 %b.elt0, i32 1
  %r.val = call i32 @llvm.amdgcn.udot2(<2 x i16> <i16 1, i16 1>, <2 x i16> %b1, i32 %c, i1 0)
  store i32 %r.val, ptr addrspace(1) %r
  ret void
}