; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

; Codegen test: compiles the module for gfx1010 and gfx1100 and checks that
; v_fmac_f16 appears at least twice in the code generated for @addMul2D.
; Both targets are verified against the same check lines, so a single shared
; prefix is used instead of a target-specific one.

; @addMul2D walks a two-level loop nest. The outer trip count is element 1 of
; %arg2 and the inner trip count is element 0 of %arg2. Each inner iteration
;   * loads a <4 x i8> from %arg at index (inner + outer * %arg3) and converts
;     it to <4 x half> through @_Z13convert_half4Dv4_h,
;   * loads a float from %arg1 at index (inner + outer * %arg2[0]), truncates
;     it to half and splats it across four lanes,
;   * accumulates lane-wise with llvm.fmuladd.f16 into the running <4 x half>.
; The accumulated vector is returned; it is zeroinitializer when the outer
; bound is not positive.

; GCN-LABEL: {{^}}addMul2D:
; GCN: v_fmac_f16
; GCN: v_fmac_f16
define hidden <4 x half> @addMul2D(ptr nocapture readonly %arg, ptr addrspace(4) nocapture readonly %arg1, <2 x i32> %arg2, i32 %arg3) local_unnamed_addr #0 {
bb:
  ; Outer loop bound; skip everything when it is not positive.
  %tmp = extractelement <2 x i32> %arg2, i64 1
  %tmp4 = icmp sgt i32 %tmp, 0
  br i1 %tmp4, label %bb5, label %bb36

bb5:                                              ; preds = %bb
  ; Inner loop bound; the "is positive" test is loop-invariant and computed once.
  %tmp6 = extractelement <2 x i32> %arg2, i64 0
  %tmp7 = icmp sgt i32 %tmp6, 0
  br label %bb8

bb8:                                              ; preds = %bb32, %bb5
  ; Outer loop header: %tmp9 is the accumulator, %tmp10 the outer index.
  %tmp9 = phi <4 x half> [ zeroinitializer, %bb5 ], [ %tmp33, %bb32 ]
  %tmp10 = phi i32 [ 0, %bb5 ], [ %tmp34, %bb32 ]
  br i1 %tmp7, label %bb11, label %bb32

bb11:                                             ; preds = %bb8
  ; Row offsets into the two inputs for this outer iteration.
  %tmp12 = mul nsw i32 %tmp10, %arg3
  %tmp13 = mul nsw i32 %tmp10, %tmp6
  br label %bb14

bb14:                                             ; preds = %bb14, %bb11
  ; Inner loop body: %tmp15 is the accumulator, %tmp16 the inner index.
  %tmp15 = phi <4 x half> [ %tmp9, %bb11 ], [ %tmp29, %bb14 ]
  %tmp16 = phi i32 [ 0, %bb11 ], [ %tmp30, %bb14 ]
  ; First operand: <4 x i8> element of %arg converted to <4 x half>.
  %tmp17 = add nsw i32 %tmp16, %tmp12
  %tmp18 = sext i32 %tmp17 to i64
  %tmp19 = getelementptr inbounds <4 x i8>, ptr %arg, i64 %tmp18
  %tmp20 = load <4 x i8>, ptr %tmp19, align 4
  %tmp21 = tail call <4 x half> @_Z13convert_half4Dv4_h(<4 x i8> %tmp20)
  ; Second operand: float element of %arg1 truncated to half and splatted.
  %tmp22 = add nsw i32 %tmp16, %tmp13
  %tmp23 = sext i32 %tmp22 to i64
  %tmp24 = getelementptr inbounds float, ptr addrspace(4) %arg1, i64 %tmp23
  %tmp25 = load float, ptr addrspace(4) %tmp24, align 4
  %tmp26 = fptrunc float %tmp25 to half
  %tmp27 = insertelement <4 x half> poison, half %tmp26, i32 0
  %tmp28 = shufflevector <4 x half> %tmp27, <4 x half> poison, <4 x i32> zeroinitializer
  ; Scalarized multiply-accumulate, one llvm.fmuladd.f16 call per lane.
  %vec.A.0 = extractelement <4 x half> %tmp21, i32 0
  %vec.B.0 = extractelement <4 x half> %tmp28, i32 0
  %vec.C.0 = extractelement <4 x half> %tmp15, i32 0
  %vec.res.0 = tail call half @llvm.fmuladd.f16(half %vec.A.0, half %vec.B.0, half %vec.C.0)
  %vec.A.1 = extractelement <4 x half> %tmp21, i32 1
  %vec.B.1 = extractelement <4 x half> %tmp28, i32 1
  %vec.C.1 = extractelement <4 x half> %tmp15, i32 1
  %vec.res.1 = tail call half @llvm.fmuladd.f16(half %vec.A.1, half %vec.B.1, half %vec.C.1)
  %vec.A.2 = extractelement <4 x half> %tmp21, i32 2
  %vec.B.2 = extractelement <4 x half> %tmp28, i32 2
  %vec.C.2 = extractelement <4 x half> %tmp15, i32 2
  %vec.res.2 = tail call half @llvm.fmuladd.f16(half %vec.A.2, half %vec.B.2, half %vec.C.2)
  %vec.A.3 = extractelement <4 x half> %tmp21, i32 3
  %vec.B.3 = extractelement <4 x half> %tmp28, i32 3
  %vec.C.3 = extractelement <4 x half> %tmp15, i32 3
  %vec.res.3 = tail call half @llvm.fmuladd.f16(half %vec.A.3, half %vec.B.3, half %vec.C.3)
  ; Reassemble the four lane results; every lane of the placeholder is overwritten.
  %full.res.0 = insertelement <4 x half> poison, half %vec.res.0, i32 0
  %full.res.1 = insertelement <4 x half> %full.res.0, half %vec.res.1, i32 1
  %full.res.2 = insertelement <4 x half> %full.res.1, half %vec.res.2, i32 2
  %tmp29 = insertelement <4 x half> %full.res.2, half %vec.res.3, i32 3
  %tmp30 = add nuw nsw i32 %tmp16, 1
  %tmp31 = icmp eq i32 %tmp30, %tmp6
  br i1 %tmp31, label %bb32, label %bb14

bb32:                                             ; preds = %bb14, %bb8
  ; Outer loop latch: carry the accumulator and advance the outer index.
  %tmp33 = phi <4 x half> [ %tmp9, %bb8 ], [ %tmp29, %bb14 ]
  %tmp34 = add nuw nsw i32 %tmp10, 1
  %tmp35 = icmp eq i32 %tmp34, %tmp
  br i1 %tmp35, label %bb36, label %bb8

bb36:                                             ; preds = %bb32, %bb
  %tmp37 = phi <4 x half> [ zeroinitializer, %bb ], [ %tmp33, %bb32 ]
  ret <4 x half> %tmp37
}

; Converts each of the four i8 lanes of %arg to half with an unsigned
; int-to-float conversion and returns them in the same lane order.
; Function Attrs: norecurse nounwind readnone
define linkonce_odr hidden <4 x half> @_Z13convert_half4Dv4_h(<4 x i8> %arg) local_unnamed_addr #1 {
bb:
  %tmp = extractelement <4 x i8> %arg, i64 0
  %tmp1 = uitofp i8 %tmp to half
  %tmp2 = insertelement <4 x half> poison, half %tmp1, i32 0
  %tmp3 = extractelement <4 x i8> %arg, i64 1
  %tmp4 = uitofp i8 %tmp3 to half
  %tmp5 = insertelement <4 x half> %tmp2, half %tmp4, i32 1
  %tmp6 = extractelement <4 x i8> %arg, i64 2
  %tmp7 = uitofp i8 %tmp6 to half
  %tmp8 = insertelement <4 x half> %tmp5, half %tmp7, i32 2
  %tmp9 = extractelement <4 x i8> %arg, i64 3
  %tmp10 = uitofp i8 %tmp9 to half
  %tmp11 = insertelement <4 x half> %tmp8, half %tmp10, i32 3
  ret <4 x half> %tmp11
}

declare half @llvm.fmuladd.f16(half, half, half)

attributes #0 = { convergent nounwind readonly}
attributes #1 = { norecurse nounwind readnone }