; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX900
; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GCN-DL-UNSAFE,GFX906-DL-UNSAFE
; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GCN-DL-UNSAFE,GFX10-DL-UNSAFE,GFX10-CONTRACT
; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GCN-DL-UNSAFE,GFX10-DL-UNSAFE,GFX10-CONTRACT
; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906
; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math=preserve-sign -fp-contract=fast -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906-CONTRACT
; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906-DENORM-CONTRACT
; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -mattr="+dot7-insts,-dot10-insts" -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906-DOT10-DISABLED
; (fadd (fmul S1.x, S2.x), (fadd (fmul S1.y, S2.y), z)) -> (fdot2 S1, S2, z)

; Tests to make sure fdot2 is not generated when the vector elements of the
; dot-product expression are not converted from f16 to f32.
; GCN-LABEL: {{^}}dotproduct_f16
; GFX900: v_fma_f16
; GFX900: v_fma_f16

; GFX906: v_mul_f16_e32
; GFX906: v_mul_f16_e32

; GFX906-DL-UNSAFE: v_fma_f16
; GFX10-CONTRACT: v_fmac_f16

; GFX906-CONTRACT: v_mac_f16_e32
; GFX906-DENORM-CONTRACT: v_fma_f16
; GFX906-DOT10-DISABLED: v_fma_f16
define amdgpu_kernel void @dotproduct_f16(ptr addrspace(1) %src1,
                                          ptr addrspace(1) %src2,
                                          ptr addrspace(1) nocapture %dst) {
body:
  ; The whole expression stays in half precision: there is no fpext to float
  ; anywhere, and none of the check lines above expects a dot2 instruction.
  %a.vec = load <2 x half>, ptr addrspace(1) %src1
  %b.vec = load <2 x half>, ptr addrspace(1) %src2

  %a.e0 = extractelement <2 x half> %a.vec, i64 0
  %b.e0 = extractelement <2 x half> %b.vec, i64 0

  %a.e1 = extractelement <2 x half> %a.vec, i64 1
  %b.e1 = extractelement <2 x half> %b.vec, i64 1

  ; dst = a[0]*b[0] + (a[1]*b[1] + dst), all in half.
  %prod.e1 = fmul half %a.e1, %b.e1
  %prod.e0 = fmul half %a.e0, %b.e0
  %acc.in = load half, ptr addrspace(1) %dst, align 2
  %sum.e1 = fadd half %prod.e1, %acc.in
  %sum.all = fadd half %prod.e0, %sum.e1
  store half %sum.all, ptr addrspace(1) %dst, align 2
  ret void
}


; fdot2 should only be generated when all of the following hold:
;   - the vector elements of the dot product are converted from f16 to f32,
;   - the vectors are of type <2 x half>,
;   - "dot10-insts" is enabled.

; GCN-LABEL: {{^}}dotproduct_f16_f32
; GFX900: v_mad_mix_f32
; GFX900: v_mad_mix_f32

; GFX906: v_mad_f32
; GFX906: v_mac_f32_e32

; GFX906-DL-UNSAFE: v_dot2_f32_f16
; GFX10-DL-UNSAFE: v_dot2c_f32_f16

; GFX906-CONTRACT: v_dot2_f32_f16

; GFX906-DENORM-CONTRACT: v_dot2_f32_f16
; GFX906-DOT10-DISABLED: v_fma_mix_f32
define amdgpu_kernel void @dotproduct_f16_f32(ptr addrspace(1) %src1,
                                              ptr addrspace(1) %src2,
                                              ptr addrspace(1) nocapture %dst) {
body:
  %a.vec = load <2 x half>, ptr addrspace(1) %src1
  %b.vec = load <2 x half>, ptr addrspace(1) %src2

  ; Every element is widened from half to float before it is multiplied.
  %a.e0 = extractelement <2 x half> %a.vec, i64 0
  %a.e0.f32 = fpext half %a.e0 to float
  %b.e0 = extractelement <2 x half> %b.vec, i64 0
  %b.e0.f32 = fpext half %b.e0 to float

  %a.e1 = extractelement <2 x half> %a.vec, i64 1
  %a.e1.f32 = fpext half %a.e1 to float
  %b.e1 = extractelement <2 x half> %b.vec, i64 1
  %b.e1.f32 = fpext half %b.e1 to float

  ; dst = a[0]*b[0] + (a[1]*b[1] + dst), accumulated in float.
  %prod.e1 = fmul float %a.e1.f32, %b.e1.f32
  %prod.e0 = fmul float %a.e0.f32, %b.e0.f32
  %acc.in = load float, ptr addrspace(1) %dst, align 4
  %sum.e1 = fadd float %prod.e1, %acc.in
  %sum.all = fadd float %prod.e0, %sum.e1
  store float %sum.all, ptr addrspace(1) %dst, align 4
  ret void
}

; Same conditions for generating fdot2 as in the previous test:
;   - the vector elements of the dot product are converted from f16 to f32,
;   - the vectors are of type <2 x half>,
;   - "dot10-insts" is enabled.
; The only difference here is that the operands of the element-1 product are
; swapped (src2 element first, then src1 element).

; GCN-LABEL: {{^}}dotproduct_diffvecorder
; GFX900: v_mad_mix_f32
; GFX900: v_mad_mix_f32

; GFX906: v_mad_f32
; GFX906: v_mac_f32_e32

; GFX906-DL-UNSAFE: v_dot2_f32_f16
; GFX10-DL-UNSAFE: v_dot2c_f32_f16

; GFX906-CONTRACT: v_dot2_f32_f16
; GFX906-DENORM-CONTRACT: v_dot2_f32_f16
; GFX906-DOT10-DISABLED: v_fma_mix_f32
define amdgpu_kernel void @dotproduct_diffvecorder(ptr addrspace(1) %src1,
                                                   ptr addrspace(1) %src2,
                                                   ptr addrspace(1) nocapture %dst) {
body:
  %a.vec = load <2 x half>, ptr addrspace(1) %src1
  %b.vec = load <2 x half>, ptr addrspace(1) %src2

  %a.e0 = extractelement <2 x half> %a.vec, i64 0
  %a.e0.f32 = fpext half %a.e0 to float
  %b.e0 = extractelement <2 x half> %b.vec, i64 0
  %b.e0.f32 = fpext half %b.e0 to float

  %a.e1 = extractelement <2 x half> %a.vec, i64 1
  %a.e1.f32 = fpext half %a.e1 to float
  %b.e1 = extractelement <2 x half> %b.vec, i64 1
  %b.e1.f32 = fpext half %b.e1 to float

  ; Element-1 product is written as b[1]*a[1]; element-0 product as a[0]*b[0].
  %prod.e1 = fmul float %b.e1.f32, %a.e1.f32
  %prod.e0 = fmul float %a.e0.f32, %b.e0.f32
  %acc.in = load float, ptr addrspace(1) %dst, align 4
  %sum.e1 = fadd float %prod.e1, %acc.in
  %sum.all = fadd float %prod.e0, %sum.e1
  store float %sum.all, ptr addrspace(1) %dst, align 4
  ret void
}

; Tests to make sure a dot product is not generated when the vectors are not
; of type <2 x half>.
; GCN-LABEL: {{^}}dotproduct_v4f16
; GFX900: v_mad_mix_f32

; GFX906: v_mad_f32
; GFX906: v_mac_f32_e32

; GCN-DL-UNSAFE: v_fma_mix_f32

; GFX906-CONTRACT: v_fma_mix_f32
; GFX906-DENORM-CONTRACT: v_fma_mix_f32
; GFX906-DOT10-DISABLED: v_fma_mix_f32
define amdgpu_kernel void @dotproduct_v4f16(ptr addrspace(1) %src1,
                                            ptr addrspace(1) %src2,
                                            ptr addrspace(1) nocapture %dst) {
body:
  ; The sources are <4 x half>; only elements 0 and 1 are used.
  %a.vec = load <4 x half>, ptr addrspace(1) %src1
  %b.vec = load <4 x half>, ptr addrspace(1) %src2

  %a.e0 = extractelement <4 x half> %a.vec, i64 0
  %a.e0.f32 = fpext half %a.e0 to float
  %b.e0 = extractelement <4 x half> %b.vec, i64 0
  %b.e0.f32 = fpext half %b.e0 to float

  %a.e1 = extractelement <4 x half> %a.vec, i64 1
  %a.e1.f32 = fpext half %a.e1 to float
  %b.e1 = extractelement <4 x half> %b.vec, i64 1
  %b.e1.f32 = fpext half %b.e1 to float

  %prod.e1 = fmul float %a.e1.f32, %b.e1.f32
  %prod.e0 = fmul float %a.e0.f32, %b.e0.f32
  %acc.in = load float, ptr addrspace(1) %dst, align 4
  %sum.e1 = fadd float %prod.e1, %acc.in
  %sum.all = fadd float %prod.e0, %sum.e1
  store float %sum.all, ptr addrspace(1) %dst, align 4
  ret void
}

; Each product below multiplies two elements of the same source vector
; (src1[1]*src1[0] and src2[0]*src2[1]), so the expression is not a dot
; product of src1 and src2. No check line expects a dot2 instruction.
; GCN-LABEL: {{^}}NotAdotproduct
; GFX900: v_mad_mix_f32
; GFX900: v_mad_mix_f32

; GFX906: v_mad_f32
; GFX906: v_mac_f32_e32

; GCN-DL-UNSAFE: v_fma_mix_f32

; GFX906-CONTRACT: v_fma_mix_f32
; GFX906-DENORM-CONTRACT: v_fma_mix_f32
; GFX906-DOT10-DISABLED: v_fma_mix_f32
define amdgpu_kernel void @NotAdotproduct(ptr addrspace(1) %src1,
                                          ptr addrspace(1) %src2,
                                          ptr addrspace(1) nocapture %dst) {
body:
  %a.vec = load <2 x half>, ptr addrspace(1) %src1
  %b.vec = load <2 x half>, ptr addrspace(1) %src2

  %a.e0 = extractelement <2 x half> %a.vec, i64 0
  %a.e0.f32 = fpext half %a.e0 to float
  %b.e0 = extractelement <2 x half> %b.vec, i64 0
  %b.e0.f32 = fpext half %b.e0 to float

  %a.e1 = extractelement <2 x half> %a.vec, i64 1
  %a.e1.f32 = fpext half %a.e1 to float
  %b.e1 = extractelement <2 x half> %b.vec, i64 1
  %b.e1.f32 = fpext half %b.e1 to float

  ; a[1]*a[0] and b[0]*b[1]: both factors of a product come from one vector.
  %prod.a = fmul float %a.e1.f32, %a.e0.f32
  %prod.b = fmul float %b.e0.f32, %b.e1.f32
  %acc.in = load float, ptr addrspace(1) %dst, align 4
  %sum.a = fadd float %prod.a, %acc.in
  %sum.all = fadd float %prod.b, %sum.a
  store float %sum.all, ptr addrspace(1) %dst, align 4
  ret void
}

; The products below pair elements with different indices
; (src1[1]*src2[0] and src1[0]*src2[1]), so the expression is not a dot
; product. No check line expects a dot2 instruction.
; GCN-LABEL: {{^}}Diff_Idx_NotAdotproduct
; GFX900: v_mad_mix_f32
; GFX900: v_mad_mix_f32

; GFX906: v_mad_f32
; GFX906: v_mac_f32_e32

; GCN-DL-UNSAFE: v_fma_mix_f32

; GFX906-CONTRACT: v_fma_mix_f32
; GFX906-DENORM-CONTRACT: v_fma_mix_f32
; GFX906-DOT10-DISABLED: v_fma_mix_f32
define amdgpu_kernel void @Diff_Idx_NotAdotproduct(ptr addrspace(1) %src1,
                                                   ptr addrspace(1) %src2,
                                                   ptr addrspace(1) nocapture %dst) {
body:
  %a.vec = load <2 x half>, ptr addrspace(1) %src1
  %b.vec = load <2 x half>, ptr addrspace(1) %src2

  %a.e0 = extractelement <2 x half> %a.vec, i64 0
  %a.e0.f32 = fpext half %a.e0 to float
  %b.e0 = extractelement <2 x half> %b.vec, i64 0
  %b.e0.f32 = fpext half %b.e0 to float

  %a.e1 = extractelement <2 x half> %a.vec, i64 1
  %a.e1.f32 = fpext half %a.e1 to float
  %b.e1 = extractelement <2 x half> %b.vec, i64 1
  %b.e1.f32 = fpext half %b.e1 to float

  ; a[1]*b[0] and a[0]*b[1]: the element indices are crossed.
  %prod.x10 = fmul float %a.e1.f32, %b.e0.f32
  %prod.x01 = fmul float %a.e0.f32, %b.e1.f32
  %acc.in = load float, ptr addrspace(1) %dst, align 4
  %sum.x10 = fadd float %prod.x10, %acc.in
  %sum.all = fadd float %prod.x01, %sum.x10
  store float %sum.all, ptr addrspace(1) %dst, align 4
  ret void
}