xref: /llvm-project/llvm/test/CodeGen/AMDGPU/fdot2.ll (revision a2d086af2cdac8c22685551d4d3d0928e40e1a0f)
1; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GFX900
2; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GCN-DL-UNSAFE,GFX906-DL-UNSAFE
3; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GCN-DL-UNSAFE,GFX10-DL-UNSAFE,GFX10-CONTRACT
4; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GCN-DL-UNSAFE,GFX10-DL-UNSAFE,GFX10-CONTRACT
5; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GFX906
6; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math=preserve-sign -fp-contract=fast -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GFX906-CONTRACT
7; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GFX906-DENORM-CONTRACT
8; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -mattr="+dot7-insts,-dot10-insts" -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GFX906-DOT10-DISABLED
; (fadd (fmul S1.x, S2.x), (fadd (fmul S1.y, S2.y), z)) -> (fdot2 S1, S2, z)
10
11; Tests to make sure fdot2 is not generated when vector elements of dot-product expressions
12; are not converted from f16 to f32.
13; GCN-LABEL: {{^}}dotproduct_f16
14; GFX900: v_fma_f16
15; GFX900: v_fma_f16
16
17; GFX906: v_mul_f16_e32
18; GFX906: v_mul_f16_e32
19
20; GFX906-DL-UNSAFE:  v_fma_f16
21; GFX10-CONTRACT: v_fmac_f16
22
23; GFX906-CONTRACT: v_mac_f16_e32
24; GFX906-DENORM-CONTRACT: v_fma_f16
25; GFX906-DOT10-DISABLED: v_fma_f16
; Two-element multiply/accumulate carried out entirely in half precision.
; No operand is widened to float, which is the case the comment above says
; must not be turned into fdot2.
define amdgpu_kernel void @dotproduct_f16(ptr addrspace(1) %src1,
                                          ptr addrspace(1) %src2,
                                          ptr addrspace(1) nocapture %dst) {
entry:
  ; Fetch both packed operands.
  %lhs = load <2 x half>, ptr addrspace(1) %src1
  %rhs = load <2 x half>, ptr addrspace(1) %src2

  ; Element 0 of each operand.
  %lhs.e0 = extractelement <2 x half> %lhs, i64 0
  %rhs.e0 = extractelement <2 x half> %rhs, i64 0

  ; Element 1 of each operand.
  %lhs.e1 = extractelement <2 x half> %lhs, i64 1
  %rhs.e1 = extractelement <2 x half> %rhs, i64 1

  ; Per-element products, then fold them into the value already held in %dst.
  %prod.e1 = fmul half %lhs.e1, %rhs.e1
  %prod.e0 = fmul half %lhs.e0, %rhs.e0
  %init = load half, ptr addrspace(1) %dst, align 2
  %partial = fadd half %prod.e1, %init
  %total = fadd half %prod.e0, %partial
  store half %total, ptr addrspace(1) %dst, align 2
  ret void
}
47
48
49; We only want to generate fdot2 if:
50; - vector element of dot product is converted from f16 to f32, and
51; - the vectors are of type <2 x half>, and
52; - "dot10-insts" is enabled
53
54; GCN-LABEL: {{^}}dotproduct_f16_f32
55; GFX900: v_mad_mix_f32
56; GFX900: v_mad_mix_f32
57
58; GFX906: v_mad_f32
59; GFX906: v_mac_f32_e32
60
61; GFX906-DL-UNSAFE: v_dot2_f32_f16
62; GFX10-DL-UNSAFE: v_dot2c_f32_f16
63
64; GFX906-CONTRACT: v_dot2_f32_f16
65
66; GFX906-DENORM-CONTRACT: v_dot2_f32_f16
67; GFX906-DOT10-DISABLED: v_fma_mix_f32
; Canonical fdot2 candidate: both elements of two <2 x half> vectors are
; extended to float, multiplied element-wise, and summed into a float
; accumulator that is read from and written back to %dst.
define amdgpu_kernel void @dotproduct_f16_f32(ptr addrspace(1) %src1,
                                              ptr addrspace(1) %src2,
                                              ptr addrspace(1) nocapture %dst) {
entry:
  ; Fetch both packed operands.
  %lhs = load <2 x half>, ptr addrspace(1) %src1
  %rhs = load <2 x half>, ptr addrspace(1) %src2

  ; Element 0 of each operand, widened to float.
  %lhs.e0 = extractelement <2 x half> %lhs, i64 0
  %lhs.e0.f32 = fpext half %lhs.e0 to float
  %rhs.e0 = extractelement <2 x half> %rhs, i64 0
  %rhs.e0.f32 = fpext half %rhs.e0 to float

  ; Element 1 of each operand, widened to float.
  %lhs.e1 = extractelement <2 x half> %lhs, i64 1
  %lhs.e1.f32 = fpext half %lhs.e1 to float
  %rhs.e1 = extractelement <2 x half> %rhs, i64 1
  %rhs.e1.f32 = fpext half %rhs.e1 to float

  ; Matching-index products accumulated onto the incoming float value.
  %prod.e1 = fmul float %lhs.e1.f32, %rhs.e1.f32
  %prod.e0 = fmul float %lhs.e0.f32, %rhs.e0.f32
  %init = load float, ptr addrspace(1) %dst, align 4
  %partial = fadd float %prod.e1, %init
  %total = fadd float %prod.e0, %partial
  store float %total, ptr addrspace(1) %dst, align 4
  ret void
}
93
94; We only want to generate fdot2 if:
95; - vector element of dot product is converted from f16 to f32, and
96; - the vectors are of type <2 x half>, and
97; - "dot10-insts" is enabled
98
99; GCN-LABEL: {{^}}dotproduct_diffvecorder
100; GFX900: v_mad_mix_f32
101; GFX900: v_mad_mix_f32
102
103; GFX906: v_mad_f32
104; GFX906: v_mac_f32_e32
105
106; GFX906-DL-UNSAFE: v_dot2_f32_f16
107; GFX10-DL-UNSAFE: v_dot2c_f32_f16
108
109; GFX906-CONTRACT: v_dot2_f32_f16
110; GFX906-DENORM-CONTRACT: v_dot2_f32_f16
111; GFX906-DOT10-DISABLED: v_fma_mix_f32
; Same computation as a plain two-element f16->f32 dot product, except that
; the element-1 product lists the %src2 element first and the %src1 element
; second. Only the operand order of that one fmul differs.
define amdgpu_kernel void @dotproduct_diffvecorder(ptr addrspace(1) %src1,
                                                   ptr addrspace(1) %src2,
                                                   ptr addrspace(1) nocapture %dst) {
entry:
  ; Fetch both packed operands.
  %lhs = load <2 x half>, ptr addrspace(1) %src1
  %rhs = load <2 x half>, ptr addrspace(1) %src2

  ; Element 0 of each operand, widened to float.
  %lhs.e0 = extractelement <2 x half> %lhs, i64 0
  %lhs.e0.f32 = fpext half %lhs.e0 to float
  %rhs.e0 = extractelement <2 x half> %rhs, i64 0
  %rhs.e0.f32 = fpext half %rhs.e0 to float

  ; Element 1 of each operand, widened to float.
  %lhs.e1 = extractelement <2 x half> %lhs, i64 1
  %lhs.e1.f32 = fpext half %lhs.e1 to float
  %rhs.e1 = extractelement <2 x half> %rhs, i64 1
  %rhs.e1.f32 = fpext half %rhs.e1 to float

  ; Element-1 product deliberately written as (rhs * lhs).
  %prod.e1 = fmul float %rhs.e1.f32, %lhs.e1.f32
  %prod.e0 = fmul float %lhs.e0.f32, %rhs.e0.f32
  %init = load float, ptr addrspace(1) %dst, align 4
  %partial = fadd float %prod.e1, %init
  %total = fadd float %prod.e0, %partial
  store float %total, ptr addrspace(1) %dst, align 4
  ret void
}
137
; Tests to make sure fdot2 is not generated when the vectors are not of type <2 x half>.
139; GCN-LABEL: {{^}}dotproduct_v4f16
140; GFX900: v_mad_mix_f32
141
142; GFX906: v_mad_f32
143; GFX906: v_mac_f32_e32
144
145; GCN-DL-UNSAFE: v_fma_mix_f32
146
147; GFX906-CONTRACT: v_fma_mix_f32
148; GFX906-DENORM-CONTRACT: v_fma_mix_f32
149; GFX906-DOT10-DISABLED: v_fma_mix_f32
; Dot-product-shaped expression whose operands are loaded as <4 x half>
; rather than <2 x half>. Only elements 0 and 1 of each vector are used.
define amdgpu_kernel void @dotproduct_v4f16(ptr addrspace(1) %src1,
                                            ptr addrspace(1) %src2,
                                            ptr addrspace(1) nocapture %dst) {
entry:
  ; Fetch both four-element operands.
  %lhs = load <4 x half>, ptr addrspace(1) %src1
  %rhs = load <4 x half>, ptr addrspace(1) %src2

  ; Element 0 of each operand, widened to float.
  %lhs.e0 = extractelement <4 x half> %lhs, i64 0
  %lhs.e0.f32 = fpext half %lhs.e0 to float
  %rhs.e0 = extractelement <4 x half> %rhs, i64 0
  %rhs.e0.f32 = fpext half %rhs.e0 to float

  ; Element 1 of each operand, widened to float.
  %lhs.e1 = extractelement <4 x half> %lhs, i64 1
  %lhs.e1.f32 = fpext half %lhs.e1 to float
  %rhs.e1 = extractelement <4 x half> %rhs, i64 1
  %rhs.e1.f32 = fpext half %rhs.e1 to float

  ; Matching-index products accumulated onto the incoming float value.
  %prod.e1 = fmul float %lhs.e1.f32, %rhs.e1.f32
  %prod.e0 = fmul float %lhs.e0.f32, %rhs.e0.f32
  %init = load float, ptr addrspace(1) %dst, align 4
  %partial = fadd float %prod.e1, %init
  %total = fadd float %prod.e0, %partial
  store float %total, ptr addrspace(1) %dst, align 4
  ret void
}
175
176; GCN-LABEL: {{^}}NotAdotproduct
177; GFX900: v_mad_mix_f32
178; GFX900: v_mad_mix_f32
179
180; GFX906: v_mad_f32
181; GFX906: v_mac_f32_e32
182
183; GCN-DL-UNSAFE: v_fma_mix_f32
184
185; GFX906-CONTRACT: v_fma_mix_f32
186; GFX906-DENORM-CONTRACT: v_fma_mix_f32
187; GFX906-DOT10-DISABLED: v_fma_mix_f32
; Looks like a dot product but is not one: each fmul multiplies the two
; elements of the SAME source vector (src1[1]*src1[0] and src2[0]*src2[1])
; instead of pairing an element of %src1 with an element of %src2.
define amdgpu_kernel void @NotAdotproduct(ptr addrspace(1) %src1,
                                          ptr addrspace(1) %src2,
                                          ptr addrspace(1) nocapture %dst) {
entry:
  ; Fetch both packed operands.
  %lhs = load <2 x half>, ptr addrspace(1) %src1
  %rhs = load <2 x half>, ptr addrspace(1) %src2

  ; Element 0 of each operand, widened to float.
  %lhs.e0 = extractelement <2 x half> %lhs, i64 0
  %lhs.e0.f32 = fpext half %lhs.e0 to float
  %rhs.e0 = extractelement <2 x half> %rhs, i64 0
  %rhs.e0.f32 = fpext half %rhs.e0 to float

  ; Element 1 of each operand, widened to float.
  %lhs.e1 = extractelement <2 x half> %lhs, i64 1
  %lhs.e1.f32 = fpext half %lhs.e1 to float
  %rhs.e1 = extractelement <2 x half> %rhs, i64 1
  %rhs.e1.f32 = fpext half %rhs.e1 to float

  ; Intra-vector products: lhs with lhs, rhs with rhs.
  %prod.lhs = fmul float %lhs.e1.f32, %lhs.e0.f32
  %prod.rhs = fmul float %rhs.e0.f32, %rhs.e1.f32
  %init = load float, ptr addrspace(1) %dst, align 4
  %partial = fadd float %prod.lhs, %init
  %total = fadd float %prod.rhs, %partial
  store float %total, ptr addrspace(1) %dst, align 4
  ret void
}
213
214; GCN-LABEL: {{^}}Diff_Idx_NotAdotproduct
215; GFX900: v_mad_mix_f32
216; GFX900: v_mad_mix_f32
217
218; GFX906: v_mad_f32
219; GFX906: v_mac_f32_e32
220
221; GCN-DL-UNSAFE: v_fma_mix_f32
222
223; GFX906-CONTRACT: v_fma_mix_f32
224; GFX906-DENORM-CONTRACT: v_fma_mix_f32
225; GFX906-DOT10-DISABLED: v_fma_mix_f32
; Looks like a dot product but the element indices are crossed: the products
; are src1[1]*src2[0] and src1[0]*src2[1], so no product pairs elements that
; share the same index.
define amdgpu_kernel void @Diff_Idx_NotAdotproduct(ptr addrspace(1) %src1,
                                                   ptr addrspace(1) %src2,
                                                   ptr addrspace(1) nocapture %dst) {
entry:
  ; Fetch both packed operands.
  %lhs = load <2 x half>, ptr addrspace(1) %src1
  %rhs = load <2 x half>, ptr addrspace(1) %src2

  ; Element 0 of each operand, widened to float.
  %lhs.e0 = extractelement <2 x half> %lhs, i64 0
  %lhs.e0.f32 = fpext half %lhs.e0 to float
  %rhs.e0 = extractelement <2 x half> %rhs, i64 0
  %rhs.e0.f32 = fpext half %rhs.e0 to float

  ; Element 1 of each operand, widened to float.
  %lhs.e1 = extractelement <2 x half> %lhs, i64 1
  %lhs.e1.f32 = fpext half %lhs.e1 to float
  %rhs.e1 = extractelement <2 x half> %rhs, i64 1
  %rhs.e1.f32 = fpext half %rhs.e1 to float

  ; Cross-index products: (lhs[1], rhs[0]) and (lhs[0], rhs[1]).
  %prod.x10 = fmul float %lhs.e1.f32, %rhs.e0.f32
  %prod.x01 = fmul float %lhs.e0.f32, %rhs.e1.f32
  %init = load float, ptr addrspace(1) %dst, align 4
  %partial = fadd float %prod.x10, %init
  %total = fadd float %prod.x01, %partial
  store float %total, ptr addrspace(1) %dst, align 4
  ret void
}
251