xref: /llvm-project/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll (revision 76c22b18eafd2156568d72e9df2ff7bd3457888a)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
3; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
4; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s
5; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11 %s
6
; Scalar f16 constrained fma (round.tonearest, fpexcept.strict).
; Every checked prefix expects exactly one v_fma_f16 between the entry
; s_waitcnt and the s_setpc_b64 return. GCN is the prefix shared by the
; gfx900 and fiji RUN lines.
7define half @v_constained_fma_f16_fpexcept_strict(half %x, half %y, half %z) #0 {
8; GCN-LABEL: v_constained_fma_f16_fpexcept_strict:
9; GCN:       ; %bb.0:
10; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11; GCN-NEXT:    v_fma_f16 v0, v0, v1, v2
12; GCN-NEXT:    s_setpc_b64 s[30:31]
13;
14; GFX10-LABEL: v_constained_fma_f16_fpexcept_strict:
15; GFX10:       ; %bb.0:
16; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17; GFX10-NEXT:    v_fma_f16 v0, v0, v1, v2
18; GFX10-NEXT:    s_setpc_b64 s[30:31]
19;
20; GFX11-LABEL: v_constained_fma_f16_fpexcept_strict:
21; GFX11:       ; %bb.0:
22; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
23; GFX11-NEXT:    v_fma_f16 v0, v0, v1, v2
24; GFX11-NEXT:    s_setpc_b64 s[30:31]
25  %val = call half @llvm.experimental.constrained.fma.f16(half %x, half %y, half %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
26  ret half %val
27}
28
; <2 x half> constrained fma (round.tonearest, fpexcept.strict).
; GFX9, GFX10 and GFX11 checks expect a single v_pk_fma_f16.
; GFX8 checks expect two v_fma_f16: the high halves are extracted with
; 16-bit right shifts, and the high result is shifted back left and OR'd
; with the low result.
29define <2 x half> @v_constained_fma_v2f16_fpexcept_strict(<2 x half> %x, <2 x half> %y, <2 x half> %z) #0 {
30; GFX9-LABEL: v_constained_fma_v2f16_fpexcept_strict:
31; GFX9:       ; %bb.0:
32; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33; GFX9-NEXT:    v_pk_fma_f16 v0, v0, v1, v2
34; GFX9-NEXT:    s_setpc_b64 s[30:31]
35;
36; GFX8-LABEL: v_constained_fma_v2f16_fpexcept_strict:
37; GFX8:       ; %bb.0:
38; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
39; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
40; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
41; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
42; GFX8-NEXT:    v_fma_f16 v3, v5, v4, v3
43; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
44; GFX8-NEXT:    v_fma_f16 v0, v0, v1, v2
45; GFX8-NEXT:    v_or_b32_e32 v0, v0, v3
46; GFX8-NEXT:    s_setpc_b64 s[30:31]
47;
48; GFX10-LABEL: v_constained_fma_v2f16_fpexcept_strict:
49; GFX10:       ; %bb.0:
50; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
51; GFX10-NEXT:    v_pk_fma_f16 v0, v0, v1, v2
52; GFX10-NEXT:    s_setpc_b64 s[30:31]
53;
54; GFX11-LABEL: v_constained_fma_v2f16_fpexcept_strict:
55; GFX11:       ; %bb.0:
56; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
57; GFX11-NEXT:    v_pk_fma_f16 v0, v0, v1, v2
58; GFX11-NEXT:    s_setpc_b64 s[30:31]
59  %val = call <2 x half> @llvm.experimental.constrained.fma.v2f16(<2 x half> %x, <2 x half> %y, <2 x half> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
60  ret <2 x half> %val
61}
62
; <3 x half> constrained fma (round.tonearest, fpexcept.strict).
; GFX9, GFX10 and GFX11 checks expect one v_pk_fma_f16 on v0/v2/v4 plus
; one v_fma_f16 on v1/v3/v5.
; GFX8 checks expect three v_fma_f16: two for the halves held in
; v0/v2/v4 (repacked with shift + or) and one for v1/v3/v5.
63define <3 x half> @v_constained_fma_v3f16_fpexcept_strict(<3 x half> %x, <3 x half> %y, <3 x half> %z) #0 {
64; GFX9-LABEL: v_constained_fma_v3f16_fpexcept_strict:
65; GFX9:       ; %bb.0:
66; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
67; GFX9-NEXT:    v_pk_fma_f16 v0, v0, v2, v4
68; GFX9-NEXT:    v_fma_f16 v1, v1, v3, v5
69; GFX9-NEXT:    s_setpc_b64 s[30:31]
70;
71; GFX8-LABEL: v_constained_fma_v3f16_fpexcept_strict:
72; GFX8:       ; %bb.0:
73; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
74; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
75; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
76; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
77; GFX8-NEXT:    v_fma_f16 v6, v8, v7, v6
78; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
79; GFX8-NEXT:    v_fma_f16 v0, v0, v2, v4
80; GFX8-NEXT:    v_or_b32_e32 v0, v0, v6
81; GFX8-NEXT:    v_fma_f16 v1, v1, v3, v5
82; GFX8-NEXT:    s_setpc_b64 s[30:31]
83;
84; GFX10-LABEL: v_constained_fma_v3f16_fpexcept_strict:
85; GFX10:       ; %bb.0:
86; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
87; GFX10-NEXT:    v_pk_fma_f16 v0, v0, v2, v4
88; GFX10-NEXT:    v_fma_f16 v1, v1, v3, v5
89; GFX10-NEXT:    s_setpc_b64 s[30:31]
90;
91; GFX11-LABEL: v_constained_fma_v3f16_fpexcept_strict:
92; GFX11:       ; %bb.0:
93; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
94; GFX11-NEXT:    v_pk_fma_f16 v0, v0, v2, v4
95; GFX11-NEXT:    v_fma_f16 v1, v1, v3, v5
96; GFX11-NEXT:    s_setpc_b64 s[30:31]
97  %val = call <3 x half> @llvm.experimental.constrained.fma.v3f16(<3 x half> %x, <3 x half> %y, <3 x half> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
98  ret <3 x half> %val
99}
100
; <4 x half> constrained fma (round.tonearest, fpexcept.strict).
; All four prefixes expect four scalar f16 fma operations:
;   GFX9 / GFX8   - v_fma_f16; results repacked with v_perm_b32 (GFX9,
;                   selector 0x5040100 held in s4) or shift + or (GFX8).
;   GFX10 / GFX11 - v_fmac_f16_e32 accumulating into the %z halves;
;                   results repacked with v_perm_b32 and a literal
;                   0x5040100 selector.
; NOTE(review): unlike the v2f16 and v3f16 tests, no prefix here uses
; v_pk_fma_f16. These checks are autogenerated and record current llc
; output; confirm whether the scalarized form is the intended lowering.
101define <4 x half> @v_constained_fma_v4f16_fpexcept_strict(<4 x half> %x, <4 x half> %y, <4 x half> %z) #0 {
102; GFX9-LABEL: v_constained_fma_v4f16_fpexcept_strict:
103; GFX9:       ; %bb.0:
104; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
105; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
106; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
107; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
108; GFX9-NEXT:    v_fma_f16 v6, v8, v7, v6
109; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
110; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
111; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
112; GFX9-NEXT:    v_fma_f16 v7, v9, v8, v7
113; GFX9-NEXT:    v_fma_f16 v1, v1, v3, v5
114; GFX9-NEXT:    v_fma_f16 v0, v0, v2, v4
115; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
116; GFX9-NEXT:    v_perm_b32 v0, v7, v0, s4
117; GFX9-NEXT:    v_perm_b32 v1, v6, v1, s4
118; GFX9-NEXT:    s_setpc_b64 s[30:31]
119;
120; GFX8-LABEL: v_constained_fma_v4f16_fpexcept_strict:
121; GFX8:       ; %bb.0:
122; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
123; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
124; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
125; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
126; GFX8-NEXT:    v_fma_f16 v6, v8, v7, v6
127; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
128; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
129; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
130; GFX8-NEXT:    v_fma_f16 v7, v9, v8, v7
131; GFX8-NEXT:    v_fma_f16 v0, v0, v2, v4
132; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
133; GFX8-NEXT:    v_fma_f16 v1, v1, v3, v5
134; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
135; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
136; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
137; GFX8-NEXT:    s_setpc_b64 s[30:31]
138;
139; GFX10-LABEL: v_constained_fma_v4f16_fpexcept_strict:
140; GFX10:       ; %bb.0:
141; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
142; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
143; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
144; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
145; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v4
146; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
147; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v0
148; GFX10-NEXT:    v_fmac_f16_e32 v4, v0, v2
149; GFX10-NEXT:    v_fmac_f16_e32 v6, v8, v7
150; GFX10-NEXT:    v_fmac_f16_e32 v5, v1, v3
151; GFX10-NEXT:    v_fmac_f16_e32 v9, v11, v10
152; GFX10-NEXT:    v_perm_b32 v1, v6, v5, 0x5040100
153; GFX10-NEXT:    v_perm_b32 v0, v9, v4, 0x5040100
154; GFX10-NEXT:    s_setpc_b64 s[30:31]
155;
156; GFX11-LABEL: v_constained_fma_v4f16_fpexcept_strict:
157; GFX11:       ; %bb.0:
158; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
159; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
160; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
161; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
162; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v4
163; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
164; GFX11-NEXT:    v_lshrrev_b32_e32 v11, 16, v0
165; GFX11-NEXT:    v_fmac_f16_e32 v4, v0, v2
166; GFX11-NEXT:    v_fmac_f16_e32 v6, v8, v7
167; GFX11-NEXT:    v_fmac_f16_e32 v5, v1, v3
168; GFX11-NEXT:    v_fmac_f16_e32 v9, v11, v10
169; GFX11-NEXT:    v_perm_b32 v1, v6, v5, 0x5040100
170; GFX11-NEXT:    v_perm_b32 v0, v9, v4, 0x5040100
171; GFX11-NEXT:    s_setpc_b64 s[30:31]
172  %val = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> %x, <4 x half> %y, <4 x half> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
173  ret <4 x half> %val
174}
175
; Scalar f16 constrained fma whose addend is `fneg %z`.
; Every checked prefix expects the negation to appear as a `-v2` operand
; on the single v_fma_f16, with no separate negate instruction.
176define half @v_constained_fma_f16_fpexcept_strict_fneg(half %x, half %y, half %z) #0 {
177; GCN-LABEL: v_constained_fma_f16_fpexcept_strict_fneg:
178; GCN:       ; %bb.0:
179; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
180; GCN-NEXT:    v_fma_f16 v0, v0, v1, -v2
181; GCN-NEXT:    s_setpc_b64 s[30:31]
182;
183; GFX10-LABEL: v_constained_fma_f16_fpexcept_strict_fneg:
184; GFX10:       ; %bb.0:
185; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
186; GFX10-NEXT:    v_fma_f16 v0, v0, v1, -v2
187; GFX10-NEXT:    s_setpc_b64 s[30:31]
188;
189; GFX11-LABEL: v_constained_fma_f16_fpexcept_strict_fneg:
190; GFX11:       ; %bb.0:
191; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
192; GFX11-NEXT:    v_fma_f16 v0, v0, v1, -v2
193; GFX11-NEXT:    s_setpc_b64 s[30:31]
194  %neg.z = fneg half %z
195  %val = call half @llvm.experimental.constrained.fma.f16(half %x, half %y, half %neg.z, metadata !"round.tonearest", metadata !"fpexcept.strict")
196  ret half %val
197}
198
; Scalar f16 constrained fma whose two multiplicands are `fneg %x` and
; `fneg %y`.
; Every checked prefix expects both negations as `-v0, -v1` operands on
; the single v_fma_f16, with no separate negate instructions.
199define half @v_constained_fma_f16_fpexcept_strict_fneg_fneg(half %x, half %y, half %z) #0 {
200; GCN-LABEL: v_constained_fma_f16_fpexcept_strict_fneg_fneg:
201; GCN:       ; %bb.0:
202; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
203; GCN-NEXT:    v_fma_f16 v0, -v0, -v1, v2
204; GCN-NEXT:    s_setpc_b64 s[30:31]
205;
206; GFX10-LABEL: v_constained_fma_f16_fpexcept_strict_fneg_fneg:
207; GFX10:       ; %bb.0:
208; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
209; GFX10-NEXT:    v_fma_f16 v0, -v0, -v1, v2
210; GFX10-NEXT:    s_setpc_b64 s[30:31]
211;
212; GFX11-LABEL: v_constained_fma_f16_fpexcept_strict_fneg_fneg:
213; GFX11:       ; %bb.0:
214; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
215; GFX11-NEXT:    v_fma_f16 v0, -v0, -v1, v2
216; GFX11-NEXT:    s_setpc_b64 s[30:31]
217  %neg.x = fneg half %x
218  %neg.y = fneg half %y
219  %val = call half @llvm.experimental.constrained.fma.f16(half %neg.x, half %neg.y, half %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
220  ret half %val
221}
222
; Scalar f16 constrained fma whose two multiplicands are
; llvm.fabs.f16(%x) and llvm.fabs.f16(%y).
; Every checked prefix expects both absolute values as `|v0|, |v1|`
; operands on the single v_fma_f16, with no separate instructions.
; The fabs results are named %abs.x / %abs.y; SSA value names do not
; appear in the checked assembly, so the CHECK lines are unaffected.
223define half @v_constained_fma_f16_fpexcept_strict_fabs_fabs(half %x, half %y, half %z) #0 {
224; GCN-LABEL: v_constained_fma_f16_fpexcept_strict_fabs_fabs:
225; GCN:       ; %bb.0:
226; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
227; GCN-NEXT:    v_fma_f16 v0, |v0|, |v1|, v2
228; GCN-NEXT:    s_setpc_b64 s[30:31]
229;
230; GFX10-LABEL: v_constained_fma_f16_fpexcept_strict_fabs_fabs:
231; GFX10:       ; %bb.0:
232; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
233; GFX10-NEXT:    v_fma_f16 v0, |v0|, |v1|, v2
234; GFX10-NEXT:    s_setpc_b64 s[30:31]
235;
236; GFX11-LABEL: v_constained_fma_f16_fpexcept_strict_fabs_fabs:
237; GFX11:       ; %bb.0:
238; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
239; GFX11-NEXT:    v_fma_f16 v0, |v0|, |v1|, v2
240; GFX11-NEXT:    s_setpc_b64 s[30:31]
241  %abs.x = call half @llvm.fabs.f16(half %x) #0
242  %abs.y = call half @llvm.fabs.f16(half %y) #0
243  %val = call half @llvm.experimental.constrained.fma.f16(half %abs.x, half %abs.y, half %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
244  ret half %val
245}
246
; <2 x half> constrained fma whose two multiplicands are `fneg %x` and
; `fneg %y`.
; GFX9, GFX10 and GFX11 checks expect a single v_pk_fma_f16 carrying
; neg_lo:[1,1,0] neg_hi:[1,1,0].
; GFX8 checks expect two v_fma_f16, each with negated first and second
; operands, with the high-half result repacked via shift + or.
247define <2 x half> @v_constained_fma_v2f16_fpexcept_strict_fneg_fneg(<2 x half> %x, <2 x half> %y, <2 x half> %z) #0 {
248; GFX9-LABEL: v_constained_fma_v2f16_fpexcept_strict_fneg_fneg:
249; GFX9:       ; %bb.0:
250; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
251; GFX9-NEXT:    v_pk_fma_f16 v0, v0, v1, v2 neg_lo:[1,1,0] neg_hi:[1,1,0]
252; GFX9-NEXT:    s_setpc_b64 s[30:31]
253;
254; GFX8-LABEL: v_constained_fma_v2f16_fpexcept_strict_fneg_fneg:
255; GFX8:       ; %bb.0:
256; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
257; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
258; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
259; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
260; GFX8-NEXT:    v_fma_f16 v3, -v5, -v4, v3
261; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
262; GFX8-NEXT:    v_fma_f16 v0, -v0, -v1, v2
263; GFX8-NEXT:    v_or_b32_e32 v0, v0, v3
264; GFX8-NEXT:    s_setpc_b64 s[30:31]
265;
266; GFX10-LABEL: v_constained_fma_v2f16_fpexcept_strict_fneg_fneg:
267; GFX10:       ; %bb.0:
268; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
269; GFX10-NEXT:    v_pk_fma_f16 v0, v0, v1, v2 neg_lo:[1,1,0] neg_hi:[1,1,0]
270; GFX10-NEXT:    s_setpc_b64 s[30:31]
271;
272; GFX11-LABEL: v_constained_fma_v2f16_fpexcept_strict_fneg_fneg:
273; GFX11:       ; %bb.0:
274; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
275; GFX11-NEXT:    v_pk_fma_f16 v0, v0, v1, v2 neg_lo:[1,1,0] neg_hi:[1,1,0]
276; GFX11-NEXT:    s_setpc_b64 s[30:31]
277  %neg.x = fneg <2 x half> %x
278  %neg.y = fneg <2 x half> %y
279  %val = call <2 x half> @llvm.experimental.constrained.fma.v2f16(<2 x half> %neg.x, <2 x half> %neg.y, <2 x half> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
280  ret <2 x half> %val
281}
282
; Intrinsics called by the test functions above: f16 fabs, and the
; constrained fma in scalar, v2, v3 and v4 half widths.
283declare half @llvm.fabs.f16(half)
284declare half @llvm.experimental.constrained.fma.f16(half, half, half, metadata, metadata)
285declare <2 x half> @llvm.experimental.constrained.fma.v2f16(<2 x half>, <2 x half>, <2 x half>, metadata, metadata)
286declare <3 x half> @llvm.experimental.constrained.fma.v3f16(<3 x half>, <3 x half>, <3 x half>, metadata, metadata)
287declare <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half>, <4 x half>, <4 x half>, metadata, metadata)
288
; Attribute group #0 is attached to each test function above and to the
; llvm.fabs.f16 call sites.
289attributes #0 = { strictfp }
290