xref: /llvm-project/llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll (revision f2c164c8150548d983565c4ddc0fde790f9e2a5b)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-SDAG %s
3; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-GISEL %s
4
5; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8,GFX8-SDAG %s
6; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8,GFX8-GISEL %s
7
8; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10,GFX10-SDAG %s
9; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10,GFX10-GISEL %s
10
11; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-SDAG %s
12; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX1-GISEL %s
13
14
15; FIXME: promotion not handled without f16 insts
16
; Constrained f16 multiply of %x and %y with round.tonearest rounding and
; fpexcept.strict exception behavior. The checks for all four tested cpus
; (gfx900, fiji, gfx1010, gfx1100) expect a single v_mul_f16_e32 on v0/v1.
17define half @v_constained_fmul_f16_fpexcept_strict(half %x, half %y) #0 {
18; GCN-LABEL: v_constained_fmul_f16_fpexcept_strict:
19; GCN:       ; %bb.0:
20; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21; GCN-NEXT:    v_mul_f16_e32 v0, v0, v1
22; GCN-NEXT:    s_setpc_b64 s[30:31]
23;
24; GFX10PLUS-LABEL: v_constained_fmul_f16_fpexcept_strict:
25; GFX10PLUS:       ; %bb.0:
26; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27; GFX10PLUS-NEXT:    v_mul_f16_e32 v0, v0, v1
28; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
29  %val = call half @llvm.experimental.constrained.fmul.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
30  ret half %val
31}
32
; Constrained f16 multiply of %x and %y with round.tonearest rounding and
; fpexcept.ignore exception behavior. The expected instruction sequence is a
; single v_mul_f16_e32 on v0/v1 for every tested cpu.
33define half @v_constained_fmul_f16_fpexcept_ignore(half %x, half %y) #0 {
34; GCN-LABEL: v_constained_fmul_f16_fpexcept_ignore:
35; GCN:       ; %bb.0:
36; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37; GCN-NEXT:    v_mul_f16_e32 v0, v0, v1
38; GCN-NEXT:    s_setpc_b64 s[30:31]
39;
40; GFX10PLUS-LABEL: v_constained_fmul_f16_fpexcept_ignore:
41; GFX10PLUS:       ; %bb.0:
42; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
43; GFX10PLUS-NEXT:    v_mul_f16_e32 v0, v0, v1
44; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
45  %val = call half @llvm.experimental.constrained.fmul.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.ignore")
46  ret half %val
47}
48
; Constrained f16 multiply of %x and %y with round.tonearest rounding and
; fpexcept.maytrap exception behavior. The expected instruction sequence is a
; single v_mul_f16_e32 on v0/v1 for every tested cpu.
49define half @v_constained_fmul_f16_fpexcept_maytrap(half %x, half %y) #0 {
50; GCN-LABEL: v_constained_fmul_f16_fpexcept_maytrap:
51; GCN:       ; %bb.0:
52; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
53; GCN-NEXT:    v_mul_f16_e32 v0, v0, v1
54; GCN-NEXT:    s_setpc_b64 s[30:31]
55;
56; GFX10PLUS-LABEL: v_constained_fmul_f16_fpexcept_maytrap:
57; GFX10PLUS:       ; %bb.0:
58; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
59; GFX10PLUS-NEXT:    v_mul_f16_e32 v0, v0, v1
60; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
61  %val = call half @llvm.experimental.constrained.fmul.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap")
62  ret half %val
63}
64
; Constrained <2 x half> multiply with round.tonearest and fpexcept.strict.
; Expected code per cpu:
;   - gfx900, gfx1010, gfx1100: one v_pk_mul_f16 on v0/v1.
;   - fiji: the two halves are multiplied separately (v_mul_f16_e32 for the
;     low half, v_mul_f16_sdwa reading and writing WORD_1 for the high half)
;     and merged with v_or_b32. The SelectionDAG and GlobalISel sequences
;     differ only in instruction order and register assignment.
65define <2 x half> @v_constained_fmul_v2f16_fpexcept_strict(<2 x half> %x, <2 x half> %y) #0 {
66; GFX9-LABEL: v_constained_fmul_v2f16_fpexcept_strict:
67; GFX9:       ; %bb.0:
68; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
69; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v1
70; GFX9-NEXT:    s_setpc_b64 s[30:31]
71;
72; GFX8-SDAG-LABEL: v_constained_fmul_v2f16_fpexcept_strict:
73; GFX8-SDAG:       ; %bb.0:
74; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
75; GFX8-SDAG-NEXT:    v_mul_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
76; GFX8-SDAG-NEXT:    v_mul_f16_e32 v0, v0, v1
77; GFX8-SDAG-NEXT:    v_or_b32_e32 v0, v0, v2
78; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
79;
80; GFX8-GISEL-LABEL: v_constained_fmul_v2f16_fpexcept_strict:
81; GFX8-GISEL:       ; %bb.0:
82; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
83; GFX8-GISEL-NEXT:    v_mul_f16_e32 v2, v0, v1
84; GFX8-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
85; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
86; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
87;
88; GFX10PLUS-LABEL: v_constained_fmul_v2f16_fpexcept_strict:
89; GFX10PLUS:       ; %bb.0:
90; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
91; GFX10PLUS-NEXT:    v_pk_mul_f16 v0, v0, v1
92; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
93  %val = call <2 x half> @llvm.experimental.constrained.fmul.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
94  ret <2 x half> %val
95}
96
; Constrained <2 x half> multiply with round.tonearest and fpexcept.ignore.
; Expected code per cpu:
;   - gfx900, gfx1010, gfx1100: one v_pk_mul_f16 on v0/v1.
;   - fiji: v_mul_f16_e32 (low half) plus v_mul_f16_sdwa on WORD_1 (high
;     half), merged with v_or_b32; SelectionDAG and GlobalISel differ only
;     in instruction order and register assignment.
97define <2 x half> @v_constained_fmul_v2f16_fpexcept_ignore(<2 x half> %x, <2 x half> %y) #0 {
98; GFX9-LABEL: v_constained_fmul_v2f16_fpexcept_ignore:
99; GFX9:       ; %bb.0:
100; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
101; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v1
102; GFX9-NEXT:    s_setpc_b64 s[30:31]
103;
104; GFX8-SDAG-LABEL: v_constained_fmul_v2f16_fpexcept_ignore:
105; GFX8-SDAG:       ; %bb.0:
106; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
107; GFX8-SDAG-NEXT:    v_mul_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
108; GFX8-SDAG-NEXT:    v_mul_f16_e32 v0, v0, v1
109; GFX8-SDAG-NEXT:    v_or_b32_e32 v0, v0, v2
110; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
111;
112; GFX8-GISEL-LABEL: v_constained_fmul_v2f16_fpexcept_ignore:
113; GFX8-GISEL:       ; %bb.0:
114; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
115; GFX8-GISEL-NEXT:    v_mul_f16_e32 v2, v0, v1
116; GFX8-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
117; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
118; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
119;
120; GFX10PLUS-LABEL: v_constained_fmul_v2f16_fpexcept_ignore:
121; GFX10PLUS:       ; %bb.0:
122; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
123; GFX10PLUS-NEXT:    v_pk_mul_f16 v0, v0, v1
124; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
125  %val = call <2 x half> @llvm.experimental.constrained.fmul.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.ignore")
126  ret <2 x half> %val
127}
128
; Constrained <2 x half> multiply with round.tonearest and fpexcept.maytrap.
; Expected code per cpu:
;   - gfx900, gfx1010, gfx1100: one v_pk_mul_f16 on v0/v1.
;   - fiji: v_mul_f16_e32 (low half) plus v_mul_f16_sdwa on WORD_1 (high
;     half), merged with v_or_b32; SelectionDAG and GlobalISel differ only
;     in instruction order and register assignment.
129define <2 x half> @v_constained_fmul_v2f16_fpexcept_maytrap(<2 x half> %x, <2 x half> %y) #0 {
130; GFX9-LABEL: v_constained_fmul_v2f16_fpexcept_maytrap:
131; GFX9:       ; %bb.0:
132; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
133; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v1
134; GFX9-NEXT:    s_setpc_b64 s[30:31]
135;
136; GFX8-SDAG-LABEL: v_constained_fmul_v2f16_fpexcept_maytrap:
137; GFX8-SDAG:       ; %bb.0:
138; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
139; GFX8-SDAG-NEXT:    v_mul_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
140; GFX8-SDAG-NEXT:    v_mul_f16_e32 v0, v0, v1
141; GFX8-SDAG-NEXT:    v_or_b32_e32 v0, v0, v2
142; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
143;
144; GFX8-GISEL-LABEL: v_constained_fmul_v2f16_fpexcept_maytrap:
145; GFX8-GISEL:       ; %bb.0:
146; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
147; GFX8-GISEL-NEXT:    v_mul_f16_e32 v2, v0, v1
148; GFX8-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
149; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
150; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
151;
152; GFX10PLUS-LABEL: v_constained_fmul_v2f16_fpexcept_maytrap:
153; GFX10PLUS:       ; %bb.0:
154; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
155; GFX10PLUS-NEXT:    v_pk_mul_f16 v0, v0, v1
156; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
157  %val = call <2 x half> @llvm.experimental.constrained.fmul.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap")
158  ret <2 x half> %val
159}
160
; Constrained <3 x half> multiply with round.tonearest and fpexcept.strict.
; The operands arrive in v0/v1 (%x) and v2/v3 (%y) according to the checks.
; Expected code:
;   - SelectionDAG on gfx900, gfx1010, gfx1100: v_pk_mul_f16 on v0/v2 plus a
;     scalar v_mul_f16_e32 on v1/v3.
;   - GlobalISel on gfx900, gfx1010, gfx1100: two v_pk_mul_f16 (v0/v2 and
;     v1/v3).
;   - fiji (both selectors): v_mul_f16_e32 and v_mul_f16_sdwa on v0/v2 merged
;     with v_or_b32, plus v_mul_f16_e32 on v1/v3.
; NOTE(review): the gfx1100 GlobalISel checks use a prefix spelled
; GFX1-GISEL , matching the corresponding lit command line. This looks like a
; typo for GFX11-GISEL ; confirm, then rename it on the command line and
; regenerate the checks in one step.
161define <3 x half> @v_constained_fmul_v3f16_fpexcept_strict(<3 x half> %x, <3 x half> %y) #0 {
162; GFX9-SDAG-LABEL: v_constained_fmul_v3f16_fpexcept_strict:
163; GFX9-SDAG:       ; %bb.0:
164; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
165; GFX9-SDAG-NEXT:    v_pk_mul_f16 v0, v0, v2
166; GFX9-SDAG-NEXT:    v_mul_f16_e32 v1, v1, v3
167; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
168;
169; GFX9-GISEL-LABEL: v_constained_fmul_v3f16_fpexcept_strict:
170; GFX9-GISEL:       ; %bb.0:
171; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
172; GFX9-GISEL-NEXT:    v_pk_mul_f16 v0, v0, v2
173; GFX9-GISEL-NEXT:    v_pk_mul_f16 v1, v1, v3
174; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
175;
176; GFX8-SDAG-LABEL: v_constained_fmul_v3f16_fpexcept_strict:
177; GFX8-SDAG:       ; %bb.0:
178; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
179; GFX8-SDAG-NEXT:    v_mul_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
180; GFX8-SDAG-NEXT:    v_mul_f16_e32 v0, v0, v2
181; GFX8-SDAG-NEXT:    v_or_b32_e32 v0, v0, v4
182; GFX8-SDAG-NEXT:    v_mul_f16_e32 v1, v1, v3
183; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
184;
185; GFX8-GISEL-LABEL: v_constained_fmul_v3f16_fpexcept_strict:
186; GFX8-GISEL:       ; %bb.0:
187; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
188; GFX8-GISEL-NEXT:    v_mul_f16_e32 v4, v0, v2
189; GFX8-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
190; GFX8-GISEL-NEXT:    v_mul_f16_e32 v1, v1, v3
191; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v4, v0
192; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
193;
194; GFX10-SDAG-LABEL: v_constained_fmul_v3f16_fpexcept_strict:
195; GFX10-SDAG:       ; %bb.0:
196; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
197; GFX10-SDAG-NEXT:    v_pk_mul_f16 v0, v0, v2
198; GFX10-SDAG-NEXT:    v_mul_f16_e32 v1, v1, v3
199; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
200;
201; GFX10-GISEL-LABEL: v_constained_fmul_v3f16_fpexcept_strict:
202; GFX10-GISEL:       ; %bb.0:
203; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
204; GFX10-GISEL-NEXT:    v_pk_mul_f16 v0, v0, v2
205; GFX10-GISEL-NEXT:    v_pk_mul_f16 v1, v1, v3
206; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
207;
208; GFX11-SDAG-LABEL: v_constained_fmul_v3f16_fpexcept_strict:
209; GFX11-SDAG:       ; %bb.0:
210; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
211; GFX11-SDAG-NEXT:    v_pk_mul_f16 v0, v0, v2
212; GFX11-SDAG-NEXT:    v_mul_f16_e32 v1, v1, v3
213; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
214;
215; GFX1-GISEL-LABEL: v_constained_fmul_v3f16_fpexcept_strict:
216; GFX1-GISEL:       ; %bb.0:
217; GFX1-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
218; GFX1-GISEL-NEXT:    v_pk_mul_f16 v0, v0, v2
219; GFX1-GISEL-NEXT:    v_pk_mul_f16 v1, v1, v3
220; GFX1-GISEL-NEXT:    s_setpc_b64 s[30:31]
221  %val = call <3 x half> @llvm.experimental.constrained.fmul.v3f16(<3 x half> %x, <3 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
222  ret <3 x half> %val
223}
224
225; FIXME: Scalarized
; Constrained <4 x half> multiply with round.tonearest and fpexcept.strict.
; The FIXME above presumably refers to the SelectionDAG output, which the
; checks show as four separate f16 multiplies instead of packed ones:
;   - SelectionDAG on gfx900 and gfx1010: v_mul_f16_sdwa (WORD_1 sources) and
;     v_mul_f16_e32 per register pair, recombined with v_perm_b32 using the
;     selector 0x5040100.
;   - SelectionDAG on gfx1100: high halves are extracted with
;     v_lshrrev_b32 by 16, multiplied with four v_mul_f16_e32, and
;     recombined with v_perm_b32 using the selector 0x5040100.
;   - fiji (both selectors): v_mul_f16_e32 / v_mul_f16_sdwa pairs merged with
;     v_or_b32.
;   - GlobalISel on gfx900, gfx1010, gfx1100: two v_pk_mul_f16.
; NOTE(review): the gfx1100 GlobalISel checks use a prefix spelled
; GFX1-GISEL , matching the corresponding lit command line. This looks like a
; typo for GFX11-GISEL ; confirm, then rename it on the command line and
; regenerate the checks in one step.
226define <4 x half> @v_constained_fmul_v4f16_fpexcept_strict(<4 x half> %x, <4 x half> %y) #0 {
227; GFX9-SDAG-LABEL: v_constained_fmul_v4f16_fpexcept_strict:
228; GFX9-SDAG:       ; %bb.0:
229; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
230; GFX9-SDAG-NEXT:    v_mul_f16_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
231; GFX9-SDAG-NEXT:    v_mul_f16_sdwa v5, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
232; GFX9-SDAG-NEXT:    v_mul_f16_e32 v1, v1, v3
233; GFX9-SDAG-NEXT:    v_mul_f16_e32 v0, v0, v2
234; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0x5040100
235; GFX9-SDAG-NEXT:    v_perm_b32 v0, v5, v0, s4
236; GFX9-SDAG-NEXT:    v_perm_b32 v1, v4, v1, s4
237; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
238;
239; GFX9-GISEL-LABEL: v_constained_fmul_v4f16_fpexcept_strict:
240; GFX9-GISEL:       ; %bb.0:
241; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
242; GFX9-GISEL-NEXT:    v_pk_mul_f16 v0, v0, v2
243; GFX9-GISEL-NEXT:    v_pk_mul_f16 v1, v1, v3
244; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
245;
246; GFX8-SDAG-LABEL: v_constained_fmul_v4f16_fpexcept_strict:
247; GFX8-SDAG:       ; %bb.0:
248; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
249; GFX8-SDAG-NEXT:    v_mul_f16_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
250; GFX8-SDAG-NEXT:    v_mul_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
251; GFX8-SDAG-NEXT:    v_mul_f16_e32 v1, v1, v3
252; GFX8-SDAG-NEXT:    v_mul_f16_e32 v0, v0, v2
253; GFX8-SDAG-NEXT:    v_or_b32_e32 v0, v0, v5
254; GFX8-SDAG-NEXT:    v_or_b32_e32 v1, v1, v4
255; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
256;
257; GFX8-GISEL-LABEL: v_constained_fmul_v4f16_fpexcept_strict:
258; GFX8-GISEL:       ; %bb.0:
259; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
260; GFX8-GISEL-NEXT:    v_mul_f16_e32 v4, v0, v2
261; GFX8-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
262; GFX8-GISEL-NEXT:    v_mul_f16_e32 v2, v1, v3
263; GFX8-GISEL-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
264; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v4, v0
265; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v2, v1
266; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
267;
268; GFX10-SDAG-LABEL: v_constained_fmul_v4f16_fpexcept_strict:
269; GFX10-SDAG:       ; %bb.0:
270; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
271; GFX10-SDAG-NEXT:    v_mul_f16_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
272; GFX10-SDAG-NEXT:    v_mul_f16_sdwa v5, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
273; GFX10-SDAG-NEXT:    v_mul_f16_e32 v0, v0, v2
274; GFX10-SDAG-NEXT:    v_mul_f16_e32 v1, v1, v3
275; GFX10-SDAG-NEXT:    v_perm_b32 v0, v5, v0, 0x5040100
276; GFX10-SDAG-NEXT:    v_perm_b32 v1, v4, v1, 0x5040100
277; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
278;
279; GFX10-GISEL-LABEL: v_constained_fmul_v4f16_fpexcept_strict:
280; GFX10-GISEL:       ; %bb.0:
281; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
282; GFX10-GISEL-NEXT:    v_pk_mul_f16 v0, v0, v2
283; GFX10-GISEL-NEXT:    v_pk_mul_f16 v1, v1, v3
284; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
285;
286; GFX11-SDAG-LABEL: v_constained_fmul_v4f16_fpexcept_strict:
287; GFX11-SDAG:       ; %bb.0:
288; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
289; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
290; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
291; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
292; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
293; GFX11-SDAG-NEXT:    v_mul_f16_e32 v1, v1, v3
294; GFX11-SDAG-NEXT:    v_mul_f16_e32 v0, v0, v2
295; GFX11-SDAG-NEXT:    v_mul_f16_e32 v2, v6, v5
296; GFX11-SDAG-NEXT:    v_mul_f16_e32 v3, v7, v4
297; GFX11-SDAG-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
298; GFX11-SDAG-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
299; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
300;
301; GFX1-GISEL-LABEL: v_constained_fmul_v4f16_fpexcept_strict:
302; GFX1-GISEL:       ; %bb.0:
303; GFX1-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
304; GFX1-GISEL-NEXT:    v_pk_mul_f16 v0, v0, v2
305; GFX1-GISEL-NEXT:    v_pk_mul_f16 v1, v1, v3
306; GFX1-GISEL-NEXT:    s_setpc_b64 s[30:31]
307  %val = call <4 x half> @llvm.experimental.constrained.fmul.v4f16(<4 x half> %x, <4 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
308  ret <4 x half> %val
309}
310
; Constrained f16 multiply (round.tonearest, fpexcept.strict) in an amdgpu_ps
; function whose arguments are marked inreg; the checks show them in s2/s3.
; Expected code:
;   - gfx900 and fiji: s3 is first copied into v0, then v_mul_f16_e32 takes
;     s2 and v0.
;   - gfx1010 and gfx1100: a single v_mul_f16_e64 reads s2 and s3 directly.
311define amdgpu_ps half @s_constained_fmul_f16_fpexcept_strict(half inreg %x, half inreg %y) #0 {
312; GCN-LABEL: s_constained_fmul_f16_fpexcept_strict:
313; GCN:       ; %bb.0:
314; GCN-NEXT:    v_mov_b32_e32 v0, s3
315; GCN-NEXT:    v_mul_f16_e32 v0, s2, v0
316; GCN-NEXT:    ; return to shader part epilog
317;
318; GFX10PLUS-LABEL: s_constained_fmul_f16_fpexcept_strict:
319; GFX10PLUS:       ; %bb.0:
320; GFX10PLUS-NEXT:    v_mul_f16_e64 v0, s2, s3
321; GFX10PLUS-NEXT:    ; return to shader part epilog
322  %val = call half @llvm.experimental.constrained.fmul.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
323  ret half %val
324}
325
; Constrained <2 x half> multiply (round.tonearest, fpexcept.strict) in an
; amdgpu_ps function whose arguments are marked inreg; the checks show them
; in s2/s3. Expected code:
;   - gfx900: s3 is copied into v0, then v_pk_mul_f16 takes s2 and v0.
;   - gfx1010 and gfx1100: a single v_pk_mul_f16 reads s2 and s3 directly.
;   - fiji (both selectors): the high halves are extracted with s_lshr_b32
;     by 16 and copied to VGPRs, multiplied with v_mul_f16_sdwa writing
;     WORD_1, the low halves are multiplied with v_mul_f16_e32, and the two
;     results are merged with v_or_b32.
326define amdgpu_ps <2 x half> @s_constained_fmul_v2f16_fpexcept_strict(<2 x half> inreg %x, <2 x half> inreg %y) #0 {
327; GFX9-LABEL: s_constained_fmul_v2f16_fpexcept_strict:
328; GFX9:       ; %bb.0:
329; GFX9-NEXT:    v_mov_b32_e32 v0, s3
330; GFX9-NEXT:    v_pk_mul_f16 v0, s2, v0
331; GFX9-NEXT:    ; return to shader part epilog
332;
333; GFX8-SDAG-LABEL: s_constained_fmul_v2f16_fpexcept_strict:
334; GFX8-SDAG:       ; %bb.0:
335; GFX8-SDAG-NEXT:    s_lshr_b32 s0, s3, 16
336; GFX8-SDAG-NEXT:    s_lshr_b32 s1, s2, 16
337; GFX8-SDAG-NEXT:    v_mov_b32_e32 v0, s0
338; GFX8-SDAG-NEXT:    v_mov_b32_e32 v1, s1
339; GFX8-SDAG-NEXT:    v_mul_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
340; GFX8-SDAG-NEXT:    v_mov_b32_e32 v1, s3
341; GFX8-SDAG-NEXT:    v_mul_f16_e32 v1, s2, v1
342; GFX8-SDAG-NEXT:    v_or_b32_e32 v0, v1, v0
343; GFX8-SDAG-NEXT:    ; return to shader part epilog
344;
345; GFX8-GISEL-LABEL: s_constained_fmul_v2f16_fpexcept_strict:
346; GFX8-GISEL:       ; %bb.0:
347; GFX8-GISEL-NEXT:    s_lshr_b32 s0, s2, 16
348; GFX8-GISEL-NEXT:    s_lshr_b32 s1, s3, 16
349; GFX8-GISEL-NEXT:    v_mov_b32_e32 v0, s3
350; GFX8-GISEL-NEXT:    v_mov_b32_e32 v1, s1
351; GFX8-GISEL-NEXT:    v_mov_b32_e32 v2, s0
352; GFX8-GISEL-NEXT:    v_mul_f16_e32 v0, s2, v0
353; GFX8-GISEL-NEXT:    v_mul_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
354; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
355; GFX8-GISEL-NEXT:    ; return to shader part epilog
356;
357; GFX10PLUS-LABEL: s_constained_fmul_v2f16_fpexcept_strict:
358; GFX10PLUS:       ; %bb.0:
359; GFX10PLUS-NEXT:    v_pk_mul_f16 v0, s2, s3
360; GFX10PLUS-NEXT:    ; return to shader part epilog
361  %val = call <2 x half> @llvm.experimental.constrained.fmul.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
362  ret <2 x half> %val
363}
364
; Declarations of the constrained fmul intrinsics called by the test
; functions above, one per tested type (half, <2 x half>, <3 x half>,
; <4 x half>). Each takes the two operands followed by the rounding-mode and
; exception-behavior metadata arguments.
365declare half @llvm.experimental.constrained.fmul.f16(half, half, metadata, metadata) #1
366declare <2 x half> @llvm.experimental.constrained.fmul.v2f16(<2 x half>, <2 x half>, metadata, metadata) #1
367declare <3 x half> @llvm.experimental.constrained.fmul.v3f16(<3 x half>, <3 x half>, metadata, metadata) #1
368declare <4 x half> @llvm.experimental.constrained.fmul.v4f16(<4 x half>, <4 x half>, metadata, metadata) #1
369
; Attribute group #0 (strictfp) is attached to each test function above;
; attribute group #1 is attached to the intrinsic declarations.
370attributes #0 = { strictfp }
371attributes #1 = { inaccessiblememonly nounwind willreturn }
372;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
373; GFX10: {{.*}}
374; GFX11: {{.*}}
375; GFX8: {{.*}}
376