xref: /llvm-project/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-ext-neg-mul.ll (revision 9e9907f1cfa424366fba58d9520f9305b537cec9)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX9-DENORM %s
3; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX10-DENORM %s
4
5; fold (fsub (fpext (fneg (fmul x, y))), z) -> (fneg (fma (fpext x), (fpext y), z))
; Scalar case with the negation applied before the extension. The assertions
; below expect one v_mad_mix_f32 (gfx900) or v_fma_mix_f32 (gfx1010) with the
; second and third source operands negated.
6define amdgpu_vs float @test_f16_to_f32_sub_ext_neg_mul(half %x, half %y, float %z) {
7; GFX9-DENORM-LABEL: test_f16_to_f32_sub_ext_neg_mul:
8; GFX9-DENORM:       ; %bb.0: ; %entry
9; GFX9-DENORM-NEXT:    v_mad_mix_f32 v0, v0, -v1, -v2 op_sel_hi:[1,1,0]
10; GFX9-DENORM-NEXT:    ; return to shader part epilog
11;
12; GFX10-DENORM-LABEL: test_f16_to_f32_sub_ext_neg_mul:
13; GFX10-DENORM:       ; %bb.0: ; %entry
14; GFX10-DENORM-NEXT:    v_fma_mix_f32 v0, v0, -v1, -v2 op_sel_hi:[1,1,0]
15; GFX10-DENORM-NEXT:    ; return to shader part epilog
16entry:
17  %a = fmul fast half %x, %y ; half-precision product x * y
18  %b = fneg half %a ; negate while still in half precision
19  %c = fpext half %b to float ; widen the negated product to float
20  %d = fsub fast float %c, %z ; -(x * y) - z
21  ret float %d
22}
23
24; fold (fsub (fneg (fpext (fmul x, y))), z) -> (fneg (fma (fpext x), (fpext y), z))
; Scalar case with the negation applied after the extension. The assertions
; below expect the same single v_mad_mix_f32 (gfx900) or v_fma_mix_f32
; (gfx1010) as when the negation precedes the extension.
25define amdgpu_vs float @test_f16_to_f32_sub_neg_ext_mul(half %x, half %y, float %z) {
26; GFX9-DENORM-LABEL: test_f16_to_f32_sub_neg_ext_mul:
27; GFX9-DENORM:       ; %bb.0: ; %entry
28; GFX9-DENORM-NEXT:    v_mad_mix_f32 v0, v0, -v1, -v2 op_sel_hi:[1,1,0]
29; GFX9-DENORM-NEXT:    ; return to shader part epilog
30;
31; GFX10-DENORM-LABEL: test_f16_to_f32_sub_neg_ext_mul:
32; GFX10-DENORM:       ; %bb.0: ; %entry
33; GFX10-DENORM-NEXT:    v_fma_mix_f32 v0, v0, -v1, -v2 op_sel_hi:[1,1,0]
34; GFX10-DENORM-NEXT:    ; return to shader part epilog
35entry:
36  %a = fmul fast half %x, %y ; half-precision product x * y
37  %b = fpext half %a to float ; widen the product to float first
38  %c = fneg float %b ; negate in float precision
39  %d = fsub fast float %c, %z ; -(x * y) - z
40  ret float %d
41}
42
43
44; fold (fsub x, (fpext (fneg (fmul y, z)))) -> (fma (fpext y), (fpext z), x)
; Scalar case where the negated, extended product is the subtrahend, with the
; negation applied before the extension. The assertions below expect one
; v_mad_mix_f32 (gfx900) or v_fma_mix_f32 (gfx1010) with both multiplicands
; negated and x as the addend.
45define amdgpu_vs float @test_f16_to_f32_sub_ext_neg_mul2(float %x, half %y, half %z) {
46; GFX9-DENORM-LABEL: test_f16_to_f32_sub_ext_neg_mul2:
47; GFX9-DENORM:       ; %bb.0: ; %entry
48; GFX9-DENORM-NEXT:    v_mad_mix_f32 v0, -v1, -v2, v0 op_sel_hi:[1,1,0]
49; GFX9-DENORM-NEXT:    ; return to shader part epilog
50;
51; GFX10-DENORM-LABEL: test_f16_to_f32_sub_ext_neg_mul2:
52; GFX10-DENORM:       ; %bb.0: ; %entry
53; GFX10-DENORM-NEXT:    v_fma_mix_f32 v0, -v1, -v2, v0 op_sel_hi:[1,1,0]
54; GFX10-DENORM-NEXT:    ; return to shader part epilog
55entry:
56  %a = fmul fast half %y, %z ; half-precision product y * z
57  %b = fneg half %a ; negate while still in half precision
58  %c = fpext half %b to float ; widen the negated product to float
59  %d = fsub fast float %x, %c ; x - (-(y * z))
60  ret float %d
61}
62
63; fold (fsub x, (fneg (fpext (fmul y, z)))) -> (fma (fpext y), (fpext z), x)
; Scalar case where the negated, extended product is the subtrahend, with the
; negation applied after the extension. The assertions below expect one
; v_mad_mix_f32 (gfx900) or v_fma_mix_f32 (gfx1010) with both multiplicands
; negated and x as the addend.
64define amdgpu_vs float @test_f16_to_f32_sub_neg_ext_mul2(float %x, half %y, half %z) {
65; GFX9-DENORM-LABEL: test_f16_to_f32_sub_neg_ext_mul2:
66; GFX9-DENORM:       ; %bb.0: ; %entry
67; GFX9-DENORM-NEXT:    v_mad_mix_f32 v0, -v1, -v2, v0 op_sel_hi:[1,1,0]
68; GFX9-DENORM-NEXT:    ; return to shader part epilog
69;
70; GFX10-DENORM-LABEL: test_f16_to_f32_sub_neg_ext_mul2:
71; GFX10-DENORM:       ; %bb.0: ; %entry
72; GFX10-DENORM-NEXT:    v_fma_mix_f32 v0, -v1, -v2, v0 op_sel_hi:[1,1,0]
73; GFX10-DENORM-NEXT:    ; return to shader part epilog
74entry:
75  %a = fmul fast half %y, %z ; half-precision product y * z
76  %b = fpext half %a to float ; widen the product to float first
77  %c = fneg float %b ; negate in float precision
78  %d = fsub fast float %x, %c ; x - (-(y * z))
79  ret float %d
80}
81
82; fold (fsub (fpext (fneg (fmul x, y))), z) -> (fneg (fma (fpext x), (fpext y), z))
; <4 x half> variant with the negation applied before the extension. Per the
; assertions below, the gfx900 output is not fused: it keeps two packed f16
; multiplies, four f16-to-f32 conversions and four f32 subtracts. The gfx1010
; output uses four v_fma_mix_f32, plus two v_xor_b32 that flip both f16 sign
; bits of y and a final v_mov_b32.
83define amdgpu_vs <4 x float> @test_v4f16_to_v4f32_sub_ext_neg_mul(<4 x half> %x, <4 x half> %y, <4 x float> %z) {
84; GFX9-DENORM-LABEL: test_v4f16_to_v4f32_sub_ext_neg_mul:
85; GFX9-DENORM:       ; %bb.0: ; %entry
86; GFX9-DENORM-NEXT:    v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
87; GFX9-DENORM-NEXT:    v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1]
88; GFX9-DENORM-NEXT:    v_cvt_f32_f16_e32 v2, v0
89; GFX9-DENORM-NEXT:    v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
90; GFX9-DENORM-NEXT:    v_cvt_f32_f16_e32 v8, v1
91; GFX9-DENORM-NEXT:    v_cvt_f32_f16_sdwa v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
92; GFX9-DENORM-NEXT:    v_sub_f32_e32 v0, v2, v4
93; GFX9-DENORM-NEXT:    v_sub_f32_e32 v1, v3, v5
94; GFX9-DENORM-NEXT:    v_sub_f32_e32 v2, v8, v6
95; GFX9-DENORM-NEXT:    v_sub_f32_e32 v3, v9, v7
96; GFX9-DENORM-NEXT:    ; return to shader part epilog
97;
98; GFX10-DENORM-LABEL: test_v4f16_to_v4f32_sub_ext_neg_mul:
99; GFX10-DENORM:       ; %bb.0: ; %entry
100; GFX10-DENORM-NEXT:    v_xor_b32_e32 v8, 0x80008000, v2
101; GFX10-DENORM-NEXT:    v_xor_b32_e32 v9, 0x80008000, v3
102; GFX10-DENORM-NEXT:    v_fma_mix_f32 v5, v0, -v2, -v5 op_sel:[1,1,0] op_sel_hi:[1,1,0]
103; GFX10-DENORM-NEXT:    v_fma_mix_f32 v3, v1, -v3, -v7 op_sel:[1,1,0] op_sel_hi:[1,1,0]
104; GFX10-DENORM-NEXT:    v_fma_mix_f32 v0, v0, v8, -v4 op_sel_hi:[1,1,0]
105; GFX10-DENORM-NEXT:    v_fma_mix_f32 v2, v1, v9, -v6 op_sel_hi:[1,1,0]
106; GFX10-DENORM-NEXT:    v_mov_b32_e32 v1, v5
107; GFX10-DENORM-NEXT:    ; return to shader part epilog
108entry:
109  %a = fmul fast <4 x half> %x, %y ; element-wise half product x * y
110  %b = fneg <4 x half> %a ; negate while still in half precision
111  %c = fpext <4 x half> %b to <4 x float> ; widen the negated product to float
112  %d = fsub fast <4 x float> %c, %z ; -(x * y) - z, per element
113  ret <4 x float> %d
114}
115
116; fold (fsub (fneg (fpext (fmul x, y))), z) -> (fneg (fma (fpext x), (fpext y), z))
; <4 x half> variant with the negation applied after the extension. Per the
; assertions below, the gfx900 output is not fused: it keeps two packed f16
; multiplies, four f16-to-f32 conversions and four f32 subtracts. The gfx1010
; output uses four v_fma_mix_f32, plus two v_xor_b32 that flip both f16 sign
; bits of y and a final v_mov_b32.
117define amdgpu_vs <4 x float> @test_v4f16_to_v4f32_sub_neg_ext_mul(<4 x half> %x, <4 x half> %y, <4 x float> %z) {
118; GFX9-DENORM-LABEL: test_v4f16_to_v4f32_sub_neg_ext_mul:
119; GFX9-DENORM:       ; %bb.0: ; %entry
120; GFX9-DENORM-NEXT:    v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
121; GFX9-DENORM-NEXT:    v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1]
122; GFX9-DENORM-NEXT:    v_cvt_f32_f16_e32 v2, v0
123; GFX9-DENORM-NEXT:    v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
124; GFX9-DENORM-NEXT:    v_cvt_f32_f16_e32 v8, v1
125; GFX9-DENORM-NEXT:    v_cvt_f32_f16_sdwa v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
126; GFX9-DENORM-NEXT:    v_sub_f32_e32 v0, v2, v4
127; GFX9-DENORM-NEXT:    v_sub_f32_e32 v1, v3, v5
128; GFX9-DENORM-NEXT:    v_sub_f32_e32 v2, v8, v6
129; GFX9-DENORM-NEXT:    v_sub_f32_e32 v3, v9, v7
130; GFX9-DENORM-NEXT:    ; return to shader part epilog
131;
132; GFX10-DENORM-LABEL: test_v4f16_to_v4f32_sub_neg_ext_mul:
133; GFX10-DENORM:       ; %bb.0: ; %entry
134; GFX10-DENORM-NEXT:    v_xor_b32_e32 v8, 0x80008000, v2
135; GFX10-DENORM-NEXT:    v_xor_b32_e32 v9, 0x80008000, v3
136; GFX10-DENORM-NEXT:    v_fma_mix_f32 v5, v0, -v2, -v5 op_sel:[1,1,0] op_sel_hi:[1,1,0]
137; GFX10-DENORM-NEXT:    v_fma_mix_f32 v3, v1, -v3, -v7 op_sel:[1,1,0] op_sel_hi:[1,1,0]
138; GFX10-DENORM-NEXT:    v_fma_mix_f32 v0, v0, v8, -v4 op_sel_hi:[1,1,0]
139; GFX10-DENORM-NEXT:    v_fma_mix_f32 v2, v1, v9, -v6 op_sel_hi:[1,1,0]
140; GFX10-DENORM-NEXT:    v_mov_b32_e32 v1, v5
141; GFX10-DENORM-NEXT:    ; return to shader part epilog
142entry:
143  %a = fmul fast <4 x half> %x, %y ; element-wise half product x * y
144  %b = fpext <4 x half> %a to <4 x float> ; widen the product to float first
145  %c = fneg <4 x float> %b ; negate in float precision
146  %d = fsub fast <4 x float> %c, %z ; -(x * y) - z, per element
147  ret <4 x float> %d
148}
149
150
151; fold (fsub x, (fpext (fneg (fmul y, z)))) -> (fma (fpext y), (fpext z), x)
; <4 x half> variant where the negated, extended product is the subtrahend,
; with the negation applied before the extension. Per the assertions below,
; the gfx900 output is not fused: it keeps two packed f16 multiplies, four
; f16-to-f32 conversions and four f32 subtracts. The gfx1010 output uses four
; v_fma_mix_f32 plus two v_xor_b32 that flip both f16 sign bits of z.
152define amdgpu_vs <4 x float> @test_v4f16_to_v4f32_sub_ext_neg_mul2(<4 x float> %x, <4 x half> %y, <4 x half> %z) {
153; GFX9-DENORM-LABEL: test_v4f16_to_v4f32_sub_ext_neg_mul2:
154; GFX9-DENORM:       ; %bb.0: ; %entry
155; GFX9-DENORM-NEXT:    v_pk_mul_f16 v4, v4, v6 neg_lo:[0,1] neg_hi:[0,1]
156; GFX9-DENORM-NEXT:    v_pk_mul_f16 v5, v5, v7 neg_lo:[0,1] neg_hi:[0,1]
157; GFX9-DENORM-NEXT:    v_cvt_f32_f16_e32 v6, v4
158; GFX9-DENORM-NEXT:    v_cvt_f32_f16_sdwa v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
159; GFX9-DENORM-NEXT:    v_cvt_f32_f16_e32 v7, v5
160; GFX9-DENORM-NEXT:    v_cvt_f32_f16_sdwa v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
161; GFX9-DENORM-NEXT:    v_sub_f32_e32 v0, v0, v6
162; GFX9-DENORM-NEXT:    v_sub_f32_e32 v1, v1, v4
163; GFX9-DENORM-NEXT:    v_sub_f32_e32 v2, v2, v7
164; GFX9-DENORM-NEXT:    v_sub_f32_e32 v3, v3, v5
165; GFX9-DENORM-NEXT:    ; return to shader part epilog
166;
167; GFX10-DENORM-LABEL: test_v4f16_to_v4f32_sub_ext_neg_mul2:
168; GFX10-DENORM:       ; %bb.0: ; %entry
169; GFX10-DENORM-NEXT:    v_xor_b32_e32 v8, 0x80008000, v6
170; GFX10-DENORM-NEXT:    v_xor_b32_e32 v9, 0x80008000, v7
171; GFX10-DENORM-NEXT:    v_fma_mix_f32 v1, -v4, -v6, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
172; GFX10-DENORM-NEXT:    v_fma_mix_f32 v3, -v5, -v7, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0]
173; GFX10-DENORM-NEXT:    v_fma_mix_f32 v0, -v4, v8, v0 op_sel_hi:[1,1,0]
174; GFX10-DENORM-NEXT:    v_fma_mix_f32 v2, -v5, v9, v2 op_sel_hi:[1,1,0]
175; GFX10-DENORM-NEXT:    ; return to shader part epilog
176entry:
177  %a = fmul fast <4 x half> %y, %z ; element-wise half product y * z
178  %b = fneg <4 x half> %a ; negate while still in half precision
179  %c = fpext <4 x half> %b to <4 x float> ; widen the negated product to float
180  %d = fsub fast <4 x float> %x, %c ; x - (-(y * z)), per element
181  ret <4 x float> %d
182}
183
184; fold (fsub x, (fneg (fpext (fmul y, z)))) -> (fma (fpext y), (fpext z), x)
; <4 x half> variant where the negated, extended product is the subtrahend,
; with the negation applied after the extension. Per the assertions below,
; the gfx900 output is not fused: it keeps two packed f16 multiplies, four
; f16-to-f32 conversions and four f32 subtracts. The gfx1010 output uses four
; v_fma_mix_f32 plus two v_xor_b32 that flip both f16 sign bits of z.
185define amdgpu_vs <4 x float> @test_v4f16_to_v4f32_sub_neg_ext_mul2(<4 x float> %x, <4 x half> %y, <4 x half> %z) {
186; GFX9-DENORM-LABEL: test_v4f16_to_v4f32_sub_neg_ext_mul2:
187; GFX9-DENORM:       ; %bb.0: ; %entry
188; GFX9-DENORM-NEXT:    v_pk_mul_f16 v4, v4, v6 neg_lo:[0,1] neg_hi:[0,1]
189; GFX9-DENORM-NEXT:    v_pk_mul_f16 v5, v5, v7 neg_lo:[0,1] neg_hi:[0,1]
190; GFX9-DENORM-NEXT:    v_cvt_f32_f16_e32 v6, v4
191; GFX9-DENORM-NEXT:    v_cvt_f32_f16_sdwa v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
192; GFX9-DENORM-NEXT:    v_cvt_f32_f16_e32 v7, v5
193; GFX9-DENORM-NEXT:    v_cvt_f32_f16_sdwa v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
194; GFX9-DENORM-NEXT:    v_sub_f32_e32 v0, v0, v6
195; GFX9-DENORM-NEXT:    v_sub_f32_e32 v1, v1, v4
196; GFX9-DENORM-NEXT:    v_sub_f32_e32 v2, v2, v7
197; GFX9-DENORM-NEXT:    v_sub_f32_e32 v3, v3, v5
198; GFX9-DENORM-NEXT:    ; return to shader part epilog
199;
200; GFX10-DENORM-LABEL: test_v4f16_to_v4f32_sub_neg_ext_mul2:
201; GFX10-DENORM:       ; %bb.0: ; %entry
202; GFX10-DENORM-NEXT:    v_xor_b32_e32 v8, 0x80008000, v6
203; GFX10-DENORM-NEXT:    v_xor_b32_e32 v9, 0x80008000, v7
204; GFX10-DENORM-NEXT:    v_fma_mix_f32 v1, -v4, -v6, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
205; GFX10-DENORM-NEXT:    v_fma_mix_f32 v3, -v5, -v7, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0]
206; GFX10-DENORM-NEXT:    v_fma_mix_f32 v0, -v4, v8, v0 op_sel_hi:[1,1,0]
207; GFX10-DENORM-NEXT:    v_fma_mix_f32 v2, -v5, v9, v2 op_sel_hi:[1,1,0]
208; GFX10-DENORM-NEXT:    ; return to shader part epilog
209entry:
210  %a = fmul fast <4 x half> %y, %z ; element-wise half product y * z
211  %b = fpext <4 x half> %a to <4 x float> ; widen the product to float first
212  %c = fneg <4 x float> %b ; negate in float precision
213  %d = fsub fast <4 x float> %x, %c ; x - (-(y * z)), per element
214  ret <4 x float> %d
215}
216