xref: /llvm-project/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll (revision 5e5fd0e6fc50cc1198750308c11433a5b3acfd0f)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 | FileCheck %s
3; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 | %ptxas-verify -arch=sm_80 %}
4
5; Using FTZ should emit fma.ftz.relu for f16, not for bf16
6; RUN: llc < %s -denormal-fp-math-f32=preserve-sign -mtriple=nvptx64 -mcpu=sm_80 | FileCheck %s --check-prefixes=CHECK-FTZ
7; RUN: %if ptxas %{ llc < %s -denormal-fp-math-f32=preserve-sign -mtriple=nvptx64 -mcpu=sm_80 | %ptxas-verify -arch=sm_80 %}
8
9; Targets below SM 80 (fma.relu also needs PTX version >= 70) should not emit fma{.ftz}.relu
10; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 | FileCheck %s --check-prefixes=CHECK-SM70
11
; ReLU written out on scalar half: fmul + fadd, then
; select(fcmp ogt x, 0.0), x, 0.0.
; The two sm_80 runs expect the whole pattern to become a single
; fma.rn{.ftz}.relu.f16; the sm_70 run expects a plain fma.rn.f16 followed by
; setp.gt.f16 / selp.b16.
; NOTE(review): attribute group #0 is defined outside this excerpt; the
; function name suggests it enables no-NaNs FP math -- confirm.
12define half @fma_f16_expanded_no_nans(half %a, half %b, half %c) #0 {
13; CHECK-LABEL: fma_f16_expanded_no_nans(
14; CHECK:       {
15; CHECK-NEXT:    .reg .b16 %rs<5>;
16; CHECK-EMPTY:
17; CHECK-NEXT:  // %bb.0:
18; CHECK-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_no_nans_param_0];
19; CHECK-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_no_nans_param_1];
20; CHECK-NEXT:    ld.param.b16 %rs3, [fma_f16_expanded_no_nans_param_2];
21; CHECK-NEXT:    fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3;
22; CHECK-NEXT:    st.param.b16 [func_retval0], %rs4;
23; CHECK-NEXT:    ret;
24;
25; CHECK-FTZ-LABEL: fma_f16_expanded_no_nans(
26; CHECK-FTZ:       {
27; CHECK-FTZ-NEXT:    .reg .b16 %rs<5>;
28; CHECK-FTZ-EMPTY:
29; CHECK-FTZ-NEXT:  // %bb.0:
30; CHECK-FTZ-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_no_nans_param_0];
31; CHECK-FTZ-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_no_nans_param_1];
32; CHECK-FTZ-NEXT:    ld.param.b16 %rs3, [fma_f16_expanded_no_nans_param_2];
33; CHECK-FTZ-NEXT:    fma.rn.ftz.relu.f16 %rs4, %rs1, %rs2, %rs3;
34; CHECK-FTZ-NEXT:    st.param.b16 [func_retval0], %rs4;
35; CHECK-FTZ-NEXT:    ret;
36;
37; CHECK-SM70-LABEL: fma_f16_expanded_no_nans(
38; CHECK-SM70:       {
39; CHECK-SM70-NEXT:    .reg .pred %p<2>;
40; CHECK-SM70-NEXT:    .reg .b16 %rs<7>;
41; CHECK-SM70-EMPTY:
42; CHECK-SM70-NEXT:  // %bb.0:
43; CHECK-SM70-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_no_nans_param_0];
44; CHECK-SM70-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_no_nans_param_1];
45; CHECK-SM70-NEXT:    ld.param.b16 %rs3, [fma_f16_expanded_no_nans_param_2];
46; CHECK-SM70-NEXT:    fma.rn.f16 %rs4, %rs1, %rs2, %rs3;
47; CHECK-SM70-NEXT:    mov.b16 %rs5, 0x0000;
48; CHECK-SM70-NEXT:    setp.gt.f16 %p1, %rs4, %rs5;
49; CHECK-SM70-NEXT:    selp.b16 %rs6, %rs4, 0x0000, %p1;
50; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs6;
51; CHECK-SM70-NEXT:    ret;
52  %1 = fmul half %a, %b
53  %2 = fadd half %1, %c
54  %3 = fcmp ogt half %2, 0.0
55  %4 = select i1 %3, half %2, half 0.0
56  ret half %4
57}
58
59; FMA relu shouldn't be selected if the FMA operation has multiple uses
; Scalar half ReLU pattern (fmul + fadd + fcmp ogt/select against 0.0) where
; the fadd result %2 is also consumed by a second fadd (+7.0, 0x4700 as half).
; None of the runs expect a .relu instruction here: the sm_80 runs expect
; fma.rn{.ftz}.f16 followed by a separate max{.ftz}.f16, and the sm_70 run
; expects fma.rn.f16 followed by setp.gt.f16 / selp.b16.
60define half @fma_f16_expanded_no_nans_multiple_uses_of_fma(half %a, half %b, half %c) #0 {
61; CHECK-LABEL: fma_f16_expanded_no_nans_multiple_uses_of_fma(
62; CHECK:       {
63; CHECK-NEXT:    .reg .b16 %rs<10>;
64; CHECK-EMPTY:
65; CHECK-NEXT:  // %bb.0:
66; CHECK-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_no_nans_multiple_uses_of_fma_param_0];
67; CHECK-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_no_nans_multiple_uses_of_fma_param_1];
68; CHECK-NEXT:    ld.param.b16 %rs3, [fma_f16_expanded_no_nans_multiple_uses_of_fma_param_2];
69; CHECK-NEXT:    fma.rn.f16 %rs4, %rs1, %rs2, %rs3;
70; CHECK-NEXT:    mov.b16 %rs5, 0x0000;
71; CHECK-NEXT:    max.f16 %rs6, %rs4, %rs5;
72; CHECK-NEXT:    mov.b16 %rs7, 0x4700;
73; CHECK-NEXT:    add.f16 %rs8, %rs4, %rs7;
74; CHECK-NEXT:    add.f16 %rs9, %rs6, %rs8;
75; CHECK-NEXT:    st.param.b16 [func_retval0], %rs9;
76; CHECK-NEXT:    ret;
77;
78; CHECK-FTZ-LABEL: fma_f16_expanded_no_nans_multiple_uses_of_fma(
79; CHECK-FTZ:       {
80; CHECK-FTZ-NEXT:    .reg .b16 %rs<10>;
81; CHECK-FTZ-EMPTY:
82; CHECK-FTZ-NEXT:  // %bb.0:
83; CHECK-FTZ-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_no_nans_multiple_uses_of_fma_param_0];
84; CHECK-FTZ-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_no_nans_multiple_uses_of_fma_param_1];
85; CHECK-FTZ-NEXT:    ld.param.b16 %rs3, [fma_f16_expanded_no_nans_multiple_uses_of_fma_param_2];
86; CHECK-FTZ-NEXT:    fma.rn.ftz.f16 %rs4, %rs1, %rs2, %rs3;
87; CHECK-FTZ-NEXT:    mov.b16 %rs5, 0x0000;
88; CHECK-FTZ-NEXT:    max.ftz.f16 %rs6, %rs4, %rs5;
89; CHECK-FTZ-NEXT:    mov.b16 %rs7, 0x4700;
90; CHECK-FTZ-NEXT:    add.ftz.f16 %rs8, %rs4, %rs7;
91; CHECK-FTZ-NEXT:    add.ftz.f16 %rs9, %rs6, %rs8;
92; CHECK-FTZ-NEXT:    st.param.b16 [func_retval0], %rs9;
93; CHECK-FTZ-NEXT:    ret;
94;
95; CHECK-SM70-LABEL: fma_f16_expanded_no_nans_multiple_uses_of_fma(
96; CHECK-SM70:       {
97; CHECK-SM70-NEXT:    .reg .pred %p<2>;
98; CHECK-SM70-NEXT:    .reg .b16 %rs<10>;
99; CHECK-SM70-EMPTY:
100; CHECK-SM70-NEXT:  // %bb.0:
101; CHECK-SM70-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_no_nans_multiple_uses_of_fma_param_0];
102; CHECK-SM70-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_no_nans_multiple_uses_of_fma_param_1];
103; CHECK-SM70-NEXT:    ld.param.b16 %rs3, [fma_f16_expanded_no_nans_multiple_uses_of_fma_param_2];
104; CHECK-SM70-NEXT:    fma.rn.f16 %rs4, %rs1, %rs2, %rs3;
105; CHECK-SM70-NEXT:    mov.b16 %rs5, 0x0000;
106; CHECK-SM70-NEXT:    setp.gt.f16 %p1, %rs4, %rs5;
107; CHECK-SM70-NEXT:    selp.b16 %rs6, %rs4, 0x0000, %p1;
108; CHECK-SM70-NEXT:    mov.b16 %rs7, 0x4700;
109; CHECK-SM70-NEXT:    add.f16 %rs8, %rs4, %rs7;
110; CHECK-SM70-NEXT:    add.f16 %rs9, %rs6, %rs8;
111; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs9;
112; CHECK-SM70-NEXT:    ret;
113  %1 = fmul half %a, %b
114  %2 = fadd half %1, %c
115  %3 = fcmp ogt half %2, 0.0
116  %4 = select i1 %3, half %2, half 0.0
117  %5 = fadd half %2, 7.0
118  %6 = fadd half %4, %5
119  ret half %6
120}
121
; Scalar half ReLU pattern (fmul + fadd + fcmp ogt/select against 0.0)
; compiled under attribute group #1 instead of #0.
; The sm_80 runs expect an un-fused fma.rn{.ftz}.f16 followed by
; max{.ftz}.f16 (no .relu form); the sm_70 run expects fma.rn.f16 followed by
; setp.gt.f16 / selp.b16.
; NOTE(review): attribute group #1 is defined outside this excerpt; the
; function name suggests NaNs are not excluded there -- confirm.
122define half @fma_f16_expanded_unsafe_with_nans(half %a, half %b, half %c) #1 {
123; CHECK-LABEL: fma_f16_expanded_unsafe_with_nans(
124; CHECK:       {
125; CHECK-NEXT:    .reg .b16 %rs<7>;
126; CHECK-EMPTY:
127; CHECK-NEXT:  // %bb.0:
128; CHECK-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_unsafe_with_nans_param_0];
129; CHECK-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_unsafe_with_nans_param_1];
130; CHECK-NEXT:    ld.param.b16 %rs3, [fma_f16_expanded_unsafe_with_nans_param_2];
131; CHECK-NEXT:    fma.rn.f16 %rs4, %rs1, %rs2, %rs3;
132; CHECK-NEXT:    mov.b16 %rs5, 0x0000;
133; CHECK-NEXT:    max.f16 %rs6, %rs4, %rs5;
134; CHECK-NEXT:    st.param.b16 [func_retval0], %rs6;
135; CHECK-NEXT:    ret;
136;
137; CHECK-FTZ-LABEL: fma_f16_expanded_unsafe_with_nans(
138; CHECK-FTZ:       {
139; CHECK-FTZ-NEXT:    .reg .b16 %rs<7>;
140; CHECK-FTZ-EMPTY:
141; CHECK-FTZ-NEXT:  // %bb.0:
142; CHECK-FTZ-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_unsafe_with_nans_param_0];
143; CHECK-FTZ-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_unsafe_with_nans_param_1];
144; CHECK-FTZ-NEXT:    ld.param.b16 %rs3, [fma_f16_expanded_unsafe_with_nans_param_2];
145; CHECK-FTZ-NEXT:    fma.rn.ftz.f16 %rs4, %rs1, %rs2, %rs3;
146; CHECK-FTZ-NEXT:    mov.b16 %rs5, 0x0000;
147; CHECK-FTZ-NEXT:    max.ftz.f16 %rs6, %rs4, %rs5;
148; CHECK-FTZ-NEXT:    st.param.b16 [func_retval0], %rs6;
149; CHECK-FTZ-NEXT:    ret;
150;
151; CHECK-SM70-LABEL: fma_f16_expanded_unsafe_with_nans(
152; CHECK-SM70:       {
153; CHECK-SM70-NEXT:    .reg .pred %p<2>;
154; CHECK-SM70-NEXT:    .reg .b16 %rs<7>;
155; CHECK-SM70-EMPTY:
156; CHECK-SM70-NEXT:  // %bb.0:
157; CHECK-SM70-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_unsafe_with_nans_param_0];
158; CHECK-SM70-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_unsafe_with_nans_param_1];
159; CHECK-SM70-NEXT:    ld.param.b16 %rs3, [fma_f16_expanded_unsafe_with_nans_param_2];
160; CHECK-SM70-NEXT:    fma.rn.f16 %rs4, %rs1, %rs2, %rs3;
161; CHECK-SM70-NEXT:    mov.b16 %rs5, 0x0000;
162; CHECK-SM70-NEXT:    setp.gt.f16 %p1, %rs4, %rs5;
163; CHECK-SM70-NEXT:    selp.b16 %rs6, %rs4, 0x0000, %p1;
164; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs6;
165; CHECK-SM70-NEXT:    ret;
166  %1 = fmul half %a, %b
167  %2 = fadd half %1, %c
168  %3 = fcmp ogt half %2, 0.0
169  %4 = select i1 %3, half %2, half 0.0
170  ret half %4
171}
172
; Scalar half ReLU expressed as llvm.maxnum.f16(fmul + fadd, 0.0) rather than
; compare/select.
; The sm_80 runs expect a single fma.rn{.ftz}.relu.f16; the sm_70 run expects
; fma.rn.f16 with the max carried out in f32
; (cvt.f32.f16 / max.f32 / cvt.rn.f16.f32).
; NOTE(review): attribute group #0 is defined outside this excerpt -- confirm
; it carries the no-NaNs setting the function name implies.
173define half @fma_f16_expanded_maxnum_no_nans(half %a, half %b, half %c) #0 {
174; CHECK-LABEL: fma_f16_expanded_maxnum_no_nans(
175; CHECK:       {
176; CHECK-NEXT:    .reg .b16 %rs<5>;
177; CHECK-EMPTY:
178; CHECK-NEXT:  // %bb.0:
179; CHECK-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_maxnum_no_nans_param_0];
180; CHECK-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_maxnum_no_nans_param_1];
181; CHECK-NEXT:    ld.param.b16 %rs3, [fma_f16_expanded_maxnum_no_nans_param_2];
182; CHECK-NEXT:    fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3;
183; CHECK-NEXT:    st.param.b16 [func_retval0], %rs4;
184; CHECK-NEXT:    ret;
185;
186; CHECK-FTZ-LABEL: fma_f16_expanded_maxnum_no_nans(
187; CHECK-FTZ:       {
188; CHECK-FTZ-NEXT:    .reg .b16 %rs<5>;
189; CHECK-FTZ-EMPTY:
190; CHECK-FTZ-NEXT:  // %bb.0:
191; CHECK-FTZ-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_maxnum_no_nans_param_0];
192; CHECK-FTZ-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_maxnum_no_nans_param_1];
193; CHECK-FTZ-NEXT:    ld.param.b16 %rs3, [fma_f16_expanded_maxnum_no_nans_param_2];
194; CHECK-FTZ-NEXT:    fma.rn.ftz.relu.f16 %rs4, %rs1, %rs2, %rs3;
195; CHECK-FTZ-NEXT:    st.param.b16 [func_retval0], %rs4;
196; CHECK-FTZ-NEXT:    ret;
197;
198; CHECK-SM70-LABEL: fma_f16_expanded_maxnum_no_nans(
199; CHECK-SM70:       {
200; CHECK-SM70-NEXT:    .reg .b16 %rs<6>;
201; CHECK-SM70-NEXT:    .reg .f32 %f<3>;
202; CHECK-SM70-EMPTY:
203; CHECK-SM70-NEXT:  // %bb.0:
204; CHECK-SM70-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_maxnum_no_nans_param_0];
205; CHECK-SM70-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_maxnum_no_nans_param_1];
206; CHECK-SM70-NEXT:    ld.param.b16 %rs3, [fma_f16_expanded_maxnum_no_nans_param_2];
207; CHECK-SM70-NEXT:    fma.rn.f16 %rs4, %rs1, %rs2, %rs3;
208; CHECK-SM70-NEXT:    cvt.f32.f16 %f1, %rs4;
209; CHECK-SM70-NEXT:    max.f32 %f2, %f1, 0f00000000;
210; CHECK-SM70-NEXT:    cvt.rn.f16.f32 %rs5, %f2;
211; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs5;
212; CHECK-SM70-NEXT:    ret;
213  %1 = fmul half %a, %b
214  %2 = fadd half %1, %c
215  %3 = call half @llvm.maxnum.f16(half %2, half 0.0)
216  ret half %3
217}
218
; Scalar bfloat ReLU pattern (fmul + fadd + fcmp ogt/select against 0.0)
; compiled under attribute group #1.
; Both sm_80 runs expect the same output: an un-fused fma.rn.bf16 followed by
; max.bf16 (no .ftz modifier appears on the bf16 instructions in either run).
; The sm_70 run expects the arithmetic to be done in f32: each operand is
; loaded as u16 and shifted left by 16, combined with fma.rn.f32, narrowed
; back to 16 bits by an integer bfe/add/setp.nan/selp sequence, and then
; clamped with setp.gt.f32 / selp.b16.
; NOTE(review): attribute group #1 is defined outside this excerpt; the
; function name suggests NaNs are not excluded there -- confirm.
219define bfloat @fma_bf16_expanded_unsafe_with_nans(bfloat %a, bfloat %b, bfloat %c) #1 {
220; CHECK-LABEL: fma_bf16_expanded_unsafe_with_nans(
221; CHECK:       {
222; CHECK-NEXT:    .reg .b16 %rs<7>;
223; CHECK-EMPTY:
224; CHECK-NEXT:  // %bb.0:
225; CHECK-NEXT:    ld.param.b16 %rs1, [fma_bf16_expanded_unsafe_with_nans_param_0];
226; CHECK-NEXT:    ld.param.b16 %rs2, [fma_bf16_expanded_unsafe_with_nans_param_1];
227; CHECK-NEXT:    ld.param.b16 %rs3, [fma_bf16_expanded_unsafe_with_nans_param_2];
228; CHECK-NEXT:    fma.rn.bf16 %rs4, %rs1, %rs2, %rs3;
229; CHECK-NEXT:    mov.b16 %rs5, 0x0000;
230; CHECK-NEXT:    max.bf16 %rs6, %rs4, %rs5;
231; CHECK-NEXT:    st.param.b16 [func_retval0], %rs6;
232; CHECK-NEXT:    ret;
233;
234; CHECK-FTZ-LABEL: fma_bf16_expanded_unsafe_with_nans(
235; CHECK-FTZ:       {
236; CHECK-FTZ-NEXT:    .reg .b16 %rs<7>;
237; CHECK-FTZ-EMPTY:
238; CHECK-FTZ-NEXT:  // %bb.0:
239; CHECK-FTZ-NEXT:    ld.param.b16 %rs1, [fma_bf16_expanded_unsafe_with_nans_param_0];
240; CHECK-FTZ-NEXT:    ld.param.b16 %rs2, [fma_bf16_expanded_unsafe_with_nans_param_1];
241; CHECK-FTZ-NEXT:    ld.param.b16 %rs3, [fma_bf16_expanded_unsafe_with_nans_param_2];
242; CHECK-FTZ-NEXT:    fma.rn.bf16 %rs4, %rs1, %rs2, %rs3;
243; CHECK-FTZ-NEXT:    mov.b16 %rs5, 0x0000;
244; CHECK-FTZ-NEXT:    max.bf16 %rs6, %rs4, %rs5;
245; CHECK-FTZ-NEXT:    st.param.b16 [func_retval0], %rs6;
246; CHECK-FTZ-NEXT:    ret;
247;
248; CHECK-SM70-LABEL: fma_bf16_expanded_unsafe_with_nans(
249; CHECK-SM70:       {
250; CHECK-SM70-NEXT:    .reg .pred %p<3>;
251; CHECK-SM70-NEXT:    .reg .b16 %rs<3>;
252; CHECK-SM70-NEXT:    .reg .b32 %r<14>;
253; CHECK-SM70-NEXT:    .reg .f32 %f<6>;
254; CHECK-SM70-EMPTY:
255; CHECK-SM70-NEXT:  // %bb.0:
256; CHECK-SM70-NEXT:    ld.param.u16 %r1, [fma_bf16_expanded_unsafe_with_nans_param_2];
257; CHECK-SM70-NEXT:    shl.b32 %r2, %r1, 16;
258; CHECK-SM70-NEXT:    mov.b32 %f1, %r2;
259; CHECK-SM70-NEXT:    ld.param.u16 %r3, [fma_bf16_expanded_unsafe_with_nans_param_1];
260; CHECK-SM70-NEXT:    shl.b32 %r4, %r3, 16;
261; CHECK-SM70-NEXT:    mov.b32 %f2, %r4;
262; CHECK-SM70-NEXT:    ld.param.u16 %r5, [fma_bf16_expanded_unsafe_with_nans_param_0];
263; CHECK-SM70-NEXT:    shl.b32 %r6, %r5, 16;
264; CHECK-SM70-NEXT:    mov.b32 %f3, %r6;
265; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
266; CHECK-SM70-NEXT:    mov.b32 %r7, %f4;
267; CHECK-SM70-NEXT:    bfe.u32 %r8, %r7, 16, 1;
268; CHECK-SM70-NEXT:    add.s32 %r9, %r8, %r7;
269; CHECK-SM70-NEXT:    add.s32 %r10, %r9, 32767;
270; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %f4, %f4;
271; CHECK-SM70-NEXT:    or.b32 %r11, %r7, 4194304;
272; CHECK-SM70-NEXT:    selp.b32 %r12, %r11, %r10, %p1;
273; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r12; }
274; CHECK-SM70-NEXT:    and.b32 %r13, %r12, -65536;
275; CHECK-SM70-NEXT:    mov.b32 %f5, %r13;
276; CHECK-SM70-NEXT:    setp.gt.f32 %p2, %f5, 0f00000000;
277; CHECK-SM70-NEXT:    selp.b16 %rs2, %rs1, 0x0000, %p2;
278; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs2;
279; CHECK-SM70-NEXT:    ret;
280  %1 = fmul bfloat %a, %b
281  %2 = fadd bfloat %1, %c
282  %3 = fcmp ogt bfloat %2, 0.0
283  %4 = select i1 %3, bfloat %2, bfloat 0.0
284  ret bfloat %4
285}
286
; Scalar bfloat ReLU pattern: fmul + fadd, then
; select(fcmp ogt x, 0.0), x, 0.0.
; Both sm_80 runs expect a single fma.rn.relu.bf16; the FTZ run expects the
; same instruction without an .ftz modifier.
; The sm_70 run expects the computation in f32 (u16 loads shifted left by 16,
; fma.rn.f32, an integer bfe/add/setp.nan/selp narrowing sequence) followed by
; setp.gt.f32 / selp.b16.
; NOTE(review): attribute group #0 is defined outside this excerpt; the
; function name suggests it enables no-NaNs FP math -- confirm.
287define bfloat @fma_bf16_expanded_no_nans(bfloat %a, bfloat %b, bfloat %c) #0 {
288; CHECK-LABEL: fma_bf16_expanded_no_nans(
289; CHECK:       {
290; CHECK-NEXT:    .reg .b16 %rs<5>;
291; CHECK-EMPTY:
292; CHECK-NEXT:  // %bb.0:
293; CHECK-NEXT:    ld.param.b16 %rs1, [fma_bf16_expanded_no_nans_param_0];
294; CHECK-NEXT:    ld.param.b16 %rs2, [fma_bf16_expanded_no_nans_param_1];
295; CHECK-NEXT:    ld.param.b16 %rs3, [fma_bf16_expanded_no_nans_param_2];
296; CHECK-NEXT:    fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3;
297; CHECK-NEXT:    st.param.b16 [func_retval0], %rs4;
298; CHECK-NEXT:    ret;
299;
300; CHECK-FTZ-LABEL: fma_bf16_expanded_no_nans(
301; CHECK-FTZ:       {
302; CHECK-FTZ-NEXT:    .reg .b16 %rs<5>;
303; CHECK-FTZ-EMPTY:
304; CHECK-FTZ-NEXT:  // %bb.0:
305; CHECK-FTZ-NEXT:    ld.param.b16 %rs1, [fma_bf16_expanded_no_nans_param_0];
306; CHECK-FTZ-NEXT:    ld.param.b16 %rs2, [fma_bf16_expanded_no_nans_param_1];
307; CHECK-FTZ-NEXT:    ld.param.b16 %rs3, [fma_bf16_expanded_no_nans_param_2];
308; CHECK-FTZ-NEXT:    fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3;
309; CHECK-FTZ-NEXT:    st.param.b16 [func_retval0], %rs4;
310; CHECK-FTZ-NEXT:    ret;
311;
312; CHECK-SM70-LABEL: fma_bf16_expanded_no_nans(
313; CHECK-SM70:       {
314; CHECK-SM70-NEXT:    .reg .pred %p<3>;
315; CHECK-SM70-NEXT:    .reg .b16 %rs<3>;
316; CHECK-SM70-NEXT:    .reg .b32 %r<14>;
317; CHECK-SM70-NEXT:    .reg .f32 %f<6>;
318; CHECK-SM70-EMPTY:
319; CHECK-SM70-NEXT:  // %bb.0:
320; CHECK-SM70-NEXT:    ld.param.u16 %r1, [fma_bf16_expanded_no_nans_param_2];
321; CHECK-SM70-NEXT:    shl.b32 %r2, %r1, 16;
322; CHECK-SM70-NEXT:    mov.b32 %f1, %r2;
323; CHECK-SM70-NEXT:    ld.param.u16 %r3, [fma_bf16_expanded_no_nans_param_1];
324; CHECK-SM70-NEXT:    shl.b32 %r4, %r3, 16;
325; CHECK-SM70-NEXT:    mov.b32 %f2, %r4;
326; CHECK-SM70-NEXT:    ld.param.u16 %r5, [fma_bf16_expanded_no_nans_param_0];
327; CHECK-SM70-NEXT:    shl.b32 %r6, %r5, 16;
328; CHECK-SM70-NEXT:    mov.b32 %f3, %r6;
329; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
330; CHECK-SM70-NEXT:    mov.b32 %r7, %f4;
331; CHECK-SM70-NEXT:    bfe.u32 %r8, %r7, 16, 1;
332; CHECK-SM70-NEXT:    add.s32 %r9, %r8, %r7;
333; CHECK-SM70-NEXT:    add.s32 %r10, %r9, 32767;
334; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %f4, %f4;
335; CHECK-SM70-NEXT:    or.b32 %r11, %r7, 4194304;
336; CHECK-SM70-NEXT:    selp.b32 %r12, %r11, %r10, %p1;
337; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r12; }
338; CHECK-SM70-NEXT:    and.b32 %r13, %r12, -65536;
339; CHECK-SM70-NEXT:    mov.b32 %f5, %r13;
340; CHECK-SM70-NEXT:    setp.gt.f32 %p2, %f5, 0f00000000;
341; CHECK-SM70-NEXT:    selp.b16 %rs2, %rs1, 0x0000, %p2;
342; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs2;
343; CHECK-SM70-NEXT:    ret;
344  %1 = fmul bfloat %a, %b
345  %2 = fadd bfloat %1, %c
346  %3 = fcmp ogt bfloat %2, 0.0
347  %4 = select i1 %3, bfloat %2, bfloat 0.0
348  ret bfloat %4
349}
350
351; FMA relu shouldn't be selected if the FMA operation has multiple uses
; Scalar bfloat ReLU pattern (fmul + fadd + fcmp ogt/select against 0.0) where
; the fadd result %2 is also consumed by a second fadd (+7.0).
; None of the runs expect a .relu instruction. Expected shapes differ per run:
;  * default sm_80: fma.rn.bf16 + max.bf16, with the two trailing fadds emitted
;    as fma.rn.bf16 using multiplier 0x3F80 (1.0 as bf16) and addend 0x40E0
;    (7.0 as bf16);
;  * FTZ sm_80: fma.rn.bf16 + max.bf16, with the trailing fadds widened to f32
;    (add.ftz.f32) and converted back with cvt.rn.bf16.f32;
;  * sm_70: everything done in f32 with an integer bfe/add/setp.nan/selp
;    narrowing sequence after each f32 operation.
352define bfloat @fma_bf16_expanded_no_nans_multiple_uses_of_fma(bfloat %a, bfloat %b, bfloat %c) #0 {
353; CHECK-LABEL: fma_bf16_expanded_no_nans_multiple_uses_of_fma(
354; CHECK:       {
355; CHECK-NEXT:    .reg .b16 %rs<11>;
356; CHECK-EMPTY:
357; CHECK-NEXT:  // %bb.0:
358; CHECK-NEXT:    ld.param.b16 %rs1, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_0];
359; CHECK-NEXT:    ld.param.b16 %rs2, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_1];
360; CHECK-NEXT:    ld.param.b16 %rs3, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_2];
361; CHECK-NEXT:    fma.rn.bf16 %rs4, %rs1, %rs2, %rs3;
362; CHECK-NEXT:    mov.b16 %rs5, 0x0000;
363; CHECK-NEXT:    max.bf16 %rs6, %rs4, %rs5;
364; CHECK-NEXT:    mov.b16 %rs7, 0x3F80;
365; CHECK-NEXT:    mov.b16 %rs8, 0x40E0;
366; CHECK-NEXT:    fma.rn.bf16 %rs9, %rs4, %rs7, %rs8;
367; CHECK-NEXT:    fma.rn.bf16 %rs10, %rs6, %rs7, %rs9;
368; CHECK-NEXT:    st.param.b16 [func_retval0], %rs10;
369; CHECK-NEXT:    ret;
370;
371; CHECK-FTZ-LABEL: fma_bf16_expanded_no_nans_multiple_uses_of_fma(
372; CHECK-FTZ:       {
373; CHECK-FTZ-NEXT:    .reg .b16 %rs<9>;
374; CHECK-FTZ-NEXT:    .reg .b32 %r<7>;
375; CHECK-FTZ-NEXT:    .reg .f32 %f<6>;
376; CHECK-FTZ-EMPTY:
377; CHECK-FTZ-NEXT:  // %bb.0:
378; CHECK-FTZ-NEXT:    ld.param.b16 %rs1, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_0];
379; CHECK-FTZ-NEXT:    ld.param.b16 %rs2, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_1];
380; CHECK-FTZ-NEXT:    ld.param.b16 %rs3, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_2];
381; CHECK-FTZ-NEXT:    fma.rn.bf16 %rs4, %rs1, %rs2, %rs3;
382; CHECK-FTZ-NEXT:    mov.b16 %rs5, 0x0000;
383; CHECK-FTZ-NEXT:    max.bf16 %rs6, %rs4, %rs5;
384; CHECK-FTZ-NEXT:    cvt.u32.u16 %r1, %rs4;
385; CHECK-FTZ-NEXT:    shl.b32 %r2, %r1, 16;
386; CHECK-FTZ-NEXT:    mov.b32 %f1, %r2;
387; CHECK-FTZ-NEXT:    add.ftz.f32 %f2, %f1, 0f40E00000;
388; CHECK-FTZ-NEXT:    cvt.rn.bf16.f32 %rs7, %f2;
389; CHECK-FTZ-NEXT:    cvt.u32.u16 %r3, %rs6;
390; CHECK-FTZ-NEXT:    shl.b32 %r4, %r3, 16;
391; CHECK-FTZ-NEXT:    mov.b32 %f3, %r4;
392; CHECK-FTZ-NEXT:    cvt.u32.u16 %r5, %rs7;
393; CHECK-FTZ-NEXT:    shl.b32 %r6, %r5, 16;
394; CHECK-FTZ-NEXT:    mov.b32 %f4, %r6;
395; CHECK-FTZ-NEXT:    add.ftz.f32 %f5, %f3, %f4;
396; CHECK-FTZ-NEXT:    cvt.rn.bf16.f32 %rs8, %f5;
397; CHECK-FTZ-NEXT:    st.param.b16 [func_retval0], %rs8;
398; CHECK-FTZ-NEXT:    ret;
399;
400; CHECK-SM70-LABEL: fma_bf16_expanded_no_nans_multiple_uses_of_fma(
401; CHECK-SM70:       {
402; CHECK-SM70-NEXT:    .reg .pred %p<5>;
403; CHECK-SM70-NEXT:    .reg .b16 %rs<4>;
404; CHECK-SM70-NEXT:    .reg .b32 %r<29>;
405; CHECK-SM70-NEXT:    .reg .f32 %f<10>;
406; CHECK-SM70-EMPTY:
407; CHECK-SM70-NEXT:  // %bb.0:
408; CHECK-SM70-NEXT:    ld.param.u16 %r1, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_2];
409; CHECK-SM70-NEXT:    shl.b32 %r2, %r1, 16;
410; CHECK-SM70-NEXT:    mov.b32 %f1, %r2;
411; CHECK-SM70-NEXT:    ld.param.u16 %r3, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_1];
412; CHECK-SM70-NEXT:    shl.b32 %r4, %r3, 16;
413; CHECK-SM70-NEXT:    mov.b32 %f2, %r4;
414; CHECK-SM70-NEXT:    ld.param.u16 %r5, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_0];
415; CHECK-SM70-NEXT:    shl.b32 %r6, %r5, 16;
416; CHECK-SM70-NEXT:    mov.b32 %f3, %r6;
417; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
418; CHECK-SM70-NEXT:    mov.b32 %r7, %f4;
419; CHECK-SM70-NEXT:    bfe.u32 %r8, %r7, 16, 1;
420; CHECK-SM70-NEXT:    add.s32 %r9, %r8, %r7;
421; CHECK-SM70-NEXT:    add.s32 %r10, %r9, 32767;
422; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %f4, %f4;
423; CHECK-SM70-NEXT:    or.b32 %r11, %r7, 4194304;
424; CHECK-SM70-NEXT:    selp.b32 %r12, %r11, %r10, %p1;
425; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r12; }
426; CHECK-SM70-NEXT:    and.b32 %r13, %r12, -65536;
427; CHECK-SM70-NEXT:    mov.b32 %f5, %r13;
428; CHECK-SM70-NEXT:    setp.gt.f32 %p2, %f5, 0f00000000;
429; CHECK-SM70-NEXT:    selp.b16 %rs2, %rs1, 0x0000, %p2;
430; CHECK-SM70-NEXT:    add.f32 %f6, %f5, 0f40E00000;
431; CHECK-SM70-NEXT:    mov.b32 %r14, %f6;
432; CHECK-SM70-NEXT:    bfe.u32 %r15, %r14, 16, 1;
433; CHECK-SM70-NEXT:    add.s32 %r16, %r15, %r14;
434; CHECK-SM70-NEXT:    add.s32 %r17, %r16, 32767;
435; CHECK-SM70-NEXT:    setp.nan.f32 %p3, %f6, %f6;
436; CHECK-SM70-NEXT:    or.b32 %r18, %r14, 4194304;
437; CHECK-SM70-NEXT:    selp.b32 %r19, %r18, %r17, %p3;
438; CHECK-SM70-NEXT:    cvt.u32.u16 %r20, %rs2;
439; CHECK-SM70-NEXT:    shl.b32 %r21, %r20, 16;
440; CHECK-SM70-NEXT:    mov.b32 %f7, %r21;
441; CHECK-SM70-NEXT:    and.b32 %r22, %r19, -65536;
442; CHECK-SM70-NEXT:    mov.b32 %f8, %r22;
443; CHECK-SM70-NEXT:    add.f32 %f9, %f7, %f8;
444; CHECK-SM70-NEXT:    mov.b32 %r23, %f9;
445; CHECK-SM70-NEXT:    bfe.u32 %r24, %r23, 16, 1;
446; CHECK-SM70-NEXT:    add.s32 %r25, %r24, %r23;
447; CHECK-SM70-NEXT:    add.s32 %r26, %r25, 32767;
448; CHECK-SM70-NEXT:    setp.nan.f32 %p4, %f9, %f9;
449; CHECK-SM70-NEXT:    or.b32 %r27, %r23, 4194304;
450; CHECK-SM70-NEXT:    selp.b32 %r28, %r27, %r26, %p4;
451; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs3}, %r28; }
452; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs3;
453; CHECK-SM70-NEXT:    ret;
454  %1 = fmul bfloat %a, %b
455  %2 = fadd bfloat %1, %c
456  %3 = fcmp ogt bfloat %2, 0.0
457  %4 = select i1 %3, bfloat %2, bfloat 0.0
458  %5 = fadd bfloat %2, 7.0
459  %6 = fadd bfloat %4, %5
460  ret bfloat %6
461}
462
; Scalar bfloat ReLU expressed as llvm.maxnum.bf16(fmul + fadd, 0.0) rather
; than compare/select.
; Both sm_80 runs expect a single fma.rn.relu.bf16 (no .ftz modifier in the
; FTZ run). The sm_70 run expects fma.rn.f32 and max.f32 on widened operands,
; each followed by an integer bfe/add/setp.nan/selp narrowing sequence.
; NOTE(review): attribute group #0 is defined outside this excerpt -- confirm
; it carries the no-NaNs setting the function name implies.
463define bfloat @fma_bf16_expanded_maxnum_no_nans(bfloat %a, bfloat %b, bfloat %c) #0 {
464; CHECK-LABEL: fma_bf16_expanded_maxnum_no_nans(
465; CHECK:       {
466; CHECK-NEXT:    .reg .b16 %rs<5>;
467; CHECK-EMPTY:
468; CHECK-NEXT:  // %bb.0:
469; CHECK-NEXT:    ld.param.b16 %rs1, [fma_bf16_expanded_maxnum_no_nans_param_0];
470; CHECK-NEXT:    ld.param.b16 %rs2, [fma_bf16_expanded_maxnum_no_nans_param_1];
471; CHECK-NEXT:    ld.param.b16 %rs3, [fma_bf16_expanded_maxnum_no_nans_param_2];
472; CHECK-NEXT:    fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3;
473; CHECK-NEXT:    st.param.b16 [func_retval0], %rs4;
474; CHECK-NEXT:    ret;
475;
476; CHECK-FTZ-LABEL: fma_bf16_expanded_maxnum_no_nans(
477; CHECK-FTZ:       {
478; CHECK-FTZ-NEXT:    .reg .b16 %rs<5>;
479; CHECK-FTZ-EMPTY:
480; CHECK-FTZ-NEXT:  // %bb.0:
481; CHECK-FTZ-NEXT:    ld.param.b16 %rs1, [fma_bf16_expanded_maxnum_no_nans_param_0];
482; CHECK-FTZ-NEXT:    ld.param.b16 %rs2, [fma_bf16_expanded_maxnum_no_nans_param_1];
483; CHECK-FTZ-NEXT:    ld.param.b16 %rs3, [fma_bf16_expanded_maxnum_no_nans_param_2];
484; CHECK-FTZ-NEXT:    fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3;
485; CHECK-FTZ-NEXT:    st.param.b16 [func_retval0], %rs4;
486; CHECK-FTZ-NEXT:    ret;
487;
488; CHECK-SM70-LABEL: fma_bf16_expanded_maxnum_no_nans(
489; CHECK-SM70:       {
490; CHECK-SM70-NEXT:    .reg .pred %p<3>;
491; CHECK-SM70-NEXT:    .reg .b16 %rs<2>;
492; CHECK-SM70-NEXT:    .reg .b32 %r<20>;
493; CHECK-SM70-NEXT:    .reg .f32 %f<7>;
494; CHECK-SM70-EMPTY:
495; CHECK-SM70-NEXT:  // %bb.0:
496; CHECK-SM70-NEXT:    ld.param.u16 %r1, [fma_bf16_expanded_maxnum_no_nans_param_2];
497; CHECK-SM70-NEXT:    shl.b32 %r2, %r1, 16;
498; CHECK-SM70-NEXT:    mov.b32 %f1, %r2;
499; CHECK-SM70-NEXT:    ld.param.u16 %r3, [fma_bf16_expanded_maxnum_no_nans_param_1];
500; CHECK-SM70-NEXT:    shl.b32 %r4, %r3, 16;
501; CHECK-SM70-NEXT:    mov.b32 %f2, %r4;
502; CHECK-SM70-NEXT:    ld.param.u16 %r5, [fma_bf16_expanded_maxnum_no_nans_param_0];
503; CHECK-SM70-NEXT:    shl.b32 %r6, %r5, 16;
504; CHECK-SM70-NEXT:    mov.b32 %f3, %r6;
505; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
506; CHECK-SM70-NEXT:    mov.b32 %r7, %f4;
507; CHECK-SM70-NEXT:    bfe.u32 %r8, %r7, 16, 1;
508; CHECK-SM70-NEXT:    add.s32 %r9, %r8, %r7;
509; CHECK-SM70-NEXT:    add.s32 %r10, %r9, 32767;
510; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %f4, %f4;
511; CHECK-SM70-NEXT:    or.b32 %r11, %r7, 4194304;
512; CHECK-SM70-NEXT:    selp.b32 %r12, %r11, %r10, %p1;
513; CHECK-SM70-NEXT:    and.b32 %r13, %r12, -65536;
514; CHECK-SM70-NEXT:    mov.b32 %f5, %r13;
515; CHECK-SM70-NEXT:    max.f32 %f6, %f5, 0f00000000;
516; CHECK-SM70-NEXT:    mov.b32 %r14, %f6;
517; CHECK-SM70-NEXT:    bfe.u32 %r15, %r14, 16, 1;
518; CHECK-SM70-NEXT:    add.s32 %r16, %r15, %r14;
519; CHECK-SM70-NEXT:    add.s32 %r17, %r16, 32767;
520; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %f6, %f6;
521; CHECK-SM70-NEXT:    or.b32 %r18, %r14, 4194304;
522; CHECK-SM70-NEXT:    selp.b32 %r19, %r18, %r17, %p2;
523; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r19; }
524; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs1;
525; CHECK-SM70-NEXT:    ret;
526  %1 = fmul bfloat %a, %b
527  %2 = fadd bfloat %1, %c
528  %3 = call bfloat @llvm.maxnum.bf16(bfloat %2, bfloat 0.0)
529  ret bfloat %3
530}
531
; <2 x half> ReLU pattern: vector fmul + fadd, then a lane-wise
; select(fcmp ogt x, 0.0), x, 0.0.
; The sm_80 runs expect a single packed fma.rn{.ftz}.relu.f16x2; the sm_70 run
; expects fma.rn.f16x2, a packed setp.gt.f16x2 producing two predicates, and
; one selp.b16 per lane before the lanes are repacked into a b32.
; NOTE(review): attribute group #0 is defined outside this excerpt; the
; function name suggests it enables no-NaNs FP math -- confirm.
532define <2 x half> @fma_f16x2_expanded_no_nans(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 {
533; CHECK-LABEL: fma_f16x2_expanded_no_nans(
534; CHECK:       {
535; CHECK-NEXT:    .reg .b32 %r<5>;
536; CHECK-EMPTY:
537; CHECK-NEXT:  // %bb.0:
538; CHECK-NEXT:    ld.param.b32 %r1, [fma_f16x2_expanded_no_nans_param_2];
539; CHECK-NEXT:    ld.param.b32 %r2, [fma_f16x2_expanded_no_nans_param_1];
540; CHECK-NEXT:    ld.param.b32 %r3, [fma_f16x2_expanded_no_nans_param_0];
541; CHECK-NEXT:    fma.rn.relu.f16x2 %r4, %r3, %r2, %r1;
542; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
543; CHECK-NEXT:    ret;
544;
545; CHECK-FTZ-LABEL: fma_f16x2_expanded_no_nans(
546; CHECK-FTZ:       {
547; CHECK-FTZ-NEXT:    .reg .b32 %r<5>;
548; CHECK-FTZ-EMPTY:
549; CHECK-FTZ-NEXT:  // %bb.0:
550; CHECK-FTZ-NEXT:    ld.param.b32 %r1, [fma_f16x2_expanded_no_nans_param_2];
551; CHECK-FTZ-NEXT:    ld.param.b32 %r2, [fma_f16x2_expanded_no_nans_param_1];
552; CHECK-FTZ-NEXT:    ld.param.b32 %r3, [fma_f16x2_expanded_no_nans_param_0];
553; CHECK-FTZ-NEXT:    fma.rn.ftz.relu.f16x2 %r4, %r3, %r2, %r1;
554; CHECK-FTZ-NEXT:    st.param.b32 [func_retval0], %r4;
555; CHECK-FTZ-NEXT:    ret;
556;
557; CHECK-SM70-LABEL: fma_f16x2_expanded_no_nans(
558; CHECK-SM70:       {
559; CHECK-SM70-NEXT:    .reg .pred %p<3>;
560; CHECK-SM70-NEXT:    .reg .b16 %rs<5>;
561; CHECK-SM70-NEXT:    .reg .b32 %r<7>;
562; CHECK-SM70-EMPTY:
563; CHECK-SM70-NEXT:  // %bb.0:
564; CHECK-SM70-NEXT:    ld.param.b32 %r1, [fma_f16x2_expanded_no_nans_param_2];
565; CHECK-SM70-NEXT:    ld.param.b32 %r2, [fma_f16x2_expanded_no_nans_param_1];
566; CHECK-SM70-NEXT:    ld.param.b32 %r3, [fma_f16x2_expanded_no_nans_param_0];
567; CHECK-SM70-NEXT:    fma.rn.f16x2 %r4, %r3, %r2, %r1;
568; CHECK-SM70-NEXT:    mov.b32 %r5, 0;
569; CHECK-SM70-NEXT:    setp.gt.f16x2 %p1|%p2, %r4, %r5;
570; CHECK-SM70-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
571; CHECK-SM70-NEXT:    selp.b16 %rs3, %rs2, 0x0000, %p2;
572; CHECK-SM70-NEXT:    selp.b16 %rs4, %rs1, 0x0000, %p1;
573; CHECK-SM70-NEXT:    mov.b32 %r6, {%rs4, %rs3};
574; CHECK-SM70-NEXT:    st.param.b32 [func_retval0], %r6;
575; CHECK-SM70-NEXT:    ret;
576  %1 = fmul <2 x half> %a, %b
577  %2 = fadd <2 x half> %1, %c
578  %3 = fcmp ogt <2 x half> %2, <half 0.0, half 0.0>
579  %4 = select <2 x i1> %3, <2 x half> %2, <2 x half> <half 0.0, half 0.0>
580  ret <2 x half> %4
581}
582
583; FMA relu shouldn't be selected if the FMA operation has multiple uses
; <2 x half> ReLU pattern (vector fmul + fadd + fcmp ogt/select against 0.0)
; where the fadd result %2 is also consumed by a second fadd of <7.0, 7.0>
; (materialized as 1191200512 = 0x47004700, two packed 0x4700 halves).
; None of the runs expect a .relu instruction: the sm_80 runs expect
; fma.rn{.ftz}.f16x2 followed by a separate max{.ftz}.f16x2, and the sm_70 run
; expects fma.rn.f16x2 followed by setp.gt.f16x2 and per-lane selp.b16.
584define <2 x half> @fma_f16x2_expanded_no_nans_multiple_uses_of_fma(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 {
585; CHECK-LABEL: fma_f16x2_expanded_no_nans_multiple_uses_of_fma(
586; CHECK:       {
587; CHECK-NEXT:    .reg .b32 %r<10>;
588; CHECK-EMPTY:
589; CHECK-NEXT:  // %bb.0:
590; CHECK-NEXT:    ld.param.b32 %r1, [fma_f16x2_expanded_no_nans_multiple_uses_of_fma_param_2];
591; CHECK-NEXT:    ld.param.b32 %r2, [fma_f16x2_expanded_no_nans_multiple_uses_of_fma_param_1];
592; CHECK-NEXT:    ld.param.b32 %r3, [fma_f16x2_expanded_no_nans_multiple_uses_of_fma_param_0];
593; CHECK-NEXT:    fma.rn.f16x2 %r4, %r3, %r2, %r1;
594; CHECK-NEXT:    mov.b32 %r5, 0;
595; CHECK-NEXT:    max.f16x2 %r6, %r4, %r5;
596; CHECK-NEXT:    mov.b32 %r7, 1191200512;
597; CHECK-NEXT:    add.f16x2 %r8, %r4, %r7;
598; CHECK-NEXT:    add.f16x2 %r9, %r6, %r8;
599; CHECK-NEXT:    st.param.b32 [func_retval0], %r9;
600; CHECK-NEXT:    ret;
601;
602; CHECK-FTZ-LABEL: fma_f16x2_expanded_no_nans_multiple_uses_of_fma(
603; CHECK-FTZ:       {
604; CHECK-FTZ-NEXT:    .reg .b32 %r<10>;
605; CHECK-FTZ-EMPTY:
606; CHECK-FTZ-NEXT:  // %bb.0:
607; CHECK-FTZ-NEXT:    ld.param.b32 %r1, [fma_f16x2_expanded_no_nans_multiple_uses_of_fma_param_2];
608; CHECK-FTZ-NEXT:    ld.param.b32 %r2, [fma_f16x2_expanded_no_nans_multiple_uses_of_fma_param_1];
609; CHECK-FTZ-NEXT:    ld.param.b32 %r3, [fma_f16x2_expanded_no_nans_multiple_uses_of_fma_param_0];
610; CHECK-FTZ-NEXT:    fma.rn.ftz.f16x2 %r4, %r3, %r2, %r1;
611; CHECK-FTZ-NEXT:    mov.b32 %r5, 0;
612; CHECK-FTZ-NEXT:    max.ftz.f16x2 %r6, %r4, %r5;
613; CHECK-FTZ-NEXT:    mov.b32 %r7, 1191200512;
614; CHECK-FTZ-NEXT:    add.ftz.f16x2 %r8, %r4, %r7;
615; CHECK-FTZ-NEXT:    add.ftz.f16x2 %r9, %r6, %r8;
616; CHECK-FTZ-NEXT:    st.param.b32 [func_retval0], %r9;
617; CHECK-FTZ-NEXT:    ret;
618;
619; CHECK-SM70-LABEL: fma_f16x2_expanded_no_nans_multiple_uses_of_fma(
620; CHECK-SM70:       {
621; CHECK-SM70-NEXT:    .reg .pred %p<3>;
622; CHECK-SM70-NEXT:    .reg .b16 %rs<5>;
623; CHECK-SM70-NEXT:    .reg .b32 %r<10>;
624; CHECK-SM70-EMPTY:
625; CHECK-SM70-NEXT:  // %bb.0:
626; CHECK-SM70-NEXT:    ld.param.b32 %r1, [fma_f16x2_expanded_no_nans_multiple_uses_of_fma_param_2];
627; CHECK-SM70-NEXT:    ld.param.b32 %r2, [fma_f16x2_expanded_no_nans_multiple_uses_of_fma_param_1];
628; CHECK-SM70-NEXT:    ld.param.b32 %r3, [fma_f16x2_expanded_no_nans_multiple_uses_of_fma_param_0];
629; CHECK-SM70-NEXT:    fma.rn.f16x2 %r4, %r3, %r2, %r1;
630; CHECK-SM70-NEXT:    mov.b32 %r5, 0;
631; CHECK-SM70-NEXT:    setp.gt.f16x2 %p1|%p2, %r4, %r5;
632; CHECK-SM70-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
633; CHECK-SM70-NEXT:    selp.b16 %rs3, %rs2, 0x0000, %p2;
634; CHECK-SM70-NEXT:    selp.b16 %rs4, %rs1, 0x0000, %p1;
635; CHECK-SM70-NEXT:    mov.b32 %r6, {%rs4, %rs3};
636; CHECK-SM70-NEXT:    mov.b32 %r7, 1191200512;
637; CHECK-SM70-NEXT:    add.f16x2 %r8, %r4, %r7;
638; CHECK-SM70-NEXT:    add.f16x2 %r9, %r6, %r8;
639; CHECK-SM70-NEXT:    st.param.b32 [func_retval0], %r9;
640; CHECK-SM70-NEXT:    ret;
641  %1 = fmul <2 x half> %a, %b
642  %2 = fadd <2 x half> %1, %c
643  %3 = fcmp ogt <2 x half> %2, <half 0.0, half 0.0>
644  %4 = select <2 x i1> %3, <2 x half> %2, <2 x half> <half 0.0, half 0.0>
645  %5 = fadd <2 x half> %2, <half 7.0, half 7.0>
646  %6 = fadd <2 x half> %4, %5
647  ret <2 x half> %6
648}
649
; <2 x half> ReLU pattern (vector fmul + fadd + fcmp ogt/select against 0.0)
; compiled under attribute group #1 instead of #0.
; The sm_80 runs expect an un-fused fma.rn{.ftz}.f16x2 followed by
; max{.ftz}.f16x2 (no .relu form); the sm_70 run expects fma.rn.f16x2,
; setp.gt.f16x2 and one selp.b16 per lane.
; NOTE(review): attribute group #1 is defined outside this excerpt; the
; function name suggests NaNs are not excluded there -- confirm.
650define <2 x half> @fma_f16x2_expanded_unsafe_with_nans(<2 x half> %a, <2 x half> %b, <2 x half> %c) #1 {
651; CHECK-LABEL: fma_f16x2_expanded_unsafe_with_nans(
652; CHECK:       {
653; CHECK-NEXT:    .reg .b32 %r<7>;
654; CHECK-EMPTY:
655; CHECK-NEXT:  // %bb.0:
656; CHECK-NEXT:    ld.param.b32 %r1, [fma_f16x2_expanded_unsafe_with_nans_param_2];
657; CHECK-NEXT:    ld.param.b32 %r2, [fma_f16x2_expanded_unsafe_with_nans_param_1];
658; CHECK-NEXT:    ld.param.b32 %r3, [fma_f16x2_expanded_unsafe_with_nans_param_0];
659; CHECK-NEXT:    fma.rn.f16x2 %r4, %r3, %r2, %r1;
660; CHECK-NEXT:    mov.b32 %r5, 0;
661; CHECK-NEXT:    max.f16x2 %r6, %r4, %r5;
662; CHECK-NEXT:    st.param.b32 [func_retval0], %r6;
663; CHECK-NEXT:    ret;
664;
665; CHECK-FTZ-LABEL: fma_f16x2_expanded_unsafe_with_nans(
666; CHECK-FTZ:       {
667; CHECK-FTZ-NEXT:    .reg .b32 %r<7>;
668; CHECK-FTZ-EMPTY:
669; CHECK-FTZ-NEXT:  // %bb.0:
670; CHECK-FTZ-NEXT:    ld.param.b32 %r1, [fma_f16x2_expanded_unsafe_with_nans_param_2];
671; CHECK-FTZ-NEXT:    ld.param.b32 %r2, [fma_f16x2_expanded_unsafe_with_nans_param_1];
672; CHECK-FTZ-NEXT:    ld.param.b32 %r3, [fma_f16x2_expanded_unsafe_with_nans_param_0];
673; CHECK-FTZ-NEXT:    fma.rn.ftz.f16x2 %r4, %r3, %r2, %r1;
674; CHECK-FTZ-NEXT:    mov.b32 %r5, 0;
675; CHECK-FTZ-NEXT:    max.ftz.f16x2 %r6, %r4, %r5;
676; CHECK-FTZ-NEXT:    st.param.b32 [func_retval0], %r6;
677; CHECK-FTZ-NEXT:    ret;
678;
679; CHECK-SM70-LABEL: fma_f16x2_expanded_unsafe_with_nans(
680; CHECK-SM70:       {
681; CHECK-SM70-NEXT:    .reg .pred %p<3>;
682; CHECK-SM70-NEXT:    .reg .b16 %rs<5>;
683; CHECK-SM70-NEXT:    .reg .b32 %r<7>;
684; CHECK-SM70-EMPTY:
685; CHECK-SM70-NEXT:  // %bb.0:
686; CHECK-SM70-NEXT:    ld.param.b32 %r1, [fma_f16x2_expanded_unsafe_with_nans_param_2];
687; CHECK-SM70-NEXT:    ld.param.b32 %r2, [fma_f16x2_expanded_unsafe_with_nans_param_1];
688; CHECK-SM70-NEXT:    ld.param.b32 %r3, [fma_f16x2_expanded_unsafe_with_nans_param_0];
689; CHECK-SM70-NEXT:    fma.rn.f16x2 %r4, %r3, %r2, %r1;
690; CHECK-SM70-NEXT:    mov.b32 %r5, 0;
691; CHECK-SM70-NEXT:    setp.gt.f16x2 %p1|%p2, %r4, %r5;
692; CHECK-SM70-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
693; CHECK-SM70-NEXT:    selp.b16 %rs3, %rs2, 0x0000, %p2;
694; CHECK-SM70-NEXT:    selp.b16 %rs4, %rs1, 0x0000, %p1;
695; CHECK-SM70-NEXT:    mov.b32 %r6, {%rs4, %rs3};
696; CHECK-SM70-NEXT:    st.param.b32 [func_retval0], %r6;
697; CHECK-SM70-NEXT:    ret;
698  %1 = fmul <2 x half> %a, %b
699  %2 = fadd <2 x half> %1, %c
700  %3 = fcmp ogt <2 x half> %2, <half 0.0, half 0.0>
701  %4 = select <2 x i1> %3, <2 x half> %2, <2 x half> <half 0.0, half 0.0>
702  ret <2 x half> %4
703}
704
; Packed-half relu written as fmul + fadd + llvm.maxnum against zero, built
; with attribute group #0 (no-NaNs, no-signed-zeros, unsafe FP math).
; Expectations:
;   * sm_80 (default and FTZ runs): a single fma.rn{.ftz}.relu.f16x2.
;   * sm_70: fma.rn.f16x2, then each lane is converted to f32, clamped with
;     max.f32 against 0, and converted back to f16.
705define <2 x half> @fma_f16x2_expanded_maxnum_no_nans(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 {
706; CHECK-LABEL: fma_f16x2_expanded_maxnum_no_nans(
707; CHECK:       {
708; CHECK-NEXT:    .reg .b32 %r<5>;
709; CHECK-EMPTY:
710; CHECK-NEXT:  // %bb.0:
711; CHECK-NEXT:    ld.param.b32 %r1, [fma_f16x2_expanded_maxnum_no_nans_param_2];
712; CHECK-NEXT:    ld.param.b32 %r2, [fma_f16x2_expanded_maxnum_no_nans_param_1];
713; CHECK-NEXT:    ld.param.b32 %r3, [fma_f16x2_expanded_maxnum_no_nans_param_0];
714; CHECK-NEXT:    fma.rn.relu.f16x2 %r4, %r3, %r2, %r1;
715; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
716; CHECK-NEXT:    ret;
717;
718; CHECK-FTZ-LABEL: fma_f16x2_expanded_maxnum_no_nans(
719; CHECK-FTZ:       {
720; CHECK-FTZ-NEXT:    .reg .b32 %r<5>;
721; CHECK-FTZ-EMPTY:
722; CHECK-FTZ-NEXT:  // %bb.0:
723; CHECK-FTZ-NEXT:    ld.param.b32 %r1, [fma_f16x2_expanded_maxnum_no_nans_param_2];
724; CHECK-FTZ-NEXT:    ld.param.b32 %r2, [fma_f16x2_expanded_maxnum_no_nans_param_1];
725; CHECK-FTZ-NEXT:    ld.param.b32 %r3, [fma_f16x2_expanded_maxnum_no_nans_param_0];
726; CHECK-FTZ-NEXT:    fma.rn.ftz.relu.f16x2 %r4, %r3, %r2, %r1;
727; CHECK-FTZ-NEXT:    st.param.b32 [func_retval0], %r4;
728; CHECK-FTZ-NEXT:    ret;
729;
730; CHECK-SM70-LABEL: fma_f16x2_expanded_maxnum_no_nans(
731; CHECK-SM70:       {
732; CHECK-SM70-NEXT:    .reg .b16 %rs<5>;
733; CHECK-SM70-NEXT:    .reg .b32 %r<6>;
734; CHECK-SM70-NEXT:    .reg .f32 %f<5>;
735; CHECK-SM70-EMPTY:
736; CHECK-SM70-NEXT:  // %bb.0:
737; CHECK-SM70-NEXT:    ld.param.b32 %r1, [fma_f16x2_expanded_maxnum_no_nans_param_2];
738; CHECK-SM70-NEXT:    ld.param.b32 %r2, [fma_f16x2_expanded_maxnum_no_nans_param_1];
739; CHECK-SM70-NEXT:    ld.param.b32 %r3, [fma_f16x2_expanded_maxnum_no_nans_param_0];
740; CHECK-SM70-NEXT:    fma.rn.f16x2 %r4, %r3, %r2, %r1;
741; CHECK-SM70-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
742; CHECK-SM70-NEXT:    cvt.f32.f16 %f1, %rs2;
743; CHECK-SM70-NEXT:    max.f32 %f2, %f1, 0f00000000;
744; CHECK-SM70-NEXT:    cvt.rn.f16.f32 %rs3, %f2;
745; CHECK-SM70-NEXT:    cvt.f32.f16 %f3, %rs1;
746; CHECK-SM70-NEXT:    max.f32 %f4, %f3, 0f00000000;
747; CHECK-SM70-NEXT:    cvt.rn.f16.f32 %rs4, %f4;
748; CHECK-SM70-NEXT:    mov.b32 %r5, {%rs4, %rs3};
749; CHECK-SM70-NEXT:    st.param.b32 [func_retval0], %r5;
750; CHECK-SM70-NEXT:    ret;
; Input pattern: maxnum(a * b + c, 0) per lane.
751  %1 = fmul <2 x half> %a, %b
752  %2 = fadd <2 x half> %1, %c
753  %3 = call <2 x half> @llvm.maxnum.f16x2(<2 x half> %2, <2 x half> <half 0.0, half 0.0>)
754  ret <2 x half> %3
755}
756
; Packed-bfloat relu written as fmul + fadd + (fcmp ogt 0 / select), built
; with attribute group #1, which does not set "no-nans-fp-math".
; Expectations:
;   * sm_80 (default and FTZ runs): fma.rn.bf16x2 followed by max.bf16x2
;     against a zero register; the FTZ run expects the same instructions with
;     no .ftz modifier, and neither run expects an fma...relu form.
;   * sm_70: no bf16 arithmetic instructions are expected. Each lane is moved
;     into an f32 register via a 16-bit left shift, combined with fma.rn.f32,
;     narrowed again with an integer bfe/add/or/selp sequence, and finally
;     clamped with setp.gt.f32 + selp.b16.
757define <2 x bfloat> @fma_bf16x2_expanded_unsafe_with_nans(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) #1 {
758; CHECK-LABEL: fma_bf16x2_expanded_unsafe_with_nans(
759; CHECK:       {
760; CHECK-NEXT:    .reg .b32 %r<7>;
761; CHECK-EMPTY:
762; CHECK-NEXT:  // %bb.0:
763; CHECK-NEXT:    ld.param.b32 %r1, [fma_bf16x2_expanded_unsafe_with_nans_param_2];
764; CHECK-NEXT:    ld.param.b32 %r2, [fma_bf16x2_expanded_unsafe_with_nans_param_1];
765; CHECK-NEXT:    ld.param.b32 %r3, [fma_bf16x2_expanded_unsafe_with_nans_param_0];
766; CHECK-NEXT:    fma.rn.bf16x2 %r4, %r3, %r2, %r1;
767; CHECK-NEXT:    mov.b32 %r5, 0;
768; CHECK-NEXT:    max.bf16x2 %r6, %r4, %r5;
769; CHECK-NEXT:    st.param.b32 [func_retval0], %r6;
770; CHECK-NEXT:    ret;
771;
772; CHECK-FTZ-LABEL: fma_bf16x2_expanded_unsafe_with_nans(
773; CHECK-FTZ:       {
774; CHECK-FTZ-NEXT:    .reg .b32 %r<7>;
775; CHECK-FTZ-EMPTY:
776; CHECK-FTZ-NEXT:  // %bb.0:
777; CHECK-FTZ-NEXT:    ld.param.b32 %r1, [fma_bf16x2_expanded_unsafe_with_nans_param_2];
778; CHECK-FTZ-NEXT:    ld.param.b32 %r2, [fma_bf16x2_expanded_unsafe_with_nans_param_1];
779; CHECK-FTZ-NEXT:    ld.param.b32 %r3, [fma_bf16x2_expanded_unsafe_with_nans_param_0];
780; CHECK-FTZ-NEXT:    fma.rn.bf16x2 %r4, %r3, %r2, %r1;
781; CHECK-FTZ-NEXT:    mov.b32 %r5, 0;
782; CHECK-FTZ-NEXT:    max.bf16x2 %r6, %r4, %r5;
783; CHECK-FTZ-NEXT:    st.param.b32 [func_retval0], %r6;
784; CHECK-FTZ-NEXT:    ret;
785;
786; CHECK-SM70-LABEL: fma_bf16x2_expanded_unsafe_with_nans(
787; CHECK-SM70:       {
788; CHECK-SM70-NEXT:    .reg .pred %p<5>;
789; CHECK-SM70-NEXT:    .reg .b16 %rs<11>;
790; CHECK-SM70-NEXT:    .reg .b32 %r<31>;
791; CHECK-SM70-NEXT:    .reg .f32 %f<11>;
792; CHECK-SM70-EMPTY:
793; CHECK-SM70-NEXT:  // %bb.0:
794; CHECK-SM70-NEXT:    ld.param.b32 %r1, [fma_bf16x2_expanded_unsafe_with_nans_param_0];
795; CHECK-SM70-NEXT:    ld.param.b32 %r2, [fma_bf16x2_expanded_unsafe_with_nans_param_1];
796; CHECK-SM70-NEXT:    ld.param.b32 %r3, [fma_bf16x2_expanded_unsafe_with_nans_param_2];
797; CHECK-SM70-NEXT:    mov.b32 {%rs1, %rs2}, %r3;
798; CHECK-SM70-NEXT:    cvt.u32.u16 %r4, %rs1;
799; CHECK-SM70-NEXT:    shl.b32 %r5, %r4, 16;
800; CHECK-SM70-NEXT:    mov.b32 %f1, %r5;
801; CHECK-SM70-NEXT:    mov.b32 {%rs3, %rs4}, %r2;
802; CHECK-SM70-NEXT:    cvt.u32.u16 %r6, %rs3;
803; CHECK-SM70-NEXT:    shl.b32 %r7, %r6, 16;
804; CHECK-SM70-NEXT:    mov.b32 %f2, %r7;
805; CHECK-SM70-NEXT:    mov.b32 {%rs5, %rs6}, %r1;
806; CHECK-SM70-NEXT:    cvt.u32.u16 %r8, %rs5;
807; CHECK-SM70-NEXT:    shl.b32 %r9, %r8, 16;
808; CHECK-SM70-NEXT:    mov.b32 %f3, %r9;
809; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
810; CHECK-SM70-NEXT:    mov.b32 %r10, %f4;
811; CHECK-SM70-NEXT:    bfe.u32 %r11, %r10, 16, 1;
812; CHECK-SM70-NEXT:    add.s32 %r12, %r11, %r10;
813; CHECK-SM70-NEXT:    add.s32 %r13, %r12, 32767;
814; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %f4, %f4;
815; CHECK-SM70-NEXT:    or.b32 %r14, %r10, 4194304;
816; CHECK-SM70-NEXT:    selp.b32 %r15, %r14, %r13, %p1;
817; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r15; }
818; CHECK-SM70-NEXT:    cvt.u32.u16 %r16, %rs2;
819; CHECK-SM70-NEXT:    shl.b32 %r17, %r16, 16;
820; CHECK-SM70-NEXT:    mov.b32 %f5, %r17;
821; CHECK-SM70-NEXT:    cvt.u32.u16 %r18, %rs4;
822; CHECK-SM70-NEXT:    shl.b32 %r19, %r18, 16;
823; CHECK-SM70-NEXT:    mov.b32 %f6, %r19;
824; CHECK-SM70-NEXT:    cvt.u32.u16 %r20, %rs6;
825; CHECK-SM70-NEXT:    shl.b32 %r21, %r20, 16;
826; CHECK-SM70-NEXT:    mov.b32 %f7, %r21;
827; CHECK-SM70-NEXT:    fma.rn.f32 %f8, %f7, %f6, %f5;
828; CHECK-SM70-NEXT:    mov.b32 %r22, %f8;
829; CHECK-SM70-NEXT:    bfe.u32 %r23, %r22, 16, 1;
830; CHECK-SM70-NEXT:    add.s32 %r24, %r23, %r22;
831; CHECK-SM70-NEXT:    add.s32 %r25, %r24, 32767;
832; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %f8, %f8;
833; CHECK-SM70-NEXT:    or.b32 %r26, %r22, 4194304;
834; CHECK-SM70-NEXT:    selp.b32 %r27, %r26, %r25, %p2;
835; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r27; }
836; CHECK-SM70-NEXT:    and.b32 %r28, %r15, -65536;
837; CHECK-SM70-NEXT:    mov.b32 %f9, %r28;
838; CHECK-SM70-NEXT:    setp.gt.f32 %p3, %f9, 0f00000000;
839; CHECK-SM70-NEXT:    and.b32 %r29, %r27, -65536;
840; CHECK-SM70-NEXT:    mov.b32 %f10, %r29;
841; CHECK-SM70-NEXT:    setp.gt.f32 %p4, %f10, 0f00000000;
842; CHECK-SM70-NEXT:    selp.b16 %rs9, %rs8, 0x0000, %p4;
843; CHECK-SM70-NEXT:    selp.b16 %rs10, %rs7, 0x0000, %p3;
844; CHECK-SM70-NEXT:    mov.b32 %r30, {%rs10, %rs9};
845; CHECK-SM70-NEXT:    st.param.b32 [func_retval0], %r30;
846; CHECK-SM70-NEXT:    ret;
; Input pattern: %2 = a * b + c, result = (%2 > 0) ? %2 : 0, per lane.
847  %1 = fmul <2 x bfloat> %a, %b
848  %2 = fadd <2 x bfloat> %1, %c
849  %3 = fcmp ogt <2 x bfloat> %2, <bfloat 0.0, bfloat 0.0>
850  %4 = select <2 x i1> %3, <2 x bfloat> %2, <2 x bfloat> <bfloat 0.0, bfloat 0.0>
851  ret <2 x bfloat> %4
852}
853
; Packed-bfloat relu written as fmul + fadd + (fcmp ogt 0 / select), built
; with attribute group #0 (no-NaNs, no-signed-zeros, unsafe FP math).
; Expectations:
;   * sm_80 (default and FTZ runs): a single fma.rn.relu.bf16x2; the FTZ run
;     expects the same instruction, without a .ftz modifier.
;   * sm_70: no relu form. Each lane is computed with fma.rn.f32 on values
;     shifted into f32 registers, narrowed with an integer bfe/add/or/selp
;     sequence, then clamped with setp.gt.f32 + selp.b16.
854define <2 x bfloat> @fma_bf16x2_expanded_no_nans(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) #0 {
855; CHECK-LABEL: fma_bf16x2_expanded_no_nans(
856; CHECK:       {
857; CHECK-NEXT:    .reg .b32 %r<5>;
858; CHECK-EMPTY:
859; CHECK-NEXT:  // %bb.0:
860; CHECK-NEXT:    ld.param.b32 %r1, [fma_bf16x2_expanded_no_nans_param_2];
861; CHECK-NEXT:    ld.param.b32 %r2, [fma_bf16x2_expanded_no_nans_param_1];
862; CHECK-NEXT:    ld.param.b32 %r3, [fma_bf16x2_expanded_no_nans_param_0];
863; CHECK-NEXT:    fma.rn.relu.bf16x2 %r4, %r3, %r2, %r1;
864; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
865; CHECK-NEXT:    ret;
866;
867; CHECK-FTZ-LABEL: fma_bf16x2_expanded_no_nans(
868; CHECK-FTZ:       {
869; CHECK-FTZ-NEXT:    .reg .b32 %r<5>;
870; CHECK-FTZ-EMPTY:
871; CHECK-FTZ-NEXT:  // %bb.0:
872; CHECK-FTZ-NEXT:    ld.param.b32 %r1, [fma_bf16x2_expanded_no_nans_param_2];
873; CHECK-FTZ-NEXT:    ld.param.b32 %r2, [fma_bf16x2_expanded_no_nans_param_1];
874; CHECK-FTZ-NEXT:    ld.param.b32 %r3, [fma_bf16x2_expanded_no_nans_param_0];
875; CHECK-FTZ-NEXT:    fma.rn.relu.bf16x2 %r4, %r3, %r2, %r1;
876; CHECK-FTZ-NEXT:    st.param.b32 [func_retval0], %r4;
877; CHECK-FTZ-NEXT:    ret;
878;
879; CHECK-SM70-LABEL: fma_bf16x2_expanded_no_nans(
880; CHECK-SM70:       {
881; CHECK-SM70-NEXT:    .reg .pred %p<5>;
882; CHECK-SM70-NEXT:    .reg .b16 %rs<11>;
883; CHECK-SM70-NEXT:    .reg .b32 %r<31>;
884; CHECK-SM70-NEXT:    .reg .f32 %f<11>;
885; CHECK-SM70-EMPTY:
886; CHECK-SM70-NEXT:  // %bb.0:
887; CHECK-SM70-NEXT:    ld.param.b32 %r1, [fma_bf16x2_expanded_no_nans_param_0];
888; CHECK-SM70-NEXT:    ld.param.b32 %r2, [fma_bf16x2_expanded_no_nans_param_1];
889; CHECK-SM70-NEXT:    ld.param.b32 %r3, [fma_bf16x2_expanded_no_nans_param_2];
890; CHECK-SM70-NEXT:    mov.b32 {%rs1, %rs2}, %r3;
891; CHECK-SM70-NEXT:    cvt.u32.u16 %r4, %rs1;
892; CHECK-SM70-NEXT:    shl.b32 %r5, %r4, 16;
893; CHECK-SM70-NEXT:    mov.b32 %f1, %r5;
894; CHECK-SM70-NEXT:    mov.b32 {%rs3, %rs4}, %r2;
895; CHECK-SM70-NEXT:    cvt.u32.u16 %r6, %rs3;
896; CHECK-SM70-NEXT:    shl.b32 %r7, %r6, 16;
897; CHECK-SM70-NEXT:    mov.b32 %f2, %r7;
898; CHECK-SM70-NEXT:    mov.b32 {%rs5, %rs6}, %r1;
899; CHECK-SM70-NEXT:    cvt.u32.u16 %r8, %rs5;
900; CHECK-SM70-NEXT:    shl.b32 %r9, %r8, 16;
901; CHECK-SM70-NEXT:    mov.b32 %f3, %r9;
902; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
903; CHECK-SM70-NEXT:    mov.b32 %r10, %f4;
904; CHECK-SM70-NEXT:    bfe.u32 %r11, %r10, 16, 1;
905; CHECK-SM70-NEXT:    add.s32 %r12, %r11, %r10;
906; CHECK-SM70-NEXT:    add.s32 %r13, %r12, 32767;
907; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %f4, %f4;
908; CHECK-SM70-NEXT:    or.b32 %r14, %r10, 4194304;
909; CHECK-SM70-NEXT:    selp.b32 %r15, %r14, %r13, %p1;
910; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r15; }
911; CHECK-SM70-NEXT:    cvt.u32.u16 %r16, %rs2;
912; CHECK-SM70-NEXT:    shl.b32 %r17, %r16, 16;
913; CHECK-SM70-NEXT:    mov.b32 %f5, %r17;
914; CHECK-SM70-NEXT:    cvt.u32.u16 %r18, %rs4;
915; CHECK-SM70-NEXT:    shl.b32 %r19, %r18, 16;
916; CHECK-SM70-NEXT:    mov.b32 %f6, %r19;
917; CHECK-SM70-NEXT:    cvt.u32.u16 %r20, %rs6;
918; CHECK-SM70-NEXT:    shl.b32 %r21, %r20, 16;
919; CHECK-SM70-NEXT:    mov.b32 %f7, %r21;
920; CHECK-SM70-NEXT:    fma.rn.f32 %f8, %f7, %f6, %f5;
921; CHECK-SM70-NEXT:    mov.b32 %r22, %f8;
922; CHECK-SM70-NEXT:    bfe.u32 %r23, %r22, 16, 1;
923; CHECK-SM70-NEXT:    add.s32 %r24, %r23, %r22;
924; CHECK-SM70-NEXT:    add.s32 %r25, %r24, 32767;
925; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %f8, %f8;
926; CHECK-SM70-NEXT:    or.b32 %r26, %r22, 4194304;
927; CHECK-SM70-NEXT:    selp.b32 %r27, %r26, %r25, %p2;
928; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r27; }
929; CHECK-SM70-NEXT:    and.b32 %r28, %r15, -65536;
930; CHECK-SM70-NEXT:    mov.b32 %f9, %r28;
931; CHECK-SM70-NEXT:    setp.gt.f32 %p3, %f9, 0f00000000;
932; CHECK-SM70-NEXT:    and.b32 %r29, %r27, -65536;
933; CHECK-SM70-NEXT:    mov.b32 %f10, %r29;
934; CHECK-SM70-NEXT:    setp.gt.f32 %p4, %f10, 0f00000000;
935; CHECK-SM70-NEXT:    selp.b16 %rs9, %rs8, 0x0000, %p4;
936; CHECK-SM70-NEXT:    selp.b16 %rs10, %rs7, 0x0000, %p3;
937; CHECK-SM70-NEXT:    mov.b32 %r30, {%rs10, %rs9};
938; CHECK-SM70-NEXT:    st.param.b32 [func_retval0], %r30;
939; CHECK-SM70-NEXT:    ret;
; Input pattern: %2 = a * b + c, result = (%2 > 0) ? %2 : 0, per lane.
940  %1 = fmul <2 x bfloat> %a, %b
941  %2 = fadd <2 x bfloat> %1, %c
942  %3 = fcmp ogt <2 x bfloat> %2, <bfloat 0.0, bfloat 0.0>
943  %4 = select <2 x i1> %3, <2 x bfloat> %2, <2 x bfloat> <bfloat 0.0, bfloat 0.0>
944  ret <2 x bfloat> %4
945}
946
947; FMA relu shouldn't be selected if the FMA operation has multiple uses
; Here the multiply-add result %2 feeds both the relu select and a separate
; fadd with 7.0, so it is used twice. Built with attribute group #0.
; Expectations:
;   * sm_80 default run: fma.rn.bf16x2 followed by a separate max.bf16x2;
;     the two trailing additions are also emitted as fma.rn.bf16x2.
;   * sm_80 FTZ run: the same fma.rn.bf16x2 + max.bf16x2 pair, with the
;     trailing additions done per lane in f32 using add.ftz.f32.
;   * sm_70: everything is computed per lane in f32 (fma.rn.f32 / add.f32)
;     with integer sequences narrowing the results; no relu form appears.
948define <2 x bfloat> @fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) #0 {
949; CHECK-LABEL: fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(
950; CHECK:       {
951; CHECK-NEXT:    .reg .b32 %r<11>;
952; CHECK-EMPTY:
953; CHECK-NEXT:  // %bb.0:
954; CHECK-NEXT:    ld.param.b32 %r1, [fma_bf16x2_expanded_no_nans_multiple_uses_of_fma_param_2];
955; CHECK-NEXT:    ld.param.b32 %r2, [fma_bf16x2_expanded_no_nans_multiple_uses_of_fma_param_1];
956; CHECK-NEXT:    ld.param.b32 %r3, [fma_bf16x2_expanded_no_nans_multiple_uses_of_fma_param_0];
957; CHECK-NEXT:    fma.rn.bf16x2 %r4, %r3, %r2, %r1;
958; CHECK-NEXT:    mov.b32 %r5, 0;
959; CHECK-NEXT:    max.bf16x2 %r6, %r4, %r5;
960; CHECK-NEXT:    mov.b32 %r7, 1065369472;
961; CHECK-NEXT:    mov.b32 %r8, 1088438496;
962; CHECK-NEXT:    fma.rn.bf16x2 %r9, %r4, %r7, %r8;
963; CHECK-NEXT:    fma.rn.bf16x2 %r10, %r6, %r7, %r9;
964; CHECK-NEXT:    st.param.b32 [func_retval0], %r10;
965; CHECK-NEXT:    ret;
966;
967; CHECK-FTZ-LABEL: fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(
968; CHECK-FTZ:       {
969; CHECK-FTZ-NEXT:    .reg .b16 %rs<7>;
970; CHECK-FTZ-NEXT:    .reg .b32 %r<20>;
971; CHECK-FTZ-NEXT:    .reg .f32 %f<11>;
972; CHECK-FTZ-EMPTY:
973; CHECK-FTZ-NEXT:  // %bb.0:
974; CHECK-FTZ-NEXT:    ld.param.b32 %r1, [fma_bf16x2_expanded_no_nans_multiple_uses_of_fma_param_2];
975; CHECK-FTZ-NEXT:    ld.param.b32 %r2, [fma_bf16x2_expanded_no_nans_multiple_uses_of_fma_param_1];
976; CHECK-FTZ-NEXT:    ld.param.b32 %r3, [fma_bf16x2_expanded_no_nans_multiple_uses_of_fma_param_0];
977; CHECK-FTZ-NEXT:    fma.rn.bf16x2 %r4, %r3, %r2, %r1;
978; CHECK-FTZ-NEXT:    mov.b32 %r5, 0;
979; CHECK-FTZ-NEXT:    max.bf16x2 %r6, %r4, %r5;
980; CHECK-FTZ-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
981; CHECK-FTZ-NEXT:    cvt.u32.u16 %r7, %rs2;
982; CHECK-FTZ-NEXT:    shl.b32 %r8, %r7, 16;
983; CHECK-FTZ-NEXT:    mov.b32 %f1, %r8;
984; CHECK-FTZ-NEXT:    add.ftz.f32 %f2, %f1, 0f40E00000;
985; CHECK-FTZ-NEXT:    cvt.rn.bf16.f32 %rs3, %f2;
986; CHECK-FTZ-NEXT:    cvt.u32.u16 %r9, %rs1;
987; CHECK-FTZ-NEXT:    shl.b32 %r10, %r9, 16;
988; CHECK-FTZ-NEXT:    mov.b32 %f3, %r10;
989; CHECK-FTZ-NEXT:    add.ftz.f32 %f4, %f3, 0f40E00000;
990; CHECK-FTZ-NEXT:    cvt.rn.bf16.f32 %rs4, %f4;
991; CHECK-FTZ-NEXT:    mov.b32 {%rs5, %rs6}, %r6;
992; CHECK-FTZ-NEXT:    cvt.u32.u16 %r11, %rs5;
993; CHECK-FTZ-NEXT:    shl.b32 %r12, %r11, 16;
994; CHECK-FTZ-NEXT:    mov.b32 %f5, %r12;
995; CHECK-FTZ-NEXT:    cvt.u32.u16 %r13, %rs4;
996; CHECK-FTZ-NEXT:    shl.b32 %r14, %r13, 16;
997; CHECK-FTZ-NEXT:    mov.b32 %f6, %r14;
998; CHECK-FTZ-NEXT:    add.ftz.f32 %f7, %f5, %f6;
999; CHECK-FTZ-NEXT:    cvt.u32.u16 %r15, %rs6;
1000; CHECK-FTZ-NEXT:    shl.b32 %r16, %r15, 16;
1001; CHECK-FTZ-NEXT:    mov.b32 %f8, %r16;
1002; CHECK-FTZ-NEXT:    cvt.u32.u16 %r17, %rs3;
1003; CHECK-FTZ-NEXT:    shl.b32 %r18, %r17, 16;
1004; CHECK-FTZ-NEXT:    mov.b32 %f9, %r18;
1005; CHECK-FTZ-NEXT:    add.ftz.f32 %f10, %f8, %f9;
1006; CHECK-FTZ-NEXT:    cvt.rn.bf16x2.f32 %r19, %f10, %f7;
1007; CHECK-FTZ-NEXT:    st.param.b32 [func_retval0], %r19;
1008; CHECK-FTZ-NEXT:    ret;
1009;
1010; CHECK-SM70-LABEL: fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(
1011; CHECK-SM70:       {
1012; CHECK-SM70-NEXT:    .reg .pred %p<9>;
1013; CHECK-SM70-NEXT:    .reg .b16 %rs<11>;
1014; CHECK-SM70-NEXT:    .reg .b32 %r<61>;
1015; CHECK-SM70-NEXT:    .reg .f32 %f<19>;
1016; CHECK-SM70-EMPTY:
1017; CHECK-SM70-NEXT:  // %bb.0:
1018; CHECK-SM70-NEXT:    ld.param.b32 %r1, [fma_bf16x2_expanded_no_nans_multiple_uses_of_fma_param_0];
1019; CHECK-SM70-NEXT:    ld.param.b32 %r2, [fma_bf16x2_expanded_no_nans_multiple_uses_of_fma_param_1];
1020; CHECK-SM70-NEXT:    ld.param.b32 %r3, [fma_bf16x2_expanded_no_nans_multiple_uses_of_fma_param_2];
1021; CHECK-SM70-NEXT:    mov.b32 {%rs1, %rs2}, %r3;
1022; CHECK-SM70-NEXT:    cvt.u32.u16 %r4, %rs2;
1023; CHECK-SM70-NEXT:    shl.b32 %r5, %r4, 16;
1024; CHECK-SM70-NEXT:    mov.b32 %f1, %r5;
1025; CHECK-SM70-NEXT:    mov.b32 {%rs3, %rs4}, %r2;
1026; CHECK-SM70-NEXT:    cvt.u32.u16 %r6, %rs4;
1027; CHECK-SM70-NEXT:    shl.b32 %r7, %r6, 16;
1028; CHECK-SM70-NEXT:    mov.b32 %f2, %r7;
1029; CHECK-SM70-NEXT:    mov.b32 {%rs5, %rs6}, %r1;
1030; CHECK-SM70-NEXT:    cvt.u32.u16 %r8, %rs6;
1031; CHECK-SM70-NEXT:    shl.b32 %r9, %r8, 16;
1032; CHECK-SM70-NEXT:    mov.b32 %f3, %r9;
1033; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
1034; CHECK-SM70-NEXT:    mov.b32 %r10, %f4;
1035; CHECK-SM70-NEXT:    bfe.u32 %r11, %r10, 16, 1;
1036; CHECK-SM70-NEXT:    add.s32 %r12, %r11, %r10;
1037; CHECK-SM70-NEXT:    add.s32 %r13, %r12, 32767;
1038; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %f4, %f4;
1039; CHECK-SM70-NEXT:    or.b32 %r14, %r10, 4194304;
1040; CHECK-SM70-NEXT:    selp.b32 %r15, %r14, %r13, %p1;
1041; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r15; }
1042; CHECK-SM70-NEXT:    cvt.u32.u16 %r16, %rs1;
1043; CHECK-SM70-NEXT:    shl.b32 %r17, %r16, 16;
1044; CHECK-SM70-NEXT:    mov.b32 %f5, %r17;
1045; CHECK-SM70-NEXT:    cvt.u32.u16 %r18, %rs3;
1046; CHECK-SM70-NEXT:    shl.b32 %r19, %r18, 16;
1047; CHECK-SM70-NEXT:    mov.b32 %f6, %r19;
1048; CHECK-SM70-NEXT:    cvt.u32.u16 %r20, %rs5;
1049; CHECK-SM70-NEXT:    shl.b32 %r21, %r20, 16;
1050; CHECK-SM70-NEXT:    mov.b32 %f7, %r21;
1051; CHECK-SM70-NEXT:    fma.rn.f32 %f8, %f7, %f6, %f5;
1052; CHECK-SM70-NEXT:    mov.b32 %r22, %f8;
1053; CHECK-SM70-NEXT:    bfe.u32 %r23, %r22, 16, 1;
1054; CHECK-SM70-NEXT:    add.s32 %r24, %r23, %r22;
1055; CHECK-SM70-NEXT:    add.s32 %r25, %r24, 32767;
1056; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %f8, %f8;
1057; CHECK-SM70-NEXT:    or.b32 %r26, %r22, 4194304;
1058; CHECK-SM70-NEXT:    selp.b32 %r27, %r26, %r25, %p2;
1059; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r27; }
1060; CHECK-SM70-NEXT:    and.b32 %r28, %r15, -65536;
1061; CHECK-SM70-NEXT:    mov.b32 %f9, %r28;
1062; CHECK-SM70-NEXT:    setp.gt.f32 %p3, %f9, 0f00000000;
1063; CHECK-SM70-NEXT:    and.b32 %r29, %r27, -65536;
1064; CHECK-SM70-NEXT:    mov.b32 %f10, %r29;
1065; CHECK-SM70-NEXT:    setp.gt.f32 %p4, %f10, 0f00000000;
1066; CHECK-SM70-NEXT:    selp.b16 %rs9, %rs8, 0x0000, %p4;
1067; CHECK-SM70-NEXT:    selp.b16 %rs10, %rs7, 0x0000, %p3;
1068; CHECK-SM70-NEXT:    add.f32 %f11, %f10, 0f40E00000;
1069; CHECK-SM70-NEXT:    mov.b32 %r30, %f11;
1070; CHECK-SM70-NEXT:    bfe.u32 %r31, %r30, 16, 1;
1071; CHECK-SM70-NEXT:    add.s32 %r32, %r31, %r30;
1072; CHECK-SM70-NEXT:    add.s32 %r33, %r32, 32767;
1073; CHECK-SM70-NEXT:    setp.nan.f32 %p5, %f11, %f11;
1074; CHECK-SM70-NEXT:    or.b32 %r34, %r30, 4194304;
1075; CHECK-SM70-NEXT:    selp.b32 %r35, %r34, %r33, %p5;
1076; CHECK-SM70-NEXT:    add.f32 %f12, %f9, 0f40E00000;
1077; CHECK-SM70-NEXT:    mov.b32 %r36, %f12;
1078; CHECK-SM70-NEXT:    bfe.u32 %r37, %r36, 16, 1;
1079; CHECK-SM70-NEXT:    add.s32 %r38, %r37, %r36;
1080; CHECK-SM70-NEXT:    add.s32 %r39, %r38, 32767;
1081; CHECK-SM70-NEXT:    setp.nan.f32 %p6, %f12, %f12;
1082; CHECK-SM70-NEXT:    or.b32 %r40, %r36, 4194304;
1083; CHECK-SM70-NEXT:    selp.b32 %r41, %r40, %r39, %p6;
1084; CHECK-SM70-NEXT:    cvt.u32.u16 %r42, %rs10;
1085; CHECK-SM70-NEXT:    shl.b32 %r43, %r42, 16;
1086; CHECK-SM70-NEXT:    mov.b32 %f13, %r43;
1087; CHECK-SM70-NEXT:    and.b32 %r44, %r41, -65536;
1088; CHECK-SM70-NEXT:    mov.b32 %f14, %r44;
1089; CHECK-SM70-NEXT:    add.f32 %f15, %f13, %f14;
1090; CHECK-SM70-NEXT:    mov.b32 %r45, %f15;
1091; CHECK-SM70-NEXT:    bfe.u32 %r46, %r45, 16, 1;
1092; CHECK-SM70-NEXT:    add.s32 %r47, %r46, %r45;
1093; CHECK-SM70-NEXT:    add.s32 %r48, %r47, 32767;
1094; CHECK-SM70-NEXT:    setp.nan.f32 %p7, %f15, %f15;
1095; CHECK-SM70-NEXT:    or.b32 %r49, %r45, 4194304;
1096; CHECK-SM70-NEXT:    selp.b32 %r50, %r49, %r48, %p7;
1097; CHECK-SM70-NEXT:    cvt.u32.u16 %r51, %rs9;
1098; CHECK-SM70-NEXT:    shl.b32 %r52, %r51, 16;
1099; CHECK-SM70-NEXT:    mov.b32 %f16, %r52;
1100; CHECK-SM70-NEXT:    and.b32 %r53, %r35, -65536;
1101; CHECK-SM70-NEXT:    mov.b32 %f17, %r53;
1102; CHECK-SM70-NEXT:    add.f32 %f18, %f16, %f17;
1103; CHECK-SM70-NEXT:    mov.b32 %r54, %f18;
1104; CHECK-SM70-NEXT:    bfe.u32 %r55, %r54, 16, 1;
1105; CHECK-SM70-NEXT:    add.s32 %r56, %r55, %r54;
1106; CHECK-SM70-NEXT:    add.s32 %r57, %r56, 32767;
1107; CHECK-SM70-NEXT:    setp.nan.f32 %p8, %f18, %f18;
1108; CHECK-SM70-NEXT:    or.b32 %r58, %r54, 4194304;
1109; CHECK-SM70-NEXT:    selp.b32 %r59, %r58, %r57, %p8;
1110; CHECK-SM70-NEXT:    prmt.b32 %r60, %r59, %r50, 0x7632U;
1111; CHECK-SM70-NEXT:    st.param.b32 [func_retval0], %r60;
1112; CHECK-SM70-NEXT:    ret;
; Input pattern: %2 = a * b + c is consumed by the relu select (%3/%4) and
; again by %5 = %2 + 7.0; the function returns %4 + %5.
1113  %1 = fmul <2 x bfloat> %a, %b
1114  %2 = fadd <2 x bfloat> %1, %c
1115  %3 = fcmp ogt <2 x bfloat> %2, <bfloat 0.0, bfloat 0.0>
1116  %4 = select <2 x i1> %3, <2 x bfloat> %2, <2 x bfloat> <bfloat 0.0, bfloat 0.0>
1117  %5 = fadd <2 x bfloat> %2, <bfloat 7.0, bfloat 7.0>
1118  %6 = fadd <2 x bfloat> %4, %5
1119  ret <2 x bfloat> %6
1120}
1121
; Packed-bfloat relu written as fmul + fadd + llvm.maxnum against zero, built
; with attribute group #0 (no-NaNs, no-signed-zeros, unsafe FP math).
; Expectations:
;   * sm_80 (default and FTZ runs): a single fma.rn.relu.bf16x2; the FTZ run
;     expects the same instruction, without a .ftz modifier.
;   * sm_70: no relu form. Each lane is computed with fma.rn.f32, narrowed
;     with an integer bfe/add/or/selp sequence, clamped with max.f32 against
;     0, narrowed again, and the two lanes are packed with prmt.b32.
1122define <2 x bfloat> @fma_bf16x2_expanded_maxnum_no_nans(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) #0 {
1123; CHECK-LABEL: fma_bf16x2_expanded_maxnum_no_nans(
1124; CHECK:       {
1125; CHECK-NEXT:    .reg .b32 %r<5>;
1126; CHECK-EMPTY:
1127; CHECK-NEXT:  // %bb.0:
1128; CHECK-NEXT:    ld.param.b32 %r1, [fma_bf16x2_expanded_maxnum_no_nans_param_2];
1129; CHECK-NEXT:    ld.param.b32 %r2, [fma_bf16x2_expanded_maxnum_no_nans_param_1];
1130; CHECK-NEXT:    ld.param.b32 %r3, [fma_bf16x2_expanded_maxnum_no_nans_param_0];
1131; CHECK-NEXT:    fma.rn.relu.bf16x2 %r4, %r3, %r2, %r1;
1132; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
1133; CHECK-NEXT:    ret;
1134;
1135; CHECK-FTZ-LABEL: fma_bf16x2_expanded_maxnum_no_nans(
1136; CHECK-FTZ:       {
1137; CHECK-FTZ-NEXT:    .reg .b32 %r<5>;
1138; CHECK-FTZ-EMPTY:
1139; CHECK-FTZ-NEXT:  // %bb.0:
1140; CHECK-FTZ-NEXT:    ld.param.b32 %r1, [fma_bf16x2_expanded_maxnum_no_nans_param_2];
1141; CHECK-FTZ-NEXT:    ld.param.b32 %r2, [fma_bf16x2_expanded_maxnum_no_nans_param_1];
1142; CHECK-FTZ-NEXT:    ld.param.b32 %r3, [fma_bf16x2_expanded_maxnum_no_nans_param_0];
1143; CHECK-FTZ-NEXT:    fma.rn.relu.bf16x2 %r4, %r3, %r2, %r1;
1144; CHECK-FTZ-NEXT:    st.param.b32 [func_retval0], %r4;
1145; CHECK-FTZ-NEXT:    ret;
1146;
1147; CHECK-SM70-LABEL: fma_bf16x2_expanded_maxnum_no_nans(
1148; CHECK-SM70:       {
1149; CHECK-SM70-NEXT:    .reg .pred %p<5>;
1150; CHECK-SM70-NEXT:    .reg .b16 %rs<7>;
1151; CHECK-SM70-NEXT:    .reg .b32 %r<43>;
1152; CHECK-SM70-NEXT:    .reg .f32 %f<13>;
1153; CHECK-SM70-EMPTY:
1154; CHECK-SM70-NEXT:  // %bb.0:
1155; CHECK-SM70-NEXT:    ld.param.b32 %r1, [fma_bf16x2_expanded_maxnum_no_nans_param_0];
1156; CHECK-SM70-NEXT:    ld.param.b32 %r2, [fma_bf16x2_expanded_maxnum_no_nans_param_1];
1157; CHECK-SM70-NEXT:    ld.param.b32 %r3, [fma_bf16x2_expanded_maxnum_no_nans_param_2];
1158; CHECK-SM70-NEXT:    mov.b32 {%rs1, %rs2}, %r3;
1159; CHECK-SM70-NEXT:    cvt.u32.u16 %r4, %rs1;
1160; CHECK-SM70-NEXT:    shl.b32 %r5, %r4, 16;
1161; CHECK-SM70-NEXT:    mov.b32 %f1, %r5;
1162; CHECK-SM70-NEXT:    mov.b32 {%rs3, %rs4}, %r2;
1163; CHECK-SM70-NEXT:    cvt.u32.u16 %r6, %rs3;
1164; CHECK-SM70-NEXT:    shl.b32 %r7, %r6, 16;
1165; CHECK-SM70-NEXT:    mov.b32 %f2, %r7;
1166; CHECK-SM70-NEXT:    mov.b32 {%rs5, %rs6}, %r1;
1167; CHECK-SM70-NEXT:    cvt.u32.u16 %r8, %rs5;
1168; CHECK-SM70-NEXT:    shl.b32 %r9, %r8, 16;
1169; CHECK-SM70-NEXT:    mov.b32 %f3, %r9;
1170; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
1171; CHECK-SM70-NEXT:    mov.b32 %r10, %f4;
1172; CHECK-SM70-NEXT:    bfe.u32 %r11, %r10, 16, 1;
1173; CHECK-SM70-NEXT:    add.s32 %r12, %r11, %r10;
1174; CHECK-SM70-NEXT:    add.s32 %r13, %r12, 32767;
1175; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %f4, %f4;
1176; CHECK-SM70-NEXT:    or.b32 %r14, %r10, 4194304;
1177; CHECK-SM70-NEXT:    selp.b32 %r15, %r14, %r13, %p1;
1178; CHECK-SM70-NEXT:    cvt.u32.u16 %r16, %rs2;
1179; CHECK-SM70-NEXT:    shl.b32 %r17, %r16, 16;
1180; CHECK-SM70-NEXT:    mov.b32 %f5, %r17;
1181; CHECK-SM70-NEXT:    cvt.u32.u16 %r18, %rs4;
1182; CHECK-SM70-NEXT:    shl.b32 %r19, %r18, 16;
1183; CHECK-SM70-NEXT:    mov.b32 %f6, %r19;
1184; CHECK-SM70-NEXT:    cvt.u32.u16 %r20, %rs6;
1185; CHECK-SM70-NEXT:    shl.b32 %r21, %r20, 16;
1186; CHECK-SM70-NEXT:    mov.b32 %f7, %r21;
1187; CHECK-SM70-NEXT:    fma.rn.f32 %f8, %f7, %f6, %f5;
1188; CHECK-SM70-NEXT:    mov.b32 %r22, %f8;
1189; CHECK-SM70-NEXT:    bfe.u32 %r23, %r22, 16, 1;
1190; CHECK-SM70-NEXT:    add.s32 %r24, %r23, %r22;
1191; CHECK-SM70-NEXT:    add.s32 %r25, %r24, 32767;
1192; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %f8, %f8;
1193; CHECK-SM70-NEXT:    or.b32 %r26, %r22, 4194304;
1194; CHECK-SM70-NEXT:    selp.b32 %r27, %r26, %r25, %p2;
1195; CHECK-SM70-NEXT:    and.b32 %r28, %r27, -65536;
1196; CHECK-SM70-NEXT:    mov.b32 %f9, %r28;
1197; CHECK-SM70-NEXT:    max.f32 %f10, %f9, 0f00000000;
1198; CHECK-SM70-NEXT:    mov.b32 %r29, %f10;
1199; CHECK-SM70-NEXT:    bfe.u32 %r30, %r29, 16, 1;
1200; CHECK-SM70-NEXT:    add.s32 %r31, %r30, %r29;
1201; CHECK-SM70-NEXT:    add.s32 %r32, %r31, 32767;
1202; CHECK-SM70-NEXT:    setp.nan.f32 %p3, %f10, %f10;
1203; CHECK-SM70-NEXT:    or.b32 %r33, %r29, 4194304;
1204; CHECK-SM70-NEXT:    selp.b32 %r34, %r33, %r32, %p3;
1205; CHECK-SM70-NEXT:    and.b32 %r35, %r15, -65536;
1206; CHECK-SM70-NEXT:    mov.b32 %f11, %r35;
1207; CHECK-SM70-NEXT:    max.f32 %f12, %f11, 0f00000000;
1208; CHECK-SM70-NEXT:    mov.b32 %r36, %f12;
1209; CHECK-SM70-NEXT:    bfe.u32 %r37, %r36, 16, 1;
1210; CHECK-SM70-NEXT:    add.s32 %r38, %r37, %r36;
1211; CHECK-SM70-NEXT:    add.s32 %r39, %r38, 32767;
1212; CHECK-SM70-NEXT:    setp.nan.f32 %p4, %f12, %f12;
1213; CHECK-SM70-NEXT:    or.b32 %r40, %r36, 4194304;
1214; CHECK-SM70-NEXT:    selp.b32 %r41, %r40, %r39, %p4;
1215; CHECK-SM70-NEXT:    prmt.b32 %r42, %r41, %r34, 0x7632U;
1216; CHECK-SM70-NEXT:    st.param.b32 [func_retval0], %r42;
1217; CHECK-SM70-NEXT:    ret;
; Input pattern: maxnum(a * b + c, 0) per lane.
1218  %1 = fmul <2 x bfloat> %a, %b
1219  %2 = fadd <2 x bfloat> %1, %c
1220  %3 = call <2 x bfloat> @llvm.maxnum.bf16x2(<2 x bfloat> %2, <2 x bfloat> <bfloat 0.0, bfloat 0.0>)
1221  ret <2 x bfloat> %3
1222}
1223
; Attribute groups referenced by the test functions in this file.
; #0 enables no-NaNs, no-signed-zeros and unsafe FP math; it is attached to the
;    *_no_nans tests, whose sm_80 expectations contain fma...relu except in
;    the multiple-uses tests.
; #1 enables unsafe FP math only (no "no-nans-fp-math"); it is attached to
;    the *_unsafe_with_nans tests, whose expectations contain no fma...relu.
1224attributes #0 = { "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "unsafe-fp-math"="true" }
1225attributes #1 = { "unsafe-fp-math"="true" }
1226