xref: /llvm-project/llvm/test/CodeGen/NVPTX/bf16-instructions.ll (revision 5e5fd0e6fc50cc1198750308c11433a5b3acfd0f)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx71 | FileCheck --check-prefixes=CHECK,SM70 %s
3; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 | FileCheck --check-prefixes=CHECK,SM80 %s
4; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 -denormal-fp-math-f32=preserve-sign | FileCheck --check-prefixes=CHECK,SM80-FTZ %s
5; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | FileCheck --check-prefixes=CHECK,SM90 %s
6; RUN: %if ptxas-11.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 | %ptxas-verify -arch=sm_80 %}
7; RUN: %if ptxas-11.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 -denormal-fp-math-f32=preserve-sign | %ptxas-verify -arch=sm_80 %}
8; RUN: %if ptxas-11.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %}
9
; Module preamble: the target triple, followed by a constant global holding
; four bfloat values in addrspace(1).
10target triple = "nvptx64-nvidia-cuda"
11
; NOTE(review): none of the RUN lines visible at the top of this file passes
; an LDST prefix to FileCheck, so the directive below is presumably never
; evaluated - confirm whether it should use an enabled prefix or be dropped.
12; LDST: .b8 bfloat_array[8] = {1, 2, 3, 4, 5, 6, 7, 8};
13@"bfloat_array" = addrspace(1) constant [4 x bfloat]
14                [bfloat 0xR0201, bfloat 0xR0403, bfloat 0xR0605, bfloat 0xR0807]
15
; Scalar `fadd bfloat`. The expected PTX differs per RUN configuration:
;  * sm_70 run: each operand is widened to f32 by shifting its bits left by 16,
;    added with add.rn.f32, then narrowed with an integer sequence (bit-16
;    extract, +32767, NaN select, take the upper 16 bits) - presumably
;    round-to-nearest-even with NaN quieting; confirm against the lowering.
;  * sm_80 run: fma.rn.bf16 of param_0 times the constant 0x3F80, plus param_1.
;  * sm_80 run with -denormal-fp-math-f32=preserve-sign: cvt to f32 with .ftz,
;    add.rn.ftz.f32, then cvt.rn.bf16.f32.
;  * sm_90 run: a single add.rn.bf16.
16define bfloat @test_fadd(bfloat %0, bfloat %1) {
17; SM70-LABEL: test_fadd(
18; SM70:       {
19; SM70-NEXT:    .reg .pred %p<2>;
20; SM70-NEXT:    .reg .b16 %rs<2>;
21; SM70-NEXT:    .reg .b32 %r<11>;
22; SM70-NEXT:    .reg .f32 %f<4>;
23; SM70-EMPTY:
24; SM70-NEXT:  // %bb.0:
25; SM70-NEXT:    ld.param.u16 %r1, [test_fadd_param_1];
26; SM70-NEXT:    shl.b32 %r2, %r1, 16;
27; SM70-NEXT:    mov.b32 %f1, %r2;
28; SM70-NEXT:    ld.param.u16 %r3, [test_fadd_param_0];
29; SM70-NEXT:    shl.b32 %r4, %r3, 16;
30; SM70-NEXT:    mov.b32 %f2, %r4;
31; SM70-NEXT:    add.rn.f32 %f3, %f2, %f1;
32; SM70-NEXT:    mov.b32 %r5, %f3;
33; SM70-NEXT:    bfe.u32 %r6, %r5, 16, 1;
34; SM70-NEXT:    add.s32 %r7, %r6, %r5;
35; SM70-NEXT:    add.s32 %r8, %r7, 32767;
36; SM70-NEXT:    setp.nan.f32 %p1, %f3, %f3;
37; SM70-NEXT:    or.b32 %r9, %r5, 4194304;
38; SM70-NEXT:    selp.b32 %r10, %r9, %r8, %p1;
39; SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r10; }
40; SM70-NEXT:    st.param.b16 [func_retval0], %rs1;
41; SM70-NEXT:    ret;
42;
43; SM80-LABEL: test_fadd(
44; SM80:       {
45; SM80-NEXT:    .reg .b16 %rs<5>;
46; SM80-EMPTY:
47; SM80-NEXT:  // %bb.0:
48; SM80-NEXT:    ld.param.b16 %rs1, [test_fadd_param_0];
49; SM80-NEXT:    ld.param.b16 %rs2, [test_fadd_param_1];
50; SM80-NEXT:    mov.b16 %rs3, 0x3F80;
51; SM80-NEXT:    fma.rn.bf16 %rs4, %rs1, %rs3, %rs2;
52; SM80-NEXT:    st.param.b16 [func_retval0], %rs4;
53; SM80-NEXT:    ret;
54;
55; SM80-FTZ-LABEL: test_fadd(
56; SM80-FTZ:       {
57; SM80-FTZ-NEXT:    .reg .b16 %rs<4>;
58; SM80-FTZ-NEXT:    .reg .f32 %f<4>;
59; SM80-FTZ-EMPTY:
60; SM80-FTZ-NEXT:  // %bb.0:
61; SM80-FTZ-NEXT:    ld.param.b16 %rs1, [test_fadd_param_0];
62; SM80-FTZ-NEXT:    ld.param.b16 %rs2, [test_fadd_param_1];
63; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f1, %rs2;
64; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f2, %rs1;
65; SM80-FTZ-NEXT:    add.rn.ftz.f32 %f3, %f2, %f1;
66; SM80-FTZ-NEXT:    cvt.rn.bf16.f32 %rs3, %f3;
67; SM80-FTZ-NEXT:    st.param.b16 [func_retval0], %rs3;
68; SM80-FTZ-NEXT:    ret;
69;
70; SM90-LABEL: test_fadd(
71; SM90:       {
72; SM90-NEXT:    .reg .b16 %rs<4>;
73; SM90-EMPTY:
74; SM90-NEXT:  // %bb.0:
75; SM90-NEXT:    ld.param.b16 %rs1, [test_fadd_param_0];
76; SM90-NEXT:    ld.param.b16 %rs2, [test_fadd_param_1];
77; SM90-NEXT:    add.rn.bf16 %rs3, %rs1, %rs2;
78; SM90-NEXT:    st.param.b16 [func_retval0], %rs3;
79; SM90-NEXT:    ret;
80  %3 = fadd bfloat %0, %1
81  ret bfloat %3
82}
83
; Scalar `fsub bfloat`. The expected PTX differs per RUN configuration:
;  * sm_70 run: operands are widened to f32 via a 16-bit left shift, subtracted
;    with sub.rn.f32, then narrowed with the integer bit-16/+32767/NaN-select
;    sequence.
;  * sm_80 run: fma.rn.bf16 computing param_1 times the constant 0xBF80
;    (0x3F80 with the sign bit set) plus param_0.
;  * sm_80 run with -denormal-fp-math-f32=preserve-sign: cvt to f32 with .ftz,
;    sub.rn.ftz.f32, then cvt.rn.bf16.f32.
;  * sm_90 run: a single sub.rn.bf16.
84define bfloat @test_fsub(bfloat %0, bfloat %1) {
85; SM70-LABEL: test_fsub(
86; SM70:       {
87; SM70-NEXT:    .reg .pred %p<2>;
88; SM70-NEXT:    .reg .b16 %rs<2>;
89; SM70-NEXT:    .reg .b32 %r<11>;
90; SM70-NEXT:    .reg .f32 %f<4>;
91; SM70-EMPTY:
92; SM70-NEXT:  // %bb.0:
93; SM70-NEXT:    ld.param.u16 %r1, [test_fsub_param_1];
94; SM70-NEXT:    shl.b32 %r2, %r1, 16;
95; SM70-NEXT:    mov.b32 %f1, %r2;
96; SM70-NEXT:    ld.param.u16 %r3, [test_fsub_param_0];
97; SM70-NEXT:    shl.b32 %r4, %r3, 16;
98; SM70-NEXT:    mov.b32 %f2, %r4;
99; SM70-NEXT:    sub.rn.f32 %f3, %f2, %f1;
100; SM70-NEXT:    mov.b32 %r5, %f3;
101; SM70-NEXT:    bfe.u32 %r6, %r5, 16, 1;
102; SM70-NEXT:    add.s32 %r7, %r6, %r5;
103; SM70-NEXT:    add.s32 %r8, %r7, 32767;
104; SM70-NEXT:    setp.nan.f32 %p1, %f3, %f3;
105; SM70-NEXT:    or.b32 %r9, %r5, 4194304;
106; SM70-NEXT:    selp.b32 %r10, %r9, %r8, %p1;
107; SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r10; }
108; SM70-NEXT:    st.param.b16 [func_retval0], %rs1;
109; SM70-NEXT:    ret;
110;
111; SM80-LABEL: test_fsub(
112; SM80:       {
113; SM80-NEXT:    .reg .b16 %rs<5>;
114; SM80-EMPTY:
115; SM80-NEXT:  // %bb.0:
116; SM80-NEXT:    ld.param.b16 %rs1, [test_fsub_param_0];
117; SM80-NEXT:    mov.b16 %rs2, 0xBF80;
118; SM80-NEXT:    ld.param.b16 %rs3, [test_fsub_param_1];
119; SM80-NEXT:    fma.rn.bf16 %rs4, %rs3, %rs2, %rs1;
120; SM80-NEXT:    st.param.b16 [func_retval0], %rs4;
121; SM80-NEXT:    ret;
122;
123; SM80-FTZ-LABEL: test_fsub(
124; SM80-FTZ:       {
125; SM80-FTZ-NEXT:    .reg .b16 %rs<4>;
126; SM80-FTZ-NEXT:    .reg .f32 %f<4>;
127; SM80-FTZ-EMPTY:
128; SM80-FTZ-NEXT:  // %bb.0:
129; SM80-FTZ-NEXT:    ld.param.b16 %rs1, [test_fsub_param_0];
130; SM80-FTZ-NEXT:    ld.param.b16 %rs2, [test_fsub_param_1];
131; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f1, %rs2;
132; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f2, %rs1;
133; SM80-FTZ-NEXT:    sub.rn.ftz.f32 %f3, %f2, %f1;
134; SM80-FTZ-NEXT:    cvt.rn.bf16.f32 %rs3, %f3;
135; SM80-FTZ-NEXT:    st.param.b16 [func_retval0], %rs3;
136; SM80-FTZ-NEXT:    ret;
137;
138; SM90-LABEL: test_fsub(
139; SM90:       {
140; SM90-NEXT:    .reg .b16 %rs<4>;
141; SM90-EMPTY:
142; SM90-NEXT:  // %bb.0:
143; SM90-NEXT:    ld.param.b16 %rs1, [test_fsub_param_0];
144; SM90-NEXT:    ld.param.b16 %rs2, [test_fsub_param_1];
145; SM90-NEXT:    sub.rn.bf16 %rs3, %rs1, %rs2;
146; SM90-NEXT:    st.param.b16 [func_retval0], %rs3;
147; SM90-NEXT:    ret;
148  %3 = fsub bfloat %0, %1
149  ret bfloat %3
150}
151
; Vector `fadd <2 x bfloat>`. The expected PTX differs per RUN configuration:
;  * sm_70 run: the packed b32 operands are split into 16-bit halves, each lane
;    is widened to f32 (shift left by 16), added with add.rn.f32 and narrowed
;    with the integer bit-16/+32767/NaN-select sequence; prmt.b32 with selector
;    0x7632 then packs the upper halves of the two results.
;  * sm_80 run: one fma.rn.bf16x2 with multiplier 1065369472 (0x3F803F80,
;    i.e. 0x3F80 in both 16-bit halves).
;  * sm_80 run with -denormal-fp-math-f32=preserve-sign: per-lane .ftz f32 adds
;    packed by cvt.rn.bf16x2.f32.
;  * sm_90 run: a single add.rn.bf16x2.
152define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
153; SM70-LABEL: test_faddx2(
154; SM70:       {
155; SM70-NEXT:    .reg .pred %p<3>;
156; SM70-NEXT:    .reg .b16 %rs<5>;
157; SM70-NEXT:    .reg .b32 %r<24>;
158; SM70-NEXT:    .reg .f32 %f<7>;
159; SM70-EMPTY:
160; SM70-NEXT:  // %bb.0:
161; SM70-NEXT:    ld.param.b32 %r1, [test_faddx2_param_0];
162; SM70-NEXT:    ld.param.b32 %r2, [test_faddx2_param_1];
163; SM70-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
164; SM70-NEXT:    cvt.u32.u16 %r3, %rs2;
165; SM70-NEXT:    shl.b32 %r4, %r3, 16;
166; SM70-NEXT:    mov.b32 %f1, %r4;
167; SM70-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
168; SM70-NEXT:    cvt.u32.u16 %r5, %rs4;
169; SM70-NEXT:    shl.b32 %r6, %r5, 16;
170; SM70-NEXT:    mov.b32 %f2, %r6;
171; SM70-NEXT:    add.rn.f32 %f3, %f2, %f1;
172; SM70-NEXT:    mov.b32 %r7, %f3;
173; SM70-NEXT:    bfe.u32 %r8, %r7, 16, 1;
174; SM70-NEXT:    add.s32 %r9, %r8, %r7;
175; SM70-NEXT:    add.s32 %r10, %r9, 32767;
176; SM70-NEXT:    setp.nan.f32 %p1, %f3, %f3;
177; SM70-NEXT:    or.b32 %r11, %r7, 4194304;
178; SM70-NEXT:    selp.b32 %r12, %r11, %r10, %p1;
179; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
180; SM70-NEXT:    shl.b32 %r14, %r13, 16;
181; SM70-NEXT:    mov.b32 %f4, %r14;
182; SM70-NEXT:    cvt.u32.u16 %r15, %rs3;
183; SM70-NEXT:    shl.b32 %r16, %r15, 16;
184; SM70-NEXT:    mov.b32 %f5, %r16;
185; SM70-NEXT:    add.rn.f32 %f6, %f5, %f4;
186; SM70-NEXT:    mov.b32 %r17, %f6;
187; SM70-NEXT:    bfe.u32 %r18, %r17, 16, 1;
188; SM70-NEXT:    add.s32 %r19, %r18, %r17;
189; SM70-NEXT:    add.s32 %r20, %r19, 32767;
190; SM70-NEXT:    setp.nan.f32 %p2, %f6, %f6;
191; SM70-NEXT:    or.b32 %r21, %r17, 4194304;
192; SM70-NEXT:    selp.b32 %r22, %r21, %r20, %p2;
193; SM70-NEXT:    prmt.b32 %r23, %r22, %r12, 0x7632U;
194; SM70-NEXT:    st.param.b32 [func_retval0], %r23;
195; SM70-NEXT:    ret;
196;
197; SM80-LABEL: test_faddx2(
198; SM80:       {
199; SM80-NEXT:    .reg .b32 %r<5>;
200; SM80-EMPTY:
201; SM80-NEXT:  // %bb.0:
202; SM80-NEXT:    ld.param.b32 %r1, [test_faddx2_param_1];
203; SM80-NEXT:    ld.param.b32 %r2, [test_faddx2_param_0];
204; SM80-NEXT:    mov.b32 %r3, 1065369472;
205; SM80-NEXT:    fma.rn.bf16x2 %r4, %r2, %r3, %r1;
206; SM80-NEXT:    st.param.b32 [func_retval0], %r4;
207; SM80-NEXT:    ret;
208;
209; SM80-FTZ-LABEL: test_faddx2(
210; SM80-FTZ:       {
211; SM80-FTZ-NEXT:    .reg .b16 %rs<5>;
212; SM80-FTZ-NEXT:    .reg .b32 %r<4>;
213; SM80-FTZ-NEXT:    .reg .f32 %f<7>;
214; SM80-FTZ-EMPTY:
215; SM80-FTZ-NEXT:  // %bb.0:
216; SM80-FTZ-NEXT:    ld.param.b32 %r1, [test_faddx2_param_0];
217; SM80-FTZ-NEXT:    ld.param.b32 %r2, [test_faddx2_param_1];
218; SM80-FTZ-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
219; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f1, %rs1;
220; SM80-FTZ-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
221; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f2, %rs3;
222; SM80-FTZ-NEXT:    add.rn.ftz.f32 %f3, %f2, %f1;
223; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f4, %rs2;
224; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f5, %rs4;
225; SM80-FTZ-NEXT:    add.rn.ftz.f32 %f6, %f5, %f4;
226; SM80-FTZ-NEXT:    cvt.rn.bf16x2.f32 %r3, %f6, %f3;
227; SM80-FTZ-NEXT:    st.param.b32 [func_retval0], %r3;
228; SM80-FTZ-NEXT:    ret;
229;
230; SM90-LABEL: test_faddx2(
231; SM90:       {
232; SM90-NEXT:    .reg .b32 %r<4>;
233; SM90-EMPTY:
234; SM90-NEXT:  // %bb.0:
235; SM90-NEXT:    ld.param.b32 %r1, [test_faddx2_param_1];
236; SM90-NEXT:    ld.param.b32 %r2, [test_faddx2_param_0];
237; SM90-NEXT:    add.rn.bf16x2 %r3, %r2, %r1;
238; SM90-NEXT:    st.param.b32 [func_retval0], %r3;
239; SM90-NEXT:    ret;
240  %r = fadd <2 x bfloat> %a, %b
241  ret <2 x bfloat> %r
242}
243
; Vector `fsub <2 x bfloat>`. The expected PTX differs per RUN configuration:
;  * sm_70 run: per-lane widening to f32 (shift left by 16), sub.rn.f32, the
;    integer bit-16/+32767/NaN-select narrowing, then a prmt.b32 pack with
;    selector 0x7632.
;  * sm_80 run: one fma.rn.bf16x2 computing param_1 times -1082081408
;    (0xBF80BF80, i.e. 0xBF80 in both 16-bit halves) plus param_0.
;  * sm_80 run with -denormal-fp-math-f32=preserve-sign: per-lane .ftz f32
;    subtracts packed by cvt.rn.bf16x2.f32.
;  * sm_90 run: a single sub.rn.bf16x2.
244define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
245; SM70-LABEL: test_fsubx2(
246; SM70:       {
247; SM70-NEXT:    .reg .pred %p<3>;
248; SM70-NEXT:    .reg .b16 %rs<5>;
249; SM70-NEXT:    .reg .b32 %r<24>;
250; SM70-NEXT:    .reg .f32 %f<7>;
251; SM70-EMPTY:
252; SM70-NEXT:  // %bb.0:
253; SM70-NEXT:    ld.param.b32 %r1, [test_fsubx2_param_0];
254; SM70-NEXT:    ld.param.b32 %r2, [test_fsubx2_param_1];
255; SM70-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
256; SM70-NEXT:    cvt.u32.u16 %r3, %rs2;
257; SM70-NEXT:    shl.b32 %r4, %r3, 16;
258; SM70-NEXT:    mov.b32 %f1, %r4;
259; SM70-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
260; SM70-NEXT:    cvt.u32.u16 %r5, %rs4;
261; SM70-NEXT:    shl.b32 %r6, %r5, 16;
262; SM70-NEXT:    mov.b32 %f2, %r6;
263; SM70-NEXT:    sub.rn.f32 %f3, %f2, %f1;
264; SM70-NEXT:    mov.b32 %r7, %f3;
265; SM70-NEXT:    bfe.u32 %r8, %r7, 16, 1;
266; SM70-NEXT:    add.s32 %r9, %r8, %r7;
267; SM70-NEXT:    add.s32 %r10, %r9, 32767;
268; SM70-NEXT:    setp.nan.f32 %p1, %f3, %f3;
269; SM70-NEXT:    or.b32 %r11, %r7, 4194304;
270; SM70-NEXT:    selp.b32 %r12, %r11, %r10, %p1;
271; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
272; SM70-NEXT:    shl.b32 %r14, %r13, 16;
273; SM70-NEXT:    mov.b32 %f4, %r14;
274; SM70-NEXT:    cvt.u32.u16 %r15, %rs3;
275; SM70-NEXT:    shl.b32 %r16, %r15, 16;
276; SM70-NEXT:    mov.b32 %f5, %r16;
277; SM70-NEXT:    sub.rn.f32 %f6, %f5, %f4;
278; SM70-NEXT:    mov.b32 %r17, %f6;
279; SM70-NEXT:    bfe.u32 %r18, %r17, 16, 1;
280; SM70-NEXT:    add.s32 %r19, %r18, %r17;
281; SM70-NEXT:    add.s32 %r20, %r19, 32767;
282; SM70-NEXT:    setp.nan.f32 %p2, %f6, %f6;
283; SM70-NEXT:    or.b32 %r21, %r17, 4194304;
284; SM70-NEXT:    selp.b32 %r22, %r21, %r20, %p2;
285; SM70-NEXT:    prmt.b32 %r23, %r22, %r12, 0x7632U;
286; SM70-NEXT:    st.param.b32 [func_retval0], %r23;
287; SM70-NEXT:    ret;
288;
289; SM80-LABEL: test_fsubx2(
290; SM80:       {
291; SM80-NEXT:    .reg .b32 %r<5>;
292; SM80-EMPTY:
293; SM80-NEXT:  // %bb.0:
294; SM80-NEXT:    ld.param.b32 %r1, [test_fsubx2_param_0];
295; SM80-NEXT:    ld.param.b32 %r2, [test_fsubx2_param_1];
296; SM80-NEXT:    mov.b32 %r3, -1082081408;
297; SM80-NEXT:    fma.rn.bf16x2 %r4, %r2, %r3, %r1;
298; SM80-NEXT:    st.param.b32 [func_retval0], %r4;
299; SM80-NEXT:    ret;
300;
301; SM80-FTZ-LABEL: test_fsubx2(
302; SM80-FTZ:       {
303; SM80-FTZ-NEXT:    .reg .b16 %rs<5>;
304; SM80-FTZ-NEXT:    .reg .b32 %r<4>;
305; SM80-FTZ-NEXT:    .reg .f32 %f<7>;
306; SM80-FTZ-EMPTY:
307; SM80-FTZ-NEXT:  // %bb.0:
308; SM80-FTZ-NEXT:    ld.param.b32 %r1, [test_fsubx2_param_0];
309; SM80-FTZ-NEXT:    ld.param.b32 %r2, [test_fsubx2_param_1];
310; SM80-FTZ-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
311; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f1, %rs1;
312; SM80-FTZ-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
313; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f2, %rs3;
314; SM80-FTZ-NEXT:    sub.rn.ftz.f32 %f3, %f2, %f1;
315; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f4, %rs2;
316; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f5, %rs4;
317; SM80-FTZ-NEXT:    sub.rn.ftz.f32 %f6, %f5, %f4;
318; SM80-FTZ-NEXT:    cvt.rn.bf16x2.f32 %r3, %f6, %f3;
319; SM80-FTZ-NEXT:    st.param.b32 [func_retval0], %r3;
320; SM80-FTZ-NEXT:    ret;
321;
322; SM90-LABEL: test_fsubx2(
323; SM90:       {
324; SM90-NEXT:    .reg .b32 %r<4>;
325; SM90-EMPTY:
326; SM90-NEXT:  // %bb.0:
327; SM90-NEXT:    ld.param.b32 %r1, [test_fsubx2_param_1];
328; SM90-NEXT:    ld.param.b32 %r2, [test_fsubx2_param_0];
329; SM90-NEXT:    sub.rn.bf16x2 %r3, %r2, %r1;
330; SM90-NEXT:    st.param.b32 [func_retval0], %r3;
331; SM90-NEXT:    ret;
332  %r = fsub <2 x bfloat> %a, %b
333  ret <2 x bfloat> %r
334}
335
; Vector `fmul <2 x bfloat>`. The expected PTX differs per RUN configuration:
;  * sm_70 run: per-lane widening to f32 (shift left by 16), mul.rn.f32, the
;    integer bit-16/+32767/NaN-select narrowing, then a prmt.b32 pack with
;    selector 0x7632.
;  * sm_80 run: one fma.rn.bf16x2 multiplying the two parameters with addend
;    -2147450880 (0x80008000, i.e. only the sign bit set in each 16-bit half).
;  * sm_80 run with -denormal-fp-math-f32=preserve-sign: per-lane .ftz f32
;    multiplies packed by cvt.rn.bf16x2.f32.
;  * sm_90 run: a single mul.rn.bf16x2.
336define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
337; SM70-LABEL: test_fmulx2(
338; SM70:       {
339; SM70-NEXT:    .reg .pred %p<3>;
340; SM70-NEXT:    .reg .b16 %rs<5>;
341; SM70-NEXT:    .reg .b32 %r<24>;
342; SM70-NEXT:    .reg .f32 %f<7>;
343; SM70-EMPTY:
344; SM70-NEXT:  // %bb.0:
345; SM70-NEXT:    ld.param.b32 %r1, [test_fmulx2_param_0];
346; SM70-NEXT:    ld.param.b32 %r2, [test_fmulx2_param_1];
347; SM70-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
348; SM70-NEXT:    cvt.u32.u16 %r3, %rs2;
349; SM70-NEXT:    shl.b32 %r4, %r3, 16;
350; SM70-NEXT:    mov.b32 %f1, %r4;
351; SM70-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
352; SM70-NEXT:    cvt.u32.u16 %r5, %rs4;
353; SM70-NEXT:    shl.b32 %r6, %r5, 16;
354; SM70-NEXT:    mov.b32 %f2, %r6;
355; SM70-NEXT:    mul.rn.f32 %f3, %f2, %f1;
356; SM70-NEXT:    mov.b32 %r7, %f3;
357; SM70-NEXT:    bfe.u32 %r8, %r7, 16, 1;
358; SM70-NEXT:    add.s32 %r9, %r8, %r7;
359; SM70-NEXT:    add.s32 %r10, %r9, 32767;
360; SM70-NEXT:    setp.nan.f32 %p1, %f3, %f3;
361; SM70-NEXT:    or.b32 %r11, %r7, 4194304;
362; SM70-NEXT:    selp.b32 %r12, %r11, %r10, %p1;
363; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
364; SM70-NEXT:    shl.b32 %r14, %r13, 16;
365; SM70-NEXT:    mov.b32 %f4, %r14;
366; SM70-NEXT:    cvt.u32.u16 %r15, %rs3;
367; SM70-NEXT:    shl.b32 %r16, %r15, 16;
368; SM70-NEXT:    mov.b32 %f5, %r16;
369; SM70-NEXT:    mul.rn.f32 %f6, %f5, %f4;
370; SM70-NEXT:    mov.b32 %r17, %f6;
371; SM70-NEXT:    bfe.u32 %r18, %r17, 16, 1;
372; SM70-NEXT:    add.s32 %r19, %r18, %r17;
373; SM70-NEXT:    add.s32 %r20, %r19, 32767;
374; SM70-NEXT:    setp.nan.f32 %p2, %f6, %f6;
375; SM70-NEXT:    or.b32 %r21, %r17, 4194304;
376; SM70-NEXT:    selp.b32 %r22, %r21, %r20, %p2;
377; SM70-NEXT:    prmt.b32 %r23, %r22, %r12, 0x7632U;
378; SM70-NEXT:    st.param.b32 [func_retval0], %r23;
379; SM70-NEXT:    ret;
380;
381; SM80-LABEL: test_fmulx2(
382; SM80:       {
383; SM80-NEXT:    .reg .b32 %r<5>;
384; SM80-EMPTY:
385; SM80-NEXT:  // %bb.0:
386; SM80-NEXT:    ld.param.b32 %r1, [test_fmulx2_param_1];
387; SM80-NEXT:    ld.param.b32 %r2, [test_fmulx2_param_0];
388; SM80-NEXT:    mov.b32 %r3, -2147450880;
389; SM80-NEXT:    fma.rn.bf16x2 %r4, %r2, %r1, %r3;
390; SM80-NEXT:    st.param.b32 [func_retval0], %r4;
391; SM80-NEXT:    ret;
392;
393; SM80-FTZ-LABEL: test_fmulx2(
394; SM80-FTZ:       {
395; SM80-FTZ-NEXT:    .reg .b16 %rs<5>;
396; SM80-FTZ-NEXT:    .reg .b32 %r<4>;
397; SM80-FTZ-NEXT:    .reg .f32 %f<7>;
398; SM80-FTZ-EMPTY:
399; SM80-FTZ-NEXT:  // %bb.0:
400; SM80-FTZ-NEXT:    ld.param.b32 %r1, [test_fmulx2_param_0];
401; SM80-FTZ-NEXT:    ld.param.b32 %r2, [test_fmulx2_param_1];
402; SM80-FTZ-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
403; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f1, %rs1;
404; SM80-FTZ-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
405; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f2, %rs3;
406; SM80-FTZ-NEXT:    mul.rn.ftz.f32 %f3, %f2, %f1;
407; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f4, %rs2;
408; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f5, %rs4;
409; SM80-FTZ-NEXT:    mul.rn.ftz.f32 %f6, %f5, %f4;
410; SM80-FTZ-NEXT:    cvt.rn.bf16x2.f32 %r3, %f6, %f3;
411; SM80-FTZ-NEXT:    st.param.b32 [func_retval0], %r3;
412; SM80-FTZ-NEXT:    ret;
413;
414; SM90-LABEL: test_fmulx2(
415; SM90:       {
416; SM90-NEXT:    .reg .b32 %r<4>;
417; SM90-EMPTY:
418; SM90-NEXT:  // %bb.0:
419; SM90-NEXT:    ld.param.b32 %r1, [test_fmulx2_param_1];
420; SM90-NEXT:    ld.param.b32 %r2, [test_fmulx2_param_0];
421; SM90-NEXT:    mul.rn.bf16x2 %r3, %r2, %r1;
422; SM90-NEXT:    st.param.b32 [func_retval0], %r3;
423; SM90-NEXT:    ret;
424  %r = fmul <2 x bfloat> %a, %b
425  ret <2 x bfloat> %r
426}
427
; Vector `fdiv <2 x bfloat>`. Every RUN configuration is expected to divide in
; f32 per lane; no packed bf16 divide appears in any expectation below.
;  * sm_70 run: lanes are widened with a 16-bit left shift, divided with
;    div.rn.f32, narrowed with the integer bit-16/+32767/NaN-select sequence
;    and packed with prmt.b32 (selector 0x7632).
;  * sm_80 and sm_90 runs: cvt.f32.bf16 per lane, div.rn.f32, then a single
;    cvt.rn.bf16x2.f32 pack.
;  * sm_80 run with -denormal-fp-math-f32=preserve-sign: same shape, with .ftz
;    on the widening conversions and on the divides.
428define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
429; SM70-LABEL: test_fdiv(
430; SM70:       {
431; SM70-NEXT:    .reg .pred %p<3>;
432; SM70-NEXT:    .reg .b16 %rs<5>;
433; SM70-NEXT:    .reg .b32 %r<24>;
434; SM70-NEXT:    .reg .f32 %f<7>;
435; SM70-EMPTY:
436; SM70-NEXT:  // %bb.0:
437; SM70-NEXT:    ld.param.b32 %r1, [test_fdiv_param_0];
438; SM70-NEXT:    ld.param.b32 %r2, [test_fdiv_param_1];
439; SM70-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
440; SM70-NEXT:    cvt.u32.u16 %r3, %rs2;
441; SM70-NEXT:    shl.b32 %r4, %r3, 16;
442; SM70-NEXT:    mov.b32 %f1, %r4;
443; SM70-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
444; SM70-NEXT:    cvt.u32.u16 %r5, %rs4;
445; SM70-NEXT:    shl.b32 %r6, %r5, 16;
446; SM70-NEXT:    mov.b32 %f2, %r6;
447; SM70-NEXT:    div.rn.f32 %f3, %f2, %f1;
448; SM70-NEXT:    mov.b32 %r7, %f3;
449; SM70-NEXT:    bfe.u32 %r8, %r7, 16, 1;
450; SM70-NEXT:    add.s32 %r9, %r8, %r7;
451; SM70-NEXT:    add.s32 %r10, %r9, 32767;
452; SM70-NEXT:    setp.nan.f32 %p1, %f3, %f3;
453; SM70-NEXT:    or.b32 %r11, %r7, 4194304;
454; SM70-NEXT:    selp.b32 %r12, %r11, %r10, %p1;
455; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
456; SM70-NEXT:    shl.b32 %r14, %r13, 16;
457; SM70-NEXT:    mov.b32 %f4, %r14;
458; SM70-NEXT:    cvt.u32.u16 %r15, %rs3;
459; SM70-NEXT:    shl.b32 %r16, %r15, 16;
460; SM70-NEXT:    mov.b32 %f5, %r16;
461; SM70-NEXT:    div.rn.f32 %f6, %f5, %f4;
462; SM70-NEXT:    mov.b32 %r17, %f6;
463; SM70-NEXT:    bfe.u32 %r18, %r17, 16, 1;
464; SM70-NEXT:    add.s32 %r19, %r18, %r17;
465; SM70-NEXT:    add.s32 %r20, %r19, 32767;
466; SM70-NEXT:    setp.nan.f32 %p2, %f6, %f6;
467; SM70-NEXT:    or.b32 %r21, %r17, 4194304;
468; SM70-NEXT:    selp.b32 %r22, %r21, %r20, %p2;
469; SM70-NEXT:    prmt.b32 %r23, %r22, %r12, 0x7632U;
470; SM70-NEXT:    st.param.b32 [func_retval0], %r23;
471; SM70-NEXT:    ret;
472;
473; SM80-LABEL: test_fdiv(
474; SM80:       {
475; SM80-NEXT:    .reg .b16 %rs<5>;
476; SM80-NEXT:    .reg .b32 %r<4>;
477; SM80-NEXT:    .reg .f32 %f<7>;
478; SM80-EMPTY:
479; SM80-NEXT:  // %bb.0:
480; SM80-NEXT:    ld.param.b32 %r1, [test_fdiv_param_0];
481; SM80-NEXT:    ld.param.b32 %r2, [test_fdiv_param_1];
482; SM80-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
483; SM80-NEXT:    cvt.f32.bf16 %f1, %rs1;
484; SM80-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
485; SM80-NEXT:    cvt.f32.bf16 %f2, %rs3;
486; SM80-NEXT:    div.rn.f32 %f3, %f2, %f1;
487; SM80-NEXT:    cvt.f32.bf16 %f4, %rs2;
488; SM80-NEXT:    cvt.f32.bf16 %f5, %rs4;
489; SM80-NEXT:    div.rn.f32 %f6, %f5, %f4;
490; SM80-NEXT:    cvt.rn.bf16x2.f32 %r3, %f6, %f3;
491; SM80-NEXT:    st.param.b32 [func_retval0], %r3;
492; SM80-NEXT:    ret;
493;
494; SM80-FTZ-LABEL: test_fdiv(
495; SM80-FTZ:       {
496; SM80-FTZ-NEXT:    .reg .b16 %rs<5>;
497; SM80-FTZ-NEXT:    .reg .b32 %r<4>;
498; SM80-FTZ-NEXT:    .reg .f32 %f<7>;
499; SM80-FTZ-EMPTY:
500; SM80-FTZ-NEXT:  // %bb.0:
501; SM80-FTZ-NEXT:    ld.param.b32 %r1, [test_fdiv_param_0];
502; SM80-FTZ-NEXT:    ld.param.b32 %r2, [test_fdiv_param_1];
503; SM80-FTZ-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
504; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f1, %rs1;
505; SM80-FTZ-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
506; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f2, %rs3;
507; SM80-FTZ-NEXT:    div.rn.ftz.f32 %f3, %f2, %f1;
508; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f4, %rs2;
509; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f5, %rs4;
510; SM80-FTZ-NEXT:    div.rn.ftz.f32 %f6, %f5, %f4;
511; SM80-FTZ-NEXT:    cvt.rn.bf16x2.f32 %r3, %f6, %f3;
512; SM80-FTZ-NEXT:    st.param.b32 [func_retval0], %r3;
513; SM80-FTZ-NEXT:    ret;
514;
515; SM90-LABEL: test_fdiv(
516; SM90:       {
517; SM90-NEXT:    .reg .b16 %rs<5>;
518; SM90-NEXT:    .reg .b32 %r<4>;
519; SM90-NEXT:    .reg .f32 %f<7>;
520; SM90-EMPTY:
521; SM90-NEXT:  // %bb.0:
522; SM90-NEXT:    ld.param.b32 %r1, [test_fdiv_param_0];
523; SM90-NEXT:    ld.param.b32 %r2, [test_fdiv_param_1];
524; SM90-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
525; SM90-NEXT:    cvt.f32.bf16 %f1, %rs1;
526; SM90-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
527; SM90-NEXT:    cvt.f32.bf16 %f2, %rs3;
528; SM90-NEXT:    div.rn.f32 %f3, %f2, %f1;
529; SM90-NEXT:    cvt.f32.bf16 %f4, %rs2;
530; SM90-NEXT:    cvt.f32.bf16 %f5, %rs4;
531; SM90-NEXT:    div.rn.f32 %f6, %f5, %f4;
532; SM90-NEXT:    cvt.rn.bf16x2.f32 %r3, %f6, %f3;
533; SM90-NEXT:    st.param.b32 [func_retval0], %r3;
534; SM90-NEXT:    ret;
535  %r = fdiv <2 x bfloat> %a, %b
536  ret <2 x bfloat> %r
537}
538
; Extracts element 0 of a <2 x bfloat> argument. The expectations use the
; prefix shared by all RUN lines: a single 16-bit load from the start of the
; parameter, stored directly as the return value.
539define bfloat @test_extract_0(<2 x bfloat> %a) #0 {
540; CHECK-LABEL: test_extract_0(
541; CHECK:       {
542; CHECK-NEXT:    .reg .b16 %rs<2>;
543; CHECK-EMPTY:
544; CHECK-NEXT:  // %bb.0:
545; CHECK-NEXT:    ld.param.b16 %rs1, [test_extract_0_param_0];
546; CHECK-NEXT:    st.param.b16 [func_retval0], %rs1;
547; CHECK-NEXT:    ret;
548  %e = extractelement <2 x bfloat> %a, i32 0
549  ret bfloat %e
550}
551
; Extracts element 1 of a <2 x bfloat> argument. The expectations use the
; prefix shared by all RUN lines: a single 16-bit load from byte offset +2 of
; the parameter, stored directly as the return value.
552define bfloat @test_extract_1(<2 x bfloat> %a) #0 {
553; CHECK-LABEL: test_extract_1(
554; CHECK:       {
555; CHECK-NEXT:    .reg .b16 %rs<2>;
556; CHECK-EMPTY:
557; CHECK-NEXT:  // %bb.0:
558; CHECK-NEXT:    ld.param.b16 %rs1, [test_extract_1_param_0+2];
559; CHECK-NEXT:    st.param.b16 [func_retval0], %rs1;
560; CHECK-NEXT:    ret;
561  %e = extractelement <2 x bfloat> %a, i32 1
562  ret bfloat %e
563}
564
; `fpext bfloat to float`. The expected PTX differs per RUN configuration:
;  * sm_70 run: the 16-bit value is loaded zero-extended, shifted left by 16
;    and moved into an f32 register (no cvt instruction).
;  * sm_80 and sm_90 runs: cvt.f32.bf16.
;  * sm_80 run with -denormal-fp-math-f32=preserve-sign: cvt.ftz.f32.bf16.
565define float @test_fpext_float(bfloat %a) #0 {
566; SM70-LABEL: test_fpext_float(
567; SM70:       {
568; SM70-NEXT:    .reg .b32 %r<3>;
569; SM70-NEXT:    .reg .f32 %f<2>;
570; SM70-EMPTY:
571; SM70-NEXT:  // %bb.0:
572; SM70-NEXT:    ld.param.u16 %r1, [test_fpext_float_param_0];
573; SM70-NEXT:    shl.b32 %r2, %r1, 16;
574; SM70-NEXT:    mov.b32 %f1, %r2;
575; SM70-NEXT:    st.param.f32 [func_retval0], %f1;
576; SM70-NEXT:    ret;
577;
578; SM80-LABEL: test_fpext_float(
579; SM80:       {
580; SM80-NEXT:    .reg .b16 %rs<2>;
581; SM80-NEXT:    .reg .f32 %f<2>;
582; SM80-EMPTY:
583; SM80-NEXT:  // %bb.0:
584; SM80-NEXT:    ld.param.b16 %rs1, [test_fpext_float_param_0];
585; SM80-NEXT:    cvt.f32.bf16 %f1, %rs1;
586; SM80-NEXT:    st.param.f32 [func_retval0], %f1;
587; SM80-NEXT:    ret;
588;
589; SM80-FTZ-LABEL: test_fpext_float(
590; SM80-FTZ:       {
591; SM80-FTZ-NEXT:    .reg .b16 %rs<2>;
592; SM80-FTZ-NEXT:    .reg .f32 %f<2>;
593; SM80-FTZ-EMPTY:
594; SM80-FTZ-NEXT:  // %bb.0:
595; SM80-FTZ-NEXT:    ld.param.b16 %rs1, [test_fpext_float_param_0];
596; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f1, %rs1;
597; SM80-FTZ-NEXT:    st.param.f32 [func_retval0], %f1;
598; SM80-FTZ-NEXT:    ret;
599;
600; SM90-LABEL: test_fpext_float(
601; SM90:       {
602; SM90-NEXT:    .reg .b16 %rs<2>;
603; SM90-NEXT:    .reg .f32 %f<2>;
604; SM90-EMPTY:
605; SM90-NEXT:  // %bb.0:
606; SM90-NEXT:    ld.param.b16 %rs1, [test_fpext_float_param_0];
607; SM90-NEXT:    cvt.f32.bf16 %f1, %rs1;
608; SM90-NEXT:    st.param.f32 [func_retval0], %f1;
609; SM90-NEXT:    ret;
610  %r = fpext bfloat %a to float
611  ret float %r
612}
613
; `fptrunc float to bfloat`. The expected PTX differs per RUN configuration:
;  * sm_70 run: an integer sequence on the f32 bit pattern - extract bit 16,
;    add it and 32767 to the value, and for NaN inputs select the pattern
;    OR-ed with 4194304 instead - after which the upper 16 bits are taken.
;    Presumably round-to-nearest-even with NaN quieting; confirm against the
;    lowering code.
;  * all other runs (including the preserve-sign one): cvt.rn.bf16.f32.
614define bfloat @test_fptrunc_float(float %a) #0 {
615; SM70-LABEL: test_fptrunc_float(
616; SM70:       {
617; SM70-NEXT:    .reg .pred %p<2>;
618; SM70-NEXT:    .reg .b16 %rs<2>;
619; SM70-NEXT:    .reg .b32 %r<7>;
620; SM70-NEXT:    .reg .f32 %f<2>;
621; SM70-EMPTY:
622; SM70-NEXT:  // %bb.0:
623; SM70-NEXT:    ld.param.f32 %f1, [test_fptrunc_float_param_0];
624; SM70-NEXT:    mov.b32 %r1, %f1;
625; SM70-NEXT:    bfe.u32 %r2, %r1, 16, 1;
626; SM70-NEXT:    add.s32 %r3, %r2, %r1;
627; SM70-NEXT:    add.s32 %r4, %r3, 32767;
628; SM70-NEXT:    setp.nan.f32 %p1, %f1, %f1;
629; SM70-NEXT:    or.b32 %r5, %r1, 4194304;
630; SM70-NEXT:    selp.b32 %r6, %r5, %r4, %p1;
631; SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r6; }
632; SM70-NEXT:    st.param.b16 [func_retval0], %rs1;
633; SM70-NEXT:    ret;
634;
635; SM80-LABEL: test_fptrunc_float(
636; SM80:       {
637; SM80-NEXT:    .reg .b16 %rs<2>;
638; SM80-NEXT:    .reg .f32 %f<2>;
639; SM80-EMPTY:
640; SM80-NEXT:  // %bb.0:
641; SM80-NEXT:    ld.param.f32 %f1, [test_fptrunc_float_param_0];
642; SM80-NEXT:    cvt.rn.bf16.f32 %rs1, %f1;
643; SM80-NEXT:    st.param.b16 [func_retval0], %rs1;
644; SM80-NEXT:    ret;
645;
646; SM80-FTZ-LABEL: test_fptrunc_float(
647; SM80-FTZ:       {
648; SM80-FTZ-NEXT:    .reg .b16 %rs<2>;
649; SM80-FTZ-NEXT:    .reg .f32 %f<2>;
650; SM80-FTZ-EMPTY:
651; SM80-FTZ-NEXT:  // %bb.0:
652; SM80-FTZ-NEXT:    ld.param.f32 %f1, [test_fptrunc_float_param_0];
653; SM80-FTZ-NEXT:    cvt.rn.bf16.f32 %rs1, %f1;
654; SM80-FTZ-NEXT:    st.param.b16 [func_retval0], %rs1;
655; SM80-FTZ-NEXT:    ret;
656;
657; SM90-LABEL: test_fptrunc_float(
658; SM90:       {
659; SM90-NEXT:    .reg .b16 %rs<2>;
660; SM90-NEXT:    .reg .f32 %f<2>;
661; SM90-EMPTY:
662; SM90-NEXT:  // %bb.0:
663; SM90-NEXT:    ld.param.f32 %f1, [test_fptrunc_float_param_0];
664; SM90-NEXT:    cvt.rn.bf16.f32 %rs1, %f1;
665; SM90-NEXT:    st.param.b16 [func_retval0], %rs1;
666; SM90-NEXT:    ret;
667  %r = fptrunc float %a to bfloat
668  ret bfloat %r
669}
670
; Scalar `fadd bfloat %a, 1.0` (constant right-hand operand). The expected PTX
; differs per RUN configuration:
;  * sm_70 run: widen to f32 (shift left by 16), add.rn.f32 with the immediate
;    0f3F800000, then the integer bit-16/+32767/NaN-select narrowing.
;  * sm_80 run: fma.rn.bf16 where one register holding 0x3F80 is used as both
;    the multiplier and the addend.
;  * sm_80 run with -denormal-fp-math-f32=preserve-sign: cvt.ftz to f32,
;    add.rn.ftz.f32 with 0f3F800000, then cvt.rn.bf16.f32.
;  * sm_90 run: add.rn.bf16 against a register holding 0x3F80.
671define bfloat @test_fadd_imm_1(bfloat %a) #0 {
672; SM70-LABEL: test_fadd_imm_1(
673; SM70:       {
674; SM70-NEXT:    .reg .pred %p<2>;
675; SM70-NEXT:    .reg .b16 %rs<2>;
676; SM70-NEXT:    .reg .b32 %r<9>;
677; SM70-NEXT:    .reg .f32 %f<3>;
678; SM70-EMPTY:
679; SM70-NEXT:  // %bb.0:
680; SM70-NEXT:    ld.param.u16 %r1, [test_fadd_imm_1_param_0];
681; SM70-NEXT:    shl.b32 %r2, %r1, 16;
682; SM70-NEXT:    mov.b32 %f1, %r2;
683; SM70-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
684; SM70-NEXT:    mov.b32 %r3, %f2;
685; SM70-NEXT:    bfe.u32 %r4, %r3, 16, 1;
686; SM70-NEXT:    add.s32 %r5, %r4, %r3;
687; SM70-NEXT:    add.s32 %r6, %r5, 32767;
688; SM70-NEXT:    setp.nan.f32 %p1, %f2, %f2;
689; SM70-NEXT:    or.b32 %r7, %r3, 4194304;
690; SM70-NEXT:    selp.b32 %r8, %r7, %r6, %p1;
691; SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r8; }
692; SM70-NEXT:    st.param.b16 [func_retval0], %rs1;
693; SM70-NEXT:    ret;
694;
695; SM80-LABEL: test_fadd_imm_1(
696; SM80:       {
697; SM80-NEXT:    .reg .b16 %rs<4>;
698; SM80-EMPTY:
699; SM80-NEXT:  // %bb.0:
700; SM80-NEXT:    ld.param.b16 %rs1, [test_fadd_imm_1_param_0];
701; SM80-NEXT:    mov.b16 %rs2, 0x3F80;
702; SM80-NEXT:    fma.rn.bf16 %rs3, %rs1, %rs2, %rs2;
703; SM80-NEXT:    st.param.b16 [func_retval0], %rs3;
704; SM80-NEXT:    ret;
705;
706; SM80-FTZ-LABEL: test_fadd_imm_1(
707; SM80-FTZ:       {
708; SM80-FTZ-NEXT:    .reg .b16 %rs<3>;
709; SM80-FTZ-NEXT:    .reg .f32 %f<3>;
710; SM80-FTZ-EMPTY:
711; SM80-FTZ-NEXT:  // %bb.0:
712; SM80-FTZ-NEXT:    ld.param.b16 %rs1, [test_fadd_imm_1_param_0];
713; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f1, %rs1;
714; SM80-FTZ-NEXT:    add.rn.ftz.f32 %f2, %f1, 0f3F800000;
715; SM80-FTZ-NEXT:    cvt.rn.bf16.f32 %rs2, %f2;
716; SM80-FTZ-NEXT:    st.param.b16 [func_retval0], %rs2;
717; SM80-FTZ-NEXT:    ret;
718;
719; SM90-LABEL: test_fadd_imm_1(
720; SM90:       {
721; SM90-NEXT:    .reg .b16 %rs<4>;
722; SM90-EMPTY:
723; SM90-NEXT:  // %bb.0:
724; SM90-NEXT:    ld.param.b16 %rs1, [test_fadd_imm_1_param_0];
725; SM90-NEXT:    mov.b16 %rs2, 0x3F80;
726; SM90-NEXT:    add.rn.bf16 %rs3, %rs1, %rs2;
727; SM90-NEXT:    st.param.b16 [func_retval0], %rs3;
728; SM90-NEXT:    ret;
729  %r = fadd bfloat %a, 1.0
730  ret bfloat %r
731}
732
; Selects between two bfloat arguments based on an ordered less-than compare
; of two double arguments. The expectations use the prefix shared by all RUN
; lines: setp.lt.f64 produces the predicate and selp.b16 picks %c (param_2)
; when it is true, %d (param_3) otherwise; no bf16 arithmetic is involved.
733define bfloat @test_select_cc_bf16_f64(double %a, double %b, bfloat %c, bfloat %d) #0 {
734; CHECK-LABEL: test_select_cc_bf16_f64(
735; CHECK:       {
736; CHECK-NEXT:    .reg .pred %p<2>;
737; CHECK-NEXT:    .reg .b16 %rs<4>;
738; CHECK-NEXT:    .reg .f64 %fd<3>;
739; CHECK-EMPTY:
740; CHECK-NEXT:  // %bb.0:
741; CHECK-NEXT:    ld.param.f64 %fd1, [test_select_cc_bf16_f64_param_0];
742; CHECK-NEXT:    ld.param.f64 %fd2, [test_select_cc_bf16_f64_param_1];
743; CHECK-NEXT:    setp.lt.f64 %p1, %fd1, %fd2;
744; CHECK-NEXT:    ld.param.b16 %rs1, [test_select_cc_bf16_f64_param_2];
745; CHECK-NEXT:    ld.param.b16 %rs2, [test_select_cc_bf16_f64_param_3];
746; CHECK-NEXT:    selp.b16 %rs3, %rs1, %rs2, %p1;
747; CHECK-NEXT:    st.param.b16 [func_retval0], %rs3;
748; CHECK-NEXT:    ret;
749  %cc = fcmp olt double %a, %b
750  %r = select i1 %cc, bfloat %c, bfloat %d
751  ret bfloat %r
752}
753
754define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
755; SM70-LABEL: test_extload_bf16x8(
756; SM70:       {
757; SM70-NEXT:    .reg .b16 %rs<9>;
758; SM70-NEXT:    .reg .b32 %r<21>;
759; SM70-NEXT:    .reg .f32 %f<9>;
760; SM70-NEXT:    .reg .b64 %rd<2>;
761; SM70-EMPTY:
762; SM70-NEXT:  // %bb.0:
763; SM70-NEXT:    ld.param.u64 %rd1, [test_extload_bf16x8_param_0];
764; SM70-NEXT:    ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
765; SM70-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
766; SM70-NEXT:    mov.b32 {%rs3, %rs4}, %r2;
767; SM70-NEXT:    mov.b32 {%rs5, %rs6}, %r3;
768; SM70-NEXT:    mov.b32 {%rs7, %rs8}, %r4;
769; SM70-NEXT:    cvt.u32.u16 %r5, %rs8;
770; SM70-NEXT:    shl.b32 %r6, %r5, 16;
771; SM70-NEXT:    mov.b32 %f1, %r6;
772; SM70-NEXT:    cvt.u32.u16 %r7, %rs7;
773; SM70-NEXT:    shl.b32 %r8, %r7, 16;
774; SM70-NEXT:    mov.b32 %f2, %r8;
775; SM70-NEXT:    cvt.u32.u16 %r9, %rs6;
776; SM70-NEXT:    shl.b32 %r10, %r9, 16;
777; SM70-NEXT:    mov.b32 %f3, %r10;
778; SM70-NEXT:    cvt.u32.u16 %r11, %rs5;
779; SM70-NEXT:    shl.b32 %r12, %r11, 16;
780; SM70-NEXT:    mov.b32 %f4, %r12;
781; SM70-NEXT:    cvt.u32.u16 %r13, %rs4;
782; SM70-NEXT:    shl.b32 %r14, %r13, 16;
783; SM70-NEXT:    mov.b32 %f5, %r14;
784; SM70-NEXT:    cvt.u32.u16 %r15, %rs3;
785; SM70-NEXT:    shl.b32 %r16, %r15, 16;
786; SM70-NEXT:    mov.b32 %f6, %r16;
787; SM70-NEXT:    cvt.u32.u16 %r17, %rs2;
788; SM70-NEXT:    shl.b32 %r18, %r17, 16;
789; SM70-NEXT:    mov.b32 %f7, %r18;
790; SM70-NEXT:    cvt.u32.u16 %r19, %rs1;
791; SM70-NEXT:    shl.b32 %r20, %r19, 16;
792; SM70-NEXT:    mov.b32 %f8, %r20;
793; SM70-NEXT:    st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
794; SM70-NEXT:    st.param.v4.f32 [func_retval0+16], {%f4, %f3, %f2, %f1};
795; SM70-NEXT:    ret;
796;
797; SM80-LABEL: test_extload_bf16x8(
798; SM80:       {
799; SM80-NEXT:    .reg .b16 %rs<9>;
800; SM80-NEXT:    .reg .b32 %r<5>;
801; SM80-NEXT:    .reg .f32 %f<9>;
802; SM80-NEXT:    .reg .b64 %rd<2>;
803; SM80-EMPTY:
804; SM80-NEXT:  // %bb.0:
805; SM80-NEXT:    ld.param.u64 %rd1, [test_extload_bf16x8_param_0];
806; SM80-NEXT:    ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
807; SM80-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
808; SM80-NEXT:    mov.b32 {%rs3, %rs4}, %r2;
809; SM80-NEXT:    mov.b32 {%rs5, %rs6}, %r3;
810; SM80-NEXT:    mov.b32 {%rs7, %rs8}, %r4;
811; SM80-NEXT:    cvt.f32.bf16 %f1, %rs8;
812; SM80-NEXT:    cvt.f32.bf16 %f2, %rs7;
813; SM80-NEXT:    cvt.f32.bf16 %f3, %rs6;
814; SM80-NEXT:    cvt.f32.bf16 %f4, %rs5;
815; SM80-NEXT:    cvt.f32.bf16 %f5, %rs4;
816; SM80-NEXT:    cvt.f32.bf16 %f6, %rs3;
817; SM80-NEXT:    cvt.f32.bf16 %f7, %rs2;
818; SM80-NEXT:    cvt.f32.bf16 %f8, %rs1;
819; SM80-NEXT:    st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
820; SM80-NEXT:    st.param.v4.f32 [func_retval0+16], {%f4, %f3, %f2, %f1};
821; SM80-NEXT:    ret;
822;
823; SM80-FTZ-LABEL: test_extload_bf16x8(
824; SM80-FTZ:       {
825; SM80-FTZ-NEXT:    .reg .b16 %rs<9>;
826; SM80-FTZ-NEXT:    .reg .b32 %r<5>;
827; SM80-FTZ-NEXT:    .reg .f32 %f<9>;
828; SM80-FTZ-NEXT:    .reg .b64 %rd<2>;
829; SM80-FTZ-EMPTY:
830; SM80-FTZ-NEXT:  // %bb.0:
831; SM80-FTZ-NEXT:    ld.param.u64 %rd1, [test_extload_bf16x8_param_0];
832; SM80-FTZ-NEXT:    ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
833; SM80-FTZ-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
834; SM80-FTZ-NEXT:    mov.b32 {%rs3, %rs4}, %r2;
835; SM80-FTZ-NEXT:    mov.b32 {%rs5, %rs6}, %r3;
836; SM80-FTZ-NEXT:    mov.b32 {%rs7, %rs8}, %r4;
837; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f1, %rs8;
838; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f2, %rs7;
839; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f3, %rs6;
840; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f4, %rs5;
841; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f5, %rs4;
842; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f6, %rs3;
843; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f7, %rs2;
844; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f8, %rs1;
845; SM80-FTZ-NEXT:    st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
846; SM80-FTZ-NEXT:    st.param.v4.f32 [func_retval0+16], {%f4, %f3, %f2, %f1};
847; SM80-FTZ-NEXT:    ret;
848;
849; SM90-LABEL: test_extload_bf16x8(
850; SM90:       {
851; SM90-NEXT:    .reg .b16 %rs<9>;
852; SM90-NEXT:    .reg .b32 %r<5>;
853; SM90-NEXT:    .reg .f32 %f<9>;
854; SM90-NEXT:    .reg .b64 %rd<2>;
855; SM90-EMPTY:
856; SM90-NEXT:  // %bb.0:
857; SM90-NEXT:    ld.param.u64 %rd1, [test_extload_bf16x8_param_0];
858; SM90-NEXT:    ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
859; SM90-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
860; SM90-NEXT:    mov.b32 {%rs3, %rs4}, %r2;
861; SM90-NEXT:    mov.b32 {%rs5, %rs6}, %r3;
862; SM90-NEXT:    mov.b32 {%rs7, %rs8}, %r4;
863; SM90-NEXT:    cvt.f32.bf16 %f1, %rs8;
864; SM90-NEXT:    cvt.f32.bf16 %f2, %rs7;
865; SM90-NEXT:    cvt.f32.bf16 %f3, %rs6;
866; SM90-NEXT:    cvt.f32.bf16 %f4, %rs5;
867; SM90-NEXT:    cvt.f32.bf16 %f5, %rs4;
868; SM90-NEXT:    cvt.f32.bf16 %f6, %rs3;
869; SM90-NEXT:    cvt.f32.bf16 %f7, %rs2;
870; SM90-NEXT:    cvt.f32.bf16 %f8, %rs1;
871; SM90-NEXT:    st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
872; SM90-NEXT:    st.param.v4.f32 [func_retval0+16], {%f4, %f3, %f2, %f1};
873; SM90-NEXT:    ret;
874  %load = load <8 x bfloat>, ptr addrspace(3) %arg, align 16
875  %res = fpext <8 x bfloat> %load to <8 x float>
876  ret <8 x float> %res
877}
878
; fptosi from bfloat to i16.
; sm_70 and sm_80 first widen the operand to f32 (shl.b32 by 16 on sm_70,
; cvt.f32.bf16 on sm_80) and then convert with cvt.rzi.s16.f32; the
; preserve-sign run adds .ftz to both conversions. sm_90 converts in a single
; cvt.rzi.s16.bf16.
define i16 @test_fptosi_i16(bfloat %a) {
; SM70-LABEL: test_fptosi_i16(
; SM70:       {
; SM70-NEXT:    .reg .b16 %rs<2>;
; SM70-NEXT:    .reg .b32 %r<4>;
; SM70-NEXT:    .reg .f32 %f<2>;
; SM70-EMPTY:
; SM70-NEXT:  // %bb.0:
; SM70-NEXT:    ld.param.u16 %r1, [test_fptosi_i16_param_0];
; SM70-NEXT:    shl.b32 %r2, %r1, 16;
; SM70-NEXT:    mov.b32 %f1, %r2;
; SM70-NEXT:    cvt.rzi.s16.f32 %rs1, %f1;
; SM70-NEXT:    cvt.u32.u16 %r3, %rs1;
; SM70-NEXT:    st.param.b32 [func_retval0], %r3;
; SM70-NEXT:    ret;
;
; SM80-LABEL: test_fptosi_i16(
; SM80:       {
; SM80-NEXT:    .reg .b16 %rs<3>;
; SM80-NEXT:    .reg .b32 %r<2>;
; SM80-NEXT:    .reg .f32 %f<2>;
; SM80-EMPTY:
; SM80-NEXT:  // %bb.0:
; SM80-NEXT:    ld.param.b16 %rs1, [test_fptosi_i16_param_0];
; SM80-NEXT:    cvt.f32.bf16 %f1, %rs1;
; SM80-NEXT:    cvt.rzi.s16.f32 %rs2, %f1;
; SM80-NEXT:    cvt.u32.u16 %r1, %rs2;
; SM80-NEXT:    st.param.b32 [func_retval0], %r1;
; SM80-NEXT:    ret;
;
; SM80-FTZ-LABEL: test_fptosi_i16(
; SM80-FTZ:       {
; SM80-FTZ-NEXT:    .reg .b16 %rs<3>;
; SM80-FTZ-NEXT:    .reg .b32 %r<2>;
; SM80-FTZ-NEXT:    .reg .f32 %f<2>;
; SM80-FTZ-EMPTY:
; SM80-FTZ-NEXT:  // %bb.0:
; SM80-FTZ-NEXT:    ld.param.b16 %rs1, [test_fptosi_i16_param_0];
; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f1, %rs1;
; SM80-FTZ-NEXT:    cvt.rzi.ftz.s16.f32 %rs2, %f1;
; SM80-FTZ-NEXT:    cvt.u32.u16 %r1, %rs2;
; SM80-FTZ-NEXT:    st.param.b32 [func_retval0], %r1;
; SM80-FTZ-NEXT:    ret;
;
; SM90-LABEL: test_fptosi_i16(
; SM90:       {
; SM90-NEXT:    .reg .b16 %rs<3>;
; SM90-NEXT:    .reg .b32 %r<2>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b16 %rs1, [test_fptosi_i16_param_0];
; SM90-NEXT:    cvt.rzi.s16.bf16 %rs2, %rs1;
; SM90-NEXT:    cvt.u32.u16 %r1, %rs2;
; SM90-NEXT:    st.param.b32 [func_retval0], %r1;
; SM90-NEXT:    ret;
  %as_int = fptosi bfloat %a to i16
  ret i16 %as_int
}
937
; fptoui from bfloat to i16.
; Unsigned counterpart of the fptosi test: sm_70 and sm_80 go through f32 and
; finish with cvt.rzi.u16.f32 (with .ftz in the preserve-sign run), while sm_90
; emits a single cvt.rzi.u16.bf16.
define i16 @test_fptoui_i16(bfloat %a) {
; SM70-LABEL: test_fptoui_i16(
; SM70:       {
; SM70-NEXT:    .reg .b16 %rs<2>;
; SM70-NEXT:    .reg .b32 %r<4>;
; SM70-NEXT:    .reg .f32 %f<2>;
; SM70-EMPTY:
; SM70-NEXT:  // %bb.0:
; SM70-NEXT:    ld.param.u16 %r1, [test_fptoui_i16_param_0];
; SM70-NEXT:    shl.b32 %r2, %r1, 16;
; SM70-NEXT:    mov.b32 %f1, %r2;
; SM70-NEXT:    cvt.rzi.u16.f32 %rs1, %f1;
; SM70-NEXT:    cvt.u32.u16 %r3, %rs1;
; SM70-NEXT:    st.param.b32 [func_retval0], %r3;
; SM70-NEXT:    ret;
;
; SM80-LABEL: test_fptoui_i16(
; SM80:       {
; SM80-NEXT:    .reg .b16 %rs<3>;
; SM80-NEXT:    .reg .b32 %r<2>;
; SM80-NEXT:    .reg .f32 %f<2>;
; SM80-EMPTY:
; SM80-NEXT:  // %bb.0:
; SM80-NEXT:    ld.param.b16 %rs1, [test_fptoui_i16_param_0];
; SM80-NEXT:    cvt.f32.bf16 %f1, %rs1;
; SM80-NEXT:    cvt.rzi.u16.f32 %rs2, %f1;
; SM80-NEXT:    cvt.u32.u16 %r1, %rs2;
; SM80-NEXT:    st.param.b32 [func_retval0], %r1;
; SM80-NEXT:    ret;
;
; SM80-FTZ-LABEL: test_fptoui_i16(
; SM80-FTZ:       {
; SM80-FTZ-NEXT:    .reg .b16 %rs<3>;
; SM80-FTZ-NEXT:    .reg .b32 %r<2>;
; SM80-FTZ-NEXT:    .reg .f32 %f<2>;
; SM80-FTZ-EMPTY:
; SM80-FTZ-NEXT:  // %bb.0:
; SM80-FTZ-NEXT:    ld.param.b16 %rs1, [test_fptoui_i16_param_0];
; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f1, %rs1;
; SM80-FTZ-NEXT:    cvt.rzi.ftz.u16.f32 %rs2, %f1;
; SM80-FTZ-NEXT:    cvt.u32.u16 %r1, %rs2;
; SM80-FTZ-NEXT:    st.param.b32 [func_retval0], %r1;
; SM80-FTZ-NEXT:    ret;
;
; SM90-LABEL: test_fptoui_i16(
; SM90:       {
; SM90-NEXT:    .reg .b16 %rs<3>;
; SM90-NEXT:    .reg .b32 %r<2>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b16 %rs1, [test_fptoui_i16_param_0];
; SM90-NEXT:    cvt.rzi.u16.bf16 %rs2, %rs1;
; SM90-NEXT:    cvt.u32.u16 %r1, %rs2;
; SM90-NEXT:    st.param.b32 [func_retval0], %r1;
; SM90-NEXT:    ret;
  %as_uint = fptoui bfloat %a to i16
  ret i16 %as_uint
}
996
; sitofp from i16 to bfloat.
; sm_70 converts to f32 with cvt.rn.f32.s16 and then narrows to bfloat with
; integer ops: bfe/add/add 32767, a setp.nan + selp guard that ORs in 4194304,
; and finally the upper half of the 32-bit value is extracted.
; sm_80 (both runs) uses cvt.rn.f32.s16 followed by cvt.rn.bf16.f32.
; sm_90 needs only cvt.rn.bf16.s16.
define bfloat @test_sitofp_i16(i16 %a) {
; SM70-LABEL: test_sitofp_i16(
; SM70:       {
; SM70-NEXT:    .reg .pred %p<2>;
; SM70-NEXT:    .reg .b16 %rs<3>;
; SM70-NEXT:    .reg .b32 %r<7>;
; SM70-NEXT:    .reg .f32 %f<2>;
; SM70-EMPTY:
; SM70-NEXT:  // %bb.0:
; SM70-NEXT:    ld.param.u16 %rs1, [test_sitofp_i16_param_0];
; SM70-NEXT:    cvt.rn.f32.s16 %f1, %rs1;
; SM70-NEXT:    mov.b32 %r1, %f1;
; SM70-NEXT:    bfe.u32 %r2, %r1, 16, 1;
; SM70-NEXT:    add.s32 %r3, %r2, %r1;
; SM70-NEXT:    add.s32 %r4, %r3, 32767;
; SM70-NEXT:    setp.nan.f32 %p1, %f1, %f1;
; SM70-NEXT:    or.b32 %r5, %r1, 4194304;
; SM70-NEXT:    selp.b32 %r6, %r5, %r4, %p1;
; SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r6; }
; SM70-NEXT:    st.param.b16 [func_retval0], %rs2;
; SM70-NEXT:    ret;
;
; SM80-LABEL: test_sitofp_i16(
; SM80:       {
; SM80-NEXT:    .reg .b16 %rs<3>;
; SM80-NEXT:    .reg .f32 %f<2>;
; SM80-EMPTY:
; SM80-NEXT:  // %bb.0:
; SM80-NEXT:    ld.param.u16 %rs1, [test_sitofp_i16_param_0];
; SM80-NEXT:    cvt.rn.f32.s16 %f1, %rs1;
; SM80-NEXT:    cvt.rn.bf16.f32 %rs2, %f1;
; SM80-NEXT:    st.param.b16 [func_retval0], %rs2;
; SM80-NEXT:    ret;
;
; SM80-FTZ-LABEL: test_sitofp_i16(
; SM80-FTZ:       {
; SM80-FTZ-NEXT:    .reg .b16 %rs<3>;
; SM80-FTZ-NEXT:    .reg .f32 %f<2>;
; SM80-FTZ-EMPTY:
; SM80-FTZ-NEXT:  // %bb.0:
; SM80-FTZ-NEXT:    ld.param.u16 %rs1, [test_sitofp_i16_param_0];
; SM80-FTZ-NEXT:    cvt.rn.f32.s16 %f1, %rs1;
; SM80-FTZ-NEXT:    cvt.rn.bf16.f32 %rs2, %f1;
; SM80-FTZ-NEXT:    st.param.b16 [func_retval0], %rs2;
; SM80-FTZ-NEXT:    ret;
;
; SM90-LABEL: test_sitofp_i16(
; SM90:       {
; SM90-NEXT:    .reg .b16 %rs<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.u16 %rs1, [test_sitofp_i16_param_0];
; SM90-NEXT:    cvt.rn.bf16.s16 %rs2, %rs1;
; SM90-NEXT:    st.param.b16 [func_retval0], %rs2;
; SM90-NEXT:    ret;
  %as_bf16 = sitofp i16 %a to bfloat
  ret bfloat %as_bf16
}
1055
; uitofp from i8 to bfloat.
; The argument is loaded with ld.param.u8 into a 16-bit register, so every
; target converts from u16: sm_70 and sm_80 use cvt.rn.f32.u16 followed by the
; f32-to-bfloat narrowing (integer sequence on sm_70, cvt.rn.bf16.f32 on
; sm_80), and sm_90 uses cvt.rn.bf16.u16 directly.
define bfloat @test_uitofp_i8(i8 %a) {
; SM70-LABEL: test_uitofp_i8(
; SM70:       {
; SM70-NEXT:    .reg .pred %p<2>;
; SM70-NEXT:    .reg .b16 %rs<3>;
; SM70-NEXT:    .reg .b32 %r<7>;
; SM70-NEXT:    .reg .f32 %f<2>;
; SM70-EMPTY:
; SM70-NEXT:  // %bb.0:
; SM70-NEXT:    ld.param.u8 %rs1, [test_uitofp_i8_param_0];
; SM70-NEXT:    cvt.rn.f32.u16 %f1, %rs1;
; SM70-NEXT:    mov.b32 %r1, %f1;
; SM70-NEXT:    bfe.u32 %r2, %r1, 16, 1;
; SM70-NEXT:    add.s32 %r3, %r2, %r1;
; SM70-NEXT:    add.s32 %r4, %r3, 32767;
; SM70-NEXT:    setp.nan.f32 %p1, %f1, %f1;
; SM70-NEXT:    or.b32 %r5, %r1, 4194304;
; SM70-NEXT:    selp.b32 %r6, %r5, %r4, %p1;
; SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r6; }
; SM70-NEXT:    st.param.b16 [func_retval0], %rs2;
; SM70-NEXT:    ret;
;
; SM80-LABEL: test_uitofp_i8(
; SM80:       {
; SM80-NEXT:    .reg .b16 %rs<3>;
; SM80-NEXT:    .reg .f32 %f<2>;
; SM80-EMPTY:
; SM80-NEXT:  // %bb.0:
; SM80-NEXT:    ld.param.u8 %rs1, [test_uitofp_i8_param_0];
; SM80-NEXT:    cvt.rn.f32.u16 %f1, %rs1;
; SM80-NEXT:    cvt.rn.bf16.f32 %rs2, %f1;
; SM80-NEXT:    st.param.b16 [func_retval0], %rs2;
; SM80-NEXT:    ret;
;
; SM80-FTZ-LABEL: test_uitofp_i8(
; SM80-FTZ:       {
; SM80-FTZ-NEXT:    .reg .b16 %rs<3>;
; SM80-FTZ-NEXT:    .reg .f32 %f<2>;
; SM80-FTZ-EMPTY:
; SM80-FTZ-NEXT:  // %bb.0:
; SM80-FTZ-NEXT:    ld.param.u8 %rs1, [test_uitofp_i8_param_0];
; SM80-FTZ-NEXT:    cvt.rn.f32.u16 %f1, %rs1;
; SM80-FTZ-NEXT:    cvt.rn.bf16.f32 %rs2, %f1;
; SM80-FTZ-NEXT:    st.param.b16 [func_retval0], %rs2;
; SM80-FTZ-NEXT:    ret;
;
; SM90-LABEL: test_uitofp_i8(
; SM90:       {
; SM90-NEXT:    .reg .b16 %rs<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.u8 %rs1, [test_uitofp_i8_param_0];
; SM90-NEXT:    cvt.rn.bf16.u16 %rs2, %rs1;
; SM90-NEXT:    st.param.b16 [func_retval0], %rs2;
; SM90-NEXT:    ret;
  %as_bf16 = uitofp i8 %a to bfloat
  ret bfloat %as_bf16
}
1114
; uitofp from i1 to bfloat.
; On every target the i1 is loaded as a byte, masked with 1, compared against
; 1, and materialised as a 32-bit 0/1 via selp.u32. sm_70 and sm_80 then use
; cvt.rn.f32.u32 plus the f32-to-bfloat narrowing (integer sequence on sm_70,
; cvt.rn.bf16.f32 on sm_80); sm_90 converts the 32-bit value directly with
; cvt.rn.bf16.u32.
define bfloat @test_uitofp_i1(i1 %a) {
; SM70-LABEL: test_uitofp_i1(
; SM70:       {
; SM70-NEXT:    .reg .pred %p<3>;
; SM70-NEXT:    .reg .b16 %rs<4>;
; SM70-NEXT:    .reg .b32 %r<8>;
; SM70-NEXT:    .reg .f32 %f<2>;
; SM70-EMPTY:
; SM70-NEXT:  // %bb.0:
; SM70-NEXT:    ld.param.u8 %rs1, [test_uitofp_i1_param_0];
; SM70-NEXT:    and.b16 %rs2, %rs1, 1;
; SM70-NEXT:    setp.eq.b16 %p1, %rs2, 1;
; SM70-NEXT:    selp.u32 %r1, 1, 0, %p1;
; SM70-NEXT:    cvt.rn.f32.u32 %f1, %r1;
; SM70-NEXT:    mov.b32 %r2, %f1;
; SM70-NEXT:    bfe.u32 %r3, %r2, 16, 1;
; SM70-NEXT:    add.s32 %r4, %r3, %r2;
; SM70-NEXT:    add.s32 %r5, %r4, 32767;
; SM70-NEXT:    setp.nan.f32 %p2, %f1, %f1;
; SM70-NEXT:    or.b32 %r6, %r2, 4194304;
; SM70-NEXT:    selp.b32 %r7, %r6, %r5, %p2;
; SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs3}, %r7; }
; SM70-NEXT:    st.param.b16 [func_retval0], %rs3;
; SM70-NEXT:    ret;
;
; SM80-LABEL: test_uitofp_i1(
; SM80:       {
; SM80-NEXT:    .reg .pred %p<2>;
; SM80-NEXT:    .reg .b16 %rs<4>;
; SM80-NEXT:    .reg .b32 %r<2>;
; SM80-NEXT:    .reg .f32 %f<2>;
; SM80-EMPTY:
; SM80-NEXT:  // %bb.0:
; SM80-NEXT:    ld.param.u8 %rs1, [test_uitofp_i1_param_0];
; SM80-NEXT:    and.b16 %rs2, %rs1, 1;
; SM80-NEXT:    setp.eq.b16 %p1, %rs2, 1;
; SM80-NEXT:    selp.u32 %r1, 1, 0, %p1;
; SM80-NEXT:    cvt.rn.f32.u32 %f1, %r1;
; SM80-NEXT:    cvt.rn.bf16.f32 %rs3, %f1;
; SM80-NEXT:    st.param.b16 [func_retval0], %rs3;
; SM80-NEXT:    ret;
;
; SM80-FTZ-LABEL: test_uitofp_i1(
; SM80-FTZ:       {
; SM80-FTZ-NEXT:    .reg .pred %p<2>;
; SM80-FTZ-NEXT:    .reg .b16 %rs<4>;
; SM80-FTZ-NEXT:    .reg .b32 %r<2>;
; SM80-FTZ-NEXT:    .reg .f32 %f<2>;
; SM80-FTZ-EMPTY:
; SM80-FTZ-NEXT:  // %bb.0:
; SM80-FTZ-NEXT:    ld.param.u8 %rs1, [test_uitofp_i1_param_0];
; SM80-FTZ-NEXT:    and.b16 %rs2, %rs1, 1;
; SM80-FTZ-NEXT:    setp.eq.b16 %p1, %rs2, 1;
; SM80-FTZ-NEXT:    selp.u32 %r1, 1, 0, %p1;
; SM80-FTZ-NEXT:    cvt.rn.f32.u32 %f1, %r1;
; SM80-FTZ-NEXT:    cvt.rn.bf16.f32 %rs3, %f1;
; SM80-FTZ-NEXT:    st.param.b16 [func_retval0], %rs3;
; SM80-FTZ-NEXT:    ret;
;
; SM90-LABEL: test_uitofp_i1(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<2>;
; SM90-NEXT:    .reg .b16 %rs<4>;
; SM90-NEXT:    .reg .b32 %r<2>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.u8 %rs1, [test_uitofp_i1_param_0];
; SM90-NEXT:    and.b16 %rs2, %rs1, 1;
; SM90-NEXT:    setp.eq.b16 %p1, %rs2, 1;
; SM90-NEXT:    selp.u32 %r1, 1, 0, %p1;
; SM90-NEXT:    cvt.rn.bf16.u32 %rs3, %r1;
; SM90-NEXT:    st.param.b16 [func_retval0], %rs3;
; SM90-NEXT:    ret;
  %as_bf16 = uitofp i1 %a to bfloat
  ret bfloat %as_bf16
}
1191
; uitofp from i16 to bfloat.
; sm_70 and sm_80 convert through f32 with cvt.rn.f32.u16 and then narrow to
; bfloat (integer sequence on sm_70, cvt.rn.bf16.f32 on sm_80); sm_90 emits a
; single cvt.rn.bf16.u16.
define bfloat @test_uitofp_i16(i16 %a) {
; SM70-LABEL: test_uitofp_i16(
; SM70:       {
; SM70-NEXT:    .reg .pred %p<2>;
; SM70-NEXT:    .reg .b16 %rs<3>;
; SM70-NEXT:    .reg .b32 %r<7>;
; SM70-NEXT:    .reg .f32 %f<2>;
; SM70-EMPTY:
; SM70-NEXT:  // %bb.0:
; SM70-NEXT:    ld.param.u16 %rs1, [test_uitofp_i16_param_0];
; SM70-NEXT:    cvt.rn.f32.u16 %f1, %rs1;
; SM70-NEXT:    mov.b32 %r1, %f1;
; SM70-NEXT:    bfe.u32 %r2, %r1, 16, 1;
; SM70-NEXT:    add.s32 %r3, %r2, %r1;
; SM70-NEXT:    add.s32 %r4, %r3, 32767;
; SM70-NEXT:    setp.nan.f32 %p1, %f1, %f1;
; SM70-NEXT:    or.b32 %r5, %r1, 4194304;
; SM70-NEXT:    selp.b32 %r6, %r5, %r4, %p1;
; SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r6; }
; SM70-NEXT:    st.param.b16 [func_retval0], %rs2;
; SM70-NEXT:    ret;
;
; SM80-LABEL: test_uitofp_i16(
; SM80:       {
; SM80-NEXT:    .reg .b16 %rs<3>;
; SM80-NEXT:    .reg .f32 %f<2>;
; SM80-EMPTY:
; SM80-NEXT:  // %bb.0:
; SM80-NEXT:    ld.param.u16 %rs1, [test_uitofp_i16_param_0];
; SM80-NEXT:    cvt.rn.f32.u16 %f1, %rs1;
; SM80-NEXT:    cvt.rn.bf16.f32 %rs2, %f1;
; SM80-NEXT:    st.param.b16 [func_retval0], %rs2;
; SM80-NEXT:    ret;
;
; SM80-FTZ-LABEL: test_uitofp_i16(
; SM80-FTZ:       {
; SM80-FTZ-NEXT:    .reg .b16 %rs<3>;
; SM80-FTZ-NEXT:    .reg .f32 %f<2>;
; SM80-FTZ-EMPTY:
; SM80-FTZ-NEXT:  // %bb.0:
; SM80-FTZ-NEXT:    ld.param.u16 %rs1, [test_uitofp_i16_param_0];
; SM80-FTZ-NEXT:    cvt.rn.f32.u16 %f1, %rs1;
; SM80-FTZ-NEXT:    cvt.rn.bf16.f32 %rs2, %f1;
; SM80-FTZ-NEXT:    st.param.b16 [func_retval0], %rs2;
; SM80-FTZ-NEXT:    ret;
;
; SM90-LABEL: test_uitofp_i16(
; SM90:       {
; SM90-NEXT:    .reg .b16 %rs<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.u16 %rs1, [test_uitofp_i16_param_0];
; SM90-NEXT:    cvt.rn.bf16.u16 %rs2, %rs1;
; SM90-NEXT:    st.param.b16 [func_retval0], %rs2;
; SM90-NEXT:    ret;
  %as_bf16 = uitofp i16 %a to bfloat
  ret bfloat %as_bf16
}
1250
; uitofp from i32 to bfloat.
; sm_70 and sm_80 convert through f32 with cvt.rn.f32.u32 and then narrow to
; bfloat (integer sequence on sm_70, cvt.rn.bf16.f32 on sm_80); sm_90 emits a
; single cvt.rn.bf16.u32.
define bfloat @test_uitofp_i32(i32 %a) {
; SM70-LABEL: test_uitofp_i32(
; SM70:       {
; SM70-NEXT:    .reg .pred %p<2>;
; SM70-NEXT:    .reg .b16 %rs<2>;
; SM70-NEXT:    .reg .b32 %r<8>;
; SM70-NEXT:    .reg .f32 %f<2>;
; SM70-EMPTY:
; SM70-NEXT:  // %bb.0:
; SM70-NEXT:    ld.param.u32 %r1, [test_uitofp_i32_param_0];
; SM70-NEXT:    cvt.rn.f32.u32 %f1, %r1;
; SM70-NEXT:    mov.b32 %r2, %f1;
; SM70-NEXT:    bfe.u32 %r3, %r2, 16, 1;
; SM70-NEXT:    add.s32 %r4, %r3, %r2;
; SM70-NEXT:    add.s32 %r5, %r4, 32767;
; SM70-NEXT:    setp.nan.f32 %p1, %f1, %f1;
; SM70-NEXT:    or.b32 %r6, %r2, 4194304;
; SM70-NEXT:    selp.b32 %r7, %r6, %r5, %p1;
; SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r7; }
; SM70-NEXT:    st.param.b16 [func_retval0], %rs1;
; SM70-NEXT:    ret;
;
; SM80-LABEL: test_uitofp_i32(
; SM80:       {
; SM80-NEXT:    .reg .b16 %rs<2>;
; SM80-NEXT:    .reg .b32 %r<2>;
; SM80-NEXT:    .reg .f32 %f<2>;
; SM80-EMPTY:
; SM80-NEXT:  // %bb.0:
; SM80-NEXT:    ld.param.u32 %r1, [test_uitofp_i32_param_0];
; SM80-NEXT:    cvt.rn.f32.u32 %f1, %r1;
; SM80-NEXT:    cvt.rn.bf16.f32 %rs1, %f1;
; SM80-NEXT:    st.param.b16 [func_retval0], %rs1;
; SM80-NEXT:    ret;
;
; SM80-FTZ-LABEL: test_uitofp_i32(
; SM80-FTZ:       {
; SM80-FTZ-NEXT:    .reg .b16 %rs<2>;
; SM80-FTZ-NEXT:    .reg .b32 %r<2>;
; SM80-FTZ-NEXT:    .reg .f32 %f<2>;
; SM80-FTZ-EMPTY:
; SM80-FTZ-NEXT:  // %bb.0:
; SM80-FTZ-NEXT:    ld.param.u32 %r1, [test_uitofp_i32_param_0];
; SM80-FTZ-NEXT:    cvt.rn.f32.u32 %f1, %r1;
; SM80-FTZ-NEXT:    cvt.rn.bf16.f32 %rs1, %f1;
; SM80-FTZ-NEXT:    st.param.b16 [func_retval0], %rs1;
; SM80-FTZ-NEXT:    ret;
;
; SM90-LABEL: test_uitofp_i32(
; SM90:       {
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<2>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.u32 %r1, [test_uitofp_i32_param_0];
; SM90-NEXT:    cvt.rn.bf16.u32 %rs1, %r1;
; SM90-NEXT:    st.param.b16 [func_retval0], %rs1;
; SM90-NEXT:    ret;
  %as_bf16 = uitofp i32 %a to bfloat
  ret bfloat %as_bf16
}
1312
; uitofp from i64 to bfloat.
; sm_70 and sm_80 convert through f32 with cvt.rn.f32.u64 and then narrow to
; bfloat (integer sequence on sm_70, cvt.rn.bf16.f32 on sm_80); sm_90 emits a
; single cvt.rn.bf16.u64.
define bfloat @test_uitofp_i64(i64 %a) {
; SM70-LABEL: test_uitofp_i64(
; SM70:       {
; SM70-NEXT:    .reg .pred %p<2>;
; SM70-NEXT:    .reg .b16 %rs<2>;
; SM70-NEXT:    .reg .b32 %r<7>;
; SM70-NEXT:    .reg .f32 %f<2>;
; SM70-NEXT:    .reg .b64 %rd<2>;
; SM70-EMPTY:
; SM70-NEXT:  // %bb.0:
; SM70-NEXT:    ld.param.u64 %rd1, [test_uitofp_i64_param_0];
; SM70-NEXT:    cvt.rn.f32.u64 %f1, %rd1;
; SM70-NEXT:    mov.b32 %r1, %f1;
; SM70-NEXT:    bfe.u32 %r2, %r1, 16, 1;
; SM70-NEXT:    add.s32 %r3, %r2, %r1;
; SM70-NEXT:    add.s32 %r4, %r3, 32767;
; SM70-NEXT:    setp.nan.f32 %p1, %f1, %f1;
; SM70-NEXT:    or.b32 %r5, %r1, 4194304;
; SM70-NEXT:    selp.b32 %r6, %r5, %r4, %p1;
; SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r6; }
; SM70-NEXT:    st.param.b16 [func_retval0], %rs1;
; SM70-NEXT:    ret;
;
; SM80-LABEL: test_uitofp_i64(
; SM80:       {
; SM80-NEXT:    .reg .b16 %rs<2>;
; SM80-NEXT:    .reg .f32 %f<2>;
; SM80-NEXT:    .reg .b64 %rd<2>;
; SM80-EMPTY:
; SM80-NEXT:  // %bb.0:
; SM80-NEXT:    ld.param.u64 %rd1, [test_uitofp_i64_param_0];
; SM80-NEXT:    cvt.rn.f32.u64 %f1, %rd1;
; SM80-NEXT:    cvt.rn.bf16.f32 %rs1, %f1;
; SM80-NEXT:    st.param.b16 [func_retval0], %rs1;
; SM80-NEXT:    ret;
;
; SM80-FTZ-LABEL: test_uitofp_i64(
; SM80-FTZ:       {
; SM80-FTZ-NEXT:    .reg .b16 %rs<2>;
; SM80-FTZ-NEXT:    .reg .f32 %f<2>;
; SM80-FTZ-NEXT:    .reg .b64 %rd<2>;
; SM80-FTZ-EMPTY:
; SM80-FTZ-NEXT:  // %bb.0:
; SM80-FTZ-NEXT:    ld.param.u64 %rd1, [test_uitofp_i64_param_0];
; SM80-FTZ-NEXT:    cvt.rn.f32.u64 %f1, %rd1;
; SM80-FTZ-NEXT:    cvt.rn.bf16.f32 %rs1, %f1;
; SM80-FTZ-NEXT:    st.param.b16 [func_retval0], %rs1;
; SM80-FTZ-NEXT:    ret;
;
; SM90-LABEL: test_uitofp_i64(
; SM90:       {
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b64 %rd<2>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.u64 %rd1, [test_uitofp_i64_param_0];
; SM90-NEXT:    cvt.rn.bf16.u64 %rs1, %rd1;
; SM90-NEXT:    st.param.b16 [func_retval0], %rs1;
; SM90-NEXT:    ret;
  %as_bf16 = uitofp i64 %a to bfloat
  ret bfloat %as_bf16
}
1375
; llvm.roundeven on a scalar bfloat.
; sm_70 and sm_80 widen to f32, round with cvt.rni.f32.f32 (cvt.rni.ftz.f32.f32
; in the preserve-sign run), and narrow the result back to bfloat. sm_90
; rounds in place with cvt.rni.bf16.bf16.
define bfloat @test_roundeven(bfloat %a) {
; SM70-LABEL: test_roundeven(
; SM70:       {
; SM70-NEXT:    .reg .pred %p<2>;
; SM70-NEXT:    .reg .b16 %rs<2>;
; SM70-NEXT:    .reg .b32 %r<9>;
; SM70-NEXT:    .reg .f32 %f<3>;
; SM70-EMPTY:
; SM70-NEXT:  // %bb.0:
; SM70-NEXT:    ld.param.u16 %r1, [test_roundeven_param_0];
; SM70-NEXT:    shl.b32 %r2, %r1, 16;
; SM70-NEXT:    mov.b32 %f1, %r2;
; SM70-NEXT:    cvt.rni.f32.f32 %f2, %f1;
; SM70-NEXT:    mov.b32 %r3, %f2;
; SM70-NEXT:    bfe.u32 %r4, %r3, 16, 1;
; SM70-NEXT:    add.s32 %r5, %r4, %r3;
; SM70-NEXT:    add.s32 %r6, %r5, 32767;
; SM70-NEXT:    setp.nan.f32 %p1, %f2, %f2;
; SM70-NEXT:    or.b32 %r7, %r3, 4194304;
; SM70-NEXT:    selp.b32 %r8, %r7, %r6, %p1;
; SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r8; }
; SM70-NEXT:    st.param.b16 [func_retval0], %rs1;
; SM70-NEXT:    ret;
;
; SM80-LABEL: test_roundeven(
; SM80:       {
; SM80-NEXT:    .reg .b16 %rs<3>;
; SM80-NEXT:    .reg .f32 %f<3>;
; SM80-EMPTY:
; SM80-NEXT:  // %bb.0:
; SM80-NEXT:    ld.param.b16 %rs1, [test_roundeven_param_0];
; SM80-NEXT:    cvt.f32.bf16 %f1, %rs1;
; SM80-NEXT:    cvt.rni.f32.f32 %f2, %f1;
; SM80-NEXT:    cvt.rn.bf16.f32 %rs2, %f2;
; SM80-NEXT:    st.param.b16 [func_retval0], %rs2;
; SM80-NEXT:    ret;
;
; SM80-FTZ-LABEL: test_roundeven(
; SM80-FTZ:       {
; SM80-FTZ-NEXT:    .reg .b16 %rs<3>;
; SM80-FTZ-NEXT:    .reg .f32 %f<3>;
; SM80-FTZ-EMPTY:
; SM80-FTZ-NEXT:  // %bb.0:
; SM80-FTZ-NEXT:    ld.param.b16 %rs1, [test_roundeven_param_0];
; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f1, %rs1;
; SM80-FTZ-NEXT:    cvt.rni.ftz.f32.f32 %f2, %f1;
; SM80-FTZ-NEXT:    cvt.rn.bf16.f32 %rs2, %f2;
; SM80-FTZ-NEXT:    st.param.b16 [func_retval0], %rs2;
; SM80-FTZ-NEXT:    ret;
;
; SM90-LABEL: test_roundeven(
; SM90:       {
; SM90-NEXT:    .reg .b16 %rs<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b16 %rs1, [test_roundeven_param_0];
; SM90-NEXT:    cvt.rni.bf16.bf16 %rs2, %rs1;
; SM90-NEXT:    st.param.b16 [func_retval0], %rs2;
; SM90-NEXT:    ret;
  %rounded = call bfloat @llvm.roundeven.bf16(bfloat %a)
  ret bfloat %rounded
}
1438
; llvm.maximum on scalar bfloat.
; sm_70 expands the operation: both inputs are widened to f32 and the result
; is built from setp/selp pairs (greater-than select, 0x7FC0 selected under a
; setp.nan predicate, then selects keyed on each raw input comparing equal to
; 0 and on the intermediate result comparing equal to 0.0).
; sm_80 and sm_90 emit a single max.NaN.bf16.
define bfloat @test_maximum(bfloat %a, bfloat %b) {
; SM70-LABEL: test_maximum(
; SM70:       {
; SM70-NEXT:    .reg .pred %p<6>;
; SM70-NEXT:    .reg .b16 %rs<8>;
; SM70-NEXT:    .reg .b32 %r<7>;
; SM70-NEXT:    .reg .f32 %f<4>;
; SM70-EMPTY:
; SM70-NEXT:  // %bb.0:
; SM70-NEXT:    ld.param.b16 %rs1, [test_maximum_param_0];
; SM70-NEXT:    ld.param.b16 %rs2, [test_maximum_param_1];
; SM70-NEXT:    cvt.u32.u16 %r1, %rs2;
; SM70-NEXT:    shl.b32 %r2, %r1, 16;
; SM70-NEXT:    mov.b32 %f1, %r2;
; SM70-NEXT:    cvt.u32.u16 %r3, %rs1;
; SM70-NEXT:    shl.b32 %r4, %r3, 16;
; SM70-NEXT:    mov.b32 %f2, %r4;
; SM70-NEXT:    setp.gt.f32 %p1, %f2, %f1;
; SM70-NEXT:    selp.b16 %rs3, %rs1, %rs2, %p1;
; SM70-NEXT:    setp.nan.f32 %p2, %f2, %f1;
; SM70-NEXT:    selp.b16 %rs4, 0x7FC0, %rs3, %p2;
; SM70-NEXT:    setp.eq.s16 %p3, %rs1, 0;
; SM70-NEXT:    selp.b16 %rs5, %rs1, %rs4, %p3;
; SM70-NEXT:    setp.eq.s16 %p4, %rs2, 0;
; SM70-NEXT:    selp.b16 %rs6, %rs2, %rs5, %p4;
; SM70-NEXT:    cvt.u32.u16 %r5, %rs4;
; SM70-NEXT:    shl.b32 %r6, %r5, 16;
; SM70-NEXT:    mov.b32 %f3, %r6;
; SM70-NEXT:    setp.eq.f32 %p5, %f3, 0f00000000;
; SM70-NEXT:    selp.b16 %rs7, %rs6, %rs4, %p5;
; SM70-NEXT:    st.param.b16 [func_retval0], %rs7;
; SM70-NEXT:    ret;
;
; SM80-LABEL: test_maximum(
; SM80:       {
; SM80-NEXT:    .reg .b16 %rs<4>;
; SM80-EMPTY:
; SM80-NEXT:  // %bb.0:
; SM80-NEXT:    ld.param.b16 %rs1, [test_maximum_param_0];
; SM80-NEXT:    ld.param.b16 %rs2, [test_maximum_param_1];
; SM80-NEXT:    max.NaN.bf16 %rs3, %rs1, %rs2;
; SM80-NEXT:    st.param.b16 [func_retval0], %rs3;
; SM80-NEXT:    ret;
;
; SM80-FTZ-LABEL: test_maximum(
; SM80-FTZ:       {
; SM80-FTZ-NEXT:    .reg .b16 %rs<4>;
; SM80-FTZ-EMPTY:
; SM80-FTZ-NEXT:  // %bb.0:
; SM80-FTZ-NEXT:    ld.param.b16 %rs1, [test_maximum_param_0];
; SM80-FTZ-NEXT:    ld.param.b16 %rs2, [test_maximum_param_1];
; SM80-FTZ-NEXT:    max.NaN.bf16 %rs3, %rs1, %rs2;
; SM80-FTZ-NEXT:    st.param.b16 [func_retval0], %rs3;
; SM80-FTZ-NEXT:    ret;
;
; SM90-LABEL: test_maximum(
; SM90:       {
; SM90-NEXT:    .reg .b16 %rs<4>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b16 %rs1, [test_maximum_param_0];
; SM90-NEXT:    ld.param.b16 %rs2, [test_maximum_param_1];
; SM90-NEXT:    max.NaN.bf16 %rs3, %rs1, %rs2;
; SM90-NEXT:    st.param.b16 [func_retval0], %rs3;
; SM90-NEXT:    ret;
  %larger = call bfloat @llvm.maximum.bf16(bfloat %a, bfloat %b)
  ret bfloat %larger
}
1507
; llvm.maxnum on scalar bfloat.
; sm_70 widens both inputs to f32, applies max.f32, and narrows the result
; back to bfloat with the integer sequence. sm_80 and sm_90 emit a single
; max.bf16.
define bfloat @test_maxnum(bfloat %a, bfloat %b) {
; SM70-LABEL: test_maxnum(
; SM70:       {
; SM70-NEXT:    .reg .pred %p<2>;
; SM70-NEXT:    .reg .b16 %rs<2>;
; SM70-NEXT:    .reg .b32 %r<11>;
; SM70-NEXT:    .reg .f32 %f<4>;
; SM70-EMPTY:
; SM70-NEXT:  // %bb.0:
; SM70-NEXT:    ld.param.u16 %r1, [test_maxnum_param_1];
; SM70-NEXT:    shl.b32 %r2, %r1, 16;
; SM70-NEXT:    mov.b32 %f1, %r2;
; SM70-NEXT:    ld.param.u16 %r3, [test_maxnum_param_0];
; SM70-NEXT:    shl.b32 %r4, %r3, 16;
; SM70-NEXT:    mov.b32 %f2, %r4;
; SM70-NEXT:    max.f32 %f3, %f2, %f1;
; SM70-NEXT:    mov.b32 %r5, %f3;
; SM70-NEXT:    bfe.u32 %r6, %r5, 16, 1;
; SM70-NEXT:    add.s32 %r7, %r6, %r5;
; SM70-NEXT:    add.s32 %r8, %r7, 32767;
; SM70-NEXT:    setp.nan.f32 %p1, %f3, %f3;
; SM70-NEXT:    or.b32 %r9, %r5, 4194304;
; SM70-NEXT:    selp.b32 %r10, %r9, %r8, %p1;
; SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r10; }
; SM70-NEXT:    st.param.b16 [func_retval0], %rs1;
; SM70-NEXT:    ret;
;
; SM80-LABEL: test_maxnum(
; SM80:       {
; SM80-NEXT:    .reg .b16 %rs<4>;
; SM80-EMPTY:
; SM80-NEXT:  // %bb.0:
; SM80-NEXT:    ld.param.b16 %rs1, [test_maxnum_param_0];
; SM80-NEXT:    ld.param.b16 %rs2, [test_maxnum_param_1];
; SM80-NEXT:    max.bf16 %rs3, %rs1, %rs2;
; SM80-NEXT:    st.param.b16 [func_retval0], %rs3;
; SM80-NEXT:    ret;
;
; SM80-FTZ-LABEL: test_maxnum(
; SM80-FTZ:       {
; SM80-FTZ-NEXT:    .reg .b16 %rs<4>;
; SM80-FTZ-EMPTY:
; SM80-FTZ-NEXT:  // %bb.0:
; SM80-FTZ-NEXT:    ld.param.b16 %rs1, [test_maxnum_param_0];
; SM80-FTZ-NEXT:    ld.param.b16 %rs2, [test_maxnum_param_1];
; SM80-FTZ-NEXT:    max.bf16 %rs3, %rs1, %rs2;
; SM80-FTZ-NEXT:    st.param.b16 [func_retval0], %rs3;
; SM80-FTZ-NEXT:    ret;
;
; SM90-LABEL: test_maxnum(
; SM90:       {
; SM90-NEXT:    .reg .b16 %rs<4>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b16 %rs1, [test_maxnum_param_0];
; SM90-NEXT:    ld.param.b16 %rs2, [test_maxnum_param_1];
; SM90-NEXT:    max.bf16 %rs3, %rs1, %rs2;
; SM90-NEXT:    st.param.b16 [func_retval0], %rs3;
; SM90-NEXT:    ret;
  %larger = call bfloat @llvm.maxnum.bf16(bfloat %a, bfloat %b)
  ret bfloat %larger
}
1570
; llvm.maximum on <2 x bfloat>.
; sm_70 unpacks both 32-bit operands into 16-bit lanes and repeats the scalar
; expansion per lane (f32 compares plus selp.b16, with 0x7FC0 selected under a
; setp.nan predicate), then repacks the two results into one 32-bit register.
; sm_80 and sm_90 emit a single max.NaN.bf16x2.
;
; The intrinsic is spelled with its canonical overload suffix (.v2bf16) so the
; name matches the <2 x bfloat> operand type instead of the scalar .bf16 form.
define <2 x bfloat> @test_maximum_v2(<2 x bfloat> %a, <2 x bfloat> %b) {
; SM70-LABEL: test_maximum_v2(
; SM70:       {
; SM70-NEXT:    .reg .pred %p<11>;
; SM70-NEXT:    .reg .b16 %rs<15>;
; SM70-NEXT:    .reg .b32 %r<16>;
; SM70-NEXT:    .reg .f32 %f<7>;
; SM70-EMPTY:
; SM70-NEXT:  // %bb.0:
; SM70-NEXT:    ld.param.b32 %r1, [test_maximum_v2_param_0];
; SM70-NEXT:    ld.param.b32 %r2, [test_maximum_v2_param_1];
; SM70-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
; SM70-NEXT:    cvt.u32.u16 %r3, %rs2;
; SM70-NEXT:    shl.b32 %r4, %r3, 16;
; SM70-NEXT:    mov.b32 %f1, %r4;
; SM70-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
; SM70-NEXT:    cvt.u32.u16 %r5, %rs4;
; SM70-NEXT:    shl.b32 %r6, %r5, 16;
; SM70-NEXT:    mov.b32 %f2, %r6;
; SM70-NEXT:    setp.gt.f32 %p1, %f2, %f1;
; SM70-NEXT:    selp.b16 %rs5, %rs4, %rs2, %p1;
; SM70-NEXT:    setp.nan.f32 %p2, %f2, %f1;
; SM70-NEXT:    selp.b16 %rs6, 0x7FC0, %rs5, %p2;
; SM70-NEXT:    setp.eq.s16 %p3, %rs4, 0;
; SM70-NEXT:    selp.b16 %rs7, %rs4, %rs6, %p3;
; SM70-NEXT:    setp.eq.s16 %p4, %rs2, 0;
; SM70-NEXT:    selp.b16 %rs8, %rs2, %rs7, %p4;
; SM70-NEXT:    cvt.u32.u16 %r7, %rs6;
; SM70-NEXT:    shl.b32 %r8, %r7, 16;
; SM70-NEXT:    mov.b32 %f3, %r8;
; SM70-NEXT:    setp.eq.f32 %p5, %f3, 0f00000000;
; SM70-NEXT:    selp.b16 %rs9, %rs8, %rs6, %p5;
; SM70-NEXT:    cvt.u32.u16 %r9, %rs1;
; SM70-NEXT:    shl.b32 %r10, %r9, 16;
; SM70-NEXT:    mov.b32 %f4, %r10;
; SM70-NEXT:    cvt.u32.u16 %r11, %rs3;
; SM70-NEXT:    shl.b32 %r12, %r11, 16;
; SM70-NEXT:    mov.b32 %f5, %r12;
; SM70-NEXT:    setp.gt.f32 %p6, %f5, %f4;
; SM70-NEXT:    selp.b16 %rs10, %rs3, %rs1, %p6;
; SM70-NEXT:    setp.nan.f32 %p7, %f5, %f4;
; SM70-NEXT:    selp.b16 %rs11, 0x7FC0, %rs10, %p7;
; SM70-NEXT:    setp.eq.s16 %p8, %rs3, 0;
; SM70-NEXT:    selp.b16 %rs12, %rs3, %rs11, %p8;
; SM70-NEXT:    setp.eq.s16 %p9, %rs1, 0;
; SM70-NEXT:    selp.b16 %rs13, %rs1, %rs12, %p9;
; SM70-NEXT:    cvt.u32.u16 %r13, %rs11;
; SM70-NEXT:    shl.b32 %r14, %r13, 16;
; SM70-NEXT:    mov.b32 %f6, %r14;
; SM70-NEXT:    setp.eq.f32 %p10, %f6, 0f00000000;
; SM70-NEXT:    selp.b16 %rs14, %rs13, %rs11, %p10;
; SM70-NEXT:    mov.b32 %r15, {%rs14, %rs9};
; SM70-NEXT:    st.param.b32 [func_retval0], %r15;
; SM70-NEXT:    ret;
;
; SM80-LABEL: test_maximum_v2(
; SM80:       {
; SM80-NEXT:    .reg .b32 %r<4>;
; SM80-EMPTY:
; SM80-NEXT:  // %bb.0:
; SM80-NEXT:    ld.param.b32 %r1, [test_maximum_v2_param_1];
; SM80-NEXT:    ld.param.b32 %r2, [test_maximum_v2_param_0];
; SM80-NEXT:    max.NaN.bf16x2 %r3, %r2, %r1;
; SM80-NEXT:    st.param.b32 [func_retval0], %r3;
; SM80-NEXT:    ret;
;
; SM80-FTZ-LABEL: test_maximum_v2(
; SM80-FTZ:       {
; SM80-FTZ-NEXT:    .reg .b32 %r<4>;
; SM80-FTZ-EMPTY:
; SM80-FTZ-NEXT:  // %bb.0:
; SM80-FTZ-NEXT:    ld.param.b32 %r1, [test_maximum_v2_param_1];
; SM80-FTZ-NEXT:    ld.param.b32 %r2, [test_maximum_v2_param_0];
; SM80-FTZ-NEXT:    max.NaN.bf16x2 %r3, %r2, %r1;
; SM80-FTZ-NEXT:    st.param.b32 [func_retval0], %r3;
; SM80-FTZ-NEXT:    ret;
;
; SM90-LABEL: test_maximum_v2(
; SM90:       {
; SM90-NEXT:    .reg .b32 %r<4>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b32 %r1, [test_maximum_v2_param_1];
; SM90-NEXT:    ld.param.b32 %r2, [test_maximum_v2_param_0];
; SM90-NEXT:    max.NaN.bf16x2 %r3, %r2, %r1;
; SM90-NEXT:    st.param.b32 [func_retval0], %r3;
; SM90-NEXT:    ret;
  %r = call <2 x bfloat> @llvm.maximum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
  ret <2 x bfloat> %r
}
1661
; test_maxnum_v2 lowers llvm.maxnum.v2bf16 applied to two <2 x bfloat> arguments.
; The assertions below are autogenerated by utils/update_llc_test_checks.py
; (see the NOTE on the first line of the file) - regenerate them instead of
; editing them by hand.
;
; Expected PTX per RUN configuration
;   * sm_70 - the checks contain no bf16 max instruction. Each 16-bit lane is
;     widened (cvt.u32.u16, shl by 16, mov.b32 into an f32 register), the two
;     operands are combined with max.f32, and the result is narrowed again by
;     the bfe / add / add 32767 sequence plus a setp.nan + or 4194304 + selp
;     fix-up. The two lanes are then packed by prmt.b32 with selector 0x7632U.
;     NOTE(review) the narrowing sequence looks like round-to-nearest-even
;     with NaN quieting - confirm against the NVPTX lowering code.
;   * sm_80 (with and without -denormal-fp-math-f32=preserve-sign) and sm_90 -
;     a single max.bf16x2 on the two packed 32-bit parameters.
1662define <2 x bfloat> @test_maxnum_v2(<2 x bfloat> %a, <2 x bfloat> %b) {
1663; SM70-LABEL: test_maxnum_v2(
1664; SM70:       {
1665; SM70-NEXT:    .reg .pred %p<3>;
1666; SM70-NEXT:    .reg .b16 %rs<5>;
1667; SM70-NEXT:    .reg .b32 %r<24>;
1668; SM70-NEXT:    .reg .f32 %f<7>;
1669; SM70-EMPTY:
1670; SM70-NEXT:  // %bb.0:
1671; SM70-NEXT:    ld.param.b32 %r1, [test_maxnum_v2_param_0];
1672; SM70-NEXT:    ld.param.b32 %r2, [test_maxnum_v2_param_1];
1673; SM70-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
1674; SM70-NEXT:    cvt.u32.u16 %r3, %rs2;
1675; SM70-NEXT:    shl.b32 %r4, %r3, 16;
1676; SM70-NEXT:    mov.b32 %f1, %r4;
1677; SM70-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
1678; SM70-NEXT:    cvt.u32.u16 %r5, %rs4;
1679; SM70-NEXT:    shl.b32 %r6, %r5, 16;
1680; SM70-NEXT:    mov.b32 %f2, %r6;
1681; SM70-NEXT:    max.f32 %f3, %f2, %f1;
1682; SM70-NEXT:    mov.b32 %r7, %f3;
1683; SM70-NEXT:    bfe.u32 %r8, %r7, 16, 1;
1684; SM70-NEXT:    add.s32 %r9, %r8, %r7;
1685; SM70-NEXT:    add.s32 %r10, %r9, 32767;
1686; SM70-NEXT:    setp.nan.f32 %p1, %f3, %f3;
1687; SM70-NEXT:    or.b32 %r11, %r7, 4194304;
1688; SM70-NEXT:    selp.b32 %r12, %r11, %r10, %p1;
1689; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
1690; SM70-NEXT:    shl.b32 %r14, %r13, 16;
1691; SM70-NEXT:    mov.b32 %f4, %r14;
1692; SM70-NEXT:    cvt.u32.u16 %r15, %rs3;
1693; SM70-NEXT:    shl.b32 %r16, %r15, 16;
1694; SM70-NEXT:    mov.b32 %f5, %r16;
1695; SM70-NEXT:    max.f32 %f6, %f5, %f4;
1696; SM70-NEXT:    mov.b32 %r17, %f6;
1697; SM70-NEXT:    bfe.u32 %r18, %r17, 16, 1;
1698; SM70-NEXT:    add.s32 %r19, %r18, %r17;
1699; SM70-NEXT:    add.s32 %r20, %r19, 32767;
1700; SM70-NEXT:    setp.nan.f32 %p2, %f6, %f6;
1701; SM70-NEXT:    or.b32 %r21, %r17, 4194304;
1702; SM70-NEXT:    selp.b32 %r22, %r21, %r20, %p2;
1703; SM70-NEXT:    prmt.b32 %r23, %r22, %r12, 0x7632U;
1704; SM70-NEXT:    st.param.b32 [func_retval0], %r23;
1705; SM70-NEXT:    ret;
1706;
1707; SM80-LABEL: test_maxnum_v2(
1708; SM80:       {
1709; SM80-NEXT:    .reg .b32 %r<4>;
1710; SM80-EMPTY:
1711; SM80-NEXT:  // %bb.0:
1712; SM80-NEXT:    ld.param.b32 %r1, [test_maxnum_v2_param_1];
1713; SM80-NEXT:    ld.param.b32 %r2, [test_maxnum_v2_param_0];
1714; SM80-NEXT:    max.bf16x2 %r3, %r2, %r1;
1715; SM80-NEXT:    st.param.b32 [func_retval0], %r3;
1716; SM80-NEXT:    ret;
1717;
1718; SM80-FTZ-LABEL: test_maxnum_v2(
1719; SM80-FTZ:       {
1720; SM80-FTZ-NEXT:    .reg .b32 %r<4>;
1721; SM80-FTZ-EMPTY:
1722; SM80-FTZ-NEXT:  // %bb.0:
1723; SM80-FTZ-NEXT:    ld.param.b32 %r1, [test_maxnum_v2_param_1];
1724; SM80-FTZ-NEXT:    ld.param.b32 %r2, [test_maxnum_v2_param_0];
1725; SM80-FTZ-NEXT:    max.bf16x2 %r3, %r2, %r1;
1726; SM80-FTZ-NEXT:    st.param.b32 [func_retval0], %r3;
1727; SM80-FTZ-NEXT:    ret;
1728;
1729; SM90-LABEL: test_maxnum_v2(
1730; SM90:       {
1731; SM90-NEXT:    .reg .b32 %r<4>;
1732; SM90-EMPTY:
1733; SM90-NEXT:  // %bb.0:
1734; SM90-NEXT:    ld.param.b32 %r1, [test_maxnum_v2_param_1];
1735; SM90-NEXT:    ld.param.b32 %r2, [test_maxnum_v2_param_0];
1736; SM90-NEXT:    max.bf16x2 %r3, %r2, %r1;
1737; SM90-NEXT:    st.param.b32 [func_retval0], %r3;
1738; SM90-NEXT:    ret;
  ; The v2bf16 overload suffix matches the <2 x bfloat> operand type, and both
  ; operands are passed through unchanged from the function arguments.
1739  %r = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
1740  ret <2 x bfloat> %r
1741}
1742