xref: /llvm-project/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll (revision 5e5fd0e6fc50cc1198750308c11433a5b3acfd0f)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 | FileCheck --check-prefixes=CHECK,SM80 %s
3; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | FileCheck --check-prefixes=CHECK,SM90 %s
4; RUN: %if ptxas-11.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 | %ptxas-verify -arch=sm_80 %}
5; RUN: %if ptxas-11.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %}
6
; Module-wide data layout string, applied to every function in this file:
; little-endian ("e"), i64 aligned to 64 bits, i128 aligned to 128 bits,
; native integer widths 32 and 64, natural stack alignment 128 bits.
; NOTE(review): "m:o" selects Mach-O style name mangling, which looks
; inherited rather than chosen for this target - confirm it is intentional.
7target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
8
; Returns the constant vector <1.0, 2.0> as <2 x bfloat>.
; Both run configurations share one expectation: the pair is materialized as a
; single 32-bit immediate, 1073758080 (0x40003F80: 0x3F80 in the low half,
; 0x4000 in the high half), and stored straight to the return slot.
9define <2 x bfloat> @test_ret_const() #0 {
10; CHECK-LABEL: test_ret_const(
11; CHECK:       {
12; CHECK-NEXT:    .reg .b32 %r<2>;
13; CHECK-EMPTY:
14; CHECK-NEXT:  // %bb.0:
15; CHECK-NEXT:    mov.b32 %r1, 1073758080;
16; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
17; CHECK-NEXT:    ret;
18  ret <2 x bfloat> <bfloat 1.0, bfloat 2.0>
19}
20
21; Check that we can lower fadd with immediate arguments.
; Vector case: adds the constant <1.0, 2.0> to %a.
; With -mcpu=sm_80 the add is expected as fma.rn.bf16x2, multiplying %a by
; 1065369472 (0x3F803F80, i.e. 0x3F80 in both halves) and adding the packed
; constant 1073758080 (0x40003F80).
; With -mcpu=sm_90 a direct add.rn.bf16x2 against the packed constant is
; expected instead.
22define <2 x bfloat> @test_fadd_imm_0(<2 x bfloat> %a) #0 {
23; SM80-LABEL: test_fadd_imm_0(
24; SM80:       {
25; SM80-NEXT:    .reg .b32 %r<5>;
26; SM80-EMPTY:
27; SM80-NEXT:  // %bb.0:
28; SM80-NEXT:    ld.param.b32 %r1, [test_fadd_imm_0_param_0];
29; SM80-NEXT:    mov.b32 %r2, 1065369472;
30; SM80-NEXT:    mov.b32 %r3, 1073758080;
31; SM80-NEXT:    fma.rn.bf16x2 %r4, %r1, %r2, %r3;
32; SM80-NEXT:    st.param.b32 [func_retval0], %r4;
33; SM80-NEXT:    ret;
34;
35; SM90-LABEL: test_fadd_imm_0(
36; SM90:       {
37; SM90-NEXT:    .reg .b32 %r<4>;
38; SM90-EMPTY:
39; SM90-NEXT:  // %bb.0:
40; SM90-NEXT:    ld.param.b32 %r1, [test_fadd_imm_0_param_0];
41; SM90-NEXT:    mov.b32 %r2, 1073758080;
42; SM90-NEXT:    add.rn.bf16x2 %r3, %r1, %r2;
43; SM90-NEXT:    st.param.b32 [func_retval0], %r3;
44; SM90-NEXT:    ret;
45  %r = fadd <2 x bfloat> <bfloat 1.0, bfloat 2.0>, %a
46  ret <2 x bfloat> %r
47}
48
; Scalar counterpart of the immediate-add test: %a + 1.0 on a single bfloat.
; With -mcpu=sm_80 the expected form is fma.rn.bf16 where the same 0x3F80
; register serves as both the multiplier and the addend (%a * 1.0 + 1.0).
; With -mcpu=sm_90 a direct add.rn.bf16 with the 0x3F80 immediate is expected.
49define bfloat @test_fadd_imm_1(bfloat %a) #0 {
50; SM80-LABEL: test_fadd_imm_1(
51; SM80:       {
52; SM80-NEXT:    .reg .b16 %rs<4>;
53; SM80-EMPTY:
54; SM80-NEXT:  // %bb.0:
55; SM80-NEXT:    ld.param.b16 %rs1, [test_fadd_imm_1_param_0];
56; SM80-NEXT:    mov.b16 %rs2, 0x3F80;
57; SM80-NEXT:    fma.rn.bf16 %rs3, %rs1, %rs2, %rs2;
58; SM80-NEXT:    st.param.b16 [func_retval0], %rs3;
59; SM80-NEXT:    ret;
60;
61; SM90-LABEL: test_fadd_imm_1(
62; SM90:       {
63; SM90-NEXT:    .reg .b16 %rs<4>;
64; SM90-EMPTY:
65; SM90-NEXT:  // %bb.0:
66; SM90-NEXT:    ld.param.b16 %rs1, [test_fadd_imm_1_param_0];
67; SM90-NEXT:    mov.b16 %rs2, 0x3F80;
68; SM90-NEXT:    add.rn.bf16 %rs3, %rs1, %rs2;
69; SM90-NEXT:    st.param.b16 [func_retval0], %rs3;
70; SM90-NEXT:    ret;
71  %r = fadd bfloat %a, 1.0
72  ret bfloat %r
73}
74
; Packed subtraction %a - %b.
; With -mcpu=sm_80 the expected form is fma.rn.bf16x2 computing
; %b * K + %a, where K = -1082081408 (0xBF80BF80, i.e. 0x3F80 with the sign
; bit set in both halves).
; With -mcpu=sm_90 a direct sub.rn.bf16x2 is expected.
75define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
76; SM80-LABEL: test_fsubx2(
77; SM80:       {
78; SM80-NEXT:    .reg .b32 %r<5>;
79; SM80-EMPTY:
80; SM80-NEXT:  // %bb.0:
81; SM80-NEXT:    ld.param.b32 %r1, [test_fsubx2_param_0];
82; SM80-NEXT:    ld.param.b32 %r2, [test_fsubx2_param_1];
83; SM80-NEXT:    mov.b32 %r3, -1082081408;
84; SM80-NEXT:    fma.rn.bf16x2 %r4, %r2, %r3, %r1;
85; SM80-NEXT:    st.param.b32 [func_retval0], %r4;
86; SM80-NEXT:    ret;
87;
88; SM90-LABEL: test_fsubx2(
89; SM90:       {
90; SM90-NEXT:    .reg .b32 %r<4>;
91; SM90-EMPTY:
92; SM90-NEXT:  // %bb.0:
93; SM90-NEXT:    ld.param.b32 %r1, [test_fsubx2_param_1];
94; SM90-NEXT:    ld.param.b32 %r2, [test_fsubx2_param_0];
95; SM90-NEXT:    sub.rn.bf16x2 %r3, %r2, %r1;
96; SM90-NEXT:    st.param.b32 [func_retval0], %r3;
97; SM90-NEXT:    ret;
98  %r = fsub <2 x bfloat> %a, %b
99  ret <2 x bfloat> %r
100}
101
; Packed multiplication %a * %b.
; With -mcpu=sm_80 the expected form is fma.rn.bf16x2 with the addend
; -2147450880 (0x80008000: only the sign bit set in each 16-bit half).
; With -mcpu=sm_90 a direct mul.rn.bf16x2 is expected.
102define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
103; SM80-LABEL: test_fmulx2(
104; SM80:       {
105; SM80-NEXT:    .reg .b32 %r<5>;
106; SM80-EMPTY:
107; SM80-NEXT:  // %bb.0:
108; SM80-NEXT:    ld.param.b32 %r1, [test_fmulx2_param_1];
109; SM80-NEXT:    ld.param.b32 %r2, [test_fmulx2_param_0];
110; SM80-NEXT:    mov.b32 %r3, -2147450880;
111; SM80-NEXT:    fma.rn.bf16x2 %r4, %r2, %r1, %r3;
112; SM80-NEXT:    st.param.b32 [func_retval0], %r4;
113; SM80-NEXT:    ret;
114;
115; SM90-LABEL: test_fmulx2(
116; SM90:       {
117; SM90-NEXT:    .reg .b32 %r<4>;
118; SM90-EMPTY:
119; SM90-NEXT:  // %bb.0:
120; SM90-NEXT:    ld.param.b32 %r1, [test_fmulx2_param_1];
121; SM90-NEXT:    ld.param.b32 %r2, [test_fmulx2_param_0];
122; SM90-NEXT:    mul.rn.bf16x2 %r3, %r2, %r1;
123; SM90-NEXT:    st.param.b32 [func_retval0], %r3;
124; SM90-NEXT:    ret;
125  %r = fmul <2 x bfloat> %a, %b
126  ret <2 x bfloat> %r
127}
128
; Packed division %a / %b.
; Both run configurations share one expectation: each operand is unpacked into
; two 16-bit halves, every half is widened with cvt.f32.bf16, the two lanes
; are divided with div.rn.f32, and the quotients are repacked by a single
; cvt.rn.bf16x2.f32 (second lane listed first).
129define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
130; CHECK-LABEL: test_fdiv(
131; CHECK:       {
132; CHECK-NEXT:    .reg .b16 %rs<5>;
133; CHECK-NEXT:    .reg .b32 %r<4>;
134; CHECK-NEXT:    .reg .f32 %f<7>;
135; CHECK-EMPTY:
136; CHECK-NEXT:  // %bb.0:
137; CHECK-NEXT:    ld.param.b32 %r1, [test_fdiv_param_0];
138; CHECK-NEXT:    ld.param.b32 %r2, [test_fdiv_param_1];
139; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
140; CHECK-NEXT:    cvt.f32.bf16 %f1, %rs1;
141; CHECK-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
142; CHECK-NEXT:    cvt.f32.bf16 %f2, %rs3;
143; CHECK-NEXT:    div.rn.f32 %f3, %f2, %f1;
144; CHECK-NEXT:    cvt.f32.bf16 %f4, %rs2;
145; CHECK-NEXT:    cvt.f32.bf16 %f5, %rs4;
146; CHECK-NEXT:    div.rn.f32 %f6, %f5, %f4;
147; CHECK-NEXT:    cvt.rn.bf16x2.f32 %r3, %f6, %f3;
148; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
149; CHECK-NEXT:    ret;
150  %r = fdiv <2 x bfloat> %a, %b
151  ret <2 x bfloat> %r
152}
153
; Packed negation of %a.
; Both run configurations expect a pure integer operation: the 32-bit value is
; XORed with -2147450880 (0x80008000), flipping bit 15 of each 16-bit half.
154define <2 x bfloat> @test_fneg(<2 x bfloat> %a) #0 {
155; CHECK-LABEL: test_fneg(
156; CHECK:       {
157; CHECK-NEXT:    .reg .b32 %r<3>;
158; CHECK-EMPTY:
159; CHECK-NEXT:  // %bb.0:
160; CHECK-NEXT:    ld.param.u32 %r1, [test_fneg_param_0];
161; CHECK-NEXT:    xor.b32 %r2, %r1, -2147450880;
162; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
163; CHECK-NEXT:    ret;
164  %r = fneg <2 x bfloat> %a
165  ret <2 x bfloat> %r
166}
167
; Copies one <2 x bfloat> from the address in %a to the address in %b
; (the store is annotated align 16; the load carries no explicit alignment).
; Both run configurations expect the vector to move as a single 32-bit value:
; one ld.b32 followed by one st.b32.
168define void @test_ldst_v2bf16(ptr %a, ptr %b) {
169; CHECK-LABEL: test_ldst_v2bf16(
170; CHECK:       {
171; CHECK-NEXT:    .reg .b32 %r<2>;
172; CHECK-NEXT:    .reg .b64 %rd<3>;
173; CHECK-EMPTY:
174; CHECK-NEXT:  // %bb.0:
175; CHECK-NEXT:    ld.param.u64 %rd1, [test_ldst_v2bf16_param_0];
176; CHECK-NEXT:    ld.b32 %r1, [%rd1];
177; CHECK-NEXT:    ld.param.u64 %rd2, [test_ldst_v2bf16_param_1];
178; CHECK-NEXT:    st.b32 [%rd2], %r1;
179; CHECK-NEXT:    ret;
180  %t1 = load <2 x bfloat>, ptr %a
181  store <2 x bfloat> %t1, ptr %b, align 16
182  ret void
183}
184
; Copies one <3 x bfloat> from the address in %a to the address in %b.
; Both run configurations expect a single 64-bit load, after which the value
; is written back in two pieces: a 32-bit store at offset 0 and a 16-bit
; store at offset 4. The upper 32 bits of the loaded value are split out via
; a scoped temporary to obtain the 16-bit element for the second store.
185define void @test_ldst_v3bf16(ptr %a, ptr %b) {
186; CHECK-LABEL: test_ldst_v3bf16(
187; CHECK:       {
188; CHECK-NEXT:    .reg .b16 %rs<2>;
189; CHECK-NEXT:    .reg .b32 %r<2>;
190; CHECK-NEXT:    .reg .b64 %rd<4>;
191; CHECK-EMPTY:
192; CHECK-NEXT:  // %bb.0:
193; CHECK-NEXT:    ld.param.u64 %rd1, [test_ldst_v3bf16_param_0];
194; CHECK-NEXT:    ld.u64 %rd2, [%rd1];
195; CHECK-NEXT:    { .reg .b32 tmp; mov.b64 {tmp, %r1}, %rd2; }
196; CHECK-NEXT:    ld.param.u64 %rd3, [test_ldst_v3bf16_param_1];
197; CHECK-NEXT:    st.u32 [%rd3], %rd2;
198; CHECK-NEXT:    { .reg .b16 tmp; mov.b32 {%rs1, tmp}, %r1; }
199; CHECK-NEXT:    st.b16 [%rd3+4], %rs1;
200; CHECK-NEXT:    ret;
201  %t1 = load <3 x bfloat>, ptr %a
202  store <3 x bfloat> %t1, ptr %b, align 16
203  ret void
204}
205
; External function taking and returning <2 x bfloat>; it has no body here and
; exists only so that test_call has something to call.
206declare <2 x bfloat> @test_callee(<2 x bfloat> %a, <2 x bfloat> %b) #0
207
; Forwards %a and %b unchanged to test_callee and returns its result.
; Both run configurations expect each <2 x bfloat> argument to travel as one
; 32-bit value through a 4-byte, 4-aligned .param slot (param0, param1), a
; call.uni to test_callee, and the result to be read back from a 4-byte
; retval0 slot before being stored to the function's own return slot.
208define <2 x bfloat> @test_call(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
209; CHECK-LABEL: test_call(
210; CHECK:       {
211; CHECK-NEXT:    .reg .b32 %r<5>;
212; CHECK-EMPTY:
213; CHECK-NEXT:  // %bb.0:
214; CHECK-NEXT:    ld.param.b32 %r1, [test_call_param_0];
215; CHECK-NEXT:    ld.param.b32 %r2, [test_call_param_1];
216; CHECK-NEXT:    { // callseq 0, 0
217; CHECK-NEXT:    .param .align 4 .b8 param0[4];
218; CHECK-NEXT:    st.param.b32 [param0], %r1;
219; CHECK-NEXT:    .param .align 4 .b8 param1[4];
220; CHECK-NEXT:    st.param.b32 [param1], %r2;
221; CHECK-NEXT:    .param .align 4 .b8 retval0[4];
222; CHECK-NEXT:    call.uni (retval0),
223; CHECK-NEXT:    test_callee,
224; CHECK-NEXT:    (
225; CHECK-NEXT:    param0,
226; CHECK-NEXT:    param1
227; CHECK-NEXT:    );
228; CHECK-NEXT:    ld.param.b32 %r3, [retval0];
229; CHECK-NEXT:    } // callseq 0
230; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
231; CHECK-NEXT:    ret;
232  %r = call <2 x bfloat> @test_callee(<2 x bfloat> %a, <2 x bfloat> %b)
233  ret <2 x bfloat> %r
234}
235
; Selects the whole vector %a or %b based on a single i1 condition %c.
; Both run configurations expect the condition to be loaded as a byte, masked
; with 1 and compared against 1 to form a predicate, and then one selp.b32
; that picks the entire packed 32-bit value (no per-lane work).
236define <2 x bfloat> @test_select(<2 x bfloat> %a, <2 x bfloat> %b, i1 zeroext %c) #0 {
237; CHECK-LABEL: test_select(
238; CHECK:       {
239; CHECK-NEXT:    .reg .pred %p<2>;
240; CHECK-NEXT:    .reg .b16 %rs<3>;
241; CHECK-NEXT:    .reg .b32 %r<4>;
242; CHECK-EMPTY:
243; CHECK-NEXT:  // %bb.0:
244; CHECK-NEXT:    ld.param.u8 %rs1, [test_select_param_2];
245; CHECK-NEXT:    and.b16 %rs2, %rs1, 1;
246; CHECK-NEXT:    setp.eq.b16 %p1, %rs2, 1;
247; CHECK-NEXT:    ld.param.b32 %r1, [test_select_param_1];
248; CHECK-NEXT:    ld.param.b32 %r2, [test_select_param_0];
249; CHECK-NEXT:    selp.b32 %r3, %r2, %r1, %p1;
250; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
251; CHECK-NEXT:    ret;
252  %r = select i1 %c, <2 x bfloat> %a, <2 x bfloat> %b
253  ret <2 x bfloat> %r
254}
255
; Per-lane select between %a and %b, driven by an "une" comparison of %c and
; %d (all four operands are <2 x bfloat>).
; With -mcpu=sm_80 the comparison is expected to be done in f32: each half of
; %c and %d is widened with cvt.f32.bf16 and compared with setp.neu.f32.
; With -mcpu=sm_90 a single setp.neu.bf16x2 producing both predicates is
; expected.
; In both cases the result is then assembled from two selp.b16 instructions,
; one per lane, and repacked into a 32-bit register.
256define <2 x bfloat> @test_select_cc(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c, <2 x bfloat> %d) #0 {
257; SM80-LABEL: test_select_cc(
258; SM80:       {
259; SM80-NEXT:    .reg .pred %p<3>;
260; SM80-NEXT:    .reg .b16 %rs<11>;
261; SM80-NEXT:    .reg .b32 %r<6>;
262; SM80-NEXT:    .reg .f32 %f<5>;
263; SM80-EMPTY:
264; SM80-NEXT:  // %bb.0:
265; SM80-NEXT:    ld.param.b32 %r1, [test_select_cc_param_0];
266; SM80-NEXT:    ld.param.b32 %r2, [test_select_cc_param_1];
267; SM80-NEXT:    ld.param.b32 %r3, [test_select_cc_param_2];
268; SM80-NEXT:    ld.param.b32 %r4, [test_select_cc_param_3];
269; SM80-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
270; SM80-NEXT:    cvt.f32.bf16 %f1, %rs1;
271; SM80-NEXT:    mov.b32 {%rs3, %rs4}, %r3;
272; SM80-NEXT:    cvt.f32.bf16 %f2, %rs3;
273; SM80-NEXT:    setp.neu.f32 %p1, %f2, %f1;
274; SM80-NEXT:    cvt.f32.bf16 %f3, %rs2;
275; SM80-NEXT:    cvt.f32.bf16 %f4, %rs4;
276; SM80-NEXT:    setp.neu.f32 %p2, %f4, %f3;
277; SM80-NEXT:    mov.b32 {%rs5, %rs6}, %r2;
278; SM80-NEXT:    mov.b32 {%rs7, %rs8}, %r1;
279; SM80-NEXT:    selp.b16 %rs9, %rs8, %rs6, %p2;
280; SM80-NEXT:    selp.b16 %rs10, %rs7, %rs5, %p1;
281; SM80-NEXT:    mov.b32 %r5, {%rs10, %rs9};
282; SM80-NEXT:    st.param.b32 [func_retval0], %r5;
283; SM80-NEXT:    ret;
284;
285; SM90-LABEL: test_select_cc(
286; SM90:       {
287; SM90-NEXT:    .reg .pred %p<3>;
288; SM90-NEXT:    .reg .b16 %rs<7>;
289; SM90-NEXT:    .reg .b32 %r<6>;
290; SM90-EMPTY:
291; SM90-NEXT:  // %bb.0:
292; SM90-NEXT:    ld.param.b32 %r1, [test_select_cc_param_0];
293; SM90-NEXT:    ld.param.b32 %r2, [test_select_cc_param_1];
294; SM90-NEXT:    ld.param.b32 %r3, [test_select_cc_param_3];
295; SM90-NEXT:    ld.param.b32 %r4, [test_select_cc_param_2];
296; SM90-NEXT:    setp.neu.bf16x2 %p1|%p2, %r4, %r3;
297; SM90-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
298; SM90-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
299; SM90-NEXT:    selp.b16 %rs5, %rs4, %rs2, %p2;
300; SM90-NEXT:    selp.b16 %rs6, %rs3, %rs1, %p1;
301; SM90-NEXT:    mov.b32 %r5, {%rs6, %rs5};
302; SM90-NEXT:    st.param.b32 [func_retval0], %r5;
303; SM90-NEXT:    ret;
304  %cc = fcmp une <2 x bfloat> %c, %d
305  %r = select <2 x i1> %cc, <2 x bfloat> %a, <2 x bfloat> %b
306  ret <2 x bfloat> %r
307}
308
; Mixed-type select: the condition comes from an "une" comparison of two
; <2 x bfloat> vectors (%c, %d), while the selected values %a and %b are
; <2 x float>.
; Layout note: the parameter list is split over two source lines, and the
; generated expectation comments sit between them; the signature resumes on
; the line that ends with the opening brace.
; With -mcpu=sm_80 the bfloat halves are expected to be widened to f32 and
; compared with setp.neu.f32; with -mcpu=sm_90 one setp.neu.bf16x2 is expected.
; Either way the float lanes are chosen with two selp.f32 instructions.
309define <2 x float> @test_select_cc_f32_bf16(<2 x float> %a, <2 x float> %b,
310; SM80-LABEL: test_select_cc_f32_bf16(
311; SM80:       {
312; SM80-NEXT:    .reg .pred %p<3>;
313; SM80-NEXT:    .reg .b16 %rs<5>;
314; SM80-NEXT:    .reg .b32 %r<3>;
315; SM80-NEXT:    .reg .f32 %f<11>;
316; SM80-EMPTY:
317; SM80-NEXT:  // %bb.0:
318; SM80-NEXT:    ld.param.v2.f32 {%f1, %f2}, [test_select_cc_f32_bf16_param_0];
319; SM80-NEXT:    ld.param.b32 %r1, [test_select_cc_f32_bf16_param_2];
320; SM80-NEXT:    ld.param.b32 %r2, [test_select_cc_f32_bf16_param_3];
321; SM80-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
322; SM80-NEXT:    cvt.f32.bf16 %f3, %rs1;
323; SM80-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
324; SM80-NEXT:    cvt.f32.bf16 %f4, %rs3;
325; SM80-NEXT:    setp.neu.f32 %p1, %f4, %f3;
326; SM80-NEXT:    cvt.f32.bf16 %f5, %rs2;
327; SM80-NEXT:    cvt.f32.bf16 %f6, %rs4;
328; SM80-NEXT:    setp.neu.f32 %p2, %f6, %f5;
329; SM80-NEXT:    ld.param.v2.f32 {%f7, %f8}, [test_select_cc_f32_bf16_param_1];
330; SM80-NEXT:    selp.f32 %f9, %f2, %f8, %p2;
331; SM80-NEXT:    selp.f32 %f10, %f1, %f7, %p1;
332; SM80-NEXT:    st.param.v2.f32 [func_retval0], {%f10, %f9};
333; SM80-NEXT:    ret;
334;
335; SM90-LABEL: test_select_cc_f32_bf16(
336; SM90:       {
337; SM90-NEXT:    .reg .pred %p<3>;
338; SM90-NEXT:    .reg .b32 %r<3>;
339; SM90-NEXT:    .reg .f32 %f<7>;
340; SM90-EMPTY:
341; SM90-NEXT:  // %bb.0:
342; SM90-NEXT:    ld.param.v2.f32 {%f1, %f2}, [test_select_cc_f32_bf16_param_0];
343; SM90-NEXT:    ld.param.b32 %r1, [test_select_cc_f32_bf16_param_3];
344; SM90-NEXT:    ld.param.b32 %r2, [test_select_cc_f32_bf16_param_2];
345; SM90-NEXT:    setp.neu.bf16x2 %p1|%p2, %r2, %r1;
346; SM90-NEXT:    ld.param.v2.f32 {%f3, %f4}, [test_select_cc_f32_bf16_param_1];
347; SM90-NEXT:    selp.f32 %f5, %f2, %f4, %p2;
348; SM90-NEXT:    selp.f32 %f6, %f1, %f3, %p1;
349; SM90-NEXT:    st.param.v2.f32 [func_retval0], {%f6, %f5};
350; SM90-NEXT:    ret;
351                                           <2 x bfloat> %c, <2 x bfloat> %d) #0 {
352  %cc = fcmp une <2 x bfloat> %c, %d
353  %r = select <2 x i1> %cc, <2 x float> %a, <2 x float> %b
354  ret <2 x float> %r
355}
356
; Mirror image of the previous mixed-type select: the condition comes from an
; "une" comparison of two <2 x float> vectors (%c, %d), while the selected
; values %a and %b are <2 x bfloat>.
; Layout note: the parameter list is split over two source lines, and the
; generated expectation comments sit between them.
; Both run configurations share one expectation: two setp.neu.f32 compares on
; the float lanes, then the bfloat halves are unpacked, chosen with two
; selp.b16 instructions and repacked into one 32-bit register.
357define <2 x bfloat> @test_select_cc_bf16_f32(<2 x bfloat> %a, <2 x bfloat> %b,
358; CHECK-LABEL: test_select_cc_bf16_f32(
359; CHECK:       {
360; CHECK-NEXT:    .reg .pred %p<3>;
361; CHECK-NEXT:    .reg .b16 %rs<7>;
362; CHECK-NEXT:    .reg .b32 %r<4>;
363; CHECK-NEXT:    .reg .f32 %f<5>;
364; CHECK-EMPTY:
365; CHECK-NEXT:  // %bb.0:
366; CHECK-NEXT:    ld.param.b32 %r1, [test_select_cc_bf16_f32_param_0];
367; CHECK-NEXT:    ld.param.b32 %r2, [test_select_cc_bf16_f32_param_1];
368; CHECK-NEXT:    ld.param.v2.f32 {%f1, %f2}, [test_select_cc_bf16_f32_param_2];
369; CHECK-NEXT:    ld.param.v2.f32 {%f3, %f4}, [test_select_cc_bf16_f32_param_3];
370; CHECK-NEXT:    setp.neu.f32 %p1, %f1, %f3;
371; CHECK-NEXT:    setp.neu.f32 %p2, %f2, %f4;
372; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
373; CHECK-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
374; CHECK-NEXT:    selp.b16 %rs5, %rs4, %rs2, %p2;
375; CHECK-NEXT:    selp.b16 %rs6, %rs3, %rs1, %p1;
376; CHECK-NEXT:    mov.b32 %r3, {%rs6, %rs5};
377; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
378; CHECK-NEXT:    ret;
379                                          <2 x float> %c, <2 x float> %d) #0 {
380  %cc = fcmp une <2 x float> %c, %d
381  %r = select <2 x i1> %cc, <2 x bfloat> %a, <2 x bfloat> %b
382  ret <2 x bfloat> %r
383}
384
; Narrows <2 x float> to <2 x bfloat>.
; Both run configurations expect a single packed conversion,
; cvt.rn.bf16x2.f32, whose source operands are listed second lane first
; (%f2, %f1).
385define <2 x bfloat> @test_fptrunc_2xfloat(<2 x float> %a) #0 {
386; CHECK-LABEL: test_fptrunc_2xfloat(
387; CHECK:       {
388; CHECK-NEXT:    .reg .b32 %r<2>;
389; CHECK-NEXT:    .reg .f32 %f<3>;
390; CHECK-EMPTY:
391; CHECK-NEXT:  // %bb.0:
392; CHECK-NEXT:    ld.param.v2.f32 {%f1, %f2}, [test_fptrunc_2xfloat_param_0];
393; CHECK-NEXT:    cvt.rn.bf16x2.f32 %r1, %f2, %f1;
394; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
395; CHECK-NEXT:    ret;
396  %r = fptrunc <2 x float> %a to <2 x bfloat>
397  ret <2 x bfloat> %r
398}
399
; Widens <2 x bfloat> to <2 x float>.
; Both run configurations expect the packed value to be split into two 16-bit
; halves, each converted separately with cvt.f32.bf16, and the two floats
; stored as a v2.f32 result.
400define <2 x float> @test_fpext_2xfloat(<2 x bfloat> %a) #0 {
401; CHECK-LABEL: test_fpext_2xfloat(
402; CHECK:       {
403; CHECK-NEXT:    .reg .b16 %rs<3>;
404; CHECK-NEXT:    .reg .b32 %r<2>;
405; CHECK-NEXT:    .reg .f32 %f<3>;
406; CHECK-EMPTY:
407; CHECK-NEXT:  // %bb.0:
408; CHECK-NEXT:    ld.param.b32 %r1, [test_fpext_2xfloat_param_0];
409; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
410; CHECK-NEXT:    cvt.f32.bf16 %f1, %rs2;
411; CHECK-NEXT:    cvt.f32.bf16 %f2, %rs1;
412; CHECK-NEXT:    st.param.v2.f32 [func_retval0], {%f2, %f1};
413; CHECK-NEXT:    ret;
414  %r = fpext <2 x bfloat> %a to <2 x float>
415  ret <2 x float> %r
416}
417
; Reinterprets <2 x bfloat> as <2 x i16>.
; Both run configurations expect no conversion instruction at all: the 32-bit
; parameter is loaded and stored straight to the return slot.
418define <2 x i16> @test_bitcast_2xbf16_to_2xi16(<2 x bfloat> %a) #0 {
419; CHECK-LABEL: test_bitcast_2xbf16_to_2xi16(
420; CHECK:       {
421; CHECK-NEXT:    .reg .b32 %r<2>;
422; CHECK-EMPTY:
423; CHECK-NEXT:  // %bb.0:
424; CHECK-NEXT:    ld.param.u32 %r1, [test_bitcast_2xbf16_to_2xi16_param_0];
425; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
426; CHECK-NEXT:    ret;
427  %r = bitcast <2 x bfloat> %a to <2 x i16>
428  ret <2 x i16> %r
429}
430
; Reinterprets <2 x i16> as <2 x bfloat> (the reverse of the bitcast test
; above it in this file).
; Both run configurations expect no conversion instruction: the 32-bit
; parameter is loaded and stored straight to the return slot.
431define <2 x bfloat> @test_bitcast_2xi16_to_2xbf16(<2 x i16> %a) #0 {
432; CHECK-LABEL: test_bitcast_2xi16_to_2xbf16(
433; CHECK:       {
434; CHECK-NEXT:    .reg .b32 %r<2>;
435; CHECK-EMPTY:
436; CHECK-NEXT:  // %bb.0:
437; CHECK-NEXT:    ld.param.b32 %r1, [test_bitcast_2xi16_to_2xbf16_param_0];
438; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
439; CHECK-NEXT:    ret;
440  %r = bitcast <2 x i16> %a to <2 x bfloat>
441  ret <2 x bfloat> %r
442}
443
; Declarations of the math intrinsics exercised by the tests that follow,
; all operating on <2 x bfloat>.
; Of these, the functions visible in this part of the file call sqrt, fmuladd,
; fabs, minnum, maxnum, floor, ceil, trunc, rint, round and copysign; the
; remaining declarations (powi, sin, cos, pow, exp, exp2, log, log10, log2,
; fma, nearbyint) have no caller in the visible portion.
; NOTE(review): every name ends in ".f16" although the overloaded type is
; <2 x bfloat>. Presumably the IR reader remangles such names when the module
; is loaded - confirm before relying on the spelled names, and consider
; renaming declarations and call sites together to the type-accurate suffix.
444declare <2 x bfloat> @llvm.sqrt.f16(<2 x bfloat> %a) #0
445declare <2 x bfloat> @llvm.powi.f16(<2 x bfloat> %a, <2 x i32> %b) #0
446declare <2 x bfloat> @llvm.sin.f16(<2 x bfloat> %a) #0
447declare <2 x bfloat> @llvm.cos.f16(<2 x bfloat> %a) #0
448declare <2 x bfloat> @llvm.pow.f16(<2 x bfloat> %a, <2 x bfloat> %b) #0
449declare <2 x bfloat> @llvm.exp.f16(<2 x bfloat> %a) #0
450declare <2 x bfloat> @llvm.exp2.f16(<2 x bfloat> %a) #0
451declare <2 x bfloat> @llvm.log.f16(<2 x bfloat> %a) #0
452declare <2 x bfloat> @llvm.log10.f16(<2 x bfloat> %a) #0
453declare <2 x bfloat> @llvm.log2.f16(<2 x bfloat> %a) #0
454declare <2 x bfloat> @llvm.fma.f16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) #0
455declare <2 x bfloat> @llvm.fabs.f16(<2 x bfloat> %a) #0
456declare <2 x bfloat> @llvm.minnum.f16(<2 x bfloat> %a, <2 x bfloat> %b) #0
457declare <2 x bfloat> @llvm.maxnum.f16(<2 x bfloat> %a, <2 x bfloat> %b) #0
458declare <2 x bfloat> @llvm.copysign.f16(<2 x bfloat> %a, <2 x bfloat> %b) #0
459declare <2 x bfloat> @llvm.floor.f16(<2 x bfloat> %a) #0
460declare <2 x bfloat> @llvm.ceil.f16(<2 x bfloat> %a) #0
461declare <2 x bfloat> @llvm.trunc.f16(<2 x bfloat> %a) #0
462declare <2 x bfloat> @llvm.rint.f16(<2 x bfloat> %a) #0
463declare <2 x bfloat> @llvm.nearbyint.f16(<2 x bfloat> %a) #0
464declare <2 x bfloat> @llvm.round.f16(<2 x bfloat> %a) #0
465declare <2 x bfloat> @llvm.fmuladd.f16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) #0
466
; Packed square root via the sqrt intrinsic.
; Both run configurations expect the work to happen in f32: the value is
; unpacked, each half widened with cvt.f32.bf16, sqrt.rn.f32 applied per lane,
; and the two results repacked with one cvt.rn.bf16x2.f32.
467define <2 x bfloat> @test_sqrt(<2 x bfloat> %a) #0 {
468; CHECK-LABEL: test_sqrt(
469; CHECK:       {
470; CHECK-NEXT:    .reg .b16 %rs<3>;
471; CHECK-NEXT:    .reg .b32 %r<3>;
472; CHECK-NEXT:    .reg .f32 %f<5>;
473; CHECK-EMPTY:
474; CHECK-NEXT:  // %bb.0:
475; CHECK-NEXT:    ld.param.b32 %r1, [test_sqrt_param_0];
476; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
477; CHECK-NEXT:    cvt.f32.bf16 %f1, %rs1;
478; CHECK-NEXT:    sqrt.rn.f32 %f2, %f1;
479; CHECK-NEXT:    cvt.f32.bf16 %f3, %rs2;
480; CHECK-NEXT:    sqrt.rn.f32 %f4, %f3;
481; CHECK-NEXT:    cvt.rn.bf16x2.f32 %r2, %f4, %f2;
482; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
483; CHECK-NEXT:    ret;
484  %r = call <2 x bfloat> @llvm.sqrt.f16(<2 x bfloat> %a)
485  ret <2 x bfloat> %r
486}
487
; Packed multiply-add (%a * %b + %c) via the fmuladd intrinsic.
; Both run configurations expect a single fma.rn.bf16x2 on the three packed
; operands, with no unpacking.
488define <2 x bfloat> @test_fmuladd(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) #0 {
489; CHECK-LABEL: test_fmuladd(
490; CHECK:       {
491; CHECK-NEXT:    .reg .b32 %r<5>;
492; CHECK-EMPTY:
493; CHECK-NEXT:  // %bb.0:
494; CHECK-NEXT:    ld.param.b32 %r1, [test_fmuladd_param_2];
495; CHECK-NEXT:    ld.param.b32 %r2, [test_fmuladd_param_1];
496; CHECK-NEXT:    ld.param.b32 %r3, [test_fmuladd_param_0];
497; CHECK-NEXT:    fma.rn.bf16x2 %r4, %r3, %r2, %r1;
498; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
499; CHECK-NEXT:    ret;
500  %r = call <2 x bfloat> @llvm.fmuladd.f16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
501  ret <2 x bfloat> %r
502}
503
; Packed absolute value of a function argument.
; Both run configurations expect a pure integer operation: the 32-bit value is
; ANDed with 2147450879 (0x7FFF7FFF), clearing bit 15 of each 16-bit half.
504define <2 x bfloat> @test_fabs(<2 x bfloat> %a) #0 {
505; CHECK-LABEL: test_fabs(
506; CHECK:       {
507; CHECK-NEXT:    .reg .b32 %r<3>;
508; CHECK-EMPTY:
509; CHECK-NEXT:  // %bb.0:
510; CHECK-NEXT:    ld.param.u32 %r1, [test_fabs_param_0];
511; CHECK-NEXT:    and.b32 %r2, %r1, 2147450879;
512; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
513; CHECK-NEXT:    ret;
514  %r = call <2 x bfloat> @llvm.fabs.f16(<2 x bfloat> %a)
515  ret <2 x bfloat> %r
516}
517
; Absolute value sandwiched between two packed additions:
; fabs(%a + %a) + %b.
; Unlike the plain fabs test on a function argument, here both run
; configurations expect the abs.bf16x2 instruction rather than a bit mask.
; With -mcpu=sm_80 each addition is expected as fma.rn.bf16x2 with the
; multiplier 1065369472 (0x3F803F80, i.e. 0x3F80 in both halves);
; with -mcpu=sm_90 each addition is expected as add.rn.bf16x2.
518define <2 x bfloat> @test_fabs_add(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
519; SM80-LABEL: test_fabs_add(
520; SM80:       {
521; SM80-NEXT:    .reg .b32 %r<7>;
522; SM80-EMPTY:
523; SM80-NEXT:  // %bb.0:
524; SM80-NEXT:    ld.param.b32 %r1, [test_fabs_add_param_1];
525; SM80-NEXT:    ld.param.b32 %r2, [test_fabs_add_param_0];
526; SM80-NEXT:    mov.b32 %r3, 1065369472;
527; SM80-NEXT:    fma.rn.bf16x2 %r4, %r2, %r3, %r2;
528; SM80-NEXT:    abs.bf16x2 %r5, %r4;
529; SM80-NEXT:    fma.rn.bf16x2 %r6, %r5, %r3, %r1;
530; SM80-NEXT:    st.param.b32 [func_retval0], %r6;
531; SM80-NEXT:    ret;
532;
533; SM90-LABEL: test_fabs_add(
534; SM90:       {
535; SM90-NEXT:    .reg .b32 %r<6>;
536; SM90-EMPTY:
537; SM90-NEXT:  // %bb.0:
538; SM90-NEXT:    ld.param.b32 %r1, [test_fabs_add_param_1];
539; SM90-NEXT:    ld.param.b32 %r2, [test_fabs_add_param_0];
540; SM90-NEXT:    add.rn.bf16x2 %r3, %r2, %r2;
541; SM90-NEXT:    abs.bf16x2 %r4, %r3;
542; SM90-NEXT:    add.rn.bf16x2 %r5, %r4, %r1;
543; SM90-NEXT:    st.param.b32 [func_retval0], %r5;
544; SM90-NEXT:    ret;
545  %s = fadd <2 x bfloat> %a, %a
546  %r = call <2 x bfloat> @llvm.fabs.f16(<2 x bfloat> %s)
547  %d = fadd <2 x bfloat> %r, %b
548  ret <2 x bfloat> %d
549}
550
; Packed minimum via the minnum intrinsic.
; Both run configurations expect a single min.bf16x2 on the packed operands.
551define <2 x bfloat> @test_minnum(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
552; CHECK-LABEL: test_minnum(
553; CHECK:       {
554; CHECK-NEXT:    .reg .b32 %r<4>;
555; CHECK-EMPTY:
556; CHECK-NEXT:  // %bb.0:
557; CHECK-NEXT:    ld.param.b32 %r1, [test_minnum_param_1];
558; CHECK-NEXT:    ld.param.b32 %r2, [test_minnum_param_0];
559; CHECK-NEXT:    min.bf16x2 %r3, %r2, %r1;
560; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
561; CHECK-NEXT:    ret;
562  %r = call <2 x bfloat> @llvm.minnum.f16(<2 x bfloat> %a, <2 x bfloat> %b)
563  ret <2 x bfloat> %r
564}
565
; Packed maximum via the maxnum intrinsic.
; Both run configurations expect a single max.bf16x2 on the packed operands.
566define <2 x bfloat> @test_maxnum(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
567; CHECK-LABEL: test_maxnum(
568; CHECK:       {
569; CHECK-NEXT:    .reg .b32 %r<4>;
570; CHECK-EMPTY:
571; CHECK-NEXT:  // %bb.0:
572; CHECK-NEXT:    ld.param.b32 %r1, [test_maxnum_param_1];
573; CHECK-NEXT:    ld.param.b32 %r2, [test_maxnum_param_0];
574; CHECK-NEXT:    max.bf16x2 %r3, %r2, %r1;
575; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
576; CHECK-NEXT:    ret;
577  %r = call <2 x bfloat> @llvm.maxnum.f16(<2 x bfloat> %a, <2 x bfloat> %b)
578  ret <2 x bfloat> %r
579}
580
; Packed floor via the floor intrinsic.
; With -mcpu=sm_80 the rounding is expected in f32: each half is widened with
; cvt.f32.bf16, rounded with cvt.rmi.f32.f32, and the pair repacked with
; cvt.rn.bf16x2.f32.
; With -mcpu=sm_90 each half is expected to be rounded in place with
; cvt.rmi.bf16.bf16, and the two halves moved back into one 32-bit register.
581define <2 x bfloat> @test_floor(<2 x bfloat> %a) #0 {
582; SM80-LABEL: test_floor(
583; SM80:       {
584; SM80-NEXT:    .reg .b16 %rs<3>;
585; SM80-NEXT:    .reg .b32 %r<3>;
586; SM80-NEXT:    .reg .f32 %f<5>;
587; SM80-EMPTY:
588; SM80-NEXT:  // %bb.0:
589; SM80-NEXT:    ld.param.b32 %r1, [test_floor_param_0];
590; SM80-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
591; SM80-NEXT:    cvt.f32.bf16 %f1, %rs1;
592; SM80-NEXT:    cvt.rmi.f32.f32 %f2, %f1;
593; SM80-NEXT:    cvt.f32.bf16 %f3, %rs2;
594; SM80-NEXT:    cvt.rmi.f32.f32 %f4, %f3;
595; SM80-NEXT:    cvt.rn.bf16x2.f32 %r2, %f4, %f2;
596; SM80-NEXT:    st.param.b32 [func_retval0], %r2;
597; SM80-NEXT:    ret;
598;
599; SM90-LABEL: test_floor(
600; SM90:       {
601; SM90-NEXT:    .reg .b16 %rs<5>;
602; SM90-NEXT:    .reg .b32 %r<3>;
603; SM90-EMPTY:
604; SM90-NEXT:  // %bb.0:
605; SM90-NEXT:    ld.param.b32 %r1, [test_floor_param_0];
606; SM90-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
607; SM90-NEXT:    cvt.rmi.bf16.bf16 %rs3, %rs2;
608; SM90-NEXT:    cvt.rmi.bf16.bf16 %rs4, %rs1;
609; SM90-NEXT:    mov.b32 %r2, {%rs4, %rs3};
610; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
611; SM90-NEXT:    ret;
612  %r = call <2 x bfloat> @llvm.floor.f16(<2 x bfloat> %a)
613  ret <2 x bfloat> %r
614}
615
; Packed ceiling via the ceil intrinsic.
; With -mcpu=sm_80 the rounding is expected in f32: each half is widened with
; cvt.f32.bf16, rounded with cvt.rpi.f32.f32, and the pair repacked with
; cvt.rn.bf16x2.f32.
; With -mcpu=sm_90 each half is expected to be rounded in place with
; cvt.rpi.bf16.bf16, and the two halves moved back into one 32-bit register.
616define <2 x bfloat> @test_ceil(<2 x bfloat> %a) #0 {
617; SM80-LABEL: test_ceil(
618; SM80:       {
619; SM80-NEXT:    .reg .b16 %rs<3>;
620; SM80-NEXT:    .reg .b32 %r<3>;
621; SM80-NEXT:    .reg .f32 %f<5>;
622; SM80-EMPTY:
623; SM80-NEXT:  // %bb.0:
624; SM80-NEXT:    ld.param.b32 %r1, [test_ceil_param_0];
625; SM80-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
626; SM80-NEXT:    cvt.f32.bf16 %f1, %rs1;
627; SM80-NEXT:    cvt.rpi.f32.f32 %f2, %f1;
628; SM80-NEXT:    cvt.f32.bf16 %f3, %rs2;
629; SM80-NEXT:    cvt.rpi.f32.f32 %f4, %f3;
630; SM80-NEXT:    cvt.rn.bf16x2.f32 %r2, %f4, %f2;
631; SM80-NEXT:    st.param.b32 [func_retval0], %r2;
632; SM80-NEXT:    ret;
633;
634; SM90-LABEL: test_ceil(
635; SM90:       {
636; SM90-NEXT:    .reg .b16 %rs<5>;
637; SM90-NEXT:    .reg .b32 %r<3>;
638; SM90-EMPTY:
639; SM90-NEXT:  // %bb.0:
640; SM90-NEXT:    ld.param.b32 %r1, [test_ceil_param_0];
641; SM90-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
642; SM90-NEXT:    cvt.rpi.bf16.bf16 %rs3, %rs2;
643; SM90-NEXT:    cvt.rpi.bf16.bf16 %rs4, %rs1;
644; SM90-NEXT:    mov.b32 %r2, {%rs4, %rs3};
645; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
646; SM90-NEXT:    ret;
647  %r = call <2 x bfloat> @llvm.ceil.f16(<2 x bfloat> %a)
648  ret <2 x bfloat> %r
649}
650
; Packed round-toward-zero via the trunc intrinsic.
; With -mcpu=sm_80 the rounding is expected in f32: each half is widened with
; cvt.f32.bf16, rounded with cvt.rzi.f32.f32, and the pair repacked with
; cvt.rn.bf16x2.f32.
; With -mcpu=sm_90 each half is expected to be rounded in place with
; cvt.rzi.bf16.bf16, and the two halves moved back into one 32-bit register.
651define <2 x bfloat> @test_trunc(<2 x bfloat> %a) #0 {
652; SM80-LABEL: test_trunc(
653; SM80:       {
654; SM80-NEXT:    .reg .b16 %rs<3>;
655; SM80-NEXT:    .reg .b32 %r<3>;
656; SM80-NEXT:    .reg .f32 %f<5>;
657; SM80-EMPTY:
658; SM80-NEXT:  // %bb.0:
659; SM80-NEXT:    ld.param.b32 %r1, [test_trunc_param_0];
660; SM80-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
661; SM80-NEXT:    cvt.f32.bf16 %f1, %rs1;
662; SM80-NEXT:    cvt.rzi.f32.f32 %f2, %f1;
663; SM80-NEXT:    cvt.f32.bf16 %f3, %rs2;
664; SM80-NEXT:    cvt.rzi.f32.f32 %f4, %f3;
665; SM80-NEXT:    cvt.rn.bf16x2.f32 %r2, %f4, %f2;
666; SM80-NEXT:    st.param.b32 [func_retval0], %r2;
667; SM80-NEXT:    ret;
668;
669; SM90-LABEL: test_trunc(
670; SM90:       {
671; SM90-NEXT:    .reg .b16 %rs<5>;
672; SM90-NEXT:    .reg .b32 %r<3>;
673; SM90-EMPTY:
674; SM90-NEXT:  // %bb.0:
675; SM90-NEXT:    ld.param.b32 %r1, [test_trunc_param_0];
676; SM90-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
677; SM90-NEXT:    cvt.rzi.bf16.bf16 %rs3, %rs2;
678; SM90-NEXT:    cvt.rzi.bf16.bf16 %rs4, %rs1;
679; SM90-NEXT:    mov.b32 %r2, {%rs4, %rs3};
680; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
681; SM90-NEXT:    ret;
682  %r = call <2 x bfloat> @llvm.trunc.f16(<2 x bfloat> %a)
683  ret <2 x bfloat> %r
684}
685
; Packed round-to-nearest-integer via the rint intrinsic.
; With -mcpu=sm_80 the rounding is expected in f32: each half is widened with
; cvt.f32.bf16, rounded with cvt.rni.f32.f32, and the pair repacked with
; cvt.rn.bf16x2.f32.
; With -mcpu=sm_90 each half is expected to be rounded in place with
; cvt.rni.bf16.bf16, and the two halves moved back into one 32-bit register.
686define <2 x bfloat> @test_rint(<2 x bfloat> %a) #0 {
687; SM80-LABEL: test_rint(
688; SM80:       {
689; SM80-NEXT:    .reg .b16 %rs<3>;
690; SM80-NEXT:    .reg .b32 %r<3>;
691; SM80-NEXT:    .reg .f32 %f<5>;
692; SM80-EMPTY:
693; SM80-NEXT:  // %bb.0:
694; SM80-NEXT:    ld.param.b32 %r1, [test_rint_param_0];
695; SM80-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
696; SM80-NEXT:    cvt.f32.bf16 %f1, %rs1;
697; SM80-NEXT:    cvt.rni.f32.f32 %f2, %f1;
698; SM80-NEXT:    cvt.f32.bf16 %f3, %rs2;
699; SM80-NEXT:    cvt.rni.f32.f32 %f4, %f3;
700; SM80-NEXT:    cvt.rn.bf16x2.f32 %r2, %f4, %f2;
701; SM80-NEXT:    st.param.b32 [func_retval0], %r2;
702; SM80-NEXT:    ret;
703;
704; SM90-LABEL: test_rint(
705; SM90:       {
706; SM90-NEXT:    .reg .b16 %rs<5>;
707; SM90-NEXT:    .reg .b32 %r<3>;
708; SM90-EMPTY:
709; SM90-NEXT:  // %bb.0:
710; SM90-NEXT:    ld.param.b32 %r1, [test_rint_param_0];
711; SM90-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
712; SM90-NEXT:    cvt.rni.bf16.bf16 %rs3, %rs2;
713; SM90-NEXT:    cvt.rni.bf16.bf16 %rs4, %rs1;
714; SM90-NEXT:    mov.b32 %r2, {%rs4, %rs3};
715; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
716; SM90-NEXT:    ret;
717  %r = call <2 x bfloat> @llvm.rint.f16(<2 x bfloat> %a)
718  ret <2 x bfloat> %r
719}
720
; Packed rounding via the round intrinsic.
; Both run configurations expect the same multi-step f32 expansion, emitted
; once per lane after the half is widened with cvt.f32.bf16:
;   1. Build a 0.5 that carries the input's sign: keep only the sign bit
;      (AND with -2147483648 = 0x80000000) and OR in 1056964608 (0x3F000000,
;      the f32 bit pattern of 0.5).
;   2. Add it to the input and truncate with cvt.rzi.f32.f32.
;   3. If |input| > 0f4B000000 (2^23 as an f32 bit pattern) keep the input
;      itself instead of the result of step 2.
;   4. If |input| < 0f3F000000 (0.5) use the plain truncation of the input.
; The two lane results are finally repacked with one cvt.rn.bf16x2.f32.
721define <2 x bfloat> @test_round(<2 x bfloat> %a) #0 {
722; CHECK-LABEL: test_round(
723; CHECK:       {
724; CHECK-NEXT:    .reg .pred %p<5>;
725; CHECK-NEXT:    .reg .b16 %rs<3>;
726; CHECK-NEXT:    .reg .b32 %r<9>;
727; CHECK-NEXT:    .reg .f32 %f<17>;
728; CHECK-EMPTY:
729; CHECK-NEXT:  // %bb.0:
730; CHECK-NEXT:    ld.param.b32 %r1, [test_round_param_0];
731; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
732; CHECK-NEXT:    cvt.f32.bf16 %f1, %rs1;
733; CHECK-NEXT:    mov.b32 %r2, %f1;
734; CHECK-NEXT:    and.b32 %r3, %r2, -2147483648;
735; CHECK-NEXT:    or.b32 %r4, %r3, 1056964608;
736; CHECK-NEXT:    mov.b32 %f2, %r4;
737; CHECK-NEXT:    add.rn.f32 %f3, %f1, %f2;
738; CHECK-NEXT:    cvt.rzi.f32.f32 %f4, %f3;
739; CHECK-NEXT:    abs.f32 %f5, %f1;
740; CHECK-NEXT:    setp.gt.f32 %p1, %f5, 0f4B000000;
741; CHECK-NEXT:    selp.f32 %f6, %f1, %f4, %p1;
742; CHECK-NEXT:    cvt.rzi.f32.f32 %f7, %f1;
743; CHECK-NEXT:    setp.lt.f32 %p2, %f5, 0f3F000000;
744; CHECK-NEXT:    selp.f32 %f8, %f7, %f6, %p2;
745; CHECK-NEXT:    cvt.f32.bf16 %f9, %rs2;
746; CHECK-NEXT:    mov.b32 %r5, %f9;
747; CHECK-NEXT:    and.b32 %r6, %r5, -2147483648;
748; CHECK-NEXT:    or.b32 %r7, %r6, 1056964608;
749; CHECK-NEXT:    mov.b32 %f10, %r7;
750; CHECK-NEXT:    add.rn.f32 %f11, %f9, %f10;
751; CHECK-NEXT:    cvt.rzi.f32.f32 %f12, %f11;
752; CHECK-NEXT:    abs.f32 %f13, %f9;
753; CHECK-NEXT:    setp.gt.f32 %p3, %f13, 0f4B000000;
754; CHECK-NEXT:    selp.f32 %f14, %f9, %f12, %p3;
755; CHECK-NEXT:    cvt.rzi.f32.f32 %f15, %f9;
756; CHECK-NEXT:    setp.lt.f32 %p4, %f13, 0f3F000000;
757; CHECK-NEXT:    selp.f32 %f16, %f15, %f14, %p4;
758; CHECK-NEXT:    cvt.rn.bf16x2.f32 %r8, %f16, %f8;
759; CHECK-NEXT:    st.param.b32 [func_retval0], %r8;
760; CHECK-NEXT:    ret;
761  %r = call <2 x bfloat> @llvm.round.f16(<2 x bfloat> %a)
762  ret <2 x bfloat> %r
763}
764
; Packed copysign: magnitude taken from %a, sign taken from %b.
; Both run configurations expect three integer operations on the 32-bit
; values: %b is masked with -2147450880 (0x80008000) to keep only the two sign
; bits, %a is masked with 2147450879 (0x7FFF7FFF) to drop them, and the two
; results are ORed together.
765define <2 x bfloat> @test_copysign(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
766; CHECK-LABEL: test_copysign(
767; CHECK:       {
768; CHECK-NEXT:    .reg .b32 %r<6>;
769; CHECK-EMPTY:
770; CHECK-NEXT:  // %bb.0:
771; CHECK-NEXT:    ld.param.b32 %r1, [test_copysign_param_0];
772; CHECK-NEXT:    ld.param.b32 %r2, [test_copysign_param_1];
773; CHECK-NEXT:    and.b32 %r3, %r2, -2147450880;
774; CHECK-NEXT:    and.b32 %r4, %r1, 2147450879;
775; CHECK-NEXT:    or.b32 %r5, %r4, %r3;
776; CHECK-NEXT:    st.param.b32 [func_retval0], %r5;
777; CHECK-NEXT:    ret;
778  %r = call <2 x bfloat> @llvm.copysign.f16(<2 x bfloat> %a, <2 x bfloat> %b)
779  ret <2 x bfloat> %r
780}
781
782