; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 | FileCheck --check-prefixes=CHECK,SM80 %s
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | FileCheck --check-prefixes=CHECK,SM90 %s
; RUN: %if ptxas-11.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 | %ptxas-verify -arch=sm_80 %}
; RUN: %if ptxas-11.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %}

; Lowering tests for bfloat and <2 x bfloat> operations on NVPTX.
; Expectations shared by both FileCheck invocations above use the CHECK prefix,
; while the SM80 and SM90 prefixes hold output that differs between
; -mcpu=sm_80 and -mcpu=sm_90.

target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"

; Returning the constant <1.0, 2.0> is expected to materialize as a single
; 32-bit immediate move (1073758080 is 0x40003F80).
define <2 x bfloat> @test_ret_const() #0 {
; CHECK-LABEL: test_ret_const(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    mov.b32 %r1, 1073758080;
; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
; CHECK-NEXT:    ret;
  ret <2 x bfloat> <bfloat 1.0, bfloat 2.0>
}

; Check that we can lower fadd with immediate arguments.
; For sm_80 the expected form is fma.rn.bf16x2 with a second immediate
; (1065369472 is 0x3F803F80) as the multiplier; for sm_90 it is add.rn.bf16x2.
define <2 x bfloat> @test_fadd_imm_0(<2 x bfloat> %a) #0 {
; SM80-LABEL: test_fadd_imm_0(
; SM80:       {
; SM80-NEXT:    .reg .b32 %r<5>;
; SM80-EMPTY:
; SM80-NEXT:  // %bb.0:
; SM80-NEXT:    ld.param.b32 %r1, [test_fadd_imm_0_param_0];
; SM80-NEXT:    mov.b32 %r2, 1065369472;
; SM80-NEXT:    mov.b32 %r3, 1073758080;
; SM80-NEXT:    fma.rn.bf16x2 %r4, %r1, %r2, %r3;
; SM80-NEXT:    st.param.b32 [func_retval0], %r4;
; SM80-NEXT:    ret;
;
; SM90-LABEL: test_fadd_imm_0(
; SM90:       {
; SM90-NEXT:    .reg .b32 %r<4>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b32 %r1, [test_fadd_imm_0_param_0];
; SM90-NEXT:    mov.b32 %r2, 1073758080;
; SM90-NEXT:    add.rn.bf16x2 %r3, %r1, %r2;
; SM90-NEXT:    st.param.b32 [func_retval0], %r3;
; SM90-NEXT:    ret;
  %r = fadd <2 x bfloat> <bfloat 1.0, bfloat 2.0>, %a
  ret <2 x bfloat> %r
}

; Scalar bfloat variant of the immediate fadd; the constant 1.0 appears as
; 0x3F80 in the expected output and, on sm_80, serves as both the fma
; multiplier and the addend.
define bfloat @test_fadd_imm_1(bfloat %a) #0 {
; SM80-LABEL: test_fadd_imm_1(
; SM80:       {
; SM80-NEXT:    .reg .b16 %rs<4>;
; SM80-EMPTY:
; SM80-NEXT:  // %bb.0:
; SM80-NEXT:    ld.param.b16 %rs1, [test_fadd_imm_1_param_0];
; SM80-NEXT:    mov.b16 %rs2, 0x3F80;
; SM80-NEXT:    fma.rn.bf16 %rs3, %rs1, %rs2, %rs2;
; SM80-NEXT:    st.param.b16 [func_retval0], %rs3;
; SM80-NEXT:    ret;
;
; SM90-LABEL: test_fadd_imm_1(
; SM90:       {
; SM90-NEXT:    .reg .b16 %rs<4>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b16 %rs1, [test_fadd_imm_1_param_0];
; SM90-NEXT:    mov.b16 %rs2, 0x3F80;
; SM90-NEXT:    add.rn.bf16 %rs3, %rs1, %rs2;
; SM90-NEXT:    st.param.b16 [func_retval0], %rs3;
; SM90-NEXT:    ret;
  %r = fadd bfloat %a, 1.0
  ret bfloat %r
}

; Vector fsub. For sm_80 the expected form multiplies %b by an immediate
; (-1082081408 is 0xBF80BF80) and accumulates onto %a with fma.rn.bf16x2;
; for sm_90 it is sub.rn.bf16x2.
define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
; SM80-LABEL: test_fsubx2(
; SM80:       {
; SM80-NEXT:    .reg .b32 %r<5>;
; SM80-EMPTY:
; SM80-NEXT:  // %bb.0:
; SM80-NEXT:    ld.param.b32 %r1, [test_fsubx2_param_0];
; SM80-NEXT:    ld.param.b32 %r2, [test_fsubx2_param_1];
; SM80-NEXT:    mov.b32 %r3, -1082081408;
; SM80-NEXT:    fma.rn.bf16x2 %r4, %r2, %r3, %r1;
; SM80-NEXT:    st.param.b32 [func_retval0], %r4;
; SM80-NEXT:    ret;
;
; SM90-LABEL: test_fsubx2(
; SM90:       {
; SM90-NEXT:    .reg .b32 %r<4>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b32 %r1, [test_fsubx2_param_1];
; SM90-NEXT:    ld.param.b32 %r2, [test_fsubx2_param_0];
; SM90-NEXT:    sub.rn.bf16x2 %r3, %r2, %r1;
; SM90-NEXT:    st.param.b32 [func_retval0], %r3;
; SM90-NEXT:    ret;
  %r = fsub <2 x bfloat> %a, %b
  ret <2 x bfloat> %r
}

; Vector fmul. For sm_80 the expected form is fma.rn.bf16x2 with an immediate
; addend (-2147450880 is 0x80008000); for sm_90 it is mul.rn.bf16x2.
define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
; SM80-LABEL: test_fmulx2(
; SM80:       {
; SM80-NEXT:    .reg .b32 %r<5>;
; SM80-EMPTY:
; SM80-NEXT:  // %bb.0:
; SM80-NEXT:    ld.param.b32 %r1, [test_fmulx2_param_1];
; SM80-NEXT:    ld.param.b32 %r2, [test_fmulx2_param_0];
; SM80-NEXT:    mov.b32 %r3, -2147450880;
; SM80-NEXT:    fma.rn.bf16x2 %r4, %r2, %r1, %r3;
; SM80-NEXT:    st.param.b32 [func_retval0], %r4;
; SM80-NEXT:    ret;
;
; SM90-LABEL: test_fmulx2(
; SM90:       {
; SM90-NEXT:    .reg .b32 %r<4>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b32 %r1, [test_fmulx2_param_1];
; SM90-NEXT:    ld.param.b32 %r2, [test_fmulx2_param_0];
; SM90-NEXT:    mul.rn.bf16x2 %r3, %r2, %r1;
; SM90-NEXT:    st.param.b32 [func_retval0], %r3;
; SM90-NEXT:    ret;
  %r = fmul <2 x bfloat> %a, %b
  ret <2 x bfloat> %r
}

; Vector fdiv. Both configurations expect each lane to be converted to f32,
; divided with div.rn.f32, and the two results packed back with
; cvt.rn.bf16x2.f32.
define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
; CHECK-LABEL: test_fdiv(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<5>;
; CHECK-NEXT:    .reg .b32 %r<4>;
; CHECK-NEXT:    .reg .f32 %f<7>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b32 %r1, [test_fdiv_param_0];
; CHECK-NEXT:    ld.param.b32 %r2, [test_fdiv_param_1];
; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
; CHECK-NEXT:    cvt.f32.bf16 %f1, %rs1;
; CHECK-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
; CHECK-NEXT:    cvt.f32.bf16 %f2, %rs3;
; CHECK-NEXT:    div.rn.f32 %f3, %f2, %f1;
; CHECK-NEXT:    cvt.f32.bf16 %f4, %rs2;
; CHECK-NEXT:    cvt.f32.bf16 %f5, %rs4;
; CHECK-NEXT:    div.rn.f32 %f6, %f5, %f4;
; CHECK-NEXT:    cvt.rn.bf16x2.f32 %r3, %f6, %f3;
; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
; CHECK-NEXT:    ret;
  %r = fdiv <2 x bfloat> %a, %b
  ret <2 x bfloat> %r
}

; Vector fneg is expected as a single xor.b32 with an immediate mask
; (-2147450880 is 0x80008000).
define <2 x bfloat> @test_fneg(<2 x bfloat> %a) #0 {
; CHECK-LABEL: test_fneg(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<3>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.u32 %r1, [test_fneg_param_0];
; CHECK-NEXT:    xor.b32 %r2, %r1, -2147450880;
; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
; CHECK-NEXT:    ret;
  %r = fneg <2 x bfloat> %a
  ret <2 x bfloat> %r
}

; Load and store of <2 x bfloat> are expected as one 32-bit ld / st pair.
define void @test_ldst_v2bf16(ptr %a, ptr %b) {
; CHECK-LABEL: test_ldst_v2bf16(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<2>;
; CHECK-NEXT:    .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.u64 %rd1, [test_ldst_v2bf16_param_0];
; CHECK-NEXT:    ld.b32 %r1, [%rd1];
; CHECK-NEXT:    ld.param.u64 %rd2, [test_ldst_v2bf16_param_1];
; CHECK-NEXT:    st.b32 [%rd2], %r1;
; CHECK-NEXT:    ret;
  %t1 = load <2 x bfloat>, ptr %a
  store <2 x bfloat> %t1, ptr %b, align 16
  ret void
}

; Load and store of <3 x bfloat>: the expected output is one 64-bit load,
; then a 32-bit store followed by a 16-bit store at offset 4.
define void @test_ldst_v3bf16(ptr %a, ptr %b) {
; CHECK-LABEL: test_ldst_v3bf16(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<2>;
; CHECK-NEXT:    .reg .b32 %r<2>;
; CHECK-NEXT:    .reg .b64 %rd<4>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.u64 %rd1, [test_ldst_v3bf16_param_0];
; CHECK-NEXT:    ld.u64 %rd2, [%rd1];
; CHECK-NEXT:    { .reg .b32 tmp; mov.b64 {tmp, %r1}, %rd2; }
; CHECK-NEXT:    ld.param.u64 %rd3, [test_ldst_v3bf16_param_1];
; CHECK-NEXT:    st.u32 [%rd3], %rd2;
; CHECK-NEXT:    { .reg .b16 tmp; mov.b32 {%rs1, tmp}, %r1; }
; CHECK-NEXT:    st.b16 [%rd3+4], %rs1;
; CHECK-NEXT:    ret;
  %t1 = load <3 x bfloat>, ptr %a
  store <3 x bfloat> %t1, ptr %b, align 16
  ret void
}

; External callee used by test_call below.
declare <2 x bfloat> @test_callee(<2 x bfloat> %a, <2 x bfloat> %b) #0

; Passing and returning <2 x bfloat> through a call: each argument and the
; return value are expected to travel in a 4-byte, 4-aligned .param.
define <2 x bfloat> @test_call(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
; CHECK-LABEL: test_call(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<5>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b32 %r1, [test_call_param_0];
; CHECK-NEXT:    ld.param.b32 %r2, [test_call_param_1];
; CHECK-NEXT:    { // callseq 0, 0
; CHECK-NEXT:    .param .align 4 .b8 param0[4];
; CHECK-NEXT:    st.param.b32 [param0], %r1;
; CHECK-NEXT:    .param .align 4 .b8 param1[4];
; CHECK-NEXT:    st.param.b32 [param1], %r2;
; CHECK-NEXT:    .param .align 4 .b8 retval0[4];
; CHECK-NEXT:    call.uni (retval0),
; CHECK-NEXT:    test_callee,
; CHECK-NEXT:    (
; CHECK-NEXT:    param0,
; CHECK-NEXT:    param1
; CHECK-NEXT:    );
; CHECK-NEXT:    ld.param.b32 %r3, [retval0];
; CHECK-NEXT:    } // callseq 0
; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
; CHECK-NEXT:    ret;
  %r = call <2 x bfloat> @test_callee(<2 x bfloat> %a, <2 x bfloat> %b)
  ret <2 x bfloat> %r
}

; Select between two vectors on a scalar i1 is expected as one selp.b32.
define <2 x bfloat> @test_select(<2 x bfloat> %a, <2 x bfloat> %b, i1 zeroext %c) #0 {
; CHECK-LABEL: test_select(
; CHECK:       {
; CHECK-NEXT:    .reg .pred %p<2>;
; CHECK-NEXT:    .reg .b16 %rs<3>;
; CHECK-NEXT:    .reg .b32 %r<4>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.u8 %rs1, [test_select_param_2];
; CHECK-NEXT:    and.b16 %rs2, %rs1, 1;
; CHECK-NEXT:    setp.eq.b16 %p1, %rs2, 1;
; CHECK-NEXT:    ld.param.b32 %r1, [test_select_param_1];
; CHECK-NEXT:    ld.param.b32 %r2, [test_select_param_0];
; CHECK-NEXT:    selp.b32 %r3, %r2, %r1, %p1;
; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
; CHECK-NEXT:    ret;
  %r = select i1 %c, <2 x bfloat> %a, <2 x bfloat> %b
  ret <2 x bfloat> %r
}

; fcmp une followed by a per-lane select. For sm_80 the compare is expected
; per lane in f32; for sm_90 a single setp.neu.bf16x2 yields both predicates.
define <2 x bfloat> @test_select_cc(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c, <2 x bfloat> %d) #0 {
; SM80-LABEL: test_select_cc(
; SM80:       {
; SM80-NEXT:    .reg .pred %p<3>;
; SM80-NEXT:    .reg .b16 %rs<11>;
; SM80-NEXT:    .reg .b32 %r<6>;
; SM80-NEXT:    .reg .f32 %f<5>;
; SM80-EMPTY:
; SM80-NEXT:  // %bb.0:
; SM80-NEXT:    ld.param.b32 %r1, [test_select_cc_param_0];
; SM80-NEXT:    ld.param.b32 %r2, [test_select_cc_param_1];
; SM80-NEXT:    ld.param.b32 %r3, [test_select_cc_param_2];
; SM80-NEXT:    ld.param.b32 %r4, [test_select_cc_param_3];
; SM80-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
; SM80-NEXT:    cvt.f32.bf16 %f1, %rs1;
; SM80-NEXT:    mov.b32 {%rs3, %rs4}, %r3;
; SM80-NEXT:    cvt.f32.bf16 %f2, %rs3;
; SM80-NEXT:    setp.neu.f32 %p1, %f2, %f1;
; SM80-NEXT:    cvt.f32.bf16 %f3, %rs2;
; SM80-NEXT:    cvt.f32.bf16 %f4, %rs4;
; SM80-NEXT:    setp.neu.f32 %p2, %f4, %f3;
; SM80-NEXT:    mov.b32 {%rs5, %rs6}, %r2;
; SM80-NEXT:    mov.b32 {%rs7, %rs8}, %r1;
; SM80-NEXT:    selp.b16 %rs9, %rs8, %rs6, %p2;
; SM80-NEXT:    selp.b16 %rs10, %rs7, %rs5, %p1;
; SM80-NEXT:    mov.b32 %r5, {%rs10, %rs9};
; SM80-NEXT:    st.param.b32 [func_retval0], %r5;
; SM80-NEXT:    ret;
;
; SM90-LABEL: test_select_cc(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<7>;
; SM90-NEXT:    .reg .b32 %r<6>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b32 %r1, [test_select_cc_param_0];
; SM90-NEXT:    ld.param.b32 %r2, [test_select_cc_param_1];
; SM90-NEXT:    ld.param.b32 %r3, [test_select_cc_param_3];
; SM90-NEXT:    ld.param.b32 %r4, [test_select_cc_param_2];
; SM90-NEXT:    setp.neu.bf16x2 %p1|%p2, %r4, %r3;
; SM90-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
; SM90-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
; SM90-NEXT:    selp.b16 %rs5, %rs4, %rs2, %p2;
; SM90-NEXT:    selp.b16 %rs6, %rs3, %rs1, %p1;
; SM90-NEXT:    mov.b32 %r5, {%rs6, %rs5};
; SM90-NEXT:    st.param.b32 [func_retval0], %r5;
; SM90-NEXT:    ret;
  %cc = fcmp une <2 x bfloat> %c, %d
  %r = select <2 x i1> %cc, <2 x bfloat> %a, <2 x bfloat> %b
  ret <2 x bfloat> %r
}

; Same <2 x bfloat> compare as test_select_cc, but the selected values are
; <2 x float>, so the expected selects are selp.f32.
define <2 x float> @test_select_cc_f32_bf16(<2 x float> %a, <2 x float> %b,
; SM80-LABEL: test_select_cc_f32_bf16(
; SM80:       {
; SM80-NEXT:    .reg .pred %p<3>;
; SM80-NEXT:    .reg .b16 %rs<5>;
; SM80-NEXT:    .reg .b32 %r<3>;
; SM80-NEXT:    .reg .f32 %f<11>;
; SM80-EMPTY:
; SM80-NEXT:  // %bb.0:
; SM80-NEXT:    ld.param.v2.f32 {%f1, %f2}, [test_select_cc_f32_bf16_param_0];
; SM80-NEXT:    ld.param.b32 %r1, [test_select_cc_f32_bf16_param_2];
; SM80-NEXT:    ld.param.b32 %r2, [test_select_cc_f32_bf16_param_3];
; SM80-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
; SM80-NEXT:    cvt.f32.bf16 %f3, %rs1;
; SM80-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
; SM80-NEXT:    cvt.f32.bf16 %f4, %rs3;
; SM80-NEXT:    setp.neu.f32 %p1, %f4, %f3;
; SM80-NEXT:    cvt.f32.bf16 %f5, %rs2;
; SM80-NEXT:    cvt.f32.bf16 %f6, %rs4;
; SM80-NEXT:    setp.neu.f32 %p2, %f6, %f5;
; SM80-NEXT:    ld.param.v2.f32 {%f7, %f8}, [test_select_cc_f32_bf16_param_1];
; SM80-NEXT:    selp.f32 %f9, %f2, %f8, %p2;
; SM80-NEXT:    selp.f32 %f10, %f1, %f7, %p1;
; SM80-NEXT:    st.param.v2.f32 [func_retval0], {%f10, %f9};
; SM80-NEXT:    ret;
;
; SM90-LABEL: test_select_cc_f32_bf16(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b32 %r<3>;
; SM90-NEXT:    .reg .f32 %f<7>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.v2.f32 {%f1, %f2}, [test_select_cc_f32_bf16_param_0];
; SM90-NEXT:    ld.param.b32 %r1, [test_select_cc_f32_bf16_param_3];
; SM90-NEXT:    ld.param.b32 %r2, [test_select_cc_f32_bf16_param_2];
; SM90-NEXT:    setp.neu.bf16x2 %p1|%p2, %r2, %r1;
; SM90-NEXT:    ld.param.v2.f32 {%f3, %f4}, [test_select_cc_f32_bf16_param_1];
; SM90-NEXT:    selp.f32 %f5, %f2, %f4, %p2;
; SM90-NEXT:    selp.f32 %f6, %f1, %f3, %p1;
; SM90-NEXT:    st.param.v2.f32 [func_retval0], {%f6, %f5};
; SM90-NEXT:    ret;
                                           <2 x bfloat> %c, <2 x bfloat> %d) #0 {
  %cc = fcmp une <2 x bfloat> %c, %d
  %r = select <2 x i1> %cc, <2 x float> %a, <2 x float> %b
  ret <2 x float> %r
}

; The reverse mix: the compare is on <2 x float> and the selected values are
; <2 x bfloat>. Both configurations share the same expected output.
define <2 x bfloat> @test_select_cc_bf16_f32(<2 x bfloat> %a, <2 x bfloat> %b,
; CHECK-LABEL: test_select_cc_bf16_f32(
; CHECK:       {
; CHECK-NEXT:    .reg .pred %p<3>;
; CHECK-NEXT:    .reg .b16 %rs<7>;
; CHECK-NEXT:    .reg .b32 %r<4>;
; CHECK-NEXT:    .reg .f32 %f<5>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b32 %r1, [test_select_cc_bf16_f32_param_0];
; CHECK-NEXT:    ld.param.b32 %r2, [test_select_cc_bf16_f32_param_1];
; CHECK-NEXT:    ld.param.v2.f32 {%f1, %f2}, [test_select_cc_bf16_f32_param_2];
; CHECK-NEXT:    ld.param.v2.f32 {%f3, %f4}, [test_select_cc_bf16_f32_param_3];
; CHECK-NEXT:    setp.neu.f32 %p1, %f1, %f3;
; CHECK-NEXT:    setp.neu.f32 %p2, %f2, %f4;
; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
; CHECK-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
; CHECK-NEXT:    selp.b16 %rs5, %rs4, %rs2, %p2;
; CHECK-NEXT:    selp.b16 %rs6, %rs3, %rs1, %p1;
; CHECK-NEXT:    mov.b32 %r3, {%rs6, %rs5};
; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
; CHECK-NEXT:    ret;
                                           <2 x float> %c, <2 x float> %d) #0 {
  %cc = fcmp une <2 x float> %c, %d
  %r = select <2 x i1> %cc, <2 x bfloat> %a, <2 x bfloat> %b
  ret <2 x bfloat> %r
}

; fptrunc from <2 x float> is expected as a single cvt.rn.bf16x2.f32.
define <2 x bfloat> @test_fptrunc_2xfloat(<2 x float> %a) #0 {
; CHECK-LABEL: test_fptrunc_2xfloat(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<2>;
; CHECK-NEXT:    .reg .f32 %f<3>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.v2.f32 {%f1, %f2}, [test_fptrunc_2xfloat_param_0];
; CHECK-NEXT:    cvt.rn.bf16x2.f32 %r1, %f2, %f1;
; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
; CHECK-NEXT:    ret;
  %r = fptrunc <2 x float> %a to <2 x bfloat>
  ret <2 x bfloat> %r
}

; fpext to <2 x float> is expected as an unpack followed by one cvt.f32.bf16
; per lane.
define <2 x float> @test_fpext_2xfloat(<2 x bfloat> %a) #0 {
; CHECK-LABEL: test_fpext_2xfloat(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<3>;
; CHECK-NEXT:    .reg .b32 %r<2>;
; CHECK-NEXT:    .reg .f32 %f<3>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b32 %r1, [test_fpext_2xfloat_param_0];
; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
; CHECK-NEXT:    cvt.f32.bf16 %f1, %rs2;
; CHECK-NEXT:    cvt.f32.bf16 %f2, %rs1;
; CHECK-NEXT:    st.param.v2.f32 [func_retval0], {%f2, %f1};
; CHECK-NEXT:    ret;
  %r = fpext <2 x bfloat> %a to <2 x float>
  ret <2 x float> %r
}

; Bitcast <2 x bfloat> to <2 x i16>: the expected output is only the
; parameter load and the return-value store.
define <2 x i16> @test_bitcast_2xbf16_to_2xi16(<2 x bfloat> %a) #0 {
; CHECK-LABEL: test_bitcast_2xbf16_to_2xi16(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.u32 %r1, [test_bitcast_2xbf16_to_2xi16_param_0];
; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
; CHECK-NEXT:    ret;
  %r = bitcast <2 x bfloat> %a to <2 x i16>
  ret <2 x i16> %r
}

; Bitcast in the opposite direction; likewise only a load and a store are
; expected.
define <2 x bfloat> @test_bitcast_2xi16_to_2xbf16(<2 x i16> %a) #0 {
; CHECK-LABEL: test_bitcast_2xi16_to_2xbf16(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b32 %r1, [test_bitcast_2xi16_to_2xbf16_param_0];
; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
; CHECK-NEXT:    ret;
  %r = bitcast <2 x i16> %a to <2 x bfloat>
  ret <2 x bfloat> %r
}

; Declarations of the math intrinsics used by the tests below. Several of
; them have no caller in the part of the file visible during this review.
; NOTE(review): every name carries an .f16 suffix although the operands are
; <2 x bfloat>; presumably the IR reader remangles these on load - confirm
; before depending on the exact spelling.
declare <2 x bfloat> @llvm.sqrt.f16(<2 x bfloat> %a) #0
declare <2 x bfloat> @llvm.powi.f16(<2 x bfloat> %a, <2 x i32> %b) #0
declare <2 x bfloat> @llvm.sin.f16(<2 x bfloat> %a) #0
declare <2 x bfloat> @llvm.cos.f16(<2 x bfloat> %a) #0
declare <2 x bfloat> @llvm.pow.f16(<2 x bfloat> %a, <2 x bfloat> %b) #0
declare <2 x bfloat> @llvm.exp.f16(<2 x bfloat> %a) #0
declare <2 x bfloat> @llvm.exp2.f16(<2 x bfloat> %a) #0
declare <2 x bfloat> @llvm.log.f16(<2 x bfloat> %a) #0
declare <2 x bfloat> @llvm.log10.f16(<2 x bfloat> %a) #0
declare <2 x bfloat> @llvm.log2.f16(<2 x bfloat> %a) #0
declare <2 x bfloat> @llvm.fma.f16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) #0
declare <2 x bfloat> @llvm.fabs.f16(<2 x bfloat> %a) #0
declare <2 x bfloat> @llvm.minnum.f16(<2 x bfloat> %a, <2 x bfloat> %b) #0
declare <2 x bfloat> @llvm.maxnum.f16(<2 x bfloat> %a, <2 x bfloat> %b) #0
declare <2 x bfloat> @llvm.copysign.f16(<2 x bfloat> %a, <2 x bfloat> %b) #0
declare <2 x bfloat> @llvm.floor.f16(<2 x bfloat> %a) #0
declare <2 x bfloat> @llvm.ceil.f16(<2 x bfloat> %a) #0
declare <2 x bfloat> @llvm.trunc.f16(<2 x bfloat> %a) #0
declare <2 x bfloat> @llvm.rint.f16(<2 x bfloat> %a) #0
declare <2 x bfloat> @llvm.nearbyint.f16(<2 x bfloat> %a) #0
declare <2 x bfloat> @llvm.round.f16(<2 x bfloat> %a) #0
declare <2 x bfloat> @llvm.fmuladd.f16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) #0

; sqrt is expected per lane in f32 (sqrt.rn.f32), with the results repacked
; by cvt.rn.bf16x2.f32.
define <2 x bfloat> @test_sqrt(<2 x bfloat> %a) #0 {
; CHECK-LABEL: test_sqrt(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<3>;
; CHECK-NEXT:    .reg .b32 %r<3>;
; CHECK-NEXT:    .reg .f32 %f<5>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b32 %r1, [test_sqrt_param_0];
; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
; CHECK-NEXT:    cvt.f32.bf16 %f1, %rs1;
; CHECK-NEXT:    sqrt.rn.f32 %f2, %f1;
; CHECK-NEXT:    cvt.f32.bf16 %f3, %rs2;
; CHECK-NEXT:    sqrt.rn.f32 %f4, %f3;
; CHECK-NEXT:    cvt.rn.bf16x2.f32 %r2, %f4, %f2;
; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
; CHECK-NEXT:    ret;
  %r = call <2 x bfloat> @llvm.sqrt.f16(<2 x bfloat> %a)
  ret <2 x bfloat> %r
}

; fmuladd is expected as a single fma.rn.bf16x2 on both configurations.
define <2 x bfloat> @test_fmuladd(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) #0 {
; CHECK-LABEL: test_fmuladd(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<5>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b32 %r1, [test_fmuladd_param_2];
; CHECK-NEXT:    ld.param.b32 %r2, [test_fmuladd_param_1];
; CHECK-NEXT:    ld.param.b32 %r3, [test_fmuladd_param_0];
; CHECK-NEXT:    fma.rn.bf16x2 %r4, %r3, %r2, %r1;
; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
; CHECK-NEXT:    ret;
  %r = call <2 x bfloat> @llvm.fmuladd.f16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
  ret <2 x bfloat> %r
}

; fabs applied directly to a parameter is expected as an and.b32 with an
; immediate mask (2147450879 is 0x7FFF7FFF).
define <2 x bfloat> @test_fabs(<2 x bfloat> %a) #0 {
; CHECK-LABEL: test_fabs(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<3>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.u32 %r1, [test_fabs_param_0];
; CHECK-NEXT:    and.b32 %r2, %r1, 2147450879;
; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
; CHECK-NEXT:    ret;
  %r = call <2 x bfloat> @llvm.fabs.f16(<2 x bfloat> %a)
  ret <2 x bfloat> %r
}

; fabs placed between two fadds: here the expected output uses abs.bf16x2
; rather than the and.b32 mask expected in test_fabs.
define <2 x bfloat> @test_fabs_add(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
; SM80-LABEL: test_fabs_add(
; SM80:       {
; SM80-NEXT:    .reg .b32 %r<7>;
; SM80-EMPTY:
; SM80-NEXT:  // %bb.0:
; SM80-NEXT:    ld.param.b32 %r1, [test_fabs_add_param_1];
; SM80-NEXT:    ld.param.b32 %r2, [test_fabs_add_param_0];
; SM80-NEXT:    mov.b32 %r3, 1065369472;
; SM80-NEXT:    fma.rn.bf16x2 %r4, %r2, %r3, %r2;
; SM80-NEXT:    abs.bf16x2 %r5, %r4;
; SM80-NEXT:    fma.rn.bf16x2 %r6, %r5, %r3, %r1;
; SM80-NEXT:    st.param.b32 [func_retval0], %r6;
; SM80-NEXT:    ret;
;
; SM90-LABEL: test_fabs_add(
; SM90:       {
; SM90-NEXT:    .reg .b32 %r<6>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b32 %r1, [test_fabs_add_param_1];
; SM90-NEXT:    ld.param.b32 %r2, [test_fabs_add_param_0];
; SM90-NEXT:    add.rn.bf16x2 %r3, %r2, %r2;
; SM90-NEXT:    abs.bf16x2 %r4, %r3;
; SM90-NEXT:    add.rn.bf16x2 %r5, %r4, %r1;
; SM90-NEXT:    st.param.b32 [func_retval0], %r5;
; SM90-NEXT:    ret;
  %s = fadd <2 x bfloat> %a, %a
  %r = call <2 x bfloat> @llvm.fabs.f16(<2 x bfloat> %s)
  %d = fadd <2 x bfloat> %r, %b
  ret <2 x bfloat> %d
}

; minnum is expected as a single min.bf16x2.
define <2 x bfloat> @test_minnum(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
; CHECK-LABEL: test_minnum(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<4>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b32 %r1, [test_minnum_param_1];
; CHECK-NEXT:    ld.param.b32 %r2, [test_minnum_param_0];
; CHECK-NEXT:    min.bf16x2 %r3, %r2, %r1;
; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
; CHECK-NEXT:    ret;
  %r = call <2 x bfloat> @llvm.minnum.f16(<2 x bfloat> %a, <2 x bfloat> %b)
  ret <2 x bfloat> %r
}

; maxnum is expected as a single max.bf16x2.
define <2 x bfloat> @test_maxnum(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
; CHECK-LABEL: test_maxnum(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<4>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b32 %r1, [test_maxnum_param_1];
; CHECK-NEXT:    ld.param.b32 %r2, [test_maxnum_param_0];
; CHECK-NEXT:    max.bf16x2 %r3, %r2, %r1;
; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
; CHECK-NEXT:    ret;
  %r = call <2 x bfloat> @llvm.maxnum.f16(<2 x bfloat> %a, <2 x bfloat> %b)
  ret <2 x bfloat> %r
}

; floor: for sm_80 each lane is expected to round-trip through f32 with
; cvt.rmi.f32.f32; for sm_90 each lane uses cvt.rmi.bf16.bf16 directly.
define <2 x bfloat> @test_floor(<2 x bfloat> %a) #0 {
; SM80-LABEL: test_floor(
; SM80:       {
; SM80-NEXT:    .reg .b16 %rs<3>;
; SM80-NEXT:    .reg .b32 %r<3>;
; SM80-NEXT:    .reg .f32 %f<5>;
; SM80-EMPTY:
; SM80-NEXT:  // %bb.0:
; SM80-NEXT:    ld.param.b32 %r1, [test_floor_param_0];
; SM80-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
; SM80-NEXT:    cvt.f32.bf16 %f1, %rs1;
; SM80-NEXT:    cvt.rmi.f32.f32 %f2, %f1;
; SM80-NEXT:    cvt.f32.bf16 %f3, %rs2;
; SM80-NEXT:    cvt.rmi.f32.f32 %f4, %f3;
; SM80-NEXT:    cvt.rn.bf16x2.f32 %r2, %f4, %f2;
; SM80-NEXT:    st.param.b32 [func_retval0], %r2;
; SM80-NEXT:    ret;
;
; SM90-LABEL: test_floor(
; SM90:       {
; SM90-NEXT:    .reg .b16 %rs<5>;
; SM90-NEXT:    .reg .b32 %r<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b32 %r1, [test_floor_param_0];
; SM90-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
; SM90-NEXT:    cvt.rmi.bf16.bf16 %rs3, %rs2;
; SM90-NEXT:    cvt.rmi.bf16.bf16 %rs4, %rs1;
; SM90-NEXT:    mov.b32 %r2, {%rs4, %rs3};
; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
; SM90-NEXT:    ret;
  %r = call <2 x bfloat> @llvm.floor.f16(<2 x bfloat> %a)
  ret <2 x bfloat> %r
}

; ceil: same shape as test_floor, with the .rpi rounding modifier expected.
define <2 x bfloat> @test_ceil(<2 x bfloat> %a) #0 {
; SM80-LABEL: test_ceil(
; SM80:       {
; SM80-NEXT:    .reg .b16 %rs<3>;
; SM80-NEXT:    .reg .b32 %r<3>;
; SM80-NEXT:    .reg .f32 %f<5>;
; SM80-EMPTY:
; SM80-NEXT:  // %bb.0:
; SM80-NEXT:    ld.param.b32 %r1, [test_ceil_param_0];
; SM80-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
; SM80-NEXT:    cvt.f32.bf16 %f1, %rs1;
; SM80-NEXT:    cvt.rpi.f32.f32 %f2, %f1;
; SM80-NEXT:    cvt.f32.bf16 %f3, %rs2;
; SM80-NEXT:    cvt.rpi.f32.f32 %f4, %f3;
; SM80-NEXT:    cvt.rn.bf16x2.f32 %r2, %f4, %f2;
; SM80-NEXT:    st.param.b32 [func_retval0], %r2;
; SM80-NEXT:    ret;
;
; SM90-LABEL: test_ceil(
; SM90:       {
; SM90-NEXT:    .reg .b16 %rs<5>;
; SM90-NEXT:    .reg .b32 %r<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b32 %r1, [test_ceil_param_0];
; SM90-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
; SM90-NEXT:    cvt.rpi.bf16.bf16 %rs3, %rs2;
; SM90-NEXT:    cvt.rpi.bf16.bf16 %rs4, %rs1;
; SM90-NEXT:    mov.b32 %r2, {%rs4, %rs3};
; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
; SM90-NEXT:    ret;
  %r = call <2 x bfloat> @llvm.ceil.f16(<2 x bfloat> %a)
  ret <2 x bfloat> %r
}

; trunc: same shape as test_floor, with the .rzi rounding modifier expected.
define <2 x bfloat> @test_trunc(<2 x bfloat> %a) #0 {
; SM80-LABEL: test_trunc(
; SM80:       {
; SM80-NEXT:    .reg .b16 %rs<3>;
; SM80-NEXT:    .reg .b32 %r<3>;
; SM80-NEXT:    .reg .f32 %f<5>;
; SM80-EMPTY:
; SM80-NEXT:  // %bb.0:
; SM80-NEXT:    ld.param.b32 %r1, [test_trunc_param_0];
; SM80-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
; SM80-NEXT:    cvt.f32.bf16 %f1, %rs1;
; SM80-NEXT:    cvt.rzi.f32.f32 %f2, %f1;
; SM80-NEXT:    cvt.f32.bf16 %f3, %rs2;
; SM80-NEXT:    cvt.rzi.f32.f32 %f4, %f3;
; SM80-NEXT:    cvt.rn.bf16x2.f32 %r2, %f4, %f2;
; SM80-NEXT:    st.param.b32 [func_retval0], %r2;
; SM80-NEXT:    ret;
;
; SM90-LABEL: test_trunc(
; SM90:       {
; SM90-NEXT:    .reg .b16 %rs<5>;
; SM90-NEXT:    .reg .b32 %r<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b32 %r1, [test_trunc_param_0];
; SM90-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
; SM90-NEXT:    cvt.rzi.bf16.bf16 %rs3, %rs2;
; SM90-NEXT:    cvt.rzi.bf16.bf16 %rs4, %rs1;
; SM90-NEXT:    mov.b32 %r2, {%rs4, %rs3};
; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
; SM90-NEXT:    ret;
  %r = call <2 x bfloat> @llvm.trunc.f16(<2 x bfloat> %a)
  ret <2 x bfloat> %r
}

; rint: same shape as test_floor, with the .rni rounding modifier expected.
define <2 x bfloat> @test_rint(<2 x bfloat> %a) #0 {
; SM80-LABEL: test_rint(
; SM80:       {
; SM80-NEXT:    .reg .b16 %rs<3>;
; SM80-NEXT:    .reg .b32 %r<3>;
; SM80-NEXT:    .reg .f32 %f<5>;
; SM80-EMPTY:
; SM80-NEXT:  // %bb.0:
; SM80-NEXT:    ld.param.b32 %r1, [test_rint_param_0];
; SM80-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
; SM80-NEXT:    cvt.f32.bf16 %f1, %rs1;
; SM80-NEXT:    cvt.rni.f32.f32 %f2, %f1;
; SM80-NEXT:    cvt.f32.bf16 %f3, %rs2;
; SM80-NEXT:    cvt.rni.f32.f32 %f4, %f3;
; SM80-NEXT:    cvt.rn.bf16x2.f32 %r2, %f4, %f2;
; SM80-NEXT:    st.param.b32 [func_retval0], %r2;
; SM80-NEXT:    ret;
;
; SM90-LABEL: test_rint(
; SM90:       {
; SM90-NEXT:    .reg .b16 %rs<5>;
; SM90-NEXT:    .reg .b32 %r<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b32 %r1, [test_rint_param_0];
; SM90-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
; SM90-NEXT:    cvt.rni.bf16.bf16 %rs3, %rs2;
; SM90-NEXT:    cvt.rni.bf16.bf16 %rs4, %rs1;
; SM90-NEXT:    mov.b32 %r2, {%rs4, %rs3};
; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
; SM90-NEXT:    ret;
  %r = call <2 x bfloat> @llvm.rint.f16(<2 x bfloat> %a)
  ret <2 x bfloat> %r
}

; round is expected to expand, per lane, into an f32 sequence of add,
; cvt.rzi, abs, setp and selp; both configurations share the same output.
define <2 x bfloat> @test_round(<2 x bfloat> %a) #0 {
; CHECK-LABEL: test_round(
; CHECK:       {
; CHECK-NEXT:    .reg .pred %p<5>;
; CHECK-NEXT:    .reg .b16 %rs<3>;
; CHECK-NEXT:    .reg .b32 %r<9>;
; CHECK-NEXT:    .reg .f32 %f<17>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b32 %r1, [test_round_param_0];
; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
; CHECK-NEXT:    cvt.f32.bf16 %f1, %rs1;
; CHECK-NEXT:    mov.b32 %r2, %f1;
; CHECK-NEXT:    and.b32 %r3, %r2, -2147483648;
; CHECK-NEXT:    or.b32 %r4, %r3, 1056964608;
; CHECK-NEXT:    mov.b32 %f2, %r4;
; CHECK-NEXT:    add.rn.f32 %f3, %f1, %f2;
; CHECK-NEXT:    cvt.rzi.f32.f32 %f4, %f3;
; CHECK-NEXT:    abs.f32 %f5, %f1;
; CHECK-NEXT:    setp.gt.f32 %p1, %f5, 0f4B000000;
; CHECK-NEXT:    selp.f32 %f6, %f1, %f4, %p1;
; CHECK-NEXT:    cvt.rzi.f32.f32 %f7, %f1;
; CHECK-NEXT:    setp.lt.f32 %p2, %f5, 0f3F000000;
; CHECK-NEXT:    selp.f32 %f8, %f7, %f6, %p2;
; CHECK-NEXT:    cvt.f32.bf16 %f9, %rs2;
; CHECK-NEXT:    mov.b32 %r5, %f9;
; CHECK-NEXT:    and.b32 %r6, %r5, -2147483648;
; CHECK-NEXT:    or.b32 %r7, %r6, 1056964608;
; CHECK-NEXT:    mov.b32 %f10, %r7;
; CHECK-NEXT:    add.rn.f32 %f11, %f9, %f10;
; CHECK-NEXT:    cvt.rzi.f32.f32 %f12, %f11;
; CHECK-NEXT:    abs.f32 %f13, %f9;
; CHECK-NEXT:    setp.gt.f32 %p3, %f13, 0f4B000000;
; CHECK-NEXT:    selp.f32 %f14, %f9, %f12, %p3;
; CHECK-NEXT:    cvt.rzi.f32.f32 %f15, %f9;
; CHECK-NEXT:    setp.lt.f32 %p4, %f13, 0f3F000000;
; CHECK-NEXT:    selp.f32 %f16, %f15, %f14, %p4;
; CHECK-NEXT:    cvt.rn.bf16x2.f32 %r8, %f16, %f8;
; CHECK-NEXT:    st.param.b32 [func_retval0], %r8;
; CHECK-NEXT:    ret;
  %r = call <2 x bfloat> @llvm.round.f16(<2 x bfloat> %a)
  ret <2 x bfloat> %r
}

; copysign is expected as pure bit manipulation: one and.b32 on each operand
; (0x80008000 on %b, 0x7FFF7FFF on %a) combined with or.b32.
define <2 x bfloat> @test_copysign(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
; CHECK-LABEL: test_copysign(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<6>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b32 %r1, [test_copysign_param_0];
; CHECK-NEXT:    ld.param.b32 %r2, [test_copysign_param_1];
; CHECK-NEXT:    and.b32 %r3, %r2, -2147450880;
; CHECK-NEXT:    and.b32 %r4, %r1, 2147450879;
; CHECK-NEXT:    or.b32 %r5, %r4, %r3;
; CHECK-NEXT:    st.param.b32 [func_retval0], %r5;
; CHECK-NEXT:    ret;
  %r = call <2 x bfloat> @llvm.copysign.f16(<2 x bfloat> %a, <2 x bfloat> %b)
  ret <2 x bfloat> %r
}
