1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 2; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx71 | FileCheck --check-prefixes=CHECK,SM70 %s 3; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 | FileCheck --check-prefixes=CHECK,SM80 %s 4; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 -denormal-fp-math-f32=preserve-sign | FileCheck --check-prefixes=CHECK,SM80-FTZ %s 5; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | FileCheck --check-prefixes=CHECK,SM90 %s 6; RUN: %if ptxas-11.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 | %ptxas-verify -arch=sm_80 %} 7; RUN: %if ptxas-11.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 -denormal-fp-math-f32=preserve-sign | %ptxas-verify -arch=sm_80 %} 8; RUN: %if ptxas-11.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %} 9 10target triple = "nvptx64-nvidia-cuda" 11; The directive below uses the common prefix that every FileCheck invocation above enables, so the global's byte layout is verified in all configurations; bfloat 0xR0201 is expected as the little-endian bytes 1, 2. 12; CHECK: .b8 bfloat_array[8] = {1, 2, 3, 4, 5, 6, 7, 8}; 13@"bfloat_array" = addrspace(1) constant [4 x bfloat] 14 [bfloat 0xR0201, bfloat 0xR0403, bfloat 0xR0605, bfloat 0xR0807] 15 16define bfloat @test_fadd(bfloat %0, bfloat %1) { 17; SM70-LABEL: test_fadd( 18; SM70: { 19; SM70-NEXT: .reg .pred %p<2>; 20; SM70-NEXT: .reg .b16 %rs<2>; 21; SM70-NEXT: .reg .b32 %r<11>; 22; SM70-NEXT: .reg .f32 %f<4>; 23; SM70-EMPTY: 24; SM70-NEXT: // %bb.0: 25; SM70-NEXT: ld.param.u16 %r1, [test_fadd_param_1]; 26; SM70-NEXT: shl.b32 %r2, %r1, 16; 27; SM70-NEXT: mov.b32 %f1, %r2; 28; SM70-NEXT: ld.param.u16 %r3, [test_fadd_param_0]; 29; SM70-NEXT: shl.b32 %r4, %r3, 16; 30; SM70-NEXT: mov.b32 %f2, %r4; 31; SM70-NEXT: add.rn.f32 %f3, %f2, %f1; 32; SM70-NEXT: mov.b32 %r5, %f3; 33; SM70-NEXT: bfe.u32 %r6, %r5, 16, 1; 34; SM70-NEXT: add.s32 %r7, %r6, %r5; 35; SM70-NEXT: add.s32 %r8, %r7, 32767; 36; SM70-NEXT: setp.nan.f32 %p1, %f3, %f3; 37; SM70-NEXT: or.b32 %r9, %r5, 4194304; 38; SM70-NEXT: selp.b32 %r10, %r9, %r8, %p1; 39; SM70-NEXT: { .reg 
.b16 tmp; mov.b32 {tmp, %rs1}, %r10; } 40; SM70-NEXT: st.param.b16 [func_retval0], %rs1; 41; SM70-NEXT: ret; 42; 43; SM80-LABEL: test_fadd( 44; SM80: { 45; SM80-NEXT: .reg .b16 %rs<5>; 46; SM80-EMPTY: 47; SM80-NEXT: // %bb.0: 48; SM80-NEXT: ld.param.b16 %rs1, [test_fadd_param_0]; 49; SM80-NEXT: ld.param.b16 %rs2, [test_fadd_param_1]; 50; SM80-NEXT: mov.b16 %rs3, 0x3F80; 51; SM80-NEXT: fma.rn.bf16 %rs4, %rs1, %rs3, %rs2; 52; SM80-NEXT: st.param.b16 [func_retval0], %rs4; 53; SM80-NEXT: ret; 54; 55; SM80-FTZ-LABEL: test_fadd( 56; SM80-FTZ: { 57; SM80-FTZ-NEXT: .reg .b16 %rs<4>; 58; SM80-FTZ-NEXT: .reg .f32 %f<4>; 59; SM80-FTZ-EMPTY: 60; SM80-FTZ-NEXT: // %bb.0: 61; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fadd_param_0]; 62; SM80-FTZ-NEXT: ld.param.b16 %rs2, [test_fadd_param_1]; 63; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs2; 64; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs1; 65; SM80-FTZ-NEXT: add.rn.ftz.f32 %f3, %f2, %f1; 66; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs3, %f3; 67; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; 68; SM80-FTZ-NEXT: ret; 69; 70; SM90-LABEL: test_fadd( 71; SM90: { 72; SM90-NEXT: .reg .b16 %rs<4>; 73; SM90-EMPTY: 74; SM90-NEXT: // %bb.0: 75; SM90-NEXT: ld.param.b16 %rs1, [test_fadd_param_0]; 76; SM90-NEXT: ld.param.b16 %rs2, [test_fadd_param_1]; 77; SM90-NEXT: add.rn.bf16 %rs3, %rs1, %rs2; 78; SM90-NEXT: st.param.b16 [func_retval0], %rs3; 79; SM90-NEXT: ret; 80 %3 = fadd bfloat %0, %1 81 ret bfloat %3 82} 83 84define bfloat @test_fsub(bfloat %0, bfloat %1) { 85; SM70-LABEL: test_fsub( 86; SM70: { 87; SM70-NEXT: .reg .pred %p<2>; 88; SM70-NEXT: .reg .b16 %rs<2>; 89; SM70-NEXT: .reg .b32 %r<11>; 90; SM70-NEXT: .reg .f32 %f<4>; 91; SM70-EMPTY: 92; SM70-NEXT: // %bb.0: 93; SM70-NEXT: ld.param.u16 %r1, [test_fsub_param_1]; 94; SM70-NEXT: shl.b32 %r2, %r1, 16; 95; SM70-NEXT: mov.b32 %f1, %r2; 96; SM70-NEXT: ld.param.u16 %r3, [test_fsub_param_0]; 97; SM70-NEXT: shl.b32 %r4, %r3, 16; 98; SM70-NEXT: mov.b32 %f2, %r4; 99; SM70-NEXT: sub.rn.f32 %f3, %f2, %f1; 
100; SM70-NEXT: mov.b32 %r5, %f3; 101; SM70-NEXT: bfe.u32 %r6, %r5, 16, 1; 102; SM70-NEXT: add.s32 %r7, %r6, %r5; 103; SM70-NEXT: add.s32 %r8, %r7, 32767; 104; SM70-NEXT: setp.nan.f32 %p1, %f3, %f3; 105; SM70-NEXT: or.b32 %r9, %r5, 4194304; 106; SM70-NEXT: selp.b32 %r10, %r9, %r8, %p1; 107; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r10; } 108; SM70-NEXT: st.param.b16 [func_retval0], %rs1; 109; SM70-NEXT: ret; 110; 111; SM80-LABEL: test_fsub( 112; SM80: { 113; SM80-NEXT: .reg .b16 %rs<5>; 114; SM80-EMPTY: 115; SM80-NEXT: // %bb.0: 116; SM80-NEXT: ld.param.b16 %rs1, [test_fsub_param_0]; 117; SM80-NEXT: mov.b16 %rs2, 0xBF80; 118; SM80-NEXT: ld.param.b16 %rs3, [test_fsub_param_1]; 119; SM80-NEXT: fma.rn.bf16 %rs4, %rs3, %rs2, %rs1; 120; SM80-NEXT: st.param.b16 [func_retval0], %rs4; 121; SM80-NEXT: ret; 122; 123; SM80-FTZ-LABEL: test_fsub( 124; SM80-FTZ: { 125; SM80-FTZ-NEXT: .reg .b16 %rs<4>; 126; SM80-FTZ-NEXT: .reg .f32 %f<4>; 127; SM80-FTZ-EMPTY: 128; SM80-FTZ-NEXT: // %bb.0: 129; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fsub_param_0]; 130; SM80-FTZ-NEXT: ld.param.b16 %rs2, [test_fsub_param_1]; 131; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs2; 132; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs1; 133; SM80-FTZ-NEXT: sub.rn.ftz.f32 %f3, %f2, %f1; 134; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs3, %f3; 135; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; 136; SM80-FTZ-NEXT: ret; 137; 138; SM90-LABEL: test_fsub( 139; SM90: { 140; SM90-NEXT: .reg .b16 %rs<4>; 141; SM90-EMPTY: 142; SM90-NEXT: // %bb.0: 143; SM90-NEXT: ld.param.b16 %rs1, [test_fsub_param_0]; 144; SM90-NEXT: ld.param.b16 %rs2, [test_fsub_param_1]; 145; SM90-NEXT: sub.rn.bf16 %rs3, %rs1, %rs2; 146; SM90-NEXT: st.param.b16 [func_retval0], %rs3; 147; SM90-NEXT: ret; 148 %3 = fsub bfloat %0, %1 149 ret bfloat %3 150} 151 152define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 { 153; SM70-LABEL: test_faddx2( 154; SM70: { 155; SM70-NEXT: .reg .pred %p<3>; 156; SM70-NEXT: .reg .b16 %rs<5>; 157; 
SM70-NEXT: .reg .b32 %r<24>; 158; SM70-NEXT: .reg .f32 %f<7>; 159; SM70-EMPTY: 160; SM70-NEXT: // %bb.0: 161; SM70-NEXT: ld.param.b32 %r1, [test_faddx2_param_0]; 162; SM70-NEXT: ld.param.b32 %r2, [test_faddx2_param_1]; 163; SM70-NEXT: mov.b32 {%rs1, %rs2}, %r2; 164; SM70-NEXT: cvt.u32.u16 %r3, %rs2; 165; SM70-NEXT: shl.b32 %r4, %r3, 16; 166; SM70-NEXT: mov.b32 %f1, %r4; 167; SM70-NEXT: mov.b32 {%rs3, %rs4}, %r1; 168; SM70-NEXT: cvt.u32.u16 %r5, %rs4; 169; SM70-NEXT: shl.b32 %r6, %r5, 16; 170; SM70-NEXT: mov.b32 %f2, %r6; 171; SM70-NEXT: add.rn.f32 %f3, %f2, %f1; 172; SM70-NEXT: mov.b32 %r7, %f3; 173; SM70-NEXT: bfe.u32 %r8, %r7, 16, 1; 174; SM70-NEXT: add.s32 %r9, %r8, %r7; 175; SM70-NEXT: add.s32 %r10, %r9, 32767; 176; SM70-NEXT: setp.nan.f32 %p1, %f3, %f3; 177; SM70-NEXT: or.b32 %r11, %r7, 4194304; 178; SM70-NEXT: selp.b32 %r12, %r11, %r10, %p1; 179; SM70-NEXT: cvt.u32.u16 %r13, %rs1; 180; SM70-NEXT: shl.b32 %r14, %r13, 16; 181; SM70-NEXT: mov.b32 %f4, %r14; 182; SM70-NEXT: cvt.u32.u16 %r15, %rs3; 183; SM70-NEXT: shl.b32 %r16, %r15, 16; 184; SM70-NEXT: mov.b32 %f5, %r16; 185; SM70-NEXT: add.rn.f32 %f6, %f5, %f4; 186; SM70-NEXT: mov.b32 %r17, %f6; 187; SM70-NEXT: bfe.u32 %r18, %r17, 16, 1; 188; SM70-NEXT: add.s32 %r19, %r18, %r17; 189; SM70-NEXT: add.s32 %r20, %r19, 32767; 190; SM70-NEXT: setp.nan.f32 %p2, %f6, %f6; 191; SM70-NEXT: or.b32 %r21, %r17, 4194304; 192; SM70-NEXT: selp.b32 %r22, %r21, %r20, %p2; 193; SM70-NEXT: prmt.b32 %r23, %r22, %r12, 0x7632U; 194; SM70-NEXT: st.param.b32 [func_retval0], %r23; 195; SM70-NEXT: ret; 196; 197; SM80-LABEL: test_faddx2( 198; SM80: { 199; SM80-NEXT: .reg .b32 %r<5>; 200; SM80-EMPTY: 201; SM80-NEXT: // %bb.0: 202; SM80-NEXT: ld.param.b32 %r1, [test_faddx2_param_1]; 203; SM80-NEXT: ld.param.b32 %r2, [test_faddx2_param_0]; 204; SM80-NEXT: mov.b32 %r3, 1065369472; 205; SM80-NEXT: fma.rn.bf16x2 %r4, %r2, %r3, %r1; 206; SM80-NEXT: st.param.b32 [func_retval0], %r4; 207; SM80-NEXT: ret; 208; 209; SM80-FTZ-LABEL: test_faddx2( 210; 
SM80-FTZ: { 211; SM80-FTZ-NEXT: .reg .b16 %rs<5>; 212; SM80-FTZ-NEXT: .reg .b32 %r<4>; 213; SM80-FTZ-NEXT: .reg .f32 %f<7>; 214; SM80-FTZ-EMPTY: 215; SM80-FTZ-NEXT: // %bb.0: 216; SM80-FTZ-NEXT: ld.param.b32 %r1, [test_faddx2_param_0]; 217; SM80-FTZ-NEXT: ld.param.b32 %r2, [test_faddx2_param_1]; 218; SM80-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r2; 219; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs1; 220; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r1; 221; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs3; 222; SM80-FTZ-NEXT: add.rn.ftz.f32 %f3, %f2, %f1; 223; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs2; 224; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs4; 225; SM80-FTZ-NEXT: add.rn.ftz.f32 %f6, %f5, %f4; 226; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3; 227; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3; 228; SM80-FTZ-NEXT: ret; 229; 230; SM90-LABEL: test_faddx2( 231; SM90: { 232; SM90-NEXT: .reg .b32 %r<4>; 233; SM90-EMPTY: 234; SM90-NEXT: // %bb.0: 235; SM90-NEXT: ld.param.b32 %r1, [test_faddx2_param_1]; 236; SM90-NEXT: ld.param.b32 %r2, [test_faddx2_param_0]; 237; SM90-NEXT: add.rn.bf16x2 %r3, %r2, %r1; 238; SM90-NEXT: st.param.b32 [func_retval0], %r3; 239; SM90-NEXT: ret; 240 %r = fadd <2 x bfloat> %a, %b 241 ret <2 x bfloat> %r 242} 243 244define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 { 245; SM70-LABEL: test_fsubx2( 246; SM70: { 247; SM70-NEXT: .reg .pred %p<3>; 248; SM70-NEXT: .reg .b16 %rs<5>; 249; SM70-NEXT: .reg .b32 %r<24>; 250; SM70-NEXT: .reg .f32 %f<7>; 251; SM70-EMPTY: 252; SM70-NEXT: // %bb.0: 253; SM70-NEXT: ld.param.b32 %r1, [test_fsubx2_param_0]; 254; SM70-NEXT: ld.param.b32 %r2, [test_fsubx2_param_1]; 255; SM70-NEXT: mov.b32 {%rs1, %rs2}, %r2; 256; SM70-NEXT: cvt.u32.u16 %r3, %rs2; 257; SM70-NEXT: shl.b32 %r4, %r3, 16; 258; SM70-NEXT: mov.b32 %f1, %r4; 259; SM70-NEXT: mov.b32 {%rs3, %rs4}, %r1; 260; SM70-NEXT: cvt.u32.u16 %r5, %rs4; 261; SM70-NEXT: shl.b32 %r6, %r5, 16; 262; SM70-NEXT: mov.b32 %f2, %r6; 263; SM70-NEXT: sub.rn.f32 %f3, %f2, %f1; 
264; SM70-NEXT: mov.b32 %r7, %f3; 265; SM70-NEXT: bfe.u32 %r8, %r7, 16, 1; 266; SM70-NEXT: add.s32 %r9, %r8, %r7; 267; SM70-NEXT: add.s32 %r10, %r9, 32767; 268; SM70-NEXT: setp.nan.f32 %p1, %f3, %f3; 269; SM70-NEXT: or.b32 %r11, %r7, 4194304; 270; SM70-NEXT: selp.b32 %r12, %r11, %r10, %p1; 271; SM70-NEXT: cvt.u32.u16 %r13, %rs1; 272; SM70-NEXT: shl.b32 %r14, %r13, 16; 273; SM70-NEXT: mov.b32 %f4, %r14; 274; SM70-NEXT: cvt.u32.u16 %r15, %rs3; 275; SM70-NEXT: shl.b32 %r16, %r15, 16; 276; SM70-NEXT: mov.b32 %f5, %r16; 277; SM70-NEXT: sub.rn.f32 %f6, %f5, %f4; 278; SM70-NEXT: mov.b32 %r17, %f6; 279; SM70-NEXT: bfe.u32 %r18, %r17, 16, 1; 280; SM70-NEXT: add.s32 %r19, %r18, %r17; 281; SM70-NEXT: add.s32 %r20, %r19, 32767; 282; SM70-NEXT: setp.nan.f32 %p2, %f6, %f6; 283; SM70-NEXT: or.b32 %r21, %r17, 4194304; 284; SM70-NEXT: selp.b32 %r22, %r21, %r20, %p2; 285; SM70-NEXT: prmt.b32 %r23, %r22, %r12, 0x7632U; 286; SM70-NEXT: st.param.b32 [func_retval0], %r23; 287; SM70-NEXT: ret; 288; 289; SM80-LABEL: test_fsubx2( 290; SM80: { 291; SM80-NEXT: .reg .b32 %r<5>; 292; SM80-EMPTY: 293; SM80-NEXT: // %bb.0: 294; SM80-NEXT: ld.param.b32 %r1, [test_fsubx2_param_0]; 295; SM80-NEXT: ld.param.b32 %r2, [test_fsubx2_param_1]; 296; SM80-NEXT: mov.b32 %r3, -1082081408; 297; SM80-NEXT: fma.rn.bf16x2 %r4, %r2, %r3, %r1; 298; SM80-NEXT: st.param.b32 [func_retval0], %r4; 299; SM80-NEXT: ret; 300; 301; SM80-FTZ-LABEL: test_fsubx2( 302; SM80-FTZ: { 303; SM80-FTZ-NEXT: .reg .b16 %rs<5>; 304; SM80-FTZ-NEXT: .reg .b32 %r<4>; 305; SM80-FTZ-NEXT: .reg .f32 %f<7>; 306; SM80-FTZ-EMPTY: 307; SM80-FTZ-NEXT: // %bb.0: 308; SM80-FTZ-NEXT: ld.param.b32 %r1, [test_fsubx2_param_0]; 309; SM80-FTZ-NEXT: ld.param.b32 %r2, [test_fsubx2_param_1]; 310; SM80-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r2; 311; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs1; 312; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r1; 313; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs3; 314; SM80-FTZ-NEXT: sub.rn.ftz.f32 %f3, %f2, %f1; 315; SM80-FTZ-NEXT: 
cvt.ftz.f32.bf16 %f4, %rs2; 316; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs4; 317; SM80-FTZ-NEXT: sub.rn.ftz.f32 %f6, %f5, %f4; 318; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3; 319; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3; 320; SM80-FTZ-NEXT: ret; 321; 322; SM90-LABEL: test_fsubx2( 323; SM90: { 324; SM90-NEXT: .reg .b32 %r<4>; 325; SM90-EMPTY: 326; SM90-NEXT: // %bb.0: 327; SM90-NEXT: ld.param.b32 %r1, [test_fsubx2_param_1]; 328; SM90-NEXT: ld.param.b32 %r2, [test_fsubx2_param_0]; 329; SM90-NEXT: sub.rn.bf16x2 %r3, %r2, %r1; 330; SM90-NEXT: st.param.b32 [func_retval0], %r3; 331; SM90-NEXT: ret; 332 %r = fsub <2 x bfloat> %a, %b 333 ret <2 x bfloat> %r 334} 335 336define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 { 337; SM70-LABEL: test_fmulx2( 338; SM70: { 339; SM70-NEXT: .reg .pred %p<3>; 340; SM70-NEXT: .reg .b16 %rs<5>; 341; SM70-NEXT: .reg .b32 %r<24>; 342; SM70-NEXT: .reg .f32 %f<7>; 343; SM70-EMPTY: 344; SM70-NEXT: // %bb.0: 345; SM70-NEXT: ld.param.b32 %r1, [test_fmulx2_param_0]; 346; SM70-NEXT: ld.param.b32 %r2, [test_fmulx2_param_1]; 347; SM70-NEXT: mov.b32 {%rs1, %rs2}, %r2; 348; SM70-NEXT: cvt.u32.u16 %r3, %rs2; 349; SM70-NEXT: shl.b32 %r4, %r3, 16; 350; SM70-NEXT: mov.b32 %f1, %r4; 351; SM70-NEXT: mov.b32 {%rs3, %rs4}, %r1; 352; SM70-NEXT: cvt.u32.u16 %r5, %rs4; 353; SM70-NEXT: shl.b32 %r6, %r5, 16; 354; SM70-NEXT: mov.b32 %f2, %r6; 355; SM70-NEXT: mul.rn.f32 %f3, %f2, %f1; 356; SM70-NEXT: mov.b32 %r7, %f3; 357; SM70-NEXT: bfe.u32 %r8, %r7, 16, 1; 358; SM70-NEXT: add.s32 %r9, %r8, %r7; 359; SM70-NEXT: add.s32 %r10, %r9, 32767; 360; SM70-NEXT: setp.nan.f32 %p1, %f3, %f3; 361; SM70-NEXT: or.b32 %r11, %r7, 4194304; 362; SM70-NEXT: selp.b32 %r12, %r11, %r10, %p1; 363; SM70-NEXT: cvt.u32.u16 %r13, %rs1; 364; SM70-NEXT: shl.b32 %r14, %r13, 16; 365; SM70-NEXT: mov.b32 %f4, %r14; 366; SM70-NEXT: cvt.u32.u16 %r15, %rs3; 367; SM70-NEXT: shl.b32 %r16, %r15, 16; 368; SM70-NEXT: mov.b32 %f5, %r16; 369; SM70-NEXT: mul.rn.f32 %f6, %f5, 
%f4; 370; SM70-NEXT: mov.b32 %r17, %f6; 371; SM70-NEXT: bfe.u32 %r18, %r17, 16, 1; 372; SM70-NEXT: add.s32 %r19, %r18, %r17; 373; SM70-NEXT: add.s32 %r20, %r19, 32767; 374; SM70-NEXT: setp.nan.f32 %p2, %f6, %f6; 375; SM70-NEXT: or.b32 %r21, %r17, 4194304; 376; SM70-NEXT: selp.b32 %r22, %r21, %r20, %p2; 377; SM70-NEXT: prmt.b32 %r23, %r22, %r12, 0x7632U; 378; SM70-NEXT: st.param.b32 [func_retval0], %r23; 379; SM70-NEXT: ret; 380; 381; SM80-LABEL: test_fmulx2( 382; SM80: { 383; SM80-NEXT: .reg .b32 %r<5>; 384; SM80-EMPTY: 385; SM80-NEXT: // %bb.0: 386; SM80-NEXT: ld.param.b32 %r1, [test_fmulx2_param_1]; 387; SM80-NEXT: ld.param.b32 %r2, [test_fmulx2_param_0]; 388; SM80-NEXT: mov.b32 %r3, -2147450880; 389; SM80-NEXT: fma.rn.bf16x2 %r4, %r2, %r1, %r3; 390; SM80-NEXT: st.param.b32 [func_retval0], %r4; 391; SM80-NEXT: ret; 392; 393; SM80-FTZ-LABEL: test_fmulx2( 394; SM80-FTZ: { 395; SM80-FTZ-NEXT: .reg .b16 %rs<5>; 396; SM80-FTZ-NEXT: .reg .b32 %r<4>; 397; SM80-FTZ-NEXT: .reg .f32 %f<7>; 398; SM80-FTZ-EMPTY: 399; SM80-FTZ-NEXT: // %bb.0: 400; SM80-FTZ-NEXT: ld.param.b32 %r1, [test_fmulx2_param_0]; 401; SM80-FTZ-NEXT: ld.param.b32 %r2, [test_fmulx2_param_1]; 402; SM80-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r2; 403; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs1; 404; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r1; 405; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs3; 406; SM80-FTZ-NEXT: mul.rn.ftz.f32 %f3, %f2, %f1; 407; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs2; 408; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs4; 409; SM80-FTZ-NEXT: mul.rn.ftz.f32 %f6, %f5, %f4; 410; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3; 411; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3; 412; SM80-FTZ-NEXT: ret; 413; 414; SM90-LABEL: test_fmulx2( 415; SM90: { 416; SM90-NEXT: .reg .b32 %r<4>; 417; SM90-EMPTY: 418; SM90-NEXT: // %bb.0: 419; SM90-NEXT: ld.param.b32 %r1, [test_fmulx2_param_1]; 420; SM90-NEXT: ld.param.b32 %r2, [test_fmulx2_param_0]; 421; SM90-NEXT: mul.rn.bf16x2 %r3, %r2, %r1; 422; SM90-NEXT: st.param.b32 
[func_retval0], %r3; 423; SM90-NEXT: ret; 424 %r = fmul <2 x bfloat> %a, %b 425 ret <2 x bfloat> %r 426} 427 428define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 { 429; SM70-LABEL: test_fdiv( 430; SM70: { 431; SM70-NEXT: .reg .pred %p<3>; 432; SM70-NEXT: .reg .b16 %rs<5>; 433; SM70-NEXT: .reg .b32 %r<24>; 434; SM70-NEXT: .reg .f32 %f<7>; 435; SM70-EMPTY: 436; SM70-NEXT: // %bb.0: 437; SM70-NEXT: ld.param.b32 %r1, [test_fdiv_param_0]; 438; SM70-NEXT: ld.param.b32 %r2, [test_fdiv_param_1]; 439; SM70-NEXT: mov.b32 {%rs1, %rs2}, %r2; 440; SM70-NEXT: cvt.u32.u16 %r3, %rs2; 441; SM70-NEXT: shl.b32 %r4, %r3, 16; 442; SM70-NEXT: mov.b32 %f1, %r4; 443; SM70-NEXT: mov.b32 {%rs3, %rs4}, %r1; 444; SM70-NEXT: cvt.u32.u16 %r5, %rs4; 445; SM70-NEXT: shl.b32 %r6, %r5, 16; 446; SM70-NEXT: mov.b32 %f2, %r6; 447; SM70-NEXT: div.rn.f32 %f3, %f2, %f1; 448; SM70-NEXT: mov.b32 %r7, %f3; 449; SM70-NEXT: bfe.u32 %r8, %r7, 16, 1; 450; SM70-NEXT: add.s32 %r9, %r8, %r7; 451; SM70-NEXT: add.s32 %r10, %r9, 32767; 452; SM70-NEXT: setp.nan.f32 %p1, %f3, %f3; 453; SM70-NEXT: or.b32 %r11, %r7, 4194304; 454; SM70-NEXT: selp.b32 %r12, %r11, %r10, %p1; 455; SM70-NEXT: cvt.u32.u16 %r13, %rs1; 456; SM70-NEXT: shl.b32 %r14, %r13, 16; 457; SM70-NEXT: mov.b32 %f4, %r14; 458; SM70-NEXT: cvt.u32.u16 %r15, %rs3; 459; SM70-NEXT: shl.b32 %r16, %r15, 16; 460; SM70-NEXT: mov.b32 %f5, %r16; 461; SM70-NEXT: div.rn.f32 %f6, %f5, %f4; 462; SM70-NEXT: mov.b32 %r17, %f6; 463; SM70-NEXT: bfe.u32 %r18, %r17, 16, 1; 464; SM70-NEXT: add.s32 %r19, %r18, %r17; 465; SM70-NEXT: add.s32 %r20, %r19, 32767; 466; SM70-NEXT: setp.nan.f32 %p2, %f6, %f6; 467; SM70-NEXT: or.b32 %r21, %r17, 4194304; 468; SM70-NEXT: selp.b32 %r22, %r21, %r20, %p2; 469; SM70-NEXT: prmt.b32 %r23, %r22, %r12, 0x7632U; 470; SM70-NEXT: st.param.b32 [func_retval0], %r23; 471; SM70-NEXT: ret; 472; 473; SM80-LABEL: test_fdiv( 474; SM80: { 475; SM80-NEXT: .reg .b16 %rs<5>; 476; SM80-NEXT: .reg .b32 %r<4>; 477; SM80-NEXT: .reg .f32 %f<7>; 478; 
SM80-EMPTY: 479; SM80-NEXT: // %bb.0: 480; SM80-NEXT: ld.param.b32 %r1, [test_fdiv_param_0]; 481; SM80-NEXT: ld.param.b32 %r2, [test_fdiv_param_1]; 482; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r2; 483; SM80-NEXT: cvt.f32.bf16 %f1, %rs1; 484; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r1; 485; SM80-NEXT: cvt.f32.bf16 %f2, %rs3; 486; SM80-NEXT: div.rn.f32 %f3, %f2, %f1; 487; SM80-NEXT: cvt.f32.bf16 %f4, %rs2; 488; SM80-NEXT: cvt.f32.bf16 %f5, %rs4; 489; SM80-NEXT: div.rn.f32 %f6, %f5, %f4; 490; SM80-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3; 491; SM80-NEXT: st.param.b32 [func_retval0], %r3; 492; SM80-NEXT: ret; 493; 494; SM80-FTZ-LABEL: test_fdiv( 495; SM80-FTZ: { 496; SM80-FTZ-NEXT: .reg .b16 %rs<5>; 497; SM80-FTZ-NEXT: .reg .b32 %r<4>; 498; SM80-FTZ-NEXT: .reg .f32 %f<7>; 499; SM80-FTZ-EMPTY: 500; SM80-FTZ-NEXT: // %bb.0: 501; SM80-FTZ-NEXT: ld.param.b32 %r1, [test_fdiv_param_0]; 502; SM80-FTZ-NEXT: ld.param.b32 %r2, [test_fdiv_param_1]; 503; SM80-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r2; 504; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs1; 505; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r1; 506; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs3; 507; SM80-FTZ-NEXT: div.rn.ftz.f32 %f3, %f2, %f1; 508; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs2; 509; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs4; 510; SM80-FTZ-NEXT: div.rn.ftz.f32 %f6, %f5, %f4; 511; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3; 512; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3; 513; SM80-FTZ-NEXT: ret; 514; 515; SM90-LABEL: test_fdiv( 516; SM90: { 517; SM90-NEXT: .reg .b16 %rs<5>; 518; SM90-NEXT: .reg .b32 %r<4>; 519; SM90-NEXT: .reg .f32 %f<7>; 520; SM90-EMPTY: 521; SM90-NEXT: // %bb.0: 522; SM90-NEXT: ld.param.b32 %r1, [test_fdiv_param_0]; 523; SM90-NEXT: ld.param.b32 %r2, [test_fdiv_param_1]; 524; SM90-NEXT: mov.b32 {%rs1, %rs2}, %r2; 525; SM90-NEXT: cvt.f32.bf16 %f1, %rs1; 526; SM90-NEXT: mov.b32 {%rs3, %rs4}, %r1; 527; SM90-NEXT: cvt.f32.bf16 %f2, %rs3; 528; SM90-NEXT: div.rn.f32 %f3, %f2, %f1; 529; SM90-NEXT: cvt.f32.bf16 %f4, %rs2; 
530; SM90-NEXT: cvt.f32.bf16 %f5, %rs4; 531; SM90-NEXT: div.rn.f32 %f6, %f5, %f4; 532; SM90-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3; 533; SM90-NEXT: st.param.b32 [func_retval0], %r3; 534; SM90-NEXT: ret; 535 %r = fdiv <2 x bfloat> %a, %b 536 ret <2 x bfloat> %r 537} 538 539define bfloat @test_extract_0(<2 x bfloat> %a) #0 { 540; CHECK-LABEL: test_extract_0( 541; CHECK: { 542; CHECK-NEXT: .reg .b16 %rs<2>; 543; CHECK-EMPTY: 544; CHECK-NEXT: // %bb.0: 545; CHECK-NEXT: ld.param.b16 %rs1, [test_extract_0_param_0]; 546; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; 547; CHECK-NEXT: ret; 548 %e = extractelement <2 x bfloat> %a, i32 0 549 ret bfloat %e 550} 551 552define bfloat @test_extract_1(<2 x bfloat> %a) #0 { 553; CHECK-LABEL: test_extract_1( 554; CHECK: { 555; CHECK-NEXT: .reg .b16 %rs<2>; 556; CHECK-EMPTY: 557; CHECK-NEXT: // %bb.0: 558; CHECK-NEXT: ld.param.b16 %rs1, [test_extract_1_param_0+2]; 559; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; 560; CHECK-NEXT: ret; 561 %e = extractelement <2 x bfloat> %a, i32 1 562 ret bfloat %e 563} 564 565define float @test_fpext_float(bfloat %a) #0 { 566; SM70-LABEL: test_fpext_float( 567; SM70: { 568; SM70-NEXT: .reg .b32 %r<3>; 569; SM70-NEXT: .reg .f32 %f<2>; 570; SM70-EMPTY: 571; SM70-NEXT: // %bb.0: 572; SM70-NEXT: ld.param.u16 %r1, [test_fpext_float_param_0]; 573; SM70-NEXT: shl.b32 %r2, %r1, 16; 574; SM70-NEXT: mov.b32 %f1, %r2; 575; SM70-NEXT: st.param.f32 [func_retval0], %f1; 576; SM70-NEXT: ret; 577; 578; SM80-LABEL: test_fpext_float( 579; SM80: { 580; SM80-NEXT: .reg .b16 %rs<2>; 581; SM80-NEXT: .reg .f32 %f<2>; 582; SM80-EMPTY: 583; SM80-NEXT: // %bb.0: 584; SM80-NEXT: ld.param.b16 %rs1, [test_fpext_float_param_0]; 585; SM80-NEXT: cvt.f32.bf16 %f1, %rs1; 586; SM80-NEXT: st.param.f32 [func_retval0], %f1; 587; SM80-NEXT: ret; 588; 589; SM80-FTZ-LABEL: test_fpext_float( 590; SM80-FTZ: { 591; SM80-FTZ-NEXT: .reg .b16 %rs<2>; 592; SM80-FTZ-NEXT: .reg .f32 %f<2>; 593; SM80-FTZ-EMPTY: 594; SM80-FTZ-NEXT: // %bb.0: 595; 
SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fpext_float_param_0]; 596; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs1; 597; SM80-FTZ-NEXT: st.param.f32 [func_retval0], %f1; 598; SM80-FTZ-NEXT: ret; 599; 600; SM90-LABEL: test_fpext_float( 601; SM90: { 602; SM90-NEXT: .reg .b16 %rs<2>; 603; SM90-NEXT: .reg .f32 %f<2>; 604; SM90-EMPTY: 605; SM90-NEXT: // %bb.0: 606; SM90-NEXT: ld.param.b16 %rs1, [test_fpext_float_param_0]; 607; SM90-NEXT: cvt.f32.bf16 %f1, %rs1; 608; SM90-NEXT: st.param.f32 [func_retval0], %f1; 609; SM90-NEXT: ret; 610 %r = fpext bfloat %a to float 611 ret float %r 612} 613 614define bfloat @test_fptrunc_float(float %a) #0 { 615; SM70-LABEL: test_fptrunc_float( 616; SM70: { 617; SM70-NEXT: .reg .pred %p<2>; 618; SM70-NEXT: .reg .b16 %rs<2>; 619; SM70-NEXT: .reg .b32 %r<7>; 620; SM70-NEXT: .reg .f32 %f<2>; 621; SM70-EMPTY: 622; SM70-NEXT: // %bb.0: 623; SM70-NEXT: ld.param.f32 %f1, [test_fptrunc_float_param_0]; 624; SM70-NEXT: mov.b32 %r1, %f1; 625; SM70-NEXT: bfe.u32 %r2, %r1, 16, 1; 626; SM70-NEXT: add.s32 %r3, %r2, %r1; 627; SM70-NEXT: add.s32 %r4, %r3, 32767; 628; SM70-NEXT: setp.nan.f32 %p1, %f1, %f1; 629; SM70-NEXT: or.b32 %r5, %r1, 4194304; 630; SM70-NEXT: selp.b32 %r6, %r5, %r4, %p1; 631; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r6; } 632; SM70-NEXT: st.param.b16 [func_retval0], %rs1; 633; SM70-NEXT: ret; 634; 635; SM80-LABEL: test_fptrunc_float( 636; SM80: { 637; SM80-NEXT: .reg .b16 %rs<2>; 638; SM80-NEXT: .reg .f32 %f<2>; 639; SM80-EMPTY: 640; SM80-NEXT: // %bb.0: 641; SM80-NEXT: ld.param.f32 %f1, [test_fptrunc_float_param_0]; 642; SM80-NEXT: cvt.rn.bf16.f32 %rs1, %f1; 643; SM80-NEXT: st.param.b16 [func_retval0], %rs1; 644; SM80-NEXT: ret; 645; 646; SM80-FTZ-LABEL: test_fptrunc_float( 647; SM80-FTZ: { 648; SM80-FTZ-NEXT: .reg .b16 %rs<2>; 649; SM80-FTZ-NEXT: .reg .f32 %f<2>; 650; SM80-FTZ-EMPTY: 651; SM80-FTZ-NEXT: // %bb.0: 652; SM80-FTZ-NEXT: ld.param.f32 %f1, [test_fptrunc_float_param_0]; 653; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs1, %f1; 
654; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs1; 655; SM80-FTZ-NEXT: ret; 656; 657; SM90-LABEL: test_fptrunc_float( 658; SM90: { 659; SM90-NEXT: .reg .b16 %rs<2>; 660; SM90-NEXT: .reg .f32 %f<2>; 661; SM90-EMPTY: 662; SM90-NEXT: // %bb.0: 663; SM90-NEXT: ld.param.f32 %f1, [test_fptrunc_float_param_0]; 664; SM90-NEXT: cvt.rn.bf16.f32 %rs1, %f1; 665; SM90-NEXT: st.param.b16 [func_retval0], %rs1; 666; SM90-NEXT: ret; 667 %r = fptrunc float %a to bfloat 668 ret bfloat %r 669} 670 671define bfloat @test_fadd_imm_1(bfloat %a) #0 { 672; SM70-LABEL: test_fadd_imm_1( 673; SM70: { 674; SM70-NEXT: .reg .pred %p<2>; 675; SM70-NEXT: .reg .b16 %rs<2>; 676; SM70-NEXT: .reg .b32 %r<9>; 677; SM70-NEXT: .reg .f32 %f<3>; 678; SM70-EMPTY: 679; SM70-NEXT: // %bb.0: 680; SM70-NEXT: ld.param.u16 %r1, [test_fadd_imm_1_param_0]; 681; SM70-NEXT: shl.b32 %r2, %r1, 16; 682; SM70-NEXT: mov.b32 %f1, %r2; 683; SM70-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; 684; SM70-NEXT: mov.b32 %r3, %f2; 685; SM70-NEXT: bfe.u32 %r4, %r3, 16, 1; 686; SM70-NEXT: add.s32 %r5, %r4, %r3; 687; SM70-NEXT: add.s32 %r6, %r5, 32767; 688; SM70-NEXT: setp.nan.f32 %p1, %f2, %f2; 689; SM70-NEXT: or.b32 %r7, %r3, 4194304; 690; SM70-NEXT: selp.b32 %r8, %r7, %r6, %p1; 691; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r8; } 692; SM70-NEXT: st.param.b16 [func_retval0], %rs1; 693; SM70-NEXT: ret; 694; 695; SM80-LABEL: test_fadd_imm_1( 696; SM80: { 697; SM80-NEXT: .reg .b16 %rs<4>; 698; SM80-EMPTY: 699; SM80-NEXT: // %bb.0: 700; SM80-NEXT: ld.param.b16 %rs1, [test_fadd_imm_1_param_0]; 701; SM80-NEXT: mov.b16 %rs2, 0x3F80; 702; SM80-NEXT: fma.rn.bf16 %rs3, %rs1, %rs2, %rs2; 703; SM80-NEXT: st.param.b16 [func_retval0], %rs3; 704; SM80-NEXT: ret; 705; 706; SM80-FTZ-LABEL: test_fadd_imm_1( 707; SM80-FTZ: { 708; SM80-FTZ-NEXT: .reg .b16 %rs<3>; 709; SM80-FTZ-NEXT: .reg .f32 %f<3>; 710; SM80-FTZ-EMPTY: 711; SM80-FTZ-NEXT: // %bb.0: 712; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fadd_imm_1_param_0]; 713; SM80-FTZ-NEXT: 
cvt.ftz.f32.bf16 %f1, %rs1; 714; SM80-FTZ-NEXT: add.rn.ftz.f32 %f2, %f1, 0f3F800000; 715; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs2, %f2; 716; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2; 717; SM80-FTZ-NEXT: ret; 718; 719; SM90-LABEL: test_fadd_imm_1( 720; SM90: { 721; SM90-NEXT: .reg .b16 %rs<4>; 722; SM90-EMPTY: 723; SM90-NEXT: // %bb.0: 724; SM90-NEXT: ld.param.b16 %rs1, [test_fadd_imm_1_param_0]; 725; SM90-NEXT: mov.b16 %rs2, 0x3F80; 726; SM90-NEXT: add.rn.bf16 %rs3, %rs1, %rs2; 727; SM90-NEXT: st.param.b16 [func_retval0], %rs3; 728; SM90-NEXT: ret; 729 %r = fadd bfloat %a, 1.0 730 ret bfloat %r 731} 732 733define bfloat @test_select_cc_bf16_f64(double %a, double %b, bfloat %c, bfloat %d) #0 { 734; CHECK-LABEL: test_select_cc_bf16_f64( 735; CHECK: { 736; CHECK-NEXT: .reg .pred %p<2>; 737; CHECK-NEXT: .reg .b16 %rs<4>; 738; CHECK-NEXT: .reg .f64 %fd<3>; 739; CHECK-EMPTY: 740; CHECK-NEXT: // %bb.0: 741; CHECK-NEXT: ld.param.f64 %fd1, [test_select_cc_bf16_f64_param_0]; 742; CHECK-NEXT: ld.param.f64 %fd2, [test_select_cc_bf16_f64_param_1]; 743; CHECK-NEXT: setp.lt.f64 %p1, %fd1, %fd2; 744; CHECK-NEXT: ld.param.b16 %rs1, [test_select_cc_bf16_f64_param_2]; 745; CHECK-NEXT: ld.param.b16 %rs2, [test_select_cc_bf16_f64_param_3]; 746; CHECK-NEXT: selp.b16 %rs3, %rs1, %rs2, %p1; 747; CHECK-NEXT: st.param.b16 [func_retval0], %rs3; 748; CHECK-NEXT: ret; 749 %cc = fcmp olt double %a, %b 750 %r = select i1 %cc, bfloat %c, bfloat %d 751 ret bfloat %r 752} 753 754define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 { 755; SM70-LABEL: test_extload_bf16x8( 756; SM70: { 757; SM70-NEXT: .reg .b16 %rs<9>; 758; SM70-NEXT: .reg .b32 %r<21>; 759; SM70-NEXT: .reg .f32 %f<9>; 760; SM70-NEXT: .reg .b64 %rd<2>; 761; SM70-EMPTY: 762; SM70-NEXT: // %bb.0: 763; SM70-NEXT: ld.param.u64 %rd1, [test_extload_bf16x8_param_0]; 764; SM70-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; 765; SM70-NEXT: mov.b32 {%rs1, %rs2}, %r1; 766; SM70-NEXT: mov.b32 {%rs3, %rs4}, %r2; 767; 
SM70-NEXT: mov.b32 {%rs5, %rs6}, %r3; 768; SM70-NEXT: mov.b32 {%rs7, %rs8}, %r4; 769; SM70-NEXT: cvt.u32.u16 %r5, %rs8; 770; SM70-NEXT: shl.b32 %r6, %r5, 16; 771; SM70-NEXT: mov.b32 %f1, %r6; 772; SM70-NEXT: cvt.u32.u16 %r7, %rs7; 773; SM70-NEXT: shl.b32 %r8, %r7, 16; 774; SM70-NEXT: mov.b32 %f2, %r8; 775; SM70-NEXT: cvt.u32.u16 %r9, %rs6; 776; SM70-NEXT: shl.b32 %r10, %r9, 16; 777; SM70-NEXT: mov.b32 %f3, %r10; 778; SM70-NEXT: cvt.u32.u16 %r11, %rs5; 779; SM70-NEXT: shl.b32 %r12, %r11, 16; 780; SM70-NEXT: mov.b32 %f4, %r12; 781; SM70-NEXT: cvt.u32.u16 %r13, %rs4; 782; SM70-NEXT: shl.b32 %r14, %r13, 16; 783; SM70-NEXT: mov.b32 %f5, %r14; 784; SM70-NEXT: cvt.u32.u16 %r15, %rs3; 785; SM70-NEXT: shl.b32 %r16, %r15, 16; 786; SM70-NEXT: mov.b32 %f6, %r16; 787; SM70-NEXT: cvt.u32.u16 %r17, %rs2; 788; SM70-NEXT: shl.b32 %r18, %r17, 16; 789; SM70-NEXT: mov.b32 %f7, %r18; 790; SM70-NEXT: cvt.u32.u16 %r19, %rs1; 791; SM70-NEXT: shl.b32 %r20, %r19, 16; 792; SM70-NEXT: mov.b32 %f8, %r20; 793; SM70-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5}; 794; SM70-NEXT: st.param.v4.f32 [func_retval0+16], {%f4, %f3, %f2, %f1}; 795; SM70-NEXT: ret; 796; 797; SM80-LABEL: test_extload_bf16x8( 798; SM80: { 799; SM80-NEXT: .reg .b16 %rs<9>; 800; SM80-NEXT: .reg .b32 %r<5>; 801; SM80-NEXT: .reg .f32 %f<9>; 802; SM80-NEXT: .reg .b64 %rd<2>; 803; SM80-EMPTY: 804; SM80-NEXT: // %bb.0: 805; SM80-NEXT: ld.param.u64 %rd1, [test_extload_bf16x8_param_0]; 806; SM80-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; 807; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r1; 808; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r2; 809; SM80-NEXT: mov.b32 {%rs5, %rs6}, %r3; 810; SM80-NEXT: mov.b32 {%rs7, %rs8}, %r4; 811; SM80-NEXT: cvt.f32.bf16 %f1, %rs8; 812; SM80-NEXT: cvt.f32.bf16 %f2, %rs7; 813; SM80-NEXT: cvt.f32.bf16 %f3, %rs6; 814; SM80-NEXT: cvt.f32.bf16 %f4, %rs5; 815; SM80-NEXT: cvt.f32.bf16 %f5, %rs4; 816; SM80-NEXT: cvt.f32.bf16 %f6, %rs3; 817; SM80-NEXT: cvt.f32.bf16 %f7, %rs2; 818; SM80-NEXT: cvt.f32.bf16 
%f8, %rs1; 819; SM80-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5}; 820; SM80-NEXT: st.param.v4.f32 [func_retval0+16], {%f4, %f3, %f2, %f1}; 821; SM80-NEXT: ret; 822; 823; SM80-FTZ-LABEL: test_extload_bf16x8( 824; SM80-FTZ: { 825; SM80-FTZ-NEXT: .reg .b16 %rs<9>; 826; SM80-FTZ-NEXT: .reg .b32 %r<5>; 827; SM80-FTZ-NEXT: .reg .f32 %f<9>; 828; SM80-FTZ-NEXT: .reg .b64 %rd<2>; 829; SM80-FTZ-EMPTY: 830; SM80-FTZ-NEXT: // %bb.0: 831; SM80-FTZ-NEXT: ld.param.u64 %rd1, [test_extload_bf16x8_param_0]; 832; SM80-FTZ-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; 833; SM80-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r1; 834; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r2; 835; SM80-FTZ-NEXT: mov.b32 {%rs5, %rs6}, %r3; 836; SM80-FTZ-NEXT: mov.b32 {%rs7, %rs8}, %r4; 837; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs8; 838; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs7; 839; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f3, %rs6; 840; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs5; 841; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs4; 842; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f6, %rs3; 843; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f7, %rs2; 844; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f8, %rs1; 845; SM80-FTZ-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5}; 846; SM80-FTZ-NEXT: st.param.v4.f32 [func_retval0+16], {%f4, %f3, %f2, %f1}; 847; SM80-FTZ-NEXT: ret; 848; 849; SM90-LABEL: test_extload_bf16x8( 850; SM90: { 851; SM90-NEXT: .reg .b16 %rs<9>; 852; SM90-NEXT: .reg .b32 %r<5>; 853; SM90-NEXT: .reg .f32 %f<9>; 854; SM90-NEXT: .reg .b64 %rd<2>; 855; SM90-EMPTY: 856; SM90-NEXT: // %bb.0: 857; SM90-NEXT: ld.param.u64 %rd1, [test_extload_bf16x8_param_0]; 858; SM90-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; 859; SM90-NEXT: mov.b32 {%rs1, %rs2}, %r1; 860; SM90-NEXT: mov.b32 {%rs3, %rs4}, %r2; 861; SM90-NEXT: mov.b32 {%rs5, %rs6}, %r3; 862; SM90-NEXT: mov.b32 {%rs7, %rs8}, %r4; 863; SM90-NEXT: cvt.f32.bf16 %f1, %rs8; 864; SM90-NEXT: cvt.f32.bf16 %f2, %rs7; 865; SM90-NEXT: cvt.f32.bf16 %f3, %rs6; 866; SM90-NEXT: 
cvt.f32.bf16 %f4, %rs5;
; SM90-NEXT: cvt.f32.bf16 %f5, %rs4;
; SM90-NEXT: cvt.f32.bf16 %f6, %rs3;
; SM90-NEXT: cvt.f32.bf16 %f7, %rs2;
; SM90-NEXT: cvt.f32.bf16 %f8, %rs1;
; SM90-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
; SM90-NEXT: st.param.v4.f32 [func_retval0+16], {%f4, %f3, %f2, %f1};
; SM90-NEXT: ret;
  %load = load <8 x bfloat>, ptr addrspace(3) %arg, align 16
  %res = fpext <8 x bfloat> %load to <8 x float>
  ret <8 x float> %res
}

; fptosi bfloat -> i16.
; Expected lowering: sm_70 rebuilds the f32 value with a 16-bit left shift and
; sm_80 uses cvt.f32.bf16 (the .ftz forms in the preserve-sign run), each
; followed by an f32 -> s16 cvt.rzi; sm_90 converts directly with
; cvt.rzi.s16.bf16.
define i16 @test_fptosi_i16(bfloat %a) {
; SM70-LABEL: test_fptosi_i16(
; SM70: {
; SM70-NEXT: .reg .b16 %rs<2>;
; SM70-NEXT: .reg .b32 %r<4>;
; SM70-NEXT: .reg .f32 %f<2>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.u16 %r1, [test_fptosi_i16_param_0];
; SM70-NEXT: shl.b32 %r2, %r1, 16;
; SM70-NEXT: mov.b32 %f1, %r2;
; SM70-NEXT: cvt.rzi.s16.f32 %rs1, %f1;
; SM70-NEXT: cvt.u32.u16 %r3, %rs1;
; SM70-NEXT: st.param.b32 [func_retval0], %r3;
; SM70-NEXT: ret;
;
; SM80-LABEL: test_fptosi_i16(
; SM80: {
; SM80-NEXT: .reg .b16 %rs<3>;
; SM80-NEXT: .reg .b32 %r<2>;
; SM80-NEXT: .reg .f32 %f<2>;
; SM80-EMPTY:
; SM80-NEXT: // %bb.0:
; SM80-NEXT: ld.param.b16 %rs1, [test_fptosi_i16_param_0];
; SM80-NEXT: cvt.f32.bf16 %f1, %rs1;
; SM80-NEXT: cvt.rzi.s16.f32 %rs2, %f1;
; SM80-NEXT: cvt.u32.u16 %r1, %rs2;
; SM80-NEXT: st.param.b32 [func_retval0], %r1;
; SM80-NEXT: ret;
;
; SM80-FTZ-LABEL: test_fptosi_i16(
; SM80-FTZ: {
; SM80-FTZ-NEXT: .reg .b16 %rs<3>;
; SM80-FTZ-NEXT: .reg .b32 %r<2>;
; SM80-FTZ-NEXT: .reg .f32 %f<2>;
; SM80-FTZ-EMPTY:
; SM80-FTZ-NEXT: // %bb.0:
; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fptosi_i16_param_0];
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs1;
; SM80-FTZ-NEXT: cvt.rzi.ftz.s16.f32 %rs2, %f1;
; SM80-FTZ-NEXT: cvt.u32.u16 %r1, %rs2;
; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r1;
; SM80-FTZ-NEXT: ret;
;
; SM90-LABEL: test_fptosi_i16(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<3>;
; SM90-NEXT: .reg .b32 %r<2>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b16 %rs1, [test_fptosi_i16_param_0];
; SM90-NEXT: cvt.rzi.s16.bf16 %rs2, %rs1;
; SM90-NEXT: cvt.u32.u16 %r1, %rs2;
; SM90-NEXT: st.param.b32 [func_retval0], %r1;
; SM90-NEXT: ret;
  %r = fptosi bfloat %a to i16
  ret i16 %r
}

; fptoui bfloat -> i16.
; Unsigned counterpart of test_fptosi_i16: the expectations have the same
; shape with u16 conversions (cvt.rzi.u16.f32 via f32 on sm_70/sm_80,
; cvt.rzi.u16.bf16 on sm_90).
define i16 @test_fptoui_i16(bfloat %a) {
; SM70-LABEL: test_fptoui_i16(
; SM70: {
; SM70-NEXT: .reg .b16 %rs<2>;
; SM70-NEXT: .reg .b32 %r<4>;
; SM70-NEXT: .reg .f32 %f<2>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.u16 %r1, [test_fptoui_i16_param_0];
; SM70-NEXT: shl.b32 %r2, %r1, 16;
; SM70-NEXT: mov.b32 %f1, %r2;
; SM70-NEXT: cvt.rzi.u16.f32 %rs1, %f1;
; SM70-NEXT: cvt.u32.u16 %r3, %rs1;
; SM70-NEXT: st.param.b32 [func_retval0], %r3;
; SM70-NEXT: ret;
;
; SM80-LABEL: test_fptoui_i16(
; SM80: {
; SM80-NEXT: .reg .b16 %rs<3>;
; SM80-NEXT: .reg .b32 %r<2>;
; SM80-NEXT: .reg .f32 %f<2>;
; SM80-EMPTY:
; SM80-NEXT: // %bb.0:
; SM80-NEXT: ld.param.b16 %rs1, [test_fptoui_i16_param_0];
; SM80-NEXT: cvt.f32.bf16 %f1, %rs1;
; SM80-NEXT: cvt.rzi.u16.f32 %rs2, %f1;
; SM80-NEXT: cvt.u32.u16 %r1, %rs2;
; SM80-NEXT: st.param.b32 [func_retval0], %r1;
; SM80-NEXT: ret;
;
; SM80-FTZ-LABEL: test_fptoui_i16(
; SM80-FTZ: {
; SM80-FTZ-NEXT: .reg .b16 %rs<3>;
; SM80-FTZ-NEXT: .reg .b32 %r<2>;
; SM80-FTZ-NEXT: .reg .f32 %f<2>;
; SM80-FTZ-EMPTY:
; SM80-FTZ-NEXT: // %bb.0:
; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fptoui_i16_param_0];
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs1;
; SM80-FTZ-NEXT: cvt.rzi.ftz.u16.f32 %rs2, %f1;
; SM80-FTZ-NEXT: cvt.u32.u16 %r1, %rs2;
; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r1;
; SM80-FTZ-NEXT: ret;
;
; SM90-LABEL: test_fptoui_i16(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<3>;
; SM90-NEXT: .reg .b32 %r<2>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b16 %rs1, [test_fptoui_i16_param_0];
; SM90-NEXT: cvt.rzi.u16.bf16 %rs2, %rs1;
; SM90-NEXT: cvt.u32.u16 %r1, %rs2;
; SM90-NEXT: st.param.b32 [func_retval0], %r1;
; SM90-NEXT: ret;
  %r = fptoui bfloat %a to i16
  ret i16 %r
}

; sitofp i16 -> bfloat.
; Expected lowering: sm_70 converts to f32 and narrows with an integer
; bfe/add/setp.nan/selp sequence; sm_80 converts to f32 and then uses
; cvt.rn.bf16.f32; sm_90 uses a single cvt.rn.bf16.s16.
define bfloat @test_sitofp_i16(i16 %a) {
; SM70-LABEL: test_sitofp_i16(
; SM70: {
; SM70-NEXT: .reg .pred %p<2>;
; SM70-NEXT: .reg .b16 %rs<3>;
; SM70-NEXT: .reg .b32 %r<7>;
; SM70-NEXT: .reg .f32 %f<2>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.u16 %rs1, [test_sitofp_i16_param_0];
; SM70-NEXT: cvt.rn.f32.s16 %f1, %rs1;
; SM70-NEXT: mov.b32 %r1, %f1;
; SM70-NEXT: bfe.u32 %r2, %r1, 16, 1;
; SM70-NEXT: add.s32 %r3, %r2, %r1;
; SM70-NEXT: add.s32 %r4, %r3, 32767;
; SM70-NEXT: setp.nan.f32 %p1, %f1, %f1;
; SM70-NEXT: or.b32 %r5, %r1, 4194304;
; SM70-NEXT: selp.b32 %r6, %r5, %r4, %p1;
; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r6; }
; SM70-NEXT: st.param.b16 [func_retval0], %rs2;
; SM70-NEXT: ret;
;
; SM80-LABEL: test_sitofp_i16(
; SM80: {
; SM80-NEXT: .reg .b16 %rs<3>;
; SM80-NEXT: .reg .f32 %f<2>;
; SM80-EMPTY:
; SM80-NEXT: // %bb.0:
; SM80-NEXT: ld.param.u16 %rs1, [test_sitofp_i16_param_0];
; SM80-NEXT: cvt.rn.f32.s16 %f1, %rs1;
; SM80-NEXT: cvt.rn.bf16.f32 %rs2, %f1;
; SM80-NEXT: st.param.b16 [func_retval0], %rs2;
; SM80-NEXT: ret;
;
; SM80-FTZ-LABEL: test_sitofp_i16(
; SM80-FTZ: {
; SM80-FTZ-NEXT: .reg .b16 %rs<3>;
; SM80-FTZ-NEXT: .reg .f32 %f<2>;
; SM80-FTZ-EMPTY:
; SM80-FTZ-NEXT: // %bb.0:
; SM80-FTZ-NEXT: ld.param.u16 %rs1, [test_sitofp_i16_param_0];
; SM80-FTZ-NEXT: cvt.rn.f32.s16 %f1, %rs1;
; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs2, %f1;
; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2;
; SM80-FTZ-NEXT: ret;
;
; SM90-LABEL: test_sitofp_i16(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.u16 %rs1, [test_sitofp_i16_param_0];
; SM90-NEXT: cvt.rn.bf16.s16 %rs2, %rs1;
; SM90-NEXT: st.param.b16 [func_retval0], %rs2;
; SM90-NEXT: ret;
  %r = sitofp i16 %a to bfloat
  ret bfloat %r
}

; uitofp i8 -> bfloat.
; The argument is loaded with ld.param.u8 into a 16-bit register and converted
; as a u16: through f32 on sm_70/sm_80, directly (cvt.rn.bf16.u16) on sm_90.
define bfloat @test_uitofp_i8(i8 %a) {
; SM70-LABEL: test_uitofp_i8(
; SM70: {
; SM70-NEXT: .reg .pred %p<2>;
; SM70-NEXT: .reg .b16 %rs<3>;
; SM70-NEXT: .reg .b32 %r<7>;
; SM70-NEXT: .reg .f32 %f<2>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.u8 %rs1, [test_uitofp_i8_param_0];
; SM70-NEXT: cvt.rn.f32.u16 %f1, %rs1;
; SM70-NEXT: mov.b32 %r1, %f1;
; SM70-NEXT: bfe.u32 %r2, %r1, 16, 1;
; SM70-NEXT: add.s32 %r3, %r2, %r1;
; SM70-NEXT: add.s32 %r4, %r3, 32767;
; SM70-NEXT: setp.nan.f32 %p1, %f1, %f1;
; SM70-NEXT: or.b32 %r5, %r1, 4194304;
; SM70-NEXT: selp.b32 %r6, %r5, %r4, %p1;
; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r6; }
; SM70-NEXT: st.param.b16 [func_retval0], %rs2;
; SM70-NEXT: ret;
;
; SM80-LABEL: test_uitofp_i8(
; SM80: {
; SM80-NEXT: .reg .b16 %rs<3>;
; SM80-NEXT: .reg .f32 %f<2>;
; SM80-EMPTY:
; SM80-NEXT: // %bb.0:
; SM80-NEXT: ld.param.u8 %rs1, [test_uitofp_i8_param_0];
; SM80-NEXT: cvt.rn.f32.u16 %f1, %rs1;
; SM80-NEXT: cvt.rn.bf16.f32 %rs2, %f1;
; SM80-NEXT: st.param.b16 [func_retval0], %rs2;
; SM80-NEXT: ret;
;
; SM80-FTZ-LABEL: test_uitofp_i8(
; SM80-FTZ: {
; SM80-FTZ-NEXT: .reg .b16 %rs<3>;
; SM80-FTZ-NEXT: .reg .f32 %f<2>;
; SM80-FTZ-EMPTY:
; SM80-FTZ-NEXT: // %bb.0:
; SM80-FTZ-NEXT: ld.param.u8 %rs1, [test_uitofp_i8_param_0];
; SM80-FTZ-NEXT: cvt.rn.f32.u16 %f1, %rs1;
; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs2, %f1;
; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2;
; SM80-FTZ-NEXT: ret;
;
; SM90-LABEL: test_uitofp_i8(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.u8 %rs1, [test_uitofp_i8_param_0];
; SM90-NEXT: cvt.rn.bf16.u16 %rs2, %rs1;
; SM90-NEXT: st.param.b16 [func_retval0], %rs2;
; SM90-NEXT: ret;
  %r = uitofp i8 %a to bfloat
  ret bfloat %r
}

; uitofp i1 -> bfloat.
; Every configuration first masks the loaded byte with 1 and materialises a
; 0/1 u32 through setp.eq/selp.u32; that u32 is then converted through f32 on
; sm_70/sm_80 or directly (cvt.rn.bf16.u32) on sm_90.
define bfloat @test_uitofp_i1(i1 %a) {
; SM70-LABEL: test_uitofp_i1(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<4>;
; SM70-NEXT: .reg .b32 %r<8>;
; SM70-NEXT: .reg .f32 %f<2>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.u8 %rs1, [test_uitofp_i1_param_0];
; SM70-NEXT: and.b16 %rs2, %rs1, 1;
; SM70-NEXT: setp.eq.b16 %p1, %rs2, 1;
; SM70-NEXT: selp.u32 %r1, 1, 0, %p1;
; SM70-NEXT: cvt.rn.f32.u32 %f1, %r1;
; SM70-NEXT: mov.b32 %r2, %f1;
; SM70-NEXT: bfe.u32 %r3, %r2, 16, 1;
; SM70-NEXT: add.s32 %r4, %r3, %r2;
; SM70-NEXT: add.s32 %r5, %r4, 32767;
; SM70-NEXT: setp.nan.f32 %p2, %f1, %f1;
; SM70-NEXT: or.b32 %r6, %r2, 4194304;
; SM70-NEXT: selp.b32 %r7, %r6, %r5, %p2;
; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs3}, %r7; }
; SM70-NEXT: st.param.b16 [func_retval0], %rs3;
; SM70-NEXT: ret;
;
; SM80-LABEL: test_uitofp_i1(
; SM80: {
; SM80-NEXT: .reg .pred %p<2>;
; SM80-NEXT: .reg .b16 %rs<4>;
; SM80-NEXT: .reg .b32 %r<2>;
; SM80-NEXT: .reg .f32 %f<2>;
; SM80-EMPTY:
; SM80-NEXT: // %bb.0:
; SM80-NEXT: ld.param.u8 %rs1, [test_uitofp_i1_param_0];
; SM80-NEXT: and.b16 %rs2, %rs1, 1;
; SM80-NEXT: setp.eq.b16 %p1, %rs2, 1;
; SM80-NEXT: selp.u32 %r1, 1, 0, %p1;
; SM80-NEXT: cvt.rn.f32.u32 %f1, %r1;
; SM80-NEXT: cvt.rn.bf16.f32 %rs3, %f1;
; SM80-NEXT: st.param.b16 [func_retval0], %rs3;
; SM80-NEXT: ret;
;
; SM80-FTZ-LABEL: test_uitofp_i1(
; SM80-FTZ: {
; SM80-FTZ-NEXT: .reg .pred %p<2>;
; SM80-FTZ-NEXT: .reg .b16 %rs<4>;
; SM80-FTZ-NEXT: .reg .b32 %r<2>;
; SM80-FTZ-NEXT: .reg .f32 %f<2>;
; SM80-FTZ-EMPTY:
; SM80-FTZ-NEXT: // %bb.0:
; SM80-FTZ-NEXT: ld.param.u8 %rs1, [test_uitofp_i1_param_0];
; SM80-FTZ-NEXT: and.b16 %rs2, %rs1, 1;
; SM80-FTZ-NEXT: setp.eq.b16 %p1, %rs2, 1;
; SM80-FTZ-NEXT: selp.u32 %r1, 1, 0, %p1;
; SM80-FTZ-NEXT: cvt.rn.f32.u32 %f1, %r1;
; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs3, %f1;
; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3;
; SM80-FTZ-NEXT: ret;
;
; SM90-LABEL: test_uitofp_i1(
; SM90: {
; SM90-NEXT: .reg .pred %p<2>;
; SM90-NEXT: .reg .b16 %rs<4>;
; SM90-NEXT: .reg .b32 %r<2>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.u8 %rs1, [test_uitofp_i1_param_0];
; SM90-NEXT: and.b16 %rs2, %rs1, 1;
; SM90-NEXT: setp.eq.b16 %p1, %rs2, 1;
; SM90-NEXT: selp.u32 %r1, 1, 0, %p1;
; SM90-NEXT: cvt.rn.bf16.u32 %rs3, %r1;
; SM90-NEXT: st.param.b16 [func_retval0], %rs3;
; SM90-NEXT: ret;
  %r = uitofp i1 %a to bfloat
  ret bfloat %r
}

; uitofp i16 -> bfloat.
; Expected lowering: cvt.rn.f32.u16 followed by narrowing on sm_70/sm_80, and
; a single cvt.rn.bf16.u16 on sm_90.
define bfloat @test_uitofp_i16(i16 %a) {
; SM70-LABEL: test_uitofp_i16(
; SM70: {
; SM70-NEXT: .reg .pred %p<2>;
; SM70-NEXT: .reg .b16 %rs<3>;
; SM70-NEXT: .reg .b32 %r<7>;
; SM70-NEXT: .reg .f32 %f<2>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.u16 %rs1, [test_uitofp_i16_param_0];
; SM70-NEXT: cvt.rn.f32.u16 %f1, %rs1;
; SM70-NEXT: mov.b32 %r1, %f1;
; SM70-NEXT: bfe.u32 %r2, %r1, 16, 1;
; SM70-NEXT: add.s32 %r3, %r2, %r1;
; SM70-NEXT: add.s32 %r4, %r3, 32767;
; SM70-NEXT: setp.nan.f32 %p1, %f1, %f1;
; SM70-NEXT: or.b32 %r5, %r1, 4194304;
; SM70-NEXT: selp.b32 %r6, %r5, %r4, %p1;
; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r6; }
; SM70-NEXT: st.param.b16 [func_retval0], %rs2;
; SM70-NEXT: ret;
;
; SM80-LABEL: test_uitofp_i16(
; SM80: {
; SM80-NEXT: .reg .b16 %rs<3>;
; SM80-NEXT: .reg .f32 %f<2>;
; SM80-EMPTY:
; SM80-NEXT: // %bb.0:
; SM80-NEXT: ld.param.u16 %rs1, [test_uitofp_i16_param_0];
; SM80-NEXT: cvt.rn.f32.u16 %f1, %rs1;
; SM80-NEXT: cvt.rn.bf16.f32 %rs2, %f1;
; SM80-NEXT: st.param.b16 [func_retval0], %rs2;
; SM80-NEXT: ret;
;
; SM80-FTZ-LABEL: test_uitofp_i16(
; SM80-FTZ: {
; SM80-FTZ-NEXT: .reg .b16 %rs<3>;
; SM80-FTZ-NEXT: .reg .f32 %f<2>;
; SM80-FTZ-EMPTY:
; SM80-FTZ-NEXT: // %bb.0:
; SM80-FTZ-NEXT: ld.param.u16 %rs1, [test_uitofp_i16_param_0];
; SM80-FTZ-NEXT: cvt.rn.f32.u16 %f1, %rs1;
; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs2, %f1;
; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2;
; SM80-FTZ-NEXT: ret;
;
; SM90-LABEL: test_uitofp_i16(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.u16 %rs1, [test_uitofp_i16_param_0];
; SM90-NEXT: cvt.rn.bf16.u16 %rs2, %rs1;
; SM90-NEXT: st.param.b16 [func_retval0], %rs2;
; SM90-NEXT: ret;
  %r = uitofp i16 %a to bfloat
  ret bfloat %r
}

; uitofp i32 -> bfloat.
; Expected lowering: cvt.rn.f32.u32 followed by narrowing on sm_70/sm_80, and
; a single cvt.rn.bf16.u32 on sm_90.
define bfloat @test_uitofp_i32(i32 %a) {
; SM70-LABEL: test_uitofp_i32(
; SM70: {
; SM70-NEXT: .reg .pred %p<2>;
; SM70-NEXT: .reg .b16 %rs<2>;
; SM70-NEXT: .reg .b32 %r<8>;
; SM70-NEXT: .reg .f32 %f<2>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.u32 %r1, [test_uitofp_i32_param_0];
; SM70-NEXT: cvt.rn.f32.u32 %f1, %r1;
; SM70-NEXT: mov.b32 %r2, %f1;
; SM70-NEXT: bfe.u32 %r3, %r2, 16, 1;
; SM70-NEXT: add.s32 %r4, %r3, %r2;
; SM70-NEXT: add.s32 %r5, %r4, 32767;
; SM70-NEXT: setp.nan.f32 %p1, %f1, %f1;
; SM70-NEXT: or.b32 %r6, %r2, 4194304;
; SM70-NEXT: selp.b32 %r7, %r6, %r5, %p1;
; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r7; }
; SM70-NEXT: st.param.b16 [func_retval0], %rs1;
; SM70-NEXT: ret;
;
; SM80-LABEL: test_uitofp_i32(
; SM80: {
; SM80-NEXT: .reg .b16 %rs<2>;
; SM80-NEXT: .reg .b32 %r<2>;
; SM80-NEXT: .reg .f32 %f<2>;
; SM80-EMPTY:
; SM80-NEXT: // %bb.0:
; SM80-NEXT: ld.param.u32 %r1, [test_uitofp_i32_param_0];
; SM80-NEXT: cvt.rn.f32.u32 %f1, %r1;
; SM80-NEXT: cvt.rn.bf16.f32 %rs1, %f1;
; SM80-NEXT: st.param.b16 [func_retval0], %rs1;
; SM80-NEXT: ret;
;
; SM80-FTZ-LABEL: test_uitofp_i32(
; SM80-FTZ: {
; SM80-FTZ-NEXT: .reg .b16 %rs<2>;
; SM80-FTZ-NEXT: .reg .b32 %r<2>;
; SM80-FTZ-NEXT: .reg .f32 %f<2>;
; SM80-FTZ-EMPTY:
; SM80-FTZ-NEXT: // %bb.0:
; SM80-FTZ-NEXT: ld.param.u32 %r1, [test_uitofp_i32_param_0];
; SM80-FTZ-NEXT: cvt.rn.f32.u32 %f1, %r1;
; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs1, %f1;
; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs1;
; SM80-FTZ-NEXT: ret;
;
; SM90-LABEL: test_uitofp_i32(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<2>;
; SM90-NEXT: .reg .b32 %r<2>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.u32 %r1, [test_uitofp_i32_param_0];
; SM90-NEXT: cvt.rn.bf16.u32 %rs1, %r1;
; SM90-NEXT: st.param.b16 [func_retval0], %rs1;
; SM90-NEXT: ret;
  %r = uitofp i32 %a to bfloat
  ret bfloat %r
}

; uitofp i64 -> bfloat.
; Expected lowering: cvt.rn.f32.u64 followed by narrowing on sm_70/sm_80, and
; a single cvt.rn.bf16.u64 on sm_90.
define bfloat @test_uitofp_i64(i64 %a) {
; SM70-LABEL: test_uitofp_i64(
; SM70: {
; SM70-NEXT: .reg .pred %p<2>;
; SM70-NEXT: .reg .b16 %rs<2>;
; SM70-NEXT: .reg .b32 %r<7>;
; SM70-NEXT: .reg .f32 %f<2>;
; SM70-NEXT: .reg .b64 %rd<2>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.u64 %rd1, [test_uitofp_i64_param_0];
; SM70-NEXT: cvt.rn.f32.u64 %f1, %rd1;
; SM70-NEXT: mov.b32 %r1, %f1;
; SM70-NEXT: bfe.u32 %r2, %r1, 16, 1;
; SM70-NEXT: add.s32 %r3, %r2, %r1;
; SM70-NEXT: add.s32 %r4, %r3, 32767;
; SM70-NEXT: setp.nan.f32 %p1, %f1, %f1;
; SM70-NEXT: or.b32 %r5, %r1, 4194304;
; SM70-NEXT: selp.b32 %r6, %r5, %r4, %p1;
; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r6; }
; SM70-NEXT: st.param.b16 [func_retval0], %rs1;
; SM70-NEXT: ret;
;
; SM80-LABEL: test_uitofp_i64(
; SM80: {
; SM80-NEXT: .reg .b16 %rs<2>;
; SM80-NEXT: .reg .f32 %f<2>;
; SM80-NEXT: .reg .b64 %rd<2>;
; SM80-EMPTY:
; SM80-NEXT: // %bb.0:
; SM80-NEXT: ld.param.u64 %rd1, [test_uitofp_i64_param_0];
; SM80-NEXT: cvt.rn.f32.u64 %f1, %rd1;
; SM80-NEXT: cvt.rn.bf16.f32 %rs1, %f1;
; SM80-NEXT: st.param.b16 [func_retval0], %rs1;
; SM80-NEXT: ret;
;
; SM80-FTZ-LABEL: test_uitofp_i64(
; SM80-FTZ: {
; SM80-FTZ-NEXT: .reg .b16 %rs<2>;
; SM80-FTZ-NEXT: .reg .f32 %f<2>;
; SM80-FTZ-NEXT: .reg .b64 %rd<2>;
; SM80-FTZ-EMPTY:
; SM80-FTZ-NEXT: // %bb.0:
; SM80-FTZ-NEXT: ld.param.u64 %rd1, [test_uitofp_i64_param_0];
; SM80-FTZ-NEXT: cvt.rn.f32.u64 %f1, %rd1;
; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs1, %f1;
; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs1;
; SM80-FTZ-NEXT: ret;
;
; SM90-LABEL: test_uitofp_i64(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<2>;
; SM90-NEXT: .reg .b64 %rd<2>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.u64 %rd1, [test_uitofp_i64_param_0];
; SM90-NEXT: cvt.rn.bf16.u64 %rs1, %rd1;
; SM90-NEXT: st.param.b16 [func_retval0], %rs1;
; SM90-NEXT: ret;
  %r = uitofp i64 %a to bfloat
  ret bfloat %r
}

; llvm.roundeven on bfloat.
; Expected lowering: sm_70/sm_80 round in f32 with cvt.rni.f32.f32 and narrow
; the result back; sm_90 uses a single cvt.rni.bf16.bf16.
define bfloat @test_roundeven(bfloat %a) {
; SM70-LABEL: test_roundeven(
; SM70: {
; SM70-NEXT: .reg .pred %p<2>;
; SM70-NEXT: .reg .b16 %rs<2>;
; SM70-NEXT: .reg .b32 %r<9>;
; SM70-NEXT: .reg .f32 %f<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.u16 %r1, [test_roundeven_param_0];
; SM70-NEXT: shl.b32 %r2, %r1, 16;
; SM70-NEXT: mov.b32 %f1, %r2;
; SM70-NEXT: cvt.rni.f32.f32 %f2, %f1;
; SM70-NEXT: mov.b32 %r3, %f2;
; SM70-NEXT: bfe.u32 %r4, %r3, 16, 1;
; SM70-NEXT: add.s32 %r5, %r4, %r3;
; SM70-NEXT: add.s32 %r6, %r5, 32767;
; SM70-NEXT: setp.nan.f32 %p1, %f2, %f2;
; SM70-NEXT: or.b32 %r7, %r3, 4194304;
; SM70-NEXT: selp.b32 %r8, %r7, %r6, %p1;
; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r8; }
; SM70-NEXT: st.param.b16 [func_retval0], %rs1;
; SM70-NEXT: ret;
;
; SM80-LABEL: test_roundeven(
; SM80: {
; SM80-NEXT: .reg .b16 %rs<3>;
; SM80-NEXT: .reg .f32 %f<3>;
; SM80-EMPTY:
; SM80-NEXT: // %bb.0:
; SM80-NEXT: ld.param.b16 %rs1, [test_roundeven_param_0];
; SM80-NEXT: cvt.f32.bf16 %f1, %rs1;
; SM80-NEXT: cvt.rni.f32.f32 %f2, %f1;
; SM80-NEXT: cvt.rn.bf16.f32 %rs2, %f2;
; SM80-NEXT: st.param.b16 [func_retval0], %rs2;
; SM80-NEXT: ret;
;
; SM80-FTZ-LABEL: test_roundeven(
; SM80-FTZ: {
; SM80-FTZ-NEXT: .reg .b16 %rs<3>;
; SM80-FTZ-NEXT: .reg .f32 %f<3>;
; SM80-FTZ-EMPTY:
; SM80-FTZ-NEXT: // %bb.0:
; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_roundeven_param_0];
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs1;
; SM80-FTZ-NEXT: cvt.rni.ftz.f32.f32 %f2, %f1;
; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs2, %f2;
; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2;
; SM80-FTZ-NEXT: ret;
;
; SM90-LABEL: test_roundeven(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b16 %rs1, [test_roundeven_param_0];
; SM90-NEXT: cvt.rni.bf16.bf16 %rs2, %rs1;
; SM90-NEXT: st.param.b16 [func_retval0], %rs2;
; SM90-NEXT: ret;
  %r = call bfloat @llvm.roundeven.bf16(bfloat %a)
  ret bfloat %r
}

; llvm.maximum on scalar bfloat.
; Expected lowering: sm_70 expands to an f32 compare plus a chain of selp.b16
; selects (including one that picks the constant 0x7FC0 under setp.nan);
; sm_80 and sm_90 use a single max.NaN.bf16.
define bfloat @test_maximum(bfloat %a, bfloat %b) {
; SM70-LABEL: test_maximum(
; SM70: {
; SM70-NEXT: .reg .pred %p<6>;
; SM70-NEXT: .reg .b16 %rs<8>;
; SM70-NEXT: .reg .b32 %r<7>;
; SM70-NEXT: .reg .f32 %f<4>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b16 %rs1, [test_maximum_param_0];
; SM70-NEXT: ld.param.b16 %rs2, [test_maximum_param_1];
; SM70-NEXT: cvt.u32.u16 %r1, %rs2;
; SM70-NEXT: shl.b32 %r2, %r1, 16;
; SM70-NEXT: mov.b32 %f1, %r2;
; SM70-NEXT: cvt.u32.u16 %r3, %rs1;
; SM70-NEXT: shl.b32 %r4, %r3, 16;
; SM70-NEXT: mov.b32 %f2, %r4;
; SM70-NEXT: setp.gt.f32 %p1, %f2, %f1;
; SM70-NEXT: selp.b16 %rs3, %rs1, %rs2, %p1;
; SM70-NEXT: setp.nan.f32 %p2, %f2, %f1;
; SM70-NEXT: selp.b16 %rs4, 0x7FC0, %rs3, %p2;
; SM70-NEXT: setp.eq.s16 %p3, %rs1, 0;
; SM70-NEXT: selp.b16 %rs5, %rs1, %rs4, %p3;
; SM70-NEXT: setp.eq.s16 %p4, %rs2, 0;
; SM70-NEXT: selp.b16 %rs6, %rs2, %rs5, %p4;
; SM70-NEXT: cvt.u32.u16 %r5, %rs4;
; SM70-NEXT: shl.b32 %r6, %r5, 16;
; SM70-NEXT: mov.b32 %f3, %r6;
; SM70-NEXT: setp.eq.f32 %p5, %f3, 0f00000000;
; SM70-NEXT: selp.b16 %rs7, %rs6, %rs4, %p5;
; SM70-NEXT: st.param.b16 [func_retval0], %rs7;
; SM70-NEXT: ret;
;
; SM80-LABEL: test_maximum(
; SM80: {
; SM80-NEXT: .reg .b16 %rs<4>;
; SM80-EMPTY:
; SM80-NEXT: // %bb.0:
; SM80-NEXT: ld.param.b16 %rs1, [test_maximum_param_0];
; SM80-NEXT: ld.param.b16 %rs2, [test_maximum_param_1];
; SM80-NEXT: max.NaN.bf16 %rs3, %rs1, %rs2;
; SM80-NEXT: st.param.b16 [func_retval0], %rs3;
; SM80-NEXT: ret;
;
; SM80-FTZ-LABEL: test_maximum(
; SM80-FTZ: {
; SM80-FTZ-NEXT: .reg .b16 %rs<4>;
; SM80-FTZ-EMPTY:
; SM80-FTZ-NEXT: // %bb.0:
; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_maximum_param_0];
; SM80-FTZ-NEXT: ld.param.b16 %rs2, [test_maximum_param_1];
; SM80-FTZ-NEXT: max.NaN.bf16 %rs3, %rs1, %rs2;
; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3;
; SM80-FTZ-NEXT: ret;
;
; SM90-LABEL: test_maximum(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<4>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b16 %rs1, [test_maximum_param_0];
; SM90-NEXT: ld.param.b16 %rs2, [test_maximum_param_1];
; SM90-NEXT: max.NaN.bf16 %rs3, %rs1, %rs2;
; SM90-NEXT: st.param.b16 [func_retval0], %rs3;
; SM90-NEXT: ret;
  %r = call bfloat @llvm.maximum.bf16(bfloat %a, bfloat %b)
  ret bfloat %r
}

; llvm.maxnum on scalar bfloat.
; Expected lowering: sm_70 widens both operands to f32, applies max.f32 and
; narrows the result; sm_80 and sm_90 use a single max.bf16.
define bfloat @test_maxnum(bfloat %a, bfloat %b) {
; SM70-LABEL: test_maxnum(
; SM70: {
; SM70-NEXT: .reg .pred %p<2>;
; SM70-NEXT: .reg .b16 %rs<2>;
; SM70-NEXT: .reg .b32 %r<11>;
; SM70-NEXT: .reg .f32 %f<4>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.u16 %r1, [test_maxnum_param_1];
; SM70-NEXT: shl.b32 %r2, %r1, 16;
; SM70-NEXT: mov.b32 %f1, %r2;
; SM70-NEXT: ld.param.u16 %r3, [test_maxnum_param_0];
; SM70-NEXT: shl.b32 %r4, %r3, 16;
; SM70-NEXT: mov.b32 %f2, %r4;
; SM70-NEXT: max.f32 %f3, %f2, %f1;
; SM70-NEXT: mov.b32 %r5, %f3;
; SM70-NEXT: bfe.u32 %r6, %r5, 16, 1;
; SM70-NEXT: add.s32 %r7, %r6, %r5;
; SM70-NEXT: add.s32 %r8, %r7, 32767;
; SM70-NEXT: setp.nan.f32 %p1, %f3, %f3;
; SM70-NEXT: or.b32 %r9, %r5, 4194304;
; SM70-NEXT: selp.b32 %r10, %r9, %r8, %p1;
; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r10; }
; SM70-NEXT: st.param.b16 [func_retval0], %rs1;
; SM70-NEXT: ret;
;
; SM80-LABEL: test_maxnum(
; SM80: {
; SM80-NEXT: .reg .b16 %rs<4>;
; SM80-EMPTY:
; SM80-NEXT: // %bb.0:
; SM80-NEXT: ld.param.b16 %rs1, [test_maxnum_param_0];
; SM80-NEXT: ld.param.b16 %rs2, [test_maxnum_param_1];
; SM80-NEXT: max.bf16 %rs3, %rs1, %rs2;
; SM80-NEXT: st.param.b16 [func_retval0], %rs3;
; SM80-NEXT: ret;
;
; SM80-FTZ-LABEL: test_maxnum(
; SM80-FTZ: {
; SM80-FTZ-NEXT: .reg .b16 %rs<4>;
; SM80-FTZ-EMPTY:
; SM80-FTZ-NEXT: // %bb.0:
; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_maxnum_param_0];
; SM80-FTZ-NEXT: ld.param.b16 %rs2, [test_maxnum_param_1];
; SM80-FTZ-NEXT: max.bf16 %rs3, %rs1, %rs2;
; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3;
; SM80-FTZ-NEXT: ret;
;
; SM90-LABEL: test_maxnum(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<4>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b16 %rs1, [test_maxnum_param_0];
; SM90-NEXT: ld.param.b16 %rs2, [test_maxnum_param_1];
; SM90-NEXT: max.bf16 %rs3, %rs1, %rs2;
; SM90-NEXT: st.param.b16 [func_retval0], %rs3;
; SM90-NEXT: ret;
  %r = call bfloat @llvm.maxnum.bf16(bfloat %a, bfloat %b)
  ret bfloat %r
}

; llvm.maximum on <2 x bfloat>.
; Expected lowering: sm_70 unpacks both lanes and repeats the scalar
; compare/select expansion per lane before repacking; sm_80 and sm_90 use a
; single max.NaN.bf16x2.
; The intrinsic is spelled with the v2bf16 overload suffix so that its name
; matches its <2 x bfloat> operand type, as test_maxnum_v2 does.
define <2 x bfloat> @test_maximum_v2(<2 x bfloat> %a, <2 x bfloat> %b) {
; SM70-LABEL: test_maximum_v2(
; SM70: {
; SM70-NEXT: .reg .pred %p<11>;
; SM70-NEXT: .reg .b16 %rs<15>;
; SM70-NEXT: .reg .b32 %r<16>;
; SM70-NEXT: .reg .f32 %f<7>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b32 %r1, [test_maximum_v2_param_0];
; SM70-NEXT: ld.param.b32 %r2, [test_maximum_v2_param_1];
; SM70-NEXT: mov.b32 {%rs1, %rs2}, %r2;
; SM70-NEXT: cvt.u32.u16 %r3, %rs2;
; SM70-NEXT: shl.b32 %r4, %r3, 16;
; SM70-NEXT: mov.b32 %f1, %r4;
; SM70-NEXT: mov.b32 {%rs3, %rs4}, %r1;
; SM70-NEXT: cvt.u32.u16 %r5, %rs4;
; SM70-NEXT: shl.b32 %r6, %r5, 16;
; SM70-NEXT: mov.b32 %f2, %r6;
; SM70-NEXT: setp.gt.f32 %p1, %f2, %f1;
; SM70-NEXT: selp.b16 %rs5, %rs4, %rs2, %p1;
; SM70-NEXT: setp.nan.f32 %p2, %f2, %f1;
; SM70-NEXT: selp.b16 %rs6, 0x7FC0, %rs5, %p2;
; SM70-NEXT: setp.eq.s16 %p3, %rs4, 0;
; SM70-NEXT: selp.b16 %rs7, %rs4, %rs6, %p3;
; SM70-NEXT: setp.eq.s16 %p4, %rs2, 0;
; SM70-NEXT: selp.b16 %rs8, %rs2, %rs7, %p4;
; SM70-NEXT: cvt.u32.u16 %r7, %rs6;
; SM70-NEXT: shl.b32 %r8, %r7, 16;
; SM70-NEXT: mov.b32 %f3, %r8;
; SM70-NEXT: setp.eq.f32 %p5, %f3, 0f00000000;
; SM70-NEXT: selp.b16 %rs9, %rs8, %rs6, %p5;
; SM70-NEXT: cvt.u32.u16 %r9, %rs1;
; SM70-NEXT: shl.b32 %r10, %r9, 16;
; SM70-NEXT: mov.b32 %f4, %r10;
; SM70-NEXT: cvt.u32.u16 %r11, %rs3;
; SM70-NEXT: shl.b32 %r12, %r11, 16;
; SM70-NEXT: mov.b32 %f5, %r12;
; SM70-NEXT: setp.gt.f32 %p6, %f5, %f4;
; SM70-NEXT: selp.b16 %rs10, %rs3, %rs1, %p6;
; SM70-NEXT: setp.nan.f32 %p7, %f5, %f4;
; SM70-NEXT: selp.b16 %rs11, 0x7FC0, %rs10, %p7;
; SM70-NEXT: setp.eq.s16 %p8, %rs3, 0;
; SM70-NEXT: selp.b16 %rs12, %rs3, %rs11, %p8;
; SM70-NEXT: setp.eq.s16 %p9, %rs1, 0;
; SM70-NEXT: selp.b16 %rs13, %rs1, %rs12, %p9;
; SM70-NEXT: cvt.u32.u16 %r13, %rs11;
; SM70-NEXT: shl.b32 %r14, %r13, 16;
; SM70-NEXT: mov.b32 %f6, %r14;
; SM70-NEXT: setp.eq.f32 %p10, %f6, 0f00000000;
; SM70-NEXT: selp.b16 %rs14, %rs13, %rs11, %p10;
; SM70-NEXT: mov.b32 %r15, {%rs14, %rs9};
; SM70-NEXT: st.param.b32 [func_retval0], %r15;
; SM70-NEXT: ret;
;
; SM80-LABEL: test_maximum_v2(
; SM80: {
; SM80-NEXT: .reg .b32 %r<4>;
; SM80-EMPTY:
; SM80-NEXT: // %bb.0:
; SM80-NEXT: ld.param.b32 %r1, [test_maximum_v2_param_1];
; SM80-NEXT: ld.param.b32 %r2, [test_maximum_v2_param_0];
; SM80-NEXT: max.NaN.bf16x2 %r3, %r2, %r1;
; SM80-NEXT: st.param.b32 [func_retval0], %r3;
; SM80-NEXT: ret;
;
; SM80-FTZ-LABEL: test_maximum_v2(
; SM80-FTZ: {
; SM80-FTZ-NEXT: .reg .b32 %r<4>;
; SM80-FTZ-EMPTY:
; SM80-FTZ-NEXT: // %bb.0:
; SM80-FTZ-NEXT: ld.param.b32 %r1, [test_maximum_v2_param_1];
; SM80-FTZ-NEXT: ld.param.b32 %r2, [test_maximum_v2_param_0];
; SM80-FTZ-NEXT: max.NaN.bf16x2 %r3, %r2, %r1;
; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3;
; SM80-FTZ-NEXT: ret;
;
; SM90-LABEL: test_maximum_v2(
; SM90: {
; SM90-NEXT: .reg .b32 %r<4>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b32 %r1, [test_maximum_v2_param_1];
; SM90-NEXT: ld.param.b32 %r2, [test_maximum_v2_param_0];
; SM90-NEXT: max.NaN.bf16x2 %r3, %r2, %r1;
; SM90-NEXT: st.param.b32 [func_retval0], %r3;
; SM90-NEXT: ret;
  %r = call <2 x bfloat> @llvm.maximum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
  ret <2 x bfloat> %r
}

; llvm.maxnum on <2 x bfloat>.
; Expected lowering: sm_70 unpacks both lanes, applies max.f32 and the f32 ->
; bf16 narrowing per lane, then combines the two results with prmt.b32;
; sm_80 and sm_90 use a single max.bf16x2.
define <2 x bfloat> @test_maxnum_v2(<2 x bfloat> %a, <2 x bfloat> %b) {
; SM70-LABEL: test_maxnum_v2(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<5>;
; SM70-NEXT: .reg .b32 %r<24>;
; SM70-NEXT: .reg .f32 %f<7>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b32 %r1, [test_maxnum_v2_param_0];
; SM70-NEXT: ld.param.b32 %r2, [test_maxnum_v2_param_1];
; SM70-NEXT: mov.b32 {%rs1, %rs2}, %r2;
; SM70-NEXT: cvt.u32.u16 %r3, %rs2;
; SM70-NEXT: shl.b32 %r4, %r3, 16;
; SM70-NEXT: mov.b32 %f1, %r4;
; SM70-NEXT: mov.b32 {%rs3, %rs4}, %r1;
; SM70-NEXT: cvt.u32.u16 %r5, %rs4;
; SM70-NEXT: shl.b32 %r6, %r5, 16;
; SM70-NEXT: mov.b32 %f2, %r6;
; SM70-NEXT: max.f32 %f3, %f2, %f1;
; SM70-NEXT: mov.b32 %r7, %f3;
; SM70-NEXT: bfe.u32 %r8, %r7, 16, 1;
; SM70-NEXT: add.s32 %r9, %r8, %r7;
; SM70-NEXT: add.s32 %r10, %r9, 32767;
; SM70-NEXT: setp.nan.f32 %p1, %f3, %f3;
; SM70-NEXT: or.b32 %r11, %r7, 4194304;
; SM70-NEXT: selp.b32 %r12, %r11, %r10, %p1;
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
; SM70-NEXT: shl.b32 %r14, %r13, 16;
; SM70-NEXT: mov.b32 %f4, %r14;
; SM70-NEXT: cvt.u32.u16 %r15, %rs3;
; SM70-NEXT: shl.b32 %r16, %r15, 16;
; SM70-NEXT: mov.b32 %f5, %r16;
; SM70-NEXT: max.f32 %f6, %f5, %f4;
; SM70-NEXT: mov.b32 %r17, %f6;
; SM70-NEXT: bfe.u32 %r18, %r17, 16, 1;
; SM70-NEXT: add.s32 %r19, %r18, %r17;
; SM70-NEXT: add.s32 %r20, %r19, 32767;
; SM70-NEXT: setp.nan.f32 %p2, %f6, %f6;
; SM70-NEXT: or.b32 %r21, %r17, 4194304;
; SM70-NEXT: selp.b32 %r22, %r21, %r20, %p2;
; SM70-NEXT: prmt.b32 %r23, %r22, %r12, 0x7632U;
; SM70-NEXT: st.param.b32 [func_retval0], %r23;
; SM70-NEXT: ret;
;
; SM80-LABEL: test_maxnum_v2(
; SM80: {
; SM80-NEXT: .reg .b32 %r<4>;
; SM80-EMPTY:
; SM80-NEXT: // %bb.0:
; SM80-NEXT: ld.param.b32 %r1, [test_maxnum_v2_param_1];
; SM80-NEXT: ld.param.b32 %r2, [test_maxnum_v2_param_0];
; SM80-NEXT: max.bf16x2 %r3, %r2, %r1;
; SM80-NEXT: st.param.b32 [func_retval0], %r3;
; SM80-NEXT: ret;
;
; SM80-FTZ-LABEL: test_maxnum_v2(
; SM80-FTZ: {
; SM80-FTZ-NEXT: .reg .b32 %r<4>;
; SM80-FTZ-EMPTY:
; SM80-FTZ-NEXT: // %bb.0:
; SM80-FTZ-NEXT: ld.param.b32 %r1, [test_maxnum_v2_param_1];
; SM80-FTZ-NEXT: ld.param.b32 %r2, [test_maxnum_v2_param_0];
; SM80-FTZ-NEXT: max.bf16x2 %r3, %r2, %r1;
; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3;
; SM80-FTZ-NEXT: ret;
;
; SM90-LABEL: test_maxnum_v2(
; SM90: {
; SM90-NEXT: .reg .b32 %r<4>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b32 %r1, [test_maxnum_v2_param_1];
; SM90-NEXT: ld.param.b32 %r2, [test_maxnum_v2_param_0];
; SM90-NEXT: max.bf16x2 %r3, %r2, %r1;
; SM90-NEXT: st.param.b32 [func_retval0], %r3;
; SM90-NEXT: ret;
  %r = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
  ret <2 x bfloat> %r
}