; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc < %s -mtriple=nvptx -mcpu=sm_90 -mattr=+ptx78 | FileCheck %s --check-prefixes=CHECK
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | FileCheck %s --check-prefixes=CHECK64
; RUN: llc < %s -mtriple=nvptx -mcpu=sm_86 -mattr=+ptx71 | FileCheck %s --check-prefixes=CHECKPTX71
; RUN: %if ptxas-11.8 && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %}
; RUN: %if ptxas-11.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %}
; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_86 -mattr=+ptx71 | %ptxas-verify -arch=sm_86 %}

; Lowering of `atomicrmw fadd` with a bfloat operand in the NVPTX backend.
;
; Three llc configurations are checked, each with its own FileCheck prefix:
;   * nvptx   / sm_90 / ptx78 (prefix CHECK)      - a single atom.add.noftz.bf16
;     per atomicrmw, using 32-bit address registers.
;   * nvptx64 / sm_90 / ptx78 (prefix CHECK64)    - the same instructions with
;     64-bit address registers.
;   * nvptx   / sm_86 / ptx71 (prefix CHECKPTX71) - no bf16 atom is expected;
;     every atomicrmw becomes a compare-and-swap loop on the aligned 32-bit
;     word that contains the bf16 (address masked with -4, value shifted and
;     masked into place, atom.cas.b32 retried until it succeeds), with the
;     addition itself done by fma.rn.bf16 using a 1.0 multiplier.
;
; The ptxas-verify steps are optional and only execute when the matching lit
; features are available. The sm_90 ones require the ptxas-11.8 feature so that
; an older ptxas, which does not accept sm_90, skips them instead of failing.
; The 32-bit ones are skipped whenever the ptxas-12.0 feature is set.
;
; The assertions inside @test are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.

target triple = "nvptx64-nvidia-cuda"

define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat %val) {
; CHECK-LABEL: test(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<7>;
; CHECK-NEXT: .reg .b32 %r<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u32 %r1, [test_param_0];
; CHECK-NEXT: ld.param.b16 %rs1, [test_param_3];
; CHECK-NEXT: atom.add.noftz.bf16 %rs2, [%r1], %rs1;
; CHECK-NEXT: ld.param.u32 %r2, [test_param_1];
; CHECK-NEXT: mov.b16 %rs3, 0x3F80;
; CHECK-NEXT: atom.add.noftz.bf16 %rs4, [%r1], %rs3;
; CHECK-NEXT: ld.param.u32 %r3, [test_param_2];
; CHECK-NEXT: atom.global.add.noftz.bf16 %rs5, [%r2], %rs1;
; CHECK-NEXT: atom.shared.add.noftz.bf16 %rs6, [%r3], %rs1;
; CHECK-NEXT: ret;
;
; CHECK64-LABEL: test(
; CHECK64: {
; CHECK64-NEXT: .reg .b16 %rs<7>;
; CHECK64-NEXT: .reg .b64 %rd<4>;
; CHECK64-EMPTY:
; CHECK64-NEXT: // %bb.0:
; CHECK64-NEXT: ld.param.u64 %rd1, [test_param_0];
; CHECK64-NEXT: ld.param.b16 %rs1, [test_param_3];
; CHECK64-NEXT: atom.add.noftz.bf16 %rs2, [%rd1], %rs1;
; CHECK64-NEXT: ld.param.u64 %rd2, [test_param_1];
; CHECK64-NEXT: mov.b16 %rs3, 0x3F80;
; CHECK64-NEXT: atom.add.noftz.bf16 %rs4, [%rd1], %rs3;
; CHECK64-NEXT: ld.param.u64 %rd3, [test_param_2];
; CHECK64-NEXT: atom.global.add.noftz.bf16 %rs5, [%rd2], %rs1;
; CHECK64-NEXT: atom.shared.add.noftz.bf16 %rs6, [%rd3], %rs1;
; CHECK64-NEXT: ret;
;
; CHECKPTX71-LABEL: test(
; CHECKPTX71: {
; CHECKPTX71-NEXT: .reg .pred %p<5>;
; CHECKPTX71-NEXT: .reg .b16 %rs<14>;
; CHECKPTX71-NEXT: .reg .b32 %r<58>;
; CHECKPTX71-EMPTY:
; CHECKPTX71-NEXT: // %bb.0:
; CHECKPTX71-NEXT: ld.param.b16 %rs1, [test_param_3];
; CHECKPTX71-NEXT: ld.param.u32 %r23, [test_param_2];
; CHECKPTX71-NEXT: ld.param.u32 %r22, [test_param_1];
; CHECKPTX71-NEXT: ld.param.u32 %r24, [test_param_0];
; CHECKPTX71-NEXT: and.b32 %r1, %r24, -4;
; CHECKPTX71-NEXT: and.b32 %r25, %r24, 3;
; CHECKPTX71-NEXT: shl.b32 %r2, %r25, 3;
; CHECKPTX71-NEXT: mov.b32 %r26, 65535;
; CHECKPTX71-NEXT: shl.b32 %r27, %r26, %r2;
; CHECKPTX71-NEXT: not.b32 %r3, %r27;
; CHECKPTX71-NEXT: ld.u32 %r54, [%r1];
; CHECKPTX71-NEXT: $L__BB0_1: // %atomicrmw.start45
; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
; CHECKPTX71-NEXT: shr.u32 %r28, %r54, %r2;
; CHECKPTX71-NEXT: cvt.u16.u32 %rs2, %r28;
; CHECKPTX71-NEXT: mov.b16 %rs3, 0x3F80;
; CHECKPTX71-NEXT: fma.rn.bf16 %rs4, %rs2, %rs3, %rs1;
; CHECKPTX71-NEXT: cvt.u32.u16 %r29, %rs4;
; CHECKPTX71-NEXT: shl.b32 %r30, %r29, %r2;
; CHECKPTX71-NEXT: and.b32 %r31, %r54, %r3;
; CHECKPTX71-NEXT: or.b32 %r32, %r31, %r30;
; CHECKPTX71-NEXT: atom.cas.b32 %r6, [%r1], %r54, %r32;
; CHECKPTX71-NEXT: setp.ne.s32 %p1, %r6, %r54;
; CHECKPTX71-NEXT: mov.u32 %r54, %r6;
; CHECKPTX71-NEXT: @%p1 bra $L__BB0_1;
; CHECKPTX71-NEXT: // %bb.2: // %atomicrmw.end44
; CHECKPTX71-NEXT: ld.u32 %r55, [%r1];
; CHECKPTX71-NEXT: $L__BB0_3: // %atomicrmw.start27
; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
; CHECKPTX71-NEXT: shr.u32 %r33, %r55, %r2;
; CHECKPTX71-NEXT: cvt.u16.u32 %rs5, %r33;
; CHECKPTX71-NEXT: mov.b16 %rs6, 0x3F80;
; CHECKPTX71-NEXT: fma.rn.bf16 %rs7, %rs5, %rs6, %rs6;
; CHECKPTX71-NEXT: cvt.u32.u16 %r34, %rs7;
; CHECKPTX71-NEXT: shl.b32 %r35, %r34, %r2;
; CHECKPTX71-NEXT: and.b32 %r36, %r55, %r3;
; CHECKPTX71-NEXT: or.b32 %r37, %r36, %r35;
; CHECKPTX71-NEXT: atom.cas.b32 %r9, [%r1], %r55, %r37;
; CHECKPTX71-NEXT: setp.ne.s32 %p2, %r9, %r55;
; CHECKPTX71-NEXT: mov.u32 %r55, %r9;
; CHECKPTX71-NEXT: @%p2 bra $L__BB0_3;
; CHECKPTX71-NEXT: // %bb.4: // %atomicrmw.end26
; CHECKPTX71-NEXT: and.b32 %r10, %r22, -4;
; CHECKPTX71-NEXT: shl.b32 %r38, %r22, 3;
; CHECKPTX71-NEXT: and.b32 %r11, %r38, 24;
; CHECKPTX71-NEXT: mov.b32 %r39, 65535;
; CHECKPTX71-NEXT: shl.b32 %r40, %r39, %r11;
; CHECKPTX71-NEXT: not.b32 %r12, %r40;
; CHECKPTX71-NEXT: ld.global.u32 %r56, [%r10];
; CHECKPTX71-NEXT: $L__BB0_5: // %atomicrmw.start9
; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
; CHECKPTX71-NEXT: shr.u32 %r41, %r56, %r11;
; CHECKPTX71-NEXT: cvt.u16.u32 %rs8, %r41;
; CHECKPTX71-NEXT: mov.b16 %rs9, 0x3F80;
; CHECKPTX71-NEXT: fma.rn.bf16 %rs10, %rs8, %rs9, %rs1;
; CHECKPTX71-NEXT: cvt.u32.u16 %r42, %rs10;
; CHECKPTX71-NEXT: shl.b32 %r43, %r42, %r11;
; CHECKPTX71-NEXT: and.b32 %r44, %r56, %r12;
; CHECKPTX71-NEXT: or.b32 %r45, %r44, %r43;
; CHECKPTX71-NEXT: atom.global.cas.b32 %r15, [%r10], %r56, %r45;
; CHECKPTX71-NEXT: setp.ne.s32 %p3, %r15, %r56;
; CHECKPTX71-NEXT: mov.u32 %r56, %r15;
; CHECKPTX71-NEXT: @%p3 bra $L__BB0_5;
; CHECKPTX71-NEXT: // %bb.6: // %atomicrmw.end8
; CHECKPTX71-NEXT: and.b32 %r16, %r23, -4;
; CHECKPTX71-NEXT: shl.b32 %r46, %r23, 3;
; CHECKPTX71-NEXT: and.b32 %r17, %r46, 24;
; CHECKPTX71-NEXT: mov.b32 %r47, 65535;
; CHECKPTX71-NEXT: shl.b32 %r48, %r47, %r17;
; CHECKPTX71-NEXT: not.b32 %r18, %r48;
; CHECKPTX71-NEXT: ld.shared.u32 %r57, [%r16];
; CHECKPTX71-NEXT: $L__BB0_7: // %atomicrmw.start
; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
; CHECKPTX71-NEXT: shr.u32 %r49, %r57, %r17;
; CHECKPTX71-NEXT: cvt.u16.u32 %rs11, %r49;
; CHECKPTX71-NEXT: mov.b16 %rs12, 0x3F80;
; CHECKPTX71-NEXT: fma.rn.bf16 %rs13, %rs11, %rs12, %rs1;
; CHECKPTX71-NEXT: cvt.u32.u16 %r50, %rs13;
; CHECKPTX71-NEXT: shl.b32 %r51, %r50, %r17;
; CHECKPTX71-NEXT: and.b32 %r52, %r57, %r18;
; CHECKPTX71-NEXT: or.b32 %r53, %r52, %r51;
; CHECKPTX71-NEXT: atom.shared.cas.b32 %r21, [%r16], %r57, %r53;
; CHECKPTX71-NEXT: setp.ne.s32 %p4, %r21, %r57;
; CHECKPTX71-NEXT: mov.u32 %r57, %r21;
; CHECKPTX71-NEXT: @%p4 bra $L__BB0_7;
; CHECKPTX71-NEXT: // %bb.8: // %atomicrmw.end
; CHECKPTX71-NEXT: ret;
  ; Unqualified pointer, runtime addend: the sm_90 runs expect an atom with no
  ; state-space suffix.
  %r1 = atomicrmw fadd ptr %dp0, bfloat %val monotonic
  ; Same pointer with a constant addend; bfloat 1.0 is materialised as 0x3F80.
  %r2 = atomicrmw fadd ptr %dp0, bfloat 1.0 monotonic
  ; addrspace(1) pointer: the expected instructions carry the .global suffix.
  %r3 = atomicrmw fadd ptr addrspace(1) %dp1, bfloat %val monotonic
  ; addrspace(3) pointer: the expected instructions carry the .shared suffix.
  %r4 = atomicrmw fadd ptr addrspace(3) %dp3, bfloat %val monotonic
  ret void
}

; NOTE(review): attribute group #1 is not referenced by anything visible in
; this file; confirm whether it can be dropped.
attributes #1 = { argmemonly nounwind }