xref: /llvm-project/llvm/test/CodeGen/NVPTX/atomics-sm90.ll (revision 892a804d93d44ddfd7cd351852fe6aef32d4dcd0)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
2; RUN: llc < %s -mtriple=nvptx -mcpu=sm_90 -mattr=+ptx78 | FileCheck %s --check-prefixes=CHECK
3; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | FileCheck %s --check-prefixes=CHECK64
4; RUN: llc < %s -mtriple=nvptx -mcpu=sm_86 -mattr=+ptx71 | FileCheck %s --check-prefixes=CHECKPTX71
5; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %}
6; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %}
7; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_86 -mattr=+ptx71 | %ptxas-verify -arch=sm_86 %}
8
; Module-level target triple. Each llc invocation in the run lines above also
; passes its own explicit -mtriple (nvptx or nvptx64).
9target triple = "nvptx64-nvidia-cuda"
10
; Exercises lowering of `atomicrmw fadd` on bfloat with monotonic ordering.
; Four operations are issued and their results are left unused:
;   * %dp0 (plain `ptr`) with %val, and again with the constant 1.0
;   * %dp1 (`ptr addrspace(1)`) with %val
;   * %dp3 (`ptr addrspace(3)`) with %val
; What the expectations below show, by prefix:
;   * CHECK / CHECK64 (the sm_90 + ptx78 run lines) expect one
;     `atom.add.noftz.bf16` per operation, with `.global` for the %dp1
;     operation and `.shared` for the %dp3 operation; the 1.0 operand is
;     materialised as the 16-bit pattern 0x3F80.
;   * CHECKPTX71 (the sm_86 + ptx71 run line) expects a retry loop per
;     operation around a 32-bit `atom.cas` on the address masked with -4.
;     The 16-bit lane is extracted with a shift, updated with
;     `fma.rn.bf16 old, 0x3F80, addend`, merged back under a mask, and the
;     loop branches back while the word returned by the cas differs from the
;     expected word.
11define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat %val) {
12; CHECK-LABEL: test(
13; CHECK:       {
14; CHECK-NEXT:    .reg .b16 %rs<7>;
15; CHECK-NEXT:    .reg .b32 %r<4>;
16; CHECK-EMPTY:
17; CHECK-NEXT:  // %bb.0:
18; CHECK-NEXT:    ld.param.u32 %r1, [test_param_0];
19; CHECK-NEXT:    ld.param.b16 %rs1, [test_param_3];
20; CHECK-NEXT:    atom.add.noftz.bf16 %rs2, [%r1], %rs1;
21; CHECK-NEXT:    ld.param.u32 %r2, [test_param_1];
22; CHECK-NEXT:    mov.b16 %rs3, 0x3F80;
23; CHECK-NEXT:    atom.add.noftz.bf16 %rs4, [%r1], %rs3;
24; CHECK-NEXT:    ld.param.u32 %r3, [test_param_2];
25; CHECK-NEXT:    atom.global.add.noftz.bf16 %rs5, [%r2], %rs1;
26; CHECK-NEXT:    atom.shared.add.noftz.bf16 %rs6, [%r3], %rs1;
27; CHECK-NEXT:    ret;
28;
29; CHECK64-LABEL: test(
30; CHECK64:       {
31; CHECK64-NEXT:    .reg .b16 %rs<7>;
32; CHECK64-NEXT:    .reg .b64 %rd<4>;
33; CHECK64-EMPTY:
34; CHECK64-NEXT:  // %bb.0:
35; CHECK64-NEXT:    ld.param.u64 %rd1, [test_param_0];
36; CHECK64-NEXT:    ld.param.b16 %rs1, [test_param_3];
37; CHECK64-NEXT:    atom.add.noftz.bf16 %rs2, [%rd1], %rs1;
38; CHECK64-NEXT:    ld.param.u64 %rd2, [test_param_1];
39; CHECK64-NEXT:    mov.b16 %rs3, 0x3F80;
40; CHECK64-NEXT:    atom.add.noftz.bf16 %rs4, [%rd1], %rs3;
41; CHECK64-NEXT:    ld.param.u64 %rd3, [test_param_2];
42; CHECK64-NEXT:    atom.global.add.noftz.bf16 %rs5, [%rd2], %rs1;
43; CHECK64-NEXT:    atom.shared.add.noftz.bf16 %rs6, [%rd3], %rs1;
44; CHECK64-NEXT:    ret;
45;
46; CHECKPTX71-LABEL: test(
47; CHECKPTX71:       {
48; CHECKPTX71-NEXT:    .reg .pred %p<5>;
49; CHECKPTX71-NEXT:    .reg .b16 %rs<14>;
50; CHECKPTX71-NEXT:    .reg .b32 %r<58>;
51; CHECKPTX71-EMPTY:
52; CHECKPTX71-NEXT:  // %bb.0:
53; CHECKPTX71-NEXT:    ld.param.b16 %rs1, [test_param_3];
54; CHECKPTX71-NEXT:    ld.param.u32 %r23, [test_param_2];
55; CHECKPTX71-NEXT:    ld.param.u32 %r22, [test_param_1];
56; CHECKPTX71-NEXT:    ld.param.u32 %r24, [test_param_0];
57; CHECKPTX71-NEXT:    and.b32 %r1, %r24, -4;
58; CHECKPTX71-NEXT:    and.b32 %r25, %r24, 3;
59; CHECKPTX71-NEXT:    shl.b32 %r2, %r25, 3;
60; CHECKPTX71-NEXT:    mov.b32 %r26, 65535;
61; CHECKPTX71-NEXT:    shl.b32 %r27, %r26, %r2;
62; CHECKPTX71-NEXT:    not.b32 %r3, %r27;
63; CHECKPTX71-NEXT:    ld.u32 %r54, [%r1];
64; CHECKPTX71-NEXT:  $L__BB0_1: // %atomicrmw.start45
65; CHECKPTX71-NEXT:    // =>This Inner Loop Header: Depth=1
66; CHECKPTX71-NEXT:    shr.u32 %r28, %r54, %r2;
67; CHECKPTX71-NEXT:    cvt.u16.u32 %rs2, %r28;
68; CHECKPTX71-NEXT:    mov.b16 %rs3, 0x3F80;
69; CHECKPTX71-NEXT:    fma.rn.bf16 %rs4, %rs2, %rs3, %rs1;
70; CHECKPTX71-NEXT:    cvt.u32.u16 %r29, %rs4;
71; CHECKPTX71-NEXT:    shl.b32 %r30, %r29, %r2;
72; CHECKPTX71-NEXT:    and.b32 %r31, %r54, %r3;
73; CHECKPTX71-NEXT:    or.b32 %r32, %r31, %r30;
74; CHECKPTX71-NEXT:    atom.cas.b32 %r6, [%r1], %r54, %r32;
75; CHECKPTX71-NEXT:    setp.ne.s32 %p1, %r6, %r54;
76; CHECKPTX71-NEXT:    mov.u32 %r54, %r6;
77; CHECKPTX71-NEXT:    @%p1 bra $L__BB0_1;
78; CHECKPTX71-NEXT:  // %bb.2: // %atomicrmw.end44
79; CHECKPTX71-NEXT:    ld.u32 %r55, [%r1];
80; CHECKPTX71-NEXT:  $L__BB0_3: // %atomicrmw.start27
81; CHECKPTX71-NEXT:    // =>This Inner Loop Header: Depth=1
82; CHECKPTX71-NEXT:    shr.u32 %r33, %r55, %r2;
83; CHECKPTX71-NEXT:    cvt.u16.u32 %rs5, %r33;
84; CHECKPTX71-NEXT:    mov.b16 %rs6, 0x3F80;
85; CHECKPTX71-NEXT:    fma.rn.bf16 %rs7, %rs5, %rs6, %rs6;
86; CHECKPTX71-NEXT:    cvt.u32.u16 %r34, %rs7;
87; CHECKPTX71-NEXT:    shl.b32 %r35, %r34, %r2;
88; CHECKPTX71-NEXT:    and.b32 %r36, %r55, %r3;
89; CHECKPTX71-NEXT:    or.b32 %r37, %r36, %r35;
90; CHECKPTX71-NEXT:    atom.cas.b32 %r9, [%r1], %r55, %r37;
91; CHECKPTX71-NEXT:    setp.ne.s32 %p2, %r9, %r55;
92; CHECKPTX71-NEXT:    mov.u32 %r55, %r9;
93; CHECKPTX71-NEXT:    @%p2 bra $L__BB0_3;
94; CHECKPTX71-NEXT:  // %bb.4: // %atomicrmw.end26
95; CHECKPTX71-NEXT:    and.b32 %r10, %r22, -4;
96; CHECKPTX71-NEXT:    shl.b32 %r38, %r22, 3;
97; CHECKPTX71-NEXT:    and.b32 %r11, %r38, 24;
98; CHECKPTX71-NEXT:    mov.b32 %r39, 65535;
99; CHECKPTX71-NEXT:    shl.b32 %r40, %r39, %r11;
100; CHECKPTX71-NEXT:    not.b32 %r12, %r40;
101; CHECKPTX71-NEXT:    ld.global.u32 %r56, [%r10];
102; CHECKPTX71-NEXT:  $L__BB0_5: // %atomicrmw.start9
103; CHECKPTX71-NEXT:    // =>This Inner Loop Header: Depth=1
104; CHECKPTX71-NEXT:    shr.u32 %r41, %r56, %r11;
105; CHECKPTX71-NEXT:    cvt.u16.u32 %rs8, %r41;
106; CHECKPTX71-NEXT:    mov.b16 %rs9, 0x3F80;
107; CHECKPTX71-NEXT:    fma.rn.bf16 %rs10, %rs8, %rs9, %rs1;
108; CHECKPTX71-NEXT:    cvt.u32.u16 %r42, %rs10;
109; CHECKPTX71-NEXT:    shl.b32 %r43, %r42, %r11;
110; CHECKPTX71-NEXT:    and.b32 %r44, %r56, %r12;
111; CHECKPTX71-NEXT:    or.b32 %r45, %r44, %r43;
112; CHECKPTX71-NEXT:    atom.global.cas.b32 %r15, [%r10], %r56, %r45;
113; CHECKPTX71-NEXT:    setp.ne.s32 %p3, %r15, %r56;
114; CHECKPTX71-NEXT:    mov.u32 %r56, %r15;
115; CHECKPTX71-NEXT:    @%p3 bra $L__BB0_5;
116; CHECKPTX71-NEXT:  // %bb.6: // %atomicrmw.end8
117; CHECKPTX71-NEXT:    and.b32 %r16, %r23, -4;
118; CHECKPTX71-NEXT:    shl.b32 %r46, %r23, 3;
119; CHECKPTX71-NEXT:    and.b32 %r17, %r46, 24;
120; CHECKPTX71-NEXT:    mov.b32 %r47, 65535;
121; CHECKPTX71-NEXT:    shl.b32 %r48, %r47, %r17;
122; CHECKPTX71-NEXT:    not.b32 %r18, %r48;
123; CHECKPTX71-NEXT:    ld.shared.u32 %r57, [%r16];
124; CHECKPTX71-NEXT:  $L__BB0_7: // %atomicrmw.start
125; CHECKPTX71-NEXT:    // =>This Inner Loop Header: Depth=1
126; CHECKPTX71-NEXT:    shr.u32 %r49, %r57, %r17;
127; CHECKPTX71-NEXT:    cvt.u16.u32 %rs11, %r49;
128; CHECKPTX71-NEXT:    mov.b16 %rs12, 0x3F80;
129; CHECKPTX71-NEXT:    fma.rn.bf16 %rs13, %rs11, %rs12, %rs1;
130; CHECKPTX71-NEXT:    cvt.u32.u16 %r50, %rs13;
131; CHECKPTX71-NEXT:    shl.b32 %r51, %r50, %r17;
132; CHECKPTX71-NEXT:    and.b32 %r52, %r57, %r18;
133; CHECKPTX71-NEXT:    or.b32 %r53, %r52, %r51;
134; CHECKPTX71-NEXT:    atom.shared.cas.b32 %r21, [%r16], %r57, %r53;
135; CHECKPTX71-NEXT:    setp.ne.s32 %p4, %r21, %r57;
136; CHECKPTX71-NEXT:    mov.u32 %r57, %r21;
137; CHECKPTX71-NEXT:    @%p4 bra $L__BB0_7;
138; CHECKPTX71-NEXT:  // %bb.8: // %atomicrmw.end
139; CHECKPTX71-NEXT:    ret;
  ; Plain `ptr` operand, addend taken from the %val argument.
140  %r1 = atomicrmw fadd ptr %dp0, bfloat %val monotonic
  ; Same pointer, addend is the immediate 1.0 (0x3F80 in the expected output).
141  %r2 = atomicrmw fadd ptr %dp0, bfloat 1.0 monotonic
  ; addrspace(1) operand; the expectations use the `.global` forms here.
142  %r3 = atomicrmw fadd ptr addrspace(1) %dp1, bfloat %val monotonic
  ; addrspace(3) operand; the expectations use the `.shared` forms here.
143  %r4 = atomicrmw fadd ptr addrspace(3) %dp3, bfloat %val monotonic
144  ret void
145}
146
; NOTE(review): attribute group #1 is not referenced by @test, the only
; definition visible here -- confirm whether it is still needed.
147attributes #1 = { argmemonly nounwind }
148