; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc < %s -mtriple=nvptx -mcpu=sm_70 -mattr=+ptx63 | FileCheck %s --check-prefixes=CHECK
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | FileCheck %s --check-prefixes=CHECK64
; RUN: llc < %s -mtriple=nvptx -mcpu=sm_70 -mattr=+ptx62 | FileCheck %s --check-prefixes=CHECKPTX62
; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_70 -mattr=+ptx63 | %ptxas-verify -arch=sm_70 %}
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | %ptxas-verify -arch=sm_70 %}
; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_70 -mattr=+ptx62 | %ptxas-verify -arch=sm_70 %}

; Half-precision `atomicrmw fadd` on sm_70, exercised on a generic pointer,
; an addrspace(1) pointer and an addrspace(3) pointer.
;
; Expected output, per configuration:
;   * 32-bit and 64-bit targets with +ptx63: one `atom[.global|.shared].add.noftz.f16`
;     per atomicrmw.
;   * 32-bit target with +ptx62: each atomicrmw becomes a compare-and-swap loop
;     (`atom[.global|.shared].cas.b32`) on the 4-byte-aligned word that contains
;     the half, using shift/mask to splice the updated 16 bits back in.
;
; The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.

; Module-level triple. Every llc invocation above passes its own -mtriple,
; which is expected to take precedence over this value.
target triple = "nvptx64-nvidia-cuda"

define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half %val) {
; CHECK-LABEL: test(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<7>;
; CHECK-NEXT:    .reg .b32 %r<4>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.u32 %r1, [test_param_0];
; CHECK-NEXT:    ld.param.b16 %rs1, [test_param_3];
; CHECK-NEXT:    atom.add.noftz.f16 %rs2, [%r1], %rs1;
; CHECK-NEXT:    ld.param.u32 %r2, [test_param_1];
; CHECK-NEXT:    mov.b16 %rs3, 0x3C00;
; CHECK-NEXT:    atom.add.noftz.f16 %rs4, [%r1], %rs3;
; CHECK-NEXT:    ld.param.u32 %r3, [test_param_2];
; CHECK-NEXT:    atom.global.add.noftz.f16 %rs5, [%r2], %rs1;
; CHECK-NEXT:    atom.shared.add.noftz.f16 %rs6, [%r3], %rs1;
; CHECK-NEXT:    ret;
;
; CHECK64-LABEL: test(
; CHECK64:       {
; CHECK64-NEXT:    .reg .b16 %rs<7>;
; CHECK64-NEXT:    .reg .b64 %rd<4>;
; CHECK64-EMPTY:
; CHECK64-NEXT:  // %bb.0:
; CHECK64-NEXT:    ld.param.u64 %rd1, [test_param_0];
; CHECK64-NEXT:    ld.param.b16 %rs1, [test_param_3];
; CHECK64-NEXT:    atom.add.noftz.f16 %rs2, [%rd1], %rs1;
; CHECK64-NEXT:    ld.param.u64 %rd2, [test_param_1];
; CHECK64-NEXT:    mov.b16 %rs3, 0x3C00;
; CHECK64-NEXT:    atom.add.noftz.f16 %rs4, [%rd1], %rs3;
; CHECK64-NEXT:    ld.param.u64 %rd3, [test_param_2];
; CHECK64-NEXT:    atom.global.add.noftz.f16 %rs5, [%rd2], %rs1;
; CHECK64-NEXT:    atom.shared.add.noftz.f16 %rs6, [%rd3], %rs1;
; CHECK64-NEXT:    ret;
;
; CHECKPTX62-LABEL: test(
; CHECKPTX62:       {
; CHECKPTX62-NEXT:    .reg .pred %p<5>;
; CHECKPTX62-NEXT:    .reg .b16 %rs<11>;
; CHECKPTX62-NEXT:    .reg .b32 %r<58>;
; CHECKPTX62-EMPTY:
; CHECKPTX62-NEXT:  // %bb.0:
; CHECKPTX62-NEXT:    ld.param.b16 %rs1, [test_param_3];
; CHECKPTX62-NEXT:    ld.param.u32 %r23, [test_param_2];
; CHECKPTX62-NEXT:    ld.param.u32 %r22, [test_param_1];
; CHECKPTX62-NEXT:    ld.param.u32 %r24, [test_param_0];
; CHECKPTX62-NEXT:    and.b32 %r1, %r24, -4;
; CHECKPTX62-NEXT:    and.b32 %r25, %r24, 3;
; CHECKPTX62-NEXT:    shl.b32 %r2, %r25, 3;
; CHECKPTX62-NEXT:    mov.b32 %r26, 65535;
; CHECKPTX62-NEXT:    shl.b32 %r27, %r26, %r2;
; CHECKPTX62-NEXT:    not.b32 %r3, %r27;
; CHECKPTX62-NEXT:    ld.u32 %r54, [%r1];
; CHECKPTX62-NEXT:  $L__BB0_1: // %atomicrmw.start45
; CHECKPTX62-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECKPTX62-NEXT:    shr.u32 %r28, %r54, %r2;
; CHECKPTX62-NEXT:    cvt.u16.u32 %rs2, %r28;
; CHECKPTX62-NEXT:    add.rn.f16 %rs3, %rs2, %rs1;
; CHECKPTX62-NEXT:    cvt.u32.u16 %r29, %rs3;
; CHECKPTX62-NEXT:    shl.b32 %r30, %r29, %r2;
; CHECKPTX62-NEXT:    and.b32 %r31, %r54, %r3;
; CHECKPTX62-NEXT:    or.b32 %r32, %r31, %r30;
; CHECKPTX62-NEXT:    atom.cas.b32 %r6, [%r1], %r54, %r32;
; CHECKPTX62-NEXT:    setp.ne.s32 %p1, %r6, %r54;
; CHECKPTX62-NEXT:    mov.u32 %r54, %r6;
; CHECKPTX62-NEXT:    @%p1 bra $L__BB0_1;
; CHECKPTX62-NEXT:  // %bb.2: // %atomicrmw.end44
; CHECKPTX62-NEXT:    ld.u32 %r55, [%r1];
; CHECKPTX62-NEXT:  $L__BB0_3: // %atomicrmw.start27
; CHECKPTX62-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECKPTX62-NEXT:    shr.u32 %r33, %r55, %r2;
; CHECKPTX62-NEXT:    cvt.u16.u32 %rs4, %r33;
; CHECKPTX62-NEXT:    mov.b16 %rs5, 0x3C00;
; CHECKPTX62-NEXT:    add.rn.f16 %rs6, %rs4, %rs5;
; CHECKPTX62-NEXT:    cvt.u32.u16 %r34, %rs6;
; CHECKPTX62-NEXT:    shl.b32 %r35, %r34, %r2;
; CHECKPTX62-NEXT:    and.b32 %r36, %r55, %r3;
; CHECKPTX62-NEXT:    or.b32 %r37, %r36, %r35;
; CHECKPTX62-NEXT:    atom.cas.b32 %r9, [%r1], %r55, %r37;
; CHECKPTX62-NEXT:    setp.ne.s32 %p2, %r9, %r55;
; CHECKPTX62-NEXT:    mov.u32 %r55, %r9;
; CHECKPTX62-NEXT:    @%p2 bra $L__BB0_3;
; CHECKPTX62-NEXT:  // %bb.4: // %atomicrmw.end26
; CHECKPTX62-NEXT:    and.b32 %r10, %r22, -4;
; CHECKPTX62-NEXT:    shl.b32 %r38, %r22, 3;
; CHECKPTX62-NEXT:    and.b32 %r11, %r38, 24;
; CHECKPTX62-NEXT:    mov.b32 %r39, 65535;
; CHECKPTX62-NEXT:    shl.b32 %r40, %r39, %r11;
; CHECKPTX62-NEXT:    not.b32 %r12, %r40;
; CHECKPTX62-NEXT:    ld.global.u32 %r56, [%r10];
; CHECKPTX62-NEXT:  $L__BB0_5: // %atomicrmw.start9
; CHECKPTX62-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECKPTX62-NEXT:    shr.u32 %r41, %r56, %r11;
; CHECKPTX62-NEXT:    cvt.u16.u32 %rs7, %r41;
; CHECKPTX62-NEXT:    add.rn.f16 %rs8, %rs7, %rs1;
; CHECKPTX62-NEXT:    cvt.u32.u16 %r42, %rs8;
; CHECKPTX62-NEXT:    shl.b32 %r43, %r42, %r11;
; CHECKPTX62-NEXT:    and.b32 %r44, %r56, %r12;
; CHECKPTX62-NEXT:    or.b32 %r45, %r44, %r43;
; CHECKPTX62-NEXT:    atom.global.cas.b32 %r15, [%r10], %r56, %r45;
; CHECKPTX62-NEXT:    setp.ne.s32 %p3, %r15, %r56;
; CHECKPTX62-NEXT:    mov.u32 %r56, %r15;
; CHECKPTX62-NEXT:    @%p3 bra $L__BB0_5;
; CHECKPTX62-NEXT:  // %bb.6: // %atomicrmw.end8
; CHECKPTX62-NEXT:    and.b32 %r16, %r23, -4;
; CHECKPTX62-NEXT:    shl.b32 %r46, %r23, 3;
; CHECKPTX62-NEXT:    and.b32 %r17, %r46, 24;
; CHECKPTX62-NEXT:    mov.b32 %r47, 65535;
; CHECKPTX62-NEXT:    shl.b32 %r48, %r47, %r17;
; CHECKPTX62-NEXT:    not.b32 %r18, %r48;
; CHECKPTX62-NEXT:    ld.shared.u32 %r57, [%r16];
; CHECKPTX62-NEXT:  $L__BB0_7: // %atomicrmw.start
; CHECKPTX62-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECKPTX62-NEXT:    shr.u32 %r49, %r57, %r17;
; CHECKPTX62-NEXT:    cvt.u16.u32 %rs9, %r49;
; CHECKPTX62-NEXT:    add.rn.f16 %rs10, %rs9, %rs1;
; CHECKPTX62-NEXT:    cvt.u32.u16 %r50, %rs10;
; CHECKPTX62-NEXT:    shl.b32 %r51, %r50, %r17;
; CHECKPTX62-NEXT:    and.b32 %r52, %r57, %r18;
; CHECKPTX62-NEXT:    or.b32 %r53, %r52, %r51;
; CHECKPTX62-NEXT:    atom.shared.cas.b32 %r21, [%r16], %r57, %r53;
; CHECKPTX62-NEXT:    setp.ne.s32 %p4, %r21, %r57;
; CHECKPTX62-NEXT:    mov.u32 %r57, %r21;
; CHECKPTX62-NEXT:    @%p4 bra $L__BB0_7;
; CHECKPTX62-NEXT:  // %bb.8: // %atomicrmw.end
; CHECKPTX62-NEXT:    ret;
  %r1 = atomicrmw fadd ptr %dp0, half %val monotonic
  %r2 = atomicrmw fadd ptr %dp0, half 1.0 monotonic
  %r3 = atomicrmw fadd ptr addrspace(1) %dp1, half %val monotonic
  %r4 = atomicrmw fadd ptr addrspace(3) %dp3, half %val monotonic
  ret void
}

; NOTE(review): attribute group #1 is not referenced by anything visible in
; this file; kept as-is so the module contents are unchanged.
attributes #1 = { argmemonly nounwind }