14372cab9SAdrian Kuegel; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 2b279f6b0SFangrui Song; RUN: llc < %s -mtriple=nvptx -mcpu=sm_70 -mattr=+ptx63 | FileCheck %s --check-prefixes=CHECK 3b279f6b0SFangrui Song; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | FileCheck %s --check-prefixes=CHECK64 4b279f6b0SFangrui Song; RUN: llc < %s -mtriple=nvptx -mcpu=sm_70 -mattr=+ptx62 | FileCheck %s --check-prefixes=CHECKPTX62 5b279f6b0SFangrui Song; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_70 -mattr=+ptx63 | %ptxas-verify -arch=sm_70 %} 6b279f6b0SFangrui Song; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | %ptxas-verify -arch=sm_70 %} 7b279f6b0SFangrui Song; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_70 -mattr=+ptx62 | %ptxas-verify -arch=sm_70 %} 84372cab9SAdrian Kuegel 94372cab9SAdrian Kuegeltarget triple = "nvptx64-nvidia-cuda" 104372cab9SAdrian Kuegel 114372cab9SAdrian Kuegeldefine void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half %val) { 124372cab9SAdrian Kuegel; CHECK-LABEL: test( 134372cab9SAdrian Kuegel; CHECK: { 144372cab9SAdrian Kuegel; CHECK-NEXT: .reg .b16 %rs<7>; 154372cab9SAdrian Kuegel; CHECK-NEXT: .reg .b32 %r<4>; 164372cab9SAdrian Kuegel; CHECK-EMPTY: 174372cab9SAdrian Kuegel; CHECK-NEXT: // %bb.0: 184372cab9SAdrian Kuegel; CHECK-NEXT: ld.param.u32 %r1, [test_param_0]; 194372cab9SAdrian Kuegel; CHECK-NEXT: ld.param.b16 %rs1, [test_param_3]; 204372cab9SAdrian Kuegel; CHECK-NEXT: atom.add.noftz.f16 %rs2, [%r1], %rs1; 214372cab9SAdrian Kuegel; CHECK-NEXT: ld.param.u32 %r2, [test_param_1]; 224372cab9SAdrian Kuegel; CHECK-NEXT: mov.b16 %rs3, 0x3C00; 234372cab9SAdrian Kuegel; CHECK-NEXT: atom.add.noftz.f16 %rs4, [%r1], %rs3; 244372cab9SAdrian Kuegel; CHECK-NEXT: ld.param.u32 %r3, [test_param_2]; 254372cab9SAdrian Kuegel; CHECK-NEXT: atom.global.add.noftz.f16 %rs5, [%r2], %rs1; 264372cab9SAdrian Kuegel; CHECK-NEXT: atom.shared.add.noftz.f16 %rs6, [%r3], %rs1; 274372cab9SAdrian Kuegel; CHECK-NEXT: ret; 284372cab9SAdrian Kuegel; 294372cab9SAdrian Kuegel; CHECK64-LABEL: test( 304372cab9SAdrian Kuegel; CHECK64: { 314372cab9SAdrian Kuegel; CHECK64-NEXT: .reg .b16 %rs<7>; 324372cab9SAdrian Kuegel; CHECK64-NEXT: .reg .b64 %rd<4>; 334372cab9SAdrian Kuegel; CHECK64-EMPTY: 344372cab9SAdrian Kuegel; CHECK64-NEXT: // %bb.0: 354372cab9SAdrian Kuegel; CHECK64-NEXT: ld.param.u64 %rd1, [test_param_0]; 364372cab9SAdrian Kuegel; CHECK64-NEXT: ld.param.b16 %rs1, [test_param_3]; 374372cab9SAdrian Kuegel; CHECK64-NEXT: atom.add.noftz.f16 %rs2, [%rd1], %rs1; 384372cab9SAdrian Kuegel; CHECK64-NEXT: ld.param.u64 %rd2, [test_param_1]; 394372cab9SAdrian Kuegel; CHECK64-NEXT: mov.b16 %rs3, 0x3C00; 404372cab9SAdrian Kuegel; CHECK64-NEXT: atom.add.noftz.f16 %rs4, [%rd1], %rs3; 414372cab9SAdrian Kuegel; CHECK64-NEXT: ld.param.u64 %rd3, [test_param_2]; 424372cab9SAdrian Kuegel; CHECK64-NEXT: atom.global.add.noftz.f16 %rs5, [%rd2], %rs1; 434372cab9SAdrian Kuegel; CHECK64-NEXT: atom.shared.add.noftz.f16 %rs6, [%rd3], %rs1; 444372cab9SAdrian Kuegel; CHECK64-NEXT: ret; 454372cab9SAdrian Kuegel; 464372cab9SAdrian Kuegel; CHECKPTX62-LABEL: test( 474372cab9SAdrian Kuegel; CHECKPTX62: { 484372cab9SAdrian Kuegel; CHECKPTX62-NEXT: .reg .pred %p<5>; 49310e7987SAlex MacLean; CHECKPTX62-NEXT: .reg .b16 %rs<11>; 504372cab9SAdrian Kuegel; CHECKPTX62-NEXT: .reg .b32 %r<58>; 514372cab9SAdrian Kuegel; CHECKPTX62-EMPTY: 524372cab9SAdrian Kuegel; CHECKPTX62-NEXT: // %bb.0: 534372cab9SAdrian Kuegel; CHECKPTX62-NEXT: ld.param.b16 %rs1, [test_param_3]; 544372cab9SAdrian Kuegel; CHECKPTX62-NEXT: ld.param.u32 %r23, [test_param_2]; 554372cab9SAdrian Kuegel; CHECKPTX62-NEXT: ld.param.u32 %r22, [test_param_1]; 564372cab9SAdrian Kuegel; CHECKPTX62-NEXT: ld.param.u32 %r24, [test_param_0]; 574372cab9SAdrian Kuegel; CHECKPTX62-NEXT: and.b32 %r1, %r24, -4; 584372cab9SAdrian Kuegel; CHECKPTX62-NEXT: and.b32 %r25, %r24, 3; 594372cab9SAdrian Kuegel; CHECKPTX62-NEXT: shl.b32 %r2, %r25, 3; 604372cab9SAdrian Kuegel; CHECKPTX62-NEXT: mov.b32 %r26, 65535; 614372cab9SAdrian Kuegel; CHECKPTX62-NEXT: shl.b32 %r27, %r26, %r2; 624372cab9SAdrian Kuegel; CHECKPTX62-NEXT: not.b32 %r3, %r27; 634372cab9SAdrian Kuegel; CHECKPTX62-NEXT: ld.u32 %r54, [%r1]; 64100d9b89SMatt Arsenault; CHECKPTX62-NEXT: $L__BB0_1: // %atomicrmw.start45 654372cab9SAdrian Kuegel; CHECKPTX62-NEXT: // =>This Inner Loop Header: Depth=1 664372cab9SAdrian Kuegel; CHECKPTX62-NEXT: shr.u32 %r28, %r54, %r2; 674372cab9SAdrian Kuegel; CHECKPTX62-NEXT: cvt.u16.u32 %rs2, %r28; 68310e7987SAlex MacLean; CHECKPTX62-NEXT: add.rn.f16 %rs3, %rs2, %rs1; 69310e7987SAlex MacLean; CHECKPTX62-NEXT: cvt.u32.u16 %r29, %rs3; 704372cab9SAdrian Kuegel; CHECKPTX62-NEXT: shl.b32 %r30, %r29, %r2; 714372cab9SAdrian Kuegel; CHECKPTX62-NEXT: and.b32 %r31, %r54, %r3; 724372cab9SAdrian Kuegel; CHECKPTX62-NEXT: or.b32 %r32, %r31, %r30; 734372cab9SAdrian Kuegel; CHECKPTX62-NEXT: atom.cas.b32 %r6, [%r1], %r54, %r32; 744372cab9SAdrian Kuegel; CHECKPTX62-NEXT: setp.ne.s32 %p1, %r6, %r54; 754372cab9SAdrian Kuegel; CHECKPTX62-NEXT: mov.u32 %r54, %r6; 764372cab9SAdrian Kuegel; CHECKPTX62-NEXT: @%p1 bra $L__BB0_1; 77100d9b89SMatt Arsenault; CHECKPTX62-NEXT: // %bb.2: // %atomicrmw.end44 784372cab9SAdrian Kuegel; CHECKPTX62-NEXT: ld.u32 %r55, [%r1]; 79100d9b89SMatt Arsenault; CHECKPTX62-NEXT: $L__BB0_3: // %atomicrmw.start27 804372cab9SAdrian Kuegel; CHECKPTX62-NEXT: // =>This Inner Loop Header: Depth=1 814372cab9SAdrian Kuegel; CHECKPTX62-NEXT: shr.u32 %r33, %r55, %r2; 82310e7987SAlex MacLean; CHECKPTX62-NEXT: cvt.u16.u32 %rs4, %r33; 83310e7987SAlex MacLean; CHECKPTX62-NEXT: mov.b16 %rs5, 0x3C00; 84310e7987SAlex MacLean; CHECKPTX62-NEXT: add.rn.f16 %rs6, %rs4, %rs5; 85310e7987SAlex MacLean; CHECKPTX62-NEXT: cvt.u32.u16 %r34, %rs6; 864372cab9SAdrian Kuegel; CHECKPTX62-NEXT: shl.b32 %r35, %r34, %r2; 874372cab9SAdrian Kuegel; CHECKPTX62-NEXT: and.b32 %r36, %r55, %r3; 884372cab9SAdrian Kuegel; CHECKPTX62-NEXT: or.b32 %r37, %r36, %r35; 894372cab9SAdrian Kuegel; CHECKPTX62-NEXT: atom.cas.b32 %r9, [%r1], %r55, %r37; 904372cab9SAdrian Kuegel; CHECKPTX62-NEXT: setp.ne.s32 %p2, %r9, %r55; 914372cab9SAdrian Kuegel; CHECKPTX62-NEXT: mov.u32 %r55, %r9; 924372cab9SAdrian Kuegel; CHECKPTX62-NEXT: @%p2 bra $L__BB0_3; 93100d9b89SMatt Arsenault; CHECKPTX62-NEXT: // %bb.4: // %atomicrmw.end26 944372cab9SAdrian Kuegel; CHECKPTX62-NEXT: and.b32 %r10, %r22, -4; 954372cab9SAdrian Kuegel; CHECKPTX62-NEXT: shl.b32 %r38, %r22, 3; 964372cab9SAdrian Kuegel; CHECKPTX62-NEXT: and.b32 %r11, %r38, 24; 97310e7987SAlex MacLean; CHECKPTX62-NEXT: mov.b32 %r39, 65535; 98310e7987SAlex MacLean; CHECKPTX62-NEXT: shl.b32 %r40, %r39, %r11; 994372cab9SAdrian Kuegel; CHECKPTX62-NEXT: not.b32 %r12, %r40; 1004372cab9SAdrian Kuegel; CHECKPTX62-NEXT: ld.global.u32 %r56, [%r10]; 101100d9b89SMatt Arsenault; CHECKPTX62-NEXT: $L__BB0_5: // %atomicrmw.start9 1024372cab9SAdrian Kuegel; CHECKPTX62-NEXT: // =>This Inner Loop Header: Depth=1 1034372cab9SAdrian Kuegel; CHECKPTX62-NEXT: shr.u32 %r41, %r56, %r11; 104310e7987SAlex MacLean; CHECKPTX62-NEXT: cvt.u16.u32 %rs7, %r41; 105310e7987SAlex MacLean; CHECKPTX62-NEXT: add.rn.f16 %rs8, %rs7, %rs1; 106310e7987SAlex MacLean; CHECKPTX62-NEXT: cvt.u32.u16 %r42, %rs8; 1074372cab9SAdrian Kuegel; CHECKPTX62-NEXT: shl.b32 %r43, %r42, %r11; 1084372cab9SAdrian Kuegel; CHECKPTX62-NEXT: and.b32 %r44, %r56, %r12; 1094372cab9SAdrian Kuegel; CHECKPTX62-NEXT: or.b32 %r45, %r44, %r43; 1104372cab9SAdrian Kuegel; CHECKPTX62-NEXT: atom.global.cas.b32 %r15, [%r10], %r56, %r45; 1114372cab9SAdrian Kuegel; CHECKPTX62-NEXT: setp.ne.s32 %p3, %r15, %r56; 1124372cab9SAdrian Kuegel; CHECKPTX62-NEXT: mov.u32 %r56, %r15; 1134372cab9SAdrian Kuegel; CHECKPTX62-NEXT: @%p3 bra $L__BB0_5; 114100d9b89SMatt Arsenault; CHECKPTX62-NEXT: // %bb.6: // %atomicrmw.end8 1154372cab9SAdrian Kuegel; CHECKPTX62-NEXT: and.b32 %r16, %r23, -4; 1164372cab9SAdrian Kuegel; CHECKPTX62-NEXT: shl.b32 %r46, %r23, 3; 1174372cab9SAdrian Kuegel; CHECKPTX62-NEXT: and.b32 %r17, %r46, 24; 118310e7987SAlex MacLean; CHECKPTX62-NEXT: mov.b32 %r47, 65535; 119310e7987SAlex MacLean; CHECKPTX62-NEXT: shl.b32 %r48, %r47, %r17; 1204372cab9SAdrian Kuegel; CHECKPTX62-NEXT: not.b32 %r18, %r48; 1214372cab9SAdrian Kuegel; CHECKPTX62-NEXT: ld.shared.u32 %r57, [%r16]; 122100d9b89SMatt Arsenault; CHECKPTX62-NEXT: $L__BB0_7: // %atomicrmw.start 1234372cab9SAdrian Kuegel; CHECKPTX62-NEXT: // =>This Inner Loop Header: Depth=1 1244372cab9SAdrian Kuegel; CHECKPTX62-NEXT: shr.u32 %r49, %r57, %r17; 125310e7987SAlex MacLean; CHECKPTX62-NEXT: cvt.u16.u32 %rs9, %r49; 126310e7987SAlex MacLean; CHECKPTX62-NEXT: add.rn.f16 %rs10, %rs9, %rs1; 127310e7987SAlex MacLean; CHECKPTX62-NEXT: cvt.u32.u16 %r50, %rs10; 1284372cab9SAdrian Kuegel; CHECKPTX62-NEXT: shl.b32 %r51, %r50, %r17; 1294372cab9SAdrian Kuegel; CHECKPTX62-NEXT: and.b32 %r52, %r57, %r18; 1304372cab9SAdrian Kuegel; CHECKPTX62-NEXT: or.b32 %r53, %r52, %r51; 1314372cab9SAdrian Kuegel; CHECKPTX62-NEXT: atom.shared.cas.b32 %r21, [%r16], %r57, %r53; 1324372cab9SAdrian Kuegel; CHECKPTX62-NEXT: setp.ne.s32 %p4, %r21, %r57; 1334372cab9SAdrian Kuegel; CHECKPTX62-NEXT: mov.u32 %r57, %r21; 1344372cab9SAdrian Kuegel; CHECKPTX62-NEXT: @%p4 bra $L__BB0_7; 135100d9b89SMatt Arsenault; CHECKPTX62-NEXT: // %bb.8: // %atomicrmw.end 1364372cab9SAdrian Kuegel; CHECKPTX62-NEXT: ret; 137*892a804dSAkshay Deodhar %r1 = atomicrmw fadd ptr %dp0, half %val monotonic 138*892a804dSAkshay Deodhar %r2 = atomicrmw fadd ptr %dp0, half 1.0 monotonic 139*892a804dSAkshay Deodhar %r3 = atomicrmw fadd ptr addrspace(1) %dp1, half %val monotonic 140*892a804dSAkshay Deodhar %r4 = atomicrmw fadd ptr addrspace(3) %dp3, half %val monotonic 1414372cab9SAdrian Kuegel ret void 1424372cab9SAdrian Kuegel} 1434372cab9SAdrian Kuegel 1444372cab9SAdrian Kuegelattributes #1 = { argmemonly nounwind } 145