; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --extra_scrub --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx83 | FileCheck %s
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx83 | %ptxas-verify -arch=sm_70 %}

target triple = "nvptx64-nvidia-cuda"

; 128-bit global (address space 1) that every test below reads from or
; writes to, either directly or through an inline-asm pointer operand.
@value = internal addrspace(1) global i128 0, align 16

; Input operand: the constant i128 42 is passed to the inline asm through the
; "q" constraint; the expected output builds it from two 64-bit halves.
define void @test_b128_input_from_const() {
; CHECK-LABEL: test_b128_input_from_const(
; CHECK:       {
; CHECK-NEXT:    .reg .b64 %rd<5>;
; CHECK-NEXT:    .reg .b128 %rq<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    mov.b64 %rd2, 0;
; CHECK-NEXT:    mov.b64 %rd3, 42;
; CHECK-NEXT:    mov.b128 %rq1, {%rd3, %rd2};
; CHECK-NEXT:    mov.u64 %rd4, value;
; CHECK-NEXT:    cvta.global.u64 %rd1, %rd4;
; CHECK-NEXT:    // begin inline asm
; CHECK-NEXT:    { st.b128 [%rd1], %rq1; }
; CHECK-NEXT:    // end inline asm
; CHECK-NEXT:    ret;
  tail call void asm sideeffect "{ st.b128 [$0], $1; }", "l,q"(ptr nonnull addrspacecast (ptr addrspace(1) @value to ptr), i128 42)
  ret void
}

; Input operand: the i128 comes from a <2 x i64> load that is bitcast to i128
; before being handed to the inline asm through the "q" constraint.
define void @test_b128_input_from_load(ptr nocapture readonly %data) {
; CHECK-LABEL: test_b128_input_from_load(
; CHECK:       {
; CHECK-NEXT:    .reg .b64 %rd<7>;
; CHECK-NEXT:    .reg .b128 %rq<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.u64 %rd2, [test_b128_input_from_load_param_0];
; CHECK-NEXT:    cvta.to.global.u64 %rd3, %rd2;
; CHECK-NEXT:    ld.global.u64 %rd4, [%rd3+8];
; CHECK-NEXT:    ld.global.u64 %rd5, [%rd3];
; CHECK-NEXT:    mov.b128 %rq1, {%rd5, %rd4};
; CHECK-NEXT:    mov.u64 %rd6, value;
; CHECK-NEXT:    cvta.global.u64 %rd1, %rd6;
; CHECK-NEXT:    // begin inline asm
; CHECK-NEXT:    { st.b128 [%rd1], %rq1; }
; CHECK-NEXT:    // end inline asm
; CHECK-NEXT:    ret;
  %1 = addrspacecast ptr %data to ptr addrspace(1)
  %2 = load <2 x i64>, ptr addrspace(1) %1, align 16
  %3 = bitcast <2 x i64> %2 to i128
  tail call void asm sideeffect "{ st.b128 [$0], $1; }", "l,q"(ptr nonnull addrspacecast (ptr addrspace(1) @value to ptr), i128 %3)
  ret void
}

; Input operand: the i128 is the result of a select between the constants 24
; and 42, keyed on whether the loaded i8 flag equals zero.
define void @test_b128_input_from_select(ptr nocapture readonly %flag) {
; CHECK-LABEL: test_b128_input_from_select(
; CHECK:       {
; CHECK-NEXT:    .reg .pred %p<2>;
; CHECK-NEXT:    .reg .b16 %rs<2>;
; CHECK-NEXT:    .reg .b64 %rd<7>;
; CHECK-NEXT:    .reg .b128 %rq<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.u64 %rd2, [test_b128_input_from_select_param_0];
; CHECK-NEXT:    cvta.to.global.u64 %rd3, %rd2;
; CHECK-NEXT:    ld.global.u8 %rs1, [%rd3];
; CHECK-NEXT:    setp.eq.s16 %p1, %rs1, 0;
; CHECK-NEXT:    selp.b64 %rd4, 24, 42, %p1;
; CHECK-NEXT:    mov.b64 %rd5, 0;
; CHECK-NEXT:    mov.b128 %rq1, {%rd4, %rd5};
; CHECK-NEXT:    mov.u64 %rd6, value;
; CHECK-NEXT:    cvta.global.u64 %rd1, %rd6;
; CHECK-NEXT:    // begin inline asm
; CHECK-NEXT:    { st.b128 [%rd1], %rq1; }
; CHECK-NEXT:    // end inline asm
; CHECK-NEXT:    ret;
  %1 = addrspacecast ptr %flag to ptr addrspace(1)
  %2 = load i8, ptr addrspace(1) %1, align 1
  %3 = icmp eq i8 %2, 0
  %4 = select i1 %3, i128 24, i128 42
  tail call void asm sideeffect "{ st.b128 [$0], $1; }", "l,q"(ptr nonnull addrspacecast (ptr addrspace(1) @value to ptr), i128 %4)
  ret void
}

; Output operand: the inline asm produces an i128 through "=q"; the result is
; incremented by one and stored to @value as a <2 x i64>.
define void @test_store_b128_output() {
; CHECK-LABEL: test_store_b128_output(
; CHECK:       {
; CHECK-NEXT:    .reg .b64 %rd<5>;
; CHECK-NEXT:    .reg .b128 %rq<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    // begin inline asm
; CHECK-NEXT:    { mov.b128 %rq1, 41; }
; CHECK-NEXT:    // end inline asm
; CHECK-NEXT:    mov.b128 {%rd1, %rd2}, %rq1;
; CHECK-NEXT:    add.cc.s64 %rd3, %rd1, 1;
; CHECK-NEXT:    addc.cc.s64 %rd4, %rd2, 0;
; CHECK-NEXT:    st.global.u64 [value+8], %rd4;
; CHECK-NEXT:    st.global.u64 [value], %rd3;
; CHECK-NEXT:    ret;
  %1 = tail call i128 asm "{ mov.b128 $0, 41; }", "=q"()
  %add = add nsw i128 %1, 1
  %2 = bitcast i128 %add to <2 x i64>
  store <2 x i64> %2, ptr addrspace(1) @value, align 16
  ret void
}

; Input and output operands together: a loaded i128 is passed in through "q",
; the asm result comes back through "=q", is incremented by one and stored to
; @value as a <2 x i64>.
define void @test_use_of_b128_output(ptr nocapture readonly %data) {
; CHECK-LABEL: test_use_of_b128_output(
; CHECK:       {
; CHECK-NEXT:    .reg .b64 %rd<9>;
; CHECK-NEXT:    .reg .b128 %rq<3>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.u64 %rd1, [test_use_of_b128_output_param_0];
; CHECK-NEXT:    cvta.to.global.u64 %rd2, %rd1;
; CHECK-NEXT:    ld.global.u64 %rd3, [%rd2+8];
; CHECK-NEXT:    ld.global.u64 %rd4, [%rd2];
; CHECK-NEXT:    mov.b128 %rq2, {%rd4, %rd3};
; CHECK-NEXT:    // begin inline asm
; CHECK-NEXT:    { mov.b128 %rq1, %rq2; }
; CHECK-NEXT:    // end inline asm
; CHECK-NEXT:    mov.b128 {%rd5, %rd6}, %rq1;
; CHECK-NEXT:    add.cc.s64 %rd7, %rd5, 1;
; CHECK-NEXT:    addc.cc.s64 %rd8, %rd6, 0;
; CHECK-NEXT:    st.global.u64 [value], %rd7;
; CHECK-NEXT:    st.global.u64 [value+8], %rd8;
; CHECK-NEXT:    ret;
  %1 = addrspacecast ptr %data to ptr addrspace(1)
  %2 = load <2 x i64>, ptr addrspace(1) %1, align 16
  %3 = bitcast <2 x i64> %2 to i128
  %4 = tail call i128 asm "{ mov.b128 $0, $1; }", "=q,q"(i128 %3)
  %add = add nsw i128 %4, 1
  %5 = bitcast i128 %add to <2 x i64>
  store <2 x i64> %5, ptr addrspace(1) @value, align 16
  ret void
}