; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -verify-machineinstrs | FileCheck -check-prefix=SM20 %s
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_35 -verify-machineinstrs | FileCheck -check-prefix=SM35 %s
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_35 | %ptxas-verify %}

; Purpose of this test
; --------------------
; Every kernel below loads through a `noalias readonly` pointer parameter and
; stores the loaded value through a second pointer. The checks expect:
;   * with -mcpu=sm_20 (prefix SM20): a plain `ld.global.<type>` load;
;   * with -mcpu=sm_35 (prefix SM35): an `ld.global.nc.<type>` load, i.e. the
;     "cached load" that the comments further down refer to.
; The two functions at the end are not kernels and are expected to keep a
; non-`.nc` load on both targets.
;
; NOTE: comments in this file must not spell a check prefix immediately
; followed by a colon, otherwise FileCheck treats the comment as a directive.

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
target triple = "nvptx64-unknown-unknown"

; --- Scalar loads -----------------------------------------------------------

; float
; SM20-LABEL: .visible .entry foo1(
; SM20: ld.global.f32
; SM35-LABEL: .visible .entry foo1(
; SM35: ld.global.nc.f32
define ptx_kernel void @foo1(ptr noalias readonly %from, ptr %to) {
  %1 = load float, ptr %from
  store float %1, ptr %to
  ret void
}

; double
; SM20-LABEL: .visible .entry foo2(
; SM20: ld.global.f64
; SM35-LABEL: .visible .entry foo2(
; SM35: ld.global.nc.f64
define ptx_kernel void @foo2(ptr noalias readonly %from, ptr %to) {
  %1 = load double, ptr %from
  store double %1, ptr %to
  ret void
}

; i16
; SM20-LABEL: .visible .entry foo3(
; SM20: ld.global.u16
; SM35-LABEL: .visible .entry foo3(
; SM35: ld.global.nc.u16
define ptx_kernel void @foo3(ptr noalias readonly %from, ptr %to) {
  %1 = load i16, ptr %from
  store i16 %1, ptr %to
  ret void
}

; i32
; SM20-LABEL: .visible .entry foo4(
; SM20: ld.global.u32
; SM35-LABEL: .visible .entry foo4(
; SM35: ld.global.nc.u32
define ptx_kernel void @foo4(ptr noalias readonly %from, ptr %to) {
  %1 = load i32, ptr %from
  store i32 %1, ptr %to
  ret void
}

; i64
; SM20-LABEL: .visible .entry foo5(
; SM20: ld.global.u64
; SM35-LABEL: .visible .entry foo5(
; SM35: ld.global.nc.u64
define ptx_kernel void @foo5(ptr noalias readonly %from, ptr %to) {
  %1 = load i64, ptr %from
  store i64 %1, ptr %to
  ret void
}

; i128 is non standard integer in nvptx64
; (the checks below expect it to be loaded as two 64-bit loads)
; SM20-LABEL: .visible .entry foo6(
; SM20: ld.global.u64
; SM20: ld.global.u64
; SM35-LABEL: .visible .entry foo6(
; SM35: ld.global.nc.u64
; SM35: ld.global.nc.u64
define ptx_kernel void @foo6(ptr noalias readonly %from, ptr %to) {
  %1 = load i128, ptr %from
  store i128 %1, ptr %to
  ret void
}

; --- Two-element vector loads -----------------------------------------------

; <2 x i8>
; SM20-LABEL: .visible .entry foo7(
; SM20: ld.global.v2.u8
; SM35-LABEL: .visible .entry foo7(
; SM35: ld.global.nc.v2.u8
define ptx_kernel void @foo7(ptr noalias readonly %from, ptr %to) {
  %1 = load <2 x i8>, ptr %from
  store <2 x i8> %1, ptr %to
  ret void
}

; <2 x i16>: the checks expect a single 32-bit scalar load, not a v2 load.
; SM20-LABEL: .visible .entry foo8(
; SM20: ld.global.u32
; SM35-LABEL: .visible .entry foo8(
; SM35: ld.global.nc.u32
define ptx_kernel void @foo8(ptr noalias readonly %from, ptr %to) {
  %1 = load <2 x i16>, ptr %from
  store <2 x i16> %1, ptr %to
  ret void
}

; <2 x i32>
; SM20-LABEL: .visible .entry foo9(
; SM20: ld.global.v2.u32
; SM35-LABEL: .visible .entry foo9(
; SM35: ld.global.nc.v2.u32
define ptx_kernel void @foo9(ptr noalias readonly %from, ptr %to) {
  %1 = load <2 x i32>, ptr %from
  store <2 x i32> %1, ptr %to
  ret void
}

; <2 x i64>
; SM20-LABEL: .visible .entry foo10(
; SM20: ld.global.v2.u64
; SM35-LABEL: .visible .entry foo10(
; SM35: ld.global.nc.v2.u64
define ptx_kernel void @foo10(ptr noalias readonly %from, ptr %to) {
  %1 = load <2 x i64>, ptr %from
  store <2 x i64> %1, ptr %to
  ret void
}

; <2 x float>
; SM20-LABEL: .visible .entry foo11(
; SM20: ld.global.v2.f32
; SM35-LABEL: .visible .entry foo11(
; SM35: ld.global.nc.v2.f32
define ptx_kernel void @foo11(ptr noalias readonly %from, ptr %to) {
  %1 = load <2 x float>, ptr %from
  store <2 x float> %1, ptr %to
  ret void
}

; <2 x double>
; SM20-LABEL: .visible .entry foo12(
; SM20: ld.global.v2.f64
; SM35-LABEL: .visible .entry foo12(
; SM35: ld.global.nc.v2.f64
define ptx_kernel void @foo12(ptr noalias readonly %from, ptr %to) {
  %1 = load <2 x double>, ptr %from
  store <2 x double> %1, ptr %to
  ret void
}

; --- Four-element vector loads ----------------------------------------------

; <4 x i8>: the checks expect a single 32-bit scalar load, not a v4 load.
; SM20-LABEL: .visible .entry foo13(
; SM20: ld.global.u32
; SM35-LABEL: .visible .entry foo13(
; SM35: ld.global.nc.u32
define ptx_kernel void @foo13(ptr noalias readonly %from, ptr %to) {
  %1 = load <4 x i8>, ptr %from
  store <4 x i8> %1, ptr %to
  ret void
}

; <4 x i16>
; SM20-LABEL: .visible .entry foo14(
; SM20: ld.global.v4.u16
; SM35-LABEL: .visible .entry foo14(
; SM35: ld.global.nc.v4.u16
define ptx_kernel void @foo14(ptr noalias readonly %from, ptr %to) {
  %1 = load <4 x i16>, ptr %from
  store <4 x i16> %1, ptr %to
  ret void
}

; <4 x i32>
; SM20-LABEL: .visible .entry foo15(
; SM20: ld.global.v4.u32
; SM35-LABEL: .visible .entry foo15(
; SM35: ld.global.nc.v4.u32
define ptx_kernel void @foo15(ptr noalias readonly %from, ptr %to) {
  %1 = load <4 x i32>, ptr %from
  store <4 x i32> %1, ptr %to
  ret void
}

; <4 x float>
; SM20-LABEL: .visible .entry foo16(
; SM20: ld.global.v4.f32
; SM35-LABEL: .visible .entry foo16(
; SM35: ld.global.nc.v4.f32
define ptx_kernel void @foo16(ptr noalias readonly %from, ptr %to) {
  %1 = load <4 x float>, ptr %from
  store <4 x float> %1, ptr %to
  ret void
}

; <4 x double>: the checks expect the load to be split into two v2.f64 loads.
; SM20-LABEL: .visible .entry foo17(
; SM20: ld.global.v2.f64
; SM20: ld.global.v2.f64
; SM35-LABEL: .visible .entry foo17(
; SM35: ld.global.nc.v2.f64
; SM35: ld.global.nc.v2.f64
define ptx_kernel void @foo17(ptr noalias readonly %from, ptr %to) {
  %1 = load <4 x double>, ptr %from
  store <4 x double> %1, ptr %to
  ret void
}

; --- Pointer-typed load -----------------------------------------------------

; Loading a `ptr` value: the checks expect a 64-bit integer load.
; SM20-LABEL: .visible .entry foo18(
; SM20: ld.global.u64
; SM35-LABEL: .visible .entry foo18(
; SM35: ld.global.nc.u64
define ptx_kernel void @foo18(ptr noalias readonly %from, ptr %to) {
  %1 = load ptr, ptr %from
  store ptr %1, ptr %to
  ret void
}

; --- Load through a derived pointer inside a loop ---------------------------

; Test that we can infer a cached load for a pointer induction variable.
; SM20-LABEL: .visible .entry foo19(
; SM20: ld.global.f32
; SM35-LABEL: .visible .entry foo19(
; SM35: ld.global.nc.f32
define ptx_kernel void @foo19(ptr noalias readonly %from, ptr %to, i32 %n) {
entry:
  br label %loop

loop:
  ; %i counts iterations; %sum accumulates the values loaded so far.
  %i = phi i32 [ 0, %entry ], [ %nexti, %loop ]
  %sum = phi float [ 0.0, %entry ], [ %nextsum, %loop ]
  ; The loaded address is derived from %from via a GEP on the loop counter.
  %ptr = getelementptr inbounds float, ptr %from, i32 %i
  %value = load float, ptr %ptr, align 4
  %nextsum = fadd float %value, %sum
  %nexti = add nsw i32 %i, 1
  %exitcond = icmp eq i32 %nexti, %n
  br i1 %exitcond, label %exit, label %loop

exit:
  store float %nextsum, ptr %to
  ret void
}

; --- Non-kernel functions: no cached load expected --------------------------

; This test captures the case of a non-kernel function. In a
; non-kernel function, without interprocedural analysis, we do not
; know that the parameter is global. We also do not know that the
; pointed-to memory is never written to (for the duration of the
; kernel). For both reasons, we cannot use a cached load here.
; SM20-LABEL: notkernel(
; SM20: ld.f32
; SM35-LABEL: notkernel(
; SM35: ld.f32
define void @notkernel(ptr noalias readonly %from, ptr %to) {
  %1 = load float, ptr %from
  store float %1, ptr %to
  ret void
}

; As @notkernel, but with the parameter explicitly marked as global. We still
; do not know that the parameter is never written to (for the duration of the
; kernel). This case does not currently come up normally since we do not infer
; that pointers are global interprocedurally as of 2015-08-05.
; SM20-LABEL: notkernel2(
; SM20: ld.global.f32
; SM35-LABEL: notkernel2(
; SM35: ld.global.f32
define void @notkernel2(ptr addrspace(1) noalias readonly %from, ptr %to) {
  %1 = load float, ptr addrspace(1) %from
  store float %1, ptr %to
  ret void
}