xref: /llvm-project/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll (revision 4583f6d3443c8dc6605c868724e3743161954210)
; Verifies which PTX load instruction llc selects for loads through pointer
; parameters marked `noalias readonly`. The checks below expect a plain
; `ld.global` on sm_20 and the `.nc` form (`ld.global.nc`) on sm_35 for kernel
; functions, and no `.nc` form for non-kernel functions.
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -verify-machineinstrs | FileCheck -check-prefix=SM20 %s
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_35 -verify-machineinstrs | FileCheck -check-prefix=SM35 %s
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_35 | %ptxas-verify %}

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
target triple = "nvptx64-unknown-unknown"

; Scalar float load through a `noalias readonly` kernel parameter: the checks
; expect ld.global.f32 on sm_20 and ld.global.nc.f32 on sm_35.
; SM20-LABEL: .visible .entry foo1(
; SM20: ld.global.f32
; SM35-LABEL: .visible .entry foo1(
; SM35: ld.global.nc.f32
define ptx_kernel void @foo1(ptr noalias readonly %from, ptr %to) {
  %1 = load float, ptr %from
  store float %1, ptr %to
  ret void
}

; Scalar double load through a `noalias readonly` kernel parameter: the checks
; expect ld.global.f64 on sm_20 and ld.global.nc.f64 on sm_35.
; SM20-LABEL: .visible .entry foo2(
; SM20: ld.global.f64
; SM35-LABEL: .visible .entry foo2(
; SM35: ld.global.nc.f64
define ptx_kernel void @foo2(ptr noalias readonly %from, ptr %to) {
  %1 = load double, ptr %from
  store double %1, ptr %to
  ret void
}

; Scalar i16 load through a `noalias readonly` kernel parameter: the checks
; expect ld.global.u16 on sm_20 and ld.global.nc.u16 on sm_35.
; SM20-LABEL: .visible .entry foo3(
; SM20: ld.global.u16
; SM35-LABEL: .visible .entry foo3(
; SM35: ld.global.nc.u16
define ptx_kernel void @foo3(ptr noalias readonly %from, ptr %to) {
  %1 = load i16, ptr %from
  store i16 %1, ptr %to
  ret void
}

; Scalar i32 load through a `noalias readonly` kernel parameter: the checks
; expect ld.global.u32 on sm_20 and ld.global.nc.u32 on sm_35.
; SM20-LABEL: .visible .entry foo4(
; SM20: ld.global.u32
; SM35-LABEL: .visible .entry foo4(
; SM35: ld.global.nc.u32
define ptx_kernel void @foo4(ptr noalias readonly %from, ptr %to) {
  %1 = load i32, ptr %from
  store i32 %1, ptr %to
  ret void
}

; Scalar i64 load through a `noalias readonly` kernel parameter: the checks
; expect ld.global.u64 on sm_20 and ld.global.nc.u64 on sm_35.
; SM20-LABEL: .visible .entry foo5(
; SM20: ld.global.u64
; SM35-LABEL: .visible .entry foo5(
; SM35: ld.global.nc.u64
define ptx_kernel void @foo5(ptr noalias readonly %from, ptr %to) {
  %1 = load i64, ptr %from
  store i64 %1, ptr %to
  ret void
}

; i128 is not a standard integer type on nvptx64; the checks expect the load
; to appear as two 64-bit loads (ld.global.u64 on sm_20, ld.global.nc.u64 on
; sm_35).
; SM20-LABEL: .visible .entry foo6(
; SM20: ld.global.u64
; SM20: ld.global.u64
; SM35-LABEL: .visible .entry foo6(
; SM35: ld.global.nc.u64
; SM35: ld.global.nc.u64
define ptx_kernel void @foo6(ptr noalias readonly %from, ptr %to) {
  %1 = load i128, ptr %from
  store i128 %1, ptr %to
  ret void
}

; <2 x i8> load through a `noalias readonly` kernel parameter: the checks
; expect ld.global.v2.u8 on sm_20 and ld.global.nc.v2.u8 on sm_35.
; SM20-LABEL: .visible .entry foo7(
; SM20: ld.global.v2.u8
; SM35-LABEL: .visible .entry foo7(
; SM35: ld.global.nc.v2.u8
define ptx_kernel void @foo7(ptr noalias readonly %from, ptr %to) {
  %1 = load <2 x i8>, ptr %from
  store <2 x i8> %1, ptr %to
  ret void
}

; <2 x i16> load through a `noalias readonly` kernel parameter: the checks
; expect a single 32-bit load (ld.global.u32 on sm_20, ld.global.nc.u32 on
; sm_35) rather than a .v2 vector load.
; SM20-LABEL: .visible .entry foo8(
; SM20: ld.global.u32
; SM35-LABEL: .visible .entry foo8(
; SM35: ld.global.nc.u32
define ptx_kernel void @foo8(ptr noalias readonly %from, ptr %to) {
  %1 = load <2 x i16>, ptr %from
  store <2 x i16> %1, ptr %to
  ret void
}

; <2 x i32> load through a `noalias readonly` kernel parameter: the checks
; expect ld.global.v2.u32 on sm_20 and ld.global.nc.v2.u32 on sm_35.
; SM20-LABEL: .visible .entry foo9(
; SM20: ld.global.v2.u32
; SM35-LABEL: .visible .entry foo9(
; SM35: ld.global.nc.v2.u32
define ptx_kernel void @foo9(ptr noalias readonly %from, ptr %to) {
  %1 = load <2 x i32>, ptr %from
  store <2 x i32> %1, ptr %to
  ret void
}

; <2 x i64> load through a `noalias readonly` kernel parameter: the checks
; expect ld.global.v2.u64 on sm_20 and ld.global.nc.v2.u64 on sm_35.
; SM20-LABEL: .visible .entry foo10(
; SM20: ld.global.v2.u64
; SM35-LABEL: .visible .entry foo10(
; SM35: ld.global.nc.v2.u64
define ptx_kernel void @foo10(ptr noalias readonly %from, ptr %to) {
  %1 = load <2 x i64>, ptr %from
  store <2 x i64> %1, ptr %to
  ret void
}

; <2 x float> load through a `noalias readonly` kernel parameter: the checks
; expect ld.global.v2.f32 on sm_20 and ld.global.nc.v2.f32 on sm_35.
; SM20-LABEL: .visible .entry foo11(
; SM20: ld.global.v2.f32
; SM35-LABEL: .visible .entry foo11(
; SM35: ld.global.nc.v2.f32
define ptx_kernel void @foo11(ptr noalias readonly %from, ptr %to) {
  %1 = load <2 x float>, ptr %from
  store <2 x float> %1, ptr %to
  ret void
}

; <2 x double> load through a `noalias readonly` kernel parameter: the checks
; expect ld.global.v2.f64 on sm_20 and ld.global.nc.v2.f64 on sm_35.
; SM20-LABEL: .visible .entry foo12(
; SM20: ld.global.v2.f64
; SM35-LABEL: .visible .entry foo12(
; SM35: ld.global.nc.v2.f64
define ptx_kernel void @foo12(ptr noalias readonly %from, ptr %to) {
  %1 = load <2 x double>, ptr %from
  store <2 x double> %1, ptr %to
  ret void
}

; <4 x i8> load through a `noalias readonly` kernel parameter: the checks
; expect a single 32-bit load (ld.global.u32 on sm_20, ld.global.nc.u32 on
; sm_35) rather than a .v4 vector load.
; SM20-LABEL: .visible .entry foo13(
; SM20: ld.global.u32
; SM35-LABEL: .visible .entry foo13(
; SM35: ld.global.nc.u32
define ptx_kernel void @foo13(ptr noalias readonly %from, ptr %to) {
  %1 = load <4 x i8>, ptr %from
  store <4 x i8> %1, ptr %to
  ret void
}

; <4 x i16> load through a `noalias readonly` kernel parameter: the checks
; expect ld.global.v4.u16 on sm_20 and ld.global.nc.v4.u16 on sm_35.
; SM20-LABEL: .visible .entry foo14(
; SM20: ld.global.v4.u16
; SM35-LABEL: .visible .entry foo14(
; SM35: ld.global.nc.v4.u16
define ptx_kernel void @foo14(ptr noalias readonly %from, ptr %to) {
  %1 = load <4 x i16>, ptr %from
  store <4 x i16> %1, ptr %to
  ret void
}

; <4 x i32> load through a `noalias readonly` kernel parameter: the checks
; expect ld.global.v4.u32 on sm_20 and ld.global.nc.v4.u32 on sm_35.
; SM20-LABEL: .visible .entry foo15(
; SM20: ld.global.v4.u32
; SM35-LABEL: .visible .entry foo15(
; SM35: ld.global.nc.v4.u32
define ptx_kernel void @foo15(ptr noalias readonly %from, ptr %to) {
  %1 = load <4 x i32>, ptr %from
  store <4 x i32> %1, ptr %to
  ret void
}

; <4 x float> load through a `noalias readonly` kernel parameter: the checks
; expect ld.global.v4.f32 on sm_20 and ld.global.nc.v4.f32 on sm_35.
; SM20-LABEL: .visible .entry foo16(
; SM20: ld.global.v4.f32
; SM35-LABEL: .visible .entry foo16(
; SM35: ld.global.nc.v4.f32
define ptx_kernel void @foo16(ptr noalias readonly %from, ptr %to) {
  %1 = load <4 x float>, ptr %from
  store <4 x float> %1, ptr %to
  ret void
}

; <4 x double> load through a `noalias readonly` kernel parameter: the checks
; expect two .v2.f64 loads (ld.global.v2.f64 on sm_20, ld.global.nc.v2.f64 on
; sm_35).
; SM20-LABEL: .visible .entry foo17(
; SM20: ld.global.v2.f64
; SM20: ld.global.v2.f64
; SM35-LABEL: .visible .entry foo17(
; SM35: ld.global.nc.v2.f64
; SM35: ld.global.nc.v2.f64
define ptx_kernel void @foo17(ptr noalias readonly %from, ptr %to) {
  %1 = load <4 x double>, ptr %from
  store <4 x double> %1, ptr %to
  ret void
}

; Pointer-typed load through a `noalias readonly` kernel parameter: the checks
; expect a 64-bit integer load (ld.global.u64 on sm_20, ld.global.nc.u64 on
; sm_35).
; SM20-LABEL: .visible .entry foo18(
; SM20: ld.global.u64
; SM35-LABEL: .visible .entry foo18(
; SM35: ld.global.nc.u64
define ptx_kernel void @foo18(ptr noalias readonly %from, ptr %to) {
  %1 = load ptr, ptr %from
  store ptr %1, ptr %to
  ret void
}

; Test that we can infer a cached load for a pointer induction variable.
; The loop sums %n floats read from %from (indexed by %i) and stores the total
; to %to; the load inside the loop is the one the checks below look for.
; SM20-LABEL: .visible .entry foo19(
; SM20: ld.global.f32
; SM35-LABEL: .visible .entry foo19(
; SM35: ld.global.nc.f32
define ptx_kernel void @foo19(ptr noalias readonly %from, ptr %to, i32 %n) {
entry:
  br label %loop

loop:
  %i = phi i32 [ 0, %entry ], [ %nexti, %loop ]
  %sum = phi float [ 0.0, %entry ], [ %nextsum, %loop ]
  %ptr = getelementptr inbounds float, ptr %from, i32 %i
  %value = load float, ptr %ptr, align 4
  %nextsum = fadd float %value, %sum
  %nexti = add nsw i32 %i, 1
  %exitcond = icmp eq i32 %nexti, %n
  br i1 %exitcond, label %exit, label %loop

exit:
  store float %nextsum, ptr %to
  ret void
}

; This test captures the case of a non-kernel function. In a
; non-kernel function, without interprocedural analysis, we do not
; know that the parameter is global. We also do not know that the
; pointed-to memory is never written to (for the duration of the
; kernel). For both reasons, we cannot use a cached load here.
; Both targets are therefore expected to emit a generic ld.f32.
; SM20-LABEL: notkernel(
; SM20: ld.f32
; SM35-LABEL: notkernel(
; SM35: ld.f32
define void @notkernel(ptr noalias readonly %from, ptr %to) {
  %1 = load float, ptr %from
  store float %1, ptr %to
  ret void
}

; As @notkernel, but with the parameter explicitly marked as global. We still
; do not know that the parameter is never written to (for the duration of the
; kernel). This case does not currently come up normally since we do not infer
; that pointers are global interprocedurally as of 2015-08-05.
; Both targets are therefore expected to emit ld.global.f32 without the .nc
; qualifier.
; SM20-LABEL: notkernel2(
; SM20: ld.global.f32
; SM35-LABEL: notkernel2(
; SM35: ld.global.f32
define void @notkernel2(ptr addrspace(1) noalias readonly %from, ptr %to) {
  %1 = load float, ptr addrspace(1) %from
  store float %1, ptr %to
  ret void
}
