; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_35 -verify-machineinstrs | FileCheck %s
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_35 | %ptxas-verify %}

; Check that invariant loads from the global addrspace are lowered to
; ld.global.nc.

; Basic case: a scalar i32 !invariant.load from addrspace(1) selects ld.global.nc.
define i32 @ld_global(ptr addrspace(1) %ptr) {
; CHECK-LABEL: ld_global(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<2>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.u64 %rd1, [ld_global_param_0];
; CHECK-NEXT:    ld.global.nc.u32 %r1, [%rd1];
; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
; CHECK-NEXT:    ret;
  %a = load i32, ptr addrspace(1) %ptr, !invariant.load !0
  ret i32 %a
}
define half @ld_global_v2f16(ptr addrspace(1) %ptr) {
; Load of v2f16 is weird. We consider it to be a legal type, which happens to be
; loaded/stored as a 32-bit scalar.
; CHECK-LABEL: ld_global_v2f16(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<4>;
; CHECK-NEXT:    .reg .b32 %r<2>;
; CHECK-NEXT:    .reg .f32 %f<4>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.u64 %rd1, [ld_global_v2f16_param_0];
; CHECK-NEXT:    ld.global.nc.u32 %r1, [%rd1];
; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
; CHECK-NEXT:    cvt.f32.f16 %f1, %rs2;
; CHECK-NEXT:    cvt.f32.f16 %f2, %rs1;
; CHECK-NEXT:    add.rn.f32 %f3, %f2, %f1;
; CHECK-NEXT:    cvt.rn.f16.f32 %rs3, %f3;
; CHECK-NEXT:    st.param.b16 [func_retval0], %rs3;
; CHECK-NEXT:    ret;
  %a = load <2 x half>, ptr addrspace(1) %ptr, !invariant.load !0
  %v1 = extractelement <2 x half> %a, i32 0
  %v2 = extractelement <2 x half> %a, i32 1
  %sum = fadd half %v1, %v2
  ret half %sum
}
define half @ld_global_v4f16(ptr addrspace(1) %ptr) {
; Larger f16 vectors may be split into individual f16 elements and multiple
; loads/stores may be vectorized using f16 element type. Practically it's
; limited to v4 variant only.
; CHECK-LABEL: ld_global_v4f16(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<8>;
; CHECK-NEXT:    .reg .f32 %f<10>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.u64 %rd1, [ld_global_v4f16_param_0];
; CHECK-NEXT:    ld.global.nc.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
; CHECK-NEXT:    cvt.f32.f16 %f1, %rs2;
; CHECK-NEXT:    cvt.f32.f16 %f2, %rs1;
; CHECK-NEXT:    add.rn.f32 %f3, %f2, %f1;
; CHECK-NEXT:    cvt.rn.f16.f32 %rs5, %f3;
; CHECK-NEXT:    cvt.f32.f16 %f4, %rs4;
; CHECK-NEXT:    cvt.f32.f16 %f5, %rs3;
; CHECK-NEXT:    add.rn.f32 %f6, %f5, %f4;
; CHECK-NEXT:    cvt.rn.f16.f32 %rs6, %f6;
; CHECK-NEXT:    cvt.f32.f16 %f7, %rs6;
; CHECK-NEXT:    cvt.f32.f16 %f8, %rs5;
; CHECK-NEXT:    add.rn.f32 %f9, %f8, %f7;
; CHECK-NEXT:    cvt.rn.f16.f32 %rs7, %f9;
; CHECK-NEXT:    st.param.b16 [func_retval0], %rs7;
; CHECK-NEXT:    ret;
  %a = load <4 x half>, ptr addrspace(1) %ptr, !invariant.load !0
  %v1 = extractelement <4 x half> %a, i32 0
  %v2 = extractelement <4 x half> %a, i32 1
  %v3 = extractelement <4 x half> %a, i32 2
  %v4 = extractelement <4 x half> %a, i32 3
  %sum1 = fadd half %v1, %v2
  %sum2 = fadd half %v3, %v4
  %sum = fadd half %sum1, %sum2
  ret half %sum
}
define half @ld_global_v8f16(ptr addrspace(1) %ptr) {
; Larger vectors are, again, loaded as v4i32. PTX has no v8 variants of loads/stores,
; so load/store vectorizer has to convert v8f16 -> v4 x v2f16.
; CHECK-LABEL: ld_global_v8f16(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<8>;
; CHECK-NEXT:    .reg .b32 %r<5>;
; CHECK-NEXT:    .reg .f32 %f<10>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.u64 %rd1, [ld_global_v8f16_param_0];
; CHECK-NEXT:    ld.global.nc.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    { .reg .b16 tmp; mov.b32 {%rs1, tmp}, %r3; }
; CHECK-NEXT:    { .reg .b16 tmp; mov.b32 {%rs2, tmp}, %r4; }
; CHECK-NEXT:    { .reg .b16 tmp; mov.b32 {%rs3, tmp}, %r1; }
; CHECK-NEXT:    { .reg .b16 tmp; mov.b32 {%rs4, tmp}, %r2; }
; CHECK-NEXT:    cvt.f32.f16 %f1, %rs4;
; CHECK-NEXT:    cvt.f32.f16 %f2, %rs3;
; CHECK-NEXT:    add.rn.f32 %f3, %f2, %f1;
; CHECK-NEXT:    cvt.rn.f16.f32 %rs5, %f3;
; CHECK-NEXT:    cvt.f32.f16 %f4, %rs2;
; CHECK-NEXT:    cvt.f32.f16 %f5, %rs1;
; CHECK-NEXT:    add.rn.f32 %f6, %f5, %f4;
; CHECK-NEXT:    cvt.rn.f16.f32 %rs6, %f6;
; CHECK-NEXT:    cvt.f32.f16 %f7, %rs6;
; CHECK-NEXT:    cvt.f32.f16 %f8, %rs5;
; CHECK-NEXT:    add.rn.f32 %f9, %f8, %f7;
; CHECK-NEXT:    cvt.rn.f16.f32 %rs7, %f9;
; CHECK-NEXT:    st.param.b16 [func_retval0], %rs7;
; CHECK-NEXT:    ret;
  %a = load <8 x half>, ptr addrspace(1) %ptr, !invariant.load !0
  %v1 = extractelement <8 x half> %a, i32 0
  %v2 = extractelement <8 x half> %a, i32 2
  %v3 = extractelement <8 x half> %a, i32 4
  %v4 = extractelement <8 x half> %a, i32 6
  %sum1 = fadd half %v1, %v2
  %sum2 = fadd half %v3, %v4
  %sum = fadd half %sum1, %sum2
  ret half %sum
}
; v8i8 is loaded as a v2.u32 non-coherent load, with elements extracted via bfe.
define i8 @ld_global_v8i8(ptr addrspace(1) %ptr) {
; CHECK-LABEL: ld_global_v8i8(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<8>;
; CHECK-NEXT:    .reg .b32 %r<9>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.u64 %rd1, [ld_global_v8i8_param_0];
; CHECK-NEXT:    ld.global.nc.v2.u32 {%r1, %r2}, [%rd1];
; CHECK-NEXT:    bfe.u32 %r3, %r2, 16, 8;
; CHECK-NEXT:    cvt.u16.u32 %rs1, %r3;
; CHECK-NEXT:    bfe.u32 %r4, %r2, 0, 8;
; CHECK-NEXT:    cvt.u16.u32 %rs2, %r4;
; CHECK-NEXT:    bfe.u32 %r5, %r1, 16, 8;
; CHECK-NEXT:    cvt.u16.u32 %rs3, %r5;
; CHECK-NEXT:    bfe.u32 %r6, %r1, 0, 8;
; CHECK-NEXT:    cvt.u16.u32 %rs4, %r6;
; CHECK-NEXT:    add.s16 %rs5, %rs4, %rs3;
; CHECK-NEXT:    add.s16 %rs6, %rs2, %rs1;
; CHECK-NEXT:    add.s16 %rs7, %rs5, %rs6;
; CHECK-NEXT:    cvt.u32.u16 %r7, %rs7;
; CHECK-NEXT:    and.b32 %r8, %r7, 255;
; CHECK-NEXT:    st.param.b32 [func_retval0], %r8;
; CHECK-NEXT:    ret;
  %a = load <8 x i8>, ptr addrspace(1) %ptr, !invariant.load !0
  %v1 = extractelement <8 x i8> %a, i32 0
  %v2 = extractelement <8 x i8> %a, i32 2
  %v3 = extractelement <8 x i8> %a, i32 4
  %v4 = extractelement <8 x i8> %a, i32 6
  %sum1 = add i8 %v1, %v2
  %sum2 = add i8 %v3, %v4
  %sum = add i8 %sum1, %sum2
  ret i8 %sum
}
; v16i8 is loaded as a v4.u32 non-coherent load, with elements extracted via bfe.
define i8 @ld_global_v16i8(ptr addrspace(1) %ptr) {
; CHECK-LABEL: ld_global_v16i8(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<16>;
; CHECK-NEXT:    .reg .b32 %r<15>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.u64 %rd1, [ld_global_v16i8_param_0];
; CHECK-NEXT:    ld.global.nc.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    bfe.u32 %r5, %r4, 16, 8;
; CHECK-NEXT:    cvt.u16.u32 %rs1, %r5;
; CHECK-NEXT:    bfe.u32 %r6, %r4, 0, 8;
; CHECK-NEXT:    cvt.u16.u32 %rs2, %r6;
; CHECK-NEXT:    bfe.u32 %r7, %r3, 16, 8;
; CHECK-NEXT:    cvt.u16.u32 %rs3, %r7;
; CHECK-NEXT:    bfe.u32 %r8, %r3, 0, 8;
; CHECK-NEXT:    cvt.u16.u32 %rs4, %r8;
; CHECK-NEXT:    bfe.u32 %r9, %r2, 16, 8;
; CHECK-NEXT:    cvt.u16.u32 %rs5, %r9;
; CHECK-NEXT:    bfe.u32 %r10, %r2, 0, 8;
; CHECK-NEXT:    cvt.u16.u32 %rs6, %r10;
; CHECK-NEXT:    bfe.u32 %r11, %r1, 16, 8;
; CHECK-NEXT:    cvt.u16.u32 %rs7, %r11;
; CHECK-NEXT:    bfe.u32 %r12, %r1, 0, 8;
; CHECK-NEXT:    cvt.u16.u32 %rs8, %r12;
; CHECK-NEXT:    add.s16 %rs9, %rs8, %rs7;
; CHECK-NEXT:    add.s16 %rs10, %rs6, %rs5;
; CHECK-NEXT:    add.s16 %rs11, %rs4, %rs3;
; CHECK-NEXT:    add.s16 %rs12, %rs2, %rs1;
; CHECK-NEXT:    add.s16 %rs13, %rs9, %rs10;
; CHECK-NEXT:    add.s16 %rs14, %rs11, %rs12;
; CHECK-NEXT:    add.s16 %rs15, %rs13, %rs14;
; CHECK-NEXT:    cvt.u32.u16 %r13, %rs15;
; CHECK-NEXT:    and.b32 %r14, %r13, 255;
; CHECK-NEXT:    st.param.b32 [func_retval0], %r14;
; CHECK-NEXT:    ret;
  %a = load <16 x i8>, ptr addrspace(1) %ptr, !invariant.load !0
  %v1 = extractelement <16 x i8> %a, i32 0
  %v2 = extractelement <16 x i8> %a, i32 2
  %v3 = extractelement <16 x i8> %a, i32 4
  %v4 = extractelement <16 x i8> %a, i32 6
  %v5 = extractelement <16 x i8> %a, i32 8
  %v6 = extractelement <16 x i8> %a, i32 10
  %v7 = extractelement <16 x i8> %a, i32 12
  %v8 = extractelement <16 x i8> %a, i32 14
  %sum1 = add i8 %v1, %v2
  %sum2 = add i8 %v3, %v4
  %sum3 = add i8 %v5, %v6
  %sum4 = add i8 %v7, %v8
  %sum5 = add i8 %sum1, %sum2
  %sum6 = add i8 %sum3, %sum4
  %sum7 = add i8 %sum5, %sum6
  ret i8 %sum7
}
; v2i32 invariant load vectorizes to a single ld.global.nc.v2.u32.
define i32 @ld_global_v2i32(ptr addrspace(1) %ptr) {
; CHECK-LABEL: ld_global_v2i32(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<4>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.u64 %rd1, [ld_global_v2i32_param_0];
; CHECK-NEXT:    ld.global.nc.v2.u32 {%r1, %r2}, [%rd1];
; CHECK-NEXT:    add.s32 %r3, %r1, %r2;
; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
; CHECK-NEXT:    ret;
  %a = load <2 x i32>, ptr addrspace(1) %ptr, !invariant.load !0
  %v1 = extractelement <2 x i32> %a, i32 0
  %v2 = extractelement <2 x i32> %a, i32 1
  %sum = add i32 %v1, %v2
  ret i32 %sum
}
; v4i32 invariant load vectorizes to a single ld.global.nc.v4.u32.
define i32 @ld_global_v4i32(ptr addrspace(1) %ptr) {
; CHECK-LABEL: ld_global_v4i32(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<8>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.u64 %rd1, [ld_global_v4i32_param_0];
; CHECK-NEXT:    ld.global.nc.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    add.s32 %r5, %r1, %r2;
; CHECK-NEXT:    add.s32 %r6, %r3, %r4;
; CHECK-NEXT:    add.s32 %r7, %r5, %r6;
; CHECK-NEXT:    st.param.b32 [func_retval0], %r7;
; CHECK-NEXT:    ret;
  %a = load <4 x i32>, ptr addrspace(1) %ptr, !invariant.load !0
  %v1 = extractelement <4 x i32> %a, i32 0
  %v2 = extractelement <4 x i32> %a, i32 1
  %v3 = extractelement <4 x i32> %a, i32 2
  %v4 = extractelement <4 x i32> %a, i32 3
  %sum1 = add i32 %v1, %v2
  %sum2 = add i32 %v3, %v4
  %sum3 = add i32 %sum1, %sum2
  ret i32 %sum3
}
; Negative test: without !invariant.load metadata the load must stay a plain
; ld.global.u32 (no .nc).
define i32 @ld_not_invariant(ptr addrspace(1) %ptr) {
; CHECK-LABEL: ld_not_invariant(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<2>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.u64 %rd1, [ld_not_invariant_param_0];
; CHECK-NEXT:    ld.global.u32 %r1, [%rd1];
; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
; CHECK-NEXT:    ret;
  %a = load i32, ptr addrspace(1) %ptr
  ret i32 %a
}
; Negative test: a load from the generic addrspace(0) must use a generic ld.u32,
; not the global non-coherent form.
define i32 @ld_not_global_addrspace(ptr addrspace(0) %ptr) {
; CHECK-LABEL: ld_not_global_addrspace(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<2>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.u64 %rd1, [ld_not_global_addrspace_param_0];
; CHECK-NEXT:    ld.u32 %r1, [%rd1];
; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
; CHECK-NEXT:    ret;
  %a = load i32, ptr addrspace(0) %ptr
  ret i32 %a
}

; Empty metadata node used as the !invariant.load marker above.
!0 = !{}