; xref: /llvm-project/llvm/test/CodeGen/AMDGPU/vector-alloca.ll (revision 2cbfe4a823020b2efe53d32ad7eccbc5a037943f)
;
; Exercises 4-element allocas in address space 5 that are filled with
; constants and then read (and in one case written) through an index.
;   - The llc runs for verde and tonga, with promote-alloca disabled and
;     enabled, only check that each labelled function is emitted.
;   - The r600/redwood run additionally checks the EG instruction patterns.
;   - The opt run checks the IR that remains after amdgpu-promote-alloca,
;     sroa and instcombine.
; RUN: llc -mtriple=amdgcn-- -mcpu=verde -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC %s
; RUN: llc -mtriple=amdgcn-- -mcpu=verde -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC %s
; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC %s
; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC %s
; RUN: llc -mtriple=r600-- -mcpu=redwood < %s | FileCheck --check-prefixes=EG,FUNC %s
; RUN: opt -S -mtriple=amdgcn-- -passes='amdgpu-promote-alloca,sroa,instcombine' < %s | FileCheck -check-prefix=OPT %s

; "A5" makes address space 5 the alloca address space, matching the
; addrspace(5) allocas used by every function below.
target datalayout = "A5"

; vector_read: fill a [4 x i32] alloca with the constants 0, 1, 2, 3 and load
; the element selected by the runtime value %index.
; After the opt pipeline the alloca is expected to be gone, leaving a single
; extractelement from the constant vector <0, 1, 2, 3>.

; OPT-LABEL: @vector_read(
; OPT: %0 = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
; OPT: store i32 %0, ptr addrspace(1) %out, align 4

; FUNC-LABEL: {{^}}vector_read:
; EG: MOV
; EG: MOV
; EG: MOV
; EG: MOV
; EG: MOVA_INT
define amdgpu_kernel void @vector_read(ptr addrspace(1) %out, i32 %index) {
entry:
  %tmp = alloca [4 x i32], addrspace(5)
  ; Pointers to elements 1..3; element 0 is addressed through %tmp itself.
  %y = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1
  %z = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 2
  %w = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 3
  store i32 0, ptr addrspace(5) %tmp
  store i32 1, ptr addrspace(5) %y
  store i32 2, ptr addrspace(5) %z
  store i32 3, ptr addrspace(5) %w
  ; Dynamically indexed read of the array, written out to %out.
  %tmp1 = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 %index
  %tmp2 = load i32, ptr addrspace(5) %tmp1
  store i32 %tmp2, ptr addrspace(1) %out
  ret void
}

; vector_write: zero a [4 x i32] alloca, store 1 at the runtime index
; %w_index, then load the element at the runtime index %r_index.
; After the opt pipeline this is expected to become an insertelement into a
; zero vector followed by an extractelement.

; OPT-LABEL: @vector_write(
; OPT: %0 = insertelement <4 x i32> zeroinitializer, i32 1, i32 %w_index
; OPT: %1 = extractelement <4 x i32> %0, i32 %r_index
; OPT: store i32 %1, ptr addrspace(1) %out, align 4

; FUNC-LABEL: {{^}}vector_write:
; EG: MOV
; EG: MOV
; EG: MOV
; EG: MOV
; EG: MOVA_INT
; EG: MOVA_INT
define amdgpu_kernel void @vector_write(ptr addrspace(1) %out, i32 %w_index, i32 %r_index) {
entry:
  %tmp = alloca [4 x i32], addrspace(5)
  ; Pointers to elements 1..3; element 0 is addressed through %tmp itself.
  %y = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1
  %z = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 2
  %w = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 3
  store i32 0, ptr addrspace(5) %tmp
  store i32 0, ptr addrspace(5) %y
  store i32 0, ptr addrspace(5) %z
  store i32 0, ptr addrspace(5) %w
  ; Dynamically indexed write ...
  %tmp1 = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 %w_index
  store i32 1, ptr addrspace(5) %tmp1
  ; ... followed by a dynamically indexed read of the same array.
  %tmp2 = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 %r_index
  %tmp3 = load i32, ptr addrspace(5) %tmp2
  store i32 %tmp3, ptr addrspace(1) %out
  ret void
}

; bitcast_gep: zero a [4 x i32] alloca and load element 1 through a constant
; index. %w_index and %r_index are not used by the body.
;
; This test should be optimized to:
; store i32 0, ptr addrspace(1) %out

; OPT-LABEL: @bitcast_gep(
; OPT: store i32 0, ptr addrspace(1) %out, align 4

; FUNC-LABEL: {{^}}bitcast_gep:
; EG: STORE_RAW
define amdgpu_kernel void @bitcast_gep(ptr addrspace(1) %out, i32 %w_index, i32 %r_index) {
entry:
  %tmp = alloca [4 x i32], addrspace(5)
  ; Pointers to elements 1..3; element 0 is addressed through %tmp itself.
  %y = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1
  %z = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 2
  %w = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 3
  store i32 0, ptr addrspace(5) %tmp
  store i32 0, ptr addrspace(5) %y
  store i32 0, ptr addrspace(5) %z
  store i32 0, ptr addrspace(5) %w
  ; Constant-index read of element 1 (same address as %y), which holds 0.
  %tmp1 = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1
  %tmp4 = load i32, ptr addrspace(5) %tmp1
  store i32 %tmp4, ptr addrspace(1) %out
  ret void
}

; vector_read_bitcast_gep: like vector_read, but element 0 is written with a
; float store (1.0) into the [4 x i32] alloca while the remaining elements and
; the dynamically indexed load use i32.
; The expected constant vector holds 1065353216 in lane 0, which is the i32
; bit pattern of float 1.0 (0x3F800000).

; OPT-LABEL: @vector_read_bitcast_gep(
; OPT: %0 = extractelement <4 x i32> <i32 1065353216, i32 1, i32 2, i32 3>, i32 %index
; OPT: store i32 %0, ptr addrspace(1) %out, align 4
define amdgpu_kernel void @vector_read_bitcast_gep(ptr addrspace(1) %out, i32 %index) {
entry:
  %tmp = alloca [4 x i32], addrspace(5)
  ; Pointers to elements 1..3; element 0 is addressed through %tmp itself.
  %y = getelementptr inbounds [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1
  %z = getelementptr inbounds [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 2
  %w = getelementptr inbounds [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 3
  ; Type-punned store: a float written into an i32 array slot.
  store float 1.0, ptr addrspace(5) %tmp
  store i32 1, ptr addrspace(5) %y
  store i32 2, ptr addrspace(5) %z
  store i32 3, ptr addrspace(5) %w
  ; Dynamically indexed i32 read of the array, written out to %out.
  %tmp1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 %index
  %tmp2 = load i32, ptr addrspace(5) %tmp1
  store i32 %tmp2, ptr addrspace(1) %out
  ret void
}

; vector_read_bitcast_alloca: the alloca is declared as [4 x i32], but every
; access treats it as [4 x float]: the GEPs, the four stores and the
; dynamically indexed load all use float.
; After the opt pipeline a single extractelement from a constant <4 x float>
; vector is expected.

; OPT-LABEL: @vector_read_bitcast_alloca(
; OPT: %0 = extractelement <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 4.000000e+00>, i32 %index
; OPT: store float %0, ptr addrspace(1) %out, align 4
define amdgpu_kernel void @vector_read_bitcast_alloca(ptr addrspace(1) %out, i32 %index) {
entry:
  ; Declared element type (i32) differs from the accessed type (float).
  %tmp = alloca [4 x i32], addrspace(5)
  ; Pointers to elements 1..3; element 0 is addressed through %tmp itself.
  %y = getelementptr inbounds [4 x float], ptr addrspace(5) %tmp, i32 0, i32 1
  %z = getelementptr inbounds [4 x float], ptr addrspace(5) %tmp, i32 0, i32 2
  %w = getelementptr inbounds [4 x float], ptr addrspace(5) %tmp, i32 0, i32 3
  store float 0.0, ptr addrspace(5) %tmp
  store float 1.0, ptr addrspace(5) %y
  store float 2.0, ptr addrspace(5) %z
  store float 4.0, ptr addrspace(5) %w
  ; Dynamically indexed float read of the array, written out to %out.
  %tmp1 = getelementptr inbounds [4 x float], ptr addrspace(5) %tmp, i32 0, i32 %index
  %tmp2 = load float, ptr addrspace(5) %tmp1
  store float %tmp2, ptr addrspace(1) %out
  ret void
}

; The pointer arguments in local address space should not affect promotion to vector.
;
; vector_read_with_local_arg: same body as vector_read, with an extra
; addrspace(3) pointer argument (%stopper) that the body never uses. The
; expected opt output is the same extractelement from <0, 1, 2, 3>.

; OPT-LABEL: @vector_read_with_local_arg(
; OPT: %0 = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
; OPT: store i32 %0, ptr addrspace(1) %out, align 4
define amdgpu_kernel void @vector_read_with_local_arg(ptr addrspace(3) %stopper, ptr addrspace(1) %out, i32 %index) {
entry:
  %tmp = alloca [4 x i32], addrspace(5)
  ; Pointers to elements 1..3; element 0 is addressed through %tmp itself.
  %y = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1
  %z = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 2
  %w = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 3
  store i32 0, ptr addrspace(5) %tmp
  store i32 1, ptr addrspace(5) %y
  store i32 2, ptr addrspace(5) %z
  store i32 3, ptr addrspace(5) %w
  ; Dynamically indexed read of the array, written out to %out.
  %tmp1 = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 %index
  %tmp2 = load i32, ptr addrspace(5) %tmp1
  store i32 %tmp2, ptr addrspace(1) %out
  ret void
}
