xref: /llvm-project/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll (revision 462cb3cd6cecd0511ecaf0e3ebcaba455ece587d)
1; RUN: opt -S -mtriple=amdgcn- -passes=sroa %s -o %t.sroa.ll
2; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %t.sroa.ll | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-ALLOCA %s
3; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %t.sroa.ll | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-PROMOTE %s
4; RUN: opt -S -mtriple=amdgcn-- -passes='sroa,amdgpu-promote-alloca,instcombine' < %s | FileCheck -check-prefix=OPT %s
5
; Data layout for this module: the "A5" component selects address space 5 as
; the alloca address space, which is why every alloca below is written with
; addrspace(5).
6target datalayout = "A5"
7
8; OPT-LABEL: @vector_read_alloca_bitcast(
9; OPT-NOT:   alloca
10; OPT:       %0 = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
11; OPT-NEXT:  store i32 %0, ptr addrspace(1) %out, align 4
12
13; GCN-LABEL: {{^}}vector_read_alloca_bitcast:
14; GCN-ALLOCA-COUNT-4: buffer_store_dword
15; GCN-ALLOCA:         buffer_load_dword
16
17; GCN-PROMOTE: s_cmp_eq_u32 s{{[0-9]+}}, 1
18; GCN-PROMOTE: s_cselect_b64 [[CC1:[^,]+]], -1, 0
19; GCN-PROMOTE: s_cmp_lg_u32 s{{[0-9]+}}, 2
20; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND1:v[0-9]+]], 0, 1, [[CC1]]
21; GCN-PROMOTE: s_cselect_b64 vcc, -1, 0
22; GCN-PROMOTE: s_cmp_lg_u32 s{{[0-9]+}}, 3
23; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND2:v[0-9]+]], 2, [[IND1]], vcc
24; GCN-PROMOTE: s_cselect_b64 vcc, -1, 0
25; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND3:v[0-9]+]], 3, [[IND2]], vcc
26; GCN-PROMOTE: ScratchSize: 0
27
; Kernel under test: fills a [4 x i32] addrspace(5) alloca with the constants
; 0, 1, 2, 3 through constant-index stores, then loads the element selected by
; the runtime value %index and writes it to %out.
28define amdgpu_kernel void @vector_read_alloca_bitcast(ptr addrspace(1) %out, i32 %index) {
29entry:
30  %tmp = alloca [4 x i32], addrspace(5)
  ; Pointers to elements 1..3; element 0 is addressed through %tmp itself.
31  %y = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1
32  %z = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 2
33  %w = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 3
  ; Initialise element i with the value i.
34  store i32 0, ptr addrspace(5) %tmp
35  store i32 1, ptr addrspace(5) %y
36  store i32 2, ptr addrspace(5) %z
37  store i32 3, ptr addrspace(5) %w
  ; Dynamically indexed read: the only access whose index is not a constant.
38  %tmp1 = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 %index
39  %tmp2 = load i32, ptr addrspace(5) %tmp1
40  store i32 %tmp2, ptr addrspace(1) %out
41  ret void
42}
43
44; OPT-LABEL: @vector_write_alloca_bitcast(
45; OPT-NOT:   alloca
46; OPT:       %0 = insertelement <4 x i32> zeroinitializer, i32 1, i32 %w_index
47; OPT-NEXT:  %1 = extractelement <4 x i32> %0, i32 %r_index
48; OPT-NEXT:  store i32 %1, ptr addrspace(1) %out, align 4
49
50; GCN-LABEL: {{^}}vector_write_alloca_bitcast:
51; GCN-ALLOCA-COUNT-5: buffer_store_dword
52; GCN-ALLOCA:         buffer_load_dword
53
54; GCN-PROMOTE-COUNT-7: v_cndmask
55
56; GCN-PROMOTE: ScratchSize: 0
57
; Kernel under test: zero-fills a [4 x i32] addrspace(5) alloca, overwrites the
; element selected by %w_index with 1, then loads the element selected by
; %r_index and writes it to %out. Both the write and the read use a runtime
; index.
58define amdgpu_kernel void @vector_write_alloca_bitcast(ptr addrspace(1) %out, i32 %w_index, i32 %r_index) {
59entry:
60  %tmp = alloca [4 x i32], addrspace(5)
  ; Pointers to elements 1..3; element 0 is addressed through %tmp itself.
61  %y = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1
62  %z = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 2
63  %w = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 3
  ; Zero every element with constant-index stores.
64  store i32 0, ptr addrspace(5) %tmp
65  store i32 0, ptr addrspace(5) %y
66  store i32 0, ptr addrspace(5) %z
67  store i32 0, ptr addrspace(5) %w
  ; Dynamically indexed write of the value 1.
68  %tmp1 = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 %w_index
69  store i32 1, ptr addrspace(5) %tmp1
  ; Dynamically indexed read, forwarded to the output pointer.
70  %tmp2 = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 %r_index
71  %tmp3 = load i32, ptr addrspace(5) %tmp2
72  store i32 %tmp3, ptr addrspace(1) %out
73  ret void
74}
75
76; OPT-LABEL: @vector_write_read_bitcast_to_float(
77; OPT-NOT:   alloca
78; OPT: bb2:
79; OPT:  %promotealloca = phi <6 x float> [ undef, %bb ], [ %0, %bb2 ]
80; OPT:  %0 = insertelement <6 x float> %promotealloca, float %tmp71, i32 %tmp10
81; OPT: .preheader:
82; OPT:  %bc = bitcast <6 x float> %0 to <6 x i32>
83; OPT:  %1 = extractelement <6 x i32> %bc, i32 %tmp20
84
85; GCN-LABEL: {{^}}vector_write_read_bitcast_to_float:
86; GCN-ALLOCA: buffer_store_dword
87
88; GCN-PROMOTE: s_cmp_eq_u32
89
90; GCN: s_cbranch
91
92; GCN-ALLOCA: buffer_load_dword
93
94; GCN-PROMOTE: ScratchSize: 0
95
; Kernel under test: a [6 x float] addrspace(5) alloca that is only ever
; accessed as i32 (the element type of the alloca and the type of the
; loads/stores differ). A first loop copies values from %arg into the alloca at
; index (i % 6); a second loop reads the alloca at index 5 - (i % 6) and writes
; the values back to %arg.
; NOTE(review): the lifetime calls reference attribute group #2, which is not
; defined in the part of the file shown here - confirm this is intentional.
96define amdgpu_kernel void @vector_write_read_bitcast_to_float(ptr addrspace(1) %arg) {
97bb:
98  %tmp = alloca [6 x float], align 4, addrspace(5)
  ; 24 matches 6 elements of 4 bytes each.
99  call void @llvm.lifetime.start.p5(i64 24, ptr addrspace(5) %tmp) #2
100  br label %bb2
101
  ; Write loop: %tmp3 counts from 0 and the loop exits once %tmp13 reaches 1000.
102bb2:                                              ; preds = %bb2, %bb
103  %tmp3 = phi i32 [ 0, %bb ], [ %tmp13, %bb2 ]
104  %tmp4 = zext i32 %tmp3 to i64
  ; The address is computed with a float-typed GEP but the value is loaded as i32.
105  %tmp5 = getelementptr inbounds float, ptr addrspace(1) %arg, i64 %tmp4
106  %tmp7 = load i32, ptr addrspace(1) %tmp5, align 4
  ; Slot index = (counter truncated to i16) urem 6, so it is always in [0, 5].
107  %tmp8 = trunc i32 %tmp3 to i16
108  %tmp9 = urem i16 %tmp8, 6
109  %tmp10 = zext i16 %tmp9 to i32
110  %tmp11 = getelementptr inbounds [6 x float], ptr addrspace(5) %tmp, i32 0, i32 %tmp10
  ; i32 store into the float-typed alloca.
111  store i32 %tmp7, ptr addrspace(5) %tmp11, align 4
112  %tmp13 = add nuw nsw i32 %tmp3, 1
113  %tmp14 = icmp eq i32 %tmp13, 1000
114  br i1 %tmp14, label %.preheader, label %bb2
115
  ; Exit block, reached after the read loop finishes.
116bb15:                                             ; preds = %.preheader
117  call void @llvm.lifetime.end.p5(i64 24, ptr addrspace(5) %tmp) #2
118  ret void
119
  ; Read loop: %tmp16 counts from 0 and the loop exits once %tmp27 reaches 1000.
120.preheader:                                       ; preds = %.preheader, %bb2
121  %tmp16 = phi i32 [ %tmp27, %.preheader ], [ 0, %bb2 ]
  ; Slot index = 5 - ((counter truncated to i16) urem 6), i.e. reversed order.
122  %tmp17 = trunc i32 %tmp16 to i16
123  %tmp18 = urem i16 %tmp17, 6
124  %tmp19 = sub nuw nsw i16 5, %tmp18
125  %tmp20 = zext i16 %tmp19 to i32
126  %tmp21 = getelementptr inbounds [6 x float], ptr addrspace(5) %tmp, i32 0, i32 %tmp20
  ; i32 load from the float-typed alloca, written back to %arg at the counter.
127  %tmp23 = load i32, ptr addrspace(5) %tmp21, align 4
128  %tmp24 = zext i32 %tmp16 to i64
129  %tmp25 = getelementptr inbounds float, ptr addrspace(1) %arg, i64 %tmp24
130  store i32 %tmp23, ptr addrspace(1) %tmp25, align 4
131  %tmp27 = add nuw nsw i32 %tmp16, 1
132  %tmp28 = icmp eq i32 %tmp27, 1000
133  br i1 %tmp28, label %bb15, label %.preheader
134}
135
136; OPT-LABEL: @vector_write_read_bitcast_to_double(
137; OPT-NOT:   alloca
138; OPT: bb2:
139; OPT:  %promotealloca = phi <6 x double> [ undef, %bb ], [ %0, %bb2 ]
140; OPT:  %0 = insertelement <6 x double> %promotealloca, double %tmp71, i32 %tmp10
141; OPT: .preheader:
142; OPT:  %bc = bitcast <6 x double> %0 to <6 x i64>
143; OPT:  %1 = extractelement <6 x i64> %bc, i32 %tmp20
144
145; GCN-LABEL: {{^}}vector_write_read_bitcast_to_double:
146
147; GCN-ALLOCA-COUNT-2: buffer_store_dword
148; GCN-PROMOTE-COUNT-2: v_movreld_b32_e32
149
150; GCN: s_cbranch
151
152; GCN-ALLOCA-COUNT-2: buffer_load_dword
153; GCN-PROMOTE-COUNT-2: v_movrels_b32_e32
154
155; GCN-PROMOTE: ScratchSize: 0
156
; Kernel under test: 64-bit variant of the float test. A [6 x double]
; addrspace(5) alloca is only ever accessed as i64. A first loop copies values
; from %arg into the alloca at index (i % 6); a second loop reads the alloca at
; index 5 - (i % 6) and writes the values back to %arg.
; NOTE(review): the lifetime calls reference attribute group #2, which is not
; defined in the part of the file shown here - confirm this is intentional.
157define amdgpu_kernel void @vector_write_read_bitcast_to_double(ptr addrspace(1) %arg) {
158bb:
159  %tmp = alloca [6 x double], align 8, addrspace(5)
  ; 48 matches 6 elements of 8 bytes each.
160  call void @llvm.lifetime.start.p5(i64 48, ptr addrspace(5) %tmp) #2
161  br label %bb2
162
  ; Write loop: %tmp3 counts from 0 and the loop exits once %tmp13 reaches 1000.
163bb2:                                              ; preds = %bb2, %bb
164  %tmp3 = phi i32 [ 0, %bb ], [ %tmp13, %bb2 ]
165  %tmp4 = zext i32 %tmp3 to i64
  ; The address is computed with a double-typed GEP but the value is loaded as i64.
166  %tmp5 = getelementptr inbounds double, ptr addrspace(1) %arg, i64 %tmp4
167  %tmp7 = load i64, ptr addrspace(1) %tmp5, align 8
  ; Slot index = (counter truncated to i16) urem 6, so it is always in [0, 5].
168  %tmp8 = trunc i32 %tmp3 to i16
169  %tmp9 = urem i16 %tmp8, 6
170  %tmp10 = zext i16 %tmp9 to i32
171  %tmp11 = getelementptr inbounds [6 x double], ptr addrspace(5) %tmp, i32 0, i32 %tmp10
  ; i64 store into the double-typed alloca.
172  store i64 %tmp7, ptr addrspace(5) %tmp11, align 8
173  %tmp13 = add nuw nsw i32 %tmp3, 1
174  %tmp14 = icmp eq i32 %tmp13, 1000
175  br i1 %tmp14, label %.preheader, label %bb2
176
  ; Exit block, reached after the read loop finishes.
177bb15:                                             ; preds = %.preheader
178  call void @llvm.lifetime.end.p5(i64 48, ptr addrspace(5) %tmp) #2
179  ret void
180
  ; Read loop: %tmp16 counts from 0 and the loop exits once %tmp27 reaches 1000.
181.preheader:                                       ; preds = %.preheader, %bb2
182  %tmp16 = phi i32 [ %tmp27, %.preheader ], [ 0, %bb2 ]
  ; Slot index = 5 - ((counter truncated to i16) urem 6), i.e. reversed order.
183  %tmp17 = trunc i32 %tmp16 to i16
184  %tmp18 = urem i16 %tmp17, 6
185  %tmp19 = sub nuw nsw i16 5, %tmp18
186  %tmp20 = zext i16 %tmp19 to i32
187  %tmp21 = getelementptr inbounds [6 x double], ptr addrspace(5) %tmp, i32 0, i32 %tmp20
  ; i64 load from the double-typed alloca, written back to %arg at the counter.
188  %tmp23 = load i64, ptr addrspace(5) %tmp21, align 8
189  %tmp24 = zext i32 %tmp16 to i64
190  %tmp25 = getelementptr inbounds double, ptr addrspace(1) %arg, i64 %tmp24
191  store i64 %tmp23, ptr addrspace(1) %tmp25, align 8
192  %tmp27 = add nuw nsw i32 %tmp16, 1
193  %tmp28 = icmp eq i32 %tmp27, 1000
194  br i1 %tmp28, label %bb15, label %.preheader
195}
196
197; OPT-LABEL: @vector_write_read_bitcast_to_i64(
198; OPT-NOT:   alloca
199; OPT: bb2:
200; OPT:  %promotealloca = phi <6 x i64> [ undef, %bb ], [ %0, %bb2 ]
201; OPT:  %0 = insertelement <6 x i64> %promotealloca, i64 %tmp6, i32 %tmp9
202; OPT: .preheader:
203; OPT:  %1 = extractelement <6 x i64> %0, i32 %tmp18
204
205; GCN-LABEL: {{^}}vector_write_read_bitcast_to_i64:
206
207; GCN-ALLOCA-COUNT-2: buffer_store_dword
208; GCN-PROMOTE-COUNT-2: v_movreld_b32_e32
209
210; GCN: s_cbranch
211
212; GCN-ALLOCA-COUNT-2: buffer_load_dword
213; GCN-PROMOTE-COUNT-2: v_movrels_b32_e32
214
215; GCN-PROMOTE: ScratchSize: 0
216
; Kernel under test: same two-loop shape as the float/double variants, but the
; alloca is [6 x i64] and is accessed as i64, so the alloca element type and
; the access type agree. A first loop copies values from %arg into the alloca
; at index (i % 6); a second loop reads the alloca at index 5 - (i % 6) and
; writes the values back to %arg.
; NOTE(review): the lifetime calls reference attribute group #2, which is not
; defined in the part of the file shown here - confirm this is intentional.
217define amdgpu_kernel void @vector_write_read_bitcast_to_i64(ptr addrspace(1) %arg) {
218bb:
219  %tmp = alloca [6 x i64], align 8, addrspace(5)
  ; 48 matches 6 elements of 8 bytes each.
220  call void @llvm.lifetime.start.p5(i64 48, ptr addrspace(5) %tmp) #2
221  br label %bb2
222
  ; Write loop: %tmp3 counts from 0 and the loop exits once %tmp11 reaches 1000.
223bb2:                                              ; preds = %bb2, %bb
224  %tmp3 = phi i32 [ 0, %bb ], [ %tmp11, %bb2 ]
225  %tmp4 = zext i32 %tmp3 to i64
226  %tmp5 = getelementptr inbounds i64, ptr addrspace(1) %arg, i64 %tmp4
227  %tmp6 = load i64, ptr addrspace(1) %tmp5, align 8
  ; Slot index = (counter truncated to i16) urem 6, so it is always in [0, 5].
228  %tmp7 = trunc i32 %tmp3 to i16
229  %tmp8 = urem i16 %tmp7, 6
230  %tmp9 = zext i16 %tmp8 to i32
231  %tmp10 = getelementptr inbounds [6 x i64], ptr addrspace(5) %tmp, i32 0, i32 %tmp9
232  store i64 %tmp6, ptr addrspace(5) %tmp10, align 8
233  %tmp11 = add nuw nsw i32 %tmp3, 1
234  %tmp12 = icmp eq i32 %tmp11, 1000
235  br i1 %tmp12, label %.preheader, label %bb2
236
  ; Exit block, reached after the read loop finishes.
237bb13:                                             ; preds = %.preheader
238  call void @llvm.lifetime.end.p5(i64 48, ptr addrspace(5) %tmp) #2
239  ret void
240
  ; Read loop: %tmp14 counts from 0 and the loop exits once %tmp23 reaches 1000.
241.preheader:                                       ; preds = %.preheader, %bb2
242  %tmp14 = phi i32 [ %tmp23, %.preheader ], [ 0, %bb2 ]
  ; Slot index = 5 - ((counter truncated to i16) urem 6), i.e. reversed order.
243  %tmp15 = trunc i32 %tmp14 to i16
244  %tmp16 = urem i16 %tmp15, 6
245  %tmp17 = sub nuw nsw i16 5, %tmp16
246  %tmp18 = zext i16 %tmp17 to i32
247  %tmp19 = getelementptr inbounds [6 x i64], ptr addrspace(5) %tmp, i32 0, i32 %tmp18
248  %tmp20 = load i64, ptr addrspace(5) %tmp19, align 8
249  %tmp21 = zext i32 %tmp14 to i64
250  %tmp22 = getelementptr inbounds i64, ptr addrspace(1) %arg, i64 %tmp21
251  store i64 %tmp20, ptr addrspace(1) %tmp22, align 8
252  %tmp23 = add nuw nsw i32 %tmp14, 1
253  %tmp24 = icmp eq i32 %tmp23, 1000
254  br i1 %tmp24, label %bb13, label %.preheader
255}
256
257; TODO: llvm.assume can be ignored
258
259; OPT-LABEL: @vector_read_alloca_bitcast_assume(
260; OPT: %0 = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
261; OPT: store i32 %0, ptr addrspace(1) %out, align 4
262
263; GCN-LABEL: {{^}}vector_read_alloca_bitcast_assume:
264; GCN-ALLOCA-COUNT-4: buffer_store_dword
265
; Kernel under test: same stores and dynamically indexed load as
; @vector_read_alloca_bitcast, with one extra use of the alloca pointer: it is
; compared against null and the result is passed to llvm.assume.
266define amdgpu_kernel void @vector_read_alloca_bitcast_assume(ptr addrspace(1) %out, i32 %index) {
267entry:
268  %tmp = alloca [4 x i32], addrspace(5)
  ; Extra non-memory use of the alloca: assume(%tmp != null).
269  %cmp = icmp ne ptr addrspace(5) %tmp, null
270  call void @llvm.assume(i1 %cmp)
  ; Pointers to elements 1..3; element 0 is addressed through %tmp itself.
271  %y = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1
272  %z = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 2
273  %w = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 3
  ; Initialise element i with the value i.
274  store i32 0, ptr addrspace(5) %tmp
275  store i32 1, ptr addrspace(5) %y
276  store i32 2, ptr addrspace(5) %z
277  store i32 3, ptr addrspace(5) %w
  ; Dynamically indexed read, forwarded to the output pointer.
278  %tmp1 = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 %index
279  %tmp2 = load i32, ptr addrspace(5) %tmp1
280  store i32 %tmp2, ptr addrspace(1) %out
281  ret void
282}
283
284; OPT-LABEL: @vector_read_alloca_multiuse(
285; OPT-NOT:   alloca
286; OPT:       %0 = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
287; OPT-NEXT:  %add2 = add nuw nsw i32 %0, 1
288; OPT-NEXT:  store i32 %add2, ptr addrspace(1) %out, align 4
289
290; GCN-LABEL: {{^}}vector_read_alloca_multiuse:
291; GCN-ALLOCA-COUNT-4: buffer_store_dword
292; GCN-ALLOCA:         buffer_load_dword
293
294; GCN-PROMOTE: s_cmp_eq_u32 s{{[0-9]+}}, 1
295; GCN-PROMOTE: s_cselect_b64 [[CC1:[^,]+]], -1, 0
296; GCN-PROMOTE: s_cmp_lg_u32 s{{[0-9]+}}, 2
297; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND1:v[0-9]+]], 0, 1, [[CC1]]
298; GCN-PROMOTE: s_cselect_b64 vcc, -1, 0
299; GCN-PROMOTE: s_cmp_lg_u32 s{{[0-9]+}}, 3
300; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND2:v[0-9]+]], 2, [[IND1]], vcc
301; GCN-PROMOTE: s_cselect_b64 vcc, -1, 0
302; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND3:v[0-9]+]], 3, [[IND2]], vcc
303
304; GCN-PROMOTE: ScratchSize: 0
305
; Kernel under test: fills a [4 x i32] addrspace(5) alloca with 0, 1, 2, 3 and
; then reads it three times - once at the runtime %index, once at element 0
; (through %tmp) and once at element 1 (through %y, which is also used by a
; store). The three loaded values are summed and the sum is written to %out.
306define amdgpu_kernel void @vector_read_alloca_multiuse(ptr addrspace(1) %out, i32 %index) {
307entry:
308  %tmp = alloca [4 x i32], addrspace(5)
  ; Pointers to elements 1..3; element 0 is addressed through %tmp itself.
309  %y = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1
310  %z = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 2
311  %w = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 3
  ; Initialise element i with the value i.
312  store i32 0, ptr addrspace(5) %tmp
313  store i32 1, ptr addrspace(5) %y
314  store i32 2, ptr addrspace(5) %z
315  store i32 3, ptr addrspace(5) %w
  ; One dynamically indexed load followed by two constant-index loads.
316  %tmp1 = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 %index
317  %tmp2 = load i32, ptr addrspace(5) %tmp1
318  %tmp3 = load i32, ptr addrspace(5) %tmp
319  %tmp4 = load i32, ptr addrspace(5) %y
  ; Sum of the three loads; elements 0 and 1 were stored as 0 and 1 above.
320  %add1 = add i32 %tmp2, %tmp3
321  %add2 = add i32 %add1, %tmp4
322  store i32 %add2, ptr addrspace(1) %out
323  ret void
324}
325
326; OPT-LABEL: @bitcast_vector_to_vector(
327; OPT-NOT:   alloca
328; OPT:       store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr addrspace(1) %out, align 16
329
330; GCN-LABEL: {{^}}bitcast_vector_to_vector:
331; GCN: v_mov_b32_e32 v0, 1
332; GCN: v_mov_b32_e32 v1, 2
333; GCN: v_mov_b32_e32 v2, 3
334; GCN: v_mov_b32_e32 v3, 4
335
336; GCN: ScratchSize: 0
337
; Kernel under test: the alloca is declared as <4 x float> but is stored to and
; loaded from as <4 x i32>. The constant vector <1, 2, 3, 4> is round-tripped
; through the alloca and written to %out.
338define amdgpu_kernel void @bitcast_vector_to_vector(ptr addrspace(1) %out)  {
339.entry:
340  %alloca = alloca <4 x float>, align 16, addrspace(5)
  ; Whole-vector store and load using an integer vector type.
341  store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr addrspace(5) %alloca
342  %load = load <4 x i32>, ptr addrspace(5) %alloca, align 16
343  store <4 x i32> %load, ptr addrspace(1) %out
344  ret void
345}
346
347; OPT-LABEL: @vector_bitcast_from_alloca_array(
348; OPT-NOT:   alloca
349; OPT:       store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr addrspace(1) %out, align 16
350
351; GCN-LABEL: {{^}}vector_bitcast_from_alloca_array:
352; GCN: v_mov_b32_e32 v0, 1
353; GCN: v_mov_b32_e32 v1, 2
354; GCN: v_mov_b32_e32 v2, 3
355; GCN: v_mov_b32_e32 v3, 4
356
357; GCN: ScratchSize: 0
358
; Kernel under test: the alloca is declared as the array [4 x float] but is
; stored to and loaded from as the vector <4 x i32>. The constant vector
; <1, 2, 3, 4> is round-tripped through the alloca and written to %out.
359define amdgpu_kernel void @vector_bitcast_from_alloca_array(ptr addrspace(1) %out)  {
360.entry:
361  %alloca = alloca [4 x float], align 16, addrspace(5)
  ; Whole-object vector store and load on an array-typed alloca.
362  store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr addrspace(5) %alloca
363  %load = load <4 x i32>, ptr addrspace(5) %alloca, align 16
364  store <4 x i32> %load, ptr addrspace(1) %out
365  ret void
366}
367
368; OPT-LABEL: @vector_bitcast_to_array_from_alloca_array(
369; OPT-NOT:   alloca
370; OPT-NEXT: store i32 1, ptr addrspace(1) %out, align 4
371; OPT-NEXT: %out.repack1 = getelementptr inbounds nuw i8, ptr addrspace(1) %out, i64 4
372; OPT-NEXT: store i32 2, ptr addrspace(1) %out.repack1, align 4
373; OPT-NEXT: %out.repack2 = getelementptr inbounds nuw i8, ptr addrspace(1) %out, i64 8
374; OPT-NEXT: store i32 3, ptr addrspace(1) %out.repack2, align 4
375; OPT-NEXT: %out.repack3 = getelementptr inbounds nuw i8, ptr addrspace(1) %out, i64 12
376; OPT-NEXT: store i32 4, ptr addrspace(1) %out.repack3, align 4
377
378; GCN-LABEL: {{^}}vector_bitcast_to_array_from_alloca_array:
379; GCN: v_mov_b32_e32 v0, 1
380; GCN: v_mov_b32_e32 v1, 2
381; GCN: v_mov_b32_e32 v2, 3
382; GCN: v_mov_b32_e32 v3, 4
383
384; GCN: ScratchSize: 0
385
; Kernel under test: the alloca is declared as [4 x float] but is stored to and
; loaded from as the aggregate [4 x i32]. The constant array [1, 2, 3, 4] is
; round-tripped through the alloca and written to %out.
386define amdgpu_kernel void @vector_bitcast_to_array_from_alloca_array(ptr addrspace(1) %out)  {
387  %alloca = alloca [4 x float], align 16, addrspace(5)
  ; Whole-object store and load using an array (first-class aggregate) type.
388  store [4 x i32] [i32 1, i32 2, i32 3, i32 4], ptr addrspace(5) %alloca
389  %load = load [4 x i32], ptr addrspace(5) %alloca, align 16
390  store [4 x i32] %load, ptr addrspace(1) %out
391  ret void
392}
393
394; OPT-LABEL: @vector_bitcast_to_struct_from_alloca_array(
395; OPT-NOT:   alloca
396; OPT-NEXT: store i32 1, ptr addrspace(1) %out, align 4
397; OPT-NEXT: %out.repack1 = getelementptr inbounds nuw i8, ptr addrspace(1) %out, i64 4
398; OPT-NEXT: store i32 2, ptr addrspace(1) %out.repack1, align 4
399; OPT-NEXT: %out.repack2 = getelementptr inbounds nuw i8, ptr addrspace(1) %out, i64 8
400; OPT-NEXT: store i32 3, ptr addrspace(1) %out.repack2, align 4
401; OPT-NEXT: %out.repack3 = getelementptr inbounds nuw i8, ptr addrspace(1) %out, i64 12
402; OPT-NEXT: store i32 4, ptr addrspace(1) %out.repack3, align 4
403
404; GCN-LABEL: {{^}}vector_bitcast_to_struct_from_alloca_array:
405; GCN: v_mov_b32_e32 v0, 1
406; GCN: v_mov_b32_e32 v1, 2
407; GCN: v_mov_b32_e32 v2, 3
408; GCN: v_mov_b32_e32 v3, 4
409
410; GCN: ScratchSize: 0
411
; Four-field i32 struct used as the access type in the kernel below.
412%struct.v4 = type { i32, i32, i32, i32 }
413
; Kernel under test: the alloca is declared as [4 x float] but is stored to and
; loaded from as %struct.v4. The constant struct { 1, 2, 3, 4 } is
; round-tripped through the alloca and written to %out.
414define amdgpu_kernel void @vector_bitcast_to_struct_from_alloca_array(ptr addrspace(1) %out)  {
415  %alloca = alloca [4 x float], align 16, addrspace(5)
  ; Whole-object store and load using a struct (first-class aggregate) type.
416  store %struct.v4 { i32 1, i32 2, i32 3, i32 4 }, ptr addrspace(5) %alloca
417  %load = load %struct.v4, ptr addrspace(5) %alloca, align 16
418  store %struct.v4 %load, ptr addrspace(1) %out
419  ret void
420}
421
; Intrinsic declarations used by the kernels in this file.
; Lifetime markers for addrspace(5) pointers; the first operand is an immediate
; size (the callers pass 24 for [6 x float] and 48 for the 64-bit arrays).
422declare void @llvm.lifetime.start.p5(i64 immarg, ptr addrspace(5) nocapture)
423
424declare void @llvm.lifetime.end.p5(i64 immarg, ptr addrspace(5) nocapture)
425
; Called by @vector_read_alloca_bitcast_assume.
426declare void @llvm.assume(i1)
427