; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes='sroa<preserve-cfg>' -S < %s | FileCheck %s --check-prefixes=CHECK,CHECK-PRESERVE-CFG
; RUN: opt -passes='sroa<modify-cfg>' -S < %s | FileCheck %s --check-prefixes=CHECK,CHECK-MODIFY-CFG

%"struct.a" = type { <8 x half> }
%"struct.b" = type { %"struct.a" }
%"struct.c" = type { %"struct.a", i32, i8 }
%"struct.d" = type { [4 x i32], %"struct.a" }
%"struct.e" = type { [2 x <8 x half>], i32, i32 }
%"struct.f" = type { [2 x <8 x i16>], i32, i32 }
%"array.a" = type [2 x <8 x half>]
%"array.b" = type [2 x %"struct.a"]

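; The tests below write vector or aggregate data into an alloca whose declared
; type uses a different element type, then read the data back with narrow
; half/i16 loads. The CHECK lines pin down how SROA rewrites those accesses
; when the stored and loaded types do not share an obvious common vector type.

; A %"struct.b" (a single <8 x half>) is zero-initialized by a vector store,
; overwritten by a <4 x float> store, and read back as individual half values.
; The alloca is expected to be promoted away: each half load becomes an
; extractelement from the stored value bitcast to <8 x i16>.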
define amdgpu_kernel void @test_zeroinit() #0 {
; CHECK-LABEL: @test_zeroinit(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[DATA:%.*]] = load <4 x float>, ptr undef, align 16
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[DATA]] to <8 x i16>
; CHECK-NEXT:    br label [[BB:%.*]]
; CHECK:       bb:
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 0
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT]] to half
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 1
; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT]] to half
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 2
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT]] to half
; CHECK-NEXT:    ret void
;
entry:
  %b_blockwise_copy = alloca %"struct.b", align 16
  store <8 x half> zeroinitializer, ptr %b_blockwise_copy, align 16
  %data = load <4 x float>, ptr undef
  store <4 x float> %data, ptr %b_blockwise_copy, align 16
  br label %bb

bb:
  %load1 = load half, ptr %b_blockwise_copy, align 16
  %ptr2 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 2
  %load2 = load half, ptr %ptr2, align 16
  %ptr3 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 4
  %load3 = load half, ptr %ptr3, align 16
  ret void
}

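; Same pattern as @test_zeroinit, but the alloca is cleared with llvm.memset
; instead of an <8 x half> zeroinitializer store.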
define amdgpu_kernel void @test_memset() #0 {
; CHECK-LABEL: @test_memset(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[DATA:%.*]] = load <4 x float>, ptr undef, align 16
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[DATA]] to <8 x i16>
; CHECK-NEXT:    br label [[BB:%.*]]
; CHECK:       bb:
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 0
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT]] to half
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 1
; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT]] to half
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 2
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT]] to half
; CHECK-NEXT:    ret void
;
entry:
  %b_blockwise_copy = alloca %"struct.b", align 16
  call void @llvm.memset.p0.i64(ptr align 16 %b_blockwise_copy, i8 0, i64 16, i1 false)
  %data = load <4 x float>, ptr undef
  store <4 x float> %data, ptr %b_blockwise_copy, align 16
  br label %bb

bb:
  %load1 = load half, ptr %b_blockwise_copy, align 16
  %ptr2 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 2
  %load2 = load half, ptr %ptr2, align 16
  %ptr3 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 4
  %load3 = load half, ptr %ptr3, align 16
  ret void
}

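; Same pattern again, with the alloca declared directly as <8 x half> rather
; than wrapped in a struct.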
define amdgpu_kernel void @vector_type_alloca() #0 {
; CHECK-LABEL: @vector_type_alloca(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[DATA:%.*]] = load <4 x float>, ptr undef, align 16
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[DATA]] to <8 x i16>
; CHECK-NEXT:    br label [[BB:%.*]]
; CHECK:       bb:
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 0
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT]] to half
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 1
; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT]] to half
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 2
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT]] to half
; CHECK-NEXT:    ret void
;
entry:
  %b_blockwise_copy = alloca <8 x half>, align 16
  store <8 x half> zeroinitializer, ptr %b_blockwise_copy, align 16
  %data = load <4 x float>, ptr undef
  store <4 x float> %data, ptr %b_blockwise_copy, align 16
  br label %bb

bb:
  %load1 = load half, ptr %b_blockwise_copy, align 16
  %ptr2 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 2
  %load2 = load half, ptr %ptr2, align 16
  %ptr3 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 4
  %load3 = load half, ptr %ptr3, align 16
  ret void
}

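; %"struct.c" appends i32 and i8 members after the vector. Only the leading
; <8 x half> part is written and read, and those accesses are still expected
; to be rewritten to extractelements.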
define amdgpu_kernel void @test_struct_contain_multiple_types1() #0 {
; CHECK-LABEL: @test_struct_contain_multiple_types1(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[DATA:%.*]] = load <4 x float>, ptr undef, align 16
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[DATA]] to <8 x i16>
; CHECK-NEXT:    br label [[BB:%.*]]
; CHECK:       bb:
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 0
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT]] to half
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 1
; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT]] to half
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 2
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT]] to half
; CHECK-NEXT:    ret void
;
entry:
  %b_blockwise_copy = alloca %"struct.c", align 16
  store <8 x half> zeroinitializer, ptr %b_blockwise_copy, align 16
  %data = load <4 x float>, ptr undef
  store <4 x float> %data, ptr %b_blockwise_copy, align 16
  br label %bb

bb:
  %load1 = load half, ptr %b_blockwise_copy, align 16
  %ptr2 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 2
  %load2 = load half, ptr %ptr2, align 16
  %ptr3 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 4
  %load3 = load half, ptr %ptr3, align 16
  ret void
}

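; %"struct.d" puts a [4 x i32] array in front of the vector. The array is
; written as a first-class aggregate and the vector slice at offset 16 is read
; as half values; the CHECK lines show the two slices being rewritten
; independently (extractvalue for the array, extractelement for the vector).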
define amdgpu_kernel void @test_struct_contain_multiple_types2() #0 {
; CHECK-LABEL: @test_struct_contain_multiple_types2(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[DATA1:%.*]] = load [4 x i32], ptr undef, align 4
; CHECK-NEXT:    [[DATA1_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i32] [[DATA1]], 0
; CHECK-NEXT:    [[DATA1_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i32] [[DATA1]], 1
; CHECK-NEXT:    [[DATA1_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i32] [[DATA1]], 2
; CHECK-NEXT:    [[DATA1_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i32] [[DATA1]], 3
; CHECK-NEXT:    [[DATA2:%.*]] = load <4 x float>, ptr undef, align 16
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[DATA2]] to <8 x i16>
; CHECK-NEXT:    br label [[BB:%.*]]
; CHECK:       bb:
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_5_16_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 0
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_5_16_VEC_EXTRACT]] to half
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_5_18_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 1
; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_5_18_VEC_EXTRACT]] to half
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_5_20_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 2
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_5_20_VEC_EXTRACT]] to half
; CHECK-NEXT:    ret void
;
entry:
  %b_blockwise_copy = alloca %"struct.d", align 16
  call void @llvm.memset.p0.i32(ptr align 16 %b_blockwise_copy, i8 0, i32 16, i1 false)
  %data1 = load [4 x i32], ptr undef
  store [4 x i32] %data1, ptr %b_blockwise_copy, align 16
  %data2_gep = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 16
  store <8 x half> zeroinitializer, ptr %data2_gep, align 16
  %data2 = load <4 x float>, ptr undef
  store <4 x float> %data2, ptr %data2_gep, align 16
  br label %bb

bb:
  %ptr1 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 16
  %load1 = load half, ptr %ptr1, align 16
  %ptr2 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 18
  %load2 = load half, ptr %ptr2, align 16
  %ptr3 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 20
  %load3 = load half, ptr %ptr3, align 16
  ret void
}

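; %"struct.e" holds an array of two <8 x half> vectors. Each vector is
; overwritten by its own <4 x float> store and read back once, so the two
; 16-byte slices are expected to be promoted separately.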
define amdgpu_kernel void @test_struct_array_vector() #0 {
; CHECK-LABEL: @test_struct_array_vector(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[DATA0:%.*]] = load <4 x float>, ptr undef, align 16
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[DATA0]] to <8 x i16>
; CHECK-NEXT:    [[DATA1:%.*]] = load <4 x float>, ptr undef, align 16
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[DATA1]] to <8 x i16>
; CHECK-NEXT:    br label [[BB:%.*]]
; CHECK:       bb:
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 0
; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT]] to half
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_3_16_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_3_16_VEC_EXTRACT]] to half
; CHECK-NEXT:    ret void
;
entry:
  %b_blockwise_copy = alloca %"struct.e", align 16
  store <8 x half> zeroinitializer, ptr %b_blockwise_copy, align 16
  %0 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 16
  store <8 x half> zeroinitializer, ptr %0, align 16
  %data0 = load <4 x float>, ptr undef
  store <4 x float> %data0, ptr %b_blockwise_copy, align 16
  %data1 = load <4 x float>, ptr undef
  store <4 x float> %data1, ptr %0, align 16
  br label %bb

bb:
  %load1 = load half, ptr %b_blockwise_copy, align 16
  %ptr2 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 16
  %load2 = load half, ptr %ptr2, align 16
  ret void
}

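; Like @test_struct_array_vector, but with <8 x i16> elements and i16 loads,
; so the extracted elements need no bitcast to half.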
define amdgpu_kernel void @test_struct_array_vector_i16() #0 {
; CHECK-LABEL: @test_struct_array_vector_i16(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[DATA:%.*]] = load <4 x i32>, ptr undef, align 16
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[DATA]] to <8 x i16>
; CHECK-NEXT:    [[DATA2:%.*]] = load <4 x i32>, ptr undef, align 16
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[DATA2]] to <8 x i16>
; CHECK-NEXT:    br label [[BB:%.*]]
; CHECK:       bb:
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 0
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 1
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_4_16_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP1]], i32 0
; CHECK-NEXT:    ret void
;
entry:
  %b_blockwise_copy = alloca %"struct.f", align 16
  call void @llvm.memset.p0.i32(ptr align 16 %b_blockwise_copy, i8 0, i32 32, i1 false)
  %data = load <4 x i32>, ptr undef
  store <4 x i32> %data, ptr %b_blockwise_copy, align 16
  %data2 = load <4 x i32>, ptr undef
  %data2_gep = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 16
  store <4 x i32> %data2, ptr %data2_gep, align 16
  br label %bb

bb:
  %load1 = load i16, ptr %b_blockwise_copy, align 16
  %ptr2 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 2
  %load2 = load i16, ptr %ptr2, align 16
  %ptr3 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 16
  %load3 = load i16, ptr %ptr3, align 16
  ret void
}

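; An [8 x half] array is cleared with memset and then written through a
; [4 x float] aggregate store. The CHECK lines show SROA splitting the alloca
; into float-sized pieces: the pieces that are still read back as half values
; remain as scratch float allocas instead of being promoted.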
define amdgpu_kernel void @test_half_array() #0 {
; CHECK-LABEL: @test_half_array(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0:%.*]] = alloca float, align 16
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_4:%.*]] = alloca float, align 4
; CHECK-NEXT:    call void @llvm.memset.p0.i32(ptr align 16 [[B_BLOCKWISE_COPY_SROA_0]], i8 0, i32 4, i1 false)
; CHECK-NEXT:    call void @llvm.memset.p0.i32(ptr align 4 [[B_BLOCKWISE_COPY_SROA_4]], i8 0, i32 4, i1 false)
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float undef to i32
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float undef to i32
; CHECK-NEXT:    [[DATA:%.*]] = load [4 x float], ptr undef, align 4
; CHECK-NEXT:    [[DATA_FCA_0_EXTRACT:%.*]] = extractvalue [4 x float] [[DATA]], 0
; CHECK-NEXT:    store float [[DATA_FCA_0_EXTRACT]], ptr [[B_BLOCKWISE_COPY_SROA_0]], align 16
; CHECK-NEXT:    [[DATA_FCA_1_EXTRACT:%.*]] = extractvalue [4 x float] [[DATA]], 1
; CHECK-NEXT:    store float [[DATA_FCA_1_EXTRACT]], ptr [[B_BLOCKWISE_COPY_SROA_4]], align 4
; CHECK-NEXT:    [[DATA_FCA_2_EXTRACT:%.*]] = extractvalue [4 x float] [[DATA]], 2
; CHECK-NEXT:    [[DATA_FCA_3_EXTRACT:%.*]] = extractvalue [4 x float] [[DATA]], 3
; CHECK-NEXT:    br label [[BB:%.*]]
; CHECK:       bb:
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_0_B_BLOCKWISE_COPY_SROA_0_0_LOAD1:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_0]], align 16
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_2_PTR2_SROA_IDX1:%.*]] = getelementptr inbounds i8, ptr [[B_BLOCKWISE_COPY_SROA_0]], i64 2
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_2_B_BLOCKWISE_COPY_SROA_0_2_LOAD2:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_0_2_PTR2_SROA_IDX1]], align 2
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_4_0_B_BLOCKWISE_COPY_SROA_4_4_LOAD3:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_4]], align 4
; CHECK-NEXT:    ret void
;
entry:
  %b_blockwise_copy = alloca [8 x half], align 16
  call void @llvm.memset.p0.i32(ptr align 16 %b_blockwise_copy, i8 0, i32 16, i1 false)
  %data = load [4 x float], ptr undef
  store [4 x float] %data, ptr %b_blockwise_copy, align 16
  br label %bb

bb:
  %load1 = load half, ptr %b_blockwise_copy, align 16
  %ptr2 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 2
  %load2 = load half, ptr %ptr2, align 16
  %ptr3 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 4
  %load3 = load half, ptr %ptr3, align 16
  ret void
}

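; %"array.a" is two <8 x half> vectors, but only the first 16 bytes are
; overwritten and read. The used slice is promoted; the untouched second
; vector survives as its own <8 x half> alloca that is still memset.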
define amdgpu_kernel void @test_array_vector() #0 {
; CHECK-LABEL: @test_array_vector(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_5:%.*]] = alloca <8 x half>, align 16
; CHECK-NEXT:    call void @llvm.memset.p0.i32(ptr align 16 [[B_BLOCKWISE_COPY_SROA_5]], i8 0, i32 16, i1 false)
; CHECK-NEXT:    [[DATA:%.*]] = load <4 x float>, ptr undef, align 16
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[DATA]] to <8 x i16>
; CHECK-NEXT:    br label [[BB:%.*]]
; CHECK:       bb:
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 0
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT]] to half
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 1
; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT]] to half
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 2
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT]] to half
; CHECK-NEXT:    ret void
;
entry:
  %b_blockwise_copy = alloca %"array.a", align 16
  call void @llvm.memset.p0.i32(ptr align 16 %b_blockwise_copy, i8 0, i32 32, i1 false)
  %data = load <4 x float>, ptr undef
  store <4 x float> %data, ptr %b_blockwise_copy, align 16
  br label %bb

bb:
  %load1 = load half, ptr %b_blockwise_copy, align 16
  %ptr2 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 2
  %load2 = load half, ptr %ptr2, align 16
  %ptr3 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 4
  %load3 = load half, ptr %ptr3, align 16
  ret void
}

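; Same as @test_array_vector, with the array element spelled as %"struct.a"
; instead of a bare <8 x half>.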
define amdgpu_kernel void @test_array_vector2() #0 {
; CHECK-LABEL: @test_array_vector2(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_5:%.*]] = alloca <8 x half>, align 16
; CHECK-NEXT:    call void @llvm.memset.p0.i32(ptr align 16 [[B_BLOCKWISE_COPY_SROA_5]], i8 0, i32 16, i1 false)
; CHECK-NEXT:    [[DATA:%.*]] = load <4 x float>, ptr undef, align 16
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[DATA]] to <8 x i16>
; CHECK-NEXT:    br label [[BB:%.*]]
; CHECK:       bb:
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 0
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT]] to half
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 1
; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT]] to half
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 2
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT]] to half
; CHECK-NEXT:    ret void
;
entry:
  %b_blockwise_copy = alloca %"array.b", align 16
  call void @llvm.memset.p0.i32(ptr align 16 %b_blockwise_copy, i8 0, i32 32, i1 false)
  %data = load <4 x float>, ptr undef
  store <4 x float> %data, ptr %b_blockwise_copy, align 16
  br label %bb

bb:
  %load1 = load half, ptr %b_blockwise_copy, align 16
  %ptr2 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 2
  %load2 = load half, ptr %ptr2, align 16
  %ptr3 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 4
  %load3 = load half, ptr %ptr3, align 16
  ret void
}

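; The first vector of the %"array.a" alloca is written as four separate float
; stores and read back as eight half loads, so no common vector type is found
; for that slice. The CHECK lines show four float scratch allocas remaining,
; plus a separate <8 x half> alloca for the untouched second vector.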
define amdgpu_kernel void @test_array_vector_no_vector_common_type() #0 {
; CHECK-LABEL: @test_array_vector_no_vector_common_type(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0:%.*]] = alloca float, align 16
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_4:%.*]] = alloca float, align 4
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_7:%.*]] = alloca float, align 8
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_10:%.*]] = alloca float, align 4
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_13:%.*]] = alloca <8 x half>, align 16
; CHECK-NEXT:    call void @llvm.memset.p0.i32(ptr align 16 [[B_BLOCKWISE_COPY_SROA_0]], i8 0, i32 4, i1 false)
; CHECK-NEXT:    call void @llvm.memset.p0.i32(ptr align 4 [[B_BLOCKWISE_COPY_SROA_4]], i8 0, i32 4, i1 false)
; CHECK-NEXT:    call void @llvm.memset.p0.i32(ptr align 8 [[B_BLOCKWISE_COPY_SROA_7]], i8 0, i32 4, i1 false)
; CHECK-NEXT:    call void @llvm.memset.p0.i32(ptr align 4 [[B_BLOCKWISE_COPY_SROA_10]], i8 0, i32 4, i1 false)
; CHECK-NEXT:    call void @llvm.memset.p0.i32(ptr align 16 [[B_BLOCKWISE_COPY_SROA_13]], i8 0, i32 16, i1 false)
; CHECK-NEXT:    [[DATA1:%.*]] = load float, ptr undef, align 4
; CHECK-NEXT:    [[DATA2:%.*]] = load float, ptr undef, align 4
; CHECK-NEXT:    [[DATA3:%.*]] = load float, ptr undef, align 4
; CHECK-NEXT:    [[DATA4:%.*]] = load float, ptr undef, align 4
; CHECK-NEXT:    store float [[DATA1]], ptr [[B_BLOCKWISE_COPY_SROA_0]], align 16
; CHECK-NEXT:    store float [[DATA2]], ptr [[B_BLOCKWISE_COPY_SROA_4]], align 4
; CHECK-NEXT:    store float [[DATA3]], ptr [[B_BLOCKWISE_COPY_SROA_7]], align 8
; CHECK-NEXT:    store float [[DATA4]], ptr [[B_BLOCKWISE_COPY_SROA_10]], align 4
; CHECK-NEXT:    br label [[BB:%.*]]
; CHECK:       bb:
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_0_B_BLOCKWISE_COPY_SROA_0_0_LOAD1:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_0]], align 16
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_2_PTR2_SROA_IDX1:%.*]] = getelementptr inbounds i8, ptr [[B_BLOCKWISE_COPY_SROA_0]], i64 2
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_2_B_BLOCKWISE_COPY_SROA_0_2_LOAD2:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_0_2_PTR2_SROA_IDX1]], align 2
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_4_0_B_BLOCKWISE_COPY_SROA_4_4_LOAD3:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_4]], align 4
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_4_2_PTR4_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[B_BLOCKWISE_COPY_SROA_4]], i64 2
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_4_2_B_BLOCKWISE_COPY_SROA_4_6_LOAD4:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_4_2_PTR4_SROA_IDX]], align 2
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_7_0_B_BLOCKWISE_COPY_SROA_7_8_LOAD5:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_7]], align 8
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_7_2_PTR6_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[B_BLOCKWISE_COPY_SROA_7]], i64 2
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_7_2_B_BLOCKWISE_COPY_SROA_7_10_LOAD6:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_7_2_PTR6_SROA_IDX]], align 2
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_10_0_B_BLOCKWISE_COPY_SROA_10_12_LOAD7:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_10]], align 4
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_10_2_PTR8_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[B_BLOCKWISE_COPY_SROA_10]], i64 2
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_10_2_B_BLOCKWISE_COPY_SROA_10_14_LOAD8:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_10_2_PTR8_SROA_IDX]], align 2
; CHECK-NEXT:    ret void
;
entry:
  %b_blockwise_copy = alloca %"array.a", align 16
  call void @llvm.memset.p0.i32(ptr align 16 %b_blockwise_copy, i8 0, i32 32, i1 false)
  %data1 = load float, ptr undef
  %data2 = load float, ptr undef
  %data3 = load float, ptr undef
  %data4 = load float, ptr undef
  store float %data1, ptr %b_blockwise_copy, align 16
  %data_ptr1 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 4
  store float %data2, ptr %data_ptr1, align 16
  %data_ptr2 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 8
  store float %data3, ptr %data_ptr2, align 16
  %data_ptr3 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 12
  store float %data4, ptr %data_ptr3, align 16
  br label %bb

bb:
  %load1 = load half, ptr %b_blockwise_copy, align 16
  %ptr2 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 2
  %load2 = load half, ptr %ptr2, align 16
  %ptr3 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 4
  %load3 = load half, ptr %ptr3, align 16
  %ptr4 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 6
  %load4 = load half, ptr %ptr4, align 16
  %ptr5 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 8
  %load5 = load half, ptr %ptr5, align 16
  %ptr6 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 10
  %load6 = load half, ptr %ptr6, align 16
  %ptr7 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 12
  %load7 = load half, ptr %ptr7, align 16
  %ptr8 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 14
  %load8 = load half, ptr %ptr8, align 16
  ret void
}

declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1) nounwind
declare void @llvm.memset.p0.i32(ptr nocapture writeonly, i8, i32, i1) nounwind
attributes #0 = { nounwind readonly }

;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; CHECK-MODIFY-CFG: {{.*}}
; CHECK-PRESERVE-CFG: {{.*}}