; xref: /llvm-project/llvm/test/CodeGen/AArch64/aarch64-interleaved-ld-combine.ll (revision a930fec033a80bc92f5a11cc334ff4fc44cbe0ca)
; RUN: llc < %s | FileCheck --check-prefix AS %s
; RUN: opt -S -interleaved-load-combine < %s | FileCheck %s
; RUN: opt -S -passes=interleaved-load-combine < %s | FileCheck %s
;
; The llc output is matched by the directives that use the AS prefix; both opt
; invocations (flag form and -passes= form) are matched by the default prefix.

; ModuleID = 'aarch64_interleaved-ld-combine.bc'
target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
target triple = "arm64--linux-gnu"

; This should be lowered into LD4
;
; Four <4 x float> loads at constant vector offsets 2, 3, 4 and 5 from %ptr are
; de-interleaved by two shufflevector stages, so that result k holds flat
; elements k, k+4, k+8, k+12 of the 16 loaded floats. The opt checks expect one
; <16 x float> load at offset 2 feeding four stride-4 shufflevectors; the llc
; checks expect an ld4 instruction.
define void @aarch64_ilc_const(ptr %ptr) {
entry:

;;; Check LLVM transformation
; CHECK-LABEL: @aarch64_ilc_const(
; CHECK-DAG: [[GEP:%.+]] = getelementptr inbounds <4 x float>, ptr %ptr, i64 2
; CHECK-DAG: [[LOAD:%.+]] = load <16 x float>, ptr [[GEP]], align 16
; CHECK-DAG: %{{.* }}= shufflevector <16 x float> [[LOAD]], <16 x float> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
; CHECK-DAG: %{{.* }}= shufflevector <16 x float> [[LOAD]], <16 x float> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
; CHECK-DAG: %{{.* }}= shufflevector <16 x float> [[LOAD]], <16 x float> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
; CHECK-DAG: %{{.* }}= shufflevector <16 x float> [[LOAD]], <16 x float> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
; CHECK: ret void

;;; Check if it gets lowered
; AS-LABEL: aarch64_ilc_const
; AS: ld4
; AS: ret

  %gep1 = getelementptr inbounds <4 x float>, ptr %ptr, i64  2
  %gep2 = getelementptr inbounds <4 x float>, ptr %ptr, i64  3
  %gep3 = getelementptr inbounds <4 x float>, ptr %ptr, i64  4
  %gep4 = getelementptr inbounds <4 x float>, ptr %ptr, i64  5
  %ld1 = load <4 x float>, ptr %gep1, align 16
  %ld2 = load <4 x float>, ptr %gep2, align 16
  %ld3 = load <4 x float>, ptr %gep3, align 16
  %ld4 = load <4 x float>, ptr %gep4, align 16
  ; Stage 1: pair up the low halves (sv1, sv3) and high halves (sv2, sv4).
  %sv1 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  %sv2 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %sv3 = shufflevector <4 x float> %ld3, <4 x float> %ld4, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  %sv4 = shufflevector <4 x float> %ld3, <4 x float> %ld4, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  ; Stage 2: even/odd picks yield lane k of each of the four loaded vectors.
  %m0_3   = shufflevector <4 x float> %sv1, <4 x float> %sv3, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %m4_7   = shufflevector <4 x float> %sv1, <4 x float> %sv3, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %m8_11  = shufflevector <4 x float> %sv2, <4 x float> %sv4, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %m12_15 = shufflevector <4 x float> %sv2, <4 x float> %sv4, <4 x i32> <i32 1, i32 3, i32 5, i32 7>

  store <4 x float> %m0_3, ptr %gep1, align 16
  store <4 x float> %m4_7, ptr %gep2, align 16
  store <4 x float> %m8_11, ptr %gep3, align 16
  store <4 x float> %m12_15, ptr %gep4, align 16
  ret void
}

; This should be lowered into LD4
;
; Variant of the 4-way de-interleave where the vector index of load k
; (k = 0..3) is computed at run time as (%idx + 16 + 4*k) >> 2. The index
; computations and the getelementptr instructions are not listed in k order.
; The opt checks expect the combined <16 x float> load to use the address
; derived from (%idx + 16) >> 2; the llc checks expect the matching address
; arithmetic, one ld4, and four q-register stores of the ld4 results.
define void @aarch64_ilc_idx(ptr %ptr, i64 %idx) {
entry:

;;; Check LLVM transformation
; CHECK-LABEL: @aarch64_ilc_idx(
; CHECK-DAG: [[ADD:%.+]] = add i64 %idx, 16
; CHECK-DAG: [[LSHR:%.+]] = lshr i64 [[ADD]], 2
; CHECK-DAG: [[GEP:%.+]] = getelementptr inbounds <4 x float>, ptr %ptr, i64 [[LSHR]]
; CHECK-DAG: [[LOAD:%.+]] = load <16 x float>, ptr [[GEP]], align 16
; CHECK-DAG: %{{.* }}= shufflevector <16 x float> [[LOAD]], <16 x float> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
; CHECK-DAG: %{{.* }}= shufflevector <16 x float> [[LOAD]], <16 x float> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
; CHECK-DAG: %{{.* }}= shufflevector <16 x float> [[LOAD]], <16 x float> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
; CHECK-DAG: %{{.* }}= shufflevector <16 x float> [[LOAD]], <16 x float> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
; CHECK: ret void

; AS-LABEL: aarch64_ilc_idx
; AS-DAG: lsl [[LSL:x[0-9]+]], x1, #2
; AS-DAG: add [[ADD:x[0-9]+]], [[LSL]], #64
; AS-DAG: and [[AND:x[0-9]+]], [[ADD]], #0xfffffffffffffff0
; AS-DAG: add [[ADR:x[0-9]+]], x0, [[AND]]
; AS-DAG: ld4 { v[[V0:[0-9]+]].4s, v[[V1:[0-9]+]].4s, v[[V2:[0-9]+]].4s, v[[V3:[0-9]+]].4s }, [[[ADR]]]
; AS-DAG: str q[[V0]]
; AS-DAG: str q[[V1]]
; AS-DAG: str q[[V2]]
; AS-DAG: str q[[V3]]
; AS: ret

  ; Index computations, intentionally kept in the original (non-sequential) order.
  %a2 = add i64 %idx, 20
  %idx2 = lshr i64 %a2, 2
  %a3 = add i64 %idx, 24
  %a1 = add i64 %idx, 16
  %idx1 = lshr i64 %a1, 2
  %idx3 = lshr i64 %a3, 2
  %a4 = add i64 %idx, 28
  %idx4 = lshr i64 %a4, 2

  %gep2 = getelementptr inbounds <4 x float>, ptr %ptr, i64  %idx2
  %gep4 = getelementptr inbounds <4 x float>, ptr %ptr, i64  %idx4
  %gep1 = getelementptr inbounds <4 x float>, ptr %ptr, i64  %idx1
  %gep3 = getelementptr inbounds <4 x float>, ptr %ptr, i64  %idx3
  %ld1 = load <4 x float>, ptr %gep1, align 16
  %ld2 = load <4 x float>, ptr %gep2, align 16
  %ld3 = load <4 x float>, ptr %gep3, align 16
  %ld4 = load <4 x float>, ptr %gep4, align 16
  ; Stage 1: pair up the low halves (sv1, sv3) and high halves (sv2, sv4).
  %sv1 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  %sv2 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %sv3 = shufflevector <4 x float> %ld3, <4 x float> %ld4, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  %sv4 = shufflevector <4 x float> %ld3, <4 x float> %ld4, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  ; Stage 2: even/odd picks yield lane k of each of the four loaded vectors.
  %m0_3   = shufflevector <4 x float> %sv1, <4 x float> %sv3, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %m4_7   = shufflevector <4 x float> %sv1, <4 x float> %sv3, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %m8_11  = shufflevector <4 x float> %sv2, <4 x float> %sv4, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %m12_15 = shufflevector <4 x float> %sv2, <4 x float> %sv4, <4 x i32> <i32 1, i32 3, i32 5, i32 7>

  store <4 x float> %m0_3, ptr %gep1, align 16
  store <4 x float> %m4_7, ptr %gep2, align 16
  store <4 x float> %m8_11, ptr %gep3, align 16
  store <4 x float> %m12_15, ptr %gep4, align 16
  ret void
}

; This should be lowered into LD4; an offset has to be taken into account.
;
; %struct.ilc is a packed struct: one float followed by a zero-length array of
; <4 x float>. Every address below goes through field 1 of that struct, i.e.
; past the leading float; the llc checks expect a matching "add ..., x0, #4".
; The vector index of load k (k = 0..3) is (%idx + 4*k) >> 2.
%struct.ilc = type <{ float, [0 x <4 x float>] }>
define void @aarch64_ilc_struct(ptr %ptr, i64 %idx) {
entry:

;;; Check LLVM transformation
; CHECK-LABEL: @aarch64_ilc_struct(
; CHECK-DAG: [[LSHR:%.+]] = lshr i64 %idx, 2
; CHECK-DAG: [[GEP:%.+]] = getelementptr %struct.ilc, ptr %ptr, i32 0, i32 1, i64 [[LSHR]]
; CHECK-DAG: [[LOAD:%.+]] = load <16 x float>, ptr [[GEP]], align 4
; CHECK-DAG: %{{.* }}= shufflevector <16 x float> [[LOAD]], <16 x float> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
; CHECK-DAG: %{{.* }}= shufflevector <16 x float> [[LOAD]], <16 x float> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
; CHECK-DAG: %{{.* }}= shufflevector <16 x float> [[LOAD]], <16 x float> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
; CHECK-DAG: %{{.* }}= shufflevector <16 x float> [[LOAD]], <16 x float> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
; CHECK: ret void

; AS-LABEL: aarch64_ilc_struct
; AS-DAG: lsl [[LSL:x[0-9]+]], x1, #2
; AS-DAG: add [[ADD:x[0-9]+]], x0, #4
; AS-DAG: and [[AND:x[0-9]+]], [[LSL]], #0xfffffffffffffff0
; AS-DAG: add [[ADR:x[0-9]+]], [[ADD]], [[AND]]
; AS-DAG: ld4 { v[[V0:[0-9]+]].4s, v[[V1:[0-9]+]].4s, v[[V2:[0-9]+]].4s, v[[V3:[0-9]+]].4s }, [[[ADR]]]
; AS-DAG: str q[[V0]]
; AS-DAG: str q[[V1]]
; AS-DAG: str q[[V2]]
; AS-DAG: str q[[V3]]
; AS: ret

  %a1 = add i64 %idx, 4
  %idx2 = lshr i64 %a1, 2
  %a2 = add i64 %idx, 8
  %idx3 = lshr i64 %a2, 2
  %a3 = add i64 %idx, 12
  %idx4 = lshr i64 %a3, 2

  ; The address of the first vector (%gep1) is computed last.
  %gep2 = getelementptr %struct.ilc, ptr %ptr, i32 0, i32 1, i64 %idx2
  %gep3 = getelementptr %struct.ilc, ptr %ptr, i32 0, i32 1, i64 %idx3
  %gep4 = getelementptr %struct.ilc, ptr %ptr, i32 0, i32 1, i64 %idx4
  %idx1 = lshr i64 %idx, 2
  %gep1 = getelementptr %struct.ilc, ptr %ptr, i32 0, i32 1, i64 %idx1
  %ld1 = load <4 x float>, ptr %gep1, align 4
  %ld2 = load <4 x float>, ptr %gep2, align 4
  %ld3 = load <4 x float>, ptr %gep3, align 4
  %ld4 = load <4 x float>, ptr %gep4, align 4
  ; Stage 1: pair up the low halves (sv1, sv3) and high halves (sv2, sv4).
  %sv1 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  %sv2 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %sv3 = shufflevector <4 x float> %ld3, <4 x float> %ld4, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  %sv4 = shufflevector <4 x float> %ld3, <4 x float> %ld4, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  ; Stage 2: even/odd picks yield lane k of each of the four loaded vectors.
  %m0_3   = shufflevector <4 x float> %sv1, <4 x float> %sv3, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %m4_7   = shufflevector <4 x float> %sv1, <4 x float> %sv3, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %m8_11  = shufflevector <4 x float> %sv2, <4 x float> %sv4, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %m12_15 = shufflevector <4 x float> %sv2, <4 x float> %sv4, <4 x i32> <i32 1, i32 3, i32 5, i32 7>

  ; NOTE(review): the loads above use align 4 while these stores to the same
  ; addresses use align 16 -- confirm the mismatch is intentional.
  store <4 x float> %m0_3, ptr %gep1, align 16
  store <4 x float> %m4_7, ptr %gep2, align 16
  store <4 x float> %m8_11, ptr %gep3, align 16
  store <4 x float> %m12_15, ptr %gep4, align 16
  ret void
}

; This should be lowered into LD2
;
; Two <4 x float> loads at vector indices %idx >> 2 and (%idx + 4) >> 2 are
; split into their even and odd elements. The opt checks expect one
; <8 x float> load feeding two stride-2 shufflevectors; the llc checks expect
; an ld2 instruction.
define void @aarch64_ilc_idx_ld2(ptr %ptr, i64 %idx) {
entry:
; CHECK-LABEL: @aarch64_ilc_idx_ld2(
; CHECK-DAG: [[LSHR:%.+]] = lshr i64 %idx, 2
; CHECK-DAG: [[GEP:%.+]] = getelementptr inbounds <4 x float>, ptr %ptr, i64 [[LSHR]]
; CHECK-DAG: [[LOAD:%.+]] = load <8 x float>, ptr [[GEP]], align 16
; CHECK: %{{.* }}= shufflevector <8 x float> [[LOAD]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK: %{{.* }}= shufflevector <8 x float> [[LOAD]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK-DAG: ret void

; AS-LABEL: aarch64_ilc_idx_ld2
; AS: ld2
; AS: ret

  %idx1 = lshr i64 %idx, 2
  %a1 = add i64 %idx, 4
  %idx2 = lshr i64 %a1, 2

  %gep1 = getelementptr inbounds <4 x float>, ptr %ptr, i64  %idx1
  %gep2 = getelementptr inbounds <4 x float>, ptr %ptr, i64  %idx2
  %ld1 = load <4 x float>, ptr %gep1, align 16
  %ld2 = load <4 x float>, ptr %gep2, align 16
  ; Even elements of the concatenated pair, then the odd elements.
  %m0_3 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %m4_7 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>

  ; These stores carry no explicit alignment.
  store <4 x float> %m0_3, ptr %gep1
  store <4 x float> %m4_7, ptr %gep2
  ret void
}

; This should be lowered into LD3
;
; Three <4 x float> loads at vector indices (%idx + 4*k) >> 2, k = 0..2, are
; de-interleaved with stride 3 in two stages: the first stage gathers the
; stride-3 elements available in %ld1/%ld2 (remaining lanes undef), the second
; stage fills the remaining lanes from %ld3. The opt checks expect one
; <12 x float> load feeding three stride-3 shufflevectors; the llc checks
; expect an ld3 instruction.
define void @aarch64_ilc_idx_ld3(ptr %ptr, i64 %idx) {
entry:
; CHECK-LABEL: @aarch64_ilc_idx_ld3(
; CHECK-DAG: [[LSHR:%.+]] = lshr i64 %idx, 2
; CHECK-DAG: [[GEP:%.+]] = getelementptr inbounds <4 x float>, ptr %ptr, i64 [[LSHR]]
; CHECK-DAG: [[LOAD:%.+]] = load <12 x float>, ptr [[GEP]], align 16
; CHECK: %{{.* }}= shufflevector <12 x float> [[LOAD]], <12 x float> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
; CHECK: %{{.* }}= shufflevector <12 x float> [[LOAD]], <12 x float> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
; CHECK: %{{.* }}= shufflevector <12 x float> [[LOAD]], <12 x float> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
; CHECK-DAG: ret void

; AS-LABEL: aarch64_ilc_idx_ld3
; AS: ld3
; AS: ret

  %idx1 = lshr i64 %idx, 2
  %a1 = add i64 %idx, 4
  %idx2 = lshr i64 %a1, 2
  %a2 = add i64 %idx, 8
  %idx3 = lshr i64 %a2, 2

  %gep1 = getelementptr inbounds <4 x float>, ptr %ptr, i64  %idx1
  %gep2 = getelementptr inbounds <4 x float>, ptr %ptr, i64  %idx2
  %gep3 = getelementptr inbounds <4 x float>, ptr %ptr, i64  %idx3
  %ld1 = load <4 x float>, ptr %gep1, align 16
  %ld2 = load <4 x float>, ptr %gep2, align 16
  %ld3 = load <4 x float>, ptr %gep3, align 16

  ; Stage 1: stride-3 elements that live in %ld1/%ld2 (flat indices 0..7).
  %sv1 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> <i32 0, i32 3, i32 6, i32 undef>
  %sv2 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> <i32 1, i32 4, i32 7, i32 undef>
  %sv3 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> <i32 2, i32 5, i32 undef, i32 undef>
  ; Stage 2: fill the undef lanes from %ld3 (flat indices 9, 10, and 8/11).
  %m0_3 = shufflevector <4 x float> %sv1, <4 x float> %ld3, <4 x i32> <i32 0, i32 1, i32 2, i32 5>
  %m4_7 = shufflevector <4 x float> %sv2, <4 x float> %ld3, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
  %m8_11 = shufflevector <4 x float> %sv3, <4 x float> %ld3, <4 x i32> <i32 0, i32 1, i32 4, i32 7>

  store <4 x float> %m0_3, ptr %gep1, align 16
  store <4 x float> %m4_7, ptr %gep2, align 16
  store <4 x float> %m8_11, ptr %gep3, align 16
  ret void
}
; NOTE(review): the commented-out instruction below looks like a leftover; it
; sits outside every function -- confirm whether it can be dropped.
;  %sv3 = shufflevector <4 x float> %ld3, <4 x float> %ld4, <4 x i32> <i32 0, i32 undef, i32 4, i32 undef>

; This must not be lowered
;
; Same even/odd split of two <4 x float> loads as a 2-way de-interleave, but
; the vector indices are computed and passed to getelementptr as i32. The opt
; checks expect the instruction sequence to come out exactly as written; the
; llc checks expect no ld2, ld3 or ld4 between the function's type directive
; and its ret.
define void @aarch64_ilc_i32_idx(ptr %ptr, i32 %idx) {
; CHECK-LABEL: @aarch64_ilc_i32_idx(
; CHECK: %idx1 = lshr i32 %idx, 2
; CHECK-NEXT: %a1 = add i32 %idx, 4
; CHECK-NEXT: %idx2 = lshr i32 %a1, 2
; CHECK-NEXT: %gep1 = getelementptr inbounds <4 x float>, ptr %ptr, i32 %idx1
; CHECK-NEXT: %gep2 = getelementptr inbounds <4 x float>, ptr %ptr, i32 %idx2
; CHECK-NEXT: %ld1 = load <4 x float>, ptr %gep1, align 16
; CHECK-NEXT: %ld2 = load <4 x float>, ptr %gep2, align 16
; CHECK-NEXT: %m0_3 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT: %m4_7 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK-NEXT: store <4 x float> %m0_3, ptr %gep1, align 16
; CHECK-NEXT: store <4 x float> %m4_7, ptr %gep2, align 16
; CHECK-NEXT: ret void

; AS-LABEL: aarch64_ilc_i32_idx
; AS-DAG: @function
; AS-NOT: ld2
; AS-NOT: ld3
; AS-NOT: ld4
; AS-DAG: ret

entry:
  %idx1 = lshr i32 %idx, 2
  %a1 = add i32 %idx, 4
  %idx2 = lshr i32 %a1, 2

  %gep1 = getelementptr inbounds <4 x float>, ptr %ptr, i32 %idx1
  %gep2 = getelementptr inbounds <4 x float>, ptr %ptr, i32 %idx2
  %ld1 = load <4 x float>, ptr %gep1, align 16
  %ld2 = load <4 x float>, ptr %gep2, align 16
  %m0_3 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %m4_7 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>

  store <4 x float> %m0_3, ptr %gep1, align 16
  store <4 x float> %m4_7, ptr %gep2, align 16
  ret void
}

; Volatile loads must not be lowered
;
; Even/odd split of the two adjacent <4 x float> vectors at %ptr and %ptr + 1,
; where the first load is volatile. The opt checks expect the instruction
; sequence to come out exactly as written; the llc checks expect no ld2, ld3
; or ld4 between the function's type directive and its ret.
define void @aarch64_ilc_volatile(ptr %ptr) {
; CHECK-LABEL: @aarch64_ilc_volatile(
; CHECK: %gep2 = getelementptr inbounds <4 x float>, ptr %ptr, i32 1
; CHECK-NEXT: %ld1 = load volatile <4 x float>, ptr %ptr, align 16
; CHECK-NEXT: %ld2 = load <4 x float>, ptr %gep2, align 16
; CHECK-NEXT: %m0_3 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT: %m4_7 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK-NEXT: store <4 x float> %m0_3, ptr %ptr, align 16
; CHECK-NEXT: store <4 x float> %m4_7, ptr %gep2, align 16
; CHECK-NEXT: ret void

; AS-LABEL: aarch64_ilc_volatile
; AS-DAG: @function
; AS-NOT: ld2
; AS-NOT: ld3
; AS-NOT: ld4
; AS-DAG: ret

entry:
  %gep2 = getelementptr inbounds <4 x float>, ptr %ptr, i32 1
  ; The volatile qualifier on this load is the point of the test.
  %ld1 = load volatile <4 x float>, ptr %ptr, align 16
  %ld2 = load <4 x float>, ptr %gep2, align 16
  %m0_3 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %m4_7 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  store <4 x float> %m0_3, ptr %ptr, align 16
  store <4 x float> %m4_7, ptr %gep2, align 16
  ret void
}

; This must not be lowered
;
; Even/odd split of the two adjacent <4 x float> vectors at %ptr and %ptr + 1,
; but a store to %gep2 sits between the first load and the load from %gep2, so
; the second load reads memory written after the first load. The opt checks
; expect the instruction sequence to come out exactly as written; the llc
; checks expect no ld2, ld3 or ld4 between the function's type directive and
; its ret.
; NOTE(review): the %idx parameter is not used in the body.
define void @aarch64_ilc_depmem(ptr %ptr, i32 %idx) {
entry:
; CHECK-LABEL: @aarch64_ilc_depmem(
; CHECK: %gep2 = getelementptr inbounds <4 x float>, ptr %ptr, i32 1
; CHECK-NEXT: %ld1 = load <4 x float>, ptr %ptr, align 16
; CHECK-NEXT: store <4 x float> %ld1, ptr %gep2, align 16
; CHECK-NEXT: %ld2 = load <4 x float>, ptr %gep2, align 16
; CHECK-NEXT: %m0_3 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT: %m4_7 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK-NEXT: store <4 x float> %m0_3, ptr %ptr, align 16
; CHECK-NEXT: store <4 x float> %m4_7, ptr %gep2, align 16
; CHECK-NEXT: ret void

; AS-LABEL: aarch64_ilc_depmem
; AS-DAG: @function
; AS-NOT: ld2
; AS-NOT: ld3
; AS-NOT: ld4
; AS-DAG: ret

  %gep2 = getelementptr inbounds <4 x float>, ptr %ptr, i32 1
  %ld1 = load <4 x float>, ptr %ptr, align 16
  ; Intervening store to the address that %ld2 reads next.
  store <4 x float> %ld1, ptr %gep2, align 16
  %ld2 = load <4 x float>, ptr %gep2, align 16
  %m0_3 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %m4_7 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>

  store <4 x float> %m0_3, ptr %ptr, align 16
  store <4 x float> %m4_7, ptr %gep2, align 16
  ret void
}

; This cannot be converted - insertion position cannot be determined
;
; Two overlapping <5 x float> loads, at float offsets 0 and 4 from %ptr, feed
; two shufflevectors. Relative to %ptr, %s0 selects flat elements 1, 3, 5, 7
; and %s1 selects flat elements 2, 4, 6, 8. The opt checks expect the
; instruction sequence to come out exactly as written. There are no llc
; checks for this function.
define void @aarch64_no_insertion_pos(ptr %ptr) {
entry:
; CHECK-LABEL: @aarch64_no_insertion_pos(
; CHECK: %p1 = getelementptr inbounds float, ptr %ptr, i32 4
; CHECK-NEXT: %l0 = load <5 x float>, ptr %ptr
; CHECK-NEXT: %l1 = load <5 x float>, ptr %p1
; CHECK-NEXT: %s0 = shufflevector <5 x float> %l0, <5 x float> %l1, <4 x i32> <i32 1, i32 3, i32 6, i32 8>
; CHECK-NEXT: %s1 = shufflevector <5 x float> %l0, <5 x float> %l1, <4 x i32> <i32 2, i32 4, i32 7, i32 9>
; CHECK-NEXT: ret void

  %p1 = getelementptr inbounds float, ptr %ptr, i32 4
  %l0 = load <5 x float>, ptr %ptr
  %l1 = load <5 x float>, ptr %p1
  ; The shuffle results are never used; only the opt output is inspected.
  %s0 = shufflevector <5 x float> %l0, <5 x float> %l1, <4 x i32> <i32 1, i32 3, i32 6, i32 8>
  %s1 = shufflevector <5 x float> %l0, <5 x float> %l1, <4 x i32> <i32 2, i32 4, i32 7, i32 9>
  ret void
}

; This cannot be converted - the insertion position does not dominate all
; uses
;
; Two <7 x float> loads, at float offsets 1 and 0 from %ptr, are each reduced
; to their even elements. Relative to %ptr, %s1 selects flat elements
; 1, 3, 5, 7 and %s0 selects flat elements 0, 2, 4, 6. The load at offset 1
; and its shuffle %s1 come before the load at offset 0. The opt checks expect
; the instruction sequence to come out exactly as written. There are no llc
; checks for this function.
define void @aarch64_insertpos_does_not_dominate(ptr %ptr) {
entry:
; CHECK-LABEL: @aarch64_insertpos_does_not_dominate(
; CHECK: %p1 = getelementptr inbounds float, ptr %ptr, i32 1
; CHECK-NEXT: %l1 = load <7 x float>, ptr %p1
; CHECK-NEXT: %s1 = shufflevector <7 x float> %l1, <7 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT: %l0 = load <7 x float>, ptr %ptr
; CHECK-NEXT: %s0 = shufflevector <7 x float> %l0, <7 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT: ret void
  %p1 = getelementptr inbounds float, ptr %ptr, i32 1
  %l1 = load <7 x float>, ptr %p1
  ; %s1 is defined here, before %l0 has been loaded.
  %s1 = shufflevector <7 x float> %l1, <7 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %l0 = load <7 x float>, ptr %ptr
  %s0 = shufflevector <7 x float> %l0, <7 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  ret void
}
