; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -mtriple=riscv32 -mattr=+v,+zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,CHECK-NO-MISALIGN,RV32
; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,CHECK-NO-MISALIGN,RV64
; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+unaligned-vector-mem -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,RV64,RV64-MISALIGN

; RUN: llc -mtriple=riscv64 -mattr=+f,+zfh,+zve64f,+zvl128b,+zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,CHECK-NO-MISALIGN,ZVE64F

; The two loads are contiguous and should be folded into one
define void @widen_2xv4i16(ptr %x, ptr %z) {
; CHECK-LABEL: widen_2xv4i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT:    vle64.v v8, (a0)
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 8
  %b = load <4 x i16>, ptr %b.gep
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

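; Three contiguous loads; currently lowered as separate loads plus slideups
; rather than a single wide load.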
define void @widen_3xv4i16(ptr %x, ptr %z) {
; CHECK-LABEL: widen_3xv4i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0)
; CHECK-NEXT:    addi a2, a0, 8
; CHECK-NEXT:    vle16.v v9, (a2)
; CHECK-NEXT:    addi a0, a0, 16
; CHECK-NEXT:    vle16.v v10, (a0)
; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT:    vslideup.vi v8, v9, 4
; CHECK-NEXT:    vsetivli zero, 12, e16, m2, ta, ma
; CHECK-NEXT:    vslideup.vi v8, v10, 8
; CHECK-NEXT:    vse16.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 8
  %b = load <4 x i16>, ptr %b.gep
  %c.gep = getelementptr i8, ptr %b.gep, i64 8
  %c = load <4 x i16>, ptr %c.gep
  %d.0 = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %d.1 = shufflevector <4 x i16> %c, <4 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
  %d.2 = shufflevector <8 x i16> %d.0, <8 x i16> %d.1, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  store <12 x i16> %d.2, ptr %z
  ret void
}

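; Four contiguous loads should be folded into a single wide load.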
define void @widen_4xv4i16(ptr %x, ptr %z) {
; CHECK-LABEL: widen_4xv4i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vle64.v v8, (a0)
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 8
  %b = load <4 x i16>, ptr %b.gep
  %c.gep = getelementptr i8, ptr %b.gep, i64 8
  %c = load <4 x i16>, ptr %c.gep
  %d.gep = getelementptr i8, ptr %c.gep, i64 8
  %d = load <4 x i16>, ptr %d.gep
  %e.0 = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.1 = shufflevector <4 x i16> %c, <4 x i16> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.2 = shufflevector <8 x i16> %e.0, <8 x i16> %e.1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  store <16 x i16> %e.2, ptr %z
  ret void
}

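; Same as above, but with unaligned (align 1) loads. Only folded into a single
; wide load when unaligned vector accesses are supported.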
define void @widen_4xv4i16_unaligned(ptr %x, ptr %z) {
; CHECK-NO-MISALIGN-LABEL: widen_4xv4i16_unaligned:
; CHECK-NO-MISALIGN:       # %bb.0:
; CHECK-NO-MISALIGN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NO-MISALIGN-NEXT:    vle8.v v8, (a0)
; CHECK-NO-MISALIGN-NEXT:    addi a2, a0, 16
; CHECK-NO-MISALIGN-NEXT:    vle8.v v10, (a2)
; CHECK-NO-MISALIGN-NEXT:    addi a2, a0, 8
; CHECK-NO-MISALIGN-NEXT:    addi a0, a0, 24
; CHECK-NO-MISALIGN-NEXT:    vle8.v v9, (a0)
; CHECK-NO-MISALIGN-NEXT:    vle8.v v11, (a2)
; CHECK-NO-MISALIGN-NEXT:    vslideup.vi v10, v9, 4
; CHECK-NO-MISALIGN-NEXT:    vslideup.vi v8, v11, 4
; CHECK-NO-MISALIGN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
; CHECK-NO-MISALIGN-NEXT:    vslideup.vi v8, v10, 8
; CHECK-NO-MISALIGN-NEXT:    vse16.v v8, (a1)
; CHECK-NO-MISALIGN-NEXT:    ret
;
; RV64-MISALIGN-LABEL: widen_4xv4i16_unaligned:
; RV64-MISALIGN:       # %bb.0:
; RV64-MISALIGN-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; RV64-MISALIGN-NEXT:    vle64.v v8, (a0)
; RV64-MISALIGN-NEXT:    vse64.v v8, (a1)
; RV64-MISALIGN-NEXT:    ret
  %a = load <4 x i16>, ptr %x, align 1
  %b.gep = getelementptr i8, ptr %x, i64 8
  %b = load <4 x i16>, ptr %b.gep, align 1
  %c.gep = getelementptr i8, ptr %b.gep, i64 8
  %c = load <4 x i16>, ptr %c.gep, align 1
  %d.gep = getelementptr i8, ptr %c.gep, i64 8
  %d = load <4 x i16>, ptr %d.gep, align 1
  %e.0 = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.1 = shufflevector <4 x i16> %c, <4 x i16> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.2 = shufflevector <8 x i16> %e.0, <8 x i16> %e.1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  store <16 x i16> %e.2, ptr %z
  ret void
}

; Should be a strided load - with type coercion to i64
define void @strided_constant(ptr %x, ptr %z) {
; CHECK-LABEL: strided_constant:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a2, 16
; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 16
  %b = load <4 x i16>, ptr %b.gep
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

; Should be a strided load
define void @strided_constant_64(ptr %x, ptr %z) {
; CHECK-LABEL: strided_constant_64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a2, 64
; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 64
  %b = load <4 x i16>, ptr %b.gep
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

; Vector is too large to fit into a single strided load
define void @strided_constant_v4i32(ptr %x, ptr %z) {
; CHECK-LABEL: strided_constant_v4i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    addi a0, a0, 32
; CHECK-NEXT:    vle32.v v10, (a0)
; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT:    vslideup.vi v8, v10, 4
; CHECK-NEXT:    vse32.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i32>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 32
  %b = load <4 x i32>, ptr %b.gep
  %c = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i32> %c, ptr %z
  ret void
}

; Interestingly, can be a stride 0 load
define void @strided_constant_0(ptr %x, ptr %z) {
; CHECK-LABEL: strided_constant_0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0)
; CHECK-NEXT:    vmv1r.v v9, v8
; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT:    vslideup.vi v9, v8, 4
; CHECK-NEXT:    vse16.v v9, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b = load <4 x i16>, ptr %x
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

; Stride isn't consistent, so shouldn't be combined
define void @strided_constant_mismatch_4xv4i16(ptr %x, ptr %z) {
; CHECK-LABEL: strided_constant_mismatch_4xv4i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0)
; CHECK-NEXT:    addi a2, a0, 6
; CHECK-NEXT:    vle16.v v10, (a2)
; CHECK-NEXT:    addi a2, a0, 2
; CHECK-NEXT:    addi a0, a0, 8
; CHECK-NEXT:    vle16.v v9, (a0)
; CHECK-NEXT:    vle16.v v11, (a2)
; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT:    vslideup.vi v10, v9, 4
; CHECK-NEXT:    vslideup.vi v8, v11, 4
; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
; CHECK-NEXT:    vslideup.vi v8, v10, 8
; CHECK-NEXT:    vse16.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 2
  %b = load <4 x i16>, ptr %b.gep
  %c.gep = getelementptr i8, ptr %b.gep, i64 4
  %c = load <4 x i16>, ptr %c.gep
  %d.gep = getelementptr i8, ptr %c.gep, i64 2
  %d = load <4 x i16>, ptr %d.gep
  %e.0 = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.1 = shufflevector <4 x i16> %c, <4 x i16> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.2 = shufflevector <8 x i16> %e.0, <8 x i16> %e.1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  store <16 x i16> %e.2, ptr %z
  ret void
}

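; Should be a strided load with a runtime stride.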
define void @strided_runtime(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: strided_runtime:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load <4 x i16>, ptr %b.gep
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

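; Four loads with a consistent runtime stride should be combined into a single
; strided load.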
define void @strided_runtime_4xv4i16(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: strided_runtime_4xv4i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load <4 x i16>, ptr %b.gep
  %c.gep = getelementptr i8, ptr %b.gep, i64 %s
  %c = load <4 x i16>, ptr %c.gep
  %d.gep = getelementptr i8, ptr %c.gep, i64 %s
  %d = load <4 x i16>, ptr %d.gep
  %e.0 = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.1 = shufflevector <4 x i16> %c, <4 x i16> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.2 = shufflevector <8 x i16> %e.0, <8 x i16> %e.1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  store <16 x i16> %e.2, ptr %z
  ret void
}

; Stride isn't consistent, so shouldn't be combined
define void @strided_runtime_mismatch_4xv4i16(ptr %x, ptr %z, i64 %s, i64 %t) {
; RV32-LABEL: strided_runtime_mismatch_4xv4i16:
; RV32:       # %bb.0:
; RV32-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; RV32-NEXT:    vle16.v v8, (a0)
; RV32-NEXT:    add a0, a0, a2
; RV32-NEXT:    add a4, a0, a4
; RV32-NEXT:    vle16.v v10, (a4)
; RV32-NEXT:    add a2, a4, a2
; RV32-NEXT:    vle16.v v9, (a2)
; RV32-NEXT:    vle16.v v11, (a0)
; RV32-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
; RV32-NEXT:    vslideup.vi v10, v9, 4
; RV32-NEXT:    vslideup.vi v8, v11, 4
; RV32-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
; RV32-NEXT:    vslideup.vi v8, v10, 8
; RV32-NEXT:    vse16.v v8, (a1)
; RV32-NEXT:    ret
;
; RV64-LABEL: strided_runtime_mismatch_4xv4i16:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; RV64-NEXT:    vle16.v v8, (a0)
; RV64-NEXT:    add a0, a0, a2
; RV64-NEXT:    add a3, a0, a3
; RV64-NEXT:    vle16.v v10, (a3)
; RV64-NEXT:    add a2, a3, a2
; RV64-NEXT:    vle16.v v9, (a2)
; RV64-NEXT:    vle16.v v11, (a0)
; RV64-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
; RV64-NEXT:    vslideup.vi v10, v9, 4
; RV64-NEXT:    vslideup.vi v8, v11, 4
; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
; RV64-NEXT:    vslideup.vi v8, v10, 8
; RV64-NEXT:    vse16.v v8, (a1)
; RV64-NEXT:    ret
;
; ZVE64F-LABEL: strided_runtime_mismatch_4xv4i16:
; ZVE64F:       # %bb.0:
; ZVE64F-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; ZVE64F-NEXT:    vle16.v v8, (a0)
; ZVE64F-NEXT:    add a0, a0, a2
; ZVE64F-NEXT:    add a3, a0, a3
; ZVE64F-NEXT:    vle16.v v10, (a3)
; ZVE64F-NEXT:    add a2, a3, a2
; ZVE64F-NEXT:    vle16.v v9, (a2)
; ZVE64F-NEXT:    vle16.v v11, (a0)
; ZVE64F-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
; ZVE64F-NEXT:    vslideup.vi v10, v9, 4
; ZVE64F-NEXT:    vslideup.vi v8, v11, 4
; ZVE64F-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
; ZVE64F-NEXT:    vslideup.vi v8, v10, 8
; ZVE64F-NEXT:    vse16.v v8, (a1)
; ZVE64F-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load <4 x i16>, ptr %b.gep
  %c.gep = getelementptr i8, ptr %b.gep, i64 %t
  %c = load <4 x i16>, ptr %c.gep
  %d.gep = getelementptr i8, ptr %c.gep, i64 %s
  %d = load <4 x i16>, ptr %d.gep
  %e.0 = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.1 = shufflevector <4 x i16> %c, <4 x i16> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.2 = shufflevector <8 x i16> %e.0, <8 x i16> %e.1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  store <16 x i16> %e.2, ptr %z
  ret void
}

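; Same as strided_runtime_4xv4i16, but with f16 elements.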
define void @strided_runtime_4xv4f16(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: strided_runtime_4xv4f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x half>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load <4 x half>, ptr %b.gep
  %c.gep = getelementptr i8, ptr %b.gep, i64 %s
  %c = load <4 x half>, ptr %c.gep
  %d.gep = getelementptr i8, ptr %c.gep, i64 %s
  %d = load <4 x half>, ptr %d.gep
  %e.0 = shufflevector <4 x half> %a, <4 x half> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.1 = shufflevector <4 x half> %c, <4 x half> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.2 = shufflevector <8 x half> %e.0, <8 x half> %e.1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  store <16 x half> %e.2, ptr %z
  ret void
}

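; Same as above, but with <2 x float> sources.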
define void @strided_runtime_4xv2f32(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: strided_runtime_4xv2f32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <2 x float>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load <2 x float>, ptr %b.gep
  %c.gep = getelementptr i8, ptr %b.gep, i64 %s
  %c = load <2 x float>, ptr %c.gep
  %d.gep = getelementptr i8, ptr %c.gep, i64 %s
  %d = load <2 x float>, ptr %d.gep
  %e.0 = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.1 = shufflevector <2 x float> %c, <2 x float> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.2 = shufflevector <4 x float> %e.0, <4 x float> %e.1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x float> %e.2, ptr %z
  ret void
}

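; Unaligned loads should only be combined into a strided load when unaligned
; vector accesses are supported.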
define void @strided_unaligned(ptr %x, ptr %z, i64 %s) {
; CHECK-NO-MISALIGN-LABEL: strided_unaligned:
; CHECK-NO-MISALIGN:       # %bb.0:
; CHECK-NO-MISALIGN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NO-MISALIGN-NEXT:    vle8.v v8, (a0)
; CHECK-NO-MISALIGN-NEXT:    add a0, a0, a2
; CHECK-NO-MISALIGN-NEXT:    vle8.v v9, (a0)
; CHECK-NO-MISALIGN-NEXT:    vslideup.vi v8, v9, 4
; CHECK-NO-MISALIGN-NEXT:    vse16.v v8, (a1)
; CHECK-NO-MISALIGN-NEXT:    ret
;
; RV64-MISALIGN-LABEL: strided_unaligned:
; RV64-MISALIGN:       # %bb.0:
; RV64-MISALIGN-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; RV64-MISALIGN-NEXT:    vlse64.v v8, (a0), a2
; RV64-MISALIGN-NEXT:    vse64.v v8, (a1)
; RV64-MISALIGN-NEXT:    ret
  %a = load <4 x i16>, ptr %x, align 1
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load <4 x i16>, ptr %b.gep, align 1
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

; Should use the most restrictive common alignment
define void @strided_mismatched_alignments(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: strided_mismatched_alignments:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x, align 8
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load <4 x i16>, ptr %b.gep, align 16
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

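; Both loads are sufficiently aligned (align 8), so they should be combined.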
define void @strided_ok_alignments_8(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: strided_ok_alignments_8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x, align 8
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load <4 x i16>, ptr %b.gep, align 8
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

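; Same as above, but with align 16.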
define void @strided_ok_alignments_16(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: strided_ok_alignments_16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x, align 16
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load <4 x i16>, ptr %b.gep, align 16
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

; Shouldn't be combined because one of the loads is not simple
define void @strided_non_simple_load(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: strided_non_simple_load:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0)
; CHECK-NEXT:    add a0, a0, a2
; CHECK-NEXT:    vle16.v v9, (a0)
; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT:    vslideup.vi v8, v9, 4
; CHECK-NEXT:    vse16.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load volatile <4 x i16>, ptr %b.gep
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

; Shouldn't be combined because one of the operands is not a load
define void @strided_non_load(ptr %x, ptr %z, <4 x i16> %b) {
; CHECK-LABEL: strided_non_load:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT:    vle16.v v9, (a0)
; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT:    vslideup.vi v9, v8, 4
; CHECK-NEXT:    vse16.v v9, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

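; Should be a strided load with a negative constant stride.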
define void @strided_constant_neg_4xv2f32(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: strided_constant_neg_4xv2f32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a2, -64
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <2 x float>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 -64
  %b = load <2 x float>, ptr %b.gep
  %c.gep = getelementptr i8, ptr %b.gep, i64 -64
  %c = load <2 x float>, ptr %c.gep
  %d.gep = getelementptr i8, ptr %c.gep, i64 -64
  %d = load <2 x float>, ptr %d.gep
  %e.0 = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.1 = shufflevector <2 x float> %c, <2 x float> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.2 = shufflevector <4 x float> %e.0, <4 x float> %e.1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x float> %e.2, ptr %z
  ret void
}

; This is a strided load with a negative stride
define void @reverse_strided_constant_pos_4xv2f32(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: reverse_strided_constant_pos_4xv2f32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    addi a0, a0, 192
; CHECK-NEXT:    li a2, -64
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %x.1 = getelementptr i8, ptr %x, i64 64
  %x.2 = getelementptr i8, ptr %x.1, i64 64
  %x.3 = getelementptr i8, ptr %x.2, i64 64
  %a = load <2 x float>, ptr %x.3
  %b = load <2 x float>, ptr %x.2
  %c = load <2 x float>, ptr %x.1
  %d = load <2 x float>, ptr %x
  %e.0 = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.1 = shufflevector <2 x float> %c, <2 x float> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.2 = shufflevector <4 x float> %e.0, <4 x float> %e.1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x float> %e.2, ptr %z
  ret void
}

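; The pointers step backwards in memory but the loads are concatenated in
; reverse order, so this becomes a strided load with a positive stride.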
define void @reverse_strided_constant_neg_4xv2f32(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: reverse_strided_constant_neg_4xv2f32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    addi a0, a0, -192
; CHECK-NEXT:    li a2, 64
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %x.1 = getelementptr i8, ptr %x, i64 -64
  %x.2 = getelementptr i8, ptr %x.1, i64 -64
  %x.3 = getelementptr i8, ptr %x.2, i64 -64
  %a = load <2 x float>, ptr %x.3
  %b = load <2 x float>, ptr %x.2
  %c = load <2 x float>, ptr %x.1
  %d = load <2 x float>, ptr %x
  %e.0 = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.1 = shufflevector <2 x float> %c, <2 x float> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.2 = shufflevector <4 x float> %e.0, <4 x float> %e.1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x float> %e.2, ptr %z
  ret void
}

; This is a strided load with a negative stride
define void @reverse_strided_runtime_4xv2f32(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: reverse_strided_runtime_4xv2f32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    add a0, a0, a2
; CHECK-NEXT:    add a3, a2, a2
; CHECK-NEXT:    add a0, a0, a3
; CHECK-NEXT:    neg a2, a2
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %x.1 = getelementptr i8, ptr %x, i64 %s
  %x.2 = getelementptr i8, ptr %x.1, i64 %s
  %x.3 = getelementptr i8, ptr %x.2, i64 %s
  %a = load <2 x float>, ptr %x.3
  %b = load <2 x float>, ptr %x.2
  %c = load <2 x float>, ptr %x.1
  %d = load <2 x float>, ptr %x
  %e.0 = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.1 = shufflevector <2 x float> %c, <2 x float> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.2 = shufflevector <4 x float> %e.0, <4 x float> %e.1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x float> %e.2, ptr %z
  ret void
}

; The middle end sometimes produces this pattern of shuffles, where the
; intermediate shuffles are the full result vector size padded with poison
; elements.
define <16 x i8> @widen_4xv4i8_immediate_expand(ptr %p, i64 %s) {
; CHECK-LABEL: widen_4xv4i8_immediate_expand:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vlse32.v v8, (a0), a1
; CHECK-NEXT:    ret
  %a = load <4 x i8>, ptr %p
  %b.ptr = getelementptr i8, ptr %p, i64 %s
  %b = load <4 x i8>, ptr %b.ptr
  %c.ptr = getelementptr i8, ptr %b.ptr, i64 %s
  %c = load <4 x i8>, ptr %c.ptr
  %d.ptr = getelementptr i8, ptr %c.ptr, i64 %s
  %d = load <4 x i8>, ptr %d.ptr

  %ab = shufflevector <4 x i8> %a, <4 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %cx = shufflevector <4 x i8> %c, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %dx = shufflevector <4 x i8> %d, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %abcx = shufflevector <16 x i8> %ab, <16 x i8> %cx, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
  %abcd = shufflevector <16 x i8> %abcx, <16 x i8> %dx, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
  ret <16 x i8> %abcd
}