xref: /llvm-project/llvm/test/CodeGen/AArch64/vldn_shuffle.ll (revision f6947e479e14e7904aa0b2539a95f5dfdc8f9295)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=aarch64 | FileCheck %s
3
; vld2: factor-2 de-interleaving load test.
; Each iteration loads an interleaved <8 x float>, squares it (two separate
; but identical fmul instructions), takes the even lanes of one product and
; the odd lanes of the other, adds them and stores the <4 x float> sum.
; The expected code is one post-incremented ld2 feeding fmul + fmla.
; %numSamples is unused; the loop runs %index from 0 to 1024 in steps of 4.
4define void @vld2(ptr nocapture readonly %pSrc, ptr noalias nocapture %pDst, i32 %numSamples) {
5; CHECK-LABEL: vld2:
6; CHECK:       .Lfunc_begin0:
7; CHECK-NEXT:    .cfi_startproc
8; CHECK-NEXT:  // %bb.0: // %entry
9; CHECK-NEXT:    mov x8, xzr
10; CHECK-NEXT:  .LBB0_1: // %vector.body
11; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
12; CHECK-NEXT:    ld2 { v0.4s, v1.4s }, [x0], #32
13; CHECK-NEXT:    fmul v2.4s, v0.4s, v0.4s
14; CHECK-NEXT:    fmla v2.4s, v1.4s, v1.4s
15; CHECK-NEXT:    str q2, [x1, x8]
16; CHECK-NEXT:    add x8, x8, #16
17; CHECK-NEXT:    cmp x8, #1, lsl #12 // =4096
18; CHECK-NEXT:    b.ne .LBB0_1
19; CHECK-NEXT:  // %bb.2: // %while.end
20; CHECK-NEXT:    ret
21entry:
22  br label %vector.body
23
24vector.body:                                      ; preds = %vector.body, %entry
25  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  ; Source advances two floats per output element; destination advances one.
26  %0 = shl i64 %index, 1
27  %next.gep = getelementptr float, ptr %pSrc, i64 %0
28  %next.gep19 = getelementptr float, ptr %pDst, i64 %index
29  %wide.vec = load <8 x float>, ptr %next.gep, align 4
  ; Even lanes (0,2,4,6) of the first square, odd lanes (1,3,5,7) of the second.
30  %1 = fmul fast <8 x float> %wide.vec, %wide.vec
31  %2 = shufflevector <8 x float> %1, <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
32  %3 = fmul fast <8 x float> %wide.vec, %wide.vec
33  %4 = shufflevector <8 x float> %3, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
34  %5 = fadd fast <4 x float> %4, %2
35  store <4 x float> %5, ptr %next.gep19, align 4
36  %index.next = add i64 %index, 4
37  %6 = icmp eq i64 %index.next, 1024
38  br i1 %6, label %while.end, label %vector.body
39
40while.end:                                        ; preds = %vector.body
41  ret void
42}
43
; vld3: factor-3 de-interleaving load test.
; Each iteration loads an interleaved <12 x float>, squares it three times
; (separate identical fmuls), extracts the stride-3 lanes starting at 0, 1
; and 2 from the respective products, and stores their <4 x float> sum.
; The expected code is one post-incremented ld3 feeding fmul + two fmla.
; %numSamples is unused; the loop runs %index from 0 to 1024 in steps of 4.
44define void @vld3(ptr nocapture readonly %pSrc, ptr noalias nocapture %pDst, i32 %numSamples) {
45; CHECK-LABEL: vld3:
46; CHECK:       .Lfunc_begin1:
47; CHECK-NEXT:    .cfi_startproc
48; CHECK-NEXT:  // %bb.0: // %entry
49; CHECK-NEXT:    mov x8, xzr
50; CHECK-NEXT:  .LBB1_1: // %vector.body
51; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
52; CHECK-NEXT:    ld3 { v0.4s, v1.4s, v2.4s }, [x0], #48
53; CHECK-NEXT:    fmul v3.4s, v0.4s, v0.4s
54; CHECK-NEXT:    fmla v3.4s, v1.4s, v1.4s
55; CHECK-NEXT:    fmla v3.4s, v2.4s, v2.4s
56; CHECK-NEXT:    str q3, [x1, x8]
57; CHECK-NEXT:    add x8, x8, #16
58; CHECK-NEXT:    cmp x8, #1, lsl #12 // =4096
59; CHECK-NEXT:    b.ne .LBB1_1
60; CHECK-NEXT:  // %bb.2: // %while.end
61; CHECK-NEXT:    ret
62entry:
63  br label %vector.body
64
65vector.body:                                      ; preds = %vector.body, %entry
66  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  ; Source advances three floats per output element; destination advances one.
67  %0 = mul i64 %index, 3
68  %next.gep = getelementptr float, ptr %pSrc, i64 %0
69  %next.gep23 = getelementptr float, ptr %pDst, i64 %index
70  %wide.vec = load <12 x float>, ptr %next.gep, align 4
71  %1 = fmul fast <12 x float> %wide.vec, %wide.vec
72  %2 = shufflevector <12 x float> %1, <12 x float> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
73  %3 = fmul fast <12 x float> %wide.vec, %wide.vec
74  %4 = shufflevector <12 x float> %3, <12 x float> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
75  %5 = fadd fast <4 x float> %4, %2
76  %6 = fmul fast <12 x float> %wide.vec, %wide.vec
77  %7 = shufflevector <12 x float> %6, <12 x float> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
78  %8 = fadd fast <4 x float> %5, %7
79  store <4 x float> %8, ptr %next.gep23, align 4
80  %index.next = add i64 %index, 4
81  %9 = icmp eq i64 %index.next, 1024
82  br i1 %9, label %while.end, label %vector.body
83
84while.end:                                        ; preds = %vector.body
85  ret void
86}
87
; vld4: factor-4 de-interleaving load combined with a factor-2 interleaving
; store. Each iteration loads an interleaved <16 x float>, squares it (four
; separate identical fmuls), sums the stride-4 lanes pairwise (0 with 1, 2 with
; 3), then re-interleaves the two <4 x float> sums into one <8 x float> store.
; The expected code is a post-incremented ld4, two fmul/fmla pairs and an st2.
; %numSamples is unused; the loop runs %index from 0 to 1024 in steps of 4.
88define void @vld4(ptr nocapture readonly %pSrc, ptr noalias nocapture %pDst, i32 %numSamples) {
89; CHECK-LABEL: vld4:
90; CHECK:       .Lfunc_begin2:
91; CHECK-NEXT:    .cfi_startproc
92; CHECK-NEXT:  // %bb.0: // %entry
93; CHECK-NEXT:    mov x8, xzr
94; CHECK-NEXT:  .LBB2_1: // %vector.body
95; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
96; CHECK-NEXT:    ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0], #64
97; CHECK-NEXT:    add x9, x1, x8
98; CHECK-NEXT:    add x8, x8, #32
99; CHECK-NEXT:    cmp x8, #2, lsl #12 // =8192
100; CHECK-NEXT:    fmul v4.4s, v0.4s, v0.4s
101; CHECK-NEXT:    fmla v4.4s, v1.4s, v1.4s
102; CHECK-NEXT:    fmul v5.4s, v2.4s, v2.4s
103; CHECK-NEXT:    fmla v5.4s, v3.4s, v3.4s
104; CHECK-NEXT:    st2 { v4.4s, v5.4s }, [x9]
105; CHECK-NEXT:    b.ne .LBB2_1
106; CHECK-NEXT:  // %bb.2: // %while.end
107; CHECK-NEXT:    ret
108entry:
109  br label %vector.body
110
111vector.body:                                      ; preds = %vector.body, %entry
112  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  ; Source offset is 4 * %index floats; destination offset (%1) is 2 * %index.
113  %0 = shl i64 %index, 2
114  %next.gep = getelementptr float, ptr %pSrc, i64 %0
115  %1 = shl i64 %index, 1
116  %wide.vec = load <16 x float>, ptr %next.gep, align 4
117  %2 = fmul fast <16 x float> %wide.vec, %wide.vec
118  %3 = shufflevector <16 x float> %2, <16 x float> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
119  %4 = fmul fast <16 x float> %wide.vec, %wide.vec
120  %5 = shufflevector <16 x float> %4, <16 x float> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
121  %6 = fadd fast <4 x float> %5, %3
122  %7 = fmul fast <16 x float> %wide.vec, %wide.vec
123  %8 = shufflevector <16 x float> %7, <16 x float> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
124  %9 = fmul fast <16 x float> %wide.vec, %wide.vec
125  %10 = shufflevector <16 x float> %9, <16 x float> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
126  %11 = fadd fast <4 x float> %10, %8
127  %12 = getelementptr inbounds float, ptr %pDst, i64 %1
  ; Interleave the two sums element by element before the wide store.
128  %interleaved.vec = shufflevector <4 x float> %6, <4 x float> %11, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
129  store <8 x float> %interleaved.vec, ptr %12, align 4
130  %index.next = add i64 %index, 4
131  %13 = icmp eq i64 %index.next, 1024
132  br i1 %13, label %while.end, label %vector.body
133
134while.end:                                        ; preds = %vector.body
135  ret void
136}
137
; twosrc: factor-2 de-interleaving with two different source operands.
; Each iteration loads an interleaved <8 x float> from %pSrc and from %pSrc2,
; multiplies them element-wise (two separate identical fmuls), and stores the
; sum of the even-lane and odd-lane products as a <4 x float>.
; The expected code is two ld2 instructions whose registers are combined by
; fmul + fmla. %numSamples is unused; %index runs 0 to 1024 in steps of 4.
138define void @twosrc(ptr nocapture readonly %pSrc, ptr nocapture readonly %pSrc2, ptr noalias nocapture %pDst, i32 %numSamples) {
139; CHECK-LABEL: twosrc:
140; CHECK:       .Lfunc_begin3:
141; CHECK-NEXT:    .cfi_startproc
142; CHECK-NEXT:  // %bb.0: // %entry
143; CHECK-NEXT:    mov x8, xzr
144; CHECK-NEXT:  .LBB3_1: // %vector.body
145; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
146; CHECK-NEXT:    add x9, x0, x8
147; CHECK-NEXT:    add x10, x1, x8
148; CHECK-NEXT:    add x8, x8, #32
149; CHECK-NEXT:    ld2 { v0.4s, v1.4s }, [x9]
150; CHECK-NEXT:    cmp x8, #2, lsl #12 // =8192
151; CHECK-NEXT:    ld2 { v2.4s, v3.4s }, [x10]
152; CHECK-NEXT:    fmul v4.4s, v2.4s, v0.4s
153; CHECK-NEXT:    fmla v4.4s, v1.4s, v3.4s
154; CHECK-NEXT:    str q4, [x2], #16
155; CHECK-NEXT:    b.ne .LBB3_1
156; CHECK-NEXT:  // %bb.2: // %while.end
157; CHECK-NEXT:    ret
158entry:
159  br label %vector.body
160
161vector.body:                                      ; preds = %vector.body, %entry
162  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  ; Both sources advance two floats per output element.
163  %0 = shl i64 %index, 1
164  %next.gep = getelementptr float, ptr %pSrc, i64 %0
165  %1 = shl i64 %index, 1
166  %next.gep23 = getelementptr float, ptr %pSrc2, i64 %1
167  %next.gep24 = getelementptr float, ptr %pDst, i64 %index
168  %wide.vec = load <8 x float>, ptr %next.gep, align 4
169  %wide.vec26 = load <8 x float>, ptr %next.gep23, align 4
170  %2 = fmul fast <8 x float> %wide.vec26, %wide.vec
171  %3 = shufflevector <8 x float> %2, <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
172  %4 = fmul fast <8 x float> %wide.vec26, %wide.vec
173  %5 = shufflevector <8 x float> %4, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
174  %6 = fadd fast <4 x float> %5, %3
175  store <4 x float> %6, ptr %next.gep24, align 4
176  %index.next = add i64 %index, 4
177  %7 = icmp eq i64 %index.next, 1024
178  br i1 %7, label %while.end, label %vector.body
179
180while.end:                                        ; preds = %vector.body
181  ret void
182}
183
; vld2_multiuse: factor-2 de-interleaving where a single fmul result has
; multiple shuffle users. One squared <8 x float> feeds both the even-lane and
; the odd-lane shuffle; the two halves are added and stored as <4 x float>.
; The expected code is a post-incremented ld2 feeding fmul + fmla.
; %numSamples is unused; the loop runs %index from 0 to 1024 in steps of 4.
184define void @vld2_multiuse(ptr nocapture readonly %pSrc, ptr noalias nocapture %pDst, i32 %numSamples) {
185; CHECK-LABEL: vld2_multiuse:
186; CHECK:       .Lfunc_begin4:
187; CHECK-NEXT:    .cfi_startproc
188; CHECK-NEXT:  // %bb.0: // %entry
189; CHECK-NEXT:    mov x8, xzr
190; CHECK-NEXT:  .LBB4_1: // %vector.body
191; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
192; CHECK-NEXT:    ld2 { v0.4s, v1.4s }, [x0], #32
193; CHECK-NEXT:    fmul v2.4s, v0.4s, v0.4s
194; CHECK-NEXT:    fmla v2.4s, v1.4s, v1.4s
195; CHECK-NEXT:    str q2, [x1, x8]
196; CHECK-NEXT:    add x8, x8, #16
197; CHECK-NEXT:    cmp x8, #1, lsl #12 // =4096
198; CHECK-NEXT:    b.ne .LBB4_1
199; CHECK-NEXT:  // %bb.2: // %while.end
200; CHECK-NEXT:    ret
201entry:
202  br label %vector.body
203
204vector.body:                                      ; preds = %vector.body, %entry
205  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
206  %0 = shl i64 %index, 1
207  %next.gep = getelementptr float, ptr %pSrc, i64 %0
208  %next.gep19 = getelementptr float, ptr %pDst, i64 %index
209  %wide.vec = load <8 x float>, ptr %next.gep, align 4
  ; %1 is used by both shuffles below.
210  %1 = fmul fast <8 x float> %wide.vec, %wide.vec
211  %2 = shufflevector <8 x float> %1, <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
212  %3 = shufflevector <8 x float> %1, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
213  %4 = fadd fast <4 x float> %3, %2
214  store <4 x float> %4, ptr %next.gep19, align 4
215  %index.next = add i64 %index, 4
216  %5 = icmp eq i64 %index.next, 1024
217  br i1 %5, label %while.end, label %vector.body
218
219while.end:                                        ; preds = %vector.body
220  ret void
221}
222
; vld3_multiuse: factor-3 de-interleaving where a single fmul result has
; multiple shuffle users. One squared <12 x float> feeds all three stride-3
; shuffles; the three <4 x float> parts are summed and stored.
; The expected code is a post-incremented ld3 feeding fmul + two fmla.
; %numSamples is unused; the loop runs %index from 0 to 1024 in steps of 4.
223define void @vld3_multiuse(ptr nocapture readonly %pSrc, ptr noalias nocapture %pDst, i32 %numSamples) {
224; CHECK-LABEL: vld3_multiuse:
225; CHECK:       .Lfunc_begin5:
226; CHECK-NEXT:    .cfi_startproc
227; CHECK-NEXT:  // %bb.0: // %entry
228; CHECK-NEXT:    mov x8, xzr
229; CHECK-NEXT:  .LBB5_1: // %vector.body
230; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
231; CHECK-NEXT:    ld3 { v0.4s, v1.4s, v2.4s }, [x0], #48
232; CHECK-NEXT:    fmul v3.4s, v0.4s, v0.4s
233; CHECK-NEXT:    fmla v3.4s, v1.4s, v1.4s
234; CHECK-NEXT:    fmla v3.4s, v2.4s, v2.4s
235; CHECK-NEXT:    str q3, [x1, x8]
236; CHECK-NEXT:    add x8, x8, #16
237; CHECK-NEXT:    cmp x8, #1, lsl #12 // =4096
238; CHECK-NEXT:    b.ne .LBB5_1
239; CHECK-NEXT:  // %bb.2: // %while.end
240; CHECK-NEXT:    ret
241entry:
242  br label %vector.body
243
244vector.body:                                      ; preds = %vector.body, %entry
245  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
246  %0 = mul i64 %index, 3
247  %next.gep = getelementptr float, ptr %pSrc, i64 %0
248  %next.gep23 = getelementptr float, ptr %pDst, i64 %index
249  %wide.vec = load <12 x float>, ptr %next.gep, align 4
  ; %1 is used by all three shuffles below.
250  %1 = fmul fast <12 x float> %wide.vec, %wide.vec
251  %2 = shufflevector <12 x float> %1, <12 x float> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
252  %3 = shufflevector <12 x float> %1, <12 x float> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
253  %4 = fadd fast <4 x float> %3, %2
254  %5 = shufflevector <12 x float> %1, <12 x float> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
255  %6 = fadd fast <4 x float> %4, %5
256  store <4 x float> %6, ptr %next.gep23, align 4
257  %index.next = add i64 %index, 4
258  %7 = icmp eq i64 %index.next, 1024
259  br i1 %7, label %while.end, label %vector.body
260
261while.end:                                        ; preds = %vector.body
262  ret void
263}
264
; vld4_multiuse: factor-4 de-interleaving load plus factor-2 interleaving
; store, where a single fmul result has multiple shuffle users. One squared
; <16 x float> feeds all four stride-4 shuffles; lanes 0+1 and 2+3 are summed
; pairwise and the two sums are re-interleaved into one <8 x float> store.
; The expected code is a post-incremented ld4, two fmul/fmla pairs and an st2.
; %numSamples is unused; the loop runs %index from 0 to 1024 in steps of 4.
265define void @vld4_multiuse(ptr nocapture readonly %pSrc, ptr noalias nocapture %pDst, i32 %numSamples) {
266; CHECK-LABEL: vld4_multiuse:
267; CHECK:       .Lfunc_begin6:
268; CHECK-NEXT:    .cfi_startproc
269; CHECK-NEXT:  // %bb.0: // %entry
270; CHECK-NEXT:    mov x8, xzr
271; CHECK-NEXT:  .LBB6_1: // %vector.body
272; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
273; CHECK-NEXT:    ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0], #64
274; CHECK-NEXT:    add x9, x1, x8
275; CHECK-NEXT:    add x8, x8, #32
276; CHECK-NEXT:    cmp x8, #2, lsl #12 // =8192
277; CHECK-NEXT:    fmul v4.4s, v0.4s, v0.4s
278; CHECK-NEXT:    fmla v4.4s, v1.4s, v1.4s
279; CHECK-NEXT:    fmul v5.4s, v2.4s, v2.4s
280; CHECK-NEXT:    fmla v5.4s, v3.4s, v3.4s
281; CHECK-NEXT:    st2 { v4.4s, v5.4s }, [x9]
282; CHECK-NEXT:    b.ne .LBB6_1
283; CHECK-NEXT:  // %bb.2: // %while.end
284; CHECK-NEXT:    ret
285entry:
286  br label %vector.body
287
288vector.body:                                      ; preds = %vector.body, %entry
289  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
290  %0 = shl i64 %index, 2
291  %next.gep = getelementptr float, ptr %pSrc, i64 %0
292  %1 = shl i64 %index, 1
293  %wide.vec = load <16 x float>, ptr %next.gep, align 4
  ; %2 is used by all four shuffles below.
294  %2 = fmul fast <16 x float> %wide.vec, %wide.vec
295  %3 = shufflevector <16 x float> %2, <16 x float> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
296  %4 = shufflevector <16 x float> %2, <16 x float> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
297  %5 = fadd fast <4 x float> %4, %3
298  %6 = shufflevector <16 x float> %2, <16 x float> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
299  %7 = shufflevector <16 x float> %2, <16 x float> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
300  %8 = fadd fast <4 x float> %7, %6
301  %9 = getelementptr inbounds float, ptr %pDst, i64 %1
302  %interleaved.vec = shufflevector <4 x float> %5, <4 x float> %8, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
303  store <8 x float> %interleaved.vec, ptr %9, align 4
304  %index.next = add i64 %index, 4
305  %10 = icmp eq i64 %index.next, 1024
306  br i1 %10, label %while.end, label %vector.body
307
308while.end:                                        ; preds = %vector.body
309  ret void
310}
311
312; This example has store(shuffle(shuffle(... that would be better to be treated
313; as a single store. This avoids the vld2 for data that is already shuffled.
; The function loads eight consecutive <8 x i16> vectors from %a, combines
; them pairwise with 16-bit and then 32-bit shuffles, and stores the zip-low
; result to %a and the zip-high result to %a + 4 vectors (%arrayidx5).
; The expected code uses plain ldp loads and trn1, stores the low half with an
; st2 of .2s registers, and stores the high half with zip2 + str.
314define void @transpose_s16_8x8_simpler(ptr nocapture noundef %a) {
315; CHECK-LABEL: transpose_s16_8x8_simpler:
316; CHECK:       .Lfunc_begin7:
317; CHECK-NEXT:    .cfi_startproc
318; CHECK-NEXT:  // %bb.0: // %entry
319; CHECK-NEXT:    ldp q0, q1, [x0]
320; CHECK-NEXT:    ldp q2, q3, [x0, #64]
321; CHECK-NEXT:    ldp q4, q5, [x0, #32]
322; CHECK-NEXT:    ldp q6, q7, [x0, #96]
323; CHECK-NEXT:    trn1 v0.8h, v0.8h, v1.8h
324; CHECK-NEXT:    trn1 v1.8h, v2.8h, v3.8h
325; CHECK-NEXT:    trn1 v2.8h, v4.8h, v5.8h
326; CHECK-NEXT:    trn1 v3.8h, v6.8h, v7.8h
327; CHECK-NEXT:    trn1 v0.4s, v0.4s, v1.4s
328; CHECK-NEXT:    trn1 v1.4s, v2.4s, v3.4s
329; CHECK-NEXT:    zip2 v2.4s, v0.4s, v1.4s
330; CHECK-NEXT:    st2 { v0.2s, v1.2s }, [x0]
331; CHECK-NEXT:    str q2, [x0, #64]
332; CHECK-NEXT:    ret
333entry:
  ; First level: pair up adjacent rows at 16-bit granularity (some lanes undef).
334  %0 = load <8 x i16>, ptr %a, align 16
335  %arrayidx1 = getelementptr inbounds <8 x i16>, ptr %a, i64 1
336  %1 = load <8 x i16>, ptr %arrayidx1, align 16
337  %shuffle.i = shufflevector <8 x i16> %0, <8 x i16> %1, <8 x i32> <i32 0, i32 8, i32 undef, i32 undef, i32 4, i32 12, i32 undef, i32 undef>
338  %arrayidx2 = getelementptr inbounds <8 x i16>, ptr %a, i64 2
339  %2 = load <8 x i16>, ptr %arrayidx2, align 16
340  %arrayidx3 = getelementptr inbounds <8 x i16>, ptr %a, i64 3
341  %3 = load <8 x i16>, ptr %arrayidx3, align 16
342  %shuffle.i34 = shufflevector <8 x i16> %2, <8 x i16> %3, <8 x i32> <i32 0, i32 8, i32 undef, i32 undef, i32 4, i32 12, i32 undef, i32 undef>
343  %arrayidx5 = getelementptr inbounds <8 x i16>, ptr %a, i64 4
344  %4 = load <8 x i16>, ptr %arrayidx5, align 16
345  %arrayidx6 = getelementptr inbounds <8 x i16>, ptr %a, i64 5
346  %5 = load <8 x i16>, ptr %arrayidx6, align 16
347  %shuffle.i35 = shufflevector <8 x i16> %4, <8 x i16> %5, <8 x i32> <i32 0, i32 8, i32 undef, i32 undef, i32 4, i32 12, i32 undef, i32 undef>
348  %arrayidx8 = getelementptr inbounds <8 x i16>, ptr %a, i64 6
349  %6 = load <8 x i16>, ptr %arrayidx8, align 16
350  %arrayidx9 = getelementptr inbounds <8 x i16>, ptr %a, i64 7
351  %7 = load <8 x i16>, ptr %arrayidx9, align 16
352  %shuffle.i36 = shufflevector <8 x i16> %6, <8 x i16> %7, <8 x i32> <i32 0, i32 8, i32 undef, i32 undef, i32 4, i32 12, i32 undef, i32 undef>
  ; Second level: combine the pairs at 32-bit granularity.
353  %8 = bitcast <8 x i16> %shuffle.i to <4 x i32>
354  %9 = bitcast <8 x i16> %shuffle.i35 to <4 x i32>
355  %shuffle.i37 = shufflevector <4 x i32> %8, <4 x i32> %9, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
356  %10 = bitcast <8 x i16> %shuffle.i34 to <4 x i32>
357  %11 = bitcast <8 x i16> %shuffle.i36 to <4 x i32>
358  %shuffle.i38 = shufflevector <4 x i32> %10, <4 x i32> %11, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  ; Final level: zip-low and zip-high of the two results, stored separately.
359  %vzip.i = shufflevector <4 x i32> %shuffle.i37, <4 x i32> %shuffle.i38, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
360  %vzip1.i = shufflevector <4 x i32> %shuffle.i37, <4 x i32> %shuffle.i38, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
361  store <4 x i32> %vzip.i, ptr %a, align 16
362  store <4 x i32> %vzip1.i, ptr %arrayidx5, align 16
363  ret void
364}
365
366; Same as above with some different shuffles
; The only IR differences from transpose_s16_8x8_simpler are the two 32-bit
; shuffle masks, <1, 4, 2, 7> and <0, 5, 3, 6>. The expected code builds those
; vectors with lane moves, zip1 and uzp1 instead of trn1; the final store
; sequence (st2 of .2s registers, then zip2 + str at offset 64) is unchanged.
367define void @transpose_s16_8x8_simpler2(ptr nocapture noundef %a) {
368; CHECK-LABEL: transpose_s16_8x8_simpler2:
369; CHECK:       .Lfunc_begin8:
370; CHECK-NEXT:    .cfi_startproc
371; CHECK-NEXT:  // %bb.0: // %entry
372; CHECK-NEXT:    ldp q0, q2, [x0]
373; CHECK-NEXT:    ldp q3, q4, [x0, #64]
374; CHECK-NEXT:    ldp q5, q6, [x0, #32]
375; CHECK-NEXT:    ldp q7, q16, [x0, #96]
376; CHECK-NEXT:    mov v0.h[5], v2.h[4]
377; CHECK-NEXT:    zip1 v2.8h, v3.8h, v4.8h
378; CHECK-NEXT:    zip1 v3.8h, v5.8h, v6.8h
379; CHECK-NEXT:    mov v7.h[5], v16.h[4]
380; CHECK-NEXT:    mov v0.s[1], v2.s[0]
381; CHECK-NEXT:    uzp1 v1.4s, v3.4s, v7.4s
382; CHECK-NEXT:    zip2 v2.4s, v0.4s, v1.4s
383; CHECK-NEXT:    st2 { v0.2s, v1.2s }, [x0]
384; CHECK-NEXT:    str q2, [x0, #64]
385; CHECK-NEXT:    ret
386entry:
387  %0 = load <8 x i16>, ptr %a, align 16
388  %arrayidx1 = getelementptr inbounds <8 x i16>, ptr %a, i64 1
389  %1 = load <8 x i16>, ptr %arrayidx1, align 16
390  %shuffle.i = shufflevector <8 x i16> %0, <8 x i16> %1, <8 x i32> <i32 0, i32 8, i32 undef, i32 undef, i32 4, i32 12, i32 undef, i32 undef>
391  %arrayidx2 = getelementptr inbounds <8 x i16>, ptr %a, i64 2
392  %2 = load <8 x i16>, ptr %arrayidx2, align 16
393  %arrayidx3 = getelementptr inbounds <8 x i16>, ptr %a, i64 3
394  %3 = load <8 x i16>, ptr %arrayidx3, align 16
395  %shuffle.i34 = shufflevector <8 x i16> %2, <8 x i16> %3, <8 x i32> <i32 0, i32 8, i32 undef, i32 undef, i32 4, i32 12, i32 undef, i32 undef>
396  %arrayidx5 = getelementptr inbounds <8 x i16>, ptr %a, i64 4
397  %4 = load <8 x i16>, ptr %arrayidx5, align 16
398  %arrayidx6 = getelementptr inbounds <8 x i16>, ptr %a, i64 5
399  %5 = load <8 x i16>, ptr %arrayidx6, align 16
400  %shuffle.i35 = shufflevector <8 x i16> %4, <8 x i16> %5, <8 x i32> <i32 0, i32 8, i32 undef, i32 undef, i32 4, i32 12, i32 undef, i32 undef>
401  %arrayidx8 = getelementptr inbounds <8 x i16>, ptr %a, i64 6
402  %6 = load <8 x i16>, ptr %arrayidx8, align 16
403  %arrayidx9 = getelementptr inbounds <8 x i16>, ptr %a, i64 7
404  %7 = load <8 x i16>, ptr %arrayidx9, align 16
405  %shuffle.i36 = shufflevector <8 x i16> %6, <8 x i16> %7, <8 x i32> <i32 0, i32 8, i32 undef, i32 undef, i32 4, i32 12, i32 undef, i32 undef>
406  %8 = bitcast <8 x i16> %shuffle.i to <4 x i32>
407  %9 = bitcast <8 x i16> %shuffle.i35 to <4 x i32>
  ; Irregular mask: lanes 1 of %8 and 3 of %9 come from undef 16-bit elements.
408  %shuffle.i37 = shufflevector <4 x i32> %8, <4 x i32> %9, <4 x i32> <i32 1, i32 4, i32 2, i32 7>
409  %10 = bitcast <8 x i16> %shuffle.i34 to <4 x i32>
410  %11 = bitcast <8 x i16> %shuffle.i36 to <4 x i32>
  ; Irregular mask: lanes 1 of %11 and 3 of %10 come from undef 16-bit elements.
411  %shuffle.i38 = shufflevector <4 x i32> %10, <4 x i32> %11, <4 x i32> <i32 0, i32 5, i32 3, i32 6>
412  %vzip.i = shufflevector <4 x i32> %shuffle.i37, <4 x i32> %shuffle.i38, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
413  %vzip1.i = shufflevector <4 x i32> %shuffle.i37, <4 x i32> %shuffle.i38, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
414  store <4 x i32> %vzip.i, ptr %a, align 16
415  store <4 x i32> %vzip1.i, ptr %arrayidx5, align 16
416  ret void
417}
418
419
; transpose_s16_8x8: shuffle network over eight <8 x i16> vectors, each
; passed through its own pointer argument (%0..%7) and written back through
; the same pointers. The network has three levels: 16-bit even/odd pairing,
; 32-bit even/odd pairing, then 32-bit zip-low / zip-high.
; The expected code uses trn1/trn2 for the first two levels, stores the
; zip-low results (to %0..%3) as st2 of .2s registers, and stores the zip-high
; results (to %4..%7) as zip2 + str.
420define void @transpose_s16_8x8(ptr nocapture noundef %0, ptr nocapture noundef %1, ptr nocapture noundef %2, ptr nocapture noundef %3, ptr nocapture noundef %4, ptr nocapture noundef %5, ptr nocapture noundef %6, ptr nocapture noundef %7) {
421; CHECK-LABEL: transpose_s16_8x8:
422; CHECK:       .Lfunc_begin9:
423; CHECK-NEXT:    .cfi_startproc
424; CHECK-NEXT:  // %bb.0:
425; CHECK-NEXT:    ldr q0, [x0]
426; CHECK-NEXT:    ldr q1, [x1]
427; CHECK-NEXT:    ldr q3, [x4]
428; CHECK-NEXT:    ldr q4, [x5]
429; CHECK-NEXT:    ldr q2, [x2]
430; CHECK-NEXT:    ldr q5, [x3]
431; CHECK-NEXT:    trn1 v16.8h, v0.8h, v1.8h
432; CHECK-NEXT:    trn2 v0.8h, v0.8h, v1.8h
433; CHECK-NEXT:    ldr q6, [x6]
434; CHECK-NEXT:    ldr q7, [x7]
435; CHECK-NEXT:    trn1 v17.8h, v3.8h, v4.8h
436; CHECK-NEXT:    trn2 v1.8h, v3.8h, v4.8h
437; CHECK-NEXT:    trn1 v18.8h, v2.8h, v5.8h
438; CHECK-NEXT:    trn2 v2.8h, v2.8h, v5.8h
439; CHECK-NEXT:    trn1 v19.8h, v6.8h, v7.8h
440; CHECK-NEXT:    trn2 v3.8h, v6.8h, v7.8h
441; CHECK-NEXT:    trn1 v4.4s, v16.4s, v17.4s
442; CHECK-NEXT:    trn1 v6.4s, v0.4s, v1.4s
443; CHECK-NEXT:    trn2 v16.4s, v16.4s, v17.4s
444; CHECK-NEXT:    trn2 v0.4s, v0.4s, v1.4s
445; CHECK-NEXT:    trn1 v5.4s, v18.4s, v19.4s
446; CHECK-NEXT:    trn1 v7.4s, v2.4s, v3.4s
447; CHECK-NEXT:    trn2 v17.4s, v18.4s, v19.4s
448; CHECK-NEXT:    trn2 v1.4s, v2.4s, v3.4s
449; CHECK-NEXT:    st2 { v4.2s, v5.2s }, [x0]
450; CHECK-NEXT:    zip2 v2.4s, v4.4s, v5.4s
451; CHECK-NEXT:    zip2 v3.4s, v6.4s, v7.4s
452; CHECK-NEXT:    zip2 v4.4s, v16.4s, v17.4s
453; CHECK-NEXT:    st2 { v6.2s, v7.2s }, [x1]
454; CHECK-NEXT:    st2 { v16.2s, v17.2s }, [x2]
455; CHECK-NEXT:    st2 { v0.2s, v1.2s }, [x3]
456; CHECK-NEXT:    zip2 v0.4s, v0.4s, v1.4s
457; CHECK-NEXT:    str q2, [x4]
458; CHECK-NEXT:    str q3, [x5]
459; CHECK-NEXT:    str q4, [x6]
460; CHECK-NEXT:    str q0, [x7]
461; CHECK-NEXT:    ret
  ; Level 1: even/odd 16-bit pairing of each adjacent pair of inputs.
462  %9 = load <8 x i16>, ptr %0, align 16
463  %10 = load <8 x i16>, ptr %1, align 16
464  %11 = shufflevector <8 x i16> %9, <8 x i16> %10, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
465  %12 = shufflevector <8 x i16> %9, <8 x i16> %10, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
466  %13 = load <8 x i16>, ptr %2, align 16
467  %14 = load <8 x i16>, ptr %3, align 16
468  %15 = shufflevector <8 x i16> %13, <8 x i16> %14, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
469  %16 = shufflevector <8 x i16> %13, <8 x i16> %14, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
470  %17 = load <8 x i16>, ptr %4, align 16
471  %18 = load <8 x i16>, ptr %5, align 16
472  %19 = shufflevector <8 x i16> %17, <8 x i16> %18, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
473  %20 = shufflevector <8 x i16> %17, <8 x i16> %18, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
474  %21 = load <8 x i16>, ptr %6, align 16
475  %22 = load <8 x i16>, ptr %7, align 16
476  %23 = shufflevector <8 x i16> %21, <8 x i16> %22, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
477  %24 = shufflevector <8 x i16> %21, <8 x i16> %22, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  ; Level 2: even/odd 32-bit pairing of the level-1 results.
478  %25 = bitcast <8 x i16> %11 to <4 x i32>
479  %26 = bitcast <8 x i16> %19 to <4 x i32>
480  %27 = shufflevector <4 x i32> %25, <4 x i32> %26, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
481  %28 = shufflevector <4 x i32> %25, <4 x i32> %26, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
482  %29 = bitcast <8 x i16> %12 to <4 x i32>
483  %30 = bitcast <8 x i16> %20 to <4 x i32>
484  %31 = shufflevector <4 x i32> %29, <4 x i32> %30, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
485  %32 = shufflevector <4 x i32> %29, <4 x i32> %30, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
486  %33 = bitcast <8 x i16> %15 to <4 x i32>
487  %34 = bitcast <8 x i16> %23 to <4 x i32>
488  %35 = shufflevector <4 x i32> %33, <4 x i32> %34, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
489  %36 = shufflevector <4 x i32> %33, <4 x i32> %34, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
490  %37 = bitcast <8 x i16> %16 to <4 x i32>
491  %38 = bitcast <8 x i16> %24 to <4 x i32>
492  %39 = shufflevector <4 x i32> %37, <4 x i32> %38, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
493  %40 = shufflevector <4 x i32> %37, <4 x i32> %38, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  ; Level 3: zip-low (odd-numbered results) and zip-high (even-numbered results).
494  %41 = shufflevector <4 x i32> %27, <4 x i32> %35, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
495  %42 = shufflevector <4 x i32> %27, <4 x i32> %35, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
496  %43 = shufflevector <4 x i32> %31, <4 x i32> %39, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
497  %44 = shufflevector <4 x i32> %31, <4 x i32> %39, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
498  %45 = shufflevector <4 x i32> %28, <4 x i32> %36, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
499  %46 = shufflevector <4 x i32> %28, <4 x i32> %36, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
500  %47 = shufflevector <4 x i32> %32, <4 x i32> %40, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
501  %48 = shufflevector <4 x i32> %32, <4 x i32> %40, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  ; Zip-low results go to the first four pointers, zip-high to the last four.
502  store <4 x i32> %41, ptr %0, align 16
503  store <4 x i32> %43, ptr %1, align 16
504  store <4 x i32> %45, ptr %2, align 16
505  store <4 x i32> %47, ptr %3, align 16
506  store <4 x i32> %42, ptr %4, align 16
507  store <4 x i32> %44, ptr %5, align 16
508  store <4 x i32> %46, ptr %6, align 16
509  store <4 x i32> %48, ptr %7, align 16
510  ret void
511}
512
; transpose_s16_8x8_: the same three-level shuffle network as
; transpose_s16_8x8, but all eight <8 x i16> vectors live consecutively behind
; a single pointer %0 and are written back in place.
; The expected code uses ldp loads, trn1/trn2 for the first two levels, and
; zip1/zip2 with stp stores for the last level (no st2 is expected here).
513define void @transpose_s16_8x8_(ptr nocapture noundef %0) {
514; CHECK-LABEL: transpose_s16_8x8_:
515; CHECK:       .Lfunc_begin10:
516; CHECK-NEXT:    .cfi_startproc
517; CHECK-NEXT:  // %bb.0:
518; CHECK-NEXT:    ldp q0, q1, [x0]
519; CHECK-NEXT:    ldp q2, q3, [x0, #32]
520; CHECK-NEXT:    ldp q4, q5, [x0, #64]
521; CHECK-NEXT:    ldp q6, q7, [x0, #96]
522; CHECK-NEXT:    trn1 v16.8h, v0.8h, v1.8h
523; CHECK-NEXT:    trn2 v0.8h, v0.8h, v1.8h
524; CHECK-NEXT:    trn1 v1.8h, v2.8h, v3.8h
525; CHECK-NEXT:    trn2 v2.8h, v2.8h, v3.8h
526; CHECK-NEXT:    trn1 v17.8h, v4.8h, v5.8h
527; CHECK-NEXT:    trn2 v3.8h, v4.8h, v5.8h
528; CHECK-NEXT:    trn1 v18.8h, v6.8h, v7.8h
529; CHECK-NEXT:    trn2 v4.8h, v6.8h, v7.8h
530; CHECK-NEXT:    trn1 v5.4s, v16.4s, v17.4s
531; CHECK-NEXT:    trn1 v7.4s, v0.4s, v3.4s
532; CHECK-NEXT:    trn2 v16.4s, v16.4s, v17.4s
533; CHECK-NEXT:    trn1 v6.4s, v1.4s, v18.4s
534; CHECK-NEXT:    trn1 v19.4s, v2.4s, v4.4s
535; CHECK-NEXT:    trn2 v1.4s, v1.4s, v18.4s
536; CHECK-NEXT:    trn2 v0.4s, v0.4s, v3.4s
537; CHECK-NEXT:    trn2 v2.4s, v2.4s, v4.4s
538; CHECK-NEXT:    zip1 v3.4s, v5.4s, v6.4s
539; CHECK-NEXT:    zip1 v4.4s, v7.4s, v19.4s
540; CHECK-NEXT:    zip1 v17.4s, v16.4s, v1.4s
541; CHECK-NEXT:    zip1 v18.4s, v0.4s, v2.4s
542; CHECK-NEXT:    zip2 v5.4s, v5.4s, v6.4s
543; CHECK-NEXT:    zip2 v1.4s, v16.4s, v1.4s
544; CHECK-NEXT:    zip2 v0.4s, v0.4s, v2.4s
545; CHECK-NEXT:    stp q3, q4, [x0]
546; CHECK-NEXT:    zip2 v3.4s, v7.4s, v19.4s
547; CHECK-NEXT:    stp q17, q18, [x0, #32]
548; CHECK-NEXT:    stp q1, q0, [x0, #96]
549; CHECK-NEXT:    stp q5, q3, [x0, #64]
550; CHECK-NEXT:    ret
  ; Level 1: even/odd 16-bit pairing of each adjacent pair of rows.
551  %2 = load <8 x i16>, ptr %0, align 16
552  %3 = getelementptr inbounds <8 x i16>, ptr %0, i64 1
  ; %3 is %0 + 16 bytes and is stored through with align 16 at the end of the
  ; function, so this load uses the same 16-byte alignment as every other access.
553  %4 = load <8 x i16>, ptr %3, align 16
554  %5 = shufflevector <8 x i16> %2, <8 x i16> %4, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
555  %6 = shufflevector <8 x i16> %2, <8 x i16> %4, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
556  %7 = getelementptr inbounds <8 x i16>, ptr %0, i64 2
557  %8 = load <8 x i16>, ptr %7, align 16
558  %9 = getelementptr inbounds <8 x i16>, ptr %0, i64 3
559  %10 = load <8 x i16>, ptr %9, align 16
560  %11 = shufflevector <8 x i16> %8, <8 x i16> %10, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
561  %12 = shufflevector <8 x i16> %8, <8 x i16> %10, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
562  %13 = getelementptr inbounds <8 x i16>, ptr %0, i64 4
563  %14 = load <8 x i16>, ptr %13, align 16
564  %15 = getelementptr inbounds <8 x i16>, ptr %0, i64 5
565  %16 = load <8 x i16>, ptr %15, align 16
566  %17 = shufflevector <8 x i16> %14, <8 x i16> %16, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
567  %18 = shufflevector <8 x i16> %14, <8 x i16> %16, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
568  %19 = getelementptr inbounds <8 x i16>, ptr %0, i64 6
569  %20 = load <8 x i16>, ptr %19, align 16
570  %21 = getelementptr inbounds <8 x i16>, ptr %0, i64 7
571  %22 = load <8 x i16>, ptr %21, align 16
572  %23 = shufflevector <8 x i16> %20, <8 x i16> %22, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
573  %24 = shufflevector <8 x i16> %20, <8 x i16> %22, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  ; Level 2: even/odd 32-bit pairing of the level-1 results.
574  %25 = bitcast <8 x i16> %5 to <4 x i32>
575  %26 = bitcast <8 x i16> %17 to <4 x i32>
576  %27 = shufflevector <4 x i32> %25, <4 x i32> %26, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
577  %28 = shufflevector <4 x i32> %25, <4 x i32> %26, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
578  %29 = bitcast <8 x i16> %6 to <4 x i32>
579  %30 = bitcast <8 x i16> %18 to <4 x i32>
580  %31 = shufflevector <4 x i32> %29, <4 x i32> %30, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
581  %32 = shufflevector <4 x i32> %29, <4 x i32> %30, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
582  %33 = bitcast <8 x i16> %11 to <4 x i32>
583  %34 = bitcast <8 x i16> %23 to <4 x i32>
584  %35 = shufflevector <4 x i32> %33, <4 x i32> %34, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
585  %36 = shufflevector <4 x i32> %33, <4 x i32> %34, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
586  %37 = bitcast <8 x i16> %12 to <4 x i32>
587  %38 = bitcast <8 x i16> %24 to <4 x i32>
588  %39 = shufflevector <4 x i32> %37, <4 x i32> %38, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
589  %40 = shufflevector <4 x i32> %37, <4 x i32> %38, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  ; Level 3: zip-low (odd-numbered results) and zip-high (even-numbered results).
590  %41 = shufflevector <4 x i32> %27, <4 x i32> %35, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
591  %42 = shufflevector <4 x i32> %27, <4 x i32> %35, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
592  %43 = shufflevector <4 x i32> %31, <4 x i32> %39, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
593  %44 = shufflevector <4 x i32> %31, <4 x i32> %39, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
594  %45 = shufflevector <4 x i32> %28, <4 x i32> %36, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
595  %46 = shufflevector <4 x i32> %28, <4 x i32> %36, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
596  %47 = shufflevector <4 x i32> %32, <4 x i32> %40, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
597  %48 = shufflevector <4 x i32> %32, <4 x i32> %40, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  ; Zip-low results fill rows 0-3, zip-high results fill rows 4-7.
598  store <4 x i32> %41, ptr %0, align 16
599  store <4 x i32> %43, ptr %3, align 16
600  store <4 x i32> %45, ptr %7, align 16
601  store <4 x i32> %47, ptr %9, align 16
602  store <4 x i32> %42, ptr %13, align 16
603  store <4 x i32> %44, ptr %15, align 16
604  store <4 x i32> %46, ptr %19, align 16
605  store <4 x i32> %48, ptr %21, align 16
606  ret void
607}
608
; store_factor2: factor-2 interleaving store whose two inputs are themselves
; shuffles. %v0 and %v1 each take lanes <0, 4, 2, 6> from (%a0, %a1) and
; (%a1, %a0) respectively, and are interleaved element by element into one
; <8 x i32> store. The expected code is two trn1 instructions feeding one st2.
609define void @store_factor2(ptr %ptr, <4 x i32> %a0, <4 x i32> %a1) {
610; CHECK-LABEL: store_factor2:
611; CHECK:       .Lfunc_begin11:
612; CHECK-NEXT:    .cfi_startproc
613; CHECK-NEXT:  // %bb.0:
614; CHECK-NEXT:    trn1 v2.4s, v0.4s, v1.4s
615; CHECK-NEXT:    trn1 v3.4s, v1.4s, v0.4s
616; CHECK-NEXT:    st2 { v2.4s, v3.4s }, [x0]
617; CHECK-NEXT:    ret
618  %v0 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
619  %v1 = shufflevector <4 x i32> %a1, <4 x i32> %a0, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
620  %interleaved.vec = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
621  store <8 x i32> %interleaved.vec, ptr %ptr, align 4
622  ret void
623}
624
; store_factor2_high: same %v0/%v1 inputs as store_factor2, but the result is
; produced as two separate <4 x i32> shuffles stored to two different
; pointers (%ptr and %ptr2) instead of one <8 x i32> store.
;   %interleaved.vec  uses mask <0, 4, 1, 6>; its last index is 6 where the
;                     corresponding position of store_factor2's interleave
;                     mask is 5.
;   %interleaved.vec2 uses mask <2, 6, 3, 7>, i.e. v0[2], v1[2], v0[3], v1[3].
; The assertions below expect zip1 + trn1 for the first value, zip2 for the
; second, and two plain str instructions (no st2).
625define void @store_factor2_high(ptr %ptr, ptr %ptr2, <4 x i32> %a0, <4 x i32> %a1) {
626; CHECK-LABEL: store_factor2_high:
627; CHECK:       .Lfunc_begin12:
628; CHECK-NEXT:    .cfi_startproc
629; CHECK-NEXT:  // %bb.0:
630; CHECK-NEXT:    trn1 v2.4s, v0.4s, v1.4s
631; CHECK-NEXT:    trn1 v0.4s, v1.4s, v0.4s
632; CHECK-NEXT:    zip1 v1.4s, v2.4s, v0.4s
633; CHECK-NEXT:    trn1 v1.4s, v1.4s, v0.4s
634; CHECK-NEXT:    zip2 v0.4s, v2.4s, v0.4s
635; CHECK-NEXT:    str q1, [x0]
636; CHECK-NEXT:    str q0, [x1]
637; CHECK-NEXT:    ret
  ; Mask indices 0-3 select from the first operand, 4-7 from the second.
638  %v0 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
639  %v1 = shufflevector <4 x i32> %a1, <4 x i32> %a0, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  ; Result lanes: v0[0], v1[0], v0[1], v1[2].
640  %interleaved.vec = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> <i32 0, i32 4, i32 1, i32 6>
  ; Result lanes: v0[2], v1[2], v0[3], v1[3].
641  %interleaved.vec2 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
642  store <4 x i32> %interleaved.vec, ptr %ptr, align 4
643  store <4 x i32> %interleaved.vec2, ptr %ptr2, align 4
644  ret void
645}
646
; store_factor2_high2: variant of store_factor2_high in which the two final
; shuffles (masks <0, 4, 1, 6> and <2, 6, 3, 7>) are applied directly to the
; arguments %a0 and %a1, with no intermediate shuffles. Each result is stored
; to its own pointer.
; The assertions below expect zip1 + trn1 for the first value, zip2 for the
; second, and two plain str instructions (no st2).
647define void @store_factor2_high2(ptr %ptr, ptr %ptr2, <4 x i32> %a0, <4 x i32> %a1) {
648; CHECK-LABEL: store_factor2_high2:
649; CHECK:       .Lfunc_begin13:
650; CHECK-NEXT:    .cfi_startproc
651; CHECK-NEXT:  // %bb.0:
652; CHECK-NEXT:    zip1 v2.4s, v0.4s, v1.4s
653; CHECK-NEXT:    zip2 v0.4s, v0.4s, v1.4s
654; CHECK-NEXT:    trn1 v2.4s, v2.4s, v1.4s
655; CHECK-NEXT:    str q2, [x0]
656; CHECK-NEXT:    str q0, [x1]
657; CHECK-NEXT:    ret
  ; Result lanes: a0[0], a1[0], a0[1], a1[2].
658  %interleaved.vec = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 0, i32 4, i32 1, i32 6>
  ; Result lanes: a0[2], a1[2], a0[3], a1[3].
659  %interleaved.vec2 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
660  store <4 x i32> %interleaved.vec, ptr %ptr, align 4
661  store <4 x i32> %interleaved.vec2, ptr %ptr2, align 4
662  ret void
663}
664
; store_factor3: a three-way interleaving store expressed with shufflevectors.
;   %v0, %v1, %v2 apply mask <0, 5, 3, 6> to the operand pairs (%a0, %a1),
;   (%a1, %a2) and (%a2, %a0) respectively.
;   %s0 concatenates %v0 and %v1 into 8 lanes; %s1 widens %v2 to 8 lanes,
;   leaving the upper four lanes undefined.
;   %interleaved.vec picks lane i of %v0, %v1 and %v2 in turn for i = 0..3,
;   giving a <12 x i32> value that is written to %ptr by one store.
; The assertions below expect each of the three inputs to be built with an
; ext / zip2 / mov sequence, followed by a single st3.
665define void @store_factor3(ptr %ptr, <4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
666; CHECK-LABEL: store_factor3:
667; CHECK:       .Lfunc_begin14:
668; CHECK-NEXT:    .cfi_startproc
669; CHECK-NEXT:  // %bb.0:
670; CHECK-NEXT:    ext v3.16b, v0.16b, v1.16b, #12
671; CHECK-NEXT:    ext v6.16b, v1.16b, v2.16b, #12
672; CHECK-NEXT:    zip2 v3.4s, v0.4s, v3.4s
673; CHECK-NEXT:    mov v3.s[0], v0.s[0]
674; CHECK-NEXT:    ext v0.16b, v2.16b, v0.16b, #12
675; CHECK-NEXT:    zip2 v4.4s, v1.4s, v6.4s
676; CHECK-NEXT:    mov v4.s[0], v1.s[0]
677; CHECK-NEXT:    zip2 v5.4s, v2.4s, v0.4s
678; CHECK-NEXT:    mov v5.s[0], v2.s[0]
679; CHECK-NEXT:    st3 { v3.4s, v4.4s, v5.4s }, [x0]
680; CHECK-NEXT:    ret
  ; Each result is first[0], second[1], first[3], second[2] of its operand pair.
681  %v0 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 0, i32 5, i32 3, i32 6>
682  %v1 = shufflevector <4 x i32> %a1, <4 x i32> %a2, <4 x i32> <i32 0, i32 5, i32 3, i32 6>
683  %v2 = shufflevector <4 x i32> %a2, <4 x i32> %a0, <4 x i32> <i32 0, i32 5, i32 3, i32 6>
  ; Identity masks: these only widen/concatenate, they do not reorder lanes.
684  %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
685  %s1 = shufflevector <4 x i32> %v2, <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  ; Indices 0-3 are %v0, 4-7 are %v1 (both via %s0), 8-11 are %v2 (via %s1).
686  %interleaved.vec = shufflevector <8 x i32> %s0, <8 x i32> %s1, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
687  store <12 x i32> %interleaved.vec, ptr %ptr, align 4
688  ret void
689}
690
; store_factor4: a four-way interleaving store expressed with shufflevectors.
;   %v0..%v3 apply mask <0, 4, 2, 6> to the operand pairs (%a0, %a1),
;   (%a1, %a2), (%a2, %a3) and (%a3, %a0) respectively.
;   %s0 concatenates %v0 and %v1; %s1 concatenates %v2 and %v3.
;   %interleaved.vec picks lane i of %v0, %v1, %v2 and %v3 in turn for
;   i = 0..3, giving a <16 x i32> value that is written to %ptr by one store.
; The assertions below expect four trn1 instructions feeding a single st4.
691define void @store_factor4(ptr %ptr, <4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3) {
692; CHECK-LABEL: store_factor4:
693; CHECK:       .Lfunc_begin15:
694; CHECK-NEXT:    .cfi_startproc
695; CHECK-NEXT:  // %bb.0:
696; CHECK-NEXT:    trn1 v4.4s, v0.4s, v1.4s
697; CHECK-NEXT:    trn1 v5.4s, v1.4s, v2.4s
698; CHECK-NEXT:    trn1 v6.4s, v2.4s, v3.4s
699; CHECK-NEXT:    trn1 v7.4s, v3.4s, v0.4s
700; CHECK-NEXT:    st4 { v4.4s, v5.4s, v6.4s, v7.4s }, [x0]
701; CHECK-NEXT:    ret
  ; Each result is first[0], second[0], first[2], second[2] of its operand pair.
702  %v0 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
703  %v1 = shufflevector <4 x i32> %a1, <4 x i32> %a2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
704  %v2 = shufflevector <4 x i32> %a2, <4 x i32> %a3, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
705  %v3 = shufflevector <4 x i32> %a3, <4 x i32> %a0, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  ; Identity masks: these only concatenate, they do not reorder lanes.
706  %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
707  %s1 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ; Indices 0-3 are %v0, 4-7 are %v1, 8-11 are %v2, 12-15 are %v3.
708  %interleaved.vec = shufflevector <8 x i32> %s0, <8 x i32> %s1, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
709  store <16 x i32> %interleaved.vec, ptr %ptr, align 4
710  ret void
711}
712
; debuginfo: interleaves the lanes of %a with zeros and stores the two halves
; to adjacent 16-byte slots of %buf, with an llvm.dbg.value call sitting
; between the two stores.
;   %vzip.i  interleaves lanes 0-3 of %a with lanes 0-3 of the constant
;            operand (zeros); it is stored at %buf.
;   %vzip1.i interleaves lanes 4-7 of %a with lanes 4-7 of the constant
;            operand (zeros); it is stored at %buf + 4 x i32.
; The assertions below expect a zeroing movi, zip1, zip2 and one stp.
; NOTE(review): presumably the point of this test is that the debug intrinsic
; between the stores does not change the generated code - confirm against the
; commit that added it.
713define void @debuginfo(ptr nocapture noundef writeonly %buf, <8 x i16> noundef %a) {
714; CHECK-LABEL: debuginfo:
715; CHECK:       .Lfunc_begin16:
716; CHECK-NEXT:    .cfi_startproc
717; CHECK-NEXT:  // %bb.0: // %entry
718; CHECK-NEXT:    movi v1.2d, #0000000000000000
719; CHECK-NEXT:    zip1 v2.8h, v0.8h, v1.8h
720; CHECK-NEXT:    zip2 v0.8h, v0.8h, v1.8h
721; CHECK-NEXT:    stp q2, q0, [x0]
722; CHECK-NEXT:    ret
723entry:
  ; Mask indices 0-7 select from %a, 8-15 from the constant operand; only the
  ; non-poison lanes of each constant are referenced by its mask.
724  %vzip.i = shufflevector <8 x i16> %a, <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 poison, i16 poison, i16 poison, i16 poison>, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
725  %vzip1.i = shufflevector <8 x i16> %a, <8 x i16> <i16 poison, i16 poison, i16 poison, i16 poison, i16 0, i16 0, i16 0, i16 0>, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
726  store <8 x i16> %vzip.i, ptr %buf, align 4
  ; Debug-only use of %vzip1.i, described by variable !21 at location !23.
727  call void @llvm.dbg.value(metadata <8 x i16> %vzip1.i, metadata !21, metadata !DIExpression()), !dbg !23
  ; Four i32 elements past %buf, i.e. the 16 bytes following the first store.
728  %add.ptr = getelementptr inbounds i32, ptr %buf, i64 4
729  store <8 x i16> %vzip1.i, ptr %add.ptr, align 4
730  ret void
731}
732
; Declaration of the debug-value intrinsic called from @debuginfo.
733declare void @llvm.dbg.value(metadata, metadata, metadata)
734
; Debug-info metadata backing the llvm.dbg.value call in @debuginfo.
; That call references variable !21 and location !23; both are scoped (via
; lexical block !22) to subprogram !19.
; NOTE(review): the visible define of @debuginfo carries no !dbg attachment,
; so !19 is reached only through the scope chain - confirm this is intended.
735!llvm.dbg.cu = !{!0}
736!llvm.module.flags = !{!6, !7, !8, !9, !10, !11}
737
; Compile unit and its source file.
738!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, retainedTypes: !2, splitDebugInlining: false, nameTableKind: None)
739!1 = !DIFile(filename: "a64.c", directory: "", checksumkind: CSK_MD5, checksum: "a1a236fb20d703d1ea5963e75545b91a")
740!2 = !{!15}
741!3 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
; Element list for the 8-element vector type !17.
742!4 = !{!5}
743!5 = !DISubrange(count: 8)
; Module flags listed in !llvm.module.flags.
744!6 = !{i32 7, !"Dwarf Version", i32 5}
745!7 = !{i32 2, !"Debug Info Version", i32 3}
746!8 = !{i32 1, !"wchar_size", i32 4}
747!9 = !{i32 7, !"uwtable", i32 2}
748!10 = !{i32 7, !"frame-pointer", i32 1}
749!11 = !{i32 7, !"debug-info-assignment-tracking", i1 true}
; Subprogram type: no return type, parameters of type !14 (pointer to int)
; and !15 (const int16x8_t).
750!12 = !DISubroutineType(types: !13)
751!13 = !{null, !14, !15}
752!14 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !3, size: 64)
753!15 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !16)
; int16x8_t: typedef of a 128-bit vector of 8 x short.
754!16 = !DIDerivedType(tag: DW_TAG_typedef, name: "int16x8_t", file: !1, line: 57, baseType: !17)
755!17 = !DICompositeType(tag: DW_TAG_array_type, baseType: !18, size: 128, flags: DIFlagVector, elements: !4)
756!18 = !DIBasicType(name: "short", size: 16, encoding: DW_ATE_signed)
; Subprogram, the local variable "__s1" it retains, and the scope/location
; used by the llvm.dbg.value call.
757!19 = distinct !DISubprogram(name: "store_s16q_to_tran_low_", scope: !1, file: !1, line: 13, type: !12, scopeLine: 13, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !20)
758!20 = !{!21}
759!21 = !DILocalVariable(name: "__s1", scope: !22, file: !1, line: 16, type: !16)
760!22 = distinct !DILexicalBlock(scope: !19, file: !1, line: 16, column: 3)
761!23 = !DILocation(line: 0, scope: !22)
762