xref: /llvm-project/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll (revision 38fffa630ee80163dc65e759392ad29798905679)
1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2; RUN: opt < %s -passes=slp-vectorizer,instcombine -S | FileCheck %s
3
; Module-level target settings: an explicit data layout string and an
; aarch64 Linux target triple.
4target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
5target triple = "aarch64--linux-gnu"
6
; Build-vector case on <2 x i64>.
; The scalar input forms the lane-wise sums and differences of %v0 and %v1,
; adds the two sums together and the two differences together, and inserts the
; two results into a new <2 x i64>.
; The assertions expect one vector add, one vector sub, two shufflevectors
; (masks <1, 2> and <0, 3>) that line up the lanes to be combined, and a final
; vector add.
7define <2 x i64> @build_vec_v2i64(<2 x i64> %v0, <2 x i64> %v1) {
8; CHECK-LABEL: @build_vec_v2i64(
9; CHECK-NEXT:    [[TMP1:%.*]] = add <2 x i64> [[V0:%.*]], [[V1:%.*]]
10; CHECK-NEXT:    [[TMP2:%.*]] = sub <2 x i64> [[V0]], [[V1]]
11; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32> <i32 1, i32 2>
12; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32> <i32 0, i32 3>
13; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP4]], [[TMP3]]
14; CHECK-NEXT:    ret <2 x i64> [[TMP5]]
15;
  ; Scalarize both input vectors.
16  %v0.0 = extractelement <2 x i64> %v0, i32 0
17  %v0.1 = extractelement <2 x i64> %v0, i32 1
18  %v1.0 = extractelement <2 x i64> %v1, i32 0
19  %v1.1 = extractelement <2 x i64> %v1, i32 1
  ; Lane-wise add (%tmp0.*) and sub (%tmp1.*) of the two inputs.
20  %tmp0.0 = add i64 %v0.0, %v1.0
21  %tmp0.1 = add i64 %v0.1, %v1.1
22  %tmp1.0 = sub i64 %v0.0, %v1.0
23  %tmp1.1 = sub i64 %v0.1, %v1.1
  ; Combine across lanes: %tmp2.0 sums both add results, %tmp2.1 sums both
  ; sub results.
24  %tmp2.0 = add i64 %tmp0.0, %tmp0.1
25  %tmp2.1 = add i64 %tmp1.0, %tmp1.1
  ; Rebuild the result vector from the two scalars.
26  %tmp3.0 = insertelement <2 x i64> undef, i64 %tmp2.0, i32 0
27  %tmp3.1 = insertelement <2 x i64> %tmp3.0, i64 %tmp2.1, i32 1
28  ret <2 x i64> %tmp3.1
29}
30
; Store-chain variant of the <2 x i64> pattern.
; The operands are loaded as two adjacent i64 values from %a and from %b, and
; the two results are stored to two adjacent i64 slots at %c.
; The assertions expect the scalar loads and stores to become <2 x i64> loads
; and a <2 x i64> store, with the arithmetic in between expressed as vector
; add/sub, two shufflevectors (masks <1, 2> and <0, 3>) and a vector add.
31define void @store_chain_v2i64(ptr %a, ptr %b, ptr %c) {
32; CHECK-LABEL: @store_chain_v2i64(
33; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 8
34; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[B:%.*]], align 8
35; CHECK-NEXT:    [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]]
36; CHECK-NEXT:    [[TMP4:%.*]] = sub <2 x i64> [[TMP1]], [[TMP2]]
37; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i32> <i32 1, i32 2>
38; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i32> <i32 0, i32 3>
39; CHECK-NEXT:    [[TMP7:%.*]] = add <2 x i64> [[TMP6]], [[TMP5]]
40; CHECK-NEXT:    store <2 x i64> [[TMP7]], ptr [[C:%.*]], align 8
41; CHECK-NEXT:    ret void
42;
  ; Addresses of element 1 in each of the three i64 arrays.
43  %a.1 = getelementptr i64, ptr %a, i64 1
44  %b.1 = getelementptr i64, ptr %b, i64 1
45  %c.1 = getelementptr i64, ptr %c, i64 1
  ; Load elements 0 and 1 from %a and %b.
46  %v0.0 = load i64, ptr %a, align 8
47  %v0.1 = load i64, ptr %a.1, align 8
48  %v1.0 = load i64, ptr %b, align 8
49  %v1.1 = load i64, ptr %b.1, align 8
  ; Lane-wise add (%tmp0.*) and sub (%tmp1.*).
50  %tmp0.0 = add i64 %v0.0, %v1.0
51  %tmp0.1 = add i64 %v0.1, %v1.1
52  %tmp1.0 = sub i64 %v0.0, %v1.0
53  %tmp1.1 = sub i64 %v0.1, %v1.1
  ; %tmp2.0 sums both add results, %tmp2.1 sums both sub results.
54  %tmp2.0 = add i64 %tmp0.0, %tmp0.1
55  %tmp2.1 = add i64 %tmp1.0, %tmp1.1
  ; Store the two results to consecutive elements of %c.
56  store i64 %tmp2.0, ptr %c, align 8
57  store i64 %tmp2.1, ptr %c.1, align 8
58  ret void
59}
60
; Build-vector case on <4 x i32>.
; The scalar input computes lane-wise add and sub of %v0 and %v1, then forms
; four results from adjacent lane pairs: add lanes 0+1, sub lanes 0+1, add
; lanes 2+3 and sub lanes 2+3, inserted into result lanes 0..3 in that order.
; The assertions expect a vector add, a vector sub, two shufflevectors
; (masks <0, 5, 3, 6> and <1, 4, 2, 7>) and a final vector add.
61define <4 x i32> @build_vec_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
62; CHECK-LABEL: @build_vec_v4i32(
63; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[V0:%.*]], [[V1:%.*]]
64; CHECK-NEXT:    [[TMP2:%.*]] = sub <4 x i32> [[V0]], [[V1]]
65; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 3, i32 6>
66; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 1, i32 4, i32 2, i32 7>
67; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[TMP3]]
68; CHECK-NEXT:    ret <4 x i32> [[TMP5]]
69;
  ; Scalarize both input vectors.
70  %v0.0 = extractelement <4 x i32> %v0, i32 0
71  %v0.1 = extractelement <4 x i32> %v0, i32 1
72  %v0.2 = extractelement <4 x i32> %v0, i32 2
73  %v0.3 = extractelement <4 x i32> %v0, i32 3
74  %v1.0 = extractelement <4 x i32> %v1, i32 0
75  %v1.1 = extractelement <4 x i32> %v1, i32 1
76  %v1.2 = extractelement <4 x i32> %v1, i32 2
77  %v1.3 = extractelement <4 x i32> %v1, i32 3
  ; Lane-wise add (%tmp0.*) and sub (%tmp1.*).
78  %tmp0.0 = add i32 %v0.0, %v1.0
79  %tmp0.1 = add i32 %v0.1, %v1.1
80  %tmp0.2 = add i32 %v0.2, %v1.2
81  %tmp0.3 = add i32 %v0.3, %v1.3
82  %tmp1.0 = sub i32 %v0.0, %v1.0
83  %tmp1.1 = sub i32 %v0.1, %v1.1
84  %tmp1.2 = sub i32 %v0.2, %v1.2
85  %tmp1.3 = sub i32 %v0.3, %v1.3
  ; Pairwise sums of adjacent lanes, alternating between the add results and
  ; the sub results.
86  %tmp2.0 = add i32 %tmp0.0, %tmp0.1
87  %tmp2.1 = add i32 %tmp1.0, %tmp1.1
88  %tmp2.2 = add i32 %tmp0.2, %tmp0.3
89  %tmp2.3 = add i32 %tmp1.2, %tmp1.3
  ; Rebuild the result vector from the four scalars.
90  %tmp3.0 = insertelement <4 x i32> undef, i32 %tmp2.0, i32 0
91  %tmp3.1 = insertelement <4 x i32> %tmp3.0, i32 %tmp2.1, i32 1
92  %tmp3.2 = insertelement <4 x i32> %tmp3.1, i32 %tmp2.2, i32 2
93  %tmp3.3 = insertelement <4 x i32> %tmp3.2, i32 %tmp2.3, i32 3
94  ret <4 x i32> %tmp3.3
95}
96
; Build-vector case where two scalar results are each inserted twice.
; The inputs are <2 x i32>; the two computed scalars (%tmp2.0, %tmp2.1) fill
; result lanes 0 and 2, and lanes 1 and 3, of a <4 x i32>.
; The assertions expect the arithmetic to be done on <2 x i32> (add, sub, two
; shufflevectors, add) followed by a single shufflevector with mask
; <0, 1, 0, 1> that widens the result to <4 x i32>.
97define <4 x i32> @build_vec_v4i32_reuse_0(<2 x i32> %v0, <2 x i32> %v1) {
98; CHECK-LABEL: @build_vec_v4i32_reuse_0(
99; CHECK-NEXT:    [[TMP1:%.*]] = add <2 x i32> [[V0:%.*]], [[V1:%.*]]
100; CHECK-NEXT:    [[TMP2:%.*]] = sub <2 x i32> [[V0]], [[V1]]
101; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 1, i32 2>
102; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 3>
103; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i32> [[TMP4]], [[TMP3]]
104; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
105; CHECK-NEXT:    ret <4 x i32> [[TMP6]]
106;
  ; Scalarize both input vectors.
107  %v0.0 = extractelement <2 x i32> %v0, i32 0
108  %v0.1 = extractelement <2 x i32> %v0, i32 1
109  %v1.0 = extractelement <2 x i32> %v1, i32 0
110  %v1.1 = extractelement <2 x i32> %v1, i32 1
  ; Lane-wise add (%tmp0.*) and sub (%tmp1.*).
111  %tmp0.0 = add i32 %v0.0, %v1.0
112  %tmp0.1 = add i32 %v0.1, %v1.1
113  %tmp1.0 = sub i32 %v0.0, %v1.0
114  %tmp1.1 = sub i32 %v0.1, %v1.1
  ; %tmp2.0 sums both add results, %tmp2.1 sums both sub results.
115  %tmp2.0 = add i32 %tmp0.0, %tmp0.1
116  %tmp2.1 = add i32 %tmp1.0, %tmp1.1
  ; Each scalar is used twice: %tmp2.0 in lanes 0 and 2, %tmp2.1 in lanes 1
  ; and 3.
117  %tmp3.0 = insertelement <4 x i32> undef, i32 %tmp2.0, i32 0
118  %tmp3.1 = insertelement <4 x i32> %tmp3.0, i32 %tmp2.1, i32 1
119  %tmp3.2 = insertelement <4 x i32> %tmp3.1, i32 %tmp2.0, i32 2
120  %tmp3.3 = insertelement <4 x i32> %tmp3.2, i32 %tmp2.1, i32 3
121  ret <4 x i32> %tmp3.3
122}
123
; Build-vector case with repeated and swapped operands in the final subs.
; The first stage computes add and xor of the matching lanes of %v0 and %v1.
; The second stage subtracts within each pair: %tmp1.0 and %tmp1.1 are the
; same expression (add lane 0 minus add lane 1), while %tmp1.2 and %tmp1.3
; subtract the two xor results in opposite operand orders.
; The assertions expect a partly vectorized result: 4-wide add, xor and sub
; together with scalar extractelements of lane 1 and one scalar add that is
; inserted back into a vector operand.
124define <4 x i32> @build_vec_v4i32_reuse_1(<2 x i32> %v0, <2 x i32> %v1) {
125; CHECK-LABEL: @build_vec_v4i32_reuse_1(
126; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
127; CHECK-NEXT:    [[V0_1:%.*]] = extractelement <2 x i32> [[V0]], i64 1
128; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[V1:%.*]], <2 x i32> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
129; CHECK-NEXT:    [[V1_1:%.*]] = extractelement <2 x i32> [[V1]], i64 1
130; CHECK-NEXT:    [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]]
131; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
132; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i32> [[TMP1]], [[TMP2]]
133; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
134; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 2>
135; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP0_1]], i64 0
136; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
137; CHECK-NEXT:    [[TMP9:%.*]] = sub <4 x i32> [[TMP5]], [[TMP8]]
138; CHECK-NEXT:    ret <4 x i32> [[TMP9]]
139;
  ; Scalarize both input vectors.
140  %v0.0 = extractelement <2 x i32> %v0, i32 0
141  %v0.1 = extractelement <2 x i32> %v0, i32 1
142  %v1.0 = extractelement <2 x i32> %v1, i32 0
143  %v1.1 = extractelement <2 x i32> %v1, i32 1
  ; First stage: add for %tmp0.0/%tmp0.1, xor for %tmp0.2/%tmp0.3.
144  %tmp0.0 = add i32 %v0.0, %v1.0
145  %tmp0.1 = add i32 %v0.1, %v1.1
146  %tmp0.2 = xor i32 %v0.0, %v1.0
147  %tmp0.3 = xor i32 %v0.1, %v1.1
  ; Second stage: %tmp1.0 and %tmp1.1 are identical; %tmp1.2 and %tmp1.3 use
  ; the same two operands in swapped order.
148  %tmp1.0 = sub i32 %tmp0.0, %tmp0.1
149  %tmp1.1 = sub i32 %tmp0.0, %tmp0.1
150  %tmp1.2 = sub i32 %tmp0.2, %tmp0.3
151  %tmp1.3 = sub i32 %tmp0.3, %tmp0.2
  ; Rebuild the result vector from the four scalars.
152  %tmp2.0 = insertelement <4 x i32> undef, i32 %tmp1.0, i32 0
153  %tmp2.1 = insertelement <4 x i32> %tmp2.0, i32 %tmp1.1, i32 1
154  %tmp2.2 = insertelement <4 x i32> %tmp2.1, i32 %tmp1.2, i32 2
155  %tmp2.3 = insertelement <4 x i32> %tmp2.2, i32 %tmp1.3, i32 3
156  ret <4 x i32> %tmp2.3
157}
158
; Build-vector case mixing three different binary opcodes (add, mul, xor).
; Result lanes 0 and 1 sum the two add results and the two mul results.
; Result lanes 2 and 3 each sum the two xor results; %tmp0.2/%tmp1.2 and
; %tmp0.3/%tmp1.3 are the same xor expressions written twice.
; The assertions expect two independent <2 x i32> computations (one on
; add/mul, one on xor, with the vector xor appearing twice) that are
; concatenated by a final shufflevector with mask <0, 1, 2, 3>.
159define <4 x i32> @build_vec_v4i32_3_binops(<2 x i32> %v0, <2 x i32> %v1) {
160; CHECK-LABEL: @build_vec_v4i32_3_binops(
161; CHECK-NEXT:    [[TMP1:%.*]] = add <2 x i32> [[V0:%.*]], [[V1:%.*]]
162; CHECK-NEXT:    [[TMP2:%.*]] = mul <2 x i32> [[V0]], [[V1]]
163; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 1, i32 2>
164; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 3>
165; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i32> [[TMP4]], [[TMP3]]
166; CHECK-NEXT:    [[TMP6:%.*]] = xor <2 x i32> [[V0]], [[V1]]
167; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
168; CHECK-NEXT:    [[TMP8:%.*]] = xor <2 x i32> [[V0]], [[V1]]
169; CHECK-NEXT:    [[TMP9:%.*]] = add <2 x i32> [[TMP7]], [[TMP8]]
170; CHECK-NEXT:    [[TMP3_31:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
171; CHECK-NEXT:    ret <4 x i32> [[TMP3_31]]
172;
  ; Scalarize both input vectors.
173  %v0.0 = extractelement <2 x i32> %v0, i32 0
174  %v0.1 = extractelement <2 x i32> %v0, i32 1
175  %v1.0 = extractelement <2 x i32> %v1, i32 0
176  %v1.1 = extractelement <2 x i32> %v1, i32 1
  ; First group: add for lanes 0/1, xor for lanes 0/1.
177  %tmp0.0 = add i32 %v0.0, %v1.0
178  %tmp0.1 = add i32 %v0.1, %v1.1
179  %tmp0.2 = xor i32 %v0.0, %v1.0
180  %tmp0.3 = xor i32 %v0.1, %v1.1
  ; Second group: mul for lanes 0/1, and the same xor expressions again.
181  %tmp1.0 = mul i32 %v0.0, %v1.0
182  %tmp1.1 = mul i32 %v0.1, %v1.1
183  %tmp1.2 = xor i32 %v0.0, %v1.0
184  %tmp1.3 = xor i32 %v0.1, %v1.1
  ; Sum each pair: adds, muls, then the xor pair twice.
185  %tmp2.0 = add i32 %tmp0.0, %tmp0.1
186  %tmp2.1 = add i32 %tmp1.0, %tmp1.1
187  %tmp2.2 = add i32 %tmp0.2, %tmp0.3
188  %tmp2.3 = add i32 %tmp1.2, %tmp1.3
  ; Rebuild the result vector from the four scalars.
189  %tmp3.0 = insertelement <4 x i32> undef, i32 %tmp2.0, i32 0
190  %tmp3.1 = insertelement <4 x i32> %tmp3.0, i32 %tmp2.1, i32 1
191  %tmp3.2 = insertelement <4 x i32> %tmp3.1, i32 %tmp2.2, i32 2
192  %tmp3.3 = insertelement <4 x i32> %tmp3.2, i32 %tmp2.3, i32 3
193  ret <4 x i32> %tmp3.3
194}
195
; Reduction case on <4 x i32>.
; The first part matches the add/sub pairwise pattern: lane-wise add and sub,
; then sums of adjacent lane pairs alternating between add and sub results.
; Each of the four values then goes through the same chain
; (lshr 15, and 65537, mul nuw 65535, add with the pre-shift value, xor with
; the product), and the four results are summed into a single i32.
; The assertions expect the whole chain to be done on <4 x i32>, with splat
; constants, and the final sum to become a call to
; @llvm.vector.reduce.add.v4i32.
196define i32 @reduction_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
197; CHECK-LABEL: @reduction_v4i32(
198; CHECK-NEXT:    [[TMP1:%.*]] = sub <4 x i32> [[V0:%.*]], [[V1:%.*]]
199; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[V0]], [[V1]]
200; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 7, i32 2>
201; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 1, i32 4, i32 6, i32 3>
202; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[TMP3]]
203; CHECK-NEXT:    [[TMP6:%.*]] = lshr <4 x i32> [[TMP5]], splat (i32 15)
204; CHECK-NEXT:    [[TMP7:%.*]] = and <4 x i32> [[TMP6]], splat (i32 65537)
205; CHECK-NEXT:    [[TMP8:%.*]] = mul nuw <4 x i32> [[TMP7]], splat (i32 65535)
206; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i32> [[TMP8]], [[TMP5]]
207; CHECK-NEXT:    [[TMP10:%.*]] = xor <4 x i32> [[TMP9]], [[TMP8]]
208; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP10]])
209; CHECK-NEXT:    ret i32 [[TMP11]]
210;
  ; Scalarize both input vectors.
211  %v0.0 = extractelement <4 x i32> %v0, i32 0
212  %v0.1 = extractelement <4 x i32> %v0, i32 1
213  %v0.2 = extractelement <4 x i32> %v0, i32 2
214  %v0.3 = extractelement <4 x i32> %v0, i32 3
215  %v1.0 = extractelement <4 x i32> %v1, i32 0
216  %v1.1 = extractelement <4 x i32> %v1, i32 1
217  %v1.2 = extractelement <4 x i32> %v1, i32 2
218  %v1.3 = extractelement <4 x i32> %v1, i32 3
  ; Lane-wise add (%tmp0.*) and sub (%tmp1.*).
219  %tmp0.0 = add i32 %v0.0, %v1.0
220  %tmp0.1 = add i32 %v0.1, %v1.1
221  %tmp0.2 = add i32 %v0.2, %v1.2
222  %tmp0.3 = add i32 %v0.3, %v1.3
223  %tmp1.0 = sub i32 %v0.0, %v1.0
224  %tmp1.1 = sub i32 %v0.1, %v1.1
225  %tmp1.2 = sub i32 %v0.2, %v1.2
226  %tmp1.3 = sub i32 %v0.3, %v1.3
  ; Pairwise sums of adjacent lanes, alternating between the add results and
  ; the sub results.
227  %tmp2.0 = add i32 %tmp0.0, %tmp0.1
228  %tmp2.1 = add i32 %tmp1.0, %tmp1.1
229  %tmp2.2 = add i32 %tmp0.2, %tmp0.3
230  %tmp2.3 = add i32 %tmp1.2, %tmp1.3
  ; Same per-value chain applied to each of the four pairwise sums.
231  %tmp3.0 = lshr i32 %tmp2.0, 15
232  %tmp3.1 = lshr i32 %tmp2.1, 15
233  %tmp3.2 = lshr i32 %tmp2.2, 15
234  %tmp3.3 = lshr i32 %tmp2.3, 15
235  %tmp4.0 = and i32 %tmp3.0, 65537
236  %tmp4.1 = and i32 %tmp3.1, 65537
237  %tmp4.2 = and i32 %tmp3.2, 65537
238  %tmp4.3 = and i32 %tmp3.3, 65537
239  %tmp5.0 = mul nuw i32 %tmp4.0, 65535
240  %tmp5.1 = mul nuw i32 %tmp4.1, 65535
241  %tmp5.2 = mul nuw i32 %tmp4.2, 65535
242  %tmp5.3 = mul nuw i32 %tmp4.3, 65535
243  %tmp6.0 = add i32 %tmp5.0, %tmp2.0
244  %tmp6.1 = add i32 %tmp5.1, %tmp2.1
245  %tmp6.2 = add i32 %tmp5.2, %tmp2.2
246  %tmp6.3 = add i32 %tmp5.3, %tmp2.3
247  %tmp7.0 = xor i32 %tmp6.0, %tmp5.0
248  %tmp7.1 = xor i32 %tmp6.1, %tmp5.1
249  %tmp7.2 = xor i32 %tmp6.2, %tmp5.2
250  %tmp7.3 = xor i32 %tmp6.3, %tmp5.3
  ; Linear chain of adds that sums the four values into one scalar.
251  %reduce.0 = add i32 %tmp7.1, %tmp7.0
252  %reduce.1 = add i32 %reduce.0, %tmp7.2
253  %reduce.2 = add i32 %reduce.1, %tmp7.3
254  ret i32 %reduce.2
255}
256