xref: /llvm-project/llvm/test/CodeGen/AArch64/highextractbitcast.ll (revision db158c7c830807caeeb0691739c41f1d522029e9)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s --check-prefix CHECK-LE
3; RUN: llc -mtriple=aarch64_be-unknown-linux-gnu < %s | FileCheck %s --check-prefix CHECK-BE
4
; Declarations of the AArch64 NEON intrinsics called by the test functions in
; this file (widening multiplies, absolute differences, saturating
; doubling multiply / add, and polynomial multiply).
5declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>)
6declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>)
7declare <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8>, <8 x i8>)
8declare <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8>, <8 x i8>)
9declare <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>)
10declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
11declare <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8>, <8 x i8>)
12declare <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %s1, <2 x i32> %s2)
13
; Baseline case with no bitcast: both smull operands are the high halves
; (elements 4-7) of the <8 x i16> arguments. The little-endian checks expect a
; single smull2; the big-endian checks expect the same smull2 surrounded by
; rev64/ext instructions.
14define <4 x i32> @test_smull_high_s16_base(<8 x i16> %a, <8 x i16> %b) #0 {
15; CHECK-LE-LABEL: test_smull_high_s16_base:
16; CHECK-LE:       // %bb.0: // %entry
17; CHECK-LE-NEXT:    smull2 v0.4s, v0.8h, v1.8h
18; CHECK-LE-NEXT:    ret
19;
20; CHECK-BE-LABEL: test_smull_high_s16_base:
21; CHECK-BE:       // %bb.0: // %entry
22; CHECK-BE-NEXT:    rev64 v1.8h, v1.8h
23; CHECK-BE-NEXT:    rev64 v0.8h, v0.8h
24; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
25; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
26; CHECK-BE-NEXT:    smull2 v0.4s, v0.8h, v1.8h
27; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s
28; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
29; CHECK-BE-NEXT:    ret
30entry:
31  %s1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
32  %s2 = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
33  %r = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %s1, <4 x i16> %s2)
34  ret <4 x i32> %r
35}
36
; Same as the base case, except the first operand arrives as <2 x i64> and is
; bitcast to <8 x i16> BEFORE the high-half extract. The little-endian checks
; still expect a single smull2.
37define <4 x i32> @test_smull_high_s16_bitcasta1(<2 x i64> %aa, <8 x i16> %b) #0 {
38; CHECK-LE-LABEL: test_smull_high_s16_bitcasta1:
39; CHECK-LE:       // %bb.0: // %entry
40; CHECK-LE-NEXT:    smull2 v0.4s, v0.8h, v1.8h
41; CHECK-LE-NEXT:    ret
42;
43; CHECK-BE-LABEL: test_smull_high_s16_bitcasta1:
44; CHECK-BE:       // %bb.0: // %entry
45; CHECK-BE-NEXT:    rev64 v1.8h, v1.8h
46; CHECK-BE-NEXT:    rev64 v0.8h, v0.8h
47; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
48; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
49; CHECK-BE-NEXT:    smull2 v0.4s, v0.8h, v1.8h
50; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s
51; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
52; CHECK-BE-NEXT:    ret
53entry:
54  %a = bitcast <2 x i64> %aa to <8 x i16>
55  %s1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
56  %s2 = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
57  %r = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %s1, <4 x i16> %s2)
58  ret <4 x i32> %r
59}
60
; Mirror of the bitcasta1 case for the second operand: %bb is <16 x i8> and is
; bitcast to <8 x i16> BEFORE the high-half extract. The little-endian checks
; expect a single smull2.
61define <4 x i32> @test_smull_high_s16_bitcastb1(<8 x i16> %a, <16 x i8> %bb) #0 {
62; CHECK-LE-LABEL: test_smull_high_s16_bitcastb1:
63; CHECK-LE:       // %bb.0: // %entry
64; CHECK-LE-NEXT:    smull2 v0.4s, v0.8h, v1.8h
65; CHECK-LE-NEXT:    ret
66;
67; CHECK-BE-LABEL: test_smull_high_s16_bitcastb1:
68; CHECK-BE:       // %bb.0: // %entry
69; CHECK-BE-NEXT:    rev64 v0.8h, v0.8h
70; CHECK-BE-NEXT:    rev64 v1.8h, v1.8h
71; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
72; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
73; CHECK-BE-NEXT:    smull2 v0.4s, v0.8h, v1.8h
74; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s
75; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
76; CHECK-BE-NEXT:    ret
77entry:
78  %b = bitcast <16 x i8> %bb to <8 x i16>
79  %s1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
80  %s2 = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
81  %r = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %s1, <4 x i16> %s2)
82  ret <4 x i32> %r
83}
84
; Here the bitcast on the first operand comes AFTER the extract: element 1 of
; the <2 x i64> argument is taken as <1 x i64> and then bitcast to <4 x i16>.
; The little-endian checks expect a single smull2; the big-endian checks expect
; the non-high smull form together with ext/rev64 instructions.
85define <4 x i32> @test_smull_high_s16_bitcasta2(<2 x i64> %a, <8 x i16> %b) #0 {
86; CHECK-LE-LABEL: test_smull_high_s16_bitcasta2:
87; CHECK-LE:       // %bb.0: // %entry
88; CHECK-LE-NEXT:    smull2 v0.4s, v0.8h, v1.8h
89; CHECK-LE-NEXT:    ret
90;
91; CHECK-BE-LABEL: test_smull_high_s16_bitcasta2:
92; CHECK-BE:       // %bb.0: // %entry
93; CHECK-BE-NEXT:    rev64 v1.8h, v1.8h
94; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
95; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
96; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
97; CHECK-BE-NEXT:    rev64 v0.4h, v0.4h
98; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
99; CHECK-BE-NEXT:    smull v0.4s, v0.4h, v1.4h
100; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s
101; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
102; CHECK-BE-NEXT:    ret
103entry:
104  %s1a = shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> <i32 1>
105  %s1 = bitcast <1 x i64> %s1a to <4 x i16>
106  %s2 = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
107  %r = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %s1, <4 x i16> %s2)
108  ret <4 x i32> %r
109}
110
; Second-operand counterpart of bitcasta2: bytes 8-15 of the <16 x i8> argument
; are extracted first and the resulting <8 x i8> is then bitcast to <4 x i16>.
; The little-endian checks expect a single smull2; the big-endian checks expect
; the non-high smull form together with ext/rev instructions.
111define <4 x i32> @test_smull_high_s16_bitcastb2(<8 x i16> %a, <16 x i8> %b) #0 {
112; CHECK-LE-LABEL: test_smull_high_s16_bitcastb2:
113; CHECK-LE:       // %bb.0: // %entry
114; CHECK-LE-NEXT:    smull2 v0.4s, v0.8h, v1.8h
115; CHECK-LE-NEXT:    ret
116;
117; CHECK-BE-LABEL: test_smull_high_s16_bitcastb2:
118; CHECK-BE:       // %bb.0: // %entry
119; CHECK-BE-NEXT:    rev64 v1.16b, v1.16b
120; CHECK-BE-NEXT:    rev64 v0.8h, v0.8h
121; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
122; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
123; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
124; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
125; CHECK-BE-NEXT:    rev16 v1.8b, v1.8b
126; CHECK-BE-NEXT:    smull v0.4s, v0.4h, v1.4h
127; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s
128; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
129; CHECK-BE-NEXT:    ret
130entry:
131  %s1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
132  %s2a = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
133  %s2 = bitcast <8 x i8> %s2a to <4 x i16>
134  %r = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %s1, <4 x i16> %s2)
135  ret <4 x i32> %r
136}
137
138
; Negative variant of bitcasta1: the first operand's shuffle selects elements
; 2-5 rather than the high half (4-7). The checks expect explicit ext
; instructions followed by the non-high smull form instead of smull2.
139define <4 x i32> @test_smull_high_s16_bitcasta1_wrongindex(<2 x i64> %aa, <8 x i16> %b) #0 {
140; CHECK-LE-LABEL: test_smull_high_s16_bitcasta1_wrongindex:
141; CHECK-LE:       // %bb.0: // %entry
142; CHECK-LE-NEXT:    ext v2.16b, v0.16b, v0.16b, #8
143; CHECK-LE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
144; CHECK-LE-NEXT:    ext v0.8b, v0.8b, v2.8b, #4
145; CHECK-LE-NEXT:    smull v0.4s, v0.4h, v1.4h
146; CHECK-LE-NEXT:    ret
147;
148; CHECK-BE-LABEL: test_smull_high_s16_bitcasta1_wrongindex:
149; CHECK-BE:       // %bb.0: // %entry
150; CHECK-BE-NEXT:    rev64 v1.8h, v1.8h
151; CHECK-BE-NEXT:    rev64 v0.8h, v0.8h
152; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
153; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
154; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #4
155; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
156; CHECK-BE-NEXT:    smull v0.4s, v0.4h, v1.4h
157; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s
158; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
159; CHECK-BE-NEXT:    ret
160entry:
161  %a = bitcast <2 x i64> %aa to <8 x i16>
162  %s1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
163  %s2 = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
164  %r = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %s1, <4 x i16> %s2)
165  ret <4 x i32> %r
166}
167
; Negative variant of bitcastb1: the second operand's shuffle selects elements
; 3-6 rather than the high half (4-7). The checks expect explicit ext
; instructions followed by the non-high smull form instead of smull2.
168define <4 x i32> @test_smull_high_s16_bitcastb1_wrongindex(<8 x i16> %a, <16 x i8> %bb) #0 {
169; CHECK-LE-LABEL: test_smull_high_s16_bitcastb1_wrongindex:
170; CHECK-LE:       // %bb.0: // %entry
171; CHECK-LE-NEXT:    ext v2.16b, v1.16b, v1.16b, #8
172; CHECK-LE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
173; CHECK-LE-NEXT:    ext v1.8b, v1.8b, v2.8b, #6
174; CHECK-LE-NEXT:    smull v0.4s, v0.4h, v1.4h
175; CHECK-LE-NEXT:    ret
176;
177; CHECK-BE-LABEL: test_smull_high_s16_bitcastb1_wrongindex:
178; CHECK-BE:       // %bb.0: // %entry
179; CHECK-BE-NEXT:    rev64 v0.8h, v0.8h
180; CHECK-BE-NEXT:    rev64 v1.8h, v1.8h
181; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
182; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
183; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
184; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #6
185; CHECK-BE-NEXT:    smull v0.4s, v0.4h, v1.4h
186; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s
187; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
188; CHECK-BE-NEXT:    ret
189entry:
190  %b = bitcast <16 x i8> %bb to <8 x i16>
191  %s1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
192  %s2 = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
193  %r = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %s1, <4 x i16> %s2)
194  ret <4 x i32> %r
195}
196
; Negative variant with the bitcast after the extract: i32 elements 1-2 of the
; <4 x i32> argument (the middle 64 bits, not the high half) are extracted and
; bitcast to <4 x i16>. The checks expect ext instructions followed by the
; non-high smull form instead of smull2.
197define <4 x i32> @test_smull_high_s16_bitcasta2_wrongindex(<4 x i32> %a, <8 x i16> %b) #0 {
198; CHECK-LE-LABEL: test_smull_high_s16_bitcasta2_wrongindex:
199; CHECK-LE:       // %bb.0: // %entry
200; CHECK-LE-NEXT:    ext v0.16b, v0.16b, v0.16b, #4
201; CHECK-LE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
202; CHECK-LE-NEXT:    smull v0.4s, v0.4h, v1.4h
203; CHECK-LE-NEXT:    ret
204;
205; CHECK-BE-LABEL: test_smull_high_s16_bitcasta2_wrongindex:
206; CHECK-BE:       // %bb.0: // %entry
207; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s
208; CHECK-BE-NEXT:    rev64 v1.8h, v1.8h
209; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
210; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
211; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #4
212; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
213; CHECK-BE-NEXT:    rev32 v0.4h, v0.4h
214; CHECK-BE-NEXT:    smull v0.4s, v0.4h, v1.4h
215; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s
216; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
217; CHECK-BE-NEXT:    ret
218entry:
219  %s1a = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 1, i32 2>
220  %s1 = bitcast <2 x i32> %s1a to <4 x i16>
221  %s2 = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
222  %r = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %s1, <4 x i16> %s2)
223  ret <4 x i32> %r
224}
225
; Negative variant of bitcastb2: bytes 4-11 of the <16 x i8> argument (not the
; high half, which would be bytes 8-15) are extracted and bitcast to <4 x i16>.
; The checks expect ext instructions followed by the non-high smull form
; instead of smull2.
226define <4 x i32> @test_smull_high_s16_bitcastb2_wrongindex(<8 x i16> %a, <16 x i8> %b) #0 {
227; CHECK-LE-LABEL: test_smull_high_s16_bitcastb2_wrongindex:
228; CHECK-LE:       // %bb.0: // %entry
229; CHECK-LE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
230; CHECK-LE-NEXT:    ext v1.16b, v1.16b, v1.16b, #4
231; CHECK-LE-NEXT:    smull v0.4s, v0.4h, v1.4h
232; CHECK-LE-NEXT:    ret
233;
234; CHECK-BE-LABEL: test_smull_high_s16_bitcastb2_wrongindex:
235; CHECK-BE:       // %bb.0: // %entry
236; CHECK-BE-NEXT:    rev64 v1.16b, v1.16b
237; CHECK-BE-NEXT:    rev64 v0.8h, v0.8h
238; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
239; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
240; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #4
241; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
242; CHECK-BE-NEXT:    rev16 v1.8b, v1.8b
243; CHECK-BE-NEXT:    smull v0.4s, v0.4h, v1.4h
244; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s
245; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
246; CHECK-BE-NEXT:    ret
247entry:
248  %s1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
249  %s2a = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
250  %s2 = bitcast <8 x i8> %s2a to <4 x i16>
251  %r = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %s1, <4 x i16> %s2)
252  ret <4 x i32> %r
253}
254
255
; The first operand is a splat of i16 element 3 of the (bitcast) <8 x i16>
; value; the second operand is the high half of %b. The checks expect the
; by-element smull2 form, with the splat source used as the indexed operand
; (v0.h[3]).
256define <4 x i32> @test_smull_high_s16_splata1(<2 x i64> %aa, <8 x i16> %b) #0 {
257; CHECK-LE-LABEL: test_smull_high_s16_splata1:
258; CHECK-LE:       // %bb.0: // %entry
259; CHECK-LE-NEXT:    smull2 v0.4s, v1.8h, v0.h[3]
260; CHECK-LE-NEXT:    ret
261;
262; CHECK-BE-LABEL: test_smull_high_s16_splata1:
263; CHECK-BE:       // %bb.0: // %entry
264; CHECK-BE-NEXT:    rev64 v1.8h, v1.8h
265; CHECK-BE-NEXT:    rev64 v0.8h, v0.8h
266; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
267; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
268; CHECK-BE-NEXT:    smull2 v0.4s, v1.8h, v0.h[3]
269; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s
270; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
271; CHECK-BE-NEXT:    ret
272entry:
273  %a = bitcast <2 x i64> %aa to <8 x i16>
274  %s1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
275  %s2 = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
276  %r = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %s1, <4 x i16> %s2)
277  ret <4 x i32> %r
278}
279
; Mirror of splata1: the first operand is the high half of %a and the second
; operand is a splat of i16 element 3 of the (bitcast) <8 x i16> value. The
; checks expect the by-element smull2 form indexing v1.h[3].
280define <4 x i32> @test_smull_high_s16_splatb1(<8 x i16> %a, <16 x i8> %bb) #0 {
281; CHECK-LE-LABEL: test_smull_high_s16_splatb1:
282; CHECK-LE:       // %bb.0: // %entry
283; CHECK-LE-NEXT:    smull2 v0.4s, v0.8h, v1.h[3]
284; CHECK-LE-NEXT:    ret
285;
286; CHECK-BE-LABEL: test_smull_high_s16_splatb1:
287; CHECK-BE:       // %bb.0: // %entry
288; CHECK-BE-NEXT:    rev64 v0.8h, v0.8h
289; CHECK-BE-NEXT:    rev64 v1.8h, v1.8h
290; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
291; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
292; CHECK-BE-NEXT:    smull2 v0.4s, v0.8h, v1.h[3]
293; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s
294; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
295; CHECK-BE-NEXT:    ret
296entry:
297  %b = bitcast <16 x i8> %bb to <8 x i16>
298  %s1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
299  %s2 = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
300  %r = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %s1, <4 x i16> %s2)
301  ret <4 x i32> %r
302}
303
; The first operand is a splat of i32 element 3 into <2 x i32>, which is then
; bitcast to <4 x i16> (so the splat granularity is 32 bits, not 16). The
; checks expect a dup plus an ext followed by the non-high smull form; no
; smull2 is expected here.
304define <4 x i32> @test_smull_high_s16_splata2(<4 x i32> %a, <8 x i16> %b) #0 {
305; CHECK-LE-LABEL: test_smull_high_s16_splata2:
306; CHECK-LE:       // %bb.0: // %entry
307; CHECK-LE-NEXT:    dup v0.2s, v0.s[3]
308; CHECK-LE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
309; CHECK-LE-NEXT:    smull v0.4s, v0.4h, v1.4h
310; CHECK-LE-NEXT:    ret
311;
312; CHECK-BE-LABEL: test_smull_high_s16_splata2:
313; CHECK-BE:       // %bb.0: // %entry
314; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s
315; CHECK-BE-NEXT:    rev64 v1.8h, v1.8h
316; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
317; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
318; CHECK-BE-NEXT:    dup v0.2s, v0.s[3]
319; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
320; CHECK-BE-NEXT:    rev32 v0.4h, v0.4h
321; CHECK-BE-NEXT:    smull v0.4s, v0.4h, v1.4h
322; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s
323; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
324; CHECK-BE-NEXT:    ret
325entry:
326  %s1a = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
327  %s1 = bitcast <2 x i32> %s1a to <4 x i16>
328  %s2 = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
329  %r = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %s1, <4 x i16> %s2)
330  ret <4 x i32> %r
331}
332
; The second operand is a splat of byte 3 into <8 x i8>, which is then bitcast
; to <4 x i16> (so the splat granularity is 8 bits, not 16). The checks expect
; an ext plus a byte dup followed by the non-high smull form; no smull2 is
; expected here.
333define <4 x i32> @test_smull_high_s16_splatb2(<8 x i16> %a, <16 x i8> %b) #0 {
334; CHECK-LE-LABEL: test_smull_high_s16_splatb2:
335; CHECK-LE:       // %bb.0: // %entry
336; CHECK-LE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
337; CHECK-LE-NEXT:    dup v1.8b, v1.b[3]
338; CHECK-LE-NEXT:    smull v0.4s, v0.4h, v1.4h
339; CHECK-LE-NEXT:    ret
340;
341; CHECK-BE-LABEL: test_smull_high_s16_splatb2:
342; CHECK-BE:       // %bb.0: // %entry
343; CHECK-BE-NEXT:    rev64 v1.16b, v1.16b
344; CHECK-BE-NEXT:    rev64 v0.8h, v0.8h
345; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
346; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
347; CHECK-BE-NEXT:    dup v1.8b, v1.b[3]
348; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
349; CHECK-BE-NEXT:    rev16 v1.8b, v1.8b
350; CHECK-BE-NEXT:    smull v0.4s, v0.4h, v1.4h
351; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s
352; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
353; CHECK-BE-NEXT:    ret
354entry:
355  %s1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
356  %s2a = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
357  %s2 = bitcast <8 x i8> %s2a to <4 x i16>
358  %r = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %s1, <4 x i16> %s2)
359  ret <4 x i32> %r
360}
361
362
363
; Unsigned counterpart of the smull bitcasta1 case: the first operand is
; bitcast from <2 x i64> to <8 x i16> before the high-half extract and the
; umull intrinsic is called. The little-endian checks expect a single umull2.
364define <4 x i32> @test_umull_high_s16_bitcasta1(<2 x i64> %aa, <8 x i16> %b) #0 {
365; CHECK-LE-LABEL: test_umull_high_s16_bitcasta1:
366; CHECK-LE:       // %bb.0: // %entry
367; CHECK-LE-NEXT:    umull2 v0.4s, v0.8h, v1.8h
368; CHECK-LE-NEXT:    ret
369;
370; CHECK-BE-LABEL: test_umull_high_s16_bitcasta1:
371; CHECK-BE:       // %bb.0: // %entry
372; CHECK-BE-NEXT:    rev64 v1.8h, v1.8h
373; CHECK-BE-NEXT:    rev64 v0.8h, v0.8h
374; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
375; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
376; CHECK-BE-NEXT:    umull2 v0.4s, v0.8h, v1.8h
377; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s
378; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
379; CHECK-BE-NEXT:    ret
380entry:
381  %a = bitcast <2 x i64> %aa to <8 x i16>
382  %s1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
383  %s2 = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
384  %r = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %s1, <4 x i16> %s2)
385  ret <4 x i32> %r
386}
387
; Unsigned absolute difference of the high halves (bytes 8-15) of two
; <16 x i8> values, zero-extended to <8 x i16>. The second input arrives as
; <8 x i16> and is bitcast to <16 x i8> first. The little-endian checks expect
; a single uabdl2.
388define <8 x i16> @test_vabdl_high_u82(<16 x i8> %a, <8 x i16> %bb) {
389; CHECK-LE-LABEL: test_vabdl_high_u82:
390; CHECK-LE:       // %bb.0: // %entry
391; CHECK-LE-NEXT:    uabdl2 v0.8h, v0.16b, v1.16b
392; CHECK-LE-NEXT:    ret
393;
394; CHECK-BE-LABEL: test_vabdl_high_u82:
395; CHECK-BE:       // %bb.0: // %entry
396; CHECK-BE-NEXT:    rev64 v0.16b, v0.16b
397; CHECK-BE-NEXT:    rev64 v1.16b, v1.16b
398; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
399; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
400; CHECK-BE-NEXT:    uabdl2 v0.8h, v0.16b, v1.16b
401; CHECK-BE-NEXT:    rev64 v0.8h, v0.8h
402; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
403; CHECK-BE-NEXT:    ret
404entry:
405  %b = bitcast <8 x i16> %bb to <16 x i8>
406  %shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
407  %shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
408  %vabd.i.i.i = tail call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
409  %vmovl.i.i.i = zext <8 x i8> %vabd.i.i.i to <8 x i16>
410  ret <8 x i16> %vmovl.i.i.i
411}
412
; Signed absolute difference (sabd) of the high halves (bytes 8-15) of two
; <16 x i8> values; the result is widened with zext to <8 x i16>. The second
; input arrives as <8 x i16> and is bitcast to <16 x i8> first. The
; little-endian checks expect a single sabdl2.
413define <8 x i16> @test_vabdl_high_s82(<16 x i8> %a, <8 x i16> %bb) {
414; CHECK-LE-LABEL: test_vabdl_high_s82:
415; CHECK-LE:       // %bb.0: // %entry
416; CHECK-LE-NEXT:    sabdl2 v0.8h, v0.16b, v1.16b
417; CHECK-LE-NEXT:    ret
418;
419; CHECK-BE-LABEL: test_vabdl_high_s82:
420; CHECK-BE:       // %bb.0: // %entry
421; CHECK-BE-NEXT:    rev64 v0.16b, v0.16b
422; CHECK-BE-NEXT:    rev64 v1.16b, v1.16b
423; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
424; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
425; CHECK-BE-NEXT:    sabdl2 v0.8h, v0.16b, v1.16b
426; CHECK-BE-NEXT:    rev64 v0.8h, v0.8h
427; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
428; CHECK-BE-NEXT:    ret
429entry:
430  %b = bitcast <8 x i16> %bb to <16 x i8>
431  %shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
432  %shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
433  %vabd.i.i.i = tail call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
434  %vmovl.i.i.i = zext <8 x i8> %vabd.i.i.i to <8 x i16>
435  ret <8 x i16> %vmovl.i.i.i
436}
437
; sqdmull of the high halves of %b and %c (where %c is bitcast from
; <16 x i8> to <8 x i16>), followed by a saturating add (sqadd) with the
; accumulator %a. The little-endian checks expect the pair to be emitted as a
; single sqdmlal2.
438define <4 x i32> @test_vqdmlal_high_s16_bitcast(<4 x i32> %a, <8 x i16> %b, <16 x i8> %cc) {
439; CHECK-LE-LABEL: test_vqdmlal_high_s16_bitcast:
440; CHECK-LE:       // %bb.0: // %entry
441; CHECK-LE-NEXT:    sqdmlal2 v0.4s, v1.8h, v2.8h
442; CHECK-LE-NEXT:    ret
443;
444; CHECK-BE-LABEL: test_vqdmlal_high_s16_bitcast:
445; CHECK-BE:       // %bb.0: // %entry
446; CHECK-BE-NEXT:    rev64 v1.8h, v1.8h
447; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s
448; CHECK-BE-NEXT:    rev64 v2.8h, v2.8h
449; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
450; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
451; CHECK-BE-NEXT:    ext v2.16b, v2.16b, v2.16b, #8
452; CHECK-BE-NEXT:    sqdmlal2 v0.4s, v1.8h, v2.8h
453; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s
454; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
455; CHECK-BE-NEXT:    ret
456entry:
457  %c = bitcast <16 x i8> %cc to <8 x i16>
458  %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
459  %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
460  %vqdmlal2.i.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
461  %vqdmlal4.i.i = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i.i)
462  ret <4 x i32> %vqdmlal4.i.i
463}
464
; Polynomial multiply of the high halves (bytes 8-15) of two values that
; arrive as scalar i128 arguments and are bitcast to <16 x i8>. The checks
; expect the high 64 bits to be moved from general-purpose registers (x1/x3)
; with fmov and the non-high pmull form to be used; no pmull2 is expected.
465define <8 x i16> @test_pmull_high_p8_128(i128 %aa, i128 %bb) {
466; CHECK-LE-LABEL: test_pmull_high_p8_128:
467; CHECK-LE:       // %bb.0: // %entry
468; CHECK-LE-NEXT:    fmov d0, x3
469; CHECK-LE-NEXT:    fmov d1, x1
470; CHECK-LE-NEXT:    pmull v0.8h, v1.8b, v0.8b
471; CHECK-LE-NEXT:    ret
472;
473; CHECK-BE-LABEL: test_pmull_high_p8_128:
474; CHECK-BE:       // %bb.0: // %entry
475; CHECK-BE-NEXT:    fmov d0, x3
476; CHECK-BE-NEXT:    fmov d1, x1
477; CHECK-BE-NEXT:    rev64 v0.8b, v0.8b
478; CHECK-BE-NEXT:    rev64 v1.8b, v1.8b
479; CHECK-BE-NEXT:    pmull v0.8h, v1.8b, v0.8b
480; CHECK-BE-NEXT:    rev64 v0.8h, v0.8h
481; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
482; CHECK-BE-NEXT:    ret
483entry:
484  %a = bitcast i128 %aa to <16 x i8>
485  %b = bitcast i128 %bb to <16 x i8>
486  %shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
487  %shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
488  %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
489  ret <8 x i16> %vmull.i.i
490}
491
; Polynomial multiply of the high halves (bytes 8-15) of two values that
; arrive as <2 x i64> vectors and are bitcast to <16 x i8>. The little-endian
; checks expect a single pmull2.
492define <8 x i16> @test_pmull_high_p8_64(<2 x i64> %aa, <2 x i64> %bb) {
493; CHECK-LE-LABEL: test_pmull_high_p8_64:
494; CHECK-LE:       // %bb.0: // %entry
495; CHECK-LE-NEXT:    pmull2 v0.8h, v0.16b, v1.16b
496; CHECK-LE-NEXT:    ret
497;
498; CHECK-BE-LABEL: test_pmull_high_p8_64:
499; CHECK-BE:       // %bb.0: // %entry
500; CHECK-BE-NEXT:    rev64 v0.16b, v0.16b
501; CHECK-BE-NEXT:    rev64 v1.16b, v1.16b
502; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
503; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
504; CHECK-BE-NEXT:    pmull2 v0.8h, v0.16b, v1.16b
505; CHECK-BE-NEXT:    rev64 v0.8h, v0.8h
506; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
507; CHECK-BE-NEXT:    ret
508entry:
509  %a = bitcast <2 x i64> %aa to <16 x i8>
510  %b = bitcast <2 x i64> %bb to <16 x i8>
511  %shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
512  %shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
513  %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
514  ret <8 x i16> %vmull.i.i
515}
516
; Both inputs are bitcast to <4 x i32>, logically shifted right by 5 and
; truncated to <4 x i16>. The two narrow halves are then joined by going
; through <1 x i64> bitcasts and a two-element shuffle, and the <2 x i64>
; result is bitcast back to <8 x i16>. The checks expect a shrn for the low
; half followed by a shrn2 writing the high half.
517define <8 x i16> @foov8i16(<16 x i8> %a1, <2 x i64> %b1) {
518; CHECK-LE-LABEL: foov8i16:
519; CHECK-LE:       // %bb.0:
520; CHECK-LE-NEXT:    shrn v0.4h, v0.4s, #5
521; CHECK-LE-NEXT:    shrn2 v0.8h, v1.4s, #5
522; CHECK-LE-NEXT:    ret
523;
524; CHECK-BE-LABEL: foov8i16:
525; CHECK-BE:       // %bb.0:
526; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s
527; CHECK-BE-NEXT:    rev64 v1.4s, v1.4s
528; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
529; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
530; CHECK-BE-NEXT:    shrn v0.4h, v0.4s, #5
531; CHECK-BE-NEXT:    shrn2 v0.8h, v1.4s, #5
532; CHECK-BE-NEXT:    rev64 v0.8h, v0.8h
533; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
534; CHECK-BE-NEXT:    ret
535  %a0 = bitcast <16 x i8> %a1 to <4 x i32>
536  %b0 = bitcast <2 x i64> %b1 to <4 x i32>
537  %vshrn_low_shift = lshr <4 x i32> %a0, <i32 5, i32 5, i32 5, i32 5>
538  %vshrn_low = trunc <4 x i32> %vshrn_low_shift to <4 x i16>
539  %vshrn_high_shift = lshr <4 x i32> %b0, <i32 5, i32 5, i32 5, i32 5>
540  %vshrn_high = trunc <4 x i32> %vshrn_high_shift to <4 x i16>
541  %1 = bitcast <4 x i16> %vshrn_low to <1 x i64>
542  %2 = bitcast <4 x i16> %vshrn_high to <1 x i64>
543  %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
544  %3 = bitcast <2 x i64> %shuffle.i to <8 x i16>
545  ret <8 x i16> %3
546}
547
; The <16 x i8> input is bitcast to <4 x i32>; its high two i32 lanes
; (elements 2-3) are zero-extended to <2 x i64> and shifted left by 1. The
; little-endian checks expect a single ushll2 with shift #1.
; NOTE(review): the function name mentions "asr", but the IR here performs a
; left shift (shl), not an arithmetic shift right.
548define <2 x i64> @hadd32_zext_asr(<16 x i8> %src1a) {
549; CHECK-LE-LABEL: hadd32_zext_asr:
550; CHECK-LE:       // %bb.0:
551; CHECK-LE-NEXT:    ushll2 v0.2d, v0.4s, #1
552; CHECK-LE-NEXT:    ret
553;
554; CHECK-BE-LABEL: hadd32_zext_asr:
555; CHECK-BE:       // %bb.0:
556; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s
557; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
558; CHECK-BE-NEXT:    ushll2 v0.2d, v0.4s, #1
559; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
560; CHECK-BE-NEXT:    ret
561  %src1 = bitcast <16 x i8> %src1a to <4 x i32>
562  %s1 = shufflevector <4 x i32> %src1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
563  %zextsrc1 = zext <2 x i32> %s1 to <2 x i64>
564  %resulti32 = shl <2 x i64> %zextsrc1, <i64 1, i64 1>
565  ret <2 x i64> %resulti32
566}
567
; 32-bit-lane splat case for umull: the first operand is a splat of i32
; element 1 of the (bitcast) <4 x i32> value and the second operand is the
; high half (elements 2-3) of %b. The checks expect the by-element umull2 form
; indexing v0.s[1].
; NOTE(review): the function name says "s16", but the operands here are i32
; lanes and the result is <2 x i64>.
568define <2 x i64> @test_umull_high_s16_splata1(<2 x i64> %aa, <4 x i32> %b) #0 {
569; CHECK-LE-LABEL: test_umull_high_s16_splata1:
570; CHECK-LE:       // %bb.0: // %entry
571; CHECK-LE-NEXT:    umull2 v0.2d, v1.4s, v0.s[1]
572; CHECK-LE-NEXT:    ret
573;
574; CHECK-BE-LABEL: test_umull_high_s16_splata1:
575; CHECK-BE:       // %bb.0: // %entry
576; CHECK-BE-NEXT:    rev64 v1.4s, v1.4s
577; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s
578; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
579; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
580; CHECK-BE-NEXT:    umull2 v0.2d, v1.4s, v0.s[1]
581; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
582; CHECK-BE-NEXT:    ret
583entry:
584  %a = bitcast <2 x i64> %aa to <4 x i32>
585  %s1 = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
586  %s2 = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
587  %r = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %s1, <2 x i32> %s2)
588  ret <2 x i64> %r
589}
590