xref: /llvm-project/llvm/test/CodeGen/AArch64/arm64-neon-v8.1a.ll (revision db158c7c830807caeeb0691739c41f1d522029e9)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-eabi -mattr=+rdm | FileCheck %s
3; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-eabi -mattr=+v8.1a | FileCheck %s
4
; sqrdmulh: signed saturating rounding doubling multiply returning high half.
5declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>)
6declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>)
7declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>)
8declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>)
9declare i32 @llvm.aarch64.neon.sqrdmulh.i32(i32, i32)
10
; sqadd: signed saturating add.
11declare <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16>, <4 x i16>)
12declare <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16>, <8 x i16>)
13declare <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32>, <2 x i32>)
14declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
15declare i32 @llvm.aarch64.neon.sqadd.i32(i32, i32)
16
; sqsub: signed saturating subtract.
17declare <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16>, <4 x i16>)
18declare <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16>, <8 x i16>)
19declare <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32>, <2 x i32>)
20declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)
21declare i32 @llvm.aarch64.neon.sqsub.i32(i32, i32)
22
; sqrdmlah: three-operand multiply-accumulate form (accumulator, multiplicand, multiplier).
23declare <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16>, <4 x i16>, <4 x i16>)
24declare <2 x i32> @llvm.aarch64.neon.sqrdmlah.v2i32(<2 x i32>, <2 x i32>, <2 x i32>)
25declare <8 x i16> @llvm.aarch64.neon.sqrdmlah.v8i16(<8 x i16>, <8 x i16>, <8 x i16>)
26declare <4 x i32> @llvm.aarch64.neon.sqrdmlah.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
27declare i32 @llvm.aarch64.neon.sqrdmlah.i32(i32, i32, i32)
28
; sqrdmlsh: three-operand multiply-subtract form (accumulator, multiplicand, multiplier).
29declare <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16>, <4 x i16>, <4 x i16>)
30declare <2 x i32> @llvm.aarch64.neon.sqrdmlsh.v2i32(<2 x i32>, <2 x i32>, <2 x i32>)
31declare <8 x i16> @llvm.aarch64.neon.sqrdmlsh.v8i16(<8 x i16>, <8 x i16>, <8 x i16>)
32declare <4 x i32> @llvm.aarch64.neon.sqrdmlsh.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
33declare i32 @llvm.aarch64.neon.sqrdmlsh.i32(i32, i32, i32)
34
35; The sqadd and sqsub calls in this file were previously folded with sqrdmulh into
36; sqrdmlah/sqrdmlsh where they shouldn't be. They should produce sqrdmulh plus sqadd/sqsub.
37
38;-----------------------------------------------------------------------------
39; RDMA Vector
40; test for SIMDThreeSameVectorSQRDMLxHTiedHS
41
; sqadd(%acc, sqrdmulh(%mhs, %rhs)) on <4 x i16>. The expected output keeps a
; separate sqrdmulh and sqadd (.4h) instead of a fused sqrdmlah.
42define <4 x i16> @test_sqrdmlah_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> %rhs) {
43; CHECK-LABEL: test_sqrdmlah_v4i16:
44; CHECK:       // %bb.0:
45; CHECK-NEXT:    sqrdmulh v1.4h, v1.4h, v2.4h
46; CHECK-NEXT:    sqadd v0.4h, v0.4h, v1.4h
47; CHECK-NEXT:    ret
48   %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %mhs,  <4 x i16> %rhs)
49   %retval =  call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc,  <4 x i16> %prod)
50   ret <4 x i16> %retval
51}
52
; sqadd(%acc, sqrdmulh(%mhs, %rhs)) on <8 x i16>. The expected output keeps a
; separate sqrdmulh and sqadd (.8h) instead of a fused sqrdmlah.
53define <8 x i16> @test_sqrdmlah_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> %rhs) {
54; CHECK-LABEL: test_sqrdmlah_v8i16:
55; CHECK:       // %bb.0:
56; CHECK-NEXT:    sqrdmulh v1.8h, v1.8h, v2.8h
57; CHECK-NEXT:    sqadd v0.8h, v0.8h, v1.8h
58; CHECK-NEXT:    ret
59   %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %mhs, <8 x i16> %rhs)
60   %retval =  call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc, <8 x i16> %prod)
61   ret <8 x i16> %retval
62}
63
; sqadd(%acc, sqrdmulh(%mhs, %rhs)) on <2 x i32>. The expected output keeps a
; separate sqrdmulh and sqadd (.2s) instead of a fused sqrdmlah.
64define <2 x i32> @test_sqrdmlah_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> %rhs) {
65; CHECK-LABEL: test_sqrdmlah_v2i32:
66; CHECK:       // %bb.0:
67; CHECK-NEXT:    sqrdmulh v1.2s, v1.2s, v2.2s
68; CHECK-NEXT:    sqadd v0.2s, v0.2s, v1.2s
69; CHECK-NEXT:    ret
70   %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %mhs, <2 x i32> %rhs)
71   %retval =  call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> %acc, <2 x i32> %prod)
72   ret <2 x i32> %retval
73}
74
; sqadd(%acc, sqrdmulh(%mhs, %rhs)) on <4 x i32>. The expected output keeps a
; separate sqrdmulh and sqadd (.4s) instead of a fused sqrdmlah.
75define <4 x i32> @test_sqrdmlah_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32> %rhs) {
76; CHECK-LABEL: test_sqrdmlah_v4i32:
77; CHECK:       // %bb.0:
78; CHECK-NEXT:    sqrdmulh v1.4s, v1.4s, v2.4s
79; CHECK-NEXT:    sqadd v0.4s, v0.4s, v1.4s
80; CHECK-NEXT:    ret
81   %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %mhs, <4 x i32> %rhs)
82   %retval =  call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc, <4 x i32> %prod)
83   ret <4 x i32> %retval
84}
85
; sqsub(%acc, sqrdmulh(%mhs, %rhs)) on <4 x i16>. The expected output keeps a
; separate sqrdmulh and sqsub (.4h) instead of a fused sqrdmlsh.
86define <4 x i16> @test_sqrdmlsh_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> %rhs) {
87; CHECK-LABEL: test_sqrdmlsh_v4i16:
88; CHECK:       // %bb.0:
89; CHECK-NEXT:    sqrdmulh v1.4h, v1.4h, v2.4h
90; CHECK-NEXT:    sqsub v0.4h, v0.4h, v1.4h
91; CHECK-NEXT:    ret
92   %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %mhs,  <4 x i16> %rhs)
93   %retval =  call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc, <4 x i16> %prod)
94   ret <4 x i16> %retval
95}
96
; sqsub(%acc, sqrdmulh(%mhs, %rhs)) on <8 x i16>. The expected output keeps a
; separate sqrdmulh and sqsub (.8h) instead of a fused sqrdmlsh.
97define <8 x i16> @test_sqrdmlsh_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> %rhs) {
98; CHECK-LABEL: test_sqrdmlsh_v8i16:
99; CHECK:       // %bb.0:
100; CHECK-NEXT:    sqrdmulh v1.8h, v1.8h, v2.8h
101; CHECK-NEXT:    sqsub v0.8h, v0.8h, v1.8h
102; CHECK-NEXT:    ret
103   %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %mhs, <8 x i16> %rhs)
104   %retval =  call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc, <8 x i16> %prod)
105   ret <8 x i16> %retval
106}
107
; sqsub(%acc, sqrdmulh(%mhs, %rhs)) on <2 x i32>. The expected output keeps a
; separate sqrdmulh and sqsub (.2s) instead of a fused sqrdmlsh.
108define <2 x i32> @test_sqrdmlsh_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> %rhs) {
109; CHECK-LABEL: test_sqrdmlsh_v2i32:
110; CHECK:       // %bb.0:
111; CHECK-NEXT:    sqrdmulh v1.2s, v1.2s, v2.2s
112; CHECK-NEXT:    sqsub v0.2s, v0.2s, v1.2s
113; CHECK-NEXT:    ret
114   %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %mhs, <2 x i32> %rhs)
115   %retval =  call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> %acc, <2 x i32> %prod)
116   ret <2 x i32> %retval
117}
118
; sqsub(%acc, sqrdmulh(%mhs, %rhs)) on <4 x i32>. The expected output keeps a
; separate sqrdmulh and sqsub (.4s) instead of a fused sqrdmlsh.
119define <4 x i32> @test_sqrdmlsh_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32> %rhs) {
120; CHECK-LABEL: test_sqrdmlsh_v4i32:
121; CHECK:       // %bb.0:
122; CHECK-NEXT:    sqrdmulh v1.4s, v1.4s, v2.4s
123; CHECK-NEXT:    sqsub v0.4s, v0.4s, v1.4s
124; CHECK-NEXT:    ret
125   %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %mhs, <4 x i32> %rhs)
126   %retval =  call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc, <4 x i32> %prod)
127   ret <4 x i32> %retval
128}
129
130;-----------------------------------------------------------------------------
131; RDMA Vector, by element
132; tests for vXiYY_indexed in SIMDIndexedSQRDMLxHSDTied
133
; The multiplier is a splat of lane 3 of %v (<4 x i16>), so sqrdmulh is expected
; in its by-element form (v2.h[3]); the sqadd stays a separate instruction.
134define <4 x i16> @test_sqrdmlah_lane_s16(<4 x i16> %acc, <4 x i16> %x, <4 x i16> %v) {
135; CHECK-LABEL: test_sqrdmlah_lane_s16:
136; CHECK:       // %bb.0: // %entry
137; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
138; CHECK-NEXT:    sqrdmulh v1.4h, v1.4h, v2.h[3]
139; CHECK-NEXT:    sqadd v0.4h, v0.4h, v1.4h
140; CHECK-NEXT:    ret
141entry:
142  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
143  %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
144  %retval =  call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc, <4 x i16> %prod)
145  ret <4 x i16> %retval
146}
147
; The multiplier is a splat of lane 2 of %v (<8 x i16>), so sqrdmulh is expected
; in its by-element form (v2.h[2]); the sqadd stays a separate instruction.
148define <8 x i16> @test_sqrdmlahq_lane_s16(<8 x i16> %acc, <8 x i16> %x, <8 x i16> %v) {
149; CHECK-LABEL: test_sqrdmlahq_lane_s16:
150; CHECK:       // %bb.0: // %entry
151; CHECK-NEXT:    sqrdmulh v1.8h, v1.8h, v2.h[2]
152; CHECK-NEXT:    sqadd v0.8h, v0.8h, v1.8h
153; CHECK-NEXT:    ret
154entry:
155  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
156  %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle)
157  %retval =  call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc, <8 x i16> %prod)
158  ret <8 x i16> %retval
159}
160
; The multiplier is a splat of lane 1 of %v (<2 x i32>), so sqrdmulh is expected
; in its by-element form (v2.s[1]); the sqadd stays a separate instruction.
161define <2 x i32> @test_sqrdmlah_lane_s32(<2 x i32> %acc, <2 x i32> %x, <2 x i32> %v) {
162; CHECK-LABEL: test_sqrdmlah_lane_s32:
163; CHECK:       // %bb.0: // %entry
164; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
165; CHECK-NEXT:    sqrdmulh v1.2s, v1.2s, v2.s[1]
166; CHECK-NEXT:    sqadd v0.2s, v0.2s, v1.2s
167; CHECK-NEXT:    ret
168entry:
169  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
170  %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle)
171  %retval =  call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> %acc, <2 x i32> %prod)
172  ret <2 x i32> %retval
173}
174
; The multiplier is a splat of lane 0 of %v (zeroinitializer mask on <4 x i32>),
; so sqrdmulh is expected in its by-element form (v2.s[0]); the sqadd stays separate.
175define <4 x i32> @test_sqrdmlahq_lane_s32(<4 x i32> %acc,<4 x i32> %x, <4 x i32> %v) {
176; CHECK-LABEL: test_sqrdmlahq_lane_s32:
177; CHECK:       // %bb.0: // %entry
178; CHECK-NEXT:    sqrdmulh v1.4s, v1.4s, v2.s[0]
179; CHECK-NEXT:    sqadd v0.4s, v0.4s, v1.4s
180; CHECK-NEXT:    ret
181entry:
182  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
183  %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle)
184  %retval =  call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc, <4 x i32> %prod)
185  ret <4 x i32> %retval
186}
187
; The multiplier is a splat of lane 3 of %v (<4 x i16>), so sqrdmulh is expected
; in its by-element form (v2.h[3]); the sqsub stays a separate instruction.
188define <4 x i16> @test_sqrdmlsh_lane_s16(<4 x i16> %acc, <4 x i16> %x, <4 x i16> %v) {
189; CHECK-LABEL: test_sqrdmlsh_lane_s16:
190; CHECK:       // %bb.0: // %entry
191; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
192; CHECK-NEXT:    sqrdmulh v1.4h, v1.4h, v2.h[3]
193; CHECK-NEXT:    sqsub v0.4h, v0.4h, v1.4h
194; CHECK-NEXT:    ret
195entry:
196  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
197  %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
198  %retval =  call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc, <4 x i16> %prod)
199  ret <4 x i16> %retval
200}
201
; The multiplier is a splat of lane 2 of %v (<8 x i16>), so sqrdmulh is expected
; in its by-element form (v2.h[2]); the sqsub stays a separate instruction.
202define <8 x i16> @test_sqrdmlshq_lane_s16(<8 x i16> %acc, <8 x i16> %x, <8 x i16> %v) {
203; CHECK-LABEL: test_sqrdmlshq_lane_s16:
204; CHECK:       // %bb.0: // %entry
205; CHECK-NEXT:    sqrdmulh v1.8h, v1.8h, v2.h[2]
206; CHECK-NEXT:    sqsub v0.8h, v0.8h, v1.8h
207; CHECK-NEXT:    ret
208entry:
209  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
210  %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle)
211  %retval =  call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc, <8 x i16> %prod)
212  ret <8 x i16> %retval
213}
214
; The multiplier is a splat of lane 1 of %v (<2 x i32>), so sqrdmulh is expected
; in its by-element form (v2.s[1]); the sqsub stays a separate instruction.
215define <2 x i32> @test_sqrdmlsh_lane_s32(<2 x i32> %acc, <2 x i32> %x, <2 x i32> %v) {
216; CHECK-LABEL: test_sqrdmlsh_lane_s32:
217; CHECK:       // %bb.0: // %entry
218; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
219; CHECK-NEXT:    sqrdmulh v1.2s, v1.2s, v2.s[1]
220; CHECK-NEXT:    sqsub v0.2s, v0.2s, v1.2s
221; CHECK-NEXT:    ret
222entry:
223  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
224  %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle)
225  %retval =  call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> %acc, <2 x i32> %prod)
226  ret <2 x i32> %retval
227}
228
; The multiplier is a splat of lane 0 of %v (zeroinitializer mask on <4 x i32>),
; so sqrdmulh is expected in its by-element form (v2.s[0]); the sqsub stays separate.
229define <4 x i32> @test_sqrdmlshq_lane_s32(<4 x i32> %acc,<4 x i32> %x, <4 x i32> %v) {
230; CHECK-LABEL: test_sqrdmlshq_lane_s32:
231; CHECK:       // %bb.0: // %entry
232; CHECK-NEXT:    sqrdmulh v1.4s, v1.4s, v2.s[0]
233; CHECK-NEXT:    sqsub v0.4s, v0.4s, v1.4s
234; CHECK-NEXT:    ret
235entry:
236  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
237  %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle)
238  %retval =  call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc, <4 x i32> %prod)
239  ret <4 x i32> %retval
240}
241
242;-----------------------------------------------------------------------------
243; RDMA Vector, by element, extracted
244; i16 tests are for vXi16_indexed in SIMDIndexedSQRDMLxHSDTied, with IR in ACLE style
245; i32 tests are for "def : Pat" in SIMDIndexedSQRDMLxHSDTied
246
; Scalar i16 accumulate written in vector form: %acc is inserted into lane 0 of an
; undef <4 x i16>, the product uses a splat of lane 1 of %v, and only element 0 of
; the sqadd result is returned. Expected output is by-element sqrdmulh plus sqadd.
247define i16 @test_sqrdmlah_extracted_lane_s16(i16 %acc,<4 x i16> %x, <4 x i16> %v) {
248; CHECK-LABEL: test_sqrdmlah_extracted_lane_s16:
249; CHECK:       // %bb.0: // %entry
250; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
251; CHECK-NEXT:    sqrdmulh v0.4h, v0.4h, v1.h[1]
252; CHECK-NEXT:    fmov s1, w0
253; CHECK-NEXT:    sqadd v0.4h, v1.4h, v0.4h
254; CHECK-NEXT:    umov w0, v0.h[0]
255; CHECK-NEXT:    ret
256entry:
257  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 1,i32 1,i32 1,i32 1>
258  %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
259  %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0
260  %retval_vec =  call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod)
261  %retval = extractelement <4 x i16> %retval_vec, i64 0
262  ret i16 %retval
263}
264
; <8 x i16> variant: %acc is inserted into lane 0 of an undef vector, the product
; uses a splat of lane 1 of %v, and only element 0 of the sqadd result is returned.
; Expected output is by-element sqrdmulh plus sqadd.
265define i16 @test_sqrdmlahq_extracted_lane_s16(i16 %acc,<8 x i16> %x, <8 x i16> %v) {
266; CHECK-LABEL: test_sqrdmlahq_extracted_lane_s16:
267; CHECK:       // %bb.0: // %entry
268; CHECK-NEXT:    sqrdmulh v0.8h, v0.8h, v1.h[1]
269; CHECK-NEXT:    fmov s1, w0
270; CHECK-NEXT:    sqadd v0.8h, v1.8h, v0.8h
271; CHECK-NEXT:    umov w0, v0.h[0]
272; CHECK-NEXT:    ret
273entry:
274  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 1,i32 1,i32 1,i32 1, i32 1,i32 1,i32 1,i32 1>
275  %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle)
276  %acc_vec = insertelement <8 x i16> undef, i16 %acc, i64 0
277  %retval_vec =  call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc_vec, <8 x i16> %prod)
278  %retval = extractelement <8 x i16> %retval_vec, i64 0
279  ret i16 %retval
280}
281
; Vector sqrdmulh by a lane-0 splat of %v (<2 x i32>); element 0 of the product is
; extracted and accumulated with the scalar i32 sqadd. Expected output is by-element
; sqrdmulh followed by a scalar sqadd.
282define i32 @test_sqrdmlah_extracted_lane_s32(i32 %acc,<2 x i32> %x, <2 x i32> %v) {
283; CHECK-LABEL: test_sqrdmlah_extracted_lane_s32:
284; CHECK:       // %bb.0: // %entry
285; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
286; CHECK-NEXT:    sqrdmulh v0.2s, v0.2s, v1.s[0]
287; CHECK-NEXT:    fmov s1, w0
288; CHECK-NEXT:    sqadd s0, s1, s0
289; CHECK-NEXT:    fmov w0, s0
290; CHECK-NEXT:    ret
291entry:
292  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
293  %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle)
294  %extract = extractelement <2 x i32> %prod, i64 0
295  %retval =  call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %extract)
296  ret i32 %retval
297}
298
; Vector sqrdmulh by a lane-0 splat of %v (<4 x i32>); element 0 of the product is
; extracted and accumulated with the scalar i32 sqadd. Expected output is by-element
; sqrdmulh followed by a scalar sqadd.
299define i32 @test_sqrdmlahq_extracted_lane_s32(i32 %acc,<4 x i32> %x, <4 x i32> %v) {
300; CHECK-LABEL: test_sqrdmlahq_extracted_lane_s32:
301; CHECK:       // %bb.0: // %entry
302; CHECK-NEXT:    sqrdmulh v0.4s, v0.4s, v1.s[0]
303; CHECK-NEXT:    fmov s1, w0
304; CHECK-NEXT:    sqadd s0, s1, s0
305; CHECK-NEXT:    fmov w0, s0
306; CHECK-NEXT:    ret
307entry:
308  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
309  %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle)
310  %extract = extractelement <4 x i32> %prod, i64 0
311  %retval =  call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %extract)
312  ret i32 %retval
313}
314
; Scalar i16 subtract written in vector form: %acc is inserted into lane 0 of an
; undef <4 x i16>, the product uses a splat of lane 1 of %v, and only element 0 of
; the sqsub result is returned. Expected output is by-element sqrdmulh plus sqsub.
315define i16 @test_sqrdmlsh_extracted_lane_s16(i16 %acc,<4 x i16> %x, <4 x i16> %v) {
316; CHECK-LABEL: test_sqrdmlsh_extracted_lane_s16:
317; CHECK:       // %bb.0: // %entry
318; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
319; CHECK-NEXT:    sqrdmulh v0.4h, v0.4h, v1.h[1]
320; CHECK-NEXT:    fmov s1, w0
321; CHECK-NEXT:    sqsub v0.4h, v1.4h, v0.4h
322; CHECK-NEXT:    umov w0, v0.h[0]
323; CHECK-NEXT:    ret
324entry:
325  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 1,i32 1,i32 1,i32 1>
326  %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
327  %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0
328  %retval_vec =  call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod)
329  %retval = extractelement <4 x i16> %retval_vec, i64 0
330  ret i16 %retval
331}
332
; <8 x i16> variant: %acc is inserted into lane 0 of an undef vector, the product
; uses a splat of lane 1 of %v, and only element 0 of the sqsub result is returned.
; Expected output is by-element sqrdmulh plus sqsub.
333define i16 @test_sqrdmlshq_extracted_lane_s16(i16 %acc,<8 x i16> %x, <8 x i16> %v) {
334; CHECK-LABEL: test_sqrdmlshq_extracted_lane_s16:
335; CHECK:       // %bb.0: // %entry
336; CHECK-NEXT:    sqrdmulh v0.8h, v0.8h, v1.h[1]
337; CHECK-NEXT:    fmov s1, w0
338; CHECK-NEXT:    sqsub v0.8h, v1.8h, v0.8h
339; CHECK-NEXT:    umov w0, v0.h[0]
340; CHECK-NEXT:    ret
341entry:
342  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 1,i32 1,i32 1,i32 1, i32 1,i32 1,i32 1,i32 1>
343  %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle)
344  %acc_vec = insertelement <8 x i16> undef, i16 %acc, i64 0
345  %retval_vec =  call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc_vec, <8 x i16> %prod)
346  %retval = extractelement <8 x i16> %retval_vec, i64 0
347  ret i16 %retval
348}
349
; Vector sqrdmulh by a lane-0 splat of %v (<2 x i32>); element 0 of the product is
; extracted and subtracted with the scalar i32 sqsub. Expected output is by-element
; sqrdmulh followed by a scalar sqsub.
350define i32 @test_sqrdmlsh_extracted_lane_s32(i32 %acc,<2 x i32> %x, <2 x i32> %v) {
351; CHECK-LABEL: test_sqrdmlsh_extracted_lane_s32:
352; CHECK:       // %bb.0: // %entry
353; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
354; CHECK-NEXT:    sqrdmulh v0.2s, v0.2s, v1.s[0]
355; CHECK-NEXT:    fmov s1, w0
356; CHECK-NEXT:    sqsub s0, s1, s0
357; CHECK-NEXT:    fmov w0, s0
358; CHECK-NEXT:    ret
359entry:
360  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
361  %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle)
362  %extract = extractelement <2 x i32> %prod, i64 0
363  %retval =  call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %extract)
364  ret i32 %retval
365}
366
; Vector sqrdmulh by a lane-0 splat of %v (<4 x i32>); element 0 of the product is
; extracted and subtracted with the scalar i32 sqsub. Expected output is by-element
; sqrdmulh followed by a scalar sqsub.
367define i32 @test_sqrdmlshq_extracted_lane_s32(i32 %acc,<4 x i32> %x, <4 x i32> %v) {
368; CHECK-LABEL: test_sqrdmlshq_extracted_lane_s32:
369; CHECK:       // %bb.0: // %entry
370; CHECK-NEXT:    sqrdmulh v0.4s, v0.4s, v1.s[0]
371; CHECK-NEXT:    fmov s1, w0
372; CHECK-NEXT:    sqsub s0, s1, s0
373; CHECK-NEXT:    fmov w0, s0
374; CHECK-NEXT:    ret
375entry:
376  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
377  %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle)
378  %extract = extractelement <4 x i32> %prod, i64 0
379  %retval =  call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %extract)
380  ret i32 %retval
381}
382
383;-----------------------------------------------------------------------------
384; RDMA Scalar
385; test for "def : Pat" near SIMDThreeScalarHSTied in AArch64InstrInfo.td
386
; All three i16 scalars are placed in lane 0 of undef <4 x i16> vectors; the result
; is element 0 of sqadd(acc_vec, sqrdmulh(x_vec, y_vec)). Expected output keeps the
; vector sqrdmulh and sqadd (.4h) as separate instructions.
387define i16 @test_sqrdmlah_v1i16(i16 %acc, i16 %x, i16 %y) {
388; CHECK-LABEL: test_sqrdmlah_v1i16:
389; CHECK:       // %bb.0:
390; CHECK-NEXT:    fmov s0, w1
391; CHECK-NEXT:    fmov s1, w2
392; CHECK-NEXT:    sqrdmulh v0.4h, v0.4h, v1.4h
393; CHECK-NEXT:    fmov s1, w0
394; CHECK-NEXT:    sqadd v0.4h, v1.4h, v0.4h
395; CHECK-NEXT:    umov w0, v0.h[0]
396; CHECK-NEXT:    ret
397  %x_vec = insertelement <4 x i16> undef, i16 %x, i64 0
398  %y_vec = insertelement <4 x i16> undef, i16 %y, i64 0
399  %prod_vec = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x_vec,  <4 x i16> %y_vec)
400  %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0
401  %retval_vec =  call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc_vec,  <4 x i16> %prod_vec)
402  %retval = extractelement <4 x i16> %retval_vec, i64 0
403  ret i16 %retval
404}
405
; All three i32 scalars are placed in lane 0 of undef <4 x i32> vectors; the result
; is element 0 of sqadd(acc_vec, sqrdmulh(x_vec, y_vec)). Expected output keeps the
; vector sqrdmulh and sqadd (.4s) as separate instructions.
406define i32 @test_sqrdmlah_v1i32(i32 %acc, i32 %x, i32 %y) {
407; CHECK-LABEL: test_sqrdmlah_v1i32:
408; CHECK:       // %bb.0:
409; CHECK-NEXT:    fmov s0, w1
410; CHECK-NEXT:    fmov s1, w2
411; CHECK-NEXT:    sqrdmulh v0.4s, v0.4s, v1.4s
412; CHECK-NEXT:    fmov s1, w0
413; CHECK-NEXT:    sqadd v0.4s, v1.4s, v0.4s
414; CHECK-NEXT:    fmov w0, s0
415; CHECK-NEXT:    ret
416  %x_vec = insertelement <4 x i32> undef, i32 %x, i64 0
417  %y_vec = insertelement <4 x i32> undef, i32 %y, i64 0
418  %prod_vec = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x_vec,  <4 x i32> %y_vec)
419  %acc_vec = insertelement <4 x i32> undef, i32 %acc, i64 0
420  %retval_vec =  call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc_vec,  <4 x i32> %prod_vec)
421  %retval = extractelement <4 x i32> %retval_vec, i64 0
422  ret i32 %retval
423}
424
425
; All three i16 scalars are placed in lane 0 of undef <4 x i16> vectors; the result
; is element 0 of sqsub(acc_vec, sqrdmulh(x_vec, y_vec)). Expected output keeps the
; vector sqrdmulh and sqsub (.4h) as separate instructions.
426define i16 @test_sqrdmlsh_v1i16(i16 %acc, i16 %x, i16 %y) {
427; CHECK-LABEL: test_sqrdmlsh_v1i16:
428; CHECK:       // %bb.0:
429; CHECK-NEXT:    fmov s0, w1
430; CHECK-NEXT:    fmov s1, w2
431; CHECK-NEXT:    sqrdmulh v0.4h, v0.4h, v1.4h
432; CHECK-NEXT:    fmov s1, w0
433; CHECK-NEXT:    sqsub v0.4h, v1.4h, v0.4h
434; CHECK-NEXT:    umov w0, v0.h[0]
435; CHECK-NEXT:    ret
436  %x_vec = insertelement <4 x i16> undef, i16 %x, i64 0
437  %y_vec = insertelement <4 x i16> undef, i16 %y, i64 0
438  %prod_vec = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x_vec,  <4 x i16> %y_vec)
439  %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0
440  %retval_vec =  call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc_vec,  <4 x i16> %prod_vec)
441  %retval = extractelement <4 x i16> %retval_vec, i64 0
442  ret i16 %retval
443}
444
; All three i32 scalars are placed in lane 0 of undef <4 x i32> vectors; the result
; is element 0 of sqsub(acc_vec, sqrdmulh(x_vec, y_vec)). Expected output keeps the
; vector sqrdmulh and sqsub (.4s) as separate instructions.
445define i32 @test_sqrdmlsh_v1i32(i32 %acc, i32 %x, i32 %y) {
446; CHECK-LABEL: test_sqrdmlsh_v1i32:
447; CHECK:       // %bb.0:
448; CHECK-NEXT:    fmov s0, w1
449; CHECK-NEXT:    fmov s1, w2
450; CHECK-NEXT:    sqrdmulh v0.4s, v0.4s, v1.4s
451; CHECK-NEXT:    fmov s1, w0
452; CHECK-NEXT:    sqsub v0.4s, v1.4s, v0.4s
453; CHECK-NEXT:    fmov w0, s0
454; CHECK-NEXT:    ret
455  %x_vec = insertelement <4 x i32> undef, i32 %x, i64 0
456  %y_vec = insertelement <4 x i32> undef, i32 %y, i64 0
457  %prod_vec = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x_vec,  <4 x i32> %y_vec)
458  %acc_vec = insertelement <4 x i32> undef, i32 %acc, i64 0
459  %retval_vec =  call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc_vec,  <4 x i32> %prod_vec)
460  %retval = extractelement <4 x i32> %retval_vec, i64 0
461  ret i32 %retval
462}
463
; Pure scalar form: sqadd.i32(%acc, sqrdmulh.i32(%mhs, %rhs)). Expected output is a
; scalar sqrdmulh followed by a scalar sqadd, not a fused sqrdmlah.
464define i32 @test_sqrdmlah_i32(i32 %acc, i32 %mhs, i32 %rhs) {
465; CHECK-LABEL: test_sqrdmlah_i32:
466; CHECK:       // %bb.0:
467; CHECK-NEXT:    fmov s0, w1
468; CHECK-NEXT:    fmov s1, w2
469; CHECK-NEXT:    sqrdmulh s0, s0, s1
470; CHECK-NEXT:    fmov s1, w0
471; CHECK-NEXT:    sqadd s0, s1, s0
472; CHECK-NEXT:    fmov w0, s0
473; CHECK-NEXT:    ret
474  %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs,  i32 %rhs)
475  %retval =  call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc,  i32 %prod)
476  ret i32 %retval
477}
478
; Pure scalar form: sqsub.i32(%acc, sqrdmulh.i32(%mhs, %rhs)). Expected output is a
; scalar sqrdmulh followed by a scalar sqsub, not a fused sqrdmlsh.
479define i32 @test_sqrdmlsh_i32(i32 %acc, i32 %mhs, i32 %rhs) {
480; CHECK-LABEL: test_sqrdmlsh_i32:
481; CHECK:       // %bb.0:
482; CHECK-NEXT:    fmov s0, w1
483; CHECK-NEXT:    fmov s1, w2
484; CHECK-NEXT:    sqrdmulh s0, s0, s1
485; CHECK-NEXT:    fmov s1, w0
486; CHECK-NEXT:    sqsub s0, s1, s0
487; CHECK-NEXT:    fmov w0, s0
488; CHECK-NEXT:    ret
489  %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs,  i32 %rhs)
490  %retval =  call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc,  i32 %prod)
491  ret i32 %retval
492}
493
494;-----------------------------------------------------------------------------
495; RDMA Scalar, by element
496; i16 tests are performed via tests in above chapter, with IR in ACLE style
497; i32 tests are for v1i32_indexed in SIMDIndexedSQRDMLxHSDTied
498
; Scalar %x and %acc are inserted into lane 0 of undef <4 x i16> vectors; the
; multiplier is a splat of lane 1 of %y_vec. Element 0 of the sqadd result is
; returned. Expected output is by-element sqrdmulh (v0.h[1]) plus a separate sqadd.
499define i16 @test_sqrdmlah_extract_i16(i16 %acc, i16 %x, <4 x i16> %y_vec) {
500; CHECK-LABEL: test_sqrdmlah_extract_i16:
501; CHECK:       // %bb.0:
502; CHECK-NEXT:    fmov s1, w1
503; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
504; CHECK-NEXT:    sqrdmulh v0.4h, v1.4h, v0.h[1]
505; CHECK-NEXT:    fmov s1, w0
506; CHECK-NEXT:    sqadd v0.4h, v1.4h, v0.4h
507; CHECK-NEXT:    umov w0, v0.h[0]
508; CHECK-NEXT:    ret
509  %shuffle = shufflevector <4 x i16> %y_vec, <4 x i16> undef, <4 x i32> <i32 1,i32 1,i32 1,i32 1>
510  %x_vec = insertelement <4 x i16> undef, i16 %x, i64 0
511  %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x_vec, <4 x i16> %shuffle)
512  %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0
513  %retval_vec =  call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod)
514  %retval = extractelement <4 x i16> %retval_vec, i32 0
515  ret i16 %retval
516}
517
; Scalar sqrdmulh.i32 whose multiplier is lane 3 extracted from %rhs, followed by a
; scalar sqadd.i32. Expected output is the scalar by-element sqrdmulh (v0.s[3]) plus
; a separate scalar sqadd.
518define i32 @test_sqrdmlah_extract_i32(i32 %acc, i32 %mhs, <4 x i32> %rhs) {
519; CHECK-LABEL: test_sqrdmlah_extract_i32:
520; CHECK:       // %bb.0:
521; CHECK-NEXT:    fmov s1, w1
522; CHECK-NEXT:    sqrdmulh s0, s1, v0.s[3]
523; CHECK-NEXT:    fmov s1, w0
524; CHECK-NEXT:    sqadd s0, s1, s0
525; CHECK-NEXT:    fmov w0, s0
526; CHECK-NEXT:    ret
527  %extract = extractelement <4 x i32> %rhs, i32 3
528  %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs,  i32 %extract)
529  %retval =  call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc,  i32 %prod)
530  ret i32 %retval
531}
532
; Scalar %x and %acc are inserted into lane 0 of undef <8 x i16> vectors; the
; multiplier is a splat of lane 1 of %y_vec. Element 0 of the sqsub result is
; returned. Expected output is by-element sqrdmulh (v0.h[1]) plus a separate sqsub.
533define i16 @test_sqrdmlshq_extract_i16(i16 %acc, i16 %x, <8 x i16> %y_vec) {
534; CHECK-LABEL: test_sqrdmlshq_extract_i16:
535; CHECK:       // %bb.0:
536; CHECK-NEXT:    fmov s1, w1
537; CHECK-NEXT:    sqrdmulh v0.8h, v1.8h, v0.h[1]
538; CHECK-NEXT:    fmov s1, w0
539; CHECK-NEXT:    sqsub v0.8h, v1.8h, v0.8h
540; CHECK-NEXT:    umov w0, v0.h[0]
541; CHECK-NEXT:    ret
542  %shuffle = shufflevector <8 x i16> %y_vec, <8 x i16> undef, <8 x i32> <i32 1,i32 1,i32 1,i32 1,i32 1,i32 1,i32 1,i32 1>
543  %x_vec = insertelement <8 x i16> undef, i16 %x, i64 0
544  %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x_vec, <8 x i16> %shuffle)
545  %acc_vec = insertelement <8 x i16> undef, i16 %acc, i64 0
546  %retval_vec =  call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc_vec, <8 x i16> %prod)
547  %retval = extractelement <8 x i16> %retval_vec, i32 0
548  ret i16 %retval
549}
550
; Scalar sqrdmulh.i32 whose multiplier is lane 3 extracted from %rhs, followed by a
; scalar sqsub.i32. Expected output is the scalar by-element sqrdmulh (v0.s[3]) plus
; a separate scalar sqsub.
551define i32 @test_sqrdmlsh_extract_i32(i32 %acc, i32 %mhs, <4 x i32> %rhs) {
552; CHECK-LABEL: test_sqrdmlsh_extract_i32:
553; CHECK:       // %bb.0:
554; CHECK-NEXT:    fmov s1, w1
555; CHECK-NEXT:    sqrdmulh s0, s1, v0.s[3]
556; CHECK-NEXT:    fmov s1, w0
557; CHECK-NEXT:    sqsub s0, s1, s0
558; CHECK-NEXT:    fmov w0, s0
559; CHECK-NEXT:    ret
560  %extract = extractelement <4 x i32> %rhs, i32 3
561  %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs,  i32 %extract)
562  %retval =  call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc,  i32 %prod)
563  ret i32 %retval
564}
565
566
567;-----------------------------------------------------------------------------
568; Using sqrdmlah intrinsics
569
; Direct call to the sqrdmlah intrinsic on <4 x i16>, with the multiplier formed
; as a 4-wide splat of lane 7 of the <8 x i16> %v. Expected output is a single
; by-element sqrdmlah (v2.h[7]).
570define <4 x i16> @test_vqrdmlah_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
571; CHECK-LABEL: test_vqrdmlah_laneq_s16:
572; CHECK:       // %bb.0: // %entry
573; CHECK-NEXT:    sqrdmlah v0.4h, v1.4h, v2.h[7]
574; CHECK-NEXT:    ret
575entry:
576  %lane = shufflevector <8 x i16> %v, <8 x i16> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
577  %vqrdmlah_v3.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %lane) #4
578  ret <4 x i16> %vqrdmlah_v3.i
579}
580
; Direct call to the sqrdmlah intrinsic on <2 x i32>, with the multiplier formed
; as a 2-wide splat of lane 3 of the <4 x i32> %v. Expected output is a single
; by-element sqrdmlah (v2.s[3]).
581define <2 x i32> @test_vqrdmlah_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
582; CHECK-LABEL: test_vqrdmlah_laneq_s32:
583; CHECK:       // %bb.0: // %entry
584; CHECK-NEXT:    sqrdmlah v0.2s, v1.2s, v2.s[3]
585; CHECK-NEXT:    ret
586entry:
587  %lane = shufflevector <4 x i32> %v, <4 x i32> poison, <2 x i32> <i32 3, i32 3>
588  %vqrdmlah_v3.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmlah.v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %lane) #4
589  ret <2 x i32> %vqrdmlah_v3.i
590}
591
; Direct call to the sqrdmlah intrinsic on <8 x i16>, with the multiplier formed
; as a splat of lane 7 of %v. Expected output is a single by-element sqrdmlah
; (v2.h[7]).
592define <8 x i16> @test_vqrdmlahq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) {
593; CHECK-LABEL: test_vqrdmlahq_laneq_s16:
594; CHECK:       // %bb.0: // %entry
595; CHECK-NEXT:    sqrdmlah v0.8h, v1.8h, v2.h[7]
596; CHECK-NEXT:    ret
597entry:
598  %lane = shufflevector <8 x i16> %v, <8 x i16> poison, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
599  %vqrdmlahq_v3.i = tail call <8 x i16> @llvm.aarch64.neon.sqrdmlah.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %lane) #4
600  ret <8 x i16> %vqrdmlahq_v3.i
601}
602
; Direct call to the sqrdmlah intrinsic on <4 x i32>, with the multiplier formed
; as a splat of lane 3 of %v. Expected output is a single by-element sqrdmlah
; (v2.s[3]).
603define <4 x i32> @test_vqrdmlahq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) {
604; CHECK-LABEL: test_vqrdmlahq_laneq_s32:
605; CHECK:       // %bb.0: // %entry
606; CHECK-NEXT:    sqrdmlah v0.4s, v1.4s, v2.s[3]
607; CHECK-NEXT:    ret
608entry:
609  %lane = shufflevector <4 x i32> %v, <4 x i32> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
610  %vqrdmlahq_v3.i = tail call <4 x i32> @llvm.aarch64.neon.sqrdmlah.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %lane) #4
611  ret <4 x i32> %vqrdmlahq_v3.i
612}
613
; Scalar i16 sqrdmlah expressed through the <4 x i16> intrinsic: each scalar is
; inserted into lane 0 of an undef vector and element 0 of the result is returned.
; Expected output is a single vector sqrdmlah (.4h) followed by a lane-0 umov.
614define i16 @test_vqrdmlahh_s16(i16 %a, i16 %b, i16 %c) {
615; CHECK-LABEL: test_vqrdmlahh_s16:
616; CHECK:       // %bb.0: // %entry
617; CHECK-NEXT:    fmov s0, w0
618; CHECK-NEXT:    fmov s1, w1
619; CHECK-NEXT:    fmov s2, w2
620; CHECK-NEXT:    sqrdmlah v0.4h, v1.4h, v2.4h
621; CHECK-NEXT:    umov w0, v0.h[0]
622; CHECK-NEXT:    ret
623entry:
624  %0 = insertelement <4 x i16> undef, i16 %a, i64 0
625  %1 = insertelement <4 x i16> undef, i16 %b, i64 0
626  %2 = insertelement <4 x i16> undef, i16 %c, i64 0
627  %vqrdmlahh_s16.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> %0, <4 x i16> %1, <4 x i16> %2) #4
628  %3 = extractelement <4 x i16> %vqrdmlahh_s16.i, i64 0
629  ret i16 %3
630}
631
; Direct call to the scalar i32 sqrdmlah intrinsic. Expected output moves the three
; GPR arguments into s0-s2 and uses a single scalar sqrdmlah.
632define i32 @test_vqrdmlahs_s32(i32 %a, i32 %b, i32 %c) {
633; CHECK-LABEL: test_vqrdmlahs_s32:
634; CHECK:       // %bb.0: // %entry
635; CHECK-NEXT:    fmov s0, w0
636; CHECK-NEXT:    fmov s1, w1
637; CHECK-NEXT:    fmov s2, w2
638; CHECK-NEXT:    sqrdmlah s0, s1, s2
639; CHECK-NEXT:    fmov w0, s0
640; CHECK-NEXT:    ret
641entry:
642  %vqrdmlahs_s32.i = tail call i32 @llvm.aarch64.neon.sqrdmlah.i32(i32 %a, i32 %b, i32 %c) #4
643  ret i32 %vqrdmlahs_s32.i
644}
645
; Scalar i16 sqrdmlah by lane: %a and %b are inserted into lane 0 of undef vectors,
; and the multiplier vector takes lane 3 of %c into its element 0 (other mask
; elements are undef). Expected output is a by-element sqrdmlah using v0.h[3].
646define i16 @test_vqrdmlahh_lane_s16(i16 %a, i16 %b, <4 x i16> %c) {
647; CHECK-LABEL: test_vqrdmlahh_lane_s16:
648; CHECK:       // %bb.0: // %entry
649; CHECK-NEXT:    fmov s1, w0
650; CHECK-NEXT:    fmov s2, w1
651; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
652; CHECK-NEXT:    sqrdmlah v1.4h, v2.4h, v0.h[3]
653; CHECK-NEXT:    umov w0, v1.h[0]
654; CHECK-NEXT:    ret
655entry:
656  %0 = insertelement <4 x i16> undef, i16 %a, i64 0
657  %1 = insertelement <4 x i16> undef, i16 %b, i64 0
658  %2 = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
659  %vqrdmlahh_s16.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> %0, <4 x i16> %1, <4 x i16> %2) #4
660  %3 = extractelement <4 x i16> %vqrdmlahh_s16.i, i64 0
661  ret i16 %3
662}
663
; Scalar i32 sqrdmlah whose multiplier is lane 1 extracted from the <2 x i32> %c.
; Expected output is the scalar by-element sqrdmlah using v0.s[1].
664define i32 @test_vqrdmlahs_lane_s32(i32 %a, i32 %b, <2 x i32> %c) {
665; CHECK-LABEL: test_vqrdmlahs_lane_s32:
666; CHECK:       // %bb.0: // %entry
667; CHECK-NEXT:    fmov s1, w0
668; CHECK-NEXT:    fmov s2, w1
669; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
670; CHECK-NEXT:    sqrdmlah s1, s2, v0.s[1]
671; CHECK-NEXT:    fmov w0, s1
672; CHECK-NEXT:    ret
673entry:
674  %vget_lane = extractelement <2 x i32> %c, i64 1
675  %vqrdmlahs_s32.i = tail call i32 @llvm.aarch64.neon.sqrdmlah.i32(i32 %a, i32 %b, i32 %vget_lane) #4
676  ret i32 %vqrdmlahs_s32.i
677}
678
; Scalar i16 sqrdmlah by lane of an <8 x i16>: %a and %b are inserted into lane 0 of
; undef vectors, and the multiplier vector takes lane 7 of %c into its element 0
; (other mask elements are undef). Expected output is a by-element sqrdmlah using v0.h[7].
679define i16 @test_vqrdmlahh_laneq_s16(i16 %a, i16 %b, <8 x i16> %c) {
680; CHECK-LABEL: test_vqrdmlahh_laneq_s16:
681; CHECK:       // %bb.0: // %entry
682; CHECK-NEXT:    fmov s1, w0
683; CHECK-NEXT:    fmov s2, w1
684; CHECK-NEXT:    sqrdmlah v1.4h, v2.4h, v0.h[7]
685; CHECK-NEXT:    umov w0, v1.h[0]
686; CHECK-NEXT:    ret
687entry:
688  %0 = insertelement <4 x i16> undef, i16 %a, i64 0
689  %1 = insertelement <4 x i16> undef, i16 %b, i64 0
690  %2 = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 7, i32 undef, i32 undef, i32 undef>
691  %vqrdmlahh_s16.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> %0, <4 x i16> %1, <4 x i16> %2) #4
692  %3 = extractelement <4 x i16> %vqrdmlahh_s16.i, i64 0
693  ret i16 %3
694}
695
; Scalar i32 sqrdmlah whose third operand is lane 3 extracted from
; <4 x i32> %c. The expected assembly folds the extractelement into the
; by-element scalar form (s1, s2, v0.s[3]) and moves the result back to w0.
696define i32 @test_vqrdmlahs_laneq_s32(i32 %a, i32 %b, <4 x i32> %c) {
697; CHECK-LABEL: test_vqrdmlahs_laneq_s32:
698; CHECK:       // %bb.0: // %entry
699; CHECK-NEXT:    fmov s1, w0
700; CHECK-NEXT:    fmov s2, w1
701; CHECK-NEXT:    sqrdmlah s1, s2, v0.s[3]
702; CHECK-NEXT:    fmov w0, s1
703; CHECK-NEXT:    ret
704entry:
705  %vgetq_lane = extractelement <4 x i32> %c, i64 3
706  %vqrdmlahs_s32.i = tail call i32 @llvm.aarch64.neon.sqrdmlah.i32(i32 %a, i32 %b, i32 %vgetq_lane) #4
707  ret i32 %vqrdmlahs_s32.i
708}
709
; <4 x i16> sqrdmlsh whose third operand is lane 7 of <8 x i16> %v splatted
; across all four lanes by the shuffle. The expected assembly is a single
; by-element sqrdmlsh indexing v2.h[7], with no separate dup.
710define <4 x i16> @test_vqrdmlsh_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
711; CHECK-LABEL: test_vqrdmlsh_laneq_s16:
712; CHECK:       // %bb.0: // %entry
713; CHECK-NEXT:    sqrdmlsh v0.4h, v1.4h, v2.h[7]
714; CHECK-NEXT:    ret
715entry:
716  %lane = shufflevector <8 x i16> %v, <8 x i16> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
717  %vqrdmlsh_v3.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %lane) #4
718  ret <4 x i16> %vqrdmlsh_v3.i
719}
720
; <2 x i32> sqrdmlsh whose third operand is lane 3 of <4 x i32> %v splatted
; across both lanes by the shuffle. The expected assembly is a single
; by-element sqrdmlsh indexing v2.s[3], with no separate dup.
721define <2 x i32> @test_vqrdmlsh_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
722; CHECK-LABEL: test_vqrdmlsh_laneq_s32:
723; CHECK:       // %bb.0: // %entry
724; CHECK-NEXT:    sqrdmlsh v0.2s, v1.2s, v2.s[3]
725; CHECK-NEXT:    ret
726entry:
727  %lane = shufflevector <4 x i32> %v, <4 x i32> poison, <2 x i32> <i32 3, i32 3>
728  %vqrdmlsh_v3.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmlsh.v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %lane) #4
729  ret <2 x i32> %vqrdmlsh_v3.i
730}
731
; <8 x i16> sqrdmlsh whose third operand is lane 7 of <8 x i16> %v splatted
; across all eight lanes by the shuffle. The expected assembly is a single
; by-element sqrdmlsh indexing v2.h[7], with no separate dup.
732define <8 x i16> @test_vqrdmlshq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) {
733; CHECK-LABEL: test_vqrdmlshq_laneq_s16:
734; CHECK:       // %bb.0: // %entry
735; CHECK-NEXT:    sqrdmlsh v0.8h, v1.8h, v2.h[7]
736; CHECK-NEXT:    ret
737entry:
738  %lane = shufflevector <8 x i16> %v, <8 x i16> poison, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
739  %vqrdmlshq_v3.i = tail call <8 x i16> @llvm.aarch64.neon.sqrdmlsh.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %lane) #4
740  ret <8 x i16> %vqrdmlshq_v3.i
741}
742
; <4 x i32> sqrdmlsh whose third operand is lane 3 of <4 x i32> %v splatted
; across all four lanes by the shuffle. The expected assembly is a single
; by-element sqrdmlsh indexing v2.s[3], with no separate dup.
743define <4 x i32> @test_vqrdmlshq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) {
744; CHECK-LABEL: test_vqrdmlshq_laneq_s32:
745; CHECK:       // %bb.0: // %entry
746; CHECK-NEXT:    sqrdmlsh v0.4s, v1.4s, v2.s[3]
747; CHECK-NEXT:    ret
748entry:
749  %lane = shufflevector <4 x i32> %v, <4 x i32> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
750  %vqrdmlshq_v3.i = tail call <4 x i32> @llvm.aarch64.neon.sqrdmlsh.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %lane) #4
751  ret <4 x i32> %vqrdmlshq_v3.i
752}
753
; Scalar i16 sqrdmlsh with all three operands passed as scalars. Each of %a,
; %b and %c is inserted into lane 0 of an otherwise-undef <4 x i16> vector,
; the v4i16 intrinsic is called, and lane 0 of its result is returned. The
; expected assembly uses the full-vector .4h form and a umov of lane 0 into w0.
754define i16 @test_vqrdmlshh_s16(i16 %a, i16 %b, i16 %c) {
755; CHECK-LABEL: test_vqrdmlshh_s16:
756; CHECK:       // %bb.0: // %entry
757; CHECK-NEXT:    fmov s0, w0
758; CHECK-NEXT:    fmov s1, w1
759; CHECK-NEXT:    fmov s2, w2
760; CHECK-NEXT:    sqrdmlsh v0.4h, v1.4h, v2.4h
761; CHECK-NEXT:    umov w0, v0.h[0]
762; CHECK-NEXT:    ret
763entry:
764  %0 = insertelement <4 x i16> undef, i16 %a, i64 0
765  %1 = insertelement <4 x i16> undef, i16 %b, i64 0
766  %2 = insertelement <4 x i16> undef, i16 %c, i64 0
767  %vqrdmlshh_s16.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> %0, <4 x i16> %1, <4 x i16> %2) #4
768  %3 = extractelement <4 x i16> %vqrdmlshh_s16.i, i64 0
769  ret i16 %3
770}
771
; Scalar i32 sqrdmlsh called directly through the .i32 intrinsic with three
; scalar arguments. The expected assembly moves w0-w2 into s0-s2, issues the
; scalar sqrdmlsh s0, s1, s2, and moves the result back to w0.
772define i32 @test_vqrdmlshs_s32(i32 %a, i32 %b, i32 %c) {
773; CHECK-LABEL: test_vqrdmlshs_s32:
774; CHECK:       // %bb.0: // %entry
775; CHECK-NEXT:    fmov s0, w0
776; CHECK-NEXT:    fmov s1, w1
777; CHECK-NEXT:    fmov s2, w2
778; CHECK-NEXT:    sqrdmlsh s0, s1, s2
779; CHECK-NEXT:    fmov w0, s0
780; CHECK-NEXT:    ret
781entry:
782  %vqrdmlshs_s32.i = tail call i32 @llvm.aarch64.neon.sqrdmlsh.i32(i32 %a, i32 %b, i32 %c) #4
783  ret i32 %vqrdmlshs_s32.i
784}
785
; Scalar i16 sqrdmlsh whose third operand comes from lane 3 of <4 x i16> %c.
; %a and %b are inserted into lane 0 of otherwise-undef <4 x i16> vectors, the
; shuffle moves lane 3 of %c into lane 0, and lane 0 of the v4i16 intrinsic
; result is returned. The expected assembly is the by-element form indexing
; v0.h[3], followed by a umov of lane 0 into w0.
786define i16 @test_vqrdmlshh_lane_s16(i16 %a, i16 %b, <4 x i16> %c) {
787; CHECK-LABEL: test_vqrdmlshh_lane_s16:
788; CHECK:       // %bb.0: // %entry
789; CHECK-NEXT:    fmov s1, w0
790; CHECK-NEXT:    fmov s2, w1
791; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
792; CHECK-NEXT:    sqrdmlsh v1.4h, v2.4h, v0.h[3]
793; CHECK-NEXT:    umov w0, v1.h[0]
794; CHECK-NEXT:    ret
795entry:
796  %0 = insertelement <4 x i16> undef, i16 %a, i64 0
797  %1 = insertelement <4 x i16> undef, i16 %b, i64 0
798  %2 = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
799  %vqrdmlshh_s16.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> %0, <4 x i16> %1, <4 x i16> %2) #4
800  %3 = extractelement <4 x i16> %vqrdmlshh_s16.i, i64 0
801  ret i16 %3
802}
803
; Scalar i32 sqrdmlsh whose third operand is lane 1 extracted from
; <2 x i32> %c. The expected assembly folds the extractelement into the
; by-element scalar form (s1, s2, v0.s[1]) and moves the result back to w0.
804define i32 @test_vqrdmlshs_lane_s32(i32 %a, i32 %b, <2 x i32> %c) {
805; CHECK-LABEL: test_vqrdmlshs_lane_s32:
806; CHECK:       // %bb.0: // %entry
807; CHECK-NEXT:    fmov s1, w0
808; CHECK-NEXT:    fmov s2, w1
809; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
810; CHECK-NEXT:    sqrdmlsh s1, s2, v0.s[1]
811; CHECK-NEXT:    fmov w0, s1
812; CHECK-NEXT:    ret
813entry:
814  %vget_lane = extractelement <2 x i32> %c, i64 1
815  %vqrdmlshs_s32.i = tail call i32 @llvm.aarch64.neon.sqrdmlsh.i32(i32 %a, i32 %b, i32 %vget_lane) #4
816  ret i32 %vqrdmlshs_s32.i
817}
818
; Scalar i16 sqrdmlsh whose third operand comes from lane 7 of <8 x i16> %c.
; %a and %b are inserted into lane 0 of otherwise-undef <4 x i16> vectors; the
; shuffle narrows %c to <4 x i16> with its lane 7 in lane 0. Lane 0 of the
; v4i16 intrinsic result is returned. The expected assembly is the by-element
; form indexing v0.h[7].
819define i16 @test_vqrdmlshh_laneq_s16(i16 %a, i16 %b, <8 x i16> %c) {
820; CHECK-LABEL: test_vqrdmlshh_laneq_s16:
821; CHECK:       // %bb.0: // %entry
822; CHECK-NEXT:    fmov s1, w0
823; CHECK-NEXT:    fmov s2, w1
824; CHECK-NEXT:    sqrdmlsh v1.4h, v2.4h, v0.h[7]
825; CHECK-NEXT:    umov w0, v1.h[0]
826; CHECK-NEXT:    ret
827entry:
828  %0 = insertelement <4 x i16> undef, i16 %a, i64 0
829  %1 = insertelement <4 x i16> undef, i16 %b, i64 0
830  %2 = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 7, i32 undef, i32 undef, i32 undef>
831  %vqrdmlshh_s16.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> %0, <4 x i16> %1, <4 x i16> %2) #4
832  %3 = extractelement <4 x i16> %vqrdmlshh_s16.i, i64 0
833  ret i16 %3
834}
835
; Scalar i32 sqrdmlsh whose third operand is lane 3 extracted from
; <4 x i32> %c. The expected assembly folds the extractelement into the
; by-element scalar form (s1, s2, v0.s[3]) and moves the result back to w0.
836define i32 @test_vqrdmlshs_laneq_s32(i32 %a, i32 %b, <4 x i32> %c) {
837; CHECK-LABEL: test_vqrdmlshs_laneq_s32:
838; CHECK:       // %bb.0: // %entry
839; CHECK-NEXT:    fmov s1, w0
840; CHECK-NEXT:    fmov s2, w1
841; CHECK-NEXT:    sqrdmlsh s1, s2, v0.s[3]
842; CHECK-NEXT:    fmov w0, s1
843; CHECK-NEXT:    ret
844entry:
845  %vgetq_lane = extractelement <4 x i32> %c, i64 3
846  %vqrdmlshs_s32.i = tail call i32 @llvm.aarch64.neon.sqrdmlsh.i32(i32 %a, i32 %b, i32 %vgetq_lane) #4
847  ret i32 %vqrdmlshs_s32.i
848}
849