xref: /llvm-project/clang/test/CodeGen/AArch64/neon-2velem.c (revision 207e5ccceec8d3cc3f32723e78f2a142bc61b07d)
1*207e5cccSFangrui Song // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
2*207e5cccSFangrui Song // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
3*207e5cccSFangrui Song 
4*207e5cccSFangrui Song // REQUIRES: aarch64-registered-target || arm-registered-target
5*207e5cccSFangrui Song 
6*207e5cccSFangrui Song #include <arm_neon.h>
7*207e5cccSFangrui Song 
8*207e5cccSFangrui Song // CHECK-LABEL: @test_vmla_lane_s16(
9*207e5cccSFangrui Song // CHECK-NEXT:  entry:
10*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
11*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
12*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
13*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
14*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
15*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i16> [[ADD]]
16*207e5cccSFangrui Song //
// Multiply-accumulate by lane: calls vmla_lane_s16 with lane 3 of the
// 4 x i16 vector v. The expected IR above splats that lane to 4 elements,
// multiplies it with b and adds the product to a.
17*207e5cccSFangrui Song int16x4_t test_vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v) {
18*207e5cccSFangrui Song   return vmla_lane_s16(a, b, v, 3);
19*207e5cccSFangrui Song }
20*207e5cccSFangrui Song 
21*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlaq_lane_s16(
22*207e5cccSFangrui Song // CHECK-NEXT:  entry:
23*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
24*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
25*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
26*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
27*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
28*207e5cccSFangrui Song // CHECK-NEXT:    ret <8 x i16> [[ADD]]
29*207e5cccSFangrui Song //
// Multiply-accumulate by lane: calls vmlaq_lane_s16 with lane 3 of the
// 4 x i16 vector v. The expected IR above splats that lane to 8 elements,
// multiplies it with b and adds the product to a.
30*207e5cccSFangrui Song int16x8_t test_vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v) {
31*207e5cccSFangrui Song   return vmlaq_lane_s16(a, b, v, 3);
32*207e5cccSFangrui Song }
33*207e5cccSFangrui Song 
34*207e5cccSFangrui Song // CHECK-LABEL: @test_vmla_lane_s32(
35*207e5cccSFangrui Song // CHECK-NEXT:  entry:
36*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
37*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
38*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
39*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
40*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
41*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i32> [[ADD]]
42*207e5cccSFangrui Song //
// Multiply-accumulate by lane: calls vmla_lane_s32 with lane 1 of the
// 2 x i32 vector v. The expected IR above splats that lane to 2 elements,
// multiplies it with b and adds the product to a.
43*207e5cccSFangrui Song int32x2_t test_vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v) {
44*207e5cccSFangrui Song   return vmla_lane_s32(a, b, v, 1);
45*207e5cccSFangrui Song }
46*207e5cccSFangrui Song 
47*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlaq_lane_s32(
48*207e5cccSFangrui Song // CHECK-NEXT:  entry:
49*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
50*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
51*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
52*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
53*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
54*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[ADD]]
55*207e5cccSFangrui Song //
// Multiply-accumulate by lane: calls vmlaq_lane_s32 with lane 1 of the
// 2 x i32 vector v. The expected IR above splats that lane to 4 elements,
// multiplies it with b and adds the product to a.
56*207e5cccSFangrui Song int32x4_t test_vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v) {
57*207e5cccSFangrui Song   return vmlaq_lane_s32(a, b, v, 1);
58*207e5cccSFangrui Song }
59*207e5cccSFangrui Song 
60*207e5cccSFangrui Song // CHECK-LABEL: @test_vmla_laneq_s16(
61*207e5cccSFangrui Song // CHECK-NEXT:  entry:
62*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
63*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
64*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
65*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
66*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
67*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i16> [[ADD]]
68*207e5cccSFangrui Song //
// Multiply-accumulate by lane: calls vmla_laneq_s16 with lane 7 of the
// 8 x i16 vector v. The expected IR above splats that lane to 4 elements,
// multiplies it with b and adds the product to a.
69*207e5cccSFangrui Song int16x4_t test_vmla_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
70*207e5cccSFangrui Song   return vmla_laneq_s16(a, b, v, 7);
71*207e5cccSFangrui Song }
72*207e5cccSFangrui Song 
73*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlaq_laneq_s16(
74*207e5cccSFangrui Song // CHECK-NEXT:  entry:
75*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
76*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
77*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
78*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
79*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
80*207e5cccSFangrui Song // CHECK-NEXT:    ret <8 x i16> [[ADD]]
81*207e5cccSFangrui Song //
// Multiply-accumulate by lane: calls vmlaq_laneq_s16 with lane 7 of the
// 8 x i16 vector v. The expected IR above splats that lane to 8 elements,
// multiplies it with b and adds the product to a.
82*207e5cccSFangrui Song int16x8_t test_vmlaq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
83*207e5cccSFangrui Song   return vmlaq_laneq_s16(a, b, v, 7);
84*207e5cccSFangrui Song }
85*207e5cccSFangrui Song 
86*207e5cccSFangrui Song // CHECK-LABEL: @test_vmla_laneq_s32(
87*207e5cccSFangrui Song // CHECK-NEXT:  entry:
88*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
89*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
90*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
91*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
92*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
93*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i32> [[ADD]]
94*207e5cccSFangrui Song //
// Multiply-accumulate by lane: calls vmla_laneq_s32 with lane 3 of the
// 4 x i32 vector v. The expected IR above splats that lane to 2 elements,
// multiplies it with b and adds the product to a.
95*207e5cccSFangrui Song int32x2_t test_vmla_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
96*207e5cccSFangrui Song   return vmla_laneq_s32(a, b, v, 3);
97*207e5cccSFangrui Song }
98*207e5cccSFangrui Song 
99*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlaq_laneq_s32(
100*207e5cccSFangrui Song // CHECK-NEXT:  entry:
101*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
102*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
103*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
104*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
105*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
106*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[ADD]]
107*207e5cccSFangrui Song //
// Multiply-accumulate by lane: calls vmlaq_laneq_s32 with lane 3 of the
// 4 x i32 vector v. The expected IR above splats that lane to 4 elements,
// multiplies it with b and adds the product to a.
108*207e5cccSFangrui Song int32x4_t test_vmlaq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) {
109*207e5cccSFangrui Song   return vmlaq_laneq_s32(a, b, v, 3);
110*207e5cccSFangrui Song }
111*207e5cccSFangrui Song 
112*207e5cccSFangrui Song // CHECK-LABEL: @test_vmls_lane_s16(
113*207e5cccSFangrui Song // CHECK-NEXT:  entry:
114*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
115*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
116*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
117*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
118*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
119*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i16> [[SUB]]
120*207e5cccSFangrui Song //
// Multiply-subtract by lane: calls vmls_lane_s16 with lane 3 of the
// 4 x i16 vector v. The expected IR above splats that lane to 4 elements,
// multiplies it with b and subtracts the product from a.
121*207e5cccSFangrui Song int16x4_t test_vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v) {
122*207e5cccSFangrui Song   return vmls_lane_s16(a, b, v, 3);
123*207e5cccSFangrui Song }
124*207e5cccSFangrui Song 
125*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlsq_lane_s16(
126*207e5cccSFangrui Song // CHECK-NEXT:  entry:
127*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
128*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
129*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
130*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
131*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
132*207e5cccSFangrui Song // CHECK-NEXT:    ret <8 x i16> [[SUB]]
133*207e5cccSFangrui Song //
// Multiply-subtract by lane: calls vmlsq_lane_s16 with lane 3 of the
// 4 x i16 vector v. The expected IR above splats that lane to 8 elements,
// multiplies it with b and subtracts the product from a.
134*207e5cccSFangrui Song int16x8_t test_vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v) {
135*207e5cccSFangrui Song   return vmlsq_lane_s16(a, b, v, 3);
136*207e5cccSFangrui Song }
137*207e5cccSFangrui Song 
138*207e5cccSFangrui Song // CHECK-LABEL: @test_vmls_lane_s32(
139*207e5cccSFangrui Song // CHECK-NEXT:  entry:
140*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
141*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
142*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
143*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
144*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
145*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i32> [[SUB]]
146*207e5cccSFangrui Song //
// Multiply-subtract by lane: calls vmls_lane_s32 with lane 1 of the
// 2 x i32 vector v. The expected IR above splats that lane to 2 elements,
// multiplies it with b and subtracts the product from a.
147*207e5cccSFangrui Song int32x2_t test_vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v) {
148*207e5cccSFangrui Song   return vmls_lane_s32(a, b, v, 1);
149*207e5cccSFangrui Song }
150*207e5cccSFangrui Song 
151*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlsq_lane_s32(
152*207e5cccSFangrui Song // CHECK-NEXT:  entry:
153*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
154*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
155*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
156*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
157*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
158*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[SUB]]
159*207e5cccSFangrui Song //
// Multiply-subtract by lane: calls vmlsq_lane_s32 with lane 1 of the
// 2 x i32 vector v. The expected IR above splats that lane to 4 elements,
// multiplies it with b and subtracts the product from a.
160*207e5cccSFangrui Song int32x4_t test_vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v) {
161*207e5cccSFangrui Song   return vmlsq_lane_s32(a, b, v, 1);
162*207e5cccSFangrui Song }
163*207e5cccSFangrui Song 
164*207e5cccSFangrui Song // CHECK-LABEL: @test_vmls_laneq_s16(
165*207e5cccSFangrui Song // CHECK-NEXT:  entry:
166*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
167*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
168*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
169*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
170*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
171*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i16> [[SUB]]
172*207e5cccSFangrui Song //
// Multiply-subtract by lane: calls vmls_laneq_s16 with lane 7 of the
// 8 x i16 vector v. The expected IR above splats that lane to 4 elements,
// multiplies it with b and subtracts the product from a.
173*207e5cccSFangrui Song int16x4_t test_vmls_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
174*207e5cccSFangrui Song   return vmls_laneq_s16(a, b, v, 7);
175*207e5cccSFangrui Song }
176*207e5cccSFangrui Song 
177*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlsq_laneq_s16(
178*207e5cccSFangrui Song // CHECK-NEXT:  entry:
179*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
180*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
181*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
182*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
183*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
184*207e5cccSFangrui Song // CHECK-NEXT:    ret <8 x i16> [[SUB]]
185*207e5cccSFangrui Song //
// Multiply-subtract by lane: calls vmlsq_laneq_s16 with lane 7 of the
// 8 x i16 vector v. The expected IR above splats that lane to 8 elements,
// multiplies it with b and subtracts the product from a.
186*207e5cccSFangrui Song int16x8_t test_vmlsq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
187*207e5cccSFangrui Song   return vmlsq_laneq_s16(a, b, v, 7);
188*207e5cccSFangrui Song }
189*207e5cccSFangrui Song 
190*207e5cccSFangrui Song // CHECK-LABEL: @test_vmls_laneq_s32(
191*207e5cccSFangrui Song // CHECK-NEXT:  entry:
192*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
193*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
194*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
195*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
196*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
197*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i32> [[SUB]]
198*207e5cccSFangrui Song //
// Multiply-subtract by lane: calls vmls_laneq_s32 with lane 3 of the
// 4 x i32 vector v. The expected IR above splats that lane to 2 elements,
// multiplies it with b and subtracts the product from a.
199*207e5cccSFangrui Song int32x2_t test_vmls_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
200*207e5cccSFangrui Song   return vmls_laneq_s32(a, b, v, 3);
201*207e5cccSFangrui Song }
202*207e5cccSFangrui Song 
203*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlsq_laneq_s32(
204*207e5cccSFangrui Song // CHECK-NEXT:  entry:
205*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
206*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
207*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
208*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
209*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
210*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[SUB]]
211*207e5cccSFangrui Song //
// Multiply-subtract by lane: calls vmlsq_laneq_s32 with lane 3 of the
// 4 x i32 vector v. The expected IR above splats that lane to 4 elements,
// multiplies it with b and subtracts the product from a.
212*207e5cccSFangrui Song int32x4_t test_vmlsq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) {
213*207e5cccSFangrui Song   return vmlsq_laneq_s32(a, b, v, 3);
214*207e5cccSFangrui Song }
215*207e5cccSFangrui Song 
216*207e5cccSFangrui Song // CHECK-LABEL: @test_vmul_lane_s16(
217*207e5cccSFangrui Song // CHECK-NEXT:  entry:
218*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
219*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
220*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
221*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
222*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i16> [[MUL]]
223*207e5cccSFangrui Song //
// Multiply by lane: calls vmul_lane_s16 with lane 3 of the 4 x i16 vector v.
// The expected IR above splats that lane to 4 elements and multiplies it
// with a.
224*207e5cccSFangrui Song int16x4_t test_vmul_lane_s16(int16x4_t a, int16x4_t v) {
225*207e5cccSFangrui Song   return vmul_lane_s16(a, v, 3);
226*207e5cccSFangrui Song }
227*207e5cccSFangrui Song 
228*207e5cccSFangrui Song // CHECK-LABEL: @test_vmulq_lane_s16(
229*207e5cccSFangrui Song // CHECK-NEXT:  entry:
230*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
231*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
232*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
233*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
234*207e5cccSFangrui Song // CHECK-NEXT:    ret <8 x i16> [[MUL]]
235*207e5cccSFangrui Song //
// Multiply by lane: calls vmulq_lane_s16 with lane 3 of the 4 x i16 vector v.
// The expected IR above splats that lane to 8 elements and multiplies it
// with a.
236*207e5cccSFangrui Song int16x8_t test_vmulq_lane_s16(int16x8_t a, int16x4_t v) {
237*207e5cccSFangrui Song   return vmulq_lane_s16(a, v, 3);
238*207e5cccSFangrui Song }
239*207e5cccSFangrui Song 
240*207e5cccSFangrui Song // CHECK-LABEL: @test_vmul_lane_s32(
241*207e5cccSFangrui Song // CHECK-NEXT:  entry:
242*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
243*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
244*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
245*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
246*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i32> [[MUL]]
247*207e5cccSFangrui Song //
// Multiply by lane: calls vmul_lane_s32 with lane 1 of the 2 x i32 vector v.
// The expected IR above splats that lane to 2 elements and multiplies it
// with a.
248*207e5cccSFangrui Song int32x2_t test_vmul_lane_s32(int32x2_t a, int32x2_t v) {
249*207e5cccSFangrui Song   return vmul_lane_s32(a, v, 1);
250*207e5cccSFangrui Song }
251*207e5cccSFangrui Song 
252*207e5cccSFangrui Song // CHECK-LABEL: @test_vmulq_lane_s32(
253*207e5cccSFangrui Song // CHECK-NEXT:  entry:
254*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
255*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
256*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
257*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
258*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[MUL]]
259*207e5cccSFangrui Song //
// Multiply by lane: calls vmulq_lane_s32 with lane 1 of the 2 x i32 vector v.
// The expected IR above splats that lane to 4 elements and multiplies it
// with a.
260*207e5cccSFangrui Song int32x4_t test_vmulq_lane_s32(int32x4_t a, int32x2_t v) {
261*207e5cccSFangrui Song   return vmulq_lane_s32(a, v, 1);
262*207e5cccSFangrui Song }
263*207e5cccSFangrui Song 
264*207e5cccSFangrui Song // CHECK-LABEL: @test_vmul_lane_u16(
265*207e5cccSFangrui Song // CHECK-NEXT:  entry:
266*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
267*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
268*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
269*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
270*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i16> [[MUL]]
271*207e5cccSFangrui Song //
// Unsigned multiply by lane: calls vmul_lane_u16 with lane 3 of v.
// The expected IR above is the same splat + `mul <4 x i16>` sequence that
// the signed s16 variant of this test expects.
272*207e5cccSFangrui Song uint16x4_t test_vmul_lane_u16(uint16x4_t a, uint16x4_t v) {
273*207e5cccSFangrui Song   return vmul_lane_u16(a, v, 3);
274*207e5cccSFangrui Song }
275*207e5cccSFangrui Song 
276*207e5cccSFangrui Song // CHECK-LABEL: @test_vmulq_lane_u16(
277*207e5cccSFangrui Song // CHECK-NEXT:  entry:
278*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
279*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
280*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
281*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
282*207e5cccSFangrui Song // CHECK-NEXT:    ret <8 x i16> [[MUL]]
283*207e5cccSFangrui Song //
// Unsigned multiply by lane: calls vmulq_lane_u16 with lane 3 of the
// 4-element vector v. The expected IR above splats that lane to 8 elements
// and multiplies it with a using a plain `mul <8 x i16>`.
284*207e5cccSFangrui Song uint16x8_t test_vmulq_lane_u16(uint16x8_t a, uint16x4_t v) {
285*207e5cccSFangrui Song   return vmulq_lane_u16(a, v, 3);
286*207e5cccSFangrui Song }
287*207e5cccSFangrui Song 
288*207e5cccSFangrui Song // CHECK-LABEL: @test_vmul_lane_u32(
289*207e5cccSFangrui Song // CHECK-NEXT:  entry:
290*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
291*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
292*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
293*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
294*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i32> [[MUL]]
295*207e5cccSFangrui Song //
// Unsigned multiply by lane: calls vmul_lane_u32 with lane 1 of v.
// The expected IR above splats that lane to 2 elements and multiplies it
// with a using a plain `mul <2 x i32>`.
296*207e5cccSFangrui Song uint32x2_t test_vmul_lane_u32(uint32x2_t a, uint32x2_t v) {
297*207e5cccSFangrui Song   return vmul_lane_u32(a, v, 1);
298*207e5cccSFangrui Song }
299*207e5cccSFangrui Song 
300*207e5cccSFangrui Song // CHECK-LABEL: @test_vmulq_lane_u32(
301*207e5cccSFangrui Song // CHECK-NEXT:  entry:
302*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
303*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
304*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
305*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
306*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[MUL]]
307*207e5cccSFangrui Song //
// Unsigned multiply by lane: calls vmulq_lane_u32 with lane 1 of the
// 2-element vector v. The expected IR above splats that lane to 4 elements
// and multiplies it with a using a plain `mul <4 x i32>`.
308*207e5cccSFangrui Song uint32x4_t test_vmulq_lane_u32(uint32x4_t a, uint32x2_t v) {
309*207e5cccSFangrui Song   return vmulq_lane_u32(a, v, 1);
310*207e5cccSFangrui Song }
311*207e5cccSFangrui Song 
312*207e5cccSFangrui Song // CHECK-LABEL: @test_vmul_laneq_s16(
313*207e5cccSFangrui Song // CHECK-NEXT:  entry:
314*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
315*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
316*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
317*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
318*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i16> [[MUL]]
319*207e5cccSFangrui Song //
// Multiply by lane: calls vmul_laneq_s16 with lane 7 of the 8 x i16 vector v.
// The expected IR above splats that lane to 4 elements and multiplies it
// with a.
320*207e5cccSFangrui Song int16x4_t test_vmul_laneq_s16(int16x4_t a, int16x8_t v) {
321*207e5cccSFangrui Song   return vmul_laneq_s16(a, v, 7);
322*207e5cccSFangrui Song }
323*207e5cccSFangrui Song 
324*207e5cccSFangrui Song // CHECK-LABEL: @test_vmulq_laneq_s16(
325*207e5cccSFangrui Song // CHECK-NEXT:  entry:
326*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
327*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
328*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
329*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
330*207e5cccSFangrui Song // CHECK-NEXT:    ret <8 x i16> [[MUL]]
331*207e5cccSFangrui Song //
// Multiply by lane: calls vmulq_laneq_s16 with lane 7 of the 8 x i16
// vector v. The expected IR above splats that lane to 8 elements and
// multiplies it with a.
332*207e5cccSFangrui Song int16x8_t test_vmulq_laneq_s16(int16x8_t a, int16x8_t v) {
333*207e5cccSFangrui Song   return vmulq_laneq_s16(a, v, 7);
334*207e5cccSFangrui Song }
335*207e5cccSFangrui Song 
336*207e5cccSFangrui Song // CHECK-LABEL: @test_vmul_laneq_s32(
337*207e5cccSFangrui Song // CHECK-NEXT:  entry:
338*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
339*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
340*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
341*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
342*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i32> [[MUL]]
343*207e5cccSFangrui Song //
// Multiply by lane: calls vmul_laneq_s32 with lane 3 of the 4 x i32 vector v.
// The expected IR above splats that lane to 2 elements and multiplies it
// with a.
344*207e5cccSFangrui Song int32x2_t test_vmul_laneq_s32(int32x2_t a, int32x4_t v) {
345*207e5cccSFangrui Song   return vmul_laneq_s32(a, v, 3);
346*207e5cccSFangrui Song }
347*207e5cccSFangrui Song 
348*207e5cccSFangrui Song // CHECK-LABEL: @test_vmulq_laneq_s32(
349*207e5cccSFangrui Song // CHECK-NEXT:  entry:
350*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
351*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
352*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
353*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
354*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[MUL]]
355*207e5cccSFangrui Song //
// Multiply by lane: calls vmulq_laneq_s32 with lane 3 of the 4 x i32
// vector v. The expected IR above splats that lane to 4 elements and
// multiplies it with a.
356*207e5cccSFangrui Song int32x4_t test_vmulq_laneq_s32(int32x4_t a, int32x4_t v) {
357*207e5cccSFangrui Song   return vmulq_laneq_s32(a, v, 3);
358*207e5cccSFangrui Song }
359*207e5cccSFangrui Song 
360*207e5cccSFangrui Song // CHECK-LABEL: @test_vmul_laneq_u16(
361*207e5cccSFangrui Song // CHECK-NEXT:  entry:
362*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
363*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
364*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
365*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
366*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i16> [[MUL]]
367*207e5cccSFangrui Song //
// Unsigned multiply by lane: calls vmul_laneq_u16 with lane 7 of the
// 8-element vector v. The expected IR above splats that lane to 4 elements
// and multiplies it with a using a plain `mul <4 x i16>`.
368*207e5cccSFangrui Song uint16x4_t test_vmul_laneq_u16(uint16x4_t a, uint16x8_t v) {
369*207e5cccSFangrui Song   return vmul_laneq_u16(a, v, 7);
370*207e5cccSFangrui Song }
371*207e5cccSFangrui Song 
372*207e5cccSFangrui Song // CHECK-LABEL: @test_vmulq_laneq_u16(
373*207e5cccSFangrui Song // CHECK-NEXT:  entry:
374*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
375*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
376*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
377*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
378*207e5cccSFangrui Song // CHECK-NEXT:    ret <8 x i16> [[MUL]]
379*207e5cccSFangrui Song //
// Unsigned multiply by lane: calls vmulq_laneq_u16 with lane 7 of the
// 8-element vector v. The expected IR above splats that lane to 8 elements
// and multiplies it with a using a plain `mul <8 x i16>`.
380*207e5cccSFangrui Song uint16x8_t test_vmulq_laneq_u16(uint16x8_t a, uint16x8_t v) {
381*207e5cccSFangrui Song   return vmulq_laneq_u16(a, v, 7);
382*207e5cccSFangrui Song }
383*207e5cccSFangrui Song 
384*207e5cccSFangrui Song // CHECK-LABEL: @test_vmul_laneq_u32(
385*207e5cccSFangrui Song // CHECK-NEXT:  entry:
386*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
387*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
388*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
389*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
390*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i32> [[MUL]]
391*207e5cccSFangrui Song //
// Unsigned multiply by lane: calls vmul_laneq_u32 with lane 3 of the
// 4-element vector v. The expected IR above splats that lane to 2 elements
// and multiplies it with a using a plain `mul <2 x i32>`.
392*207e5cccSFangrui Song uint32x2_t test_vmul_laneq_u32(uint32x2_t a, uint32x4_t v) {
393*207e5cccSFangrui Song   return vmul_laneq_u32(a, v, 3);
394*207e5cccSFangrui Song }
395*207e5cccSFangrui Song 
396*207e5cccSFangrui Song // CHECK-LABEL: @test_vmulq_laneq_u32(
397*207e5cccSFangrui Song // CHECK-NEXT:  entry:
398*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
399*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
400*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
401*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
402*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[MUL]]
403*207e5cccSFangrui Song //
// Unsigned multiply by lane: calls vmulq_laneq_u32 with lane 3 of the
// 4-element vector v. The expected IR above splats that lane to 4 elements
// and multiplies it with a using a plain `mul <4 x i32>`.
404*207e5cccSFangrui Song uint32x4_t test_vmulq_laneq_u32(uint32x4_t a, uint32x4_t v) {
405*207e5cccSFangrui Song   return vmulq_laneq_u32(a, v, 3);
406*207e5cccSFangrui Song }
407*207e5cccSFangrui Song 
408*207e5cccSFangrui Song // CHECK-LABEL: @test_vfma_lane_f32(
409*207e5cccSFangrui Song // CHECK-NEXT:  entry:
410*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
411*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8>
412*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
413*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
414*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> <i32 1, i32 1>
415*207e5cccSFangrui Song // CHECK-NEXT:    [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
416*207e5cccSFangrui Song // CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
417*207e5cccSFangrui Song // CHECK-NEXT:    [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
418*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x float> [[FMLA2]]
419*207e5cccSFangrui Song //
// Fused multiply-add by lane: calls vfma_lane_f32 with lane 1 of the
// 2 x float vector v. The expected IR above splats that lane to 2 elements
// and calls llvm.fma.v2f32 with operands (b, splat, a), after round-tripping
// a and b through <8 x i8> bitcasts.
420*207e5cccSFangrui Song float32x2_t test_vfma_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
421*207e5cccSFangrui Song   return vfma_lane_f32(a, b, v, 1);
422*207e5cccSFangrui Song }
423*207e5cccSFangrui Song 
424*207e5cccSFangrui Song // CHECK-LABEL: @test_vfmaq_lane_f32(
425*207e5cccSFangrui Song // CHECK-NEXT:  entry:
426*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
427*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <16 x i8>
428*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
429*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
430*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
431*207e5cccSFangrui Song // CHECK-NEXT:    [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
432*207e5cccSFangrui Song // CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
433*207e5cccSFangrui Song // CHECK-NEXT:    [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
434*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x float> [[FMLA2]]
435*207e5cccSFangrui Song //
// Fused multiply-add by lane: calls vfmaq_lane_f32 with lane 1 of the
// 2 x float vector v. The expected IR above splats that lane to 4 elements
// and calls llvm.fma.v4f32 with operands (b, splat, a), after round-tripping
// a and b through <16 x i8> bitcasts.
436*207e5cccSFangrui Song float32x4_t test_vfmaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
437*207e5cccSFangrui Song   return vfmaq_lane_f32(a, b, v, 1);
438*207e5cccSFangrui Song }
439*207e5cccSFangrui Song 
440*207e5cccSFangrui Song // CHECK-LABEL: @test_vfma_laneq_f32(
441*207e5cccSFangrui Song // CHECK-NEXT:  entry:
442*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
443*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8>
444*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
445*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
446*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
447*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
448*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> <i32 3, i32 3>
449*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
450*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x float> [[TMP6]]
451*207e5cccSFangrui Song //
// Exercises vfma_laneq_f32 with constant lane index 3. The FileCheck
// assertions above expect lane 3 of the <4 x float> v splatted to two
// elements and passed to @llvm.fma.v2f32 as (splat, b, a); note the splat is
// the first operand here, unlike the _lane_ variants.
452*207e5cccSFangrui Song float32x2_t test_vfma_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
453*207e5cccSFangrui Song   return vfma_laneq_f32(a, b, v, 3);
454*207e5cccSFangrui Song }
455*207e5cccSFangrui Song 
456*207e5cccSFangrui Song // CHECK-LABEL: @test_vfmaq_laneq_f32(
457*207e5cccSFangrui Song // CHECK-NEXT:  entry:
458*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
459*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <16 x i8>
460*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
461*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
462*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
463*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
464*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
465*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
466*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x float> [[TMP6]]
467*207e5cccSFangrui Song //
// Exercises vfmaq_laneq_f32 with constant lane index 3. The FileCheck
// assertions above expect lane 3 of v splatted across <4 x float> and passed
// to @llvm.fma.v4f32 as (splat, b, a).
468*207e5cccSFangrui Song float32x4_t test_vfmaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
469*207e5cccSFangrui Song   return vfmaq_laneq_f32(a, b, v, 3);
470*207e5cccSFangrui Song }
471*207e5cccSFangrui Song 
472*207e5cccSFangrui Song // CHECK-LABEL: @test_vfms_lane_f32(
473*207e5cccSFangrui Song // CHECK-NEXT:  entry:
474*207e5cccSFangrui Song // CHECK-NEXT:    [[FNEG:%.*]] = fneg <2 x float> [[B:%.*]]
475*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
476*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[FNEG]] to <8 x i8>
477*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
478*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
479*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> <i32 1, i32 1>
480*207e5cccSFangrui Song // CHECK-NEXT:    [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
481*207e5cccSFangrui Song // CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
482*207e5cccSFangrui Song // CHECK-NEXT:    [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
483*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x float> [[FMLA2]]
484*207e5cccSFangrui Song //
// Exercises vfms_lane_f32 with constant lane index 1. The FileCheck
// assertions above expect b to be negated with fneg first, then
// @llvm.fma.v2f32 called as (negated b, splat of lane 1 of v, a).
485*207e5cccSFangrui Song float32x2_t test_vfms_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
486*207e5cccSFangrui Song   return vfms_lane_f32(a, b, v, 1);
487*207e5cccSFangrui Song }
488*207e5cccSFangrui Song 
489*207e5cccSFangrui Song // CHECK-LABEL: @test_vfmsq_lane_f32(
490*207e5cccSFangrui Song // CHECK-NEXT:  entry:
491*207e5cccSFangrui Song // CHECK-NEXT:    [[FNEG:%.*]] = fneg <4 x float> [[B:%.*]]
492*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
493*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[FNEG]] to <16 x i8>
494*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
495*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
496*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
497*207e5cccSFangrui Song // CHECK-NEXT:    [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
498*207e5cccSFangrui Song // CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
499*207e5cccSFangrui Song // CHECK-NEXT:    [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
500*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x float> [[FMLA2]]
501*207e5cccSFangrui Song //
// Exercises vfmsq_lane_f32 with constant lane index 1. The FileCheck
// assertions above expect an fneg of b, lane 1 of the <2 x float> v splatted
// to four elements, and @llvm.fma.v4f32 called as (negated b, splat, a).
502*207e5cccSFangrui Song float32x4_t test_vfmsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
503*207e5cccSFangrui Song   return vfmsq_lane_f32(a, b, v, 1);
504*207e5cccSFangrui Song }
505*207e5cccSFangrui Song 
506*207e5cccSFangrui Song // CHECK-LABEL: @test_vfms_laneq_f32(
507*207e5cccSFangrui Song // CHECK-NEXT:  entry:
508*207e5cccSFangrui Song // CHECK-NEXT:    [[FNEG:%.*]] = fneg <2 x float> [[B:%.*]]
509*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
510*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[FNEG]] to <8 x i8>
511*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
512*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
513*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
514*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
515*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> <i32 3, i32 3>
516*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
517*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x float> [[TMP6]]
518*207e5cccSFangrui Song //
// Exercises vfms_laneq_f32 with constant lane index 3. The FileCheck
// assertions above expect an fneg of b, lane 3 of the <4 x float> v splatted
// to two elements, and @llvm.fma.v2f32 called as (splat, negated b, a).
519*207e5cccSFangrui Song float32x2_t test_vfms_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
520*207e5cccSFangrui Song   return vfms_laneq_f32(a, b, v, 3);
521*207e5cccSFangrui Song }
522*207e5cccSFangrui Song 
523*207e5cccSFangrui Song // CHECK-LABEL: @test_vfmsq_laneq_f32(
524*207e5cccSFangrui Song // CHECK-NEXT:  entry:
525*207e5cccSFangrui Song // CHECK-NEXT:    [[FNEG:%.*]] = fneg <4 x float> [[B:%.*]]
526*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
527*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[FNEG]] to <16 x i8>
528*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
529*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
530*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
531*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
532*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
533*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
534*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x float> [[TMP6]]
535*207e5cccSFangrui Song //
// Exercises vfmsq_laneq_f32 with constant lane index 3. The FileCheck
// assertions above expect an fneg of b, lane 3 of v splatted across
// <4 x float>, and @llvm.fma.v4f32 called as (splat, negated b, a).
536*207e5cccSFangrui Song float32x4_t test_vfmsq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
537*207e5cccSFangrui Song   return vfmsq_laneq_f32(a, b, v, 3);
538*207e5cccSFangrui Song }
539*207e5cccSFangrui Song 
540*207e5cccSFangrui Song // CHECK-LABEL: @test_vfmaq_lane_f64(
541*207e5cccSFangrui Song // CHECK-NEXT:  entry:
542*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
543*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x double> [[B:%.*]] to <16 x i8>
544*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8>
545*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
546*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <2 x i32> zeroinitializer
547*207e5cccSFangrui Song // CHECK-NEXT:    [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
548*207e5cccSFangrui Song // CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
549*207e5cccSFangrui Song // CHECK-NEXT:    [[FMLA2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FMLA]], <2 x double> [[LANE]], <2 x double> [[FMLA1]])
550*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x double> [[FMLA2]]
551*207e5cccSFangrui Song //
// Exercises vfmaq_lane_f64 with lane index 0 (v is a <1 x double>). The
// FileCheck assertions above expect a zeroinitializer-mask shufflevector that
// widens v to two elements, then @llvm.fma.v2f64 called as (b, splat, a).
552*207e5cccSFangrui Song float64x2_t test_vfmaq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) {
553*207e5cccSFangrui Song   return vfmaq_lane_f64(a, b, v, 0);
554*207e5cccSFangrui Song }
555*207e5cccSFangrui Song 
556*207e5cccSFangrui Song // CHECK-LABEL: @test_vfmaq_laneq_f64(
557*207e5cccSFangrui Song // CHECK-NEXT:  entry:
558*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
559*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x double> [[B:%.*]] to <16 x i8>
560*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
561*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
562*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
563*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
564*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> <i32 1, i32 1>
565*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
566*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x double> [[TMP6]]
567*207e5cccSFangrui Song //
// Exercises vfmaq_laneq_f64 with constant lane index 1. The FileCheck
// assertions above expect lane 1 of v splatted across <2 x double> and passed
// to @llvm.fma.v2f64 as (splat, b, a).
568*207e5cccSFangrui Song float64x2_t test_vfmaq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) {
569*207e5cccSFangrui Song   return vfmaq_laneq_f64(a, b, v, 1);
570*207e5cccSFangrui Song }
571*207e5cccSFangrui Song 
572*207e5cccSFangrui Song // CHECK-LABEL: @test_vfmsq_lane_f64(
573*207e5cccSFangrui Song // CHECK-NEXT:  entry:
574*207e5cccSFangrui Song // CHECK-NEXT:    [[FNEG:%.*]] = fneg <2 x double> [[B:%.*]]
575*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
576*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x double> [[FNEG]] to <16 x i8>
577*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8>
578*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
579*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <2 x i32> zeroinitializer
580*207e5cccSFangrui Song // CHECK-NEXT:    [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
581*207e5cccSFangrui Song // CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
582*207e5cccSFangrui Song // CHECK-NEXT:    [[FMLA2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FMLA]], <2 x double> [[LANE]], <2 x double> [[FMLA1]])
583*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x double> [[FMLA2]]
584*207e5cccSFangrui Song //
// Exercises vfmsq_lane_f64 with lane index 0 (v is a <1 x double>). The
// FileCheck assertions above expect an fneg of b, a zeroinitializer-mask
// shufflevector widening v to two elements, and @llvm.fma.v2f64 called as
// (negated b, splat, a).
585*207e5cccSFangrui Song float64x2_t test_vfmsq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) {
586*207e5cccSFangrui Song   return vfmsq_lane_f64(a, b, v, 0);
587*207e5cccSFangrui Song }
588*207e5cccSFangrui Song 
589*207e5cccSFangrui Song // CHECK-LABEL: @test_vfmsq_laneq_f64(
590*207e5cccSFangrui Song // CHECK-NEXT:  entry:
591*207e5cccSFangrui Song // CHECK-NEXT:    [[FNEG:%.*]] = fneg <2 x double> [[B:%.*]]
592*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
593*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x double> [[FNEG]] to <16 x i8>
594*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
595*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
596*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
597*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
598*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> <i32 1, i32 1>
599*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
600*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x double> [[TMP6]]
601*207e5cccSFangrui Song //
// Exercises vfmsq_laneq_f64 with constant lane index 1. The FileCheck
// assertions above expect an fneg of b, lane 1 of v splatted across
// <2 x double>, and @llvm.fma.v2f64 called as (splat, negated b, a).
602*207e5cccSFangrui Song float64x2_t test_vfmsq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) {
603*207e5cccSFangrui Song   return vfmsq_laneq_f64(a, b, v, 1);
604*207e5cccSFangrui Song }
605*207e5cccSFangrui Song 
606*207e5cccSFangrui Song // CHECK-LABEL: @test_vfmas_laneq_f32(
607*207e5cccSFangrui Song // CHECK-NEXT:  entry:
608*207e5cccSFangrui Song // CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <4 x float> [[V:%.*]], i32 3
609*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = call float @llvm.fma.f32(float [[B:%.*]], float [[EXTRACT]], float [[A:%.*]])
610*207e5cccSFangrui Song // CHECK-NEXT:    ret float [[TMP0]]
611*207e5cccSFangrui Song //
// Exercises the scalar vfmas_laneq_f32 with constant lane index 3. The
// FileCheck assertions above expect an extractelement of lane 3 from v and a
// scalar @llvm.fma.f32 call as (b, extracted lane, a); no vector splat.
612*207e5cccSFangrui Song float32_t test_vfmas_laneq_f32(float32_t a, float32_t b, float32x4_t v) {
613*207e5cccSFangrui Song   return vfmas_laneq_f32(a, b, v, 3);
614*207e5cccSFangrui Song }
615*207e5cccSFangrui Song 
616*207e5cccSFangrui Song // CHECK-LABEL: @test_vfmsd_lane_f64(
617*207e5cccSFangrui Song // CHECK-NEXT:  entry:
618*207e5cccSFangrui Song // CHECK-NEXT:    [[FNEG:%.*]] = fneg double [[B:%.*]]
619*207e5cccSFangrui Song // CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <1 x double> [[V:%.*]], i32 0
620*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = call double @llvm.fma.f64(double [[FNEG]], double [[EXTRACT]], double [[A:%.*]])
621*207e5cccSFangrui Song // CHECK-NEXT:    ret double [[TMP0]]
622*207e5cccSFangrui Song //
// Exercises the scalar vfmsd_lane_f64 with lane index 0 (v is a
// <1 x double>). The FileCheck assertions above expect an fneg of b, an
// extractelement of lane 0 from v, and @llvm.fma.f64 called as
// (negated b, extracted lane, a).
623*207e5cccSFangrui Song float64_t test_vfmsd_lane_f64(float64_t a, float64_t b, float64x1_t v) {
624*207e5cccSFangrui Song   return vfmsd_lane_f64(a, b, v, 0);
625*207e5cccSFangrui Song }
626*207e5cccSFangrui Song 
627*207e5cccSFangrui Song // CHECK-LABEL: @test_vfmss_laneq_f32(
628*207e5cccSFangrui Song // CHECK-NEXT:  entry:
629*207e5cccSFangrui Song // CHECK-NEXT:    [[FNEG:%.*]] = fneg float [[B:%.*]]
630*207e5cccSFangrui Song // CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <4 x float> [[V:%.*]], i32 3
631*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = call float @llvm.fma.f32(float [[FNEG]], float [[EXTRACT]], float [[A:%.*]])
632*207e5cccSFangrui Song // CHECK-NEXT:    ret float [[TMP0]]
633*207e5cccSFangrui Song //
// Exercises the scalar vfmss_laneq_f32 with constant lane index 3. The
// FileCheck assertions above expect an fneg of b, an extractelement of lane 3
// from v, and @llvm.fma.f32 called as (negated b, extracted lane, a).
634*207e5cccSFangrui Song float32_t test_vfmss_laneq_f32(float32_t a, float32_t b, float32x4_t v) {
635*207e5cccSFangrui Song   return vfmss_laneq_f32(a, b, v, 3);
636*207e5cccSFangrui Song }
637*207e5cccSFangrui Song 
638*207e5cccSFangrui Song // CHECK-LABEL: @test_vfmsd_laneq_f64(
639*207e5cccSFangrui Song // CHECK-NEXT:  entry:
640*207e5cccSFangrui Song // CHECK-NEXT:    [[FNEG:%.*]] = fneg double [[B:%.*]]
641*207e5cccSFangrui Song // CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <2 x double> [[V:%.*]], i32 1
642*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = call double @llvm.fma.f64(double [[FNEG]], double [[EXTRACT]], double [[A:%.*]])
643*207e5cccSFangrui Song // CHECK-NEXT:    ret double [[TMP0]]
644*207e5cccSFangrui Song //
// Exercises the scalar vfmsd_laneq_f64 with constant lane index 1. The
// FileCheck assertions above expect an fneg of b, an extractelement of lane 1
// from the <2 x double> v, and @llvm.fma.f64 called as
// (negated b, extracted lane, a).
645*207e5cccSFangrui Song float64_t test_vfmsd_laneq_f64(float64_t a, float64_t b, float64x2_t v) {
646*207e5cccSFangrui Song   return vfmsd_laneq_f64(a, b, v, 1);
647*207e5cccSFangrui Song }
648*207e5cccSFangrui Song 
649*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlal_lane_s16(
650*207e5cccSFangrui Song // CHECK-NEXT:  entry:
651*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
652*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
653*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
654*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
655*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
656*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
657*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
658*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[ADD]]
659*207e5cccSFangrui Song //
// Exercises vmlal_lane_s16 with constant lane index 3. The FileCheck
// assertions above expect lane 3 of v splatted across <4 x i16>, a call to
// @llvm.aarch64.neon.smull.v4i32 on (b, splat), and the <4 x i32> product
// added to a.
660*207e5cccSFangrui Song int32x4_t test_vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
661*207e5cccSFangrui Song   return vmlal_lane_s16(a, b, v, 3);
662*207e5cccSFangrui Song }
663*207e5cccSFangrui Song 
664*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlal_lane_s32(
665*207e5cccSFangrui Song // CHECK-NEXT:  entry:
666*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
667*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
668*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
669*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
670*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
671*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
672*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
673*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[ADD]]
674*207e5cccSFangrui Song //
// Exercises vmlal_lane_s32 with constant lane index 1. The FileCheck
// assertions above expect lane 1 of v splatted across <2 x i32>, a call to
// @llvm.aarch64.neon.smull.v2i64 on (b, splat), and the <2 x i64> product
// added to a.
675*207e5cccSFangrui Song int64x2_t test_vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
676*207e5cccSFangrui Song   return vmlal_lane_s32(a, b, v, 1);
677*207e5cccSFangrui Song }
678*207e5cccSFangrui Song 
679*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlal_laneq_s16(
680*207e5cccSFangrui Song // CHECK-NEXT:  entry:
681*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
682*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
683*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
684*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
685*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
686*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
687*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
688*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[ADD]]
689*207e5cccSFangrui Song //
// Exercises vmlal_laneq_s16 with constant lane index 7. The FileCheck
// assertions above expect lane 7 of the <8 x i16> v splatted to four
// elements, a call to @llvm.aarch64.neon.smull.v4i32 on (b, splat), and the
// product added to a.
690*207e5cccSFangrui Song int32x4_t test_vmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
691*207e5cccSFangrui Song   return vmlal_laneq_s16(a, b, v, 7);
692*207e5cccSFangrui Song }
693*207e5cccSFangrui Song 
694*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlal_laneq_s32(
695*207e5cccSFangrui Song // CHECK-NEXT:  entry:
696*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
697*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
698*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
699*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
700*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
701*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
702*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
703*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[ADD]]
704*207e5cccSFangrui Song //
// Exercises vmlal_laneq_s32 with constant lane index 3. The FileCheck
// assertions above expect lane 3 of the <4 x i32> v splatted to two elements,
// a call to @llvm.aarch64.neon.smull.v2i64 on (b, splat), and the product
// added to a.
705*207e5cccSFangrui Song int64x2_t test_vmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
706*207e5cccSFangrui Song   return vmlal_laneq_s32(a, b, v, 3);
707*207e5cccSFangrui Song }
708*207e5cccSFangrui Song 
709*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlal_high_lane_s16(
710*207e5cccSFangrui Song // CHECK-NEXT:  entry:
711*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
712*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
713*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
714*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
715*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
716*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
717*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
718*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
719*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[ADD]]
720*207e5cccSFangrui Song //
// Exercises vmlal_high_lane_s16 with constant lane index 3. The FileCheck
// assertions above expect elements 4..7 of b selected by a shufflevector,
// lane 3 of v splatted across <4 x i16>, a call to
// @llvm.aarch64.neon.smull.v4i32 on (upper half of b, splat), and the product
// added to a.
721*207e5cccSFangrui Song int32x4_t test_vmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
722*207e5cccSFangrui Song   return vmlal_high_lane_s16(a, b, v, 3);
723*207e5cccSFangrui Song }
724*207e5cccSFangrui Song 
725*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlal_high_lane_s32(
726*207e5cccSFangrui Song // CHECK-NEXT:  entry:
727*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
728*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
729*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
730*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
731*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
732*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
733*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
734*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
735*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[ADD]]
736*207e5cccSFangrui Song //
// Exercises vmlal_high_lane_s32 with constant lane index 1. The FileCheck
// assertions above expect elements 2..3 of b selected by a shufflevector,
// lane 1 of v splatted across <2 x i32>, a call to
// @llvm.aarch64.neon.smull.v2i64 on (upper half of b, splat), and the product
// added to a.
737*207e5cccSFangrui Song int64x2_t test_vmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
738*207e5cccSFangrui Song   return vmlal_high_lane_s32(a, b, v, 1);
739*207e5cccSFangrui Song }
740*207e5cccSFangrui Song 
741*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlal_high_laneq_s16(
742*207e5cccSFangrui Song // CHECK-NEXT:  entry:
743*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
744*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
745*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
746*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
747*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
748*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
749*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
750*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
751*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[ADD]]
752*207e5cccSFangrui Song //
// Exercises vmlal_high_laneq_s16 with constant lane index 7. The FileCheck
// assertions above expect elements 4..7 of b selected by a shufflevector,
// lane 7 of the <8 x i16> v splatted to four elements, a call to
// @llvm.aarch64.neon.smull.v4i32 on (upper half of b, splat), and the product
// added to a.
753*207e5cccSFangrui Song int32x4_t test_vmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
754*207e5cccSFangrui Song   return vmlal_high_laneq_s16(a, b, v, 7);
755*207e5cccSFangrui Song }
756*207e5cccSFangrui Song 
757*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlal_high_laneq_s32(
758*207e5cccSFangrui Song // CHECK-NEXT:  entry:
759*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
760*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
761*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
762*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
763*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
764*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
765*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
766*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
767*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[ADD]]
768*207e5cccSFangrui Song //
// Exercises vmlal_high_laneq_s32 with constant lane index 3. The FileCheck
// assertions above expect elements 2..3 of b selected by a shufflevector,
// lane 3 of the <4 x i32> v splatted to two elements, a call to
// @llvm.aarch64.neon.smull.v2i64 on (upper half of b, splat), and the product
// added to a.
769*207e5cccSFangrui Song int64x2_t test_vmlal_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
770*207e5cccSFangrui Song   return vmlal_high_laneq_s32(a, b, v, 3);
771*207e5cccSFangrui Song }
772*207e5cccSFangrui Song 
773*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlsl_lane_s16(
774*207e5cccSFangrui Song // CHECK-NEXT:  entry:
775*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
776*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
777*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
778*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
779*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
780*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
781*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
782*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[SUB]]
783*207e5cccSFangrui Song //
// Exercises vmlsl_lane_s16 with constant lane index 3. The FileCheck
// assertions above expect lane 3 of v splatted across <4 x i16>, a call to
// @llvm.aarch64.neon.smull.v4i32 on (b, splat), and the <4 x i32> product
// subtracted from a.
784*207e5cccSFangrui Song int32x4_t test_vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
785*207e5cccSFangrui Song   return vmlsl_lane_s16(a, b, v, 3);
786*207e5cccSFangrui Song }
787*207e5cccSFangrui Song 
788*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlsl_lane_s32(
789*207e5cccSFangrui Song // CHECK-NEXT:  entry:
790*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
791*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
792*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
793*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
794*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
795*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
796*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
797*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[SUB]]
798*207e5cccSFangrui Song //
// Exercises vmlsl_lane_s32 with constant lane index 1. The FileCheck
// assertions above expect lane 1 of v splatted across <2 x i32>, a call to
// @llvm.aarch64.neon.smull.v2i64 on (b, splat), and the <2 x i64> product
// subtracted from a.
799*207e5cccSFangrui Song int64x2_t test_vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
800*207e5cccSFangrui Song   return vmlsl_lane_s32(a, b, v, 1);
801*207e5cccSFangrui Song }
802*207e5cccSFangrui Song 
803*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlsl_laneq_s16(
804*207e5cccSFangrui Song // CHECK-NEXT:  entry:
805*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
806*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
807*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
808*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
809*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
810*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
811*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
812*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[SUB]]
813*207e5cccSFangrui Song //
// Exercises vmlsl_laneq_s16 with constant lane index 7. The FileCheck
// assertions above expect lane 7 of the <8 x i16> v splatted to four
// elements, a call to @llvm.aarch64.neon.smull.v4i32 on (b, splat), and the
// product subtracted from a.
814*207e5cccSFangrui Song int32x4_t test_vmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
815*207e5cccSFangrui Song   return vmlsl_laneq_s16(a, b, v, 7);
816*207e5cccSFangrui Song }
817*207e5cccSFangrui Song 
818*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlsl_laneq_s32(
819*207e5cccSFangrui Song // CHECK-NEXT:  entry:
820*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
821*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
822*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
823*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
824*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
825*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
826*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
827*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[SUB]]
828*207e5cccSFangrui Song //
// Exercises vmlsl_laneq_s32 with lane index 3 (the last of v's 4 elements):
// a, b and v are forwarded unchanged and the intrinsic's result is returned.
// The FileCheck assertions above expect lane 3 of v splatted to 2 elements,
// an smull.v2i64 of b with that splat, and the product subtracted from a.
829*207e5cccSFangrui Song int64x2_t test_vmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
830*207e5cccSFangrui Song   return vmlsl_laneq_s32(a, b, v, 3);
831*207e5cccSFangrui Song }
832*207e5cccSFangrui Song 
833*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlsl_high_lane_s16(
834*207e5cccSFangrui Song // CHECK-NEXT:  entry:
835*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
836*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
837*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
838*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
839*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
840*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
841*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
842*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
843*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[SUB]]
844*207e5cccSFangrui Song //
// Exercises vmlsl_high_lane_s16 with lane index 3 and returns its result.
// The FileCheck assertions above expect elements 4..7 of b to be extracted,
// lane 3 of v splatted to 4 elements, an smull.v4i32 of the two, and the
// product subtracted from a.
845*207e5cccSFangrui Song int32x4_t test_vmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
846*207e5cccSFangrui Song   return vmlsl_high_lane_s16(a, b, v, 3);
847*207e5cccSFangrui Song }
848*207e5cccSFangrui Song 
849*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlsl_high_lane_s32(
850*207e5cccSFangrui Song // CHECK-NEXT:  entry:
851*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
852*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
853*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
854*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
855*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
856*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
857*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
858*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
859*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[SUB]]
860*207e5cccSFangrui Song //
// Exercises vmlsl_high_lane_s32 with lane index 1 and returns its result.
// The FileCheck assertions above expect elements 2..3 of b to be extracted,
// lane 1 of v splatted to 2 elements, an smull.v2i64 of the two, and the
// product subtracted from a.
861*207e5cccSFangrui Song int64x2_t test_vmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
862*207e5cccSFangrui Song   return vmlsl_high_lane_s32(a, b, v, 1);
863*207e5cccSFangrui Song }
864*207e5cccSFangrui Song 
865*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlsl_high_laneq_s16(
866*207e5cccSFangrui Song // CHECK-NEXT:  entry:
867*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
868*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
869*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
870*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
871*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
872*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
873*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
874*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
875*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[SUB]]
876*207e5cccSFangrui Song //
// Exercises vmlsl_high_laneq_s16 with lane index 7 and returns its result.
// The FileCheck assertions above expect elements 4..7 of b to be extracted,
// lane 7 of the 8-element v splatted to 4 elements, an smull.v4i32 of the
// two, and the product subtracted from a.
877*207e5cccSFangrui Song int32x4_t test_vmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
878*207e5cccSFangrui Song   return vmlsl_high_laneq_s16(a, b, v, 7);
879*207e5cccSFangrui Song }
880*207e5cccSFangrui Song 
881*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlsl_high_laneq_s32(
882*207e5cccSFangrui Song // CHECK-NEXT:  entry:
883*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
884*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
885*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
886*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
887*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
888*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
889*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
890*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
891*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[SUB]]
892*207e5cccSFangrui Song //
// Exercises vmlsl_high_laneq_s32 with lane index 3 and returns its result.
// The FileCheck assertions above expect elements 2..3 of b to be extracted,
// lane 3 of the 4-element v splatted to 2 elements, an smull.v2i64 of the
// two, and the product subtracted from a.
893*207e5cccSFangrui Song int64x2_t test_vmlsl_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
894*207e5cccSFangrui Song   return vmlsl_high_laneq_s32(a, b, v, 3);
895*207e5cccSFangrui Song }
896*207e5cccSFangrui Song 
897*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlal_lane_u16(
898*207e5cccSFangrui Song // CHECK-NEXT:  entry:
899*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
900*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
901*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
902*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
903*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
904*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
905*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
906*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[ADD]]
907*207e5cccSFangrui Song //
// Exercises vmlal_lane_u16 with lane index 3 and returns its result.
// The FileCheck assertions above expect lane 3 of v splatted to 4 elements,
// a umull.v4i32 of b with that splat, and the product added to a.
// NOTE(review): the parameters and return use signed vector types although
// the intrinsic is the _u16 variant; presumably this relies on implicit
// vector conversions -- confirm whether uint* types were intended.
908*207e5cccSFangrui Song int32x4_t test_vmlal_lane_u16(int32x4_t a, int16x4_t b, int16x4_t v) {
909*207e5cccSFangrui Song   return vmlal_lane_u16(a, b, v, 3);
910*207e5cccSFangrui Song }
911*207e5cccSFangrui Song 
912*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlal_lane_u32(
913*207e5cccSFangrui Song // CHECK-NEXT:  entry:
914*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
915*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
916*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
917*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
918*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
919*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
920*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
921*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[ADD]]
922*207e5cccSFangrui Song //
// Exercises vmlal_lane_u32 with lane index 1 and returns its result.
// The FileCheck assertions above expect lane 1 of v splatted to 2 elements,
// a umull.v2i64 of b with that splat, and the product added to a.
// NOTE(review): the parameters and return use signed vector types although
// the intrinsic is the _u32 variant; presumably this relies on implicit
// vector conversions -- confirm whether uint* types were intended.
923*207e5cccSFangrui Song int64x2_t test_vmlal_lane_u32(int64x2_t a, int32x2_t b, int32x2_t v) {
924*207e5cccSFangrui Song   return vmlal_lane_u32(a, b, v, 1);
925*207e5cccSFangrui Song }
926*207e5cccSFangrui Song 
927*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlal_laneq_u16(
928*207e5cccSFangrui Song // CHECK-NEXT:  entry:
929*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
930*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
931*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
932*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
933*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
934*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
935*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
936*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[ADD]]
937*207e5cccSFangrui Song //
// Exercises vmlal_laneq_u16 with lane index 7 and returns its result.
// The FileCheck assertions above expect lane 7 of the 8-element v splatted
// to 4 elements, a umull.v4i32 of b with that splat, and the product added
// to a.
// NOTE(review): the parameters and return use signed vector types although
// the intrinsic is the _u16 variant; presumably this relies on implicit
// vector conversions -- confirm whether uint* types were intended.
938*207e5cccSFangrui Song int32x4_t test_vmlal_laneq_u16(int32x4_t a, int16x4_t b, int16x8_t v) {
939*207e5cccSFangrui Song   return vmlal_laneq_u16(a, b, v, 7);
940*207e5cccSFangrui Song }
941*207e5cccSFangrui Song 
942*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlal_laneq_u32(
943*207e5cccSFangrui Song // CHECK-NEXT:  entry:
944*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
945*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
946*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
947*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
948*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
949*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
950*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
951*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[ADD]]
952*207e5cccSFangrui Song //
// Exercises vmlal_laneq_u32 with lane index 3 and returns its result.
// The FileCheck assertions above expect lane 3 of the 4-element v splatted
// to 2 elements, a umull.v2i64 of b with that splat, and the product added
// to a.
// NOTE(review): the parameters and return use signed vector types although
// the intrinsic is the _u32 variant; presumably this relies on implicit
// vector conversions -- confirm whether uint* types were intended.
953*207e5cccSFangrui Song int64x2_t test_vmlal_laneq_u32(int64x2_t a, int32x2_t b, int32x4_t v) {
954*207e5cccSFangrui Song   return vmlal_laneq_u32(a, b, v, 3);
955*207e5cccSFangrui Song }
956*207e5cccSFangrui Song 
957*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlal_high_lane_u16(
958*207e5cccSFangrui Song // CHECK-NEXT:  entry:
959*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
960*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
961*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
962*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
963*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
964*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
965*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
966*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
967*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[ADD]]
968*207e5cccSFangrui Song //
// Exercises vmlal_high_lane_u16 with lane index 3 and returns its result.
// The FileCheck assertions above expect elements 4..7 of b to be extracted,
// lane 3 of v splatted to 4 elements, a umull.v4i32 of the two, and the
// product added to a.
// NOTE(review): the parameters and return use signed vector types although
// the intrinsic is the _u16 variant; presumably this relies on implicit
// vector conversions -- confirm whether uint* types were intended.
969*207e5cccSFangrui Song int32x4_t test_vmlal_high_lane_u16(int32x4_t a, int16x8_t b, int16x4_t v) {
970*207e5cccSFangrui Song   return vmlal_high_lane_u16(a, b, v, 3);
971*207e5cccSFangrui Song }
972*207e5cccSFangrui Song 
973*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlal_high_lane_u32(
974*207e5cccSFangrui Song // CHECK-NEXT:  entry:
975*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
976*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
977*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
978*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
979*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
980*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
981*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
982*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
983*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[ADD]]
984*207e5cccSFangrui Song //
// Exercises vmlal_high_lane_u32 with lane index 1 and returns its result.
// The FileCheck assertions above expect elements 2..3 of b to be extracted,
// lane 1 of v splatted to 2 elements, a umull.v2i64 of the two, and the
// product added to a.
// NOTE(review): the parameters and return use signed vector types although
// the intrinsic is the _u32 variant; presumably this relies on implicit
// vector conversions -- confirm whether uint* types were intended.
985*207e5cccSFangrui Song int64x2_t test_vmlal_high_lane_u32(int64x2_t a, int32x4_t b, int32x2_t v) {
986*207e5cccSFangrui Song   return vmlal_high_lane_u32(a, b, v, 1);
987*207e5cccSFangrui Song }
988*207e5cccSFangrui Song 
989*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlal_high_laneq_u16(
990*207e5cccSFangrui Song // CHECK-NEXT:  entry:
991*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
992*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
993*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
994*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
995*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
996*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
997*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
998*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
999*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[ADD]]
1000*207e5cccSFangrui Song //
// Exercises vmlal_high_laneq_u16 with lane index 7 and returns its result.
// The FileCheck assertions above expect elements 4..7 of b to be extracted,
// lane 7 of the 8-element v splatted to 4 elements, a umull.v4i32 of the
// two, and the product added to a.
// NOTE(review): the parameters and return use signed vector types although
// the intrinsic is the _u16 variant; presumably this relies on implicit
// vector conversions -- confirm whether uint* types were intended.
1001*207e5cccSFangrui Song int32x4_t test_vmlal_high_laneq_u16(int32x4_t a, int16x8_t b, int16x8_t v) {
1002*207e5cccSFangrui Song   return vmlal_high_laneq_u16(a, b, v, 7);
1003*207e5cccSFangrui Song }
1004*207e5cccSFangrui Song 
1005*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlal_high_laneq_u32(
1006*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1007*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
1008*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
1009*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
1010*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
1011*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1012*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
1013*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
1014*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
1015*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[ADD]]
1016*207e5cccSFangrui Song //
// Exercises vmlal_high_laneq_u32 with lane index 3 and returns its result.
// The FileCheck assertions above expect elements 2..3 of b to be extracted,
// lane 3 of the 4-element v splatted to 2 elements, a umull.v2i64 of the
// two, and the product added to a.
// NOTE(review): the parameters and return use signed vector types although
// the intrinsic is the _u32 variant; presumably this relies on implicit
// vector conversions -- confirm whether uint* types were intended.
1017*207e5cccSFangrui Song int64x2_t test_vmlal_high_laneq_u32(int64x2_t a, int32x4_t b, int32x4_t v) {
1018*207e5cccSFangrui Song   return vmlal_high_laneq_u32(a, b, v, 3);
1019*207e5cccSFangrui Song }
1020*207e5cccSFangrui Song 
1021*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlsl_lane_u16(
1022*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1023*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
1024*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
1025*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1026*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
1027*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
1028*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
1029*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
1030*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[SUB]]
1031*207e5cccSFangrui Song //
// Exercises vmlsl_lane_u16 with lane index 3 and returns its result.
// The FileCheck assertions above expect lane 3 of v splatted to 4 elements,
// a umull.v4i32 of b with that splat, and the product subtracted from a.
// NOTE(review): the parameters and return use signed vector types although
// the intrinsic is the _u16 variant; presumably this relies on implicit
// vector conversions -- confirm whether uint* types were intended.
1032*207e5cccSFangrui Song int32x4_t test_vmlsl_lane_u16(int32x4_t a, int16x4_t b, int16x4_t v) {
1033*207e5cccSFangrui Song   return vmlsl_lane_u16(a, b, v, 3);
1034*207e5cccSFangrui Song }
1035*207e5cccSFangrui Song 
1036*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlsl_lane_u32(
1037*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1038*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
1039*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
1040*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
1041*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
1042*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
1043*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
1044*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
1045*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[SUB]]
1046*207e5cccSFangrui Song //
// Exercises vmlsl_lane_u32 with lane index 1 and returns its result.
// The FileCheck assertions above expect lane 1 of v splatted to 2 elements,
// a umull.v2i64 of b with that splat, and the product subtracted from a.
// NOTE(review): the parameters and return use signed vector types although
// the intrinsic is the _u32 variant; presumably this relies on implicit
// vector conversions -- confirm whether uint* types were intended.
1047*207e5cccSFangrui Song int64x2_t test_vmlsl_lane_u32(int64x2_t a, int32x2_t b, int32x2_t v) {
1048*207e5cccSFangrui Song   return vmlsl_lane_u32(a, b, v, 1);
1049*207e5cccSFangrui Song }
1050*207e5cccSFangrui Song 
1051*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlsl_laneq_u16(
1052*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1053*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
1054*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
1055*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
1056*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
1057*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
1058*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
1059*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
1060*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[SUB]]
1061*207e5cccSFangrui Song //
// Exercises vmlsl_laneq_u16 with lane index 7 and returns its result.
// The FileCheck assertions above expect lane 7 of the 8-element v splatted
// to 4 elements, a umull.v4i32 of b with that splat, and the product
// subtracted from a.
// NOTE(review): the parameters and return use signed vector types although
// the intrinsic is the _u16 variant; presumably this relies on implicit
// vector conversions -- confirm whether uint* types were intended.
1062*207e5cccSFangrui Song int32x4_t test_vmlsl_laneq_u16(int32x4_t a, int16x4_t b, int16x8_t v) {
1063*207e5cccSFangrui Song   return vmlsl_laneq_u16(a, b, v, 7);
1064*207e5cccSFangrui Song }
1065*207e5cccSFangrui Song 
1066*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlsl_laneq_u32(
1067*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1068*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
1069*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
1070*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
1071*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
1072*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
1073*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
1074*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
1075*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[SUB]]
1076*207e5cccSFangrui Song //
// Exercises vmlsl_laneq_u32 with lane index 3 and returns its result.
// The FileCheck assertions above expect lane 3 of the 4-element v splatted
// to 2 elements, a umull.v2i64 of b with that splat, and the product
// subtracted from a.
// NOTE(review): the parameters and return use signed vector types although
// the intrinsic is the _u32 variant; presumably this relies on implicit
// vector conversions -- confirm whether uint* types were intended.
1077*207e5cccSFangrui Song int64x2_t test_vmlsl_laneq_u32(int64x2_t a, int32x2_t b, int32x4_t v) {
1078*207e5cccSFangrui Song   return vmlsl_laneq_u32(a, b, v, 3);
1079*207e5cccSFangrui Song }
1080*207e5cccSFangrui Song 
1081*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlsl_high_lane_u16(
1082*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1083*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1084*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
1085*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
1086*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1087*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
1088*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
1089*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
1090*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
1091*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[SUB]]
1092*207e5cccSFangrui Song //
// Exercises vmlsl_high_lane_u16 with lane index 3 and returns its result.
// The FileCheck assertions above expect elements 4..7 of b to be extracted,
// lane 3 of v splatted to 4 elements, a umull.v4i32 of the two, and the
// product subtracted from a.
// NOTE(review): the parameters and return use signed vector types although
// the intrinsic is the _u16 variant; presumably this relies on implicit
// vector conversions -- confirm whether uint* types were intended.
1093*207e5cccSFangrui Song int32x4_t test_vmlsl_high_lane_u16(int32x4_t a, int16x8_t b, int16x4_t v) {
1094*207e5cccSFangrui Song   return vmlsl_high_lane_u16(a, b, v, 3);
1095*207e5cccSFangrui Song }
1096*207e5cccSFangrui Song 
1097*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlsl_high_lane_u32(
1098*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1099*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
1100*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
1101*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
1102*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
1103*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1104*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
1105*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
1106*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
1107*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[SUB]]
1108*207e5cccSFangrui Song //
// Exercises vmlsl_high_lane_u32 with lane index 1 and returns its result.
// The FileCheck assertions above expect elements 2..3 of b to be extracted,
// lane 1 of v splatted to 2 elements, a umull.v2i64 of the two, and the
// product subtracted from a.
// NOTE(review): the parameters and return use signed vector types although
// the intrinsic is the _u32 variant; presumably this relies on implicit
// vector conversions -- confirm whether uint* types were intended.
1109*207e5cccSFangrui Song int64x2_t test_vmlsl_high_lane_u32(int64x2_t a, int32x4_t b, int32x2_t v) {
1110*207e5cccSFangrui Song   return vmlsl_high_lane_u32(a, b, v, 1);
1111*207e5cccSFangrui Song }
1112*207e5cccSFangrui Song 
1113*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlsl_high_laneq_u16(
1114*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1115*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1116*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
1117*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
1118*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
1119*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
1120*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
1121*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
1122*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
1123*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[SUB]]
1124*207e5cccSFangrui Song //
// Exercises vmlsl_high_laneq_u16 with lane index 7 and returns its result.
// The FileCheck assertions above expect elements 4..7 of b to be extracted,
// lane 7 of the 8-element v splatted to 4 elements, a umull.v4i32 of the
// two, and the product subtracted from a.
// NOTE(review): the parameters and return use signed vector types although
// the intrinsic is the _u16 variant; presumably this relies on implicit
// vector conversions -- confirm whether uint* types were intended.
1125*207e5cccSFangrui Song int32x4_t test_vmlsl_high_laneq_u16(int32x4_t a, int16x8_t b, int16x8_t v) {
1126*207e5cccSFangrui Song   return vmlsl_high_laneq_u16(a, b, v, 7);
1127*207e5cccSFangrui Song }
1128*207e5cccSFangrui Song 
1129*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlsl_high_laneq_u32(
1130*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1131*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
1132*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
1133*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
1134*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
1135*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1136*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
1137*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
1138*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
1139*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[SUB]]
1140*207e5cccSFangrui Song //
// Exercises vmlsl_high_laneq_u32 with lane index 3 and returns its result.
// The FileCheck assertions above expect elements 2..3 of b to be extracted,
// lane 3 of the 4-element v splatted to 2 elements, a umull.v2i64 of the
// two, and the product subtracted from a.
// NOTE(review): the parameters and return use signed vector types although
// the intrinsic is the _u32 variant; presumably this relies on implicit
// vector conversions -- confirm whether uint* types were intended.
1141*207e5cccSFangrui Song int64x2_t test_vmlsl_high_laneq_u32(int64x2_t a, int32x4_t b, int32x4_t v) {
1142*207e5cccSFangrui Song   return vmlsl_high_laneq_u32(a, b, v, 3);
1143*207e5cccSFangrui Song }
1144*207e5cccSFangrui Song 
1145*207e5cccSFangrui Song // CHECK-LABEL: @test_vmull_lane_s16(
1146*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1147*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
1148*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
1149*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1150*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
1151*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
1152*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
1153*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
1154*207e5cccSFangrui Song //
// Exercises vmull_lane_s16 with lane index 3 and returns its result.
// The FileCheck assertions above expect lane 3 of v splatted to 4 elements
// and an smull.v4i32 of a with that splat, returned directly.
1155*207e5cccSFangrui Song int32x4_t test_vmull_lane_s16(int16x4_t a, int16x4_t v) {
1156*207e5cccSFangrui Song   return vmull_lane_s16(a, v, 3);
1157*207e5cccSFangrui Song }
1158*207e5cccSFangrui Song 
1159*207e5cccSFangrui Song // CHECK-LABEL: @test_vmull_lane_s32(
1160*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1161*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
1162*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
1163*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
1164*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
1165*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
1166*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
1167*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
1168*207e5cccSFangrui Song //
// Exercises vmull_lane_s32 with lane index 1 and returns its result.
// The FileCheck assertions above expect lane 1 of v splatted to 2 elements
// and an smull.v2i64 of a with that splat, returned directly.
1169*207e5cccSFangrui Song int64x2_t test_vmull_lane_s32(int32x2_t a, int32x2_t v) {
1170*207e5cccSFangrui Song   return vmull_lane_s32(a, v, 1);
1171*207e5cccSFangrui Song }
1172*207e5cccSFangrui Song 
1173*207e5cccSFangrui Song // CHECK-LABEL: @test_vmull_lane_u16(
1174*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1175*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
1176*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
1177*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1178*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
1179*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
1180*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
1181*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
1182*207e5cccSFangrui Song //
// Exercises vmull_lane_u16 with lane index 3 and returns its result.
// The FileCheck assertions above expect lane 3 of v splatted to 4 elements
// and a umull.v4i32 of a with that splat, returned directly.
1183*207e5cccSFangrui Song uint32x4_t test_vmull_lane_u16(uint16x4_t a, uint16x4_t v) {
1184*207e5cccSFangrui Song   return vmull_lane_u16(a, v, 3);
1185*207e5cccSFangrui Song }
1186*207e5cccSFangrui Song 
1187*207e5cccSFangrui Song // CHECK-LABEL: @test_vmull_lane_u32(
1188*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1189*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
1190*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
1191*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
1192*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
1193*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
1194*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
1195*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
1196*207e5cccSFangrui Song //
// Exercises vmull_lane_u32 with lane index 1 and returns its result.
// The FileCheck assertions above expect lane 1 of v splatted to 2 elements
// and a umull.v2i64 of a with that splat, returned directly.
1197*207e5cccSFangrui Song uint64x2_t test_vmull_lane_u32(uint32x2_t a, uint32x2_t v) {
1198*207e5cccSFangrui Song   return vmull_lane_u32(a, v, 1);
1199*207e5cccSFangrui Song }
1200*207e5cccSFangrui Song 
1201*207e5cccSFangrui Song // CHECK-LABEL: @test_vmull_high_lane_s16(
1202*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1203*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1204*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
1205*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
1206*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1207*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
1208*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
1209*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
1210*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
1211*207e5cccSFangrui Song //
// Codegen test for vmull_high_lane_s16 with lane index 3: the expected IR
// above extracts elements 4..7 of a, broadcasts lane 3 of v, and calls
// llvm.aarch64.neon.smull.v4i32 on the two.
1212*207e5cccSFangrui Song int32x4_t test_vmull_high_lane_s16(int16x8_t a, int16x4_t v) {
1213*207e5cccSFangrui Song   return vmull_high_lane_s16(a, v, 3);
1214*207e5cccSFangrui Song }
1215*207e5cccSFangrui Song 
1216*207e5cccSFangrui Song // CHECK-LABEL: @test_vmull_high_lane_s32(
1217*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1218*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
1219*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
1220*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
1221*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
1222*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1223*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
1224*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
1225*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
1226*207e5cccSFangrui Song //
// Codegen test for vmull_high_lane_s32 with lane index 1: the expected IR
// above extracts elements 2..3 of a, broadcasts lane 1 of v, and calls
// llvm.aarch64.neon.smull.v2i64 on the two.
1227*207e5cccSFangrui Song int64x2_t test_vmull_high_lane_s32(int32x4_t a, int32x2_t v) {
1228*207e5cccSFangrui Song   return vmull_high_lane_s32(a, v, 1);
1229*207e5cccSFangrui Song }
1230*207e5cccSFangrui Song 
1231*207e5cccSFangrui Song // CHECK-LABEL: @test_vmull_high_lane_u16(
1232*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1233*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1234*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
1235*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
1236*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1237*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
1238*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
1239*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
1240*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
1241*207e5cccSFangrui Song //
// Codegen test for vmull_high_lane_u16 with lane index 3: the expected IR
// above extracts elements 4..7 of a, broadcasts lane 3 of v, and calls
// llvm.aarch64.neon.umull.v4i32 on the two.
1242*207e5cccSFangrui Song uint32x4_t test_vmull_high_lane_u16(uint16x8_t a, uint16x4_t v) {
1243*207e5cccSFangrui Song   return vmull_high_lane_u16(a, v, 3);
1244*207e5cccSFangrui Song }
1245*207e5cccSFangrui Song 
1246*207e5cccSFangrui Song // CHECK-LABEL: @test_vmull_high_lane_u32(
1247*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1248*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
1249*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
1250*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
1251*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
1252*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1253*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
1254*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
1255*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
1256*207e5cccSFangrui Song //
// Codegen test for vmull_high_lane_u32 with lane index 1: the expected IR
// above extracts elements 2..3 of a, broadcasts lane 1 of v, and calls
// llvm.aarch64.neon.umull.v2i64 on the two.
1257*207e5cccSFangrui Song uint64x2_t test_vmull_high_lane_u32(uint32x4_t a, uint32x2_t v) {
1258*207e5cccSFangrui Song   return vmull_high_lane_u32(a, v, 1);
1259*207e5cccSFangrui Song }
1260*207e5cccSFangrui Song 
1261*207e5cccSFangrui Song // CHECK-LABEL: @test_vmull_laneq_s16(
1262*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1263*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
1264*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
1265*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
1266*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
1267*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
1268*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
1269*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
1270*207e5cccSFangrui Song //
// Codegen test for vmull_laneq_s16 with lane index 7: the expected IR above
// broadcasts lane 7 of the 8-element v into a 4-element vector and calls
// llvm.aarch64.neon.smull.v4i32 with a.
1271*207e5cccSFangrui Song int32x4_t test_vmull_laneq_s16(int16x4_t a, int16x8_t v) {
1272*207e5cccSFangrui Song   return vmull_laneq_s16(a, v, 7);
1273*207e5cccSFangrui Song }
1274*207e5cccSFangrui Song 
1275*207e5cccSFangrui Song // CHECK-LABEL: @test_vmull_laneq_s32(
1276*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1277*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
1278*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
1279*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
1280*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
1281*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
1282*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
1283*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
1284*207e5cccSFangrui Song //
// Codegen test for vmull_laneq_s32 with lane index 3: the expected IR above
// broadcasts lane 3 of the 4-element v into a 2-element vector and calls
// llvm.aarch64.neon.smull.v2i64 with a.
1285*207e5cccSFangrui Song int64x2_t test_vmull_laneq_s32(int32x2_t a, int32x4_t v) {
1286*207e5cccSFangrui Song   return vmull_laneq_s32(a, v, 3);
1287*207e5cccSFangrui Song }
1288*207e5cccSFangrui Song 
1289*207e5cccSFangrui Song // CHECK-LABEL: @test_vmull_laneq_u16(
1290*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1291*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
1292*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
1293*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
1294*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
1295*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
1296*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
1297*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
1298*207e5cccSFangrui Song //
// Codegen test for vmull_laneq_u16 with lane index 7: the expected IR above
// broadcasts lane 7 of the 8-element v into a 4-element vector and calls
// llvm.aarch64.neon.umull.v4i32 with a.
1299*207e5cccSFangrui Song uint32x4_t test_vmull_laneq_u16(uint16x4_t a, uint16x8_t v) {
1300*207e5cccSFangrui Song   return vmull_laneq_u16(a, v, 7);
1301*207e5cccSFangrui Song }
1302*207e5cccSFangrui Song 
1303*207e5cccSFangrui Song // CHECK-LABEL: @test_vmull_laneq_u32(
1304*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1305*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
1306*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
1307*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
1308*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
1309*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
1310*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
1311*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
1312*207e5cccSFangrui Song //
// Codegen test for vmull_laneq_u32 with lane index 3: the expected IR above
// broadcasts lane 3 of the 4-element v into a 2-element vector and calls
// llvm.aarch64.neon.umull.v2i64 with a.
1313*207e5cccSFangrui Song uint64x2_t test_vmull_laneq_u32(uint32x2_t a, uint32x4_t v) {
1314*207e5cccSFangrui Song   return vmull_laneq_u32(a, v, 3);
1315*207e5cccSFangrui Song }
1316*207e5cccSFangrui Song 
1317*207e5cccSFangrui Song // CHECK-LABEL: @test_vmull_high_laneq_s16(
1318*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1319*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1320*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
1321*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
1322*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
1323*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
1324*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
1325*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
1326*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
1327*207e5cccSFangrui Song //
// Codegen test for vmull_high_laneq_s16 with lane index 7: the expected IR
// above extracts elements 4..7 of a, broadcasts lane 7 of v, and calls
// llvm.aarch64.neon.smull.v4i32 on the two.
1328*207e5cccSFangrui Song int32x4_t test_vmull_high_laneq_s16(int16x8_t a, int16x8_t v) {
1329*207e5cccSFangrui Song   return vmull_high_laneq_s16(a, v, 7);
1330*207e5cccSFangrui Song }
1331*207e5cccSFangrui Song 
1332*207e5cccSFangrui Song // CHECK-LABEL: @test_vmull_high_laneq_s32(
1333*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1334*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
1335*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
1336*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
1337*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
1338*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1339*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
1340*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
1341*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
1342*207e5cccSFangrui Song //
// Codegen test for vmull_high_laneq_s32 with lane index 3: the expected IR
// above extracts elements 2..3 of a, broadcasts lane 3 of v, and calls
// llvm.aarch64.neon.smull.v2i64 on the two.
1343*207e5cccSFangrui Song int64x2_t test_vmull_high_laneq_s32(int32x4_t a, int32x4_t v) {
1344*207e5cccSFangrui Song   return vmull_high_laneq_s32(a, v, 3);
1345*207e5cccSFangrui Song }
1346*207e5cccSFangrui Song 
1347*207e5cccSFangrui Song // CHECK-LABEL: @test_vmull_high_laneq_u16(
1348*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1349*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1350*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
1351*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
1352*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
1353*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
1354*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
1355*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
1356*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
1357*207e5cccSFangrui Song //
// Codegen test for vmull_high_laneq_u16 with lane index 7: the expected IR
// above extracts elements 4..7 of a, broadcasts lane 7 of v, and calls
// llvm.aarch64.neon.umull.v4i32 on the two.
1358*207e5cccSFangrui Song uint32x4_t test_vmull_high_laneq_u16(uint16x8_t a, uint16x8_t v) {
1359*207e5cccSFangrui Song   return vmull_high_laneq_u16(a, v, 7);
1360*207e5cccSFangrui Song }
1361*207e5cccSFangrui Song 
1362*207e5cccSFangrui Song // CHECK-LABEL: @test_vmull_high_laneq_u32(
1363*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1364*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
1365*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
1366*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
1367*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
1368*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1369*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
1370*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
1371*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
1372*207e5cccSFangrui Song //
// Codegen test for vmull_high_laneq_u32 with lane index 3: the expected IR
// above extracts elements 2..3 of a, broadcasts lane 3 of v, and calls
// llvm.aarch64.neon.umull.v2i64 on the two.
1373*207e5cccSFangrui Song uint64x2_t test_vmull_high_laneq_u32(uint32x4_t a, uint32x4_t v) {
1374*207e5cccSFangrui Song   return vmull_high_laneq_u32(a, v, 3);
1375*207e5cccSFangrui Song }
1376*207e5cccSFangrui Song 
1377*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmlal_lane_s16(
1378*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1379*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
1380*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
1381*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1382*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
1383*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
1384*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
1385*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
1386*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
1387*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[VQDMLAL_V3_I]]
1388*207e5cccSFangrui Song //
// Codegen test for vqdmlal_lane_s16 with lane index 3: the expected IR above
// broadcasts lane 3 of v, calls llvm.aarch64.neon.sqdmull.v4i32 on b and that
// broadcast, then combines the product with a via llvm.aarch64.neon.sqadd.v4i32.
1389*207e5cccSFangrui Song int32x4_t test_vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
1390*207e5cccSFangrui Song   return vqdmlal_lane_s16(a, b, v, 3);
1391*207e5cccSFangrui Song }
1392*207e5cccSFangrui Song 
1393*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmlal_lane_s32(
1394*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1395*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
1396*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
1397*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
1398*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
1399*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
1400*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
1401*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
1402*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
1403*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[VQDMLAL_V3_I]]
1404*207e5cccSFangrui Song //
// Codegen test for vqdmlal_lane_s32 with lane index 1: the expected IR above
// broadcasts lane 1 of v, calls llvm.aarch64.neon.sqdmull.v2i64 on b and that
// broadcast, then combines the product with a via llvm.aarch64.neon.sqadd.v2i64.
1405*207e5cccSFangrui Song int64x2_t test_vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
1406*207e5cccSFangrui Song   return vqdmlal_lane_s32(a, b, v, 1);
1407*207e5cccSFangrui Song }
1408*207e5cccSFangrui Song 
1409*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmlal_high_lane_s16(
1410*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1411*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1412*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
1413*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
1414*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1415*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
1416*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
1417*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
1418*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
1419*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
1420*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[VQDMLAL_V3_I]]
1421*207e5cccSFangrui Song //
// Codegen test for vqdmlal_high_lane_s16 with lane index 3: the expected IR
// above extracts elements 4..7 of b, broadcasts lane 3 of v, calls
// llvm.aarch64.neon.sqdmull.v4i32 on the two, then combines the product with a
// via llvm.aarch64.neon.sqadd.v4i32.
1422*207e5cccSFangrui Song int32x4_t test_vqdmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
1423*207e5cccSFangrui Song   return vqdmlal_high_lane_s16(a, b, v, 3);
1424*207e5cccSFangrui Song }
1425*207e5cccSFangrui Song 
1426*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmlal_high_lane_s32(
1427*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1428*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
1429*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
1430*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
1431*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
1432*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
1433*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1434*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
1435*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
1436*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
1437*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[VQDMLAL_V3_I]]
1438*207e5cccSFangrui Song //
// Codegen test for vqdmlal_high_lane_s32 with lane index 1: the expected IR
// above extracts elements 2..3 of b, broadcasts lane 1 of v, calls
// llvm.aarch64.neon.sqdmull.v2i64 on the two, then combines the product with a
// via llvm.aarch64.neon.sqadd.v2i64.
1439*207e5cccSFangrui Song int64x2_t test_vqdmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
1440*207e5cccSFangrui Song   return vqdmlal_high_lane_s32(a, b, v, 1);
1441*207e5cccSFangrui Song }
1442*207e5cccSFangrui Song 
1443*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmlsl_lane_s16(
1444*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1445*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
1446*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
1447*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1448*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
1449*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
1450*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
1451*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
1452*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
1453*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[VQDMLSL_V3_I]]
1454*207e5cccSFangrui Song //
// Codegen test for vqdmlsl_lane_s16 with lane index 3: the expected IR above
// broadcasts lane 3 of v, calls llvm.aarch64.neon.sqdmull.v4i32 on b and that
// broadcast, then combines the product with a via llvm.aarch64.neon.sqsub.v4i32.
1455*207e5cccSFangrui Song int32x4_t test_vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
1456*207e5cccSFangrui Song   return vqdmlsl_lane_s16(a, b, v, 3);
1457*207e5cccSFangrui Song }
1458*207e5cccSFangrui Song 
1459*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmlsl_lane_s32(
1460*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1461*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
1462*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
1463*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
1464*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
1465*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
1466*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
1467*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
1468*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
1469*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[VQDMLSL_V3_I]]
1470*207e5cccSFangrui Song //
// Codegen test for vqdmlsl_lane_s32 with lane index 1: the expected IR above
// broadcasts lane 1 of v, calls llvm.aarch64.neon.sqdmull.v2i64 on b and that
// broadcast, then combines the product with a via llvm.aarch64.neon.sqsub.v2i64.
1471*207e5cccSFangrui Song int64x2_t test_vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
1472*207e5cccSFangrui Song   return vqdmlsl_lane_s32(a, b, v, 1);
1473*207e5cccSFangrui Song }
1474*207e5cccSFangrui Song 
1475*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmlsl_high_lane_s16(
1476*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1477*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1478*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
1479*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
1480*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1481*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
1482*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
1483*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
1484*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
1485*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
1486*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[VQDMLSL_V3_I]]
1487*207e5cccSFangrui Song //
// Codegen test for vqdmlsl_high_lane_s16 with lane index 3: the expected IR
// above extracts elements 4..7 of b, broadcasts lane 3 of v, calls
// llvm.aarch64.neon.sqdmull.v4i32 on the two, then combines the product with a
// via llvm.aarch64.neon.sqsub.v4i32.
1488*207e5cccSFangrui Song int32x4_t test_vqdmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
1489*207e5cccSFangrui Song   return vqdmlsl_high_lane_s16(a, b, v, 3);
1490*207e5cccSFangrui Song }
1491*207e5cccSFangrui Song 
1492*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmlsl_high_lane_s32(
1493*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1494*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
1495*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
1496*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
1497*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
1498*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
1499*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1500*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
1501*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
1502*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
1503*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[VQDMLSL_V3_I]]
1504*207e5cccSFangrui Song //
// Codegen test for vqdmlsl_high_lane_s32 with lane index 1: the expected IR
// above extracts elements 2..3 of b, broadcasts lane 1 of v, calls
// llvm.aarch64.neon.sqdmull.v2i64 on the two, then combines the product with a
// via llvm.aarch64.neon.sqsub.v2i64.
1505*207e5cccSFangrui Song int64x2_t test_vqdmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
1506*207e5cccSFangrui Song   return vqdmlsl_high_lane_s32(a, b, v, 1);
1507*207e5cccSFangrui Song }
1508*207e5cccSFangrui Song 
1509*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmull_lane_s16(
1510*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1511*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
1512*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
1513*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1514*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
1515*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
1516*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
1517*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
1518*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[VQDMULL_V2_I]]
1519*207e5cccSFangrui Song //
// Codegen test for vqdmull_lane_s16 with lane index 3: the expected IR above
// broadcasts lane 3 of v and calls llvm.aarch64.neon.sqdmull.v4i32 with a,
// returning that call's result.
1520*207e5cccSFangrui Song int32x4_t test_vqdmull_lane_s16(int16x4_t a, int16x4_t v) {
1521*207e5cccSFangrui Song   return vqdmull_lane_s16(a, v, 3);
1522*207e5cccSFangrui Song }
1523*207e5cccSFangrui Song 
1524*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmull_lane_s32(
1525*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1526*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
1527*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
1528*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
1529*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
1530*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
1531*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
1532*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
1533*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[VQDMULL_V2_I]]
1534*207e5cccSFangrui Song //
// Codegen test for vqdmull_lane_s32 with lane index 1: the expected IR above
// broadcasts lane 1 of v and calls llvm.aarch64.neon.sqdmull.v2i64 with a,
// returning that call's result.
1535*207e5cccSFangrui Song int64x2_t test_vqdmull_lane_s32(int32x2_t a, int32x2_t v) {
1536*207e5cccSFangrui Song   return vqdmull_lane_s32(a, v, 1);
1537*207e5cccSFangrui Song }
1538*207e5cccSFangrui Song 
1539*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmull_laneq_s16(
1540*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1541*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
1542*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
1543*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1544*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
1545*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
1546*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
1547*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
1548*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[VQDMULL_V2_I]]
1549*207e5cccSFangrui Song //
// Codegen test for vqdmull_laneq_s16 with lane index 3: the expected IR above
// broadcasts lane 3 of the 8-element v into a 4-element vector and calls
// llvm.aarch64.neon.sqdmull.v4i32 with a, returning that call's result.
1550*207e5cccSFangrui Song int32x4_t test_vqdmull_laneq_s16(int16x4_t a, int16x8_t v) {
1551*207e5cccSFangrui Song   return vqdmull_laneq_s16(a, v, 3);
1552*207e5cccSFangrui Song }
1553*207e5cccSFangrui Song 
1554*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmull_laneq_s32(
1555*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1556*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
1557*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
1558*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
1559*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
1560*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
1561*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
1562*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
1563*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[VQDMULL_V2_I]]
1564*207e5cccSFangrui Song //
// Codegen test for vqdmull_laneq_s32 with lane index 3: the expected IR above
// broadcasts lane 3 of the 4-element v into a 2-element vector and calls
// llvm.aarch64.neon.sqdmull.v2i64 with a, returning that call's result.
1565*207e5cccSFangrui Song int64x2_t test_vqdmull_laneq_s32(int32x2_t a, int32x4_t v) {
1566*207e5cccSFangrui Song   return vqdmull_laneq_s32(a, v, 3);
1567*207e5cccSFangrui Song }
1568*207e5cccSFangrui Song 
1569*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmull_high_lane_s16(
1570*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1571*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1572*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
1573*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
1574*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1575*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
1576*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
1577*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
1578*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
1579*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[VQDMULL_V2_I]]
1580*207e5cccSFangrui Song //
// Codegen test for vqdmull_high_lane_s16 with lane index 3: the expected IR
// above extracts elements 4..7 of a, broadcasts lane 3 of v, and calls
// llvm.aarch64.neon.sqdmull.v4i32 on the two, returning that call's result.
1581*207e5cccSFangrui Song int32x4_t test_vqdmull_high_lane_s16(int16x8_t a, int16x4_t v) {
1582*207e5cccSFangrui Song   return vqdmull_high_lane_s16(a, v, 3);
1583*207e5cccSFangrui Song }
1584*207e5cccSFangrui Song 
1585*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmull_high_lane_s32(
1586*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1587*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
1588*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
1589*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
1590*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
1591*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1592*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
1593*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
1594*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
1595*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[VQDMULL_V2_I]]
1596*207e5cccSFangrui Song //
1597*207e5cccSFangrui Song int64x2_t test_vqdmull_high_lane_s32(int32x4_t a, int32x2_t v) {
  // Lane index 1. Expected IR: elements 2-3 of a and a broadcast of element 1
  // of v are passed to @llvm.aarch64.neon.sqdmull.v2i64.
1598*207e5cccSFangrui Song   return vqdmull_high_lane_s32(a, v, 1);
1599*207e5cccSFangrui Song }
1600*207e5cccSFangrui Song 
1601*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmull_high_laneq_s16(
1602*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1603*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1604*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
1605*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
1606*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
1607*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
1608*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
1609*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
1610*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
1611*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[VQDMULL_V2_I]]
1612*207e5cccSFangrui Song //
1613*207e5cccSFangrui Song int32x4_t test_vqdmull_high_laneq_s16(int16x8_t a, int16x8_t v) {
  // Lane index 7. Expected IR: elements 4-7 of a and a broadcast of element 7
  // of v are passed to @llvm.aarch64.neon.sqdmull.v4i32.
1614*207e5cccSFangrui Song   return vqdmull_high_laneq_s16(a, v, 7);
1615*207e5cccSFangrui Song }
1616*207e5cccSFangrui Song 
1617*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmull_high_laneq_s32(
1618*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1619*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
1620*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
1621*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
1622*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
1623*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1624*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
1625*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
1626*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
1627*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[VQDMULL_V2_I]]
1628*207e5cccSFangrui Song //
1629*207e5cccSFangrui Song int64x2_t test_vqdmull_high_laneq_s32(int32x4_t a, int32x4_t v) {
  // Lane index 3. Expected IR: elements 2-3 of a and a broadcast of element 3
  // of v are passed to @llvm.aarch64.neon.sqdmull.v2i64.
1630*207e5cccSFangrui Song   return vqdmull_high_laneq_s32(a, v, 3);
1631*207e5cccSFangrui Song }
1632*207e5cccSFangrui Song 
1633*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmulh_lane_s16(
1634*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1635*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
1636*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
1637*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
1638*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
1639*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULH_LANE_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.lane.v4i16.v4i16(<4 x i16> [[VQDMULH_LANE_V]], <4 x i16> [[VQDMULH_LANE_V1]], i32 3)
1640*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i16> [[VQDMULH_LANE_V2]]
1641*207e5cccSFangrui Song //
1642*207e5cccSFangrui Song int16x4_t test_vqdmulh_lane_s16(int16x4_t a, int16x4_t v) {
  // Lane index 3. Expected IR passes the lane as an i32 operand to
  // @llvm.aarch64.neon.sqdmulh.lane.v4i16.v4i16 (no shufflevector is expected).
1643*207e5cccSFangrui Song   return vqdmulh_lane_s16(a, v, 3);
1644*207e5cccSFangrui Song }
1645*207e5cccSFangrui Song 
1646*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmulhq_lane_s16(
1647*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1648*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
1649*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
1650*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
1651*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
1652*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULHQ_LANE_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.lane.v8i16.v4i16(<8 x i16> [[VQDMULHQ_LANE_V]], <4 x i16> [[VQDMULHQ_LANE_V1]], i32 3)
1653*207e5cccSFangrui Song // CHECK-NEXT:    ret <8 x i16> [[VQDMULHQ_LANE_V2]]
1654*207e5cccSFangrui Song //
1655*207e5cccSFangrui Song int16x8_t test_vqdmulhq_lane_s16(int16x8_t a, int16x4_t v) {
  // Lane index 3. Expected IR passes the lane as an i32 operand to
  // @llvm.aarch64.neon.sqdmulh.lane.v8i16.v4i16.
1656*207e5cccSFangrui Song   return vqdmulhq_lane_s16(a, v, 3);
1657*207e5cccSFangrui Song }
1658*207e5cccSFangrui Song 
1659*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmulh_lane_s32(
1660*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1661*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
1662*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
1663*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
1664*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
1665*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULH_LANE_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.lane.v2i32.v2i32(<2 x i32> [[VQDMULH_LANE_V]], <2 x i32> [[VQDMULH_LANE_V1]], i32 1)
1666*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i32> [[VQDMULH_LANE_V2]]
1667*207e5cccSFangrui Song //
1668*207e5cccSFangrui Song int32x2_t test_vqdmulh_lane_s32(int32x2_t a, int32x2_t v) {
  // Lane index 1. Expected IR passes the lane as an i32 operand to
  // @llvm.aarch64.neon.sqdmulh.lane.v2i32.v2i32.
1669*207e5cccSFangrui Song   return vqdmulh_lane_s32(a, v, 1);
1670*207e5cccSFangrui Song }
1671*207e5cccSFangrui Song 
1672*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmulhq_lane_s32(
1673*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1674*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
1675*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
1676*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
1677*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
1678*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULHQ_LANE_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.lane.v4i32.v2i32(<4 x i32> [[VQDMULHQ_LANE_V]], <2 x i32> [[VQDMULHQ_LANE_V1]], i32 1)
1679*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[VQDMULHQ_LANE_V2]]
1680*207e5cccSFangrui Song //
1681*207e5cccSFangrui Song int32x4_t test_vqdmulhq_lane_s32(int32x4_t a, int32x2_t v) {
  // Lane index 1. Expected IR passes the lane as an i32 operand to
  // @llvm.aarch64.neon.sqdmulh.lane.v4i32.v2i32.
1682*207e5cccSFangrui Song   return vqdmulhq_lane_s32(a, v, 1);
1683*207e5cccSFangrui Song }
1684*207e5cccSFangrui Song 
1685*207e5cccSFangrui Song // CHECK-LABEL: @test_vqrdmulh_lane_s16(
1686*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1687*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
1688*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
1689*207e5cccSFangrui Song // CHECK-NEXT:    [[VQRDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
1690*207e5cccSFangrui Song // CHECK-NEXT:    [[VQRDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
1691*207e5cccSFangrui Song // CHECK-NEXT:    [[VQRDMULH_LANE_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v4i16.v4i16(<4 x i16> [[VQRDMULH_LANE_V]], <4 x i16> [[VQRDMULH_LANE_V1]], i32 3)
1692*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i16> [[VQRDMULH_LANE_V2]]
1693*207e5cccSFangrui Song //
1694*207e5cccSFangrui Song int16x4_t test_vqrdmulh_lane_s16(int16x4_t a, int16x4_t v) {
  // Lane index 3. Expected IR passes the lane as an i32 operand to
  // @llvm.aarch64.neon.sqrdmulh.lane.v4i16.v4i16.
1695*207e5cccSFangrui Song   return vqrdmulh_lane_s16(a, v, 3);
1696*207e5cccSFangrui Song }
1697*207e5cccSFangrui Song 
1698*207e5cccSFangrui Song // CHECK-LABEL: @test_vqrdmulhq_lane_s16(
1699*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1700*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
1701*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
1702*207e5cccSFangrui Song // CHECK-NEXT:    [[VQRDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
1703*207e5cccSFangrui Song // CHECK-NEXT:    [[VQRDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
1704*207e5cccSFangrui Song // CHECK-NEXT:    [[VQRDMULHQ_LANE_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v8i16.v4i16(<8 x i16> [[VQRDMULHQ_LANE_V]], <4 x i16> [[VQRDMULHQ_LANE_V1]], i32 3)
1705*207e5cccSFangrui Song // CHECK-NEXT:    ret <8 x i16> [[VQRDMULHQ_LANE_V2]]
1706*207e5cccSFangrui Song //
1707*207e5cccSFangrui Song int16x8_t test_vqrdmulhq_lane_s16(int16x8_t a, int16x4_t v) {
  // Lane index 3. Expected IR passes the lane as an i32 operand to
  // @llvm.aarch64.neon.sqrdmulh.lane.v8i16.v4i16.
1708*207e5cccSFangrui Song   return vqrdmulhq_lane_s16(a, v, 3);
1709*207e5cccSFangrui Song }
1710*207e5cccSFangrui Song 
1711*207e5cccSFangrui Song // CHECK-LABEL: @test_vqrdmulh_lane_s32(
1712*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1713*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
1714*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
1715*207e5cccSFangrui Song // CHECK-NEXT:    [[VQRDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
1716*207e5cccSFangrui Song // CHECK-NEXT:    [[VQRDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
1717*207e5cccSFangrui Song // CHECK-NEXT:    [[VQRDMULH_LANE_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v2i32.v2i32(<2 x i32> [[VQRDMULH_LANE_V]], <2 x i32> [[VQRDMULH_LANE_V1]], i32 1)
1718*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i32> [[VQRDMULH_LANE_V2]]
1719*207e5cccSFangrui Song //
1720*207e5cccSFangrui Song int32x2_t test_vqrdmulh_lane_s32(int32x2_t a, int32x2_t v) {
  // Lane index 1. Expected IR passes the lane as an i32 operand to
  // @llvm.aarch64.neon.sqrdmulh.lane.v2i32.v2i32.
1721*207e5cccSFangrui Song   return vqrdmulh_lane_s32(a, v, 1);
1722*207e5cccSFangrui Song }
1723*207e5cccSFangrui Song 
1724*207e5cccSFangrui Song // CHECK-LABEL: @test_vqrdmulhq_lane_s32(
1725*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1726*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
1727*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
1728*207e5cccSFangrui Song // CHECK-NEXT:    [[VQRDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
1729*207e5cccSFangrui Song // CHECK-NEXT:    [[VQRDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
1730*207e5cccSFangrui Song // CHECK-NEXT:    [[VQRDMULHQ_LANE_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v4i32.v2i32(<4 x i32> [[VQRDMULHQ_LANE_V]], <2 x i32> [[VQRDMULHQ_LANE_V1]], i32 1)
1731*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[VQRDMULHQ_LANE_V2]]
1732*207e5cccSFangrui Song //
1733*207e5cccSFangrui Song int32x4_t test_vqrdmulhq_lane_s32(int32x4_t a, int32x2_t v) {
  // Lane index 1. Expected IR passes the lane as an i32 operand to
  // @llvm.aarch64.neon.sqrdmulh.lane.v4i32.v2i32.
1734*207e5cccSFangrui Song   return vqrdmulhq_lane_s32(a, v, 1);
1735*207e5cccSFangrui Song }
1736*207e5cccSFangrui Song 
1737*207e5cccSFangrui Song // CHECK-LABEL: @test_vmul_lane_f32(
1738*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1739*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
1740*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
1741*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> <i32 1, i32 1>
1742*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[LANE]]
1743*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x float> [[MUL]]
1744*207e5cccSFangrui Song //
1745*207e5cccSFangrui Song float32x2_t test_vmul_lane_f32(float32x2_t a, float32x2_t v) {
  // Lane index 1. Expected IR broadcasts element 1 of v into a <2 x float>
  // and multiplies it with a using a plain fmul (no intrinsic call).
1746*207e5cccSFangrui Song   return vmul_lane_f32(a, v, 1);
1747*207e5cccSFangrui Song }
1748*207e5cccSFangrui Song 
1749*207e5cccSFangrui Song 
1750*207e5cccSFangrui Song // CHECK-LABEL: @test_vmul_lane_f64(
1751*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1752*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[A:%.*]] to <8 x i8>
1753*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8>
1754*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double
1755*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
1756*207e5cccSFangrui Song // CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <1 x double> [[TMP3]], i32 0
1757*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]]
1758*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double>
1759*207e5cccSFangrui Song // CHECK-NEXT:    ret <1 x double> [[TMP5]]
1760*207e5cccSFangrui Song //
1761*207e5cccSFangrui Song float64x1_t test_vmul_lane_f64(float64x1_t a, float64x1_t v) {
  // Lane index 0. Expected IR is a scalar path: a is bitcast to double,
  // element 0 of v is extracted, the two are fmul'd, and the product is
  // bitcast back to <1 x double>.
1762*207e5cccSFangrui Song   return vmul_lane_f64(a, v, 0);
1763*207e5cccSFangrui Song }
1764*207e5cccSFangrui Song 
1765*207e5cccSFangrui Song // CHECK-LABEL: @test_vmulq_lane_f32(
1766*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1767*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
1768*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
1769*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1770*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[LANE]]
1771*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x float> [[MUL]]
1772*207e5cccSFangrui Song //
1773*207e5cccSFangrui Song float32x4_t test_vmulq_lane_f32(float32x4_t a, float32x2_t v) {
  // Lane index 1. Expected IR broadcasts element 1 of the 2-element v into a
  // <4 x float> and multiplies it with a using a plain fmul.
1774*207e5cccSFangrui Song   return vmulq_lane_f32(a, v, 1);
1775*207e5cccSFangrui Song }
1776*207e5cccSFangrui Song 
1777*207e5cccSFangrui Song // CHECK-LABEL: @test_vmulq_lane_f64(
1778*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1779*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8>
1780*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
1781*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP1]], <2 x i32> zeroinitializer
1782*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = fmul <2 x double> [[A:%.*]], [[LANE]]
1783*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x double> [[MUL]]
1784*207e5cccSFangrui Song //
1785*207e5cccSFangrui Song float64x2_t test_vmulq_lane_f64(float64x2_t a, float64x1_t v) {
  // Lane index 0. Expected IR widens the 1-element v to <2 x double> with a
  // zeroinitializer shuffle mask, then multiplies it with a using fmul.
1786*207e5cccSFangrui Song   return vmulq_lane_f64(a, v, 0);
1787*207e5cccSFangrui Song }
1788*207e5cccSFangrui Song 
1789*207e5cccSFangrui Song // CHECK-LABEL: @test_vmul_laneq_f32(
1790*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1791*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
1792*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
1793*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> <i32 3, i32 3>
1794*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[LANE]]
1795*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x float> [[MUL]]
1796*207e5cccSFangrui Song //
1797*207e5cccSFangrui Song float32x2_t test_vmul_laneq_f32(float32x2_t a, float32x4_t v) {
  // Lane index 3. Expected IR broadcasts element 3 of the 4-element v into a
  // <2 x float> and multiplies it with a using a plain fmul.
1798*207e5cccSFangrui Song   return vmul_laneq_f32(a, v, 3);
1799*207e5cccSFangrui Song }
1800*207e5cccSFangrui Song 
1801*207e5cccSFangrui Song // CHECK-LABEL: @test_vmul_laneq_f64(
1802*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1803*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[A:%.*]] to <8 x i8>
1804*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
1805*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double
1806*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
1807*207e5cccSFangrui Song // CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
1808*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]]
1809*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double>
1810*207e5cccSFangrui Song // CHECK-NEXT:    ret <1 x double> [[TMP5]]
1811*207e5cccSFangrui Song //
1812*207e5cccSFangrui Song float64x1_t test_vmul_laneq_f64(float64x1_t a, float64x2_t v) {
  // Lane index 1. Expected IR is a scalar path: a is bitcast to double,
  // element 1 of v is extracted, the two are fmul'd, and the product is
  // bitcast back to <1 x double>.
1813*207e5cccSFangrui Song   return vmul_laneq_f64(a, v, 1);
1814*207e5cccSFangrui Song }
1815*207e5cccSFangrui Song 
1816*207e5cccSFangrui Song // CHECK-LABEL: @test_vmulq_laneq_f32(
1817*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1818*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
1819*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
1820*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1821*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[LANE]]
1822*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x float> [[MUL]]
1823*207e5cccSFangrui Song //
1824*207e5cccSFangrui Song float32x4_t test_vmulq_laneq_f32(float32x4_t a, float32x4_t v) {
  // Lane index 3. Expected IR broadcasts element 3 of v across a <4 x float>
  // and multiplies it with a using a plain fmul.
1825*207e5cccSFangrui Song   return vmulq_laneq_f32(a, v, 3);
1826*207e5cccSFangrui Song }
1827*207e5cccSFangrui Song 
1828*207e5cccSFangrui Song // CHECK-LABEL: @test_vmulq_laneq_f64(
1829*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1830*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
1831*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
1832*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP1]], <2 x i32> <i32 1, i32 1>
1833*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = fmul <2 x double> [[A:%.*]], [[LANE]]
1834*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x double> [[MUL]]
1835*207e5cccSFangrui Song //
1836*207e5cccSFangrui Song float64x2_t test_vmulq_laneq_f64(float64x2_t a, float64x2_t v) {
  // Lane index 1. Expected IR broadcasts element 1 of v across a <2 x double>
  // and multiplies it with a using a plain fmul.
1837*207e5cccSFangrui Song   return vmulq_laneq_f64(a, v, 1);
1838*207e5cccSFangrui Song }
1839*207e5cccSFangrui Song 
1840*207e5cccSFangrui Song // CHECK-LABEL: @test_vmulx_lane_f32(
1841*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1842*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
1843*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
1844*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> <i32 1, i32 1>
1845*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
1846*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8>
1847*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]])
1848*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x float> [[VMULX2_I]]
1849*207e5cccSFangrui Song //
1850*207e5cccSFangrui Song float32x2_t test_vmulx_lane_f32(float32x2_t a, float32x2_t v) {
  // Lane index 1. Expected IR broadcasts element 1 of v and passes it with a
  // to @llvm.aarch64.neon.fmulx.v2f32.
1851*207e5cccSFangrui Song   return vmulx_lane_f32(a, v, 1);
1852*207e5cccSFangrui Song }
1853*207e5cccSFangrui Song 
1854*207e5cccSFangrui Song // CHECK-LABEL: @test_vmulxq_lane_f32(
1855*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1856*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
1857*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
1858*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1859*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
1860*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8>
1861*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]])
1862*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x float> [[VMULX2_I]]
1863*207e5cccSFangrui Song //
1864*207e5cccSFangrui Song float32x4_t test_vmulxq_lane_f32(float32x4_t a, float32x2_t v) {
  // Lane index 1. Expected IR broadcasts element 1 of the 2-element v into a
  // <4 x float> and passes it with a to @llvm.aarch64.neon.fmulx.v4f32.
1865*207e5cccSFangrui Song   return vmulxq_lane_f32(a, v, 1);
1866*207e5cccSFangrui Song }
1867*207e5cccSFangrui Song 
1868*207e5cccSFangrui Song // CHECK-LABEL: @test_vmulxq_lane_f64(
1869*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1870*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8>
1871*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
1872*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP1]], <2 x i32> zeroinitializer
1873*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
1874*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x double> [[LANE]] to <16 x i8>
1875*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]])
1876*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x double> [[VMULX2_I]]
1877*207e5cccSFangrui Song //
1878*207e5cccSFangrui Song float64x2_t test_vmulxq_lane_f64(float64x2_t a, float64x1_t v) {
  // Lane index 0. Expected IR widens the 1-element v to <2 x double> with a
  // zeroinitializer shuffle mask and passes it with a to
  // @llvm.aarch64.neon.fmulx.v2f64.
1879*207e5cccSFangrui Song   return vmulxq_lane_f64(a, v, 0);
1880*207e5cccSFangrui Song }
1881*207e5cccSFangrui Song 
1882*207e5cccSFangrui Song // CHECK-LABEL: @test_vmulx_laneq_f32(
1883*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1884*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
1885*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
1886*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> <i32 3, i32 3>
1887*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
1888*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8>
1889*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]])
1890*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x float> [[VMULX2_I]]
1891*207e5cccSFangrui Song //
1892*207e5cccSFangrui Song float32x2_t test_vmulx_laneq_f32(float32x2_t a, float32x4_t v) {
  // Lane index 3. Expected IR broadcasts element 3 of the 4-element v into a
  // <2 x float> and passes it with a to @llvm.aarch64.neon.fmulx.v2f32.
1893*207e5cccSFangrui Song   return vmulx_laneq_f32(a, v, 3);
1894*207e5cccSFangrui Song }
1895*207e5cccSFangrui Song 
1896*207e5cccSFangrui Song // CHECK-LABEL: @test_vmulxq_laneq_f32(
1897*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1898*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
1899*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
1900*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1901*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
1902*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8>
1903*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]])
1904*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x float> [[VMULX2_I]]
1905*207e5cccSFangrui Song //
1906*207e5cccSFangrui Song float32x4_t test_vmulxq_laneq_f32(float32x4_t a, float32x4_t v) {
  // Lane index 3. Expected IR broadcasts element 3 of v across a <4 x float>
  // and passes it with a to @llvm.aarch64.neon.fmulx.v4f32.
1907*207e5cccSFangrui Song   return vmulxq_laneq_f32(a, v, 3);
1908*207e5cccSFangrui Song }
1909*207e5cccSFangrui Song 
1910*207e5cccSFangrui Song // CHECK-LABEL: @test_vmulxq_laneq_f64(
1911*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1912*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
1913*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
1914*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP1]], <2 x i32> <i32 1, i32 1>
1915*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
1916*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x double> [[LANE]] to <16 x i8>
1917*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]])
1918*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x double> [[VMULX2_I]]
1919*207e5cccSFangrui Song //
1920*207e5cccSFangrui Song float64x2_t test_vmulxq_laneq_f64(float64x2_t a, float64x2_t v) {
  // Lane index 1. Expected IR broadcasts element 1 of v across a <2 x double>
  // and passes it with a to @llvm.aarch64.neon.fmulx.v2f64.
1921*207e5cccSFangrui Song   return vmulxq_laneq_f64(a, v, 1);
1922*207e5cccSFangrui Song }
1923*207e5cccSFangrui Song 
1924*207e5cccSFangrui Song // CHECK-LABEL: @test_vmla_lane_s16_0(
1925*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1926*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
1927*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
1928*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
1929*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
1930*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
1931*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i16> [[ADD]]
1932*207e5cccSFangrui Song //
1933*207e5cccSFangrui Song int16x4_t test_vmla_lane_s16_0(int16x4_t a, int16x4_t b, int16x4_t v) {
  // Lane-0 variant of the vmla_lane_s16 test. Expected IR uses a
  // zeroinitializer shuffle mask on v, multiplies b by that vector, then adds a.
1934*207e5cccSFangrui Song   return vmla_lane_s16(a, b, v, 0);
1935*207e5cccSFangrui Song }
1936*207e5cccSFangrui Song 
1937*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlaq_lane_s16_0(
1938*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1939*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
1940*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
1941*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer
1942*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
1943*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
1944*207e5cccSFangrui Song // CHECK-NEXT:    ret <8 x i16> [[ADD]]
1945*207e5cccSFangrui Song //
1946*207e5cccSFangrui Song int16x8_t test_vmlaq_lane_s16_0(int16x8_t a, int16x8_t b, int16x4_t v) {
  // Lane index 0. Expected IR widens the 4-element v to <8 x i16> with a
  // zeroinitializer shuffle mask, multiplies b by that vector, then adds a.
1947*207e5cccSFangrui Song   return vmlaq_lane_s16(a, b, v, 0);
1948*207e5cccSFangrui Song }
1949*207e5cccSFangrui Song 
1950*207e5cccSFangrui Song // CHECK-LABEL: @test_vmla_lane_s32_0(
1951*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1952*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
1953*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
1954*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
1955*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
1956*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
1957*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i32> [[ADD]]
1958*207e5cccSFangrui Song //
1959*207e5cccSFangrui Song int32x2_t test_vmla_lane_s32_0(int32x2_t a, int32x2_t b, int32x2_t v) {
  // Lane index 0. Expected IR uses a zeroinitializer shuffle mask on v to form
  // a <2 x i32>, multiplies b by that vector, then adds a.
1960*207e5cccSFangrui Song   return vmla_lane_s32(a, b, v, 0);
1961*207e5cccSFangrui Song }
1962*207e5cccSFangrui Song 
1963*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlaq_lane_s32_0(
1964*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1965*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
1966*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
1967*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer
1968*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
1969*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
1970*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[ADD]]
1971*207e5cccSFangrui Song //
1972*207e5cccSFangrui Song int32x4_t test_vmlaq_lane_s32_0(int32x4_t a, int32x4_t b, int32x2_t v) {
  // Lane index 0. Expected IR widens the 2-element v to <4 x i32> with a
  // zeroinitializer shuffle mask, multiplies b by that vector, then adds a.
1973*207e5cccSFangrui Song   return vmlaq_lane_s32(a, b, v, 0);
1974*207e5cccSFangrui Song }
1975*207e5cccSFangrui Song 
1976*207e5cccSFangrui Song // CHECK-LABEL: @test_vmla_laneq_s16_0(
1977*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1978*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
1979*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
1980*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
1981*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
1982*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
1983*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i16> [[ADD]]
1984*207e5cccSFangrui Song //
1985*207e5cccSFangrui Song int16x4_t test_vmla_laneq_s16_0(int16x4_t a, int16x4_t b, int16x8_t v) {
  // Lane index 0. Expected IR narrows the 8-element v to <4 x i16> with a
  // zeroinitializer shuffle mask, multiplies b by that vector, then adds a.
1986*207e5cccSFangrui Song   return vmla_laneq_s16(a, b, v, 0);
1987*207e5cccSFangrui Song }
1988*207e5cccSFangrui Song 
// vmlaq_laneq_s16 with lane index 0. Expected IR: lane 0 of the <8 x i16> v
// is splatted across all 8 lanes (shufflevector with an all-zero mask),
// multiplied by b, and the product is added to a.
1989*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlaq_laneq_s16_0(
1990*207e5cccSFangrui Song // CHECK-NEXT:  entry:
1991*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
1992*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
1993*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer
1994*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
1995*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
1996*207e5cccSFangrui Song // CHECK-NEXT:    ret <8 x i16> [[ADD]]
1997*207e5cccSFangrui Song //
1998*207e5cccSFangrui Song int16x8_t test_vmlaq_laneq_s16_0(int16x8_t a, int16x8_t b, int16x8_t v) {
1999*207e5cccSFangrui Song   return vmlaq_laneq_s16(a, b, v, 0);
2000*207e5cccSFangrui Song }
2001*207e5cccSFangrui Song 
// vmla_laneq_s32 with lane index 0. Expected IR: lane 0 of the <4 x i32> v is
// splatted into a <2 x i32> (shufflevector with an all-zero mask), multiplied
// by b, and the product is added to a.
2002*207e5cccSFangrui Song // CHECK-LABEL: @test_vmla_laneq_s32_0(
2003*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2004*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
2005*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
2006*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
2007*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
2008*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
2009*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i32> [[ADD]]
2010*207e5cccSFangrui Song //
2011*207e5cccSFangrui Song int32x2_t test_vmla_laneq_s32_0(int32x2_t a, int32x2_t b, int32x4_t v) {
2012*207e5cccSFangrui Song   return vmla_laneq_s32(a, b, v, 0);
2013*207e5cccSFangrui Song }
2014*207e5cccSFangrui Song 
// vmlaq_laneq_s32 with lane index 0. Expected IR: lane 0 of the <4 x i32> v
// is splatted across all 4 lanes (shufflevector with an all-zero mask),
// multiplied by b, and the product is added to a.
2015*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlaq_laneq_s32_0(
2016*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2017*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
2018*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
2019*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
2020*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
2021*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
2022*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[ADD]]
2023*207e5cccSFangrui Song //
2024*207e5cccSFangrui Song int32x4_t test_vmlaq_laneq_s32_0(int32x4_t a, int32x4_t b, int32x4_t v) {
2025*207e5cccSFangrui Song   return vmlaq_laneq_s32(a, b, v, 0);
2026*207e5cccSFangrui Song }
2027*207e5cccSFangrui Song 
// vmls_lane_s16 with lane index 0. Expected IR: lane 0 of the <4 x i16> v is
// splatted across all 4 lanes (shufflevector with an all-zero mask),
// multiplied by b, and the product is subtracted from a.
2028*207e5cccSFangrui Song // CHECK-LABEL: @test_vmls_lane_s16_0(
2029*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2030*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
2031*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2032*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
2033*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
2034*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
2035*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i16> [[SUB]]
2036*207e5cccSFangrui Song //
2037*207e5cccSFangrui Song int16x4_t test_vmls_lane_s16_0(int16x4_t a, int16x4_t b, int16x4_t v) {
2038*207e5cccSFangrui Song   return vmls_lane_s16(a, b, v, 0);
2039*207e5cccSFangrui Song }
2040*207e5cccSFangrui Song 
// vmlsq_lane_s16 with lane index 0. Expected IR: lane 0 of the <4 x i16> v is
// splatted to <8 x i16> (shufflevector with an all-zero mask), multiplied by
// b, and the product is subtracted from a.
2041*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlsq_lane_s16_0(
2042*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2043*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
2044*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2045*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer
2046*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
2047*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
2048*207e5cccSFangrui Song // CHECK-NEXT:    ret <8 x i16> [[SUB]]
2049*207e5cccSFangrui Song //
2050*207e5cccSFangrui Song int16x8_t test_vmlsq_lane_s16_0(int16x8_t a, int16x8_t b, int16x4_t v) {
2051*207e5cccSFangrui Song   return vmlsq_lane_s16(a, b, v, 0);
2052*207e5cccSFangrui Song }
2053*207e5cccSFangrui Song 
// vmls_lane_s32 with lane index 0. Expected IR: lane 0 of the <2 x i32> v is
// splatted across both lanes (shufflevector with an all-zero mask),
// multiplied by b, and the product is subtracted from a.
2054*207e5cccSFangrui Song // CHECK-LABEL: @test_vmls_lane_s32_0(
2055*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2056*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
2057*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2058*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
2059*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
2060*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
2061*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i32> [[SUB]]
2062*207e5cccSFangrui Song //
2063*207e5cccSFangrui Song int32x2_t test_vmls_lane_s32_0(int32x2_t a, int32x2_t b, int32x2_t v) {
2064*207e5cccSFangrui Song   return vmls_lane_s32(a, b, v, 0);
2065*207e5cccSFangrui Song }
2066*207e5cccSFangrui Song 
// vmlsq_lane_s32 with lane index 0. Expected IR: lane 0 of the <2 x i32> v is
// splatted to <4 x i32> (shufflevector with an all-zero mask), multiplied by
// b, and the product is subtracted from a.
2067*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlsq_lane_s32_0(
2068*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2069*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
2070*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2071*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer
2072*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
2073*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
2074*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[SUB]]
2075*207e5cccSFangrui Song //
2076*207e5cccSFangrui Song int32x4_t test_vmlsq_lane_s32_0(int32x4_t a, int32x4_t b, int32x2_t v) {
2077*207e5cccSFangrui Song   return vmlsq_lane_s32(a, b, v, 0);
2078*207e5cccSFangrui Song }
2079*207e5cccSFangrui Song 
// vmls_laneq_s16 with lane index 0. Expected IR: lane 0 of the <8 x i16> v is
// splatted into a <4 x i16> (shufflevector with an all-zero mask), multiplied
// by b, and the product is subtracted from a.
2080*207e5cccSFangrui Song // CHECK-LABEL: @test_vmls_laneq_s16_0(
2081*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2082*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
2083*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
2084*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
2085*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
2086*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
2087*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i16> [[SUB]]
2088*207e5cccSFangrui Song //
2089*207e5cccSFangrui Song int16x4_t test_vmls_laneq_s16_0(int16x4_t a, int16x4_t b, int16x8_t v) {
2090*207e5cccSFangrui Song   return vmls_laneq_s16(a, b, v, 0);
2091*207e5cccSFangrui Song }
2092*207e5cccSFangrui Song 
// vmlsq_laneq_s16 with lane index 0. Expected IR: lane 0 of the <8 x i16> v
// is splatted across all 8 lanes (shufflevector with an all-zero mask),
// multiplied by b, and the product is subtracted from a.
2093*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlsq_laneq_s16_0(
2094*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2095*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
2096*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
2097*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer
2098*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
2099*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
2100*207e5cccSFangrui Song // CHECK-NEXT:    ret <8 x i16> [[SUB]]
2101*207e5cccSFangrui Song //
2102*207e5cccSFangrui Song int16x8_t test_vmlsq_laneq_s16_0(int16x8_t a, int16x8_t b, int16x8_t v) {
2103*207e5cccSFangrui Song   return vmlsq_laneq_s16(a, b, v, 0);
2104*207e5cccSFangrui Song }
2105*207e5cccSFangrui Song 
// vmls_laneq_s32 with lane index 0. Expected IR: lane 0 of the <4 x i32> v is
// splatted into a <2 x i32> (shufflevector with an all-zero mask), multiplied
// by b, and the product is subtracted from a.
2106*207e5cccSFangrui Song // CHECK-LABEL: @test_vmls_laneq_s32_0(
2107*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2108*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
2109*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
2110*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
2111*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
2112*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
2113*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i32> [[SUB]]
2114*207e5cccSFangrui Song //
2115*207e5cccSFangrui Song int32x2_t test_vmls_laneq_s32_0(int32x2_t a, int32x2_t b, int32x4_t v) {
2116*207e5cccSFangrui Song   return vmls_laneq_s32(a, b, v, 0);
2117*207e5cccSFangrui Song }
2118*207e5cccSFangrui Song 
// vmlsq_laneq_s32 with lane index 0. Expected IR: lane 0 of the <4 x i32> v
// is splatted across all 4 lanes (shufflevector with an all-zero mask),
// multiplied by b, and the product is subtracted from a.
2119*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlsq_laneq_s32_0(
2120*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2121*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
2122*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
2123*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
2124*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
2125*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
2126*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[SUB]]
2127*207e5cccSFangrui Song //
2128*207e5cccSFangrui Song int32x4_t test_vmlsq_laneq_s32_0(int32x4_t a, int32x4_t b, int32x4_t v) {
2129*207e5cccSFangrui Song   return vmlsq_laneq_s32(a, b, v, 0);
2130*207e5cccSFangrui Song }
2131*207e5cccSFangrui Song 
// vmul_lane_s16 with lane index 0. Expected IR: lane 0 of the <4 x i16> v is
// splatted across all 4 lanes (shufflevector with an all-zero mask) and
// multiplied by a.
2132*207e5cccSFangrui Song // CHECK-LABEL: @test_vmul_lane_s16_0(
2133*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2134*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
2135*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2136*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
2137*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
2138*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i16> [[MUL]]
2139*207e5cccSFangrui Song //
2140*207e5cccSFangrui Song int16x4_t test_vmul_lane_s16_0(int16x4_t a, int16x4_t v) {
2141*207e5cccSFangrui Song   return vmul_lane_s16(a, v, 0);
2142*207e5cccSFangrui Song }
2143*207e5cccSFangrui Song 
// vmulq_lane_s16 with lane index 0. Expected IR: lane 0 of the <4 x i16> v is
// splatted to <8 x i16> (shufflevector with an all-zero mask) and multiplied
// by a.
2144*207e5cccSFangrui Song // CHECK-LABEL: @test_vmulq_lane_s16_0(
2145*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2146*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
2147*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2148*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer
2149*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
2150*207e5cccSFangrui Song // CHECK-NEXT:    ret <8 x i16> [[MUL]]
2151*207e5cccSFangrui Song //
2152*207e5cccSFangrui Song int16x8_t test_vmulq_lane_s16_0(int16x8_t a, int16x4_t v) {
2153*207e5cccSFangrui Song   return vmulq_lane_s16(a, v, 0);
2154*207e5cccSFangrui Song }
2155*207e5cccSFangrui Song 
// vmul_lane_s32 with lane index 0. Expected IR: lane 0 of the <2 x i32> v is
// splatted across both lanes (shufflevector with an all-zero mask) and
// multiplied by a.
2156*207e5cccSFangrui Song // CHECK-LABEL: @test_vmul_lane_s32_0(
2157*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2158*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
2159*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2160*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
2161*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
2162*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i32> [[MUL]]
2163*207e5cccSFangrui Song //
2164*207e5cccSFangrui Song int32x2_t test_vmul_lane_s32_0(int32x2_t a, int32x2_t v) {
2165*207e5cccSFangrui Song   return vmul_lane_s32(a, v, 0);
2166*207e5cccSFangrui Song }
2167*207e5cccSFangrui Song 
// vmulq_lane_s32 with lane index 0. Expected IR: lane 0 of the <2 x i32> v is
// splatted to <4 x i32> (shufflevector with an all-zero mask) and multiplied
// by a.
2168*207e5cccSFangrui Song // CHECK-LABEL: @test_vmulq_lane_s32_0(
2169*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2170*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
2171*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2172*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer
2173*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
2174*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[MUL]]
2175*207e5cccSFangrui Song //
2176*207e5cccSFangrui Song int32x4_t test_vmulq_lane_s32_0(int32x4_t a, int32x2_t v) {
2177*207e5cccSFangrui Song   return vmulq_lane_s32(a, v, 0);
2178*207e5cccSFangrui Song }
2179*207e5cccSFangrui Song 
// vmul_lane_u16 with lane index 0 (unsigned operands). Expected IR: lane 0 of
// the <4 x i16> v is splatted across all 4 lanes (shufflevector with an
// all-zero mask) and multiplied by a with a plain integer mul.
2180*207e5cccSFangrui Song // CHECK-LABEL: @test_vmul_lane_u16_0(
2181*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2182*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
2183*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2184*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
2185*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
2186*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i16> [[MUL]]
2187*207e5cccSFangrui Song //
2188*207e5cccSFangrui Song uint16x4_t test_vmul_lane_u16_0(uint16x4_t a, uint16x4_t v) {
2189*207e5cccSFangrui Song   return vmul_lane_u16(a, v, 0);
2190*207e5cccSFangrui Song }
2191*207e5cccSFangrui Song 
// vmulq_lane_u16 with lane index 0 (unsigned operands). Expected IR: lane 0
// of the <4 x i16> v is splatted to <8 x i16> (shufflevector with an all-zero
// mask) and multiplied by a with a plain integer mul.
2192*207e5cccSFangrui Song // CHECK-LABEL: @test_vmulq_lane_u16_0(
2193*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2194*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
2195*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2196*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer
2197*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
2198*207e5cccSFangrui Song // CHECK-NEXT:    ret <8 x i16> [[MUL]]
2199*207e5cccSFangrui Song //
2200*207e5cccSFangrui Song uint16x8_t test_vmulq_lane_u16_0(uint16x8_t a, uint16x4_t v) {
2201*207e5cccSFangrui Song   return vmulq_lane_u16(a, v, 0);
2202*207e5cccSFangrui Song }
2203*207e5cccSFangrui Song 
// vmul_lane_u32 with lane index 0 (unsigned operands). Expected IR: lane 0 of
// the <2 x i32> v is splatted across both lanes (shufflevector with an
// all-zero mask) and multiplied by a with a plain integer mul.
2204*207e5cccSFangrui Song // CHECK-LABEL: @test_vmul_lane_u32_0(
2205*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2206*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
2207*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2208*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
2209*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
2210*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i32> [[MUL]]
2211*207e5cccSFangrui Song //
2212*207e5cccSFangrui Song uint32x2_t test_vmul_lane_u32_0(uint32x2_t a, uint32x2_t v) {
2213*207e5cccSFangrui Song   return vmul_lane_u32(a, v, 0);
2214*207e5cccSFangrui Song }
2215*207e5cccSFangrui Song 
// vmulq_lane_u32 with lane index 0 (unsigned operands). Expected IR: lane 0
// of the <2 x i32> v is splatted to <4 x i32> (shufflevector with an all-zero
// mask) and multiplied by a with a plain integer mul.
2216*207e5cccSFangrui Song // CHECK-LABEL: @test_vmulq_lane_u32_0(
2217*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2218*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
2219*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2220*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer
2221*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
2222*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[MUL]]
2223*207e5cccSFangrui Song //
2224*207e5cccSFangrui Song uint32x4_t test_vmulq_lane_u32_0(uint32x4_t a, uint32x2_t v) {
2225*207e5cccSFangrui Song   return vmulq_lane_u32(a, v, 0);
2226*207e5cccSFangrui Song }
2227*207e5cccSFangrui Song 
// vmul_laneq_s16 with lane index 0. Expected IR: lane 0 of the <8 x i16> v is
// splatted into a <4 x i16> (shufflevector with an all-zero mask) and
// multiplied by a.
2228*207e5cccSFangrui Song // CHECK-LABEL: @test_vmul_laneq_s16_0(
2229*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2230*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
2231*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
2232*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
2233*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
2234*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i16> [[MUL]]
2235*207e5cccSFangrui Song //
2236*207e5cccSFangrui Song int16x4_t test_vmul_laneq_s16_0(int16x4_t a, int16x8_t v) {
2237*207e5cccSFangrui Song   return vmul_laneq_s16(a, v, 0);
2238*207e5cccSFangrui Song }
2239*207e5cccSFangrui Song 
// vmulq_laneq_s16 with lane index 0. Expected IR: lane 0 of the <8 x i16> v
// is splatted across all 8 lanes (shufflevector with an all-zero mask) and
// multiplied by a.
2240*207e5cccSFangrui Song // CHECK-LABEL: @test_vmulq_laneq_s16_0(
2241*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2242*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
2243*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
2244*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer
2245*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
2246*207e5cccSFangrui Song // CHECK-NEXT:    ret <8 x i16> [[MUL]]
2247*207e5cccSFangrui Song //
2248*207e5cccSFangrui Song int16x8_t test_vmulq_laneq_s16_0(int16x8_t a, int16x8_t v) {
2249*207e5cccSFangrui Song   return vmulq_laneq_s16(a, v, 0);
2250*207e5cccSFangrui Song }
2251*207e5cccSFangrui Song 
// vmul_laneq_s32 with lane index 0. Expected IR: lane 0 of the <4 x i32> v is
// splatted into a <2 x i32> (shufflevector with an all-zero mask) and
// multiplied by a.
2252*207e5cccSFangrui Song // CHECK-LABEL: @test_vmul_laneq_s32_0(
2253*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2254*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
2255*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
2256*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
2257*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
2258*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i32> [[MUL]]
2259*207e5cccSFangrui Song //
2260*207e5cccSFangrui Song int32x2_t test_vmul_laneq_s32_0(int32x2_t a, int32x4_t v) {
2261*207e5cccSFangrui Song   return vmul_laneq_s32(a, v, 0);
2262*207e5cccSFangrui Song }
2263*207e5cccSFangrui Song 
// vmulq_laneq_s32 with lane index 0. Expected IR: lane 0 of the <4 x i32> v
// is splatted across all 4 lanes (shufflevector with an all-zero mask) and
// multiplied by a.
2264*207e5cccSFangrui Song // CHECK-LABEL: @test_vmulq_laneq_s32_0(
2265*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2266*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
2267*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
2268*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
2269*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
2270*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[MUL]]
2271*207e5cccSFangrui Song //
2272*207e5cccSFangrui Song int32x4_t test_vmulq_laneq_s32_0(int32x4_t a, int32x4_t v) {
2273*207e5cccSFangrui Song   return vmulq_laneq_s32(a, v, 0);
2274*207e5cccSFangrui Song }
2275*207e5cccSFangrui Song 
// vmul_laneq_u16 with lane index 0 (unsigned operands). Expected IR: lane 0
// of the <8 x i16> v is splatted into a <4 x i16> (shufflevector with an
// all-zero mask) and multiplied by a with a plain integer mul.
2276*207e5cccSFangrui Song // CHECK-LABEL: @test_vmul_laneq_u16_0(
2277*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2278*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
2279*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
2280*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
2281*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
2282*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i16> [[MUL]]
2283*207e5cccSFangrui Song //
2284*207e5cccSFangrui Song uint16x4_t test_vmul_laneq_u16_0(uint16x4_t a, uint16x8_t v) {
2285*207e5cccSFangrui Song   return vmul_laneq_u16(a, v, 0);
2286*207e5cccSFangrui Song }
2287*207e5cccSFangrui Song 
// vmulq_laneq_u16 with lane index 0 (unsigned operands). Expected IR: lane 0
// of the <8 x i16> v is splatted across all 8 lanes (shufflevector with an
// all-zero mask) and multiplied by a with a plain integer mul.
2288*207e5cccSFangrui Song // CHECK-LABEL: @test_vmulq_laneq_u16_0(
2289*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2290*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
2291*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
2292*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer
2293*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
2294*207e5cccSFangrui Song // CHECK-NEXT:    ret <8 x i16> [[MUL]]
2295*207e5cccSFangrui Song //
2296*207e5cccSFangrui Song uint16x8_t test_vmulq_laneq_u16_0(uint16x8_t a, uint16x8_t v) {
2297*207e5cccSFangrui Song   return vmulq_laneq_u16(a, v, 0);
2298*207e5cccSFangrui Song }
2299*207e5cccSFangrui Song 
// vmul_laneq_u32 with lane index 0 (unsigned operands). Expected IR: lane 0
// of the <4 x i32> v is splatted into a <2 x i32> (shufflevector with an
// all-zero mask) and multiplied by a with a plain integer mul.
2300*207e5cccSFangrui Song // CHECK-LABEL: @test_vmul_laneq_u32_0(
2301*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2302*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
2303*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
2304*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
2305*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
2306*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i32> [[MUL]]
2307*207e5cccSFangrui Song //
2308*207e5cccSFangrui Song uint32x2_t test_vmul_laneq_u32_0(uint32x2_t a, uint32x4_t v) {
2309*207e5cccSFangrui Song   return vmul_laneq_u32(a, v, 0);
2310*207e5cccSFangrui Song }
2311*207e5cccSFangrui Song 
// vmulq_laneq_u32 with lane index 0 (unsigned operands). Expected IR: lane 0
// of the <4 x i32> v is splatted across all 4 lanes (shufflevector with an
// all-zero mask) and multiplied by a with a plain integer mul.
2312*207e5cccSFangrui Song // CHECK-LABEL: @test_vmulq_laneq_u32_0(
2313*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2314*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
2315*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
2316*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
2317*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
2318*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[MUL]]
2319*207e5cccSFangrui Song //
2320*207e5cccSFangrui Song uint32x4_t test_vmulq_laneq_u32_0(uint32x4_t a, uint32x4_t v) {
2321*207e5cccSFangrui Song   return vmulq_laneq_u32(a, v, 0);
2322*207e5cccSFangrui Song }
2323*207e5cccSFangrui Song 
// vfma_lane_f32 with lane index 0. Expected IR: a, b and v are each bitcast
// to <8 x i8>; v is cast back and lane 0 is splatted across both lanes
// (shufflevector with an all-zero mask); b and a are cast back and the result
// is a call to llvm.fma.v2f32 with operands (b, splat, a).
2324*207e5cccSFangrui Song // CHECK-LABEL: @test_vfma_lane_f32_0(
2325*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2326*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
2327*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8>
2328*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
2329*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
2330*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer
2331*207e5cccSFangrui Song // CHECK-NEXT:    [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
2332*207e5cccSFangrui Song // CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
2333*207e5cccSFangrui Song // CHECK-NEXT:    [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
2334*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x float> [[FMLA2]]
2335*207e5cccSFangrui Song //
2336*207e5cccSFangrui Song float32x2_t test_vfma_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) {
2337*207e5cccSFangrui Song   return vfma_lane_f32(a, b, v, 0);
2338*207e5cccSFangrui Song }
2339*207e5cccSFangrui Song 
// vfmaq_lane_f32 with lane index 0. Expected IR: a and b are bitcast to
// <16 x i8> and v to <8 x i8>; v is cast back and lane 0 is splatted to
// <4 x float> (shufflevector with an all-zero mask); b and a are cast back
// and the result is a call to llvm.fma.v4f32 with operands (b, splat, a).
2340*207e5cccSFangrui Song // CHECK-LABEL: @test_vfmaq_lane_f32_0(
2341*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2342*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
2343*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <16 x i8>
2344*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
2345*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
2346*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> zeroinitializer
2347*207e5cccSFangrui Song // CHECK-NEXT:    [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
2348*207e5cccSFangrui Song // CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
2349*207e5cccSFangrui Song // CHECK-NEXT:    [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
2350*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x float> [[FMLA2]]
2351*207e5cccSFangrui Song //
2352*207e5cccSFangrui Song float32x4_t test_vfmaq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) {
2353*207e5cccSFangrui Song   return vfmaq_lane_f32(a, b, v, 0);
2354*207e5cccSFangrui Song }
2355*207e5cccSFangrui Song 
// vfma_laneq_f32 with lane index 0. Expected IR: a, b and v are round-tripped
// through byte vectors; lane 0 of the <4 x float> v is splatted into a
// <2 x float> (shufflevector with an all-zero mask); the result is a call to
// llvm.fma.v2f32 whose operand order is (splat, b, a).
2356*207e5cccSFangrui Song // CHECK-LABEL: @test_vfma_laneq_f32_0(
2357*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2358*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
2359*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8>
2360*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
2361*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
2362*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
2363*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
2364*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> zeroinitializer
2365*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
2366*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x float> [[TMP6]]
2367*207e5cccSFangrui Song //
2368*207e5cccSFangrui Song float32x2_t test_vfma_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) {
2369*207e5cccSFangrui Song   return vfma_laneq_f32(a, b, v, 0);
2370*207e5cccSFangrui Song }
2371*207e5cccSFangrui Song 
// vfmaq_laneq_f32 with lane index 0. Expected IR: a, b and v are
// round-tripped through <16 x i8>; lane 0 of the <4 x float> v is splatted
// across all 4 lanes (shufflevector with an all-zero mask); the result is a
// call to llvm.fma.v4f32 whose operand order is (splat, b, a).
2372*207e5cccSFangrui Song // CHECK-LABEL: @test_vfmaq_laneq_f32_0(
2373*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2374*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
2375*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <16 x i8>
2376*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
2377*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
2378*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
2379*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
2380*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> zeroinitializer
2381*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
2382*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x float> [[TMP6]]
2383*207e5cccSFangrui Song //
2384*207e5cccSFangrui Song float32x4_t test_vfmaq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) {
2385*207e5cccSFangrui Song   return vfmaq_laneq_f32(a, b, v, 0);
2386*207e5cccSFangrui Song }
2387*207e5cccSFangrui Song 
// vfms_lane_f32 with lane index 0. Expected IR: b is negated with fneg
// first; a, the negated b and v are each bitcast to <8 x i8>; lane 0 of v is
// splatted across both lanes (shufflevector with an all-zero mask); the
// result is a call to llvm.fma.v2f32 with operands (-b, splat, a).
2388*207e5cccSFangrui Song // CHECK-LABEL: @test_vfms_lane_f32_0(
2389*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2390*207e5cccSFangrui Song // CHECK-NEXT:    [[FNEG:%.*]] = fneg <2 x float> [[B:%.*]]
2391*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
2392*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[FNEG]] to <8 x i8>
2393*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
2394*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
2395*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer
2396*207e5cccSFangrui Song // CHECK-NEXT:    [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
2397*207e5cccSFangrui Song // CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
2398*207e5cccSFangrui Song // CHECK-NEXT:    [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
2399*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x float> [[FMLA2]]
2400*207e5cccSFangrui Song //
2401*207e5cccSFangrui Song float32x2_t test_vfms_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) {
2402*207e5cccSFangrui Song   return vfms_lane_f32(a, b, v, 0);
2403*207e5cccSFangrui Song }
2404*207e5cccSFangrui Song 
2405*207e5cccSFangrui Song // CHECK-LABEL: @test_vfmsq_lane_f32_0(
2406*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2407*207e5cccSFangrui Song // CHECK-NEXT:    [[FNEG:%.*]] = fneg <4 x float> [[B:%.*]]
2408*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
2409*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[FNEG]] to <16 x i8>
2410*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
2411*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
2412*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> zeroinitializer
2413*207e5cccSFangrui Song // CHECK-NEXT:    [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
2414*207e5cccSFangrui Song // CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
2415*207e5cccSFangrui Song // CHECK-NEXT:    [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
2416*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x float> [[FMLA2]]
2417*207e5cccSFangrui Song //
2418*207e5cccSFangrui Song float32x4_t test_vfmsq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) {
  // Lane-0 variant of the vfmsq_lane_f32 test. The expected IR above negates b
  // (fneg), broadcasts lane 0 of the 2-element v to four lanes, and calls
  // llvm.fma.v4f32 with a as the addend.
2419*207e5cccSFangrui Song   return vfmsq_lane_f32(a, b, v, 0);
2420*207e5cccSFangrui Song }
2421*207e5cccSFangrui Song 
2422*207e5cccSFangrui Song // CHECK-LABEL: @test_vfms_laneq_f32_0(
2423*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2424*207e5cccSFangrui Song // CHECK-NEXT:    [[FNEG:%.*]] = fneg <2 x float> [[B:%.*]]
2425*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
2426*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[FNEG]] to <8 x i8>
2427*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
2428*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
2429*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
2430*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
2431*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> zeroinitializer
2432*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
2433*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x float> [[TMP6]]
2434*207e5cccSFangrui Song //
2435*207e5cccSFangrui Song float32x2_t test_vfms_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) {
  // Lane-0 variant of the vfms_laneq_f32 test (v is a 4-element vector). The
  // expected IR above negates b, narrows lane 0 of v to a 2-lane splat, and
  // calls llvm.fma.v2f32 with the splat as the first operand and a as addend.
2436*207e5cccSFangrui Song   return vfms_laneq_f32(a, b, v, 0);
2437*207e5cccSFangrui Song }
2438*207e5cccSFangrui Song 
2439*207e5cccSFangrui Song // CHECK-LABEL: @test_vfmsq_laneq_f32_0(
2440*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2441*207e5cccSFangrui Song // CHECK-NEXT:    [[FNEG:%.*]] = fneg <4 x float> [[B:%.*]]
2442*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
2443*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[FNEG]] to <16 x i8>
2444*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
2445*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
2446*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
2447*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
2448*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> zeroinitializer
2449*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
2450*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x float> [[TMP6]]
2451*207e5cccSFangrui Song //
2452*207e5cccSFangrui Song float32x4_t test_vfmsq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) {
  // Lane-0 variant of the vfmsq_laneq_f32 test. The expected IR above negates
  // b, splats lane 0 of v across four lanes, and calls llvm.fma.v4f32 with the
  // splat as the first operand and a as the addend.
2453*207e5cccSFangrui Song   return vfmsq_laneq_f32(a, b, v, 0);
2454*207e5cccSFangrui Song }
2455*207e5cccSFangrui Song 
2456*207e5cccSFangrui Song // CHECK-LABEL: @test_vfmaq_laneq_f64_0(
2457*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2458*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
2459*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x double> [[B:%.*]] to <16 x i8>
2460*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
2461*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
2462*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
2463*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
2464*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> zeroinitializer
2465*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
2466*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x double> [[TMP6]]
2467*207e5cccSFangrui Song //
2468*207e5cccSFangrui Song float64x2_t test_vfmaq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v) {
  // Lane-0 variant of the vfmaq_laneq_f64 test. The expected IR above splats
  // lane 0 of v and calls llvm.fma.v2f64 with the splat, b, and a (addend);
  // unlike the vfms tests there is no fneg of b.
2469*207e5cccSFangrui Song   return vfmaq_laneq_f64(a, b, v, 0);
2470*207e5cccSFangrui Song }
2471*207e5cccSFangrui Song 
2472*207e5cccSFangrui Song // CHECK-LABEL: @test_vfmsq_laneq_f64_0(
2473*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2474*207e5cccSFangrui Song // CHECK-NEXT:    [[FNEG:%.*]] = fneg <2 x double> [[B:%.*]]
2475*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
2476*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x double> [[FNEG]] to <16 x i8>
2477*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
2478*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
2479*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
2480*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
2481*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> zeroinitializer
2482*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
2483*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x double> [[TMP6]]
2484*207e5cccSFangrui Song //
2485*207e5cccSFangrui Song float64x2_t test_vfmsq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v) {
  // Lane-0 variant of the vfmsq_laneq_f64 test. The expected IR above negates
  // b (fneg), splats lane 0 of v, and calls llvm.fma.v2f64 with a as addend.
2486*207e5cccSFangrui Song   return vfmsq_laneq_f64(a, b, v, 0);
2487*207e5cccSFangrui Song }
2488*207e5cccSFangrui Song 
2489*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlal_lane_s16_0(
2490*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2491*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
2492*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2493*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
2494*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
2495*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
2496*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
2497*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
2498*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[ADD]]
2499*207e5cccSFangrui Song //
2500*207e5cccSFangrui Song int32x4_t test_vmlal_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
  // Lane-0 variant of the vmlal_lane_s16 test. The expected IR above splats
  // lane 0 of v, multiplies it with b via llvm.aarch64.neon.smull.v4i32, and
  // adds the product to a.
2501*207e5cccSFangrui Song   return vmlal_lane_s16(a, b, v, 0);
2502*207e5cccSFangrui Song }
2503*207e5cccSFangrui Song 
2504*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlal_lane_s32_0(
2505*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2506*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
2507*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2508*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
2509*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
2510*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
2511*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
2512*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
2513*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[ADD]]
2514*207e5cccSFangrui Song //
2515*207e5cccSFangrui Song int64x2_t test_vmlal_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
  // Lane-0 variant of the vmlal_lane_s32 test. The expected IR above splats
  // lane 0 of v, multiplies it with b via llvm.aarch64.neon.smull.v2i64, and
  // adds the product to a.
2516*207e5cccSFangrui Song   return vmlal_lane_s32(a, b, v, 0);
2517*207e5cccSFangrui Song }
2518*207e5cccSFangrui Song 
2519*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlal_laneq_s16_0(
2520*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2521*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
2522*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
2523*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
2524*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
2525*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
2526*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
2527*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
2528*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[ADD]]
2529*207e5cccSFangrui Song //
2530*207e5cccSFangrui Song int32x4_t test_vmlal_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
  // Lane-0 variant of the vmlal_laneq_s16 test (v is an 8-element vector). The
  // expected IR above narrows lane 0 of v to a 4-lane splat, multiplies it
  // with b via llvm.aarch64.neon.smull.v4i32, and adds the product to a.
2531*207e5cccSFangrui Song   return vmlal_laneq_s16(a, b, v, 0);
2532*207e5cccSFangrui Song }
2533*207e5cccSFangrui Song 
2534*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlal_laneq_s32_0(
2535*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2536*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
2537*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
2538*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
2539*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
2540*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
2541*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
2542*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
2543*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[ADD]]
2544*207e5cccSFangrui Song //
2545*207e5cccSFangrui Song int64x2_t test_vmlal_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
  // Lane-0 variant of the vmlal_laneq_s32 test (v is a 4-element vector). The
  // expected IR above narrows lane 0 of v to a 2-lane splat, multiplies it
  // with b via llvm.aarch64.neon.smull.v2i64, and adds the product to a.
2546*207e5cccSFangrui Song   return vmlal_laneq_s32(a, b, v, 0);
2547*207e5cccSFangrui Song }
2548*207e5cccSFangrui Song 
2549*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlal_high_lane_s16_0(
2550*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2551*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2552*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
2553*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2554*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
2555*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2556*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
2557*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
2558*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
2559*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[ADD]]
2560*207e5cccSFangrui Song //
2561*207e5cccSFangrui Song int32x4_t test_vmlal_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
  // Lane-0 variant of the vmlal_high_lane_s16 test. The expected IR above
  // extracts elements 4..7 of b, splats lane 0 of v, multiplies the two via
  // llvm.aarch64.neon.smull.v4i32, and adds the product to a.
2562*207e5cccSFangrui Song   return vmlal_high_lane_s16(a, b, v, 0);
2563*207e5cccSFangrui Song }
2564*207e5cccSFangrui Song 
2565*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlal_high_lane_s32_0(
2566*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2567*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
2568*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
2569*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2570*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
2571*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2572*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
2573*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
2574*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
2575*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[ADD]]
2576*207e5cccSFangrui Song //
2577*207e5cccSFangrui Song int64x2_t test_vmlal_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
  // Lane-0 variant of the vmlal_high_lane_s32 test. The expected IR above
  // extracts elements 2..3 of b, splats lane 0 of v, multiplies the two via
  // llvm.aarch64.neon.smull.v2i64, and adds the product to a.
2578*207e5cccSFangrui Song   return vmlal_high_lane_s32(a, b, v, 0);
2579*207e5cccSFangrui Song }
2580*207e5cccSFangrui Song 
2581*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlal_high_laneq_s16_0(
2582*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2583*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2584*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
2585*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
2586*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
2587*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2588*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
2589*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
2590*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
2591*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[ADD]]
2592*207e5cccSFangrui Song //
2593*207e5cccSFangrui Song int32x4_t test_vmlal_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
  // Lane-0 variant of the vmlal_high_laneq_s16 test. The expected IR above
  // extracts elements 4..7 of b, narrows lane 0 of the 8-element v to a 4-lane
  // splat, multiplies via llvm.aarch64.neon.smull.v4i32, and adds to a.
2594*207e5cccSFangrui Song   return vmlal_high_laneq_s16(a, b, v, 0);
2595*207e5cccSFangrui Song }
2596*207e5cccSFangrui Song 
2597*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlal_high_laneq_s32_0(
2598*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2599*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
2600*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
2601*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
2602*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
2603*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2604*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
2605*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
2606*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
2607*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[ADD]]
2608*207e5cccSFangrui Song //
2609*207e5cccSFangrui Song int64x2_t test_vmlal_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
  // Lane-0 variant of the vmlal_high_laneq_s32 test. The expected IR above
  // extracts elements 2..3 of b, narrows lane 0 of the 4-element v to a 2-lane
  // splat, multiplies via llvm.aarch64.neon.smull.v2i64, and adds to a.
2610*207e5cccSFangrui Song   return vmlal_high_laneq_s32(a, b, v, 0);
2611*207e5cccSFangrui Song }
2612*207e5cccSFangrui Song 
2613*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlsl_lane_s16_0(
2614*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2615*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
2616*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2617*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
2618*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
2619*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
2620*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
2621*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
2622*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[SUB]]
2623*207e5cccSFangrui Song //
2624*207e5cccSFangrui Song int32x4_t test_vmlsl_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
  // Lane-0 variant of the vmlsl_lane_s16 test. The expected IR above splats
  // lane 0 of v, multiplies it with b via llvm.aarch64.neon.smull.v4i32, and
  // subtracts the product from a.
2625*207e5cccSFangrui Song   return vmlsl_lane_s16(a, b, v, 0);
2626*207e5cccSFangrui Song }
2627*207e5cccSFangrui Song 
2628*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlsl_lane_s32_0(
2629*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2630*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
2631*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2632*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
2633*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
2634*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
2635*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
2636*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
2637*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[SUB]]
2638*207e5cccSFangrui Song //
2639*207e5cccSFangrui Song int64x2_t test_vmlsl_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
  // Lane-0 variant of the vmlsl_lane_s32 test. The expected IR above splats
  // lane 0 of v, multiplies it with b via llvm.aarch64.neon.smull.v2i64, and
  // subtracts the product from a.
2640*207e5cccSFangrui Song   return vmlsl_lane_s32(a, b, v, 0);
2641*207e5cccSFangrui Song }
2642*207e5cccSFangrui Song 
2643*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlsl_laneq_s16_0(
2644*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2645*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
2646*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
2647*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
2648*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
2649*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
2650*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
2651*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
2652*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[SUB]]
2653*207e5cccSFangrui Song //
2654*207e5cccSFangrui Song int32x4_t test_vmlsl_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
  // Lane-0 variant of the vmlsl_laneq_s16 test (v is an 8-element vector). The
  // expected IR above narrows lane 0 of v to a 4-lane splat, multiplies it
  // with b via llvm.aarch64.neon.smull.v4i32, and subtracts the product from a.
2655*207e5cccSFangrui Song   return vmlsl_laneq_s16(a, b, v, 0);
2656*207e5cccSFangrui Song }
2657*207e5cccSFangrui Song 
2658*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlsl_laneq_s32_0(
2659*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2660*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
2661*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
2662*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
2663*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
2664*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
2665*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
2666*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
2667*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[SUB]]
2668*207e5cccSFangrui Song //
2669*207e5cccSFangrui Song int64x2_t test_vmlsl_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
  // Lane-0 variant of the vmlsl_laneq_s32 test (v is a 4-element vector). The
  // expected IR above narrows lane 0 of v to a 2-lane splat, multiplies it
  // with b via llvm.aarch64.neon.smull.v2i64, and subtracts the product from a.
2670*207e5cccSFangrui Song   return vmlsl_laneq_s32(a, b, v, 0);
2671*207e5cccSFangrui Song }
2672*207e5cccSFangrui Song 
2673*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlsl_high_lane_s16_0(
2674*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2675*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2676*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
2677*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2678*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
2679*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2680*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
2681*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
2682*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
2683*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[SUB]]
2684*207e5cccSFangrui Song //
2685*207e5cccSFangrui Song int32x4_t test_vmlsl_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
  // Lane-0 variant of the vmlsl_high_lane_s16 test. The expected IR above
  // extracts elements 4..7 of b, splats lane 0 of v, multiplies the two via
  // llvm.aarch64.neon.smull.v4i32, and subtracts the product from a.
2686*207e5cccSFangrui Song   return vmlsl_high_lane_s16(a, b, v, 0);
2687*207e5cccSFangrui Song }
2688*207e5cccSFangrui Song 
2689*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlsl_high_lane_s32_0(
2690*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2691*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
2692*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
2693*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2694*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
2695*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2696*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
2697*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
2698*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
2699*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[SUB]]
2700*207e5cccSFangrui Song //
2701*207e5cccSFangrui Song int64x2_t test_vmlsl_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
  // Lane-0 variant of the vmlsl_high_lane_s32 test. The expected IR above
  // extracts elements 2..3 of b, splats lane 0 of v, multiplies the two via
  // llvm.aarch64.neon.smull.v2i64, and subtracts the product from a.
2702*207e5cccSFangrui Song   return vmlsl_high_lane_s32(a, b, v, 0);
2703*207e5cccSFangrui Song }
2704*207e5cccSFangrui Song 
2705*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlsl_high_laneq_s16_0(
2706*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2707*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2708*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
2709*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
2710*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
2711*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2712*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
2713*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
2714*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
2715*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[SUB]]
2716*207e5cccSFangrui Song //
2717*207e5cccSFangrui Song int32x4_t test_vmlsl_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
  // Lane-0 variant of the vmlsl_high_laneq_s16 test. The expected IR above
  // extracts elements 4..7 of b, narrows lane 0 of the 8-element v to a 4-lane
  // splat, multiplies via llvm.aarch64.neon.smull.v4i32, and subtracts from a.
2718*207e5cccSFangrui Song   return vmlsl_high_laneq_s16(a, b, v, 0);
2719*207e5cccSFangrui Song }
2720*207e5cccSFangrui Song 
2721*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlsl_high_laneq_s32_0(
2722*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2723*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
2724*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
2725*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
2726*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
2727*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2728*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
2729*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
2730*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
2731*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[SUB]]
2732*207e5cccSFangrui Song //
2733*207e5cccSFangrui Song int64x2_t test_vmlsl_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
  // Lane-0 variant of the vmlsl_high_laneq_s32 test. The expected IR above
  // extracts elements 2..3 of b, narrows lane 0 of the 4-element v to a 2-lane
  // splat, multiplies via llvm.aarch64.neon.smull.v2i64, and subtracts from a.
2734*207e5cccSFangrui Song   return vmlsl_high_laneq_s32(a, b, v, 0);
2735*207e5cccSFangrui Song }
2736*207e5cccSFangrui Song 
2737*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlal_lane_u16_0(
2738*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2739*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
2740*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2741*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
2742*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
2743*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
2744*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
2745*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
2746*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[ADD]]
2747*207e5cccSFangrui Song //
2748*207e5cccSFangrui Song int32x4_t test_vmlal_lane_u16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
  // Lane-0 variant of the vmlal_lane_u16 test. The expected IR above splats
  // lane 0 of v, multiplies it with b via llvm.aarch64.neon.umull.v4i32, and
  // adds the product to a.
  // NOTE(review): the parameters and return type are declared with signed
  // vector types although the intrinsic is the unsigned (_u16) variant.
2749*207e5cccSFangrui Song   return vmlal_lane_u16(a, b, v, 0);
2750*207e5cccSFangrui Song }
2751*207e5cccSFangrui Song 
2752*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlal_lane_u32_0(
2753*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2754*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
2755*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2756*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
2757*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
2758*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
2759*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
2760*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
2761*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[ADD]]
2762*207e5cccSFangrui Song //
2763*207e5cccSFangrui Song int64x2_t test_vmlal_lane_u32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
  // Lane-0 variant of the vmlal_lane_u32 test. The expected IR above splats
  // lane 0 of v, multiplies it with b via llvm.aarch64.neon.umull.v2i64, and
  // adds the product to a.
  // NOTE(review): the parameters and return type are declared with signed
  // vector types although the intrinsic is the unsigned (_u32) variant.
2764*207e5cccSFangrui Song   return vmlal_lane_u32(a, b, v, 0);
2765*207e5cccSFangrui Song }
2766*207e5cccSFangrui Song 
2767*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlal_laneq_u16_0(
2768*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2769*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
2770*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
2771*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
2772*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
2773*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
2774*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
2775*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
2776*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[ADD]]
2777*207e5cccSFangrui Song //
2778*207e5cccSFangrui Song int32x4_t test_vmlal_laneq_u16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
  // Lane-0 variant of the vmlal_laneq_u16 test (v is an 8-element vector). The
  // expected IR above narrows lane 0 of v to a 4-lane splat, multiplies it
  // with b via llvm.aarch64.neon.umull.v4i32, and adds the product to a.
  // NOTE(review): the parameters and return type are declared with signed
  // vector types although the intrinsic is the unsigned (_u16) variant.
2779*207e5cccSFangrui Song   return vmlal_laneq_u16(a, b, v, 0);
2780*207e5cccSFangrui Song }
2781*207e5cccSFangrui Song 
2782*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlal_laneq_u32_0(
2783*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2784*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
2785*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
2786*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
2787*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
2788*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
2789*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
2790*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
2791*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[ADD]]
2792*207e5cccSFangrui Song //
2793*207e5cccSFangrui Song int64x2_t test_vmlal_laneq_u32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
  // Lane-0 variant of the vmlal_laneq_u32 test (v is a 4-element vector). The
  // expected IR above narrows lane 0 of v to a 2-lane splat, multiplies it
  // with b via llvm.aarch64.neon.umull.v2i64, and adds the product to a.
  // NOTE(review): the parameters and return type are declared with signed
  // vector types although the intrinsic is the unsigned (_u32) variant.
2794*207e5cccSFangrui Song   return vmlal_laneq_u32(a, b, v, 0);
2795*207e5cccSFangrui Song }
2796*207e5cccSFangrui Song 
2797*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlal_high_lane_u16_0(
2798*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2799*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2800*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
2801*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2802*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
2803*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2804*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
2805*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
2806*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
2807*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[ADD]]
2808*207e5cccSFangrui Song //
// Calls vmlal_high_lane_u16(a, b, v, 0), i.e. with constant lane index 0.
// The expected IR above extracts elements 4-7 of b, splats element 0 of v,
// passes both to umull.v4i32, and adds the product to a.
// NOTE(review): signed vector types are used with the unsigned (_u16)
// intrinsic; presumably an implicit vector conversion -- confirm it is intended.
2809*207e5cccSFangrui Song int32x4_t test_vmlal_high_lane_u16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
2810*207e5cccSFangrui Song   return vmlal_high_lane_u16(a, b, v, 0);
2811*207e5cccSFangrui Song }
2812*207e5cccSFangrui Song 
2813*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlal_high_lane_u32_0(
2814*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2815*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
2816*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
2817*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2818*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
2819*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2820*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
2821*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
2822*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
2823*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[ADD]]
2824*207e5cccSFangrui Song //
// Calls vmlal_high_lane_u32(a, b, v, 0), i.e. with constant lane index 0.
// The expected IR above extracts elements 2-3 of b, splats element 0 of v,
// passes both to umull.v2i64, and adds the product to a.
// NOTE(review): signed vector types are used with the unsigned (_u32)
// intrinsic; presumably an implicit vector conversion -- confirm it is intended.
2825*207e5cccSFangrui Song int64x2_t test_vmlal_high_lane_u32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
2826*207e5cccSFangrui Song   return vmlal_high_lane_u32(a, b, v, 0);
2827*207e5cccSFangrui Song }
2828*207e5cccSFangrui Song 
2829*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlal_high_laneq_u16_0(
2830*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2831*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2832*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
2833*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
2834*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
2835*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2836*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
2837*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
2838*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
2839*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[ADD]]
2840*207e5cccSFangrui Song //
// Calls vmlal_high_laneq_u16(a, b, v, 0), i.e. with constant lane index 0.
// The expected IR above extracts elements 4-7 of b, splats element 0 of the
// <8 x i16> v into a <4 x i16>, passes both to umull.v4i32, and adds the
// product to a.
// NOTE(review): signed vector types are used with the unsigned (_u16)
// intrinsic; presumably an implicit vector conversion -- confirm it is intended.
2841*207e5cccSFangrui Song int32x4_t test_vmlal_high_laneq_u16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
2842*207e5cccSFangrui Song   return vmlal_high_laneq_u16(a, b, v, 0);
2843*207e5cccSFangrui Song }
2844*207e5cccSFangrui Song 
2845*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlal_high_laneq_u32_0(
2846*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2847*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
2848*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
2849*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
2850*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
2851*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2852*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
2853*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
2854*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
2855*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[ADD]]
2856*207e5cccSFangrui Song //
// Calls vmlal_high_laneq_u32(a, b, v, 0), i.e. with constant lane index 0.
// The expected IR above extracts elements 2-3 of b, splats element 0 of the
// <4 x i32> v into a <2 x i32>, passes both to umull.v2i64, and adds the
// product to a.
// NOTE(review): signed vector types are used with the unsigned (_u32)
// intrinsic; presumably an implicit vector conversion -- confirm it is intended.
2857*207e5cccSFangrui Song int64x2_t test_vmlal_high_laneq_u32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
2858*207e5cccSFangrui Song   return vmlal_high_laneq_u32(a, b, v, 0);
2859*207e5cccSFangrui Song }
2860*207e5cccSFangrui Song 
2861*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlsl_lane_u16_0(
2862*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2863*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
2864*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2865*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
2866*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
2867*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
2868*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
2869*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
2870*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[SUB]]
2871*207e5cccSFangrui Song //
// Calls vmlsl_lane_u16(a, b, v, 0), i.e. with constant lane index 0.
// The expected IR above splats element 0 of v, passes b and that splat to
// umull.v4i32, and subtracts the product from a.
// NOTE(review): signed vector types are used with the unsigned (_u16)
// intrinsic; presumably an implicit vector conversion -- confirm it is intended.
2872*207e5cccSFangrui Song int32x4_t test_vmlsl_lane_u16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
2873*207e5cccSFangrui Song   return vmlsl_lane_u16(a, b, v, 0);
2874*207e5cccSFangrui Song }
2875*207e5cccSFangrui Song 
2876*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlsl_lane_u32_0(
2877*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2878*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
2879*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2880*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
2881*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
2882*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
2883*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
2884*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
2885*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[SUB]]
2886*207e5cccSFangrui Song //
// Calls vmlsl_lane_u32(a, b, v, 0), i.e. with constant lane index 0.
// The expected IR above splats element 0 of v, passes b and that splat to
// umull.v2i64, and subtracts the product from a.
// NOTE(review): signed vector types are used with the unsigned (_u32)
// intrinsic; presumably an implicit vector conversion -- confirm it is intended.
2887*207e5cccSFangrui Song int64x2_t test_vmlsl_lane_u32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
2888*207e5cccSFangrui Song   return vmlsl_lane_u32(a, b, v, 0);
2889*207e5cccSFangrui Song }
2890*207e5cccSFangrui Song 
2891*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlsl_laneq_u16_0(
2892*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2893*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
2894*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
2895*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
2896*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
2897*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
2898*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
2899*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
2900*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[SUB]]
2901*207e5cccSFangrui Song //
// Calls vmlsl_laneq_u16(a, b, v, 0), i.e. with constant lane index 0.
// The expected IR above splats element 0 of the <8 x i16> v into a <4 x i16>,
// passes b and that splat to umull.v4i32, and subtracts the product from a.
// NOTE(review): signed vector types are used with the unsigned (_u16)
// intrinsic; presumably an implicit vector conversion -- confirm it is intended.
2902*207e5cccSFangrui Song int32x4_t test_vmlsl_laneq_u16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
2903*207e5cccSFangrui Song   return vmlsl_laneq_u16(a, b, v, 0);
2904*207e5cccSFangrui Song }
2905*207e5cccSFangrui Song 
2906*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlsl_laneq_u32_0(
2907*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2908*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
2909*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
2910*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
2911*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
2912*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
2913*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
2914*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
2915*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[SUB]]
2916*207e5cccSFangrui Song //
// Calls vmlsl_laneq_u32(a, b, v, 0), i.e. with constant lane index 0.
// The expected IR above splats element 0 of the <4 x i32> v into a <2 x i32>,
// passes b and that splat to umull.v2i64, and subtracts the product from a.
// NOTE(review): signed vector types are used with the unsigned (_u32)
// intrinsic; presumably an implicit vector conversion -- confirm it is intended.
2917*207e5cccSFangrui Song int64x2_t test_vmlsl_laneq_u32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
2918*207e5cccSFangrui Song   return vmlsl_laneq_u32(a, b, v, 0);
2919*207e5cccSFangrui Song }
2920*207e5cccSFangrui Song 
2921*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlsl_high_lane_u16_0(
2922*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2923*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2924*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
2925*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2926*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
2927*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2928*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
2929*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
2930*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
2931*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[SUB]]
2932*207e5cccSFangrui Song //
// Calls vmlsl_high_lane_u16(a, b, v, 0), i.e. with constant lane index 0.
// The expected IR above extracts elements 4-7 of b, splats element 0 of v,
// passes both to umull.v4i32, and subtracts the product from a.
// NOTE(review): signed vector types are used with the unsigned (_u16)
// intrinsic; presumably an implicit vector conversion -- confirm it is intended.
2933*207e5cccSFangrui Song int32x4_t test_vmlsl_high_lane_u16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
2934*207e5cccSFangrui Song   return vmlsl_high_lane_u16(a, b, v, 0);
2935*207e5cccSFangrui Song }
2936*207e5cccSFangrui Song 
2937*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlsl_high_lane_u32_0(
2938*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2939*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
2940*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
2941*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2942*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
2943*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2944*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
2945*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
2946*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
2947*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[SUB]]
2948*207e5cccSFangrui Song //
// Calls vmlsl_high_lane_u32(a, b, v, 0), i.e. with constant lane index 0.
// The expected IR above extracts elements 2-3 of b, splats element 0 of v,
// passes both to umull.v2i64, and subtracts the product from a.
// NOTE(review): signed vector types are used with the unsigned (_u32)
// intrinsic; presumably an implicit vector conversion -- confirm it is intended.
2949*207e5cccSFangrui Song int64x2_t test_vmlsl_high_lane_u32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
2950*207e5cccSFangrui Song   return vmlsl_high_lane_u32(a, b, v, 0);
2951*207e5cccSFangrui Song }
2952*207e5cccSFangrui Song 
2953*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlsl_high_laneq_u16_0(
2954*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2955*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2956*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
2957*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
2958*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
2959*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2960*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
2961*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
2962*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
2963*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[SUB]]
2964*207e5cccSFangrui Song //
// Calls vmlsl_high_laneq_u16(a, b, v, 0), i.e. with constant lane index 0.
// The expected IR above extracts elements 4-7 of b, splats element 0 of the
// <8 x i16> v into a <4 x i16>, passes both to umull.v4i32, and subtracts the
// product from a.
// NOTE(review): signed vector types are used with the unsigned (_u16)
// intrinsic; presumably an implicit vector conversion -- confirm it is intended.
2965*207e5cccSFangrui Song int32x4_t test_vmlsl_high_laneq_u16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
2966*207e5cccSFangrui Song   return vmlsl_high_laneq_u16(a, b, v, 0);
2967*207e5cccSFangrui Song }
2968*207e5cccSFangrui Song 
2969*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlsl_high_laneq_u32_0(
2970*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2971*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
2972*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
2973*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
2974*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
2975*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2976*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
2977*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
2978*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
2979*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[SUB]]
2980*207e5cccSFangrui Song //
// Calls vmlsl_high_laneq_u32(a, b, v, 0), i.e. with constant lane index 0.
// The expected IR above extracts elements 2-3 of b, splats element 0 of the
// <4 x i32> v into a <2 x i32>, passes both to umull.v2i64, and subtracts the
// product from a.
// NOTE(review): signed vector types are used with the unsigned (_u32)
// intrinsic; presumably an implicit vector conversion -- confirm it is intended.
2981*207e5cccSFangrui Song int64x2_t test_vmlsl_high_laneq_u32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
2982*207e5cccSFangrui Song   return vmlsl_high_laneq_u32(a, b, v, 0);
2983*207e5cccSFangrui Song }
2984*207e5cccSFangrui Song 
2985*207e5cccSFangrui Song // CHECK-LABEL: @test_vmull_lane_s16_0(
2986*207e5cccSFangrui Song // CHECK-NEXT:  entry:
2987*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
2988*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2989*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
2990*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
2991*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
2992*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
2993*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
2994*207e5cccSFangrui Song //
// Calls vmull_lane_s16(a, v, 0), i.e. with constant lane index 0.
// The expected IR above splats element 0 of v and returns the result of
// smull.v4i32 applied to a and that splat.
2995*207e5cccSFangrui Song int32x4_t test_vmull_lane_s16_0(int16x4_t a, int16x4_t v) {
2996*207e5cccSFangrui Song   return vmull_lane_s16(a, v, 0);
2997*207e5cccSFangrui Song }
2998*207e5cccSFangrui Song 
2999*207e5cccSFangrui Song // CHECK-LABEL: @test_vmull_lane_s32_0(
3000*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3001*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
3002*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
3003*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
3004*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
3005*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
3006*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
3007*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
3008*207e5cccSFangrui Song //
// Calls vmull_lane_s32(a, v, 0), i.e. with constant lane index 0.
// The expected IR above splats element 0 of v and returns the result of
// smull.v2i64 applied to a and that splat.
3009*207e5cccSFangrui Song int64x2_t test_vmull_lane_s32_0(int32x2_t a, int32x2_t v) {
3010*207e5cccSFangrui Song   return vmull_lane_s32(a, v, 0);
3011*207e5cccSFangrui Song }
3012*207e5cccSFangrui Song 
3013*207e5cccSFangrui Song // CHECK-LABEL: @test_vmull_lane_u16_0(
3014*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3015*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
3016*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
3017*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
3018*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
3019*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
3020*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
3021*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
3022*207e5cccSFangrui Song //
// Calls vmull_lane_u16(a, v, 0), i.e. with constant lane index 0.
// The expected IR above splats element 0 of v and returns the result of
// umull.v4i32 applied to a and that splat.
3023*207e5cccSFangrui Song uint32x4_t test_vmull_lane_u16_0(uint16x4_t a, uint16x4_t v) {
3024*207e5cccSFangrui Song   return vmull_lane_u16(a, v, 0);
3025*207e5cccSFangrui Song }
3026*207e5cccSFangrui Song 
3027*207e5cccSFangrui Song // CHECK-LABEL: @test_vmull_lane_u32_0(
3028*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3029*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
3030*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
3031*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
3032*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
3033*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
3034*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
3035*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
3036*207e5cccSFangrui Song //
// Calls vmull_lane_u32(a, v, 0), i.e. with constant lane index 0.
// The expected IR above splats element 0 of v and returns the result of
// umull.v2i64 applied to a and that splat.
3037*207e5cccSFangrui Song uint64x2_t test_vmull_lane_u32_0(uint32x2_t a, uint32x2_t v) {
3038*207e5cccSFangrui Song   return vmull_lane_u32(a, v, 0);
3039*207e5cccSFangrui Song }
3040*207e5cccSFangrui Song 
3041*207e5cccSFangrui Song // CHECK-LABEL: @test_vmull_high_lane_s16_0(
3042*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3043*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3044*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
3045*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
3046*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
3047*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
3048*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
3049*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
3050*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
3051*207e5cccSFangrui Song //
// Calls vmull_high_lane_s16(a, v, 0), i.e. with constant lane index 0.
// The expected IR above extracts elements 4-7 of a, splats element 0 of v,
// and returns the result of smull.v4i32 applied to the two.
3052*207e5cccSFangrui Song int32x4_t test_vmull_high_lane_s16_0(int16x8_t a, int16x4_t v) {
3053*207e5cccSFangrui Song   return vmull_high_lane_s16(a, v, 0);
3054*207e5cccSFangrui Song }
3055*207e5cccSFangrui Song 
3056*207e5cccSFangrui Song // CHECK-LABEL: @test_vmull_high_lane_s32_0(
3057*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3058*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
3059*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
3060*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
3061*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
3062*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
3063*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
3064*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
3065*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
3066*207e5cccSFangrui Song //
// Calls vmull_high_lane_s32(a, v, 0), i.e. with constant lane index 0.
// The expected IR above extracts elements 2-3 of a, splats element 0 of v,
// and returns the result of smull.v2i64 applied to the two.
3067*207e5cccSFangrui Song int64x2_t test_vmull_high_lane_s32_0(int32x4_t a, int32x2_t v) {
3068*207e5cccSFangrui Song   return vmull_high_lane_s32(a, v, 0);
3069*207e5cccSFangrui Song }
3070*207e5cccSFangrui Song 
3071*207e5cccSFangrui Song // CHECK-LABEL: @test_vmull_high_lane_u16_0(
3072*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3073*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3074*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
3075*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
3076*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
3077*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
3078*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
3079*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
3080*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
3081*207e5cccSFangrui Song //
// Calls vmull_high_lane_u16(a, v, 0), i.e. with constant lane index 0.
// The expected IR above extracts elements 4-7 of a, splats element 0 of v,
// and returns the result of umull.v4i32 applied to the two.
3082*207e5cccSFangrui Song uint32x4_t test_vmull_high_lane_u16_0(uint16x8_t a, uint16x4_t v) {
3083*207e5cccSFangrui Song   return vmull_high_lane_u16(a, v, 0);
3084*207e5cccSFangrui Song }
3085*207e5cccSFangrui Song 
3086*207e5cccSFangrui Song // CHECK-LABEL: @test_vmull_high_lane_u32_0(
3087*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3088*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
3089*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
3090*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
3091*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
3092*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
3093*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
3094*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
3095*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
3096*207e5cccSFangrui Song //
// Calls vmull_high_lane_u32(a, v, 0), i.e. with constant lane index 0.
// The expected IR above extracts elements 2-3 of a, splats element 0 of v,
// and returns the result of umull.v2i64 applied to the two.
3097*207e5cccSFangrui Song uint64x2_t test_vmull_high_lane_u32_0(uint32x4_t a, uint32x2_t v) {
3098*207e5cccSFangrui Song   return vmull_high_lane_u32(a, v, 0);
3099*207e5cccSFangrui Song }
3100*207e5cccSFangrui Song 
3101*207e5cccSFangrui Song // CHECK-LABEL: @test_vmull_laneq_s16_0(
3102*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3103*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
3104*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
3105*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
3106*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
3107*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
3108*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
3109*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
3110*207e5cccSFangrui Song //
// Calls vmull_laneq_s16(a, v, 0), i.e. with constant lane index 0.
// The expected IR above splats element 0 of the <8 x i16> v into a <4 x i16>
// and returns the result of smull.v4i32 applied to a and that splat.
3111*207e5cccSFangrui Song int32x4_t test_vmull_laneq_s16_0(int16x4_t a, int16x8_t v) {
3112*207e5cccSFangrui Song   return vmull_laneq_s16(a, v, 0);
3113*207e5cccSFangrui Song }
3114*207e5cccSFangrui Song 
3115*207e5cccSFangrui Song // CHECK-LABEL: @test_vmull_laneq_s32_0(
3116*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3117*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
3118*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
3119*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
3120*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
3121*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
3122*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
3123*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
3124*207e5cccSFangrui Song //
// Calls vmull_laneq_s32(a, v, 0), i.e. with constant lane index 0.
// The expected IR above splats element 0 of the <4 x i32> v into a <2 x i32>
// and returns the result of smull.v2i64 applied to a and that splat.
3125*207e5cccSFangrui Song int64x2_t test_vmull_laneq_s32_0(int32x2_t a, int32x4_t v) {
3126*207e5cccSFangrui Song   return vmull_laneq_s32(a, v, 0);
3127*207e5cccSFangrui Song }
3128*207e5cccSFangrui Song 
3129*207e5cccSFangrui Song // CHECK-LABEL: @test_vmull_laneq_u16_0(
3130*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3131*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
3132*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
3133*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
3134*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
3135*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
3136*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
3137*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
3138*207e5cccSFangrui Song //
// Calls vmull_laneq_u16(a, v, 0), i.e. with constant lane index 0.
// The expected IR above splats element 0 of the <8 x i16> v into a <4 x i16>
// and returns the result of umull.v4i32 applied to a and that splat.
3139*207e5cccSFangrui Song uint32x4_t test_vmull_laneq_u16_0(uint16x4_t a, uint16x8_t v) {
3140*207e5cccSFangrui Song   return vmull_laneq_u16(a, v, 0);
3141*207e5cccSFangrui Song }
3142*207e5cccSFangrui Song 
3143*207e5cccSFangrui Song // CHECK-LABEL: @test_vmull_laneq_u32_0(
3144*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3145*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
3146*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
3147*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
3148*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
3149*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
3150*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
3151*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
3152*207e5cccSFangrui Song //
// Calls vmull_laneq_u32(a, v, 0), i.e. with constant lane index 0.
// The expected IR above splats element 0 of the <4 x i32> v into a <2 x i32>
// and returns the result of umull.v2i64 applied to a and that splat.
3153*207e5cccSFangrui Song uint64x2_t test_vmull_laneq_u32_0(uint32x2_t a, uint32x4_t v) {
3154*207e5cccSFangrui Song   return vmull_laneq_u32(a, v, 0);
3155*207e5cccSFangrui Song }
3156*207e5cccSFangrui Song 
3157*207e5cccSFangrui Song // CHECK-LABEL: @test_vmull_high_laneq_s16_0(
3158*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3159*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3160*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
3161*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
3162*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
3163*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
3164*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
3165*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
3166*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
3167*207e5cccSFangrui Song //
// Calls vmull_high_laneq_s16(a, v, 0), i.e. with constant lane index 0.
// The expected IR above extracts elements 4-7 of a, splats element 0 of the
// <8 x i16> v into a <4 x i16>, and returns the result of smull.v4i32 applied
// to the two.
3168*207e5cccSFangrui Song int32x4_t test_vmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) {
3169*207e5cccSFangrui Song   return vmull_high_laneq_s16(a, v, 0);
3170*207e5cccSFangrui Song }
3171*207e5cccSFangrui Song 
3172*207e5cccSFangrui Song // CHECK-LABEL: @test_vmull_high_laneq_s32_0(
3173*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3174*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
3175*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
3176*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
3177*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
3178*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
3179*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
3180*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
3181*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
3182*207e5cccSFangrui Song //
// Calls vmull_high_laneq_s32 with lane index 0. Per the expected IR above,
// elements 2-3 of `a` and a splat of element 0 of the 4-element `v` are passed
// to llvm.aarch64.neon.smull.v2i64.
3183*207e5cccSFangrui Song int64x2_t test_vmull_high_laneq_s32_0(int32x4_t a, int32x4_t v) {
3184*207e5cccSFangrui Song   return vmull_high_laneq_s32(a, v, 0);
3185*207e5cccSFangrui Song }
3186*207e5cccSFangrui Song 
3187*207e5cccSFangrui Song // CHECK-LABEL: @test_vmull_high_laneq_u16_0(
3188*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3189*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3190*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
3191*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
3192*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
3193*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
3194*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
3195*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
3196*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
3197*207e5cccSFangrui Song //
// Calls vmull_high_laneq_u16 with lane index 0. Per the expected IR above,
// elements 4-7 of `a` and a splat of element 0 of the 8-element `v` are passed
// to the unsigned intrinsic llvm.aarch64.neon.umull.v4i32.
3198*207e5cccSFangrui Song uint32x4_t test_vmull_high_laneq_u16_0(uint16x8_t a, uint16x8_t v) {
3199*207e5cccSFangrui Song   return vmull_high_laneq_u16(a, v, 0);
3200*207e5cccSFangrui Song }
3201*207e5cccSFangrui Song 
3202*207e5cccSFangrui Song // CHECK-LABEL: @test_vmull_high_laneq_u32_0(
3203*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3204*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
3205*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
3206*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
3207*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
3208*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
3209*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
3210*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
3211*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
3212*207e5cccSFangrui Song //
// Calls vmull_high_laneq_u32 with lane index 0. Per the expected IR above,
// elements 2-3 of `a` and a splat of element 0 of the 4-element `v` are passed
// to the unsigned intrinsic llvm.aarch64.neon.umull.v2i64.
3213*207e5cccSFangrui Song uint64x2_t test_vmull_high_laneq_u32_0(uint32x4_t a, uint32x4_t v) {
3214*207e5cccSFangrui Song   return vmull_high_laneq_u32(a, v, 0);
3215*207e5cccSFangrui Song }
3216*207e5cccSFangrui Song 
3217*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmlal_lane_s16_0(
3218*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3219*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
3220*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
3221*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
3222*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
3223*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
3224*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
3225*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
3226*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
3227*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[VQDMLAL_V3_I]]
3228*207e5cccSFangrui Song //
// Calls vqdmlal_lane_s16 with lane index 0. Per the expected IR above, `b` and
// a splat of element 0 of `v` go to llvm.aarch64.neon.sqdmull.v4i32, and that
// result is combined with `a` by llvm.aarch64.neon.sqadd.v4i32.
3229*207e5cccSFangrui Song int32x4_t test_vqdmlal_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
3230*207e5cccSFangrui Song   return vqdmlal_lane_s16(a, b, v, 0);
3231*207e5cccSFangrui Song }
3232*207e5cccSFangrui Song 
3233*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmlal_lane_s32_0(
3234*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3235*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
3236*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
3237*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
3238*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
3239*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
3240*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
3241*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
3242*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
3243*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[VQDMLAL_V3_I]]
3244*207e5cccSFangrui Song //
// Calls vqdmlal_lane_s32 with lane index 0. Per the expected IR above, `b` and
// a splat of element 0 of `v` go to llvm.aarch64.neon.sqdmull.v2i64, and that
// result is combined with `a` by llvm.aarch64.neon.sqadd.v2i64.
3245*207e5cccSFangrui Song int64x2_t test_vqdmlal_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
3246*207e5cccSFangrui Song   return vqdmlal_lane_s32(a, b, v, 0);
3247*207e5cccSFangrui Song }
3248*207e5cccSFangrui Song 
3249*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmlal_high_lane_s16_0(
3250*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3251*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3252*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
3253*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
3254*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
3255*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
3256*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
3257*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
3258*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
3259*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
3260*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[VQDMLAL_V3_I]]
3261*207e5cccSFangrui Song //
// Calls vqdmlal_high_lane_s16 with lane index 0. Per the expected IR above,
// elements 4-7 of `b` and a splat of element 0 of `v` go to
// llvm.aarch64.neon.sqdmull.v4i32, and that result is combined with `a` by
// llvm.aarch64.neon.sqadd.v4i32.
3262*207e5cccSFangrui Song int32x4_t test_vqdmlal_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
3263*207e5cccSFangrui Song   return vqdmlal_high_lane_s16(a, b, v, 0);
3264*207e5cccSFangrui Song }
3265*207e5cccSFangrui Song 
3266*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmlal_high_lane_s32_0(
3267*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3268*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
3269*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
3270*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
3271*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
3272*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
3273*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
3274*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
3275*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
3276*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
3277*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[VQDMLAL_V3_I]]
3278*207e5cccSFangrui Song //
// Calls vqdmlal_high_lane_s32 with lane index 0. Per the expected IR above,
// elements 2-3 of `b` and a splat of element 0 of `v` go to
// llvm.aarch64.neon.sqdmull.v2i64, and that result is combined with `a` by
// llvm.aarch64.neon.sqadd.v2i64.
3279*207e5cccSFangrui Song int64x2_t test_vqdmlal_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
3280*207e5cccSFangrui Song   return vqdmlal_high_lane_s32(a, b, v, 0);
3281*207e5cccSFangrui Song }
3282*207e5cccSFangrui Song 
3283*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmlsl_lane_s16_0(
3284*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3285*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
3286*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
3287*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
3288*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
3289*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
3290*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
3291*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
3292*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
3293*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[VQDMLSL_V3_I]]
3294*207e5cccSFangrui Song //
// Calls vqdmlsl_lane_s16 with lane index 0. Per the expected IR above, `b` and
// a splat of element 0 of `v` go to llvm.aarch64.neon.sqdmull.v4i32, and that
// result is combined with `a` by llvm.aarch64.neon.sqsub.v4i32.
3295*207e5cccSFangrui Song int32x4_t test_vqdmlsl_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
3296*207e5cccSFangrui Song   return vqdmlsl_lane_s16(a, b, v, 0);
3297*207e5cccSFangrui Song }
3298*207e5cccSFangrui Song 
3299*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmlsl_lane_s32_0(
3300*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3301*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
3302*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
3303*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
3304*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
3305*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
3306*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
3307*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
3308*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
3309*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[VQDMLSL_V3_I]]
3310*207e5cccSFangrui Song //
// Calls vqdmlsl_lane_s32 with lane index 0. Per the expected IR above, `b` and
// a splat of element 0 of `v` go to llvm.aarch64.neon.sqdmull.v2i64, and that
// result is combined with `a` by llvm.aarch64.neon.sqsub.v2i64.
3311*207e5cccSFangrui Song int64x2_t test_vqdmlsl_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
3312*207e5cccSFangrui Song   return vqdmlsl_lane_s32(a, b, v, 0);
3313*207e5cccSFangrui Song }
3314*207e5cccSFangrui Song 
3315*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmlsl_high_lane_s16_0(
3316*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3317*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3318*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
3319*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
3320*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
3321*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
3322*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
3323*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
3324*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
3325*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
3326*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[VQDMLSL_V3_I]]
3327*207e5cccSFangrui Song //
// Calls vqdmlsl_high_lane_s16 with lane index 0. Per the expected IR above,
// elements 4-7 of `b` and a splat of element 0 of `v` go to
// llvm.aarch64.neon.sqdmull.v4i32, and that result is combined with `a` by
// llvm.aarch64.neon.sqsub.v4i32.
3328*207e5cccSFangrui Song int32x4_t test_vqdmlsl_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
3329*207e5cccSFangrui Song   return vqdmlsl_high_lane_s16(a, b, v, 0);
3330*207e5cccSFangrui Song }
3331*207e5cccSFangrui Song 
3332*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmlsl_high_lane_s32_0(
3333*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3334*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
3335*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
3336*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
3337*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
3338*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
3339*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
3340*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
3341*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
3342*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
3343*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[VQDMLSL_V3_I]]
3344*207e5cccSFangrui Song //
// Calls vqdmlsl_high_lane_s32 with lane index 0. Per the expected IR above,
// elements 2-3 of `b` and a splat of element 0 of `v` go to
// llvm.aarch64.neon.sqdmull.v2i64, and that result is combined with `a` by
// llvm.aarch64.neon.sqsub.v2i64.
3345*207e5cccSFangrui Song int64x2_t test_vqdmlsl_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
3346*207e5cccSFangrui Song   return vqdmlsl_high_lane_s32(a, b, v, 0);
3347*207e5cccSFangrui Song }
3348*207e5cccSFangrui Song 
3349*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmull_lane_s16_0(
3350*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3351*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
3352*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
3353*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
3354*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
3355*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
3356*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
3357*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
3358*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[VQDMULL_V2_I]]
3359*207e5cccSFangrui Song //
// Calls vqdmull_lane_s16 with lane index 0. Per the expected IR above, `a` and
// a splat of element 0 of `v` are passed to llvm.aarch64.neon.sqdmull.v4i32,
// whose result is returned.
3360*207e5cccSFangrui Song int32x4_t test_vqdmull_lane_s16_0(int16x4_t a, int16x4_t v) {
3361*207e5cccSFangrui Song   return vqdmull_lane_s16(a, v, 0);
3362*207e5cccSFangrui Song }
3363*207e5cccSFangrui Song 
3364*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmull_lane_s32_0(
3365*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3366*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
3367*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
3368*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
3369*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
3370*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
3371*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
3372*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
3373*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[VQDMULL_V2_I]]
3374*207e5cccSFangrui Song //
// Calls vqdmull_lane_s32 with lane index 0. Per the expected IR above, `a` and
// a splat of element 0 of `v` are passed to llvm.aarch64.neon.sqdmull.v2i64,
// whose result is returned.
3375*207e5cccSFangrui Song int64x2_t test_vqdmull_lane_s32_0(int32x2_t a, int32x2_t v) {
3376*207e5cccSFangrui Song   return vqdmull_lane_s32(a, v, 0);
3377*207e5cccSFangrui Song }
3378*207e5cccSFangrui Song 
3379*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmull_laneq_s16_0(
3380*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3381*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
3382*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
3383*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
3384*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
3385*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
3386*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
3387*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
3388*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[VQDMULL_V2_I]]
3389*207e5cccSFangrui Song //
// Calls vqdmull_laneq_s16 with lane index 0. Per the expected IR above,
// element 0 of the 8-element `v` is splatted into a 4-element vector and
// passed with `a` to llvm.aarch64.neon.sqdmull.v4i32.
3390*207e5cccSFangrui Song int32x4_t test_vqdmull_laneq_s16_0(int16x4_t a, int16x8_t v) {
3391*207e5cccSFangrui Song   return vqdmull_laneq_s16(a, v, 0);
3392*207e5cccSFangrui Song }
3393*207e5cccSFangrui Song 
3394*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmull_laneq_s32_0(
3395*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3396*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
3397*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
3398*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
3399*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
3400*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
3401*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
3402*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
3403*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[VQDMULL_V2_I]]
3404*207e5cccSFangrui Song //
// Calls vqdmull_laneq_s32 with lane index 0. Per the expected IR above,
// element 0 of the 4-element `v` is splatted into a 2-element vector and
// passed with `a` to llvm.aarch64.neon.sqdmull.v2i64.
3405*207e5cccSFangrui Song int64x2_t test_vqdmull_laneq_s32_0(int32x2_t a, int32x4_t v) {
3406*207e5cccSFangrui Song   return vqdmull_laneq_s32(a, v, 0);
3407*207e5cccSFangrui Song }
3408*207e5cccSFangrui Song 
3409*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmull_high_lane_s16_0(
3410*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3411*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3412*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
3413*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
3414*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
3415*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
3416*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
3417*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
3418*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
3419*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[VQDMULL_V2_I]]
3420*207e5cccSFangrui Song //
// Calls vqdmull_high_lane_s16 with lane index 0. Per the expected IR above,
// elements 4-7 of `a` and a splat of element 0 of `v` are passed to
// llvm.aarch64.neon.sqdmull.v4i32.
3421*207e5cccSFangrui Song int32x4_t test_vqdmull_high_lane_s16_0(int16x8_t a, int16x4_t v) {
3422*207e5cccSFangrui Song   return vqdmull_high_lane_s16(a, v, 0);
3423*207e5cccSFangrui Song }
3424*207e5cccSFangrui Song 
3425*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmull_high_lane_s32_0(
3426*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3427*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
3428*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
3429*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
3430*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
3431*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
3432*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
3433*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
3434*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
3435*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[VQDMULL_V2_I]]
3436*207e5cccSFangrui Song //
// Calls vqdmull_high_lane_s32 with lane index 0. Per the expected IR above,
// elements 2-3 of `a` and a splat of element 0 of `v` are passed to
// llvm.aarch64.neon.sqdmull.v2i64.
3437*207e5cccSFangrui Song int64x2_t test_vqdmull_high_lane_s32_0(int32x4_t a, int32x2_t v) {
3438*207e5cccSFangrui Song   return vqdmull_high_lane_s32(a, v, 0);
3439*207e5cccSFangrui Song }
3440*207e5cccSFangrui Song 
3441*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmull_high_laneq_s16_0(
3442*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3443*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3444*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
3445*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
3446*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
3447*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
3448*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
3449*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
3450*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
3451*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[VQDMULL_V2_I]]
3452*207e5cccSFangrui Song //
// Calls vqdmull_high_laneq_s16 with lane index 0. Per the expected IR above,
// elements 4-7 of `a` and a 4-element splat of element 0 of the 8-element `v`
// are passed to llvm.aarch64.neon.sqdmull.v4i32.
3453*207e5cccSFangrui Song int32x4_t test_vqdmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) {
3454*207e5cccSFangrui Song   return vqdmull_high_laneq_s16(a, v, 0);
3455*207e5cccSFangrui Song }
3456*207e5cccSFangrui Song 
3457*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmull_high_laneq_s32_0(
3458*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3459*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
3460*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
3461*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
3462*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
3463*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
3464*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
3465*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
3466*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
3467*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[VQDMULL_V2_I]]
3468*207e5cccSFangrui Song //
// Calls vqdmull_high_laneq_s32 with lane index 0. Per the expected IR above,
// elements 2-3 of `a` and a 2-element splat of element 0 of the 4-element `v`
// are passed to llvm.aarch64.neon.sqdmull.v2i64.
3469*207e5cccSFangrui Song int64x2_t test_vqdmull_high_laneq_s32_0(int32x4_t a, int32x4_t v) {
3470*207e5cccSFangrui Song   return vqdmull_high_laneq_s32(a, v, 0);
3471*207e5cccSFangrui Song }
3472*207e5cccSFangrui Song 
3473*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmulh_lane_s16_0(
3474*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3475*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
3476*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
3477*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
3478*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
3479*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULH_LANE_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.lane.v4i16.v4i16(<4 x i16> [[VQDMULH_LANE_V]], <4 x i16> [[VQDMULH_LANE_V1]], i32 0)
3480*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i16> [[VQDMULH_LANE_V2]]
3481*207e5cccSFangrui Song //
// Calls vqdmulh_lane_s16 with lane index 0. Per the expected IR above, no
// splat shuffle is emitted: `a` and `v` (after bitcast round-trips) and the
// lane index `i32 0` are passed to
// llvm.aarch64.neon.sqdmulh.lane.v4i16.v4i16.
3482*207e5cccSFangrui Song int16x4_t test_vqdmulh_lane_s16_0(int16x4_t a, int16x4_t v) {
3483*207e5cccSFangrui Song   return vqdmulh_lane_s16(a, v, 0);
3484*207e5cccSFangrui Song }
3485*207e5cccSFangrui Song 
3486*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmulhq_lane_s16_0(
3487*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3488*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
3489*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
3490*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
3491*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
3492*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULHQ_LANE_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.lane.v8i16.v4i16(<8 x i16> [[VQDMULHQ_LANE_V]], <4 x i16> [[VQDMULHQ_LANE_V1]], i32 0)
3493*207e5cccSFangrui Song // CHECK-NEXT:    ret <8 x i16> [[VQDMULHQ_LANE_V2]]
3494*207e5cccSFangrui Song //
// Calls vqdmulhq_lane_s16 with lane index 0. Per the expected IR above, the
// 8-element `a`, the 4-element `v` (both after bitcast round-trips) and the
// lane index `i32 0` are passed to
// llvm.aarch64.neon.sqdmulh.lane.v8i16.v4i16.
3495*207e5cccSFangrui Song int16x8_t test_vqdmulhq_lane_s16_0(int16x8_t a, int16x4_t v) {
3496*207e5cccSFangrui Song   return vqdmulhq_lane_s16(a, v, 0);
3497*207e5cccSFangrui Song }
3498*207e5cccSFangrui Song 
3499*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmulh_lane_s32_0(
3500*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3501*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
3502*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
3503*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
3504*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
3505*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULH_LANE_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.lane.v2i32.v2i32(<2 x i32> [[VQDMULH_LANE_V]], <2 x i32> [[VQDMULH_LANE_V1]], i32 0)
3506*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i32> [[VQDMULH_LANE_V2]]
3507*207e5cccSFangrui Song //
// Calls vqdmulh_lane_s32 with lane index 0. Per the expected IR above, `a`
// and `v` (after bitcast round-trips) and the lane index `i32 0` are passed
// to llvm.aarch64.neon.sqdmulh.lane.v2i32.v2i32.
3508*207e5cccSFangrui Song int32x2_t test_vqdmulh_lane_s32_0(int32x2_t a, int32x2_t v) {
3509*207e5cccSFangrui Song   return vqdmulh_lane_s32(a, v, 0);
3510*207e5cccSFangrui Song }
3511*207e5cccSFangrui Song 
3512*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmulhq_lane_s32_0(
3513*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3514*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
3515*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
3516*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
3517*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
3518*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULHQ_LANE_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.lane.v4i32.v2i32(<4 x i32> [[VQDMULHQ_LANE_V]], <2 x i32> [[VQDMULHQ_LANE_V1]], i32 0)
3519*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[VQDMULHQ_LANE_V2]]
3520*207e5cccSFangrui Song //
// Calls vqdmulhq_lane_s32 with lane index 0. Per the expected IR above, the
// 4-element `a`, the 2-element `v` (both after bitcast round-trips) and the
// lane index `i32 0` are passed to
// llvm.aarch64.neon.sqdmulh.lane.v4i32.v2i32.
3521*207e5cccSFangrui Song int32x4_t test_vqdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) {
3522*207e5cccSFangrui Song   return vqdmulhq_lane_s32(a, v, 0);
3523*207e5cccSFangrui Song }
3524*207e5cccSFangrui Song 
3525*207e5cccSFangrui Song // CHECK-LABEL: @test_vqrdmulh_lane_s16_0(
3526*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3527*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
3528*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
3529*207e5cccSFangrui Song // CHECK-NEXT:    [[VQRDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
3530*207e5cccSFangrui Song // CHECK-NEXT:    [[VQRDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
3531*207e5cccSFangrui Song // CHECK-NEXT:    [[VQRDMULH_LANE_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v4i16.v4i16(<4 x i16> [[VQRDMULH_LANE_V]], <4 x i16> [[VQRDMULH_LANE_V1]], i32 0)
3532*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i16> [[VQRDMULH_LANE_V2]]
3533*207e5cccSFangrui Song //
// Calls vqrdmulh_lane_s16 with lane index 0. Per the expected IR above, `a`
// and `v` (after bitcast round-trips) and the lane index `i32 0` are passed
// to llvm.aarch64.neon.sqrdmulh.lane.v4i16.v4i16.
3534*207e5cccSFangrui Song int16x4_t test_vqrdmulh_lane_s16_0(int16x4_t a, int16x4_t v) {
3535*207e5cccSFangrui Song   return vqrdmulh_lane_s16(a, v, 0);
3536*207e5cccSFangrui Song }
3537*207e5cccSFangrui Song 
3538*207e5cccSFangrui Song // CHECK-LABEL: @test_vqrdmulhq_lane_s16_0(
3539*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3540*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
3541*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
3542*207e5cccSFangrui Song // CHECK-NEXT:    [[VQRDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
3543*207e5cccSFangrui Song // CHECK-NEXT:    [[VQRDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
3544*207e5cccSFangrui Song // CHECK-NEXT:    [[VQRDMULHQ_LANE_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v8i16.v4i16(<8 x i16> [[VQRDMULHQ_LANE_V]], <4 x i16> [[VQRDMULHQ_LANE_V1]], i32 0)
3545*207e5cccSFangrui Song // CHECK-NEXT:    ret <8 x i16> [[VQRDMULHQ_LANE_V2]]
3546*207e5cccSFangrui Song //
// Calls vqrdmulhq_lane_s16 with lane index 0. Per the expected IR above, the
// 8-element `a`, the 4-element `v` (both after bitcast round-trips) and the
// lane index `i32 0` are passed to
// llvm.aarch64.neon.sqrdmulh.lane.v8i16.v4i16.
3547*207e5cccSFangrui Song int16x8_t test_vqrdmulhq_lane_s16_0(int16x8_t a, int16x4_t v) {
3548*207e5cccSFangrui Song   return vqrdmulhq_lane_s16(a, v, 0);
3549*207e5cccSFangrui Song }
3550*207e5cccSFangrui Song 
3551*207e5cccSFangrui Song // CHECK-LABEL: @test_vqrdmulh_lane_s32_0(
3552*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3553*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
3554*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
3555*207e5cccSFangrui Song // CHECK-NEXT:    [[VQRDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
3556*207e5cccSFangrui Song // CHECK-NEXT:    [[VQRDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
3557*207e5cccSFangrui Song // CHECK-NEXT:    [[VQRDMULH_LANE_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v2i32.v2i32(<2 x i32> [[VQRDMULH_LANE_V]], <2 x i32> [[VQRDMULH_LANE_V1]], i32 0)
3558*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i32> [[VQRDMULH_LANE_V2]]
3559*207e5cccSFangrui Song //
3560*207e5cccSFangrui Song int32x2_t test_vqrdmulh_lane_s32_0(int32x2_t a, int32x2_t v) { // Exercises vqrdmulh_lane_s32 with constant lane index 0.
3561*207e5cccSFangrui Song   return vqrdmulh_lane_s32(a, v, 0); // Assertions above expect a call to @llvm.aarch64.neon.sqrdmulh.lane.v2i32.v2i32 with lane operand i32 0.
3562*207e5cccSFangrui Song }
3563*207e5cccSFangrui Song 
3564*207e5cccSFangrui Song // CHECK-LABEL: @test_vqrdmulhq_lane_s32_0(
3565*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3566*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
3567*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
3568*207e5cccSFangrui Song // CHECK-NEXT:    [[VQRDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
3569*207e5cccSFangrui Song // CHECK-NEXT:    [[VQRDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
3570*207e5cccSFangrui Song // CHECK-NEXT:    [[VQRDMULHQ_LANE_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v4i32.v2i32(<4 x i32> [[VQRDMULHQ_LANE_V]], <2 x i32> [[VQRDMULHQ_LANE_V1]], i32 0)
3571*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[VQRDMULHQ_LANE_V2]]
3572*207e5cccSFangrui Song //
3573*207e5cccSFangrui Song int32x4_t test_vqrdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) { // Exercises vqrdmulhq_lane_s32 (4-element a, 2-element v) with constant lane index 0.
3574*207e5cccSFangrui Song   return vqrdmulhq_lane_s32(a, v, 0); // Assertions above expect a call to @llvm.aarch64.neon.sqrdmulh.lane.v4i32.v2i32 with lane operand i32 0.
3575*207e5cccSFangrui Song }
3576*207e5cccSFangrui Song 
3577*207e5cccSFangrui Song // CHECK-LABEL: @test_vmul_lane_f32_0(
3578*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3579*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
3580*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
3581*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> zeroinitializer
3582*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[LANE]]
3583*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x float> [[MUL]]
3584*207e5cccSFangrui Song //
3585*207e5cccSFangrui Song float32x2_t test_vmul_lane_f32_0(float32x2_t a, float32x2_t v) { // Exercises vmul_lane_f32 with constant lane index 0.
3586*207e5cccSFangrui Song   return vmul_lane_f32(a, v, 0); // Assertions above expect a shufflevector of v with an all-zero <2 x i32> mask, then an fmul of a with that vector.
3587*207e5cccSFangrui Song }
3588*207e5cccSFangrui Song 
3589*207e5cccSFangrui Song // CHECK-LABEL: @test_vmulq_lane_f32_0(
3590*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3591*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
3592*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
3593*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> zeroinitializer
3594*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[LANE]]
3595*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x float> [[MUL]]
3596*207e5cccSFangrui Song //
3597*207e5cccSFangrui Song float32x4_t test_vmulq_lane_f32_0(float32x4_t a, float32x2_t v) { // Exercises vmulq_lane_f32 (4-element a, 2-element v) with constant lane index 0.
3598*207e5cccSFangrui Song   return vmulq_lane_f32(a, v, 0); // Assertions above expect a shufflevector of v with an all-zero <4 x i32> mask, then a <4 x float> fmul with a.
3599*207e5cccSFangrui Song }
3600*207e5cccSFangrui Song 
3601*207e5cccSFangrui Song // CHECK-LABEL: @test_vmul_laneq_f32_0(
3602*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3603*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
3604*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
3605*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> zeroinitializer
3606*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[LANE]]
3607*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x float> [[MUL]]
3608*207e5cccSFangrui Song //
3609*207e5cccSFangrui Song float32x2_t test_vmul_laneq_f32_0(float32x2_t a, float32x4_t v) { // Exercises vmul_laneq_f32 (2-element a, 4-element v) with constant lane index 0.
3610*207e5cccSFangrui Song   return vmul_laneq_f32(a, v, 0); // Assertions above expect a shufflevector of the <4 x float> v with an all-zero <2 x i32> mask, then a <2 x float> fmul with a.
3611*207e5cccSFangrui Song }
3612*207e5cccSFangrui Song 
3613*207e5cccSFangrui Song // CHECK-LABEL: @test_vmul_laneq_f64_0(
3614*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3615*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[A:%.*]] to <8 x i8>
3616*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
3617*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double
3618*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
3619*207e5cccSFangrui Song // CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
3620*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]]
3621*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double>
3622*207e5cccSFangrui Song // CHECK-NEXT:    ret <1 x double> [[TMP5]]
3623*207e5cccSFangrui Song //
3624*207e5cccSFangrui Song float64x1_t test_vmul_laneq_f64_0(float64x1_t a, float64x2_t v) { // Exercises vmul_laneq_f64 (1-element a, 2-element v) with constant lane index 0.
3625*207e5cccSFangrui Song   return vmul_laneq_f64(a, v, 0); // Assertions above expect scalar IR here: a is bitcast to double, element 0 of v is extracted, the two are fmul'ed, and the product is bitcast to <1 x double>.
3626*207e5cccSFangrui Song }
3627*207e5cccSFangrui Song 
3628*207e5cccSFangrui Song // CHECK-LABEL: @test_vmulq_laneq_f32_0(
3629*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3630*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
3631*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
3632*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> zeroinitializer
3633*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[LANE]]
3634*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x float> [[MUL]]
3635*207e5cccSFangrui Song //
3636*207e5cccSFangrui Song float32x4_t test_vmulq_laneq_f32_0(float32x4_t a, float32x4_t v) { // Exercises vmulq_laneq_f32 with constant lane index 0.
3637*207e5cccSFangrui Song   return vmulq_laneq_f32(a, v, 0); // Assertions above expect a shufflevector of v with an all-zero <4 x i32> mask, then a <4 x float> fmul with a.
3638*207e5cccSFangrui Song }
3639*207e5cccSFangrui Song 
3640*207e5cccSFangrui Song // CHECK-LABEL: @test_vmulq_laneq_f64_0(
3641*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3642*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
3643*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
3644*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP1]], <2 x i32> zeroinitializer
3645*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = fmul <2 x double> [[A:%.*]], [[LANE]]
3646*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x double> [[MUL]]
3647*207e5cccSFangrui Song //
3648*207e5cccSFangrui Song float64x2_t test_vmulq_laneq_f64_0(float64x2_t a, float64x2_t v) { // Exercises vmulq_laneq_f64 with constant lane index 0.
3649*207e5cccSFangrui Song   return vmulq_laneq_f64(a, v, 0); // Assertions above expect a shufflevector of v with an all-zero <2 x i32> mask, then a <2 x double> fmul with a.
3650*207e5cccSFangrui Song }
3651*207e5cccSFangrui Song 
3652*207e5cccSFangrui Song // CHECK-LABEL: @test_vmulx_lane_f32_0(
3653*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3654*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
3655*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
3656*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> zeroinitializer
3657*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
3658*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8>
3659*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]])
3660*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x float> [[VMULX2_I]]
3661*207e5cccSFangrui Song //
3662*207e5cccSFangrui Song float32x2_t test_vmulx_lane_f32_0(float32x2_t a, float32x2_t v) { // Exercises vmulx_lane_f32 with constant lane index 0.
3663*207e5cccSFangrui Song   return vmulx_lane_f32(a, v, 0); // Assertions above expect a shufflevector of v with an all-zero <2 x i32> mask, then a call to @llvm.aarch64.neon.fmulx.v2f32 on a and that vector.
3664*207e5cccSFangrui Song }
3665*207e5cccSFangrui Song 
3666*207e5cccSFangrui Song // CHECK-LABEL: @test_vmulxq_lane_f32_0(
3667*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3668*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
3669*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
3670*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> zeroinitializer
3671*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
3672*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8>
3673*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]])
3674*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x float> [[VMULX2_I]]
3675*207e5cccSFangrui Song //
3676*207e5cccSFangrui Song float32x4_t test_vmulxq_lane_f32_0(float32x4_t a, float32x2_t v) { // Exercises vmulxq_lane_f32 (4-element a, 2-element v) with constant lane index 0.
3677*207e5cccSFangrui Song   return vmulxq_lane_f32(a, v, 0); // Assertions above expect a shufflevector of v with an all-zero <4 x i32> mask, then a call to @llvm.aarch64.neon.fmulx.v4f32 on a and that vector.
3678*207e5cccSFangrui Song }
3679*207e5cccSFangrui Song 
3680*207e5cccSFangrui Song // CHECK-LABEL: @test_vmulxq_lane_f64_0(
3681*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3682*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8>
3683*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
3684*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP1]], <2 x i32> zeroinitializer
3685*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
3686*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x double> [[LANE]] to <16 x i8>
3687*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]])
3688*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x double> [[VMULX2_I]]
3689*207e5cccSFangrui Song //
3690*207e5cccSFangrui Song float64x2_t test_vmulxq_lane_f64_0(float64x2_t a, float64x1_t v) { // Exercises vmulxq_lane_f64 (2-element a, 1-element v) with constant lane index 0.
3691*207e5cccSFangrui Song   return vmulxq_lane_f64(a, v, 0); // Assertions above expect the <1 x double> v widened to <2 x double> by a shufflevector with an all-zero mask, then a call to @llvm.aarch64.neon.fmulx.v2f64.
3692*207e5cccSFangrui Song }
3693*207e5cccSFangrui Song 
3694*207e5cccSFangrui Song // CHECK-LABEL: @test_vmulx_laneq_f32_0(
3695*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3696*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
3697*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
3698*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> zeroinitializer
3699*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
3700*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8>
3701*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]])
3702*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x float> [[VMULX2_I]]
3703*207e5cccSFangrui Song //
3704*207e5cccSFangrui Song float32x2_t test_vmulx_laneq_f32_0(float32x2_t a, float32x4_t v) { // Exercises vmulx_laneq_f32 (2-element a, 4-element v) with constant lane index 0.
3705*207e5cccSFangrui Song   return vmulx_laneq_f32(a, v, 0); // Assertions above expect a shufflevector of the <4 x float> v with an all-zero <2 x i32> mask, then a call to @llvm.aarch64.neon.fmulx.v2f32.
3706*207e5cccSFangrui Song }
3707*207e5cccSFangrui Song 
3708*207e5cccSFangrui Song // CHECK-LABEL: @test_vmulxq_laneq_f32_0(
3709*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3710*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
3711*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
3712*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> zeroinitializer
3713*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
3714*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8>
3715*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]])
3716*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x float> [[VMULX2_I]]
3717*207e5cccSFangrui Song //
3718*207e5cccSFangrui Song float32x4_t test_vmulxq_laneq_f32_0(float32x4_t a, float32x4_t v) { // Exercises vmulxq_laneq_f32 with constant lane index 0.
3719*207e5cccSFangrui Song   return vmulxq_laneq_f32(a, v, 0); // Assertions above expect a shufflevector of v with an all-zero <4 x i32> mask, then a call to @llvm.aarch64.neon.fmulx.v4f32.
3720*207e5cccSFangrui Song }
3721*207e5cccSFangrui Song 
3722*207e5cccSFangrui Song // CHECK-LABEL: @test_vmulxq_laneq_f64_0(
3723*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3724*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
3725*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
3726*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP1]], <2 x i32> zeroinitializer
3727*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
3728*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x double> [[LANE]] to <16 x i8>
3729*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]])
3730*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x double> [[VMULX2_I]]
3731*207e5cccSFangrui Song //
3732*207e5cccSFangrui Song float64x2_t test_vmulxq_laneq_f64_0(float64x2_t a, float64x2_t v) { // Exercises vmulxq_laneq_f64 with constant lane index 0.
3733*207e5cccSFangrui Song   return vmulxq_laneq_f64(a, v, 0); // Assertions above expect a shufflevector of v with an all-zero <2 x i32> mask, then a call to @llvm.aarch64.neon.fmulx.v2f64.
3734*207e5cccSFangrui Song }
3735*207e5cccSFangrui Song 
3736*207e5cccSFangrui Song // CHECK-LABEL: @test_vmull_high_n_s16(
3737*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3738*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3739*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i32 0
3740*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
3741*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
3742*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
3743*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
3744*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3745*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
3746*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I_I]]
3747*207e5cccSFangrui Song //
3748*207e5cccSFangrui Song int32x4_t test_vmull_high_n_s16(int16x8_t a, int16_t b) { // Exercises vmull_high_n_s16 with a vector a and a scalar b.
3749*207e5cccSFangrui Song   return vmull_high_n_s16(a, b); // Assertions above expect elements 4-7 of a selected by shufflevector, b inserted into all four elements of a <4 x i16>, then a call to @llvm.aarch64.neon.smull.v4i32.
3750*207e5cccSFangrui Song }
3751*207e5cccSFangrui Song 
3752*207e5cccSFangrui Song // CHECK-LABEL: @test_vmull_high_n_s32(
3753*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3754*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
3755*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i32 0
3756*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
3757*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
3758*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3759*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
3760*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I_I]]
3761*207e5cccSFangrui Song //
3762*207e5cccSFangrui Song int64x2_t test_vmull_high_n_s32(int32x4_t a, int32_t b) { // Exercises vmull_high_n_s32 with a vector a and a scalar b.
3763*207e5cccSFangrui Song   return vmull_high_n_s32(a, b); // Assertions above expect elements 2-3 of a selected by shufflevector, b inserted into both elements of a <2 x i32>, then a call to @llvm.aarch64.neon.smull.v2i64.
3764*207e5cccSFangrui Song }
3765*207e5cccSFangrui Song 
3766*207e5cccSFangrui Song // CHECK-LABEL: @test_vmull_high_n_u16(
3767*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3768*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3769*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i32 0
3770*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
3771*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
3772*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
3773*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
3774*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3775*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
3776*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I_I]]
3777*207e5cccSFangrui Song //
3778*207e5cccSFangrui Song uint32x4_t test_vmull_high_n_u16(uint16x8_t a, uint16_t b) { // Exercises vmull_high_n_u16 with a vector a and a scalar b.
3779*207e5cccSFangrui Song   return vmull_high_n_u16(a, b); // Assertions above expect elements 4-7 of a selected by shufflevector, b inserted into all four elements of a <4 x i16>, then a call to @llvm.aarch64.neon.umull.v4i32.
3780*207e5cccSFangrui Song }
3781*207e5cccSFangrui Song 
3782*207e5cccSFangrui Song // CHECK-LABEL: @test_vmull_high_n_u32(
3783*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3784*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
3785*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i32 0
3786*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
3787*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
3788*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3789*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
3790*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I_I]]
3791*207e5cccSFangrui Song //
3792*207e5cccSFangrui Song uint64x2_t test_vmull_high_n_u32(uint32x4_t a, uint32_t b) { // Exercises vmull_high_n_u32 with a vector a and a scalar b.
3793*207e5cccSFangrui Song   return vmull_high_n_u32(a, b); // Assertions above expect elements 2-3 of a selected by shufflevector, b inserted into both elements of a <2 x i32>, then a call to @llvm.aarch64.neon.umull.v2i64.
3794*207e5cccSFangrui Song }
3795*207e5cccSFangrui Song 
3796*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmull_high_n_s16(
3797*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3798*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3799*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i32 0
3800*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
3801*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
3802*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
3803*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
3804*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3805*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULL_V2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
3806*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULL_V3_I_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I_I]] to <16 x i8>
3807*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[VQDMULL_V2_I_I]]
3808*207e5cccSFangrui Song //
3809*207e5cccSFangrui Song int32x4_t test_vqdmull_high_n_s16(int16x8_t a, int16_t b) { // Exercises vqdmull_high_n_s16 with a vector a and a scalar b.
3810*207e5cccSFangrui Song   return vqdmull_high_n_s16(a, b); // Assertions above expect elements 4-7 of a selected by shufflevector, b inserted into all four elements of a <4 x i16>, then a call to @llvm.aarch64.neon.sqdmull.v4i32 whose result is returned.
3811*207e5cccSFangrui Song }
3812*207e5cccSFangrui Song 
3813*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmull_high_n_s32(
3814*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3815*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
3816*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i32 0
3817*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
3818*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
3819*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3820*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULL_V2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
3821*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULL_V3_I_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I_I]] to <16 x i8>
3822*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[VQDMULL_V2_I_I]]
3823*207e5cccSFangrui Song //
3824*207e5cccSFangrui Song int64x2_t test_vqdmull_high_n_s32(int32x4_t a, int32_t b) { // Exercises vqdmull_high_n_s32 with a vector a and a scalar b.
3825*207e5cccSFangrui Song   return vqdmull_high_n_s32(a, b); // Assertions above expect elements 2-3 of a selected by shufflevector, b inserted into both elements of a <2 x i32>, then a call to @llvm.aarch64.neon.sqdmull.v2i64 whose result is returned.
3826*207e5cccSFangrui Song }
3827*207e5cccSFangrui Song 
3828*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlal_high_n_s16(
3829*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3830*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3831*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
3832*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
3833*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
3834*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
3835*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
3836*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3837*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
3838*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD_I:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
3839*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[ADD_I]]
3840*207e5cccSFangrui Song //
3841*207e5cccSFangrui Song int32x4_t test_vmlal_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) { // Exercises vmlal_high_n_s16 with accumulator a, vector b and scalar c.
3842*207e5cccSFangrui Song   return vmlal_high_n_s16(a, b, c); // Assertions above expect elements 4-7 of b and c inserted into all four elements of a <4 x i16> passed to @llvm.aarch64.neon.smull.v4i32, with the product added to a.
3843*207e5cccSFangrui Song }
3844*207e5cccSFangrui Song 
3845*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlal_high_n_s32(
3846*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3847*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
3848*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
3849*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
3850*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
3851*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3852*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
3853*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD_I:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
3854*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[ADD_I]]
3855*207e5cccSFangrui Song //
3856*207e5cccSFangrui Song int64x2_t test_vmlal_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) { // Exercises vmlal_high_n_s32 with accumulator a, vector b and scalar c.
3857*207e5cccSFangrui Song   return vmlal_high_n_s32(a, b, c); // Assertions above expect elements 2-3 of b and c inserted into both elements of a <2 x i32> passed to @llvm.aarch64.neon.smull.v2i64, with the product added to a.
3858*207e5cccSFangrui Song }
3859*207e5cccSFangrui Song 
3860*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlal_high_n_u16(
3861*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3862*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3863*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
3864*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
3865*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
3866*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
3867*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
3868*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3869*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
3870*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD_I:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
3871*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[ADD_I]]
3872*207e5cccSFangrui Song //
3873*207e5cccSFangrui Song uint32x4_t test_vmlal_high_n_u16(uint32x4_t a, uint16x8_t b, uint16_t c) { // Exercises vmlal_high_n_u16 with accumulator a, vector b and scalar c.
3874*207e5cccSFangrui Song   return vmlal_high_n_u16(a, b, c); // Assertions above expect elements 4-7 of b and c inserted into all four elements of a <4 x i16> passed to @llvm.aarch64.neon.umull.v4i32, with the product added to a.
3875*207e5cccSFangrui Song }
3876*207e5cccSFangrui Song 
3877*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlal_high_n_u32(
3878*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3879*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
3880*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
3881*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
3882*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
3883*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3884*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
3885*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD_I:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
3886*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[ADD_I]]
3887*207e5cccSFangrui Song //
3888*207e5cccSFangrui Song uint64x2_t test_vmlal_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) { // Exercises vmlal_high_n_u32 with accumulator a, vector b and scalar c.
3889*207e5cccSFangrui Song   return vmlal_high_n_u32(a, b, c); // Assertions above expect elements 2-3 of b and c inserted into both elements of a <2 x i32> passed to @llvm.aarch64.neon.umull.v2i64, with the product added to a.
3890*207e5cccSFangrui Song }
3891*207e5cccSFangrui Song 
3892*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmlal_high_n_s16(
3893*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3894*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3895*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
3896*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
3897*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
3898*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
3899*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
3900*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
3901*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3902*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
3903*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLAL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I]])
3904*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[VQDMLAL_V3_I_I]]
3905*207e5cccSFangrui Song //
3906*207e5cccSFangrui Song int32x4_t test_vqdmlal_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) { // Exercises vqdmlal_high_n_s16 with accumulator a, vector b and scalar c.
3907*207e5cccSFangrui Song   return vqdmlal_high_n_s16(a, b, c); // Assertions above expect elements 4-7 of b and c inserted into all four elements of a <4 x i16> passed to @llvm.aarch64.neon.sqdmull.v4i32, then @llvm.aarch64.neon.sqadd.v4i32 of a with that product.
3908*207e5cccSFangrui Song }
3909*207e5cccSFangrui Song 
3910*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmlal_high_n_s32(
3911*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3912*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
3913*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
3914*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
3915*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
3916*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
3917*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3918*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
3919*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLAL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I]])
3920*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[VQDMLAL_V3_I_I]]
3921*207e5cccSFangrui Song //
3922*207e5cccSFangrui Song int64x2_t test_vqdmlal_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) { // Exercises vqdmlal_high_n_s32 with accumulator a, vector b and scalar c.
3923*207e5cccSFangrui Song   return vqdmlal_high_n_s32(a, b, c); // Assertions above expect elements 2-3 of b and c inserted into both elements of a <2 x i32> passed to @llvm.aarch64.neon.sqdmull.v2i64, then @llvm.aarch64.neon.sqadd.v2i64 of a with that product.
3924*207e5cccSFangrui Song }
3925*207e5cccSFangrui Song 
3926*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlsl_high_n_s16(
3927*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3928*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3929*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
3930*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
3931*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
3932*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
3933*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
3934*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3935*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
3936*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB_I:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
3937*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[SUB_I]]
3938*207e5cccSFangrui Song //
// Forwards a, b and the scalar c unchanged to vmlsl_high_n_s16.
// Expected IR above: lanes 4..7 of b are extracted, c is inserted into all
// four lanes of a <4 x i16>, and the smull.v4i32 product is subtracted from a
// with a plain `sub`.
3939*207e5cccSFangrui Song int32x4_t test_vmlsl_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
3940*207e5cccSFangrui Song   return vmlsl_high_n_s16(a, b, c);
3941*207e5cccSFangrui Song }
3942*207e5cccSFangrui Song 
3943*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlsl_high_n_s32(
3944*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3945*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
3946*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
3947*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
3948*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
3949*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3950*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
3951*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB_I:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
3952*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[SUB_I]]
3953*207e5cccSFangrui Song //
// Forwards a, b and the scalar c unchanged to vmlsl_high_n_s32.
// Expected IR above: lanes 2..3 of b are extracted, c is inserted into both
// lanes of a <2 x i32>, and the smull.v2i64 product is subtracted from a with
// a plain `sub`.
3954*207e5cccSFangrui Song int64x2_t test_vmlsl_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
3955*207e5cccSFangrui Song   return vmlsl_high_n_s32(a, b, c);
3956*207e5cccSFangrui Song }
3957*207e5cccSFangrui Song 
3958*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlsl_high_n_u16(
3959*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3960*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3961*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
3962*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
3963*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
3964*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
3965*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
3966*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3967*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
3968*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB_I:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
3969*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[SUB_I]]
3970*207e5cccSFangrui Song //
// Forwards a, b and the scalar c unchanged to vmlsl_high_n_u16.
// Expected IR above: lanes 4..7 of b are extracted, c is inserted into all
// four lanes of a <4 x i16>, and the umull.v4i32 product is subtracted from a
// with a plain `sub`.
3971*207e5cccSFangrui Song uint32x4_t test_vmlsl_high_n_u16(uint32x4_t a, uint16x8_t b, uint16_t c) {
3972*207e5cccSFangrui Song   return vmlsl_high_n_u16(a, b, c);
3973*207e5cccSFangrui Song }
3974*207e5cccSFangrui Song 
3975*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlsl_high_n_u32(
3976*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3977*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
3978*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
3979*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
3980*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
3981*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3982*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
3983*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB_I:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
3984*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[SUB_I]]
3985*207e5cccSFangrui Song //
// Forwards a, b and the scalar c unchanged to vmlsl_high_n_u32.
// Expected IR above: lanes 2..3 of b are extracted, c is inserted into both
// lanes of a <2 x i32>, and the umull.v2i64 product is subtracted from a with
// a plain `sub`.
3986*207e5cccSFangrui Song uint64x2_t test_vmlsl_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) {
3987*207e5cccSFangrui Song   return vmlsl_high_n_u32(a, b, c);
3988*207e5cccSFangrui Song }
3989*207e5cccSFangrui Song 
3990*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmlsl_high_n_s16(
3991*207e5cccSFangrui Song // CHECK-NEXT:  entry:
3992*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3993*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
3994*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
3995*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
3996*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
3997*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
3998*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
3999*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
4000*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
4001*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLSL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I]])
4002*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[VQDMLSL_V3_I_I]]
4003*207e5cccSFangrui Song //
// Forwards a, b and the scalar c unchanged to vqdmlsl_high_n_s16.
// Expected IR above: lanes 4..7 of b are extracted, c is inserted into all
// four lanes of a <4 x i16>, and sqdmull.v4i32 of those feeds sqsub.v4i32
// with a as the first operand.
4004*207e5cccSFangrui Song int32x4_t test_vqdmlsl_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
4005*207e5cccSFangrui Song   return vqdmlsl_high_n_s16(a, b, c);
4006*207e5cccSFangrui Song }
4007*207e5cccSFangrui Song 
4008*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmlsl_high_n_s32(
4009*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4010*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
4011*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
4012*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
4013*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
4014*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
4015*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
4016*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
4017*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLSL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I]])
4018*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[VQDMLSL_V3_I_I]]
4019*207e5cccSFangrui Song //
// Forwards a, b and the scalar c unchanged to vqdmlsl_high_n_s32.
// Expected IR above: lanes 2..3 of b are extracted, c is inserted into both
// lanes of a <2 x i32>, and sqdmull.v2i64 of those feeds sqsub.v2i64 with a
// as the first operand.
4020*207e5cccSFangrui Song int64x2_t test_vqdmlsl_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
4021*207e5cccSFangrui Song   return vqdmlsl_high_n_s32(a, b, c);
4022*207e5cccSFangrui Song }
4023*207e5cccSFangrui Song 
4024*207e5cccSFangrui Song // CHECK-LABEL: @test_vmul_n_f32(
4025*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4026*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float [[B:%.*]], i32 0
4027*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float [[B]], i32 1
4028*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL_I:%.*]] = fmul <2 x float> [[A:%.*]], [[VECINIT1_I]]
4029*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x float> [[MUL_I]]
4030*207e5cccSFangrui Song //
// Forwards a and the scalar b unchanged to vmul_n_f32.
// Expected IR above: b is inserted into both lanes of a <2 x float>, which is
// then multiplied with a by a single `fmul`.
4031*207e5cccSFangrui Song float32x2_t test_vmul_n_f32(float32x2_t a, float32_t b) {
4032*207e5cccSFangrui Song   return vmul_n_f32(a, b);
4033*207e5cccSFangrui Song }
4034*207e5cccSFangrui Song 
4035*207e5cccSFangrui Song // CHECK-LABEL: @test_vmulq_n_f32(
4036*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4037*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[B:%.*]], i32 0
4038*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[B]], i32 1
4039*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float [[B]], i32 2
4040*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[B]], i32 3
4041*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL_I:%.*]] = fmul <4 x float> [[A:%.*]], [[VECINIT3_I]]
4042*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x float> [[MUL_I]]
4043*207e5cccSFangrui Song //
// Forwards a and the scalar b unchanged to vmulq_n_f32.
// Expected IR above: b is inserted into all four lanes of a <4 x float>,
// which is then multiplied with a by a single `fmul`.
4044*207e5cccSFangrui Song float32x4_t test_vmulq_n_f32(float32x4_t a, float32_t b) {
4045*207e5cccSFangrui Song   return vmulq_n_f32(a, b);
4046*207e5cccSFangrui Song }
4047*207e5cccSFangrui Song 
4048*207e5cccSFangrui Song // CHECK-LABEL: @test_vmulq_n_f64(
4049*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4050*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x double> poison, double [[B:%.*]], i32 0
4051*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double [[B]], i32 1
4052*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL_I:%.*]] = fmul <2 x double> [[A:%.*]], [[VECINIT1_I]]
4053*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x double> [[MUL_I]]
4054*207e5cccSFangrui Song //
// Forwards a and the scalar b unchanged to vmulq_n_f64.
// Expected IR above: b is inserted into both lanes of a <2 x double>, which
// is then multiplied with a by a single `fmul`.
4055*207e5cccSFangrui Song float64x2_t test_vmulq_n_f64(float64x2_t a, float64_t b) {
4056*207e5cccSFangrui Song   return vmulq_n_f64(a, b);
4057*207e5cccSFangrui Song }
4058*207e5cccSFangrui Song 
4059*207e5cccSFangrui Song // CHECK-LABEL: @test_vfma_n_f32(
4060*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4061*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float [[N:%.*]], i32 0
4062*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float [[N]], i32 1
4063*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
4064*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8>
4065*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[VECINIT1_I]] to <8 x i8>
4066*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[B]], <2 x float> [[VECINIT1_I]], <2 x float> [[A]])
4067*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x float> [[TMP3]]
4068*207e5cccSFangrui Song //
// Forwards a, b and the scalar n unchanged to vfma_n_f32.
// Expected IR above: n is inserted into both lanes of a <2 x float>, then
// llvm.fma.v2f32 is called with operands (b, that vector, a) in that order.
4069*207e5cccSFangrui Song float32x2_t test_vfma_n_f32(float32x2_t a, float32x2_t b, float32_t n) {
4070*207e5cccSFangrui Song   return vfma_n_f32(a, b, n);
4071*207e5cccSFangrui Song }
4072*207e5cccSFangrui Song 
4073*207e5cccSFangrui Song // CHECK-LABEL: @test_vfma_n_f64(
4074*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4075*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <1 x double> poison, double [[N:%.*]], i32 0
4076*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[A:%.*]] to <8 x i8>
4077*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x double> [[B:%.*]] to <8 x i8>
4078*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x double> [[VECINIT_I]] to <8 x i8>
4079*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[B]], <1 x double> [[VECINIT_I]], <1 x double> [[A]])
4080*207e5cccSFangrui Song // CHECK-NEXT:    ret <1 x double> [[TMP3]]
4081*207e5cccSFangrui Song //
// Forwards a, b and the scalar n unchanged to vfma_n_f64.
// Expected IR above: n is inserted into the only lane of a <1 x double>, then
// llvm.fma.v1f64 is called with operands (b, that vector, a) in that order.
4082*207e5cccSFangrui Song float64x1_t test_vfma_n_f64(float64x1_t a, float64x1_t b, float64_t n) {
4083*207e5cccSFangrui Song   return vfma_n_f64(a, b, n);
4084*207e5cccSFangrui Song }
4085*207e5cccSFangrui Song 
4086*207e5cccSFangrui Song // CHECK-LABEL: @test_vfmaq_n_f32(
4087*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4088*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[N:%.*]], i32 0
4089*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[N]], i32 1
4090*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float [[N]], i32 2
4091*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[N]], i32 3
4092*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
4093*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <16 x i8>
4094*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[VECINIT3_I]] to <16 x i8>
4095*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[B]], <4 x float> [[VECINIT3_I]], <4 x float> [[A]])
4096*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x float> [[TMP3]]
4097*207e5cccSFangrui Song //
// Forwards a, b and the scalar n unchanged to vfmaq_n_f32.
// Expected IR above: n is inserted into all four lanes of a <4 x float>, then
// llvm.fma.v4f32 is called with operands (b, that vector, a) in that order.
4098*207e5cccSFangrui Song float32x4_t test_vfmaq_n_f32(float32x4_t a, float32x4_t b, float32_t n) {
4099*207e5cccSFangrui Song   return vfmaq_n_f32(a, b, n);
4100*207e5cccSFangrui Song }
4101*207e5cccSFangrui Song 
4102*207e5cccSFangrui Song // CHECK-LABEL: @test_vfms_n_f32(
4103*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4104*207e5cccSFangrui Song // CHECK-NEXT:    [[FNEG_I:%.*]] = fneg <2 x float> [[B:%.*]]
4105*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float [[N:%.*]], i32 0
4106*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float [[N]], i32 1
4107*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
4108*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[FNEG_I]] to <8 x i8>
4109*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[VECINIT1_I]] to <8 x i8>
4110*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FNEG_I]], <2 x float> [[VECINIT1_I]], <2 x float> [[A]])
4111*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x float> [[TMP3]]
4112*207e5cccSFangrui Song //
// Forwards a, b and the scalar n unchanged to vfms_n_f32.
// Expected IR above: b is negated with `fneg` first, n is inserted into both
// lanes of a <2 x float>, then llvm.fma.v2f32 is called with operands
// (negated b, that vector, a).
4113*207e5cccSFangrui Song float32x2_t test_vfms_n_f32(float32x2_t a, float32x2_t b, float32_t n) {
4114*207e5cccSFangrui Song   return vfms_n_f32(a, b, n);
4115*207e5cccSFangrui Song }
4116*207e5cccSFangrui Song 
4117*207e5cccSFangrui Song // CHECK-LABEL: @test_vfms_n_f64(
4118*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4119*207e5cccSFangrui Song // CHECK-NEXT:    [[FNEG_I:%.*]] = fneg <1 x double> [[B:%.*]]
4120*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <1 x double> poison, double [[N:%.*]], i32 0
4121*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[A:%.*]] to <8 x i8>
4122*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x double> [[FNEG_I]] to <8 x i8>
4123*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x double> [[VECINIT_I]] to <8 x i8>
4124*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FNEG_I]], <1 x double> [[VECINIT_I]], <1 x double> [[A]])
4125*207e5cccSFangrui Song // CHECK-NEXT:    ret <1 x double> [[TMP3]]
4126*207e5cccSFangrui Song //
// Forwards a, b and the scalar n unchanged to vfms_n_f64.
// Expected IR above: b is negated with `fneg` first, n is inserted into the
// only lane of a <1 x double>, then llvm.fma.v1f64 is called with operands
// (negated b, that vector, a).
4127*207e5cccSFangrui Song float64x1_t test_vfms_n_f64(float64x1_t a, float64x1_t b, float64_t n) {
4128*207e5cccSFangrui Song   return vfms_n_f64(a, b, n);
4129*207e5cccSFangrui Song }
4130*207e5cccSFangrui Song 
4131*207e5cccSFangrui Song // CHECK-LABEL: @test_vfmsq_n_f32(
4132*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4133*207e5cccSFangrui Song // CHECK-NEXT:    [[FNEG_I:%.*]] = fneg <4 x float> [[B:%.*]]
4134*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[N:%.*]], i32 0
4135*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[N]], i32 1
4136*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float [[N]], i32 2
4137*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[N]], i32 3
4138*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
4139*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[FNEG_I]] to <16 x i8>
4140*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[VECINIT3_I]] to <16 x i8>
4141*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FNEG_I]], <4 x float> [[VECINIT3_I]], <4 x float> [[A]])
4142*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x float> [[TMP3]]
4143*207e5cccSFangrui Song //
// Forwards a, b and the scalar n unchanged to vfmsq_n_f32.
// Expected IR above: b is negated with `fneg` first, n is inserted into all
// four lanes of a <4 x float>, then llvm.fma.v4f32 is called with operands
// (negated b, that vector, a).
4144*207e5cccSFangrui Song float32x4_t test_vfmsq_n_f32(float32x4_t a, float32x4_t b, float32_t n) {
4145*207e5cccSFangrui Song   return vfmsq_n_f32(a, b, n);
4146*207e5cccSFangrui Song }
4147*207e5cccSFangrui Song 
4148*207e5cccSFangrui Song // CHECK-LABEL: @test_vmul_n_s16(
4149*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4150*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i32 0
4151*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
4152*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
4153*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
4154*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL_I:%.*]] = mul <4 x i16> [[A:%.*]], [[VECINIT3_I]]
4155*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i16> [[MUL_I]]
4156*207e5cccSFangrui Song //
// Forwards a and the scalar b unchanged to vmul_n_s16.
// Expected IR above: b is inserted into all four lanes of a <4 x i16>, which
// is then multiplied with a by a single `mul`.
4157*207e5cccSFangrui Song int16x4_t test_vmul_n_s16(int16x4_t a, int16_t b) {
4158*207e5cccSFangrui Song   return vmul_n_s16(a, b);
4159*207e5cccSFangrui Song }
4160*207e5cccSFangrui Song 
4161*207e5cccSFangrui Song // CHECK-LABEL: @test_vmulq_n_s16(
4162*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4163*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[B:%.*]], i32 0
4164*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[B]], i32 1
4165*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
4166*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
4167*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[B]], i32 4
4168*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[B]], i32 5
4169*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[B]], i32 6
4170*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[B]], i32 7
4171*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL_I:%.*]] = mul <8 x i16> [[A:%.*]], [[VECINIT7_I]]
4172*207e5cccSFangrui Song // CHECK-NEXT:    ret <8 x i16> [[MUL_I]]
4173*207e5cccSFangrui Song //
// Forwards a and the scalar b unchanged to vmulq_n_s16.
// Expected IR above: b is inserted into all eight lanes of an <8 x i16>,
// which is then multiplied with a by a single `mul`.
4174*207e5cccSFangrui Song int16x8_t test_vmulq_n_s16(int16x8_t a, int16_t b) {
4175*207e5cccSFangrui Song   return vmulq_n_s16(a, b);
4176*207e5cccSFangrui Song }
4177*207e5cccSFangrui Song 
4178*207e5cccSFangrui Song // CHECK-LABEL: @test_vmul_n_s32(
4179*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4180*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i32 0
4181*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
4182*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL_I:%.*]] = mul <2 x i32> [[A:%.*]], [[VECINIT1_I]]
4183*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i32> [[MUL_I]]
4184*207e5cccSFangrui Song //
// Forwards a and the scalar b unchanged to vmul_n_s32.
// Expected IR above: b is inserted into both lanes of a <2 x i32>, which is
// then multiplied with a by a single `mul`.
4185*207e5cccSFangrui Song int32x2_t test_vmul_n_s32(int32x2_t a, int32_t b) {
4186*207e5cccSFangrui Song   return vmul_n_s32(a, b);
4187*207e5cccSFangrui Song }
4188*207e5cccSFangrui Song 
4189*207e5cccSFangrui Song // CHECK-LABEL: @test_vmulq_n_s32(
4190*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4191*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i32 0
4192*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[B]], i32 1
4193*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[B]], i32 2
4194*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[B]], i32 3
4195*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL_I:%.*]] = mul <4 x i32> [[A:%.*]], [[VECINIT3_I]]
4196*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[MUL_I]]
4197*207e5cccSFangrui Song //
// Forwards a and the scalar b unchanged to vmulq_n_s32.
// Expected IR above: b is inserted into all four lanes of a <4 x i32>, which
// is then multiplied with a by a single `mul`.
4198*207e5cccSFangrui Song int32x4_t test_vmulq_n_s32(int32x4_t a, int32_t b) {
4199*207e5cccSFangrui Song   return vmulq_n_s32(a, b);
4200*207e5cccSFangrui Song }
4201*207e5cccSFangrui Song 
4202*207e5cccSFangrui Song // CHECK-LABEL: @test_vmul_n_u16(
4203*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4204*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i32 0
4205*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
4206*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
4207*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
4208*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL_I:%.*]] = mul <4 x i16> [[A:%.*]], [[VECINIT3_I]]
4209*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i16> [[MUL_I]]
4210*207e5cccSFangrui Song //
// Forwards a and the scalar b unchanged to vmul_n_u16.
// Expected IR above: b is inserted into all four lanes of a <4 x i16>, which
// is then multiplied with a by a single `mul` (no unsigned-specific op).
4211*207e5cccSFangrui Song uint16x4_t test_vmul_n_u16(uint16x4_t a, uint16_t b) {
4212*207e5cccSFangrui Song   return vmul_n_u16(a, b);
4213*207e5cccSFangrui Song }
4214*207e5cccSFangrui Song 
4215*207e5cccSFangrui Song // CHECK-LABEL: @test_vmulq_n_u16(
4216*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4217*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[B:%.*]], i32 0
4218*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[B]], i32 1
4219*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
4220*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
4221*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[B]], i32 4
4222*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[B]], i32 5
4223*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[B]], i32 6
4224*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[B]], i32 7
4225*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL_I:%.*]] = mul <8 x i16> [[A:%.*]], [[VECINIT7_I]]
4226*207e5cccSFangrui Song // CHECK-NEXT:    ret <8 x i16> [[MUL_I]]
4227*207e5cccSFangrui Song //
// Forwards a and the scalar b unchanged to vmulq_n_u16.
// Expected IR above: b is inserted into all eight lanes of an <8 x i16>,
// which is then multiplied with a by a single `mul` (no unsigned-specific op).
4228*207e5cccSFangrui Song uint16x8_t test_vmulq_n_u16(uint16x8_t a, uint16_t b) {
4229*207e5cccSFangrui Song   return vmulq_n_u16(a, b);
4230*207e5cccSFangrui Song }
4231*207e5cccSFangrui Song 
4232*207e5cccSFangrui Song // CHECK-LABEL: @test_vmul_n_u32(
4233*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4234*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i32 0
4235*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
4236*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL_I:%.*]] = mul <2 x i32> [[A:%.*]], [[VECINIT1_I]]
4237*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i32> [[MUL_I]]
4238*207e5cccSFangrui Song //
// Forwards a and the scalar b unchanged to vmul_n_u32.
// Expected IR above: b is inserted into both lanes of a <2 x i32>, which is
// then multiplied with a by a single `mul` (no unsigned-specific op).
4239*207e5cccSFangrui Song uint32x2_t test_vmul_n_u32(uint32x2_t a, uint32_t b) {
4240*207e5cccSFangrui Song   return vmul_n_u32(a, b);
4241*207e5cccSFangrui Song }
4242*207e5cccSFangrui Song 
4243*207e5cccSFangrui Song // CHECK-LABEL: @test_vmulq_n_u32(
4244*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4245*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i32 0
4246*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[B]], i32 1
4247*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[B]], i32 2
4248*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[B]], i32 3
4249*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL_I:%.*]] = mul <4 x i32> [[A:%.*]], [[VECINIT3_I]]
4250*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[MUL_I]]
4251*207e5cccSFangrui Song //
// Forwards a and the scalar b unchanged to vmulq_n_u32.
// Expected IR above: b is inserted into all four lanes of a <4 x i32>, which
// is then multiplied with a by a single `mul` (no unsigned-specific op).
4252*207e5cccSFangrui Song uint32x4_t test_vmulq_n_u32(uint32x4_t a, uint32_t b) {
4253*207e5cccSFangrui Song   return vmulq_n_u32(a, b);
4254*207e5cccSFangrui Song }
4255*207e5cccSFangrui Song 
4256*207e5cccSFangrui Song // CHECK-LABEL: @test_vmull_n_s16(
4257*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4258*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i32 0
4259*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
4260*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
4261*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
4262*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
4263*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
4264*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]])
4265*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I_I]]
4266*207e5cccSFangrui Song //
// Forwards a and the scalar b unchanged to vmull_n_s16.
// Expected IR above: b is inserted into all four lanes of a <4 x i16>, and
// the smull.v4i32 of a with that vector is returned directly.
4267*207e5cccSFangrui Song int32x4_t test_vmull_n_s16(int16x4_t a, int16_t b) {
4268*207e5cccSFangrui Song   return vmull_n_s16(a, b);
4269*207e5cccSFangrui Song }
4270*207e5cccSFangrui Song 
4271*207e5cccSFangrui Song // CHECK-LABEL: @test_vmull_n_s32(
4272*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4273*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i32 0
4274*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
4275*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
4276*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
4277*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]])
4278*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I_I]]
4279*207e5cccSFangrui Song //
// Forwards a and the scalar b unchanged to vmull_n_s32.
// Expected IR above: b is inserted into both lanes of a <2 x i32>, and the
// smull.v2i64 of a with that vector is returned directly.
4280*207e5cccSFangrui Song int64x2_t test_vmull_n_s32(int32x2_t a, int32_t b) {
4281*207e5cccSFangrui Song   return vmull_n_s32(a, b);
4282*207e5cccSFangrui Song }
4283*207e5cccSFangrui Song 
4284*207e5cccSFangrui Song // CHECK-LABEL: @test_vmull_n_u16(
4285*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4286*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i32 0
4287*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
4288*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
4289*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
4290*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
4291*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
4292*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]])
4293*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I_I]]
4294*207e5cccSFangrui Song //
// Forwards a and the scalar b unchanged to vmull_n_u16.
// Expected IR above: b is inserted into all four lanes of a <4 x i16>, and
// the umull.v4i32 of a with that vector is returned directly.
4295*207e5cccSFangrui Song uint32x4_t test_vmull_n_u16(uint16x4_t a, uint16_t b) {
4296*207e5cccSFangrui Song   return vmull_n_u16(a, b);
4297*207e5cccSFangrui Song }
4298*207e5cccSFangrui Song 
4299*207e5cccSFangrui Song // CHECK-LABEL: @test_vmull_n_u32(
4300*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4301*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i32 0
4302*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
4303*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
4304*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
4305*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]])
4306*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I_I]]
4307*207e5cccSFangrui Song //
// Forwards a and the scalar b unchanged to vmull_n_u32.
// Expected IR above: b is inserted into both lanes of a <2 x i32>, and the
// umull.v2i64 of a with that vector is returned directly.
4308*207e5cccSFangrui Song uint64x2_t test_vmull_n_u32(uint32x2_t a, uint32_t b) {
4309*207e5cccSFangrui Song   return vmull_n_u32(a, b);
4310*207e5cccSFangrui Song }
4311*207e5cccSFangrui Song 
4312*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmull_n_s16(
4313*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4314*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i32 0
4315*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
4316*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
4317*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
4318*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
4319*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
4320*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULL_V2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]])
4321*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULL_V3_I_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I_I]] to <16 x i8>
4322*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[VQDMULL_V2_I_I]]
4323*207e5cccSFangrui Song //
// Returns vqdmull_n_s16(a, b). Expected IR (above): b is inserted into all
// four lanes of a <4 x i16>, which is passed with a to
// @llvm.aarch64.neon.sqdmull.v4i32; the <4 x i32> result is returned.
4324*207e5cccSFangrui Song int32x4_t test_vqdmull_n_s16(int16x4_t a, int16_t b) {
4325*207e5cccSFangrui Song   return vqdmull_n_s16(a, b);
4326*207e5cccSFangrui Song }
4327*207e5cccSFangrui Song 
4328*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmull_n_s32(
4329*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4330*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i32 0
4331*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
4332*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
4333*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
4334*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULL_V2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]])
4335*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULL_V3_I_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I_I]] to <16 x i8>
4336*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[VQDMULL_V2_I_I]]
4337*207e5cccSFangrui Song //
// Returns vqdmull_n_s32(a, b). Expected IR (above): b is inserted into both
// lanes of a <2 x i32>, which is passed with a to
// @llvm.aarch64.neon.sqdmull.v2i64; the <2 x i64> result is returned.
4338*207e5cccSFangrui Song int64x2_t test_vqdmull_n_s32(int32x2_t a, int32_t b) {
4339*207e5cccSFangrui Song   return vqdmull_n_s32(a, b);
4340*207e5cccSFangrui Song }
4341*207e5cccSFangrui Song 
4342*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmulh_n_s16(
4343*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4344*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i32 0
4345*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
4346*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
4347*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
4348*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
4349*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
4350*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]])
4351*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
4352*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i16> [[VQDMULH_V2_I]]
4353*207e5cccSFangrui Song //
// Returns vqdmulh_n_s16(a, b). Expected IR (above): b is inserted into all
// four lanes of a <4 x i16>, which is passed with a to
// @llvm.aarch64.neon.sqdmulh.v4i16; the <4 x i16> result is returned.
4354*207e5cccSFangrui Song int16x4_t test_vqdmulh_n_s16(int16x4_t a, int16_t b) {
4355*207e5cccSFangrui Song   return vqdmulh_n_s16(a, b);
4356*207e5cccSFangrui Song }
4357*207e5cccSFangrui Song 
4358*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmulhq_n_s16(
4359*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4360*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[B:%.*]], i32 0
4361*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[B]], i32 1
4362*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
4363*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
4364*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[B]], i32 4
4365*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[B]], i32 5
4366*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[B]], i32 6
4367*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[B]], i32 7
4368*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
4369*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
4370*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[VECINIT7_I]])
4371*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
4372*207e5cccSFangrui Song // CHECK-NEXT:    ret <8 x i16> [[VQDMULHQ_V2_I]]
4373*207e5cccSFangrui Song //
// Returns vqdmulhq_n_s16(a, b). Expected IR (above): b is inserted into all
// eight lanes of a <8 x i16>, which is passed with a to
// @llvm.aarch64.neon.sqdmulh.v8i16; the <8 x i16> result is returned.
4374*207e5cccSFangrui Song int16x8_t test_vqdmulhq_n_s16(int16x8_t a, int16_t b) {
4375*207e5cccSFangrui Song   return vqdmulhq_n_s16(a, b);
4376*207e5cccSFangrui Song }
4377*207e5cccSFangrui Song 
4378*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmulh_n_s32(
4379*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4380*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i32 0
4381*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
4382*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
4383*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
4384*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]])
4385*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
4386*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i32> [[VQDMULH_V2_I]]
4387*207e5cccSFangrui Song //
// Returns vqdmulh_n_s32(a, b). Expected IR (above): b is inserted into both
// lanes of a <2 x i32>, which is passed with a to
// @llvm.aarch64.neon.sqdmulh.v2i32; the <2 x i32> result is returned.
4388*207e5cccSFangrui Song int32x2_t test_vqdmulh_n_s32(int32x2_t a, int32_t b) {
4389*207e5cccSFangrui Song   return vqdmulh_n_s32(a, b);
4390*207e5cccSFangrui Song }
4391*207e5cccSFangrui Song 
4392*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmulhq_n_s32(
4393*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4394*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i32 0
4395*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[B]], i32 1
4396*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[B]], i32 2
4397*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[B]], i32 3
4398*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
4399*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
4400*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[VECINIT3_I]])
4401*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
4402*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[VQDMULHQ_V2_I]]
4403*207e5cccSFangrui Song //
// Returns vqdmulhq_n_s32(a, b). Expected IR (above): b is inserted into all
// four lanes of a <4 x i32>, which is passed with a to
// @llvm.aarch64.neon.sqdmulh.v4i32; the <4 x i32> result is returned.
4404*207e5cccSFangrui Song int32x4_t test_vqdmulhq_n_s32(int32x4_t a, int32_t b) {
4405*207e5cccSFangrui Song   return vqdmulhq_n_s32(a, b);
4406*207e5cccSFangrui Song }
4407*207e5cccSFangrui Song 
4408*207e5cccSFangrui Song // CHECK-LABEL: @test_vqrdmulh_n_s16(
4409*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4410*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i32 0
4411*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
4412*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
4413*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
4414*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
4415*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
4416*207e5cccSFangrui Song // CHECK-NEXT:    [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]])
4417*207e5cccSFangrui Song // CHECK-NEXT:    [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
4418*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i16> [[VQRDMULH_V2_I]]
4419*207e5cccSFangrui Song //
// Returns vqrdmulh_n_s16(a, b). Expected IR (above): b is inserted into all
// four lanes of a <4 x i16>, which is passed with a to
// @llvm.aarch64.neon.sqrdmulh.v4i16; the <4 x i16> result is returned.
4420*207e5cccSFangrui Song int16x4_t test_vqrdmulh_n_s16(int16x4_t a, int16_t b) {
4421*207e5cccSFangrui Song   return vqrdmulh_n_s16(a, b);
4422*207e5cccSFangrui Song }
4423*207e5cccSFangrui Song 
4424*207e5cccSFangrui Song // CHECK-LABEL: @test_vqrdmulhq_n_s16(
4425*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4426*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[B:%.*]], i32 0
4427*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[B]], i32 1
4428*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
4429*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
4430*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[B]], i32 4
4431*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[B]], i32 5
4432*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[B]], i32 6
4433*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[B]], i32 7
4434*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
4435*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
4436*207e5cccSFangrui Song // CHECK-NEXT:    [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[VECINIT7_I]])
4437*207e5cccSFangrui Song // CHECK-NEXT:    [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
4438*207e5cccSFangrui Song // CHECK-NEXT:    ret <8 x i16> [[VQRDMULHQ_V2_I]]
4439*207e5cccSFangrui Song //
// Returns vqrdmulhq_n_s16(a, b). Expected IR (above): b is inserted into all
// eight lanes of a <8 x i16>, which is passed with a to
// @llvm.aarch64.neon.sqrdmulh.v8i16; the <8 x i16> result is returned.
4440*207e5cccSFangrui Song int16x8_t test_vqrdmulhq_n_s16(int16x8_t a, int16_t b) {
4441*207e5cccSFangrui Song   return vqrdmulhq_n_s16(a, b);
4442*207e5cccSFangrui Song }
4443*207e5cccSFangrui Song 
4444*207e5cccSFangrui Song // CHECK-LABEL: @test_vqrdmulh_n_s32(
4445*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4446*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i32 0
4447*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
4448*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
4449*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
4450*207e5cccSFangrui Song // CHECK-NEXT:    [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]])
4451*207e5cccSFangrui Song // CHECK-NEXT:    [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
4452*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i32> [[VQRDMULH_V2_I]]
4453*207e5cccSFangrui Song //
// Returns vqrdmulh_n_s32(a, b). Expected IR (above): b is inserted into both
// lanes of a <2 x i32>, which is passed with a to
// @llvm.aarch64.neon.sqrdmulh.v2i32; the <2 x i32> result is returned.
4454*207e5cccSFangrui Song int32x2_t test_vqrdmulh_n_s32(int32x2_t a, int32_t b) {
4455*207e5cccSFangrui Song   return vqrdmulh_n_s32(a, b);
4456*207e5cccSFangrui Song }
4457*207e5cccSFangrui Song 
4458*207e5cccSFangrui Song // CHECK-LABEL: @test_vqrdmulhq_n_s32(
4459*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4460*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i32 0
4461*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[B]], i32 1
4462*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[B]], i32 2
4463*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[B]], i32 3
4464*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
4465*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
4466*207e5cccSFangrui Song // CHECK-NEXT:    [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[VECINIT3_I]])
4467*207e5cccSFangrui Song // CHECK-NEXT:    [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
4468*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[VQRDMULHQ_V2_I]]
4469*207e5cccSFangrui Song //
// Returns vqrdmulhq_n_s32(a, b). Expected IR (above): b is inserted into all
// four lanes of a <4 x i32>, which is passed with a to
// @llvm.aarch64.neon.sqrdmulh.v4i32; the <4 x i32> result is returned.
4470*207e5cccSFangrui Song int32x4_t test_vqrdmulhq_n_s32(int32x4_t a, int32_t b) {
4471*207e5cccSFangrui Song   return vqrdmulhq_n_s32(a, b);
4472*207e5cccSFangrui Song }
4473*207e5cccSFangrui Song 
4474*207e5cccSFangrui Song // CHECK-LABEL: @test_vmla_n_s16(
4475*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4476*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
4477*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
4478*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
4479*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
4480*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL_I:%.*]] = mul <4 x i16> [[B:%.*]], [[VECINIT3_I]]
4481*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[A:%.*]], [[MUL_I]]
4482*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i16> [[ADD_I]]
4483*207e5cccSFangrui Song //
// Returns vmla_n_s16(a, b, c). Expected IR (above): c is inserted into all
// four lanes of a <4 x i16>, b is multiplied by that vector with a plain
// `mul`, and the product is added to a with a plain `add`.
4484*207e5cccSFangrui Song int16x4_t test_vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c) {
4485*207e5cccSFangrui Song   return vmla_n_s16(a, b, c);
4486*207e5cccSFangrui Song }
4487*207e5cccSFangrui Song 
4488*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlaq_n_s16(
4489*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4490*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[C:%.*]], i32 0
4491*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[C]], i32 1
4492*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
4493*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
4494*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[C]], i32 4
4495*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[C]], i32 5
4496*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[C]], i32 6
4497*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[C]], i32 7
4498*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL_I:%.*]] = mul <8 x i16> [[B:%.*]], [[VECINIT7_I]]
4499*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD_I:%.*]] = add <8 x i16> [[A:%.*]], [[MUL_I]]
4500*207e5cccSFangrui Song // CHECK-NEXT:    ret <8 x i16> [[ADD_I]]
4501*207e5cccSFangrui Song //
// Returns vmlaq_n_s16(a, b, c). Expected IR (above): c is inserted into all
// eight lanes of a <8 x i16>, b is multiplied by that vector with a plain
// `mul`, and the product is added to a with a plain `add`.
4502*207e5cccSFangrui Song int16x8_t test_vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c) {
4503*207e5cccSFangrui Song   return vmlaq_n_s16(a, b, c);
4504*207e5cccSFangrui Song }
4505*207e5cccSFangrui Song 
4506*207e5cccSFangrui Song // CHECK-LABEL: @test_vmla_n_s32(
4507*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4508*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
4509*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
4510*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL_I:%.*]] = mul <2 x i32> [[B:%.*]], [[VECINIT1_I]]
4511*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[A:%.*]], [[MUL_I]]
4512*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i32> [[ADD_I]]
4513*207e5cccSFangrui Song //
// Returns vmla_n_s32(a, b, c). Expected IR (above): c is inserted into both
// lanes of a <2 x i32>, b is multiplied by that vector with a plain `mul`,
// and the product is added to a with a plain `add`.
4514*207e5cccSFangrui Song int32x2_t test_vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c) {
4515*207e5cccSFangrui Song   return vmla_n_s32(a, b, c);
4516*207e5cccSFangrui Song }
4517*207e5cccSFangrui Song 
4518*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlaq_n_s32(
4519*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4520*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[C:%.*]], i32 0
4521*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[C]], i32 1
4522*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[C]], i32 2
4523*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[C]], i32 3
4524*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL_I:%.*]] = mul <4 x i32> [[B:%.*]], [[VECINIT3_I]]
4525*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD_I:%.*]] = add <4 x i32> [[A:%.*]], [[MUL_I]]
4526*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[ADD_I]]
4527*207e5cccSFangrui Song //
// Returns vmlaq_n_s32(a, b, c). Expected IR (above): c is inserted into all
// four lanes of a <4 x i32>, b is multiplied by that vector with a plain
// `mul`, and the product is added to a with a plain `add`.
4528*207e5cccSFangrui Song int32x4_t test_vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c) {
4529*207e5cccSFangrui Song   return vmlaq_n_s32(a, b, c);
4530*207e5cccSFangrui Song }
4531*207e5cccSFangrui Song 
4532*207e5cccSFangrui Song // CHECK-LABEL: @test_vmla_n_u16(
4533*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4534*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
4535*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
4536*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
4537*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
4538*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL_I:%.*]] = mul <4 x i16> [[B:%.*]], [[VECINIT3_I]]
4539*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[A:%.*]], [[MUL_I]]
4540*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i16> [[ADD_I]]
4541*207e5cccSFangrui Song //
// Returns vmla_n_u16(a, b, c). Expected IR (above): c is inserted into all
// four lanes of a <4 x i16>, b is multiplied by that vector with a plain
// `mul`, and the product is added to a with a plain `add`.
4542*207e5cccSFangrui Song uint16x4_t test_vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) {
4543*207e5cccSFangrui Song   return vmla_n_u16(a, b, c);
4544*207e5cccSFangrui Song }
4545*207e5cccSFangrui Song 
4546*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlaq_n_u16(
4547*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4548*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[C:%.*]], i32 0
4549*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[C]], i32 1
4550*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
4551*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
4552*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[C]], i32 4
4553*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[C]], i32 5
4554*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[C]], i32 6
4555*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[C]], i32 7
4556*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL_I:%.*]] = mul <8 x i16> [[B:%.*]], [[VECINIT7_I]]
4557*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD_I:%.*]] = add <8 x i16> [[A:%.*]], [[MUL_I]]
4558*207e5cccSFangrui Song // CHECK-NEXT:    ret <8 x i16> [[ADD_I]]
4559*207e5cccSFangrui Song //
// Returns vmlaq_n_u16(a, b, c). Expected IR (above): c is inserted into all
// eight lanes of a <8 x i16>, b is multiplied by that vector with a plain
// `mul`, and the product is added to a with a plain `add`.
4560*207e5cccSFangrui Song uint16x8_t test_vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) {
4561*207e5cccSFangrui Song   return vmlaq_n_u16(a, b, c);
4562*207e5cccSFangrui Song }
4563*207e5cccSFangrui Song 
4564*207e5cccSFangrui Song // CHECK-LABEL: @test_vmla_n_u32(
4565*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4566*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
4567*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
4568*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL_I:%.*]] = mul <2 x i32> [[B:%.*]], [[VECINIT1_I]]
4569*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[A:%.*]], [[MUL_I]]
4570*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i32> [[ADD_I]]
4571*207e5cccSFangrui Song //
// Returns vmla_n_u32(a, b, c). Expected IR (above): c is inserted into both
// lanes of a <2 x i32>, b is multiplied by that vector with a plain `mul`,
// and the product is added to a with a plain `add`.
4572*207e5cccSFangrui Song uint32x2_t test_vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) {
4573*207e5cccSFangrui Song   return vmla_n_u32(a, b, c);
4574*207e5cccSFangrui Song }
4575*207e5cccSFangrui Song 
4576*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlaq_n_u32(
4577*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4578*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[C:%.*]], i32 0
4579*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[C]], i32 1
4580*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[C]], i32 2
4581*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[C]], i32 3
4582*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL_I:%.*]] = mul <4 x i32> [[B:%.*]], [[VECINIT3_I]]
4583*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD_I:%.*]] = add <4 x i32> [[A:%.*]], [[MUL_I]]
4584*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[ADD_I]]
4585*207e5cccSFangrui Song //
// Returns vmlaq_n_u32(a, b, c). Expected IR (above): c is inserted into all
// four lanes of a <4 x i32>, b is multiplied by that vector with a plain
// `mul`, and the product is added to a with a plain `add`.
4586*207e5cccSFangrui Song uint32x4_t test_vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) {
4587*207e5cccSFangrui Song   return vmlaq_n_u32(a, b, c);
4588*207e5cccSFangrui Song }
4589*207e5cccSFangrui Song 
4590*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlal_n_s16(
4591*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4592*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
4593*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
4594*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
4595*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
4596*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
4597*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
4598*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]])
4599*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD_I:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
4600*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[ADD_I]]
4601*207e5cccSFangrui Song //
// Returns vmlal_n_s16(a, b, c). Expected IR (above): c is inserted into all
// four lanes of a <4 x i16>, @llvm.aarch64.neon.smull.v4i32 is called on b
// and that vector, and the <4 x i32> product is added to a.
4602*207e5cccSFangrui Song int32x4_t test_vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
4603*207e5cccSFangrui Song   return vmlal_n_s16(a, b, c);
4604*207e5cccSFangrui Song }
4605*207e5cccSFangrui Song 
4606*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlal_n_s32(
4607*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4608*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
4609*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
4610*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
4611*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
4612*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]])
4613*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD_I:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
4614*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[ADD_I]]
4615*207e5cccSFangrui Song //
// Returns vmlal_n_s32(a, b, c). Expected IR (above): c is inserted into both
// lanes of a <2 x i32>, @llvm.aarch64.neon.smull.v2i64 is called on b and
// that vector, and the <2 x i64> product is added to a.
4616*207e5cccSFangrui Song int64x2_t test_vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
4617*207e5cccSFangrui Song   return vmlal_n_s32(a, b, c);
4618*207e5cccSFangrui Song }
4619*207e5cccSFangrui Song 
4620*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlal_n_u16(
4621*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4622*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
4623*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
4624*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
4625*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
4626*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
4627*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
4628*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]])
4629*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD_I:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
4630*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[ADD_I]]
4631*207e5cccSFangrui Song //
// Returns vmlal_n_u16(a, b, c). Expected IR (above): c is inserted into all
// four lanes of a <4 x i16>, @llvm.aarch64.neon.umull.v4i32 is called on b
// and that vector, and the <4 x i32> product is added to a.
4632*207e5cccSFangrui Song uint32x4_t test_vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) {
4633*207e5cccSFangrui Song   return vmlal_n_u16(a, b, c);
4634*207e5cccSFangrui Song }
4635*207e5cccSFangrui Song 
4636*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlal_n_u32(
4637*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4638*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
4639*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
4640*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
4641*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
4642*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]])
4643*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD_I:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
4644*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[ADD_I]]
4645*207e5cccSFangrui Song //
// Returns vmlal_n_u32(a, b, c). Expected IR (above): c is inserted into both
// lanes of a <2 x i32>, @llvm.aarch64.neon.umull.v2i64 is called on b and
// that vector, and the <2 x i64> product is added to a.
4646*207e5cccSFangrui Song uint64x2_t test_vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) {
4647*207e5cccSFangrui Song   return vmlal_n_u32(a, b, c);
4648*207e5cccSFangrui Song }
4649*207e5cccSFangrui Song 
4650*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmlal_n_s16(
4651*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4652*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
4653*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
4654*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
4655*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
4656*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
4657*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
4658*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
4659*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]])
4660*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLAL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I]])
4661*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[VQDMLAL_V3_I_I]]
4662*207e5cccSFangrui Song //
// Returns vqdmlal_n_s16(a, b, c). Expected IR (above): c is inserted into all
// four lanes of a <4 x i16>, @llvm.aarch64.neon.sqdmull.v4i32 is called on b
// and that vector, and @llvm.aarch64.neon.sqadd.v4i32 combines a with the
// product.
4663*207e5cccSFangrui Song int32x4_t test_vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
4664*207e5cccSFangrui Song   return vqdmlal_n_s16(a, b, c);
4665*207e5cccSFangrui Song }
4666*207e5cccSFangrui Song 
4667*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmlal_n_s32(
4668*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4669*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
4670*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
4671*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
4672*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
4673*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
4674*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]])
4675*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLAL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I]])
4676*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[VQDMLAL_V3_I_I]]
4677*207e5cccSFangrui Song //
// Returns vqdmlal_n_s32(a, b, c). Expected IR (above): c is inserted into
// both lanes of a <2 x i32>, @llvm.aarch64.neon.sqdmull.v2i64 is called on b
// and that vector, and @llvm.aarch64.neon.sqadd.v2i64 combines a with the
// product.
4678*207e5cccSFangrui Song int64x2_t test_vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
4679*207e5cccSFangrui Song   return vqdmlal_n_s32(a, b, c);
4680*207e5cccSFangrui Song }
4681*207e5cccSFangrui Song 
4682*207e5cccSFangrui Song // CHECK-LABEL: @test_vmls_n_s16(
4683*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4684*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
4685*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
4686*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
4687*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
4688*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL_I:%.*]] = mul <4 x i16> [[B:%.*]], [[VECINIT3_I]]
4689*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB_I:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL_I]]
4690*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i16> [[SUB_I]]
4691*207e5cccSFangrui Song //
// Returns vmls_n_s16(a, b, c). Expected IR (above): c is inserted into all
// four lanes of a <4 x i16>, b is multiplied by that vector with a plain
// `mul`, and the product is subtracted from a with a plain `sub`.
4692*207e5cccSFangrui Song int16x4_t test_vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c) {
4693*207e5cccSFangrui Song   return vmls_n_s16(a, b, c);
4694*207e5cccSFangrui Song }
4695*207e5cccSFangrui Song 
4696*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlsq_n_s16(
4697*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4698*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[C:%.*]], i32 0
4699*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[C]], i32 1
4700*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
4701*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
4702*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[C]], i32 4
4703*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[C]], i32 5
4704*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[C]], i32 6
4705*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[C]], i32 7
4706*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL_I:%.*]] = mul <8 x i16> [[B:%.*]], [[VECINIT7_I]]
4707*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB_I:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL_I]]
4708*207e5cccSFangrui Song // CHECK-NEXT:    ret <8 x i16> [[SUB_I]]
4709*207e5cccSFangrui Song //
// FileCheck input for vmlsq_n_s16. Expected IR (CHECK lines above): c is
// inserted into all eight lanes of an <8 x i16>, multiplied by b, and the
// product is subtracted from a.
4710*207e5cccSFangrui Song int16x8_t test_vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c) {
4711*207e5cccSFangrui Song   return vmlsq_n_s16(a, b, c);
4712*207e5cccSFangrui Song }
4713*207e5cccSFangrui Song 
4714*207e5cccSFangrui Song // CHECK-LABEL: @test_vmls_n_s32(
4715*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4716*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
4717*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
4718*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL_I:%.*]] = mul <2 x i32> [[B:%.*]], [[VECINIT1_I]]
4719*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB_I:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL_I]]
4720*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i32> [[SUB_I]]
4721*207e5cccSFangrui Song //
// FileCheck input for vmls_n_s32. Expected IR (CHECK lines above): c is
// inserted into both lanes of a <2 x i32>, multiplied by b, and the product
// is subtracted from a.
4722*207e5cccSFangrui Song int32x2_t test_vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c) {
4723*207e5cccSFangrui Song   return vmls_n_s32(a, b, c);
4724*207e5cccSFangrui Song }
4725*207e5cccSFangrui Song 
4726*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlsq_n_s32(
4727*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4728*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[C:%.*]], i32 0
4729*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[C]], i32 1
4730*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[C]], i32 2
4731*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[C]], i32 3
4732*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL_I:%.*]] = mul <4 x i32> [[B:%.*]], [[VECINIT3_I]]
4733*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB_I:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL_I]]
4734*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[SUB_I]]
4735*207e5cccSFangrui Song //
// FileCheck input for vmlsq_n_s32. Expected IR (CHECK lines above): c is
// inserted into all four lanes of a <4 x i32>, multiplied by b, and the
// product is subtracted from a.
4736*207e5cccSFangrui Song int32x4_t test_vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c) {
4737*207e5cccSFangrui Song   return vmlsq_n_s32(a, b, c);
4738*207e5cccSFangrui Song }
4739*207e5cccSFangrui Song 
4740*207e5cccSFangrui Song // CHECK-LABEL: @test_vmls_n_u16(
4741*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4742*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
4743*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
4744*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
4745*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
4746*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL_I:%.*]] = mul <4 x i16> [[B:%.*]], [[VECINIT3_I]]
4747*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB_I:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL_I]]
4748*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i16> [[SUB_I]]
4749*207e5cccSFangrui Song //
// FileCheck input for vmls_n_u16 (unsigned operands). Expected IR (CHECK
// lines above): c is inserted into all four lanes of a <4 x i16>, then a
// plain mul with b and sub from a.
4750*207e5cccSFangrui Song uint16x4_t test_vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) {
4751*207e5cccSFangrui Song   return vmls_n_u16(a, b, c);
4752*207e5cccSFangrui Song }
4753*207e5cccSFangrui Song 
4754*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlsq_n_u16(
4755*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4756*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[C:%.*]], i32 0
4757*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[C]], i32 1
4758*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
4759*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
4760*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[C]], i32 4
4761*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[C]], i32 5
4762*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[C]], i32 6
4763*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[C]], i32 7
4764*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL_I:%.*]] = mul <8 x i16> [[B:%.*]], [[VECINIT7_I]]
4765*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB_I:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL_I]]
4766*207e5cccSFangrui Song // CHECK-NEXT:    ret <8 x i16> [[SUB_I]]
4767*207e5cccSFangrui Song //
// FileCheck input for vmlsq_n_u16 (unsigned operands). Expected IR (CHECK
// lines above): c is inserted into all eight lanes of an <8 x i16>, then a
// plain mul with b and sub from a.
4768*207e5cccSFangrui Song uint16x8_t test_vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) {
4769*207e5cccSFangrui Song   return vmlsq_n_u16(a, b, c);
4770*207e5cccSFangrui Song }
4771*207e5cccSFangrui Song 
4772*207e5cccSFangrui Song // CHECK-LABEL: @test_vmls_n_u32(
4773*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4774*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
4775*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
4776*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL_I:%.*]] = mul <2 x i32> [[B:%.*]], [[VECINIT1_I]]
4777*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB_I:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL_I]]
4778*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i32> [[SUB_I]]
4779*207e5cccSFangrui Song //
// FileCheck input for vmls_n_u32 (unsigned operands). Expected IR (CHECK
// lines above): c is inserted into both lanes of a <2 x i32>, then a plain
// mul with b and sub from a.
4780*207e5cccSFangrui Song uint32x2_t test_vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) {
4781*207e5cccSFangrui Song   return vmls_n_u32(a, b, c);
4782*207e5cccSFangrui Song }
4783*207e5cccSFangrui Song 
4784*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlsq_n_u32(
4785*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4786*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[C:%.*]], i32 0
4787*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[C]], i32 1
4788*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[C]], i32 2
4789*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[C]], i32 3
4790*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL_I:%.*]] = mul <4 x i32> [[B:%.*]], [[VECINIT3_I]]
4791*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB_I:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL_I]]
4792*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[SUB_I]]
4793*207e5cccSFangrui Song //
// FileCheck input for vmlsq_n_u32 (unsigned operands). Expected IR (CHECK
// lines above): c is inserted into all four lanes of a <4 x i32>, then a
// plain mul with b and sub from a.
4794*207e5cccSFangrui Song uint32x4_t test_vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) {
4795*207e5cccSFangrui Song   return vmlsq_n_u32(a, b, c);
4796*207e5cccSFangrui Song }
4797*207e5cccSFangrui Song 
4798*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlsl_n_s16(
4799*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4800*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
4801*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
4802*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
4803*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
4804*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
4805*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
4806*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]])
4807*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB_I:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
4808*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[SUB_I]]
4809*207e5cccSFangrui Song //
// FileCheck input for vmlsl_n_s16. Expected IR (CHECK lines above): c is
// inserted into all four lanes of a <4 x i16>, smull.v4i32 is called on b and
// that vector, and the <4 x i32> result is subtracted from a.
4810*207e5cccSFangrui Song int32x4_t test_vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
4811*207e5cccSFangrui Song   return vmlsl_n_s16(a, b, c);
4812*207e5cccSFangrui Song }
4813*207e5cccSFangrui Song 
4814*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlsl_n_s32(
4815*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4816*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
4817*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
4818*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
4819*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
4820*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]])
4821*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB_I:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
4822*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[SUB_I]]
4823*207e5cccSFangrui Song //
// FileCheck input for vmlsl_n_s32. Expected IR (CHECK lines above): c is
// inserted into both lanes of a <2 x i32>, smull.v2i64 is called on b and
// that vector, and the <2 x i64> result is subtracted from a.
4824*207e5cccSFangrui Song int64x2_t test_vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
4825*207e5cccSFangrui Song   return vmlsl_n_s32(a, b, c);
4826*207e5cccSFangrui Song }
4827*207e5cccSFangrui Song 
4828*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlsl_n_u16(
4829*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4830*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
4831*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
4832*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
4833*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
4834*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
4835*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
4836*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]])
4837*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB_I:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
4838*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[SUB_I]]
4839*207e5cccSFangrui Song //
// FileCheck input for vmlsl_n_u16. Expected IR (CHECK lines above): c is
// inserted into all four lanes of a <4 x i16>, umull.v4i32 is called on b and
// that vector, and the <4 x i32> result is subtracted from a.
4840*207e5cccSFangrui Song uint32x4_t test_vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) {
4841*207e5cccSFangrui Song   return vmlsl_n_u16(a, b, c);
4842*207e5cccSFangrui Song }
4843*207e5cccSFangrui Song 
4844*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlsl_n_u32(
4845*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4846*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
4847*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
4848*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
4849*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
4850*207e5cccSFangrui Song // CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]])
4851*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB_I:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
4852*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[SUB_I]]
4853*207e5cccSFangrui Song //
// FileCheck input for vmlsl_n_u32. Expected IR (CHECK lines above): c is
// inserted into both lanes of a <2 x i32>, umull.v2i64 is called on b and
// that vector, and the <2 x i64> result is subtracted from a.
4854*207e5cccSFangrui Song uint64x2_t test_vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) {
4855*207e5cccSFangrui Song   return vmlsl_n_u32(a, b, c);
4856*207e5cccSFangrui Song }
4857*207e5cccSFangrui Song 
4858*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmlsl_n_s16(
4859*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4860*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
4861*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
4862*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
4863*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
4864*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
4865*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
4866*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
4867*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]])
4868*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLSL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I]])
4869*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[VQDMLSL_V3_I_I]]
4870*207e5cccSFangrui Song //
// FileCheck input for vqdmlsl_n_s16. Expected IR (CHECK lines above): c is
// inserted into all four lanes of a <4 x i16>, sqdmull.v4i32 is called on b
// and that vector, then sqsub.v4i32 is called with a and the product.
4871*207e5cccSFangrui Song int32x4_t test_vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
4872*207e5cccSFangrui Song   return vqdmlsl_n_s16(a, b, c);
4873*207e5cccSFangrui Song }
4874*207e5cccSFangrui Song 
4875*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmlsl_n_s32(
4876*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4877*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
4878*207e5cccSFangrui Song // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
4879*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
4880*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
4881*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
4882*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]])
4883*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLSL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I]])
4884*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[VQDMLSL_V3_I_I]]
4885*207e5cccSFangrui Song //
// FileCheck input for vqdmlsl_n_s32. Expected IR (CHECK lines above): c is
// inserted into both lanes of a <2 x i32>, sqdmull.v2i64 is called on b and
// that vector, then sqsub.v2i64 is called with a and the product.
4886*207e5cccSFangrui Song int64x2_t test_vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
4887*207e5cccSFangrui Song   return vqdmlsl_n_s32(a, b, c);
4888*207e5cccSFangrui Song }
4889*207e5cccSFangrui Song 
4890*207e5cccSFangrui Song // CHECK-LABEL: @test_vmla_lane_u16_0(
4891*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4892*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
4893*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
4894*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
4895*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
4896*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
4897*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i16> [[ADD]]
4898*207e5cccSFangrui Song //
// FileCheck input for vmla_lane_u16 with lane index 0. Expected IR (CHECK
// lines above): a shufflevector with a zeroinitializer mask repeats element 0
// of the <4 x i16> v across four lanes, then mul with b and add to a.
4899*207e5cccSFangrui Song uint16x4_t test_vmla_lane_u16_0(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
4900*207e5cccSFangrui Song   return vmla_lane_u16(a, b, v, 0);
4901*207e5cccSFangrui Song }
4902*207e5cccSFangrui Song 
4903*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlaq_lane_u16_0(
4904*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4905*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
4906*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
4907*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer
4908*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
4909*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
4910*207e5cccSFangrui Song // CHECK-NEXT:    ret <8 x i16> [[ADD]]
4911*207e5cccSFangrui Song //
// FileCheck input for vmlaq_lane_u16 with lane index 0. Expected IR (CHECK
// lines above): a shufflevector with an <8 x i32> zeroinitializer mask
// repeats element 0 of the <4 x i16> v across eight lanes, then mul with b
// and add to a.
4912*207e5cccSFangrui Song uint16x8_t test_vmlaq_lane_u16_0(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
4913*207e5cccSFangrui Song   return vmlaq_lane_u16(a, b, v, 0);
4914*207e5cccSFangrui Song }
4915*207e5cccSFangrui Song 
4916*207e5cccSFangrui Song // CHECK-LABEL: @test_vmla_lane_u32_0(
4917*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4918*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
4919*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
4920*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
4921*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
4922*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
4923*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i32> [[ADD]]
4924*207e5cccSFangrui Song //
// FileCheck input for vmla_lane_u32 with lane index 0. Expected IR (CHECK
// lines above): a shufflevector with a zeroinitializer mask repeats element 0
// of the <2 x i32> v across two lanes, then mul with b and add to a.
4925*207e5cccSFangrui Song uint32x2_t test_vmla_lane_u32_0(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
4926*207e5cccSFangrui Song   return vmla_lane_u32(a, b, v, 0);
4927*207e5cccSFangrui Song }
4928*207e5cccSFangrui Song 
4929*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlaq_lane_u32_0(
4930*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4931*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
4932*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
4933*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer
4934*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
4935*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
4936*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[ADD]]
4937*207e5cccSFangrui Song //
// FileCheck input for vmlaq_lane_u32 with lane index 0. Expected IR (CHECK
// lines above): a shufflevector with a <4 x i32> zeroinitializer mask repeats
// element 0 of the <2 x i32> v across four lanes, then mul with b and add
// to a.
4938*207e5cccSFangrui Song uint32x4_t test_vmlaq_lane_u32_0(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
4939*207e5cccSFangrui Song   return vmlaq_lane_u32(a, b, v, 0);
4940*207e5cccSFangrui Song }
4941*207e5cccSFangrui Song 
4942*207e5cccSFangrui Song // CHECK-LABEL: @test_vmla_laneq_u16_0(
4943*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4944*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
4945*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
4946*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
4947*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
4948*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
4949*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i16> [[ADD]]
4950*207e5cccSFangrui Song //
// FileCheck input for vmla_laneq_u16 with lane index 0. Expected IR (CHECK
// lines above): a shufflevector with a <4 x i32> zeroinitializer mask repeats
// element 0 of the <8 x i16> v across four lanes, then mul with b and add
// to a.
4951*207e5cccSFangrui Song uint16x4_t test_vmla_laneq_u16_0(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
4952*207e5cccSFangrui Song   return vmla_laneq_u16(a, b, v, 0);
4953*207e5cccSFangrui Song }
4954*207e5cccSFangrui Song 
4955*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlaq_laneq_u16_0(
4956*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4957*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
4958*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
4959*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer
4960*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
4961*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
4962*207e5cccSFangrui Song // CHECK-NEXT:    ret <8 x i16> [[ADD]]
4963*207e5cccSFangrui Song //
// FileCheck input for vmlaq_laneq_u16 with lane index 0. Expected IR (CHECK
// lines above): a shufflevector with an <8 x i32> zeroinitializer mask
// repeats element 0 of the <8 x i16> v across eight lanes, then mul with b
// and add to a.
4964*207e5cccSFangrui Song uint16x8_t test_vmlaq_laneq_u16_0(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
4965*207e5cccSFangrui Song   return vmlaq_laneq_u16(a, b, v, 0);
4966*207e5cccSFangrui Song }
4967*207e5cccSFangrui Song 
4968*207e5cccSFangrui Song // CHECK-LABEL: @test_vmla_laneq_u32_0(
4969*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4970*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
4971*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
4972*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
4973*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
4974*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
4975*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i32> [[ADD]]
4976*207e5cccSFangrui Song //
// FileCheck input for vmla_laneq_u32 with lane index 0. Expected IR (CHECK
// lines above): a shufflevector with a <2 x i32> zeroinitializer mask repeats
// element 0 of the <4 x i32> v across two lanes, then mul with b and add
// to a.
4977*207e5cccSFangrui Song uint32x2_t test_vmla_laneq_u32_0(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
4978*207e5cccSFangrui Song   return vmla_laneq_u32(a, b, v, 0);
4979*207e5cccSFangrui Song }
4980*207e5cccSFangrui Song 
4981*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlaq_laneq_u32_0(
4982*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4983*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
4984*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
4985*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
4986*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
4987*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
4988*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[ADD]]
4989*207e5cccSFangrui Song //
// FileCheck input for vmlaq_laneq_u32 with lane index 0. Expected IR (CHECK
// lines above): a shufflevector with a <4 x i32> zeroinitializer mask repeats
// element 0 of the <4 x i32> v across four lanes, then mul with b and add
// to a.
4990*207e5cccSFangrui Song uint32x4_t test_vmlaq_laneq_u32_0(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
4991*207e5cccSFangrui Song   return vmlaq_laneq_u32(a, b, v, 0);
4992*207e5cccSFangrui Song }
4993*207e5cccSFangrui Song 
4994*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmlal_laneq_s16_0(
4995*207e5cccSFangrui Song // CHECK-NEXT:  entry:
4996*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
4997*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
4998*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
4999*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
5000*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
5001*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
5002*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
5003*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
5004*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[VQDMLAL_V3_I]]
5005*207e5cccSFangrui Song //
// FileCheck input for vqdmlal_laneq_s16 with lane index 0. Expected IR
// (CHECK lines above): element 0 of the <8 x i16> v is repeated across four
// lanes by a zeroinitializer-mask shufflevector, sqdmull.v4i32 is called on b
// and that vector, then sqadd.v4i32 is called with a and the product.
5006*207e5cccSFangrui Song int32x4_t test_vqdmlal_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
5007*207e5cccSFangrui Song   return vqdmlal_laneq_s16(a, b, v, 0);
5008*207e5cccSFangrui Song }
5009*207e5cccSFangrui Song 
5010*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmlal_laneq_s32_0(
5011*207e5cccSFangrui Song // CHECK-NEXT:  entry:
5012*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5013*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
5014*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
5015*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
5016*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
5017*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
5018*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
5019*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
5020*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[VQDMLAL_V3_I]]
5021*207e5cccSFangrui Song //
// FileCheck input for vqdmlal_laneq_s32 with lane index 0. Expected IR
// (CHECK lines above): element 0 of the <4 x i32> v is repeated across two
// lanes by a zeroinitializer-mask shufflevector, sqdmull.v2i64 is called on b
// and that vector, then sqadd.v2i64 is called with a and the product.
5022*207e5cccSFangrui Song int64x2_t test_vqdmlal_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
5023*207e5cccSFangrui Song   return vqdmlal_laneq_s32(a, b, v, 0);
5024*207e5cccSFangrui Song }
5025*207e5cccSFangrui Song 
5026*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmlal_high_laneq_s16_0(
5027*207e5cccSFangrui Song // CHECK-NEXT:  entry:
5028*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
5029*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5030*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
5031*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
5032*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
5033*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
5034*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
5035*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
5036*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
5037*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[VQDMLAL_V3_I]]
5038*207e5cccSFangrui Song //
// FileCheck input for vqdmlal_high_laneq_s16 with lane index 0. Expected IR
// (CHECK lines above): elements 4-7 of b are selected by a shufflevector,
// element 0 of v is repeated across four lanes, sqdmull.v4i32 is called on
// those two vectors, then sqadd.v4i32 is called with a and the product.
5039*207e5cccSFangrui Song int32x4_t test_vqdmlal_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
5040*207e5cccSFangrui Song   return vqdmlal_high_laneq_s16(a, b, v, 0);
5041*207e5cccSFangrui Song }
5042*207e5cccSFangrui Song 
5043*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmlal_high_laneq_s32_0(
5044*207e5cccSFangrui Song // CHECK-NEXT:  entry:
5045*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
5046*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5047*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
5048*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
5049*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
5050*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
5051*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
5052*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
5053*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
5054*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[VQDMLAL_V3_I]]
5055*207e5cccSFangrui Song //
// FileCheck input for vqdmlal_high_laneq_s32 with lane index 0. Expected IR
// (CHECK lines above): elements 2-3 of b are selected by a shufflevector,
// element 0 of v is repeated across two lanes, sqdmull.v2i64 is called on
// those two vectors, then sqadd.v2i64 is called with a and the product.
5056*207e5cccSFangrui Song int64x2_t test_vqdmlal_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
5057*207e5cccSFangrui Song   return vqdmlal_high_laneq_s32(a, b, v, 0);
5058*207e5cccSFangrui Song }
5059*207e5cccSFangrui Song 
5060*207e5cccSFangrui Song // CHECK-LABEL: @test_vmls_lane_u16_0(
5061*207e5cccSFangrui Song // CHECK-NEXT:  entry:
5062*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
5063*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
5064*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
5065*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
5066*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
5067*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i16> [[SUB]]
5068*207e5cccSFangrui Song //
// FileCheck input for vmls_lane_u16 with lane index 0. Expected IR (CHECK
// lines above): a shufflevector with a zeroinitializer mask repeats element 0
// of the <4 x i16> v across four lanes, then mul with b and sub from a.
5069*207e5cccSFangrui Song uint16x4_t test_vmls_lane_u16_0(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
5070*207e5cccSFangrui Song   return vmls_lane_u16(a, b, v, 0);
5071*207e5cccSFangrui Song }
5072*207e5cccSFangrui Song 
5073*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlsq_lane_u16_0(
5074*207e5cccSFangrui Song // CHECK-NEXT:  entry:
5075*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
5076*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
5077*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer
5078*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
5079*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
5080*207e5cccSFangrui Song // CHECK-NEXT:    ret <8 x i16> [[SUB]]
5081*207e5cccSFangrui Song //
// FileCheck input for vmlsq_lane_u16 with lane index 0. Expected IR (CHECK
// lines above): a shufflevector with an <8 x i32> zeroinitializer mask
// repeats element 0 of the <4 x i16> v across eight lanes, then mul with b
// and sub from a.
5082*207e5cccSFangrui Song uint16x8_t test_vmlsq_lane_u16_0(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
5083*207e5cccSFangrui Song   return vmlsq_lane_u16(a, b, v, 0);
5084*207e5cccSFangrui Song }
5085*207e5cccSFangrui Song 
5086*207e5cccSFangrui Song // CHECK-LABEL: @test_vmls_lane_u32_0(
5087*207e5cccSFangrui Song // CHECK-NEXT:  entry:
5088*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
5089*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
5090*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
5091*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
5092*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
5093*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i32> [[SUB]]
5094*207e5cccSFangrui Song //
// Codegen test: calls vmls_lane_u32 with constant lane index 0 on v.
// Expected IR above: a shufflevector with an all-zero mask yields a
// <2 x i32> whose elements all come from lane 0 of v, then
// mul <2 x i32> with b and sub <2 x i32> from a.
5095*207e5cccSFangrui Song uint32x2_t test_vmls_lane_u32_0(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
5096*207e5cccSFangrui Song   return vmls_lane_u32(a, b, v, 0);
5097*207e5cccSFangrui Song }
5098*207e5cccSFangrui Song 
5099*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlsq_lane_u32_0(
5100*207e5cccSFangrui Song // CHECK-NEXT:  entry:
5101*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
5102*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
5103*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer
5104*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
5105*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
5106*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[SUB]]
5107*207e5cccSFangrui Song //
// Codegen test: calls vmlsq_lane_u32 with constant lane index 0 on v.
// Expected IR above: a shufflevector with an all-zero mask widens the
// <2 x i32> v to <4 x i32> (every element taken from lane 0), then
// mul <4 x i32> with b and sub <4 x i32> from a.
5108*207e5cccSFangrui Song uint32x4_t test_vmlsq_lane_u32_0(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
5109*207e5cccSFangrui Song   return vmlsq_lane_u32(a, b, v, 0);
5110*207e5cccSFangrui Song }
5111*207e5cccSFangrui Song 
5112*207e5cccSFangrui Song // CHECK-LABEL: @test_vmls_laneq_u16_0(
5113*207e5cccSFangrui Song // CHECK-NEXT:  entry:
5114*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5115*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
5116*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
5117*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
5118*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
5119*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i16> [[SUB]]
5120*207e5cccSFangrui Song //
// Codegen test: calls vmls_laneq_u16 with constant lane index 0 on the
// 8-element v. Expected IR above: a shufflevector with an all-zero mask
// narrows the <8 x i16> v to <4 x i16> (every element taken from lane 0),
// then mul <4 x i16> with b and sub <4 x i16> from a.
5121*207e5cccSFangrui Song uint16x4_t test_vmls_laneq_u16_0(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
5122*207e5cccSFangrui Song   return vmls_laneq_u16(a, b, v, 0);
5123*207e5cccSFangrui Song }
5124*207e5cccSFangrui Song 
5125*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlsq_laneq_u16_0(
5126*207e5cccSFangrui Song // CHECK-NEXT:  entry:
5127*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5128*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
5129*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer
5130*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
5131*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
5132*207e5cccSFangrui Song // CHECK-NEXT:    ret <8 x i16> [[SUB]]
5133*207e5cccSFangrui Song //
// Codegen test: calls vmlsq_laneq_u16 with constant lane index 0 on v.
// Expected IR above: a shufflevector with an all-zero mask yields an
// <8 x i16> whose elements all come from lane 0 of v, then
// mul <8 x i16> with b and sub <8 x i16> from a.
5134*207e5cccSFangrui Song uint16x8_t test_vmlsq_laneq_u16_0(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
5135*207e5cccSFangrui Song   return vmlsq_laneq_u16(a, b, v, 0);
5136*207e5cccSFangrui Song }
5137*207e5cccSFangrui Song 
5138*207e5cccSFangrui Song // CHECK-LABEL: @test_vmls_laneq_u32_0(
5139*207e5cccSFangrui Song // CHECK-NEXT:  entry:
5140*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5141*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
5142*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
5143*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
5144*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
5145*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i32> [[SUB]]
5146*207e5cccSFangrui Song //
// Codegen test: calls vmls_laneq_u32 with constant lane index 0 on the
// 4-element v. Expected IR above: a shufflevector with an all-zero mask
// narrows the <4 x i32> v to <2 x i32> (every element taken from lane 0),
// then mul <2 x i32> with b and sub <2 x i32> from a.
5147*207e5cccSFangrui Song uint32x2_t test_vmls_laneq_u32_0(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
5148*207e5cccSFangrui Song   return vmls_laneq_u32(a, b, v, 0);
5149*207e5cccSFangrui Song }
5150*207e5cccSFangrui Song 
5151*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlsq_laneq_u32_0(
5152*207e5cccSFangrui Song // CHECK-NEXT:  entry:
5153*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5154*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
5155*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
5156*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
5157*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
5158*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[SUB]]
5159*207e5cccSFangrui Song //
// Codegen test: calls vmlsq_laneq_u32 with constant lane index 0 on v.
// Expected IR above: a shufflevector with an all-zero mask yields a
// <4 x i32> whose elements all come from lane 0 of v, then
// mul <4 x i32> with b and sub <4 x i32> from a.
5160*207e5cccSFangrui Song uint32x4_t test_vmlsq_laneq_u32_0(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
5161*207e5cccSFangrui Song   return vmlsq_laneq_u32(a, b, v, 0);
5162*207e5cccSFangrui Song }
5163*207e5cccSFangrui Song 
5164*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmlsl_laneq_s16_0(
5165*207e5cccSFangrui Song // CHECK-NEXT:  entry:
5166*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5167*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
5168*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
5169*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
5170*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
5171*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
5172*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
5173*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
5174*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[VQDMLSL_V3_I]]
5175*207e5cccSFangrui Song //
// Codegen test: calls vqdmlsl_laneq_s16 with constant lane index 0 on v.
// Expected IR above: lane 0 of the <8 x i16> v is replicated into a
// <4 x i16> via shufflevector, passed with b to
// @llvm.aarch64.neon.sqdmull.v4i32, and that result is combined with a by
// @llvm.aarch64.neon.sqsub.v4i32.
5176*207e5cccSFangrui Song int32x4_t test_vqdmlsl_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
5177*207e5cccSFangrui Song   return vqdmlsl_laneq_s16(a, b, v, 0);
5178*207e5cccSFangrui Song }
5179*207e5cccSFangrui Song 
5180*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmlsl_laneq_s32_0(
5181*207e5cccSFangrui Song // CHECK-NEXT:  entry:
5182*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5183*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
5184*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
5185*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
5186*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
5187*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
5188*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
5189*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
5190*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[VQDMLSL_V3_I]]
5191*207e5cccSFangrui Song //
// Codegen test: calls vqdmlsl_laneq_s32 with constant lane index 0 on v.
// Expected IR above: lane 0 of the <4 x i32> v is replicated into a
// <2 x i32> via shufflevector, passed with b to
// @llvm.aarch64.neon.sqdmull.v2i64, and that result is combined with a by
// @llvm.aarch64.neon.sqsub.v2i64.
5192*207e5cccSFangrui Song int64x2_t test_vqdmlsl_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
5193*207e5cccSFangrui Song   return vqdmlsl_laneq_s32(a, b, v, 0);
5194*207e5cccSFangrui Song }
5195*207e5cccSFangrui Song 
5196*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmlsl_high_laneq_s16_0(
5197*207e5cccSFangrui Song // CHECK-NEXT:  entry:
5198*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
5199*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5200*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
5201*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
5202*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
5203*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
5204*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
5205*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
5206*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
5207*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[VQDMLSL_V3_I]]
5208*207e5cccSFangrui Song //
// Codegen test: calls vqdmlsl_high_laneq_s16 with constant lane index 0 on v.
// Expected IR above: elements 4..7 of b are extracted into a <4 x i16> by
// shufflevector, lane 0 of v is replicated into a <4 x i16>, both go to
// @llvm.aarch64.neon.sqdmull.v4i32, and that result is combined with a by
// @llvm.aarch64.neon.sqsub.v4i32.
5209*207e5cccSFangrui Song int32x4_t test_vqdmlsl_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
5210*207e5cccSFangrui Song   return vqdmlsl_high_laneq_s16(a, b, v, 0);
5211*207e5cccSFangrui Song }
5212*207e5cccSFangrui Song 
5213*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmlsl_high_laneq_s32_0(
5214*207e5cccSFangrui Song // CHECK-NEXT:  entry:
5215*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
5216*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5217*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
5218*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
5219*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
5220*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
5221*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
5222*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
5223*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
5224*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[VQDMLSL_V3_I]]
5225*207e5cccSFangrui Song //
// Codegen test: calls vqdmlsl_high_laneq_s32 with constant lane index 0 on v.
// Expected IR above: elements 2..3 of b are extracted into a <2 x i32> by
// shufflevector, lane 0 of v is replicated into a <2 x i32>, both go to
// @llvm.aarch64.neon.sqdmull.v2i64, and that result is combined with a by
// @llvm.aarch64.neon.sqsub.v2i64.
5226*207e5cccSFangrui Song int64x2_t test_vqdmlsl_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
5227*207e5cccSFangrui Song   return vqdmlsl_high_laneq_s32(a, b, v, 0);
5228*207e5cccSFangrui Song }
5229*207e5cccSFangrui Song 
5230*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmulh_laneq_s16_0(
5231*207e5cccSFangrui Song // CHECK-NEXT:  entry:
5232*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
5233*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5234*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
5235*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
5236*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULH_LANEQ_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v4i16.v8i16(<4 x i16> [[VQDMULH_LANEQ_V]], <8 x i16> [[VQDMULH_LANEQ_V1]], i32 0)
5237*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i16> [[VQDMULH_LANEQ_V2]]
5238*207e5cccSFangrui Song //
// Codegen test: calls vqdmulh_laneq_s16 with constant lane index 0 on v.
// Expected IR above: a and v are each round-tripped through byte-vector
// bitcasts and passed to @llvm.aarch64.neon.sqdmulh.laneq.v4i16.v8i16 with
// the lane given as the trailing i32 0 operand (no shufflevector splat).
5239*207e5cccSFangrui Song int16x4_t test_vqdmulh_laneq_s16_0(int16x4_t a, int16x8_t v) {
5240*207e5cccSFangrui Song   return vqdmulh_laneq_s16(a, v, 0);
5241*207e5cccSFangrui Song }
5242*207e5cccSFangrui Song 
5243*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmulhq_laneq_s16_0(
5244*207e5cccSFangrui Song // CHECK-NEXT:  entry:
5245*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
5246*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5247*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
5248*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
5249*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULHQ_LANEQ_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v8i16.v8i16(<8 x i16> [[VQDMULHQ_LANEQ_V]], <8 x i16> [[VQDMULHQ_LANEQ_V1]], i32 0)
5250*207e5cccSFangrui Song // CHECK-NEXT:    ret <8 x i16> [[VQDMULHQ_LANEQ_V2]]
5251*207e5cccSFangrui Song //
// Codegen test: calls vqdmulhq_laneq_s16 with constant lane index 0 on v.
// Expected IR above: a and v are each round-tripped through <16 x i8>
// bitcasts and passed to @llvm.aarch64.neon.sqdmulh.laneq.v8i16.v8i16 with
// the lane given as the trailing i32 0 operand (no shufflevector splat).
5252*207e5cccSFangrui Song int16x8_t test_vqdmulhq_laneq_s16_0(int16x8_t a, int16x8_t v) {
5253*207e5cccSFangrui Song   return vqdmulhq_laneq_s16(a, v, 0);
5254*207e5cccSFangrui Song }
5255*207e5cccSFangrui Song 
5256*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmulh_laneq_s32_0(
5257*207e5cccSFangrui Song // CHECK-NEXT:  entry:
5258*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
5259*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5260*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
5261*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
5262*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULH_LANEQ_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v2i32.v4i32(<2 x i32> [[VQDMULH_LANEQ_V]], <4 x i32> [[VQDMULH_LANEQ_V1]], i32 0)
5263*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i32> [[VQDMULH_LANEQ_V2]]
5264*207e5cccSFangrui Song //
// Codegen test: calls vqdmulh_laneq_s32 with constant lane index 0 on v.
// Expected IR above: a and v are each round-tripped through byte-vector
// bitcasts and passed to @llvm.aarch64.neon.sqdmulh.laneq.v2i32.v4i32 with
// the lane given as the trailing i32 0 operand (no shufflevector splat).
5265*207e5cccSFangrui Song int32x2_t test_vqdmulh_laneq_s32_0(int32x2_t a, int32x4_t v) {
5266*207e5cccSFangrui Song   return vqdmulh_laneq_s32(a, v, 0);
5267*207e5cccSFangrui Song }
5268*207e5cccSFangrui Song 
5269*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmulhq_laneq_s32_0(
5270*207e5cccSFangrui Song // CHECK-NEXT:  entry:
5271*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
5272*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5273*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
5274*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
5275*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULHQ_LANEQ_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v4i32.v4i32(<4 x i32> [[VQDMULHQ_LANEQ_V]], <4 x i32> [[VQDMULHQ_LANEQ_V1]], i32 0)
5276*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[VQDMULHQ_LANEQ_V2]]
5277*207e5cccSFangrui Song //
// Codegen test: calls vqdmulhq_laneq_s32 with constant lane index 0 on v.
// Expected IR above: a and v are each round-tripped through <16 x i8>
// bitcasts and passed to @llvm.aarch64.neon.sqdmulh.laneq.v4i32.v4i32 with
// the lane given as the trailing i32 0 operand (no shufflevector splat).
5278*207e5cccSFangrui Song int32x4_t test_vqdmulhq_laneq_s32_0(int32x4_t a, int32x4_t v) {
5279*207e5cccSFangrui Song   return vqdmulhq_laneq_s32(a, v, 0);
5280*207e5cccSFangrui Song }
5281*207e5cccSFangrui Song 
5282*207e5cccSFangrui Song // CHECK-LABEL: @test_vqrdmulh_laneq_s16_0(
5283*207e5cccSFangrui Song // CHECK-NEXT:  entry:
5284*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
5285*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5286*207e5cccSFangrui Song // CHECK-NEXT:    [[VQRDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
5287*207e5cccSFangrui Song // CHECK-NEXT:    [[VQRDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
5288*207e5cccSFangrui Song // CHECK-NEXT:    [[VQRDMULH_LANEQ_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v4i16.v8i16(<4 x i16> [[VQRDMULH_LANEQ_V]], <8 x i16> [[VQRDMULH_LANEQ_V1]], i32 0)
5289*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i16> [[VQRDMULH_LANEQ_V2]]
5290*207e5cccSFangrui Song //
// Codegen test: calls vqrdmulh_laneq_s16 with constant lane index 0 on v.
// Expected IR above: a and v are each round-tripped through byte-vector
// bitcasts and passed to @llvm.aarch64.neon.sqrdmulh.laneq.v4i16.v8i16 with
// the lane given as the trailing i32 0 operand (no shufflevector splat).
5291*207e5cccSFangrui Song int16x4_t test_vqrdmulh_laneq_s16_0(int16x4_t a, int16x8_t v) {
5292*207e5cccSFangrui Song   return vqrdmulh_laneq_s16(a, v, 0);
5293*207e5cccSFangrui Song }
5294*207e5cccSFangrui Song 
5295*207e5cccSFangrui Song // CHECK-LABEL: @test_vqrdmulhq_laneq_s16_0(
5296*207e5cccSFangrui Song // CHECK-NEXT:  entry:
5297*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
5298*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5299*207e5cccSFangrui Song // CHECK-NEXT:    [[VQRDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
5300*207e5cccSFangrui Song // CHECK-NEXT:    [[VQRDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
5301*207e5cccSFangrui Song // CHECK-NEXT:    [[VQRDMULHQ_LANEQ_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v8i16.v8i16(<8 x i16> [[VQRDMULHQ_LANEQ_V]], <8 x i16> [[VQRDMULHQ_LANEQ_V1]], i32 0)
5302*207e5cccSFangrui Song // CHECK-NEXT:    ret <8 x i16> [[VQRDMULHQ_LANEQ_V2]]
5303*207e5cccSFangrui Song //
// Codegen test: calls vqrdmulhq_laneq_s16 with constant lane index 0 on v.
// Expected IR above: a and v are each round-tripped through <16 x i8>
// bitcasts and passed to @llvm.aarch64.neon.sqrdmulh.laneq.v8i16.v8i16 with
// the lane given as the trailing i32 0 operand (no shufflevector splat).
5304*207e5cccSFangrui Song int16x8_t test_vqrdmulhq_laneq_s16_0(int16x8_t a, int16x8_t v) {
5305*207e5cccSFangrui Song   return vqrdmulhq_laneq_s16(a, v, 0);
5306*207e5cccSFangrui Song }
5307*207e5cccSFangrui Song 
5308*207e5cccSFangrui Song // CHECK-LABEL: @test_vqrdmulh_laneq_s32_0(
5309*207e5cccSFangrui Song // CHECK-NEXT:  entry:
5310*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
5311*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5312*207e5cccSFangrui Song // CHECK-NEXT:    [[VQRDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
5313*207e5cccSFangrui Song // CHECK-NEXT:    [[VQRDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
5314*207e5cccSFangrui Song // CHECK-NEXT:    [[VQRDMULH_LANEQ_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v2i32.v4i32(<2 x i32> [[VQRDMULH_LANEQ_V]], <4 x i32> [[VQRDMULH_LANEQ_V1]], i32 0)
5315*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i32> [[VQRDMULH_LANEQ_V2]]
5316*207e5cccSFangrui Song //
// Codegen test: calls vqrdmulh_laneq_s32 with constant lane index 0 on v.
// Expected IR above: a and v are each round-tripped through byte-vector
// bitcasts and passed to @llvm.aarch64.neon.sqrdmulh.laneq.v2i32.v4i32 with
// the lane given as the trailing i32 0 operand (no shufflevector splat).
5317*207e5cccSFangrui Song int32x2_t test_vqrdmulh_laneq_s32_0(int32x2_t a, int32x4_t v) {
5318*207e5cccSFangrui Song   return vqrdmulh_laneq_s32(a, v, 0);
5319*207e5cccSFangrui Song }
5320*207e5cccSFangrui Song 
5321*207e5cccSFangrui Song // CHECK-LABEL: @test_vqrdmulhq_laneq_s32_0(
5322*207e5cccSFangrui Song // CHECK-NEXT:  entry:
5323*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
5324*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5325*207e5cccSFangrui Song // CHECK-NEXT:    [[VQRDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
5326*207e5cccSFangrui Song // CHECK-NEXT:    [[VQRDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
5327*207e5cccSFangrui Song // CHECK-NEXT:    [[VQRDMULHQ_LANEQ_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v4i32.v4i32(<4 x i32> [[VQRDMULHQ_LANEQ_V]], <4 x i32> [[VQRDMULHQ_LANEQ_V1]], i32 0)
5328*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[VQRDMULHQ_LANEQ_V2]]
5329*207e5cccSFangrui Song //
// Codegen test: calls vqrdmulhq_laneq_s32 with constant lane index 0 on v.
// Expected IR above: a and v are each round-tripped through <16 x i8>
// bitcasts and passed to @llvm.aarch64.neon.sqrdmulh.laneq.v4i32.v4i32 with
// the lane given as the trailing i32 0 operand (no shufflevector splat).
5330*207e5cccSFangrui Song int32x4_t test_vqrdmulhq_laneq_s32_0(int32x4_t a, int32x4_t v) {
5331*207e5cccSFangrui Song   return vqrdmulhq_laneq_s32(a, v, 0);
5332*207e5cccSFangrui Song }
5333*207e5cccSFangrui Song 
5334*207e5cccSFangrui Song // CHECK-LABEL: @test_vmla_lane_u16(
5335*207e5cccSFangrui Song // CHECK-NEXT:  entry:
5336*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
5337*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
5338*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
5339*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
5340*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
5341*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i16> [[ADD]]
5342*207e5cccSFangrui Song //
// Codegen test: calls vmla_lane_u16 with constant lane index 3 on the
// 4-element v. Expected IR above: a shufflevector with mask <3, 3, 3, 3>
// replicates lane 3 of v into a <4 x i16>, then mul <4 x i16> with b and
// add <4 x i16> to a.
5343*207e5cccSFangrui Song uint16x4_t test_vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
5344*207e5cccSFangrui Song   return vmla_lane_u16(a, b, v, 3);
5345*207e5cccSFangrui Song }
5346*207e5cccSFangrui Song 
5347*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlaq_lane_u16(
5348*207e5cccSFangrui Song // CHECK-NEXT:  entry:
5349*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
5350*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
5351*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
5352*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
5353*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
5354*207e5cccSFangrui Song // CHECK-NEXT:    ret <8 x i16> [[ADD]]
5355*207e5cccSFangrui Song //
// Codegen test: calls vmlaq_lane_u16 with constant lane index 3 on the
// 4-element v. Expected IR above: a shufflevector with an eight-entry mask
// of 3s widens v to <8 x i16> (every element taken from lane 3), then
// mul <8 x i16> with b and add <8 x i16> to a.
5356*207e5cccSFangrui Song uint16x8_t test_vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
5357*207e5cccSFangrui Song   return vmlaq_lane_u16(a, b, v, 3);
5358*207e5cccSFangrui Song }
5359*207e5cccSFangrui Song 
5360*207e5cccSFangrui Song // CHECK-LABEL: @test_vmla_lane_u32(
5361*207e5cccSFangrui Song // CHECK-NEXT:  entry:
5362*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
5363*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
5364*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
5365*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
5366*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
5367*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i32> [[ADD]]
5368*207e5cccSFangrui Song //
// Codegen test: calls vmla_lane_u32 with constant lane index 1 on the
// 2-element v. Expected IR above: a shufflevector with mask <1, 1>
// replicates lane 1 of v into a <2 x i32>, then mul <2 x i32> with b and
// add <2 x i32> to a.
5369*207e5cccSFangrui Song uint32x2_t test_vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
5370*207e5cccSFangrui Song   return vmla_lane_u32(a, b, v, 1);
5371*207e5cccSFangrui Song }
5372*207e5cccSFangrui Song 
5373*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlaq_lane_u32(
5374*207e5cccSFangrui Song // CHECK-NEXT:  entry:
5375*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
5376*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
5377*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
5378*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
5379*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
5380*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[ADD]]
5381*207e5cccSFangrui Song //
// Codegen test: calls vmlaq_lane_u32 with constant lane index 1 on the
// 2-element v. Expected IR above: a shufflevector with mask <1, 1, 1, 1>
// widens v to <4 x i32> (every element taken from lane 1), then
// mul <4 x i32> with b and add <4 x i32> to a.
5382*207e5cccSFangrui Song uint32x4_t test_vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
5383*207e5cccSFangrui Song   return vmlaq_lane_u32(a, b, v, 1);
5384*207e5cccSFangrui Song }
5385*207e5cccSFangrui Song 
5386*207e5cccSFangrui Song // CHECK-LABEL: @test_vmla_laneq_u16(
5387*207e5cccSFangrui Song // CHECK-NEXT:  entry:
5388*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5389*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
5390*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
5391*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
5392*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
5393*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i16> [[ADD]]
5394*207e5cccSFangrui Song //
// Codegen test: calls vmla_laneq_u16 with constant lane index 7 on the
// 8-element v. Expected IR above: a shufflevector with mask <7, 7, 7, 7>
// narrows v to <4 x i16> (every element taken from lane 7), then
// mul <4 x i16> with b and add <4 x i16> to a.
5395*207e5cccSFangrui Song uint16x4_t test_vmla_laneq_u16(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
5396*207e5cccSFangrui Song   return vmla_laneq_u16(a, b, v, 7);
5397*207e5cccSFangrui Song }
5398*207e5cccSFangrui Song 
5399*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlaq_laneq_u16(
5400*207e5cccSFangrui Song // CHECK-NEXT:  entry:
5401*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5402*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
5403*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
5404*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
5405*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
5406*207e5cccSFangrui Song // CHECK-NEXT:    ret <8 x i16> [[ADD]]
5407*207e5cccSFangrui Song //
// Codegen test: calls vmlaq_laneq_u16 with constant lane index 7 on the
// 8-element v. Expected IR above: a shufflevector with an eight-entry mask
// of 7s replicates lane 7 of v into an <8 x i16>, then mul <8 x i16> with b
// and add <8 x i16> to a.
5408*207e5cccSFangrui Song uint16x8_t test_vmlaq_laneq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
5409*207e5cccSFangrui Song   return vmlaq_laneq_u16(a, b, v, 7);
5410*207e5cccSFangrui Song }
5411*207e5cccSFangrui Song 
5412*207e5cccSFangrui Song // CHECK-LABEL: @test_vmla_laneq_u32(
5413*207e5cccSFangrui Song // CHECK-NEXT:  entry:
5414*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5415*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
5416*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
5417*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
5418*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
5419*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i32> [[ADD]]
5420*207e5cccSFangrui Song //
// Codegen test: calls vmla_laneq_u32 with constant lane index 3 on the
// 4-element v. Expected IR above: a shufflevector with mask <3, 3> narrows
// v to <2 x i32> (every element taken from lane 3), then mul <2 x i32> with
// b and add <2 x i32> to a.
5421*207e5cccSFangrui Song uint32x2_t test_vmla_laneq_u32(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
5422*207e5cccSFangrui Song   return vmla_laneq_u32(a, b, v, 3);
5423*207e5cccSFangrui Song }
5424*207e5cccSFangrui Song 
5425*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlaq_laneq_u32(
5426*207e5cccSFangrui Song // CHECK-NEXT:  entry:
5427*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5428*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
5429*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
5430*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
5431*207e5cccSFangrui Song // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
5432*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[ADD]]
5433*207e5cccSFangrui Song //
// Codegen test: calls vmlaq_laneq_u32 with constant lane index 3 on the
// 4-element v. Expected IR above: a shufflevector with mask <3, 3, 3, 3>
// replicates lane 3 of v into a <4 x i32>, then mul <4 x i32> with b and
// add <4 x i32> to a.
5434*207e5cccSFangrui Song uint32x4_t test_vmlaq_laneq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
5435*207e5cccSFangrui Song   return vmlaq_laneq_u32(a, b, v, 3);
5436*207e5cccSFangrui Song }
5437*207e5cccSFangrui Song 
5438*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmlal_laneq_s16(
5439*207e5cccSFangrui Song // CHECK-NEXT:  entry:
5440*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5441*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
5442*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
5443*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
5444*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
5445*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
5446*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
5447*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
5448*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[VQDMLAL_V3_I]]
5449*207e5cccSFangrui Song //
// Codegen test: calls vqdmlal_laneq_s16 with constant lane index 7 on the
// 8-element v. Expected IR above: lane 7 of v is replicated into a
// <4 x i16> via shufflevector, passed with b to
// @llvm.aarch64.neon.sqdmull.v4i32, and that result is combined with a by
// @llvm.aarch64.neon.sqadd.v4i32.
5450*207e5cccSFangrui Song int32x4_t test_vqdmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
5451*207e5cccSFangrui Song   return vqdmlal_laneq_s16(a, b, v, 7);
5452*207e5cccSFangrui Song }
5453*207e5cccSFangrui Song 
5454*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmlal_laneq_s32(
5455*207e5cccSFangrui Song // CHECK-NEXT:  entry:
5456*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5457*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
5458*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
5459*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
5460*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
5461*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
5462*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
5463*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
5464*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[VQDMLAL_V3_I]]
5465*207e5cccSFangrui Song //
// Codegen test: calls vqdmlal_laneq_s32 with constant lane index 3 on the
// 4-element v. Expected IR above: lane 3 of v is replicated into a
// <2 x i32> via shufflevector, passed with b to
// @llvm.aarch64.neon.sqdmull.v2i64, and that result is combined with a by
// @llvm.aarch64.neon.sqadd.v2i64.
5466*207e5cccSFangrui Song int64x2_t test_vqdmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
5467*207e5cccSFangrui Song   return vqdmlal_laneq_s32(a, b, v, 3);
5468*207e5cccSFangrui Song }
5469*207e5cccSFangrui Song 
5470*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmlal_high_laneq_s16(
5471*207e5cccSFangrui Song // CHECK-NEXT:  entry:
5472*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
5473*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5474*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
5475*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
5476*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
5477*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
5478*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
5479*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
5480*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
5481*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[VQDMLAL_V3_I]]
5482*207e5cccSFangrui Song //
// Codegen test: calls vqdmlal_high_laneq_s16 with constant lane index 7 on
// the 8-element v. Expected IR above: elements 4..7 of b are extracted into
// a <4 x i16> by shufflevector, lane 7 of v is replicated into a <4 x i16>,
// both go to @llvm.aarch64.neon.sqdmull.v4i32, and that result is combined
// with a by @llvm.aarch64.neon.sqadd.v4i32.
5483*207e5cccSFangrui Song int32x4_t test_vqdmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
5484*207e5cccSFangrui Song   return vqdmlal_high_laneq_s16(a, b, v, 7);
5485*207e5cccSFangrui Song }
5486*207e5cccSFangrui Song 
5487*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmlal_high_laneq_s32(
5488*207e5cccSFangrui Song // CHECK-NEXT:  entry:
5489*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
5490*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5491*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
5492*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
5493*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
5494*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
5495*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
5496*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
5497*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
5498*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[VQDMLAL_V3_I]]
5499*207e5cccSFangrui Song //
5500*207e5cccSFangrui Song int64x2_t test_vqdmlal_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
  // Codegen fixture: lane index 3 of `v`. The expected IR above extracts
  // elements 2..3 of `b`, splats lane 3 of `v` into 2 x i32, calls
  // llvm.aarch64.neon.sqdmull.v2i64 on the two, then sqadd.v2i64 with `a`.
5501*207e5cccSFangrui Song   return vqdmlal_high_laneq_s32(a, b, v, 3);
5502*207e5cccSFangrui Song }
5503*207e5cccSFangrui Song 
5504*207e5cccSFangrui Song // CHECK-LABEL: @test_vmls_lane_u16(
5505*207e5cccSFangrui Song // CHECK-NEXT:  entry:
5506*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
5507*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
5508*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
5509*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
5510*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
5511*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i16> [[SUB]]
5512*207e5cccSFangrui Song //
5513*207e5cccSFangrui Song uint16x4_t test_vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
  // Codegen fixture: lane index 3 of `v`. The expected IR above is a 4-lane
  // splat of that element, a plain `mul` with `b`, and a `sub` from `a`
  // (all on <4 x i16>); no target intrinsic call is expected.
5514*207e5cccSFangrui Song   return vmls_lane_u16(a, b, v, 3);
5515*207e5cccSFangrui Song }
5516*207e5cccSFangrui Song 
5517*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlsq_lane_u16(
5518*207e5cccSFangrui Song // CHECK-NEXT:  entry:
5519*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
5520*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
5521*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
5522*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
5523*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
5524*207e5cccSFangrui Song // CHECK-NEXT:    ret <8 x i16> [[SUB]]
5525*207e5cccSFangrui Song //
5526*207e5cccSFangrui Song uint16x8_t test_vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
  // Codegen fixture: lane index 3 of the 4-element `v`. The expected IR above
  // widens the splat to 8 lanes, then does `mul` with `b` and `sub` from `a`
  // on <8 x i16>.
5527*207e5cccSFangrui Song   return vmlsq_lane_u16(a, b, v, 3);
5528*207e5cccSFangrui Song }
5529*207e5cccSFangrui Song 
5530*207e5cccSFangrui Song // CHECK-LABEL: @test_vmls_lane_u32(
5531*207e5cccSFangrui Song // CHECK-NEXT:  entry:
5532*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
5533*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
5534*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
5535*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
5536*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
5537*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i32> [[SUB]]
5538*207e5cccSFangrui Song //
5539*207e5cccSFangrui Song uint32x2_t test_vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
  // Codegen fixture: lane index 1 of `v`. The expected IR above is a 2-lane
  // splat of that element, then `mul` with `b` and `sub` from `a` on
  // <2 x i32>.
5540*207e5cccSFangrui Song   return vmls_lane_u32(a, b, v, 1);
5541*207e5cccSFangrui Song }
5542*207e5cccSFangrui Song 
5543*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlsq_lane_u32(
5544*207e5cccSFangrui Song // CHECK-NEXT:  entry:
5545*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
5546*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
5547*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
5548*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
5549*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
5550*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[SUB]]
5551*207e5cccSFangrui Song //
5552*207e5cccSFangrui Song uint32x4_t test_vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
  // Codegen fixture: lane index 1 of the 2-element `v`. The expected IR above
  // widens the splat to 4 lanes, then does `mul` with `b` and `sub` from `a`
  // on <4 x i32>.
5553*207e5cccSFangrui Song   return vmlsq_lane_u32(a, b, v, 1);
5554*207e5cccSFangrui Song }
5555*207e5cccSFangrui Song 
5556*207e5cccSFangrui Song // CHECK-LABEL: @test_vmls_laneq_u16(
5557*207e5cccSFangrui Song // CHECK-NEXT:  entry:
5558*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5559*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
5560*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
5561*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
5562*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
5563*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i16> [[SUB]]
5564*207e5cccSFangrui Song //
5565*207e5cccSFangrui Song uint16x4_t test_vmls_laneq_u16(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
  // Codegen fixture: lane index 7 of the 8-element `v`. The expected IR above
  // narrows the splat to 4 lanes, then does `mul` with `b` and `sub` from `a`
  // on <4 x i16>.
5566*207e5cccSFangrui Song   return vmls_laneq_u16(a, b, v, 7);
5567*207e5cccSFangrui Song }
5568*207e5cccSFangrui Song 
5569*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlsq_laneq_u16(
5570*207e5cccSFangrui Song // CHECK-NEXT:  entry:
5571*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5572*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
5573*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
5574*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
5575*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
5576*207e5cccSFangrui Song // CHECK-NEXT:    ret <8 x i16> [[SUB]]
5577*207e5cccSFangrui Song //
5578*207e5cccSFangrui Song uint16x8_t test_vmlsq_laneq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
  // Codegen fixture: lane index 7 of `v`. The expected IR above is an 8-lane
  // splat of that element, then `mul` with `b` and `sub` from `a` on
  // <8 x i16>.
5579*207e5cccSFangrui Song   return vmlsq_laneq_u16(a, b, v, 7);
5580*207e5cccSFangrui Song }
5581*207e5cccSFangrui Song 
5582*207e5cccSFangrui Song // CHECK-LABEL: @test_vmls_laneq_u32(
5583*207e5cccSFangrui Song // CHECK-NEXT:  entry:
5584*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5585*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
5586*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
5587*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
5588*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
5589*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i32> [[SUB]]
5590*207e5cccSFangrui Song //
5591*207e5cccSFangrui Song uint32x2_t test_vmls_laneq_u32(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
  // Codegen fixture: lane index 3 of the 4-element `v`. The expected IR above
  // narrows the splat to 2 lanes, then does `mul` with `b` and `sub` from `a`
  // on <2 x i32>.
5592*207e5cccSFangrui Song   return vmls_laneq_u32(a, b, v, 3);
5593*207e5cccSFangrui Song }
5594*207e5cccSFangrui Song 
5595*207e5cccSFangrui Song // CHECK-LABEL: @test_vmlsq_laneq_u32(
5596*207e5cccSFangrui Song // CHECK-NEXT:  entry:
5597*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5598*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
5599*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
5600*207e5cccSFangrui Song // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
5601*207e5cccSFangrui Song // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
5602*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[SUB]]
5603*207e5cccSFangrui Song //
5604*207e5cccSFangrui Song uint32x4_t test_vmlsq_laneq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
  // Codegen fixture: lane index 3 of `v`. The expected IR above is a 4-lane
  // splat of that element, then `mul` with `b` and `sub` from `a` on
  // <4 x i32>.
5605*207e5cccSFangrui Song   return vmlsq_laneq_u32(a, b, v, 3);
5606*207e5cccSFangrui Song }
5607*207e5cccSFangrui Song 
5608*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmlsl_laneq_s16(
5609*207e5cccSFangrui Song // CHECK-NEXT:  entry:
5610*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5611*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
5612*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
5613*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
5614*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
5615*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
5616*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
5617*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
5618*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[VQDMLSL_V3_I]]
5619*207e5cccSFangrui Song //
5620*207e5cccSFangrui Song int32x4_t test_vqdmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
  // Codegen fixture: lane index 7 of `v`. The expected IR above splats that
  // element into 4 x i16, calls llvm.aarch64.neon.sqdmull.v4i32(b, splat),
  // then llvm.aarch64.neon.sqsub.v4i32(a, product).
5621*207e5cccSFangrui Song   return vqdmlsl_laneq_s16(a, b, v, 7);
5622*207e5cccSFangrui Song }
5623*207e5cccSFangrui Song 
5624*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmlsl_laneq_s32(
5625*207e5cccSFangrui Song // CHECK-NEXT:  entry:
5626*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5627*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
5628*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
5629*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
5630*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
5631*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
5632*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
5633*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
5634*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[VQDMLSL_V3_I]]
5635*207e5cccSFangrui Song //
5636*207e5cccSFangrui Song int64x2_t test_vqdmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
  // Codegen fixture: lane index 3 of `v`. The expected IR above splats that
  // element into 2 x i32, calls llvm.aarch64.neon.sqdmull.v2i64(b, splat),
  // then llvm.aarch64.neon.sqsub.v2i64(a, product).
5637*207e5cccSFangrui Song   return vqdmlsl_laneq_s32(a, b, v, 3);
5638*207e5cccSFangrui Song }
5639*207e5cccSFangrui Song 
5640*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmlsl_high_laneq_s16(
5641*207e5cccSFangrui Song // CHECK-NEXT:  entry:
5642*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
5643*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5644*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
5645*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
5646*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
5647*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
5648*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
5649*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
5650*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
5651*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[VQDMLSL_V3_I]]
5652*207e5cccSFangrui Song //
5653*207e5cccSFangrui Song int32x4_t test_vqdmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
  // Codegen fixture: lane index 7 of `v`. The expected IR above extracts
  // elements 4..7 of `b`, splats lane 7 of `v` into 4 x i16, calls
  // llvm.aarch64.neon.sqdmull.v4i32 on the two, then sqsub.v4i32 from `a`.
5654*207e5cccSFangrui Song   return vqdmlsl_high_laneq_s16(a, b, v, 7);
5655*207e5cccSFangrui Song }
5656*207e5cccSFangrui Song 
5657*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmlsl_high_laneq_s32(
5658*207e5cccSFangrui Song // CHECK-NEXT:  entry:
5659*207e5cccSFangrui Song // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
5660*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5661*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
5662*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
5663*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
5664*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
5665*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
5666*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
5667*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
5668*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i64> [[VQDMLSL_V3_I]]
5669*207e5cccSFangrui Song //
5670*207e5cccSFangrui Song int64x2_t test_vqdmlsl_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
  // Codegen fixture: lane index 3 of `v`. The expected IR above extracts
  // elements 2..3 of `b`, splats lane 3 of `v` into 2 x i32, calls
  // llvm.aarch64.neon.sqdmull.v2i64 on the two, then sqsub.v2i64 from `a`.
5671*207e5cccSFangrui Song   return vqdmlsl_high_laneq_s32(a, b, v, 3);
5672*207e5cccSFangrui Song }
5673*207e5cccSFangrui Song 
5674*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmulh_laneq_s16(
5675*207e5cccSFangrui Song // CHECK-NEXT:  entry:
5676*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
5677*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5678*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
5679*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
5680*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULH_LANEQ_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v4i16.v8i16(<4 x i16> [[VQDMULH_LANEQ_V]], <8 x i16> [[VQDMULH_LANEQ_V1]], i32 7)
5681*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i16> [[VQDMULH_LANEQ_V2]]
5682*207e5cccSFangrui Song //
5683*207e5cccSFangrui Song int16x4_t test_vqdmulh_laneq_s16(int16x4_t a, int16x8_t v) {
  // Codegen fixture: unlike the splat-based cases in this file, the expected
  // IR above keeps the lane as an immediate: a single call to
  // llvm.aarch64.neon.sqdmulh.laneq.v4i16.v8i16 with `i32 7` as last operand.
5684*207e5cccSFangrui Song   return vqdmulh_laneq_s16(a, v, 7);
5685*207e5cccSFangrui Song }
5686*207e5cccSFangrui Song 
5687*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmulhq_laneq_s16(
5688*207e5cccSFangrui Song // CHECK-NEXT:  entry:
5689*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
5690*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5691*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
5692*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
5693*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULHQ_LANEQ_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v8i16.v8i16(<8 x i16> [[VQDMULHQ_LANEQ_V]], <8 x i16> [[VQDMULHQ_LANEQ_V1]], i32 7)
5694*207e5cccSFangrui Song // CHECK-NEXT:    ret <8 x i16> [[VQDMULHQ_LANEQ_V2]]
5695*207e5cccSFangrui Song //
5696*207e5cccSFangrui Song int16x8_t test_vqdmulhq_laneq_s16(int16x8_t a, int16x8_t v) {
  // Codegen fixture: the expected IR above is a single call to
  // llvm.aarch64.neon.sqdmulh.laneq.v8i16.v8i16 with the lane passed as the
  // immediate operand `i32 7`.
5697*207e5cccSFangrui Song   return vqdmulhq_laneq_s16(a, v, 7);
5698*207e5cccSFangrui Song }
5699*207e5cccSFangrui Song 
5700*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmulh_laneq_s32(
5701*207e5cccSFangrui Song // CHECK-NEXT:  entry:
5702*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
5703*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5704*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
5705*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
5706*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULH_LANEQ_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v2i32.v4i32(<2 x i32> [[VQDMULH_LANEQ_V]], <4 x i32> [[VQDMULH_LANEQ_V1]], i32 3)
5707*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i32> [[VQDMULH_LANEQ_V2]]
5708*207e5cccSFangrui Song //
5709*207e5cccSFangrui Song int32x2_t test_vqdmulh_laneq_s32(int32x2_t a, int32x4_t v) {
  // Codegen fixture: the expected IR above is a single call to
  // llvm.aarch64.neon.sqdmulh.laneq.v2i32.v4i32 with the lane passed as the
  // immediate operand `i32 3`.
5710*207e5cccSFangrui Song   return vqdmulh_laneq_s32(a, v, 3);
5711*207e5cccSFangrui Song }
5712*207e5cccSFangrui Song 
5713*207e5cccSFangrui Song // CHECK-LABEL: @test_vqdmulhq_laneq_s32(
5714*207e5cccSFangrui Song // CHECK-NEXT:  entry:
5715*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
5716*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5717*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
5718*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
5719*207e5cccSFangrui Song // CHECK-NEXT:    [[VQDMULHQ_LANEQ_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v4i32.v4i32(<4 x i32> [[VQDMULHQ_LANEQ_V]], <4 x i32> [[VQDMULHQ_LANEQ_V1]], i32 3)
5720*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[VQDMULHQ_LANEQ_V2]]
5721*207e5cccSFangrui Song //
5722*207e5cccSFangrui Song int32x4_t test_vqdmulhq_laneq_s32(int32x4_t a, int32x4_t v) {
  // Codegen fixture: the expected IR above is a single call to
  // llvm.aarch64.neon.sqdmulh.laneq.v4i32.v4i32 with the lane passed as the
  // immediate operand `i32 3`.
5723*207e5cccSFangrui Song   return vqdmulhq_laneq_s32(a, v, 3);
5724*207e5cccSFangrui Song }
5725*207e5cccSFangrui Song 
5726*207e5cccSFangrui Song // CHECK-LABEL: @test_vqrdmulh_laneq_s16(
5727*207e5cccSFangrui Song // CHECK-NEXT:  entry:
5728*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
5729*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5730*207e5cccSFangrui Song // CHECK-NEXT:    [[VQRDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
5731*207e5cccSFangrui Song // CHECK-NEXT:    [[VQRDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
5732*207e5cccSFangrui Song // CHECK-NEXT:    [[VQRDMULH_LANEQ_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v4i16.v8i16(<4 x i16> [[VQRDMULH_LANEQ_V]], <8 x i16> [[VQRDMULH_LANEQ_V1]], i32 7)
5733*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i16> [[VQRDMULH_LANEQ_V2]]
5734*207e5cccSFangrui Song //
5735*207e5cccSFangrui Song int16x4_t test_vqrdmulh_laneq_s16(int16x4_t a, int16x8_t v) {
  // Codegen fixture: the expected IR above is a single call to
  // llvm.aarch64.neon.sqrdmulh.laneq.v4i16.v8i16 with the lane passed as the
  // immediate operand `i32 7`.
5736*207e5cccSFangrui Song   return vqrdmulh_laneq_s16(a, v, 7);
5737*207e5cccSFangrui Song }
5738*207e5cccSFangrui Song 
5739*207e5cccSFangrui Song // CHECK-LABEL: @test_vqrdmulhq_laneq_s16(
5740*207e5cccSFangrui Song // CHECK-NEXT:  entry:
5741*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
5742*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5743*207e5cccSFangrui Song // CHECK-NEXT:    [[VQRDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
5744*207e5cccSFangrui Song // CHECK-NEXT:    [[VQRDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
5745*207e5cccSFangrui Song // CHECK-NEXT:    [[VQRDMULHQ_LANEQ_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v8i16.v8i16(<8 x i16> [[VQRDMULHQ_LANEQ_V]], <8 x i16> [[VQRDMULHQ_LANEQ_V1]], i32 7)
5746*207e5cccSFangrui Song // CHECK-NEXT:    ret <8 x i16> [[VQRDMULHQ_LANEQ_V2]]
5747*207e5cccSFangrui Song //
5748*207e5cccSFangrui Song int16x8_t test_vqrdmulhq_laneq_s16(int16x8_t a, int16x8_t v) {
  // Codegen fixture: the expected IR above is a single call to
  // llvm.aarch64.neon.sqrdmulh.laneq.v8i16.v8i16 with the lane passed as the
  // immediate operand `i32 7`.
5749*207e5cccSFangrui Song   return vqrdmulhq_laneq_s16(a, v, 7);
5750*207e5cccSFangrui Song }
5751*207e5cccSFangrui Song 
5752*207e5cccSFangrui Song // CHECK-LABEL: @test_vqrdmulh_laneq_s32(
5753*207e5cccSFangrui Song // CHECK-NEXT:  entry:
5754*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
5755*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5756*207e5cccSFangrui Song // CHECK-NEXT:    [[VQRDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
5757*207e5cccSFangrui Song // CHECK-NEXT:    [[VQRDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
5758*207e5cccSFangrui Song // CHECK-NEXT:    [[VQRDMULH_LANEQ_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v2i32.v4i32(<2 x i32> [[VQRDMULH_LANEQ_V]], <4 x i32> [[VQRDMULH_LANEQ_V1]], i32 3)
5759*207e5cccSFangrui Song // CHECK-NEXT:    ret <2 x i32> [[VQRDMULH_LANEQ_V2]]
5760*207e5cccSFangrui Song //
5761*207e5cccSFangrui Song int32x2_t test_vqrdmulh_laneq_s32(int32x2_t a, int32x4_t v) {
  // Codegen fixture: the expected IR above is a single call to
  // llvm.aarch64.neon.sqrdmulh.laneq.v2i32.v4i32 with the lane passed as the
  // immediate operand `i32 3`.
5762*207e5cccSFangrui Song   return vqrdmulh_laneq_s32(a, v, 3);
5763*207e5cccSFangrui Song }
5764*207e5cccSFangrui Song 
5765*207e5cccSFangrui Song // CHECK-LABEL: @test_vqrdmulhq_laneq_s32(
5766*207e5cccSFangrui Song // CHECK-NEXT:  entry:
5767*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
5768*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5769*207e5cccSFangrui Song // CHECK-NEXT:    [[VQRDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
5770*207e5cccSFangrui Song // CHECK-NEXT:    [[VQRDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
5771*207e5cccSFangrui Song // CHECK-NEXT:    [[VQRDMULHQ_LANEQ_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v4i32.v4i32(<4 x i32> [[VQRDMULHQ_LANEQ_V]], <4 x i32> [[VQRDMULHQ_LANEQ_V1]], i32 3)
5772*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x i32> [[VQRDMULHQ_LANEQ_V2]]
5773*207e5cccSFangrui Song //
5774*207e5cccSFangrui Song int32x4_t test_vqrdmulhq_laneq_s32(int32x4_t a, int32x4_t v) {
  // Codegen fixture: the expected IR above is a single call to
  // llvm.aarch64.neon.sqrdmulh.laneq.v4i32.v4i32 with the lane passed as the
  // immediate operand `i32 3`.
5775*207e5cccSFangrui Song   return vqrdmulhq_laneq_s32(a, v, 3);
5776*207e5cccSFangrui Song }
5777