xref: /llvm-project/clang/test/CodeGen/AArch64/neon-ldst-one.c (revision 207e5ccceec8d3cc3f32723e78f2a142bc61b07d)
1*207e5cccSFangrui Song // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
2*207e5cccSFangrui Song // RUN:  -disable-O0-optnone -emit-llvm -o - %s \
3*207e5cccSFangrui Song // RUN: | opt -S -passes=mem2reg | FileCheck %s
4*207e5cccSFangrui Song 
5*207e5cccSFangrui Song // REQUIRES: aarch64-registered-target || arm-registered-target
6*207e5cccSFangrui Song 
7*207e5cccSFangrui Song #include <arm_neon.h>
8*207e5cccSFangrui Song 
9*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} <16 x i8> @test_vld1q_dup_u8(ptr noundef %a) #0 {
10*207e5cccSFangrui Song // CHECK:   [[TMP0:%.*]] = load i8, ptr %a
11*207e5cccSFangrui Song // CHECK:   [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[TMP0]], i32 0
12*207e5cccSFangrui Song // CHECK:   [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
13*207e5cccSFangrui Song // CHECK:   ret <16 x i8> [[LANE]]
// Test wrapper: passes `a` to vld1q_dup_u8 and returns the result unchanged.
// The FileCheck directives above expect an i8 load from %a, an insert into
// lane 0 of a <16 x i8>, and a shufflevector with an all-zero mask.
14*207e5cccSFangrui Song uint8x16_t test_vld1q_dup_u8(uint8_t  *a) {
15*207e5cccSFangrui Song   return vld1q_dup_u8(a);
16*207e5cccSFangrui Song }
17*207e5cccSFangrui Song 
18*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} <8 x i16> @test_vld1q_dup_u16(ptr noundef %a) #0 {
19*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = load i16, ptr %a
20*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = insertelement <8 x i16> poison, i16 [[TMP2]], i32 0
21*207e5cccSFangrui Song // CHECK:   [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
22*207e5cccSFangrui Song // CHECK:   ret <8 x i16> [[LANE]]
// Test wrapper: passes `a` to vld1q_dup_u16 and returns the result unchanged.
// The FileCheck directives above expect an i16 load from %a, an insert into
// lane 0 of a <8 x i16>, and a shufflevector with an all-zero mask.
23*207e5cccSFangrui Song uint16x8_t test_vld1q_dup_u16(uint16_t  *a) {
24*207e5cccSFangrui Song   return vld1q_dup_u16(a);
25*207e5cccSFangrui Song }
26*207e5cccSFangrui Song 
27*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} <4 x i32> @test_vld1q_dup_u32(ptr noundef %a) #0 {
28*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = load i32, ptr %a
29*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 0
30*207e5cccSFangrui Song // CHECK:   [[LANE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer
31*207e5cccSFangrui Song // CHECK:   ret <4 x i32> [[LANE]]
// Test wrapper: passes `a` to vld1q_dup_u32 and returns the result unchanged.
// The FileCheck directives above expect an i32 load from %a, an insert into
// lane 0 of a <4 x i32>, and a shufflevector with an all-zero mask.
32*207e5cccSFangrui Song uint32x4_t test_vld1q_dup_u32(uint32_t  *a) {
33*207e5cccSFangrui Song   return vld1q_dup_u32(a);
34*207e5cccSFangrui Song }
35*207e5cccSFangrui Song 
36*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} <2 x i64> @test_vld1q_dup_u64(ptr noundef %a) #0 {
37*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = load i64, ptr %a
38*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i32 0
39*207e5cccSFangrui Song // CHECK:   [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer
40*207e5cccSFangrui Song // CHECK:   ret <2 x i64> [[LANE]]
// Test wrapper: passes `a` to vld1q_dup_u64 and returns the result unchanged.
// The FileCheck directives above expect an i64 load from %a, an insert into
// lane 0 of a <2 x i64>, and a shufflevector with an all-zero mask.
41*207e5cccSFangrui Song uint64x2_t test_vld1q_dup_u64(uint64_t  *a) {
42*207e5cccSFangrui Song   return vld1q_dup_u64(a);
43*207e5cccSFangrui Song }
44*207e5cccSFangrui Song 
45*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} <16 x i8> @test_vld1q_dup_s8(ptr noundef %a) #0 {
46*207e5cccSFangrui Song // CHECK:   [[TMP0:%.*]] = load i8, ptr %a
47*207e5cccSFangrui Song // CHECK:   [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[TMP0]], i32 0
48*207e5cccSFangrui Song // CHECK:   [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
49*207e5cccSFangrui Song // CHECK:   ret <16 x i8> [[LANE]]
// Test wrapper: passes `a` to vld1q_dup_s8 and returns the result unchanged.
// The FileCheck directives above expect an i8 load from %a, an insert into
// lane 0 of a <16 x i8>, and a shufflevector with an all-zero mask.
50*207e5cccSFangrui Song int8x16_t test_vld1q_dup_s8(int8_t  *a) {
51*207e5cccSFangrui Song   return vld1q_dup_s8(a);
52*207e5cccSFangrui Song }
53*207e5cccSFangrui Song 
54*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} <8 x i16> @test_vld1q_dup_s16(ptr noundef %a) #0 {
55*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = load i16, ptr %a
56*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = insertelement <8 x i16> poison, i16 [[TMP2]], i32 0
57*207e5cccSFangrui Song // CHECK:   [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
58*207e5cccSFangrui Song // CHECK:   ret <8 x i16> [[LANE]]
// Test wrapper: passes `a` to vld1q_dup_s16 and returns the result unchanged.
// The FileCheck directives above expect an i16 load from %a, an insert into
// lane 0 of a <8 x i16>, and a shufflevector with an all-zero mask.
59*207e5cccSFangrui Song int16x8_t test_vld1q_dup_s16(int16_t  *a) {
60*207e5cccSFangrui Song   return vld1q_dup_s16(a);
61*207e5cccSFangrui Song }
62*207e5cccSFangrui Song 
63*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} <4 x i32> @test_vld1q_dup_s32(ptr noundef %a) #0 {
64*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = load i32, ptr %a
65*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 0
66*207e5cccSFangrui Song // CHECK:   [[LANE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer
67*207e5cccSFangrui Song // CHECK:   ret <4 x i32> [[LANE]]
// Test wrapper: passes `a` to vld1q_dup_s32 and returns the result unchanged.
// The FileCheck directives above expect an i32 load from %a, an insert into
// lane 0 of a <4 x i32>, and a shufflevector with an all-zero mask.
68*207e5cccSFangrui Song int32x4_t test_vld1q_dup_s32(int32_t  *a) {
69*207e5cccSFangrui Song   return vld1q_dup_s32(a);
70*207e5cccSFangrui Song }
71*207e5cccSFangrui Song 
72*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} <2 x i64> @test_vld1q_dup_s64(ptr noundef %a) #0 {
73*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = load i64, ptr %a
74*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i32 0
75*207e5cccSFangrui Song // CHECK:   [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer
76*207e5cccSFangrui Song // CHECK:   ret <2 x i64> [[LANE]]
// Test wrapper: passes `a` to vld1q_dup_s64 and returns the result unchanged.
// The FileCheck directives above expect an i64 load from %a, an insert into
// lane 0 of a <2 x i64>, and a shufflevector with an all-zero mask.
77*207e5cccSFangrui Song int64x2_t test_vld1q_dup_s64(int64_t  *a) {
78*207e5cccSFangrui Song   return vld1q_dup_s64(a);
79*207e5cccSFangrui Song }
80*207e5cccSFangrui Song 
81*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} <8 x half> @test_vld1q_dup_f16(ptr noundef %a) #0 {
82*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = load half, ptr %a
83*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = insertelement <8 x half> poison, half [[TMP2]], i32 0
84*207e5cccSFangrui Song // CHECK:   [[LANE:%.*]] = shufflevector <8 x half> [[TMP3]], <8 x half> [[TMP3]], <8 x i32> zeroinitializer
85*207e5cccSFangrui Song // CHECK:   ret <8 x half> [[LANE]]
// Test wrapper: passes `a` to vld1q_dup_f16 and returns the result unchanged.
// The FileCheck directives above expect a half load from %a, an insert into
// lane 0 of a <8 x half>, and a shufflevector with an all-zero mask.
86*207e5cccSFangrui Song float16x8_t test_vld1q_dup_f16(float16_t  *a) {
87*207e5cccSFangrui Song   return vld1q_dup_f16(a);
88*207e5cccSFangrui Song }
89*207e5cccSFangrui Song 
90*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} <4 x float> @test_vld1q_dup_f32(ptr noundef %a) #0 {
91*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = load float, ptr %a
92*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0
93*207e5cccSFangrui Song // CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP3]], <4 x i32> zeroinitializer
94*207e5cccSFangrui Song // CHECK:   ret <4 x float> [[LANE]]
// Test wrapper: passes `a` to vld1q_dup_f32 and returns the result unchanged.
// The FileCheck directives above expect a float load from %a, an insert into
// lane 0 of a <4 x float>, and a shufflevector with an all-zero mask.
95*207e5cccSFangrui Song float32x4_t test_vld1q_dup_f32(float32_t  *a) {
96*207e5cccSFangrui Song   return vld1q_dup_f32(a);
97*207e5cccSFangrui Song }
98*207e5cccSFangrui Song 
99*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} <2 x double> @test_vld1q_dup_f64(ptr noundef %a) #0 {
100*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = load double, ptr %a
101*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = insertelement <2 x double> poison, double [[TMP2]], i32 0
102*207e5cccSFangrui Song // CHECK:   [[LANE:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP3]], <2 x i32> zeroinitializer
103*207e5cccSFangrui Song // CHECK:   ret <2 x double> [[LANE]]
// Test wrapper: passes `a` to vld1q_dup_f64 and returns the result unchanged.
// The FileCheck directives above expect a double load from %a, an insert into
// lane 0 of a <2 x double>, and a shufflevector with an all-zero mask.
104*207e5cccSFangrui Song float64x2_t test_vld1q_dup_f64(float64_t  *a) {
105*207e5cccSFangrui Song   return vld1q_dup_f64(a);
106*207e5cccSFangrui Song }
107*207e5cccSFangrui Song 
108*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} <16 x i8> @test_vld1q_dup_p8(ptr noundef %a) #0 {
109*207e5cccSFangrui Song // CHECK:   [[TMP0:%.*]] = load i8, ptr %a
110*207e5cccSFangrui Song // CHECK:   [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[TMP0]], i32 0
111*207e5cccSFangrui Song // CHECK:   [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
112*207e5cccSFangrui Song // CHECK:   ret <16 x i8> [[LANE]]
// Test wrapper: passes `a` to vld1q_dup_p8 and returns the result unchanged.
// The FileCheck directives above expect an i8 load from %a, an insert into
// lane 0 of a <16 x i8>, and a shufflevector with an all-zero mask.
113*207e5cccSFangrui Song poly8x16_t test_vld1q_dup_p8(poly8_t  *a) {
114*207e5cccSFangrui Song   return vld1q_dup_p8(a);
115*207e5cccSFangrui Song }
116*207e5cccSFangrui Song 
117*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} <8 x i16> @test_vld1q_dup_p16(ptr noundef %a) #0 {
118*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = load i16, ptr %a
119*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = insertelement <8 x i16> poison, i16 [[TMP2]], i32 0
120*207e5cccSFangrui Song // CHECK:   [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
121*207e5cccSFangrui Song // CHECK:   ret <8 x i16> [[LANE]]
// Test wrapper: passes `a` to vld1q_dup_p16 and returns the result unchanged.
// The FileCheck directives above expect an i16 load from %a, an insert into
// lane 0 of a <8 x i16>, and a shufflevector with an all-zero mask.
122*207e5cccSFangrui Song poly16x8_t test_vld1q_dup_p16(poly16_t  *a) {
123*207e5cccSFangrui Song   return vld1q_dup_p16(a);
124*207e5cccSFangrui Song }
125*207e5cccSFangrui Song 
126*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} <2 x i64> @test_vld1q_dup_p64(ptr noundef %a) #0 {
127*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = load i64, ptr %a
128*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i32 0
129*207e5cccSFangrui Song // CHECK:   [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer
130*207e5cccSFangrui Song // CHECK:   ret <2 x i64> [[LANE]]
// Test wrapper: passes `a` to vld1q_dup_p64 and returns the result unchanged.
// The FileCheck directives above expect an i64 load from %a, an insert into
// lane 0 of a <2 x i64>, and a shufflevector with an all-zero mask.
131*207e5cccSFangrui Song poly64x2_t test_vld1q_dup_p64(poly64_t  *a) {
132*207e5cccSFangrui Song   return vld1q_dup_p64(a);
133*207e5cccSFangrui Song }
134*207e5cccSFangrui Song 
135*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} <8 x i8> @test_vld1_dup_u8(ptr noundef %a) #0 {
136*207e5cccSFangrui Song // CHECK:   [[TMP0:%.*]] = load i8, ptr %a
137*207e5cccSFangrui Song // CHECK:   [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i32 0
138*207e5cccSFangrui Song // CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
139*207e5cccSFangrui Song // CHECK:   ret <8 x i8> [[LANE]]
// Test wrapper: passes `a` to vld1_dup_u8 and returns the result unchanged.
// The FileCheck directives above expect an i8 load from %a, an insert into
// lane 0 of a <8 x i8>, and a shufflevector with an all-zero mask.
140*207e5cccSFangrui Song uint8x8_t test_vld1_dup_u8(uint8_t  *a) {
141*207e5cccSFangrui Song   return vld1_dup_u8(a);
142*207e5cccSFangrui Song }
143*207e5cccSFangrui Song 
144*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} <4 x i16> @test_vld1_dup_u16(ptr noundef %a) #0 {
145*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = load i16, ptr %a
146*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i32 0
147*207e5cccSFangrui Song // CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
148*207e5cccSFangrui Song // CHECK:   ret <4 x i16> [[LANE]]
// Test wrapper: passes `a` to vld1_dup_u16 and returns the result unchanged.
// The FileCheck directives above expect an i16 load from %a, an insert into
// lane 0 of a <4 x i16>, and a shufflevector with an all-zero mask.
149*207e5cccSFangrui Song uint16x4_t test_vld1_dup_u16(uint16_t  *a) {
150*207e5cccSFangrui Song   return vld1_dup_u16(a);
151*207e5cccSFangrui Song }
152*207e5cccSFangrui Song 
153*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} <2 x i32> @test_vld1_dup_u32(ptr noundef %a) #0 {
154*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = load i32, ptr %a
155*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i32 0
156*207e5cccSFangrui Song // CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <2 x i32> zeroinitializer
157*207e5cccSFangrui Song // CHECK:   ret <2 x i32> [[LANE]]
// Test wrapper: passes `a` to vld1_dup_u32 and returns the result unchanged.
// The FileCheck directives above expect an i32 load from %a, an insert into
// lane 0 of a <2 x i32>, and a shufflevector with an all-zero mask.
158*207e5cccSFangrui Song uint32x2_t test_vld1_dup_u32(uint32_t  *a) {
159*207e5cccSFangrui Song   return vld1_dup_u32(a);
160*207e5cccSFangrui Song }
161*207e5cccSFangrui Song 
162*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} <1 x i64> @test_vld1_dup_u64(ptr noundef %a) #0 {
163*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = load i64, ptr %a
164*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = insertelement <1 x i64> poison, i64 [[TMP2]], i32 0
165*207e5cccSFangrui Song // CHECK:   [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
166*207e5cccSFangrui Song // CHECK:   ret <1 x i64> [[LANE]]
// Test wrapper: passes `a` to vld1_dup_u64 and returns the result unchanged.
// The FileCheck directives above expect an i64 load from %a, an insert into
// lane 0 of a <1 x i64>, and a shufflevector with an all-zero mask.
167*207e5cccSFangrui Song uint64x1_t test_vld1_dup_u64(uint64_t  *a) {
168*207e5cccSFangrui Song   return vld1_dup_u64(a);
169*207e5cccSFangrui Song }
170*207e5cccSFangrui Song 
171*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} <8 x i8> @test_vld1_dup_s8(ptr noundef %a) #0 {
172*207e5cccSFangrui Song // CHECK:   [[TMP0:%.*]] = load i8, ptr %a
173*207e5cccSFangrui Song // CHECK:   [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i32 0
174*207e5cccSFangrui Song // CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
175*207e5cccSFangrui Song // CHECK:   ret <8 x i8> [[LANE]]
// Test wrapper: passes `a` to vld1_dup_s8 and returns the result unchanged.
// The FileCheck directives above expect an i8 load from %a, an insert into
// lane 0 of a <8 x i8>, and a shufflevector with an all-zero mask.
176*207e5cccSFangrui Song int8x8_t test_vld1_dup_s8(int8_t  *a) {
177*207e5cccSFangrui Song   return vld1_dup_s8(a);
178*207e5cccSFangrui Song }
179*207e5cccSFangrui Song 
180*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} <4 x i16> @test_vld1_dup_s16(ptr noundef %a) #0 {
181*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = load i16, ptr %a
182*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i32 0
183*207e5cccSFangrui Song // CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
184*207e5cccSFangrui Song // CHECK:   ret <4 x i16> [[LANE]]
// Test wrapper: passes `a` to vld1_dup_s16 and returns the result unchanged.
// The FileCheck directives above expect an i16 load from %a, an insert into
// lane 0 of a <4 x i16>, and a shufflevector with an all-zero mask.
185*207e5cccSFangrui Song int16x4_t test_vld1_dup_s16(int16_t  *a) {
186*207e5cccSFangrui Song   return vld1_dup_s16(a);
187*207e5cccSFangrui Song }
188*207e5cccSFangrui Song 
189*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} <2 x i32> @test_vld1_dup_s32(ptr noundef %a) #0 {
190*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = load i32, ptr %a
191*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i32 0
192*207e5cccSFangrui Song // CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <2 x i32> zeroinitializer
193*207e5cccSFangrui Song // CHECK:   ret <2 x i32> [[LANE]]
// Test wrapper: passes `a` to vld1_dup_s32 and returns the result unchanged.
// The FileCheck directives above expect an i32 load from %a, an insert into
// lane 0 of a <2 x i32>, and a shufflevector with an all-zero mask.
194*207e5cccSFangrui Song int32x2_t test_vld1_dup_s32(int32_t  *a) {
195*207e5cccSFangrui Song   return vld1_dup_s32(a);
196*207e5cccSFangrui Song }
197*207e5cccSFangrui Song 
198*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} <1 x i64> @test_vld1_dup_s64(ptr noundef %a) #0 {
199*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = load i64, ptr %a
200*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = insertelement <1 x i64> poison, i64 [[TMP2]], i32 0
201*207e5cccSFangrui Song // CHECK:   [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
202*207e5cccSFangrui Song // CHECK:   ret <1 x i64> [[LANE]]
// Test wrapper: passes `a` to vld1_dup_s64 and returns the result unchanged.
// The FileCheck directives above expect an i64 load from %a, an insert into
// lane 0 of a <1 x i64>, and a shufflevector with an all-zero mask.
203*207e5cccSFangrui Song int64x1_t test_vld1_dup_s64(int64_t  *a) {
204*207e5cccSFangrui Song   return vld1_dup_s64(a);
205*207e5cccSFangrui Song }
206*207e5cccSFangrui Song 
207*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} <4 x half> @test_vld1_dup_f16(ptr noundef %a) #0 {
208*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = load half, ptr %a
209*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = insertelement <4 x half> poison, half [[TMP2]], i32 0
210*207e5cccSFangrui Song // CHECK:   [[LANE:%.*]] = shufflevector <4 x half> [[TMP3]], <4 x half> [[TMP3]], <4 x i32> zeroinitializer
211*207e5cccSFangrui Song // CHECK:   ret <4 x half> [[LANE]]
// Test wrapper: passes `a` to vld1_dup_f16 and returns the result unchanged.
// The FileCheck directives above expect a half load from %a, an insert into
// lane 0 of a <4 x half>, and a shufflevector with an all-zero mask.
212*207e5cccSFangrui Song float16x4_t test_vld1_dup_f16(float16_t  *a) {
213*207e5cccSFangrui Song   return vld1_dup_f16(a);
214*207e5cccSFangrui Song }
215*207e5cccSFangrui Song 
216*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} <2 x float> @test_vld1_dup_f32(ptr noundef %a) #0 {
217*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = load float, ptr %a
218*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i32 0
219*207e5cccSFangrui Song // CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer
220*207e5cccSFangrui Song // CHECK:   ret <2 x float> [[LANE]]
// Test wrapper: passes `a` to vld1_dup_f32 and returns the result unchanged.
// The FileCheck directives above expect a float load from %a, an insert into
// lane 0 of a <2 x float>, and a shufflevector with an all-zero mask.
221*207e5cccSFangrui Song float32x2_t test_vld1_dup_f32(float32_t  *a) {
222*207e5cccSFangrui Song   return vld1_dup_f32(a);
223*207e5cccSFangrui Song }
224*207e5cccSFangrui Song 
225*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} <1 x double> @test_vld1_dup_f64(ptr noundef %a) #0 {
226*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = load double, ptr %a
227*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = insertelement <1 x double> poison, double [[TMP2]], i32 0
228*207e5cccSFangrui Song // CHECK:   [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer
229*207e5cccSFangrui Song // CHECK:   ret <1 x double> [[LANE]]
// Test wrapper: passes `a` to vld1_dup_f64 and returns the result unchanged.
// The FileCheck directives above expect a double load from %a, an insert into
// lane 0 of a <1 x double>, and a shufflevector with an all-zero mask.
230*207e5cccSFangrui Song float64x1_t test_vld1_dup_f64(float64_t  *a) {
231*207e5cccSFangrui Song   return vld1_dup_f64(a);
232*207e5cccSFangrui Song }
233*207e5cccSFangrui Song 
234*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} <8 x i8> @test_vld1_dup_p8(ptr noundef %a) #0 {
235*207e5cccSFangrui Song // CHECK:   [[TMP0:%.*]] = load i8, ptr %a
236*207e5cccSFangrui Song // CHECK:   [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i32 0
237*207e5cccSFangrui Song // CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
238*207e5cccSFangrui Song // CHECK:   ret <8 x i8> [[LANE]]
// Test wrapper: passes `a` to vld1_dup_p8 and returns the result unchanged.
// The FileCheck directives above expect an i8 load from %a, an insert into
// lane 0 of a <8 x i8>, and a shufflevector with an all-zero mask.
239*207e5cccSFangrui Song poly8x8_t test_vld1_dup_p8(poly8_t  *a) {
240*207e5cccSFangrui Song   return vld1_dup_p8(a);
241*207e5cccSFangrui Song }
242*207e5cccSFangrui Song 
243*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} <4 x i16> @test_vld1_dup_p16(ptr noundef %a) #0 {
244*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = load i16, ptr %a
245*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i32 0
246*207e5cccSFangrui Song // CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
247*207e5cccSFangrui Song // CHECK:   ret <4 x i16> [[LANE]]
// Test wrapper: passes `a` to vld1_dup_p16 and returns the result unchanged.
// The FileCheck directives above expect an i16 load from %a, an insert into
// lane 0 of a <4 x i16>, and a shufflevector with an all-zero mask.
248*207e5cccSFangrui Song poly16x4_t test_vld1_dup_p16(poly16_t  *a) {
249*207e5cccSFangrui Song   return vld1_dup_p16(a);
250*207e5cccSFangrui Song }
251*207e5cccSFangrui Song 
252*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} <1 x i64> @test_vld1_dup_p64(ptr noundef %a) #0 {
253*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = load i64, ptr %a
254*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = insertelement <1 x i64> poison, i64 [[TMP2]], i32 0
255*207e5cccSFangrui Song // CHECK:   [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
256*207e5cccSFangrui Song // CHECK:   ret <1 x i64> [[LANE]]
// Test wrapper: passes `a` to vld1_dup_p64 and returns the result unchanged.
// The FileCheck directives above expect an i64 load from %a, an insert into
// lane 0 of a <1 x i64>, and a shufflevector with an all-zero mask.
257*207e5cccSFangrui Song poly64x1_t test_vld1_dup_p64(poly64_t  *a) {
258*207e5cccSFangrui Song   return vld1_dup_p64(a);
259*207e5cccSFangrui Song }
260*207e5cccSFangrui Song 
261*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.uint64x2x2_t @test_vld2q_dup_u64(ptr noundef %a) #0 {
262*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x2x2_t, align 16
263*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.uint64x2x2_t, align 16
264*207e5cccSFangrui Song // CHECK:   [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2r.v2i64.p0(ptr %a)
265*207e5cccSFangrui Song // CHECK:   store { <2 x i64>, <2 x i64> } [[VLD2]], ptr [[__RET]]
266*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
267*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load %struct.uint64x2x2_t, ptr [[RETVAL]], align 16
268*207e5cccSFangrui Song // CHECK:   ret %struct.uint64x2x2_t [[TMP6]]
// Test wrapper: passes `a` to vld2q_dup_u64 and returns the struct result.
// The FileCheck directives above expect a call to
// @llvm.aarch64.neon.ld2r.v2i64.p0 whose result is stored to a local and
// copied (32 bytes) into the return slot.
269*207e5cccSFangrui Song uint64x2x2_t test_vld2q_dup_u64(uint64_t  *a) {
270*207e5cccSFangrui Song   return vld2q_dup_u64(a);
271*207e5cccSFangrui Song }
272*207e5cccSFangrui Song 
273*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.int64x2x2_t @test_vld2q_dup_s64(ptr noundef %a) #0 {
274*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x2x2_t, align 16
275*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.int64x2x2_t, align 16
276*207e5cccSFangrui Song // CHECK:   [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2r.v2i64.p0(ptr %a)
277*207e5cccSFangrui Song // CHECK:   store { <2 x i64>, <2 x i64> } [[VLD2]], ptr [[__RET]]
278*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
279*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load %struct.int64x2x2_t, ptr [[RETVAL]], align 16
280*207e5cccSFangrui Song // CHECK:   ret %struct.int64x2x2_t [[TMP6]]
// Test wrapper: passes `a` to vld2q_dup_s64 and returns the struct result.
// The FileCheck directives above expect a call to
// @llvm.aarch64.neon.ld2r.v2i64.p0 whose result is stored to a local and
// copied (32 bytes) into the return slot.
281*207e5cccSFangrui Song int64x2x2_t test_vld2q_dup_s64(int64_t  *a) {
282*207e5cccSFangrui Song   return vld2q_dup_s64(a);
283*207e5cccSFangrui Song }
284*207e5cccSFangrui Song 
285*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.float64x2x2_t @test_vld2q_dup_f64(ptr noundef %a) #0 {
286*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x2x2_t, align 16
287*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.float64x2x2_t, align 16
288*207e5cccSFangrui Song // CHECK:   [[VLD2:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2r.v2f64.p0(ptr %a)
289*207e5cccSFangrui Song // CHECK:   store { <2 x double>, <2 x double> } [[VLD2]], ptr [[__RET]]
290*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
291*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load %struct.float64x2x2_t, ptr [[RETVAL]], align 16
292*207e5cccSFangrui Song // CHECK:   ret %struct.float64x2x2_t [[TMP6]]
// Test wrapper: passes `a` to vld2q_dup_f64 and returns the struct result.
// The FileCheck directives above expect a call to
// @llvm.aarch64.neon.ld2r.v2f64.p0 whose result is stored to a local and
// copied (32 bytes) into the return slot.
293*207e5cccSFangrui Song float64x2x2_t test_vld2q_dup_f64(float64_t  *a) {
294*207e5cccSFangrui Song   return vld2q_dup_f64(a);
295*207e5cccSFangrui Song }
296*207e5cccSFangrui Song 
297*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.poly64x2x2_t @test_vld2q_dup_p64(ptr noundef %a) #0 {
298*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x2x2_t, align 16
299*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.poly64x2x2_t, align 16
300*207e5cccSFangrui Song // CHECK:   [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2r.v2i64.p0(ptr %a)
301*207e5cccSFangrui Song // CHECK:   store { <2 x i64>, <2 x i64> } [[VLD2]], ptr [[__RET]]
302*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
303*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load %struct.poly64x2x2_t, ptr [[RETVAL]], align 16
304*207e5cccSFangrui Song // CHECK:   ret %struct.poly64x2x2_t [[TMP6]]
// Test wrapper: passes `a` to vld2q_dup_p64 and returns the struct result.
// The FileCheck directives above expect a call to
// @llvm.aarch64.neon.ld2r.v2i64.p0 whose result is stored to a local and
// copied (32 bytes) into the return slot.
305*207e5cccSFangrui Song poly64x2x2_t test_vld2q_dup_p64(poly64_t  *a) {
306*207e5cccSFangrui Song   return vld2q_dup_p64(a);
307*207e5cccSFangrui Song }
308*207e5cccSFangrui Song 
309*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.float64x1x2_t @test_vld2_dup_f64(ptr noundef %a) #0 {
310*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x1x2_t, align 8
311*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.float64x1x2_t, align 8
312*207e5cccSFangrui Song // CHECK:   [[VLD2:%.*]] = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2r.v1f64.p0(ptr %a)
313*207e5cccSFangrui Song // CHECK:   store { <1 x double>, <1 x double> } [[VLD2]], ptr [[__RET]]
314*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
315*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load %struct.float64x1x2_t, ptr [[RETVAL]], align 8
316*207e5cccSFangrui Song // CHECK:   ret %struct.float64x1x2_t [[TMP6]]
// Test wrapper: passes `a` to vld2_dup_f64 and returns the struct result.
// The FileCheck directives above expect a call to
// @llvm.aarch64.neon.ld2r.v1f64.p0 whose result is stored to a local and
// copied (16 bytes) into the return slot.
317*207e5cccSFangrui Song float64x1x2_t test_vld2_dup_f64(float64_t  *a) {
318*207e5cccSFangrui Song   return vld2_dup_f64(a);
319*207e5cccSFangrui Song }
320*207e5cccSFangrui Song 
321*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.poly64x1x2_t @test_vld2_dup_p64(ptr noundef %a) #0 {
322*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x1x2_t, align 8
323*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.poly64x1x2_t, align 8
324*207e5cccSFangrui Song // CHECK:   [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2r.v1i64.p0(ptr %a)
325*207e5cccSFangrui Song // CHECK:   store { <1 x i64>, <1 x i64> } [[VLD2]], ptr [[__RET]]
326*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
327*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load %struct.poly64x1x2_t, ptr [[RETVAL]], align 8
328*207e5cccSFangrui Song // CHECK:   ret %struct.poly64x1x2_t [[TMP6]]
// Test wrapper: passes `a` to vld2_dup_p64 and returns the struct result.
// The FileCheck directives above expect a call to
// @llvm.aarch64.neon.ld2r.v1i64.p0 whose result is stored to a local and
// copied (16 bytes) into the return slot.
329*207e5cccSFangrui Song poly64x1x2_t test_vld2_dup_p64(poly64_t  *a) {
330*207e5cccSFangrui Song   return vld2_dup_p64(a);
331*207e5cccSFangrui Song }
332*207e5cccSFangrui Song 
333*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.uint64x2x3_t @test_vld3q_dup_u64(ptr noundef %a) #0 {
334*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x2x3_t, align 16
335*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.uint64x2x3_t, align 16
336*207e5cccSFangrui Song // CHECK:   [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3r.v2i64.p0(ptr %a)
337*207e5cccSFangrui Song // CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], ptr [[__RET]]
338*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
339*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load %struct.uint64x2x3_t, ptr [[RETVAL]], align 16
340*207e5cccSFangrui Song // CHECK:   ret %struct.uint64x2x3_t [[TMP6]]
// Test wrapper: passes `a` to vld3q_dup_u64 and returns the struct result.
// The FileCheck directives above expect a call to
// @llvm.aarch64.neon.ld3r.v2i64.p0 whose result is stored to a local and
// copied (48 bytes) into the return slot.
341*207e5cccSFangrui Song uint64x2x3_t test_vld3q_dup_u64(uint64_t  *a) {
342*207e5cccSFangrui Song   return vld3q_dup_u64(a);
343*207e5cccSFangrui Song   // NOTE(review): "[{{x[0-9]+|sp}}]" carries no FileCheck prefix, so it is never matched; presumably a leftover register-operand pattern - confirm and remove.
344*207e5cccSFangrui Song }
345*207e5cccSFangrui Song 
346*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.int64x2x3_t @test_vld3q_dup_s64(ptr noundef %a) #0 {
347*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x2x3_t, align 16
348*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.int64x2x3_t, align 16
349*207e5cccSFangrui Song // CHECK:   [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3r.v2i64.p0(ptr %a)
350*207e5cccSFangrui Song // CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], ptr [[__RET]]
351*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
352*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load %struct.int64x2x3_t, ptr [[RETVAL]], align 16
353*207e5cccSFangrui Song // CHECK:   ret %struct.int64x2x3_t [[TMP6]]
// Test wrapper: passes `a` to vld3q_dup_s64 and returns the struct result.
// The FileCheck directives above expect a call to
// @llvm.aarch64.neon.ld3r.v2i64.p0 whose result is stored to a local and
// copied (48 bytes) into the return slot.
354*207e5cccSFangrui Song int64x2x3_t test_vld3q_dup_s64(int64_t  *a) {
355*207e5cccSFangrui Song   return vld3q_dup_s64(a);
356*207e5cccSFangrui Song   // NOTE(review): "[{{x[0-9]+|sp}}]" carries no FileCheck prefix, so it is never matched; presumably a leftover register-operand pattern - confirm and remove.
357*207e5cccSFangrui Song }
358*207e5cccSFangrui Song 
359*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.float64x2x3_t @test_vld3q_dup_f64(ptr noundef %a) #0 {
360*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x2x3_t, align 16
361*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.float64x2x3_t, align 16
362*207e5cccSFangrui Song // CHECK:   [[VLD3:%.*]] = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3r.v2f64.p0(ptr %a)
363*207e5cccSFangrui Song // CHECK:   store { <2 x double>, <2 x double>, <2 x double> } [[VLD3]], ptr [[__RET]]
364*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
365*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load %struct.float64x2x3_t, ptr [[RETVAL]], align 16
366*207e5cccSFangrui Song // CHECK:   ret %struct.float64x2x3_t [[TMP6]]
// Test wrapper: passes `a` to vld3q_dup_f64 and returns the struct result.
// The FileCheck directives above expect a call to
// @llvm.aarch64.neon.ld3r.v2f64.p0 whose result is stored to a local and
// copied (48 bytes) into the return slot.
367*207e5cccSFangrui Song float64x2x3_t test_vld3q_dup_f64(float64_t  *a) {
368*207e5cccSFangrui Song   return vld3q_dup_f64(a);
369*207e5cccSFangrui Song   // NOTE(review): "[{{x[0-9]+|sp}}]" carries no FileCheck prefix, so it is never matched; presumably a leftover register-operand pattern - confirm and remove.
370*207e5cccSFangrui Song }
371*207e5cccSFangrui Song 
372*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.poly64x2x3_t @test_vld3q_dup_p64(ptr noundef %a) #0 {
373*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x2x3_t, align 16
374*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.poly64x2x3_t, align 16
375*207e5cccSFangrui Song // CHECK:   [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3r.v2i64.p0(ptr %a)
376*207e5cccSFangrui Song // CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], ptr [[__RET]]
377*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
378*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load %struct.poly64x2x3_t, ptr [[RETVAL]], align 16
379*207e5cccSFangrui Song // CHECK:   ret %struct.poly64x2x3_t [[TMP6]]
// Test wrapper: passes `a` to vld3q_dup_p64 and returns the struct result.
// The FileCheck directives above expect a call to
// @llvm.aarch64.neon.ld3r.v2i64.p0 whose result is stored to a local and
// copied (48 bytes) into the return slot.
380*207e5cccSFangrui Song poly64x2x3_t test_vld3q_dup_p64(poly64_t  *a) {
381*207e5cccSFangrui Song   return vld3q_dup_p64(a);
382*207e5cccSFangrui Song   // NOTE(review): "[{{x[0-9]+|sp}}]" carries no FileCheck prefix, so it is never matched; presumably a leftover register-operand pattern - confirm and remove.
383*207e5cccSFangrui Song }
384*207e5cccSFangrui Song 
385*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.float64x1x3_t @test_vld3_dup_f64(ptr noundef %a) #0 {
386*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x1x3_t, align 8
387*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.float64x1x3_t, align 8
388*207e5cccSFangrui Song // CHECK:   [[VLD3:%.*]] = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3r.v1f64.p0(ptr %a)
389*207e5cccSFangrui Song // CHECK:   store { <1 x double>, <1 x double>, <1 x double> } [[VLD3]], ptr [[__RET]]
390*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
391*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load %struct.float64x1x3_t, ptr [[RETVAL]], align 8
392*207e5cccSFangrui Song // CHECK:   ret %struct.float64x1x3_t [[TMP6]]
// Test wrapper: passes `a` to vld3_dup_f64 and returns the struct result.
// The FileCheck directives above expect a call to
// @llvm.aarch64.neon.ld3r.v1f64.p0 whose result is stored to a local and
// copied (24 bytes) into the return slot.
393*207e5cccSFangrui Song float64x1x3_t test_vld3_dup_f64(float64_t  *a) {
394*207e5cccSFangrui Song   return vld3_dup_f64(a);
395*207e5cccSFangrui Song   // NOTE(review): "[{{x[0-9]+|sp}}]" carries no FileCheck prefix, so it is never matched; presumably a leftover register-operand pattern - confirm and remove.
396*207e5cccSFangrui Song }
397*207e5cccSFangrui Song 
398*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.poly64x1x3_t @test_vld3_dup_p64(ptr noundef %a) #0 {
399*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x1x3_t, align 8
400*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.poly64x1x3_t, align 8
401*207e5cccSFangrui Song // CHECK:   [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3r.v1i64.p0(ptr %a)
402*207e5cccSFangrui Song // CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], ptr [[__RET]]
403*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
404*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load %struct.poly64x1x3_t, ptr [[RETVAL]], align 8
405*207e5cccSFangrui Song // CHECK:   ret %struct.poly64x1x3_t [[TMP6]]
// Test wrapper: passes `a` to vld3_dup_p64 and returns the struct result.
// The FileCheck directives above expect a call to
// @llvm.aarch64.neon.ld3r.v1i64.p0 whose result is stored to a local and
// copied (24 bytes) into the return slot.
406*207e5cccSFangrui Song poly64x1x3_t test_vld3_dup_p64(poly64_t  *a) {
407*207e5cccSFangrui Song   return vld3_dup_p64(a);
408*207e5cccSFangrui Song   // NOTE(review): "[{{x[0-9]+|sp}}]" carries no FileCheck prefix, so it is never matched; presumably a leftover register-operand pattern - confirm and remove.
409*207e5cccSFangrui Song }
410*207e5cccSFangrui Song 
411*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.uint64x2x4_t @test_vld4q_dup_u64(ptr noundef %a) #0 {
412*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x2x4_t, align 16
413*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.uint64x2x4_t, align 16
414*207e5cccSFangrui Song // CHECK:   [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4r.v2i64.p0(ptr %a)
415*207e5cccSFangrui Song // CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], ptr [[__RET]]
416*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
417*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load %struct.uint64x2x4_t, ptr [[RETVAL]], align 16
418*207e5cccSFangrui Song // CHECK:   ret %struct.uint64x2x4_t [[TMP6]]
// Wrapper under test: forwards `a` to vld4q_dup_u64 and returns the uint64x2x4_t by value.
// Per the expectations above: a call to @llvm.aarch64.neon.ld4r.v2i64.p0(ptr %a), whose result is stored and then copied (64 bytes, align 16) into the return slot.
419*207e5cccSFangrui Song uint64x2x4_t test_vld4q_dup_u64(uint64_t  *a) {
420*207e5cccSFangrui Song   return vld4q_dup_u64(a);
421*207e5cccSFangrui Song }
422*207e5cccSFangrui Song 
423*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.int64x2x4_t @test_vld4q_dup_s64(ptr noundef %a) #0 {
424*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x2x4_t, align 16
425*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.int64x2x4_t, align 16
426*207e5cccSFangrui Song // CHECK:   [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4r.v2i64.p0(ptr %a)
427*207e5cccSFangrui Song // CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], ptr [[__RET]]
428*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
429*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load %struct.int64x2x4_t, ptr [[RETVAL]], align 16
430*207e5cccSFangrui Song // CHECK:   ret %struct.int64x2x4_t [[TMP6]]
// Wrapper under test: forwards `a` to vld4q_dup_s64 and returns the int64x2x4_t by value.
// Per the expectations above: a call to @llvm.aarch64.neon.ld4r.v2i64.p0(ptr %a), whose result is stored and then copied (64 bytes, align 16) into the return slot.
431*207e5cccSFangrui Song int64x2x4_t test_vld4q_dup_s64(int64_t  *a) {
432*207e5cccSFangrui Song   return vld4q_dup_s64(a);
433*207e5cccSFangrui Song }
434*207e5cccSFangrui Song 
435*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.float64x2x4_t @test_vld4q_dup_f64(ptr noundef %a) #0 {
436*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x2x4_t, align 16
437*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.float64x2x4_t, align 16
438*207e5cccSFangrui Song // CHECK:   [[VLD4:%.*]] = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4r.v2f64.p0(ptr %a)
439*207e5cccSFangrui Song // CHECK:   store { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4]], ptr [[__RET]]
440*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
441*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load %struct.float64x2x4_t, ptr [[RETVAL]], align 16
442*207e5cccSFangrui Song // CHECK:   ret %struct.float64x2x4_t [[TMP6]]
// Wrapper under test: forwards `a` to vld4q_dup_f64 and returns the float64x2x4_t by value.
// Per the expectations above: a call to @llvm.aarch64.neon.ld4r.v2f64.p0(ptr %a), whose result is stored and then copied (64 bytes, align 16) into the return slot.
443*207e5cccSFangrui Song float64x2x4_t test_vld4q_dup_f64(float64_t  *a) {
444*207e5cccSFangrui Song   return vld4q_dup_f64(a);
445*207e5cccSFangrui Song }
446*207e5cccSFangrui Song 
447*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.poly64x2x4_t @test_vld4q_dup_p64(ptr noundef %a) #0 {
448*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x2x4_t, align 16
449*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.poly64x2x4_t, align 16
450*207e5cccSFangrui Song // CHECK:   [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4r.v2i64.p0(ptr %a)
451*207e5cccSFangrui Song // CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], ptr [[__RET]]
452*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
453*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load %struct.poly64x2x4_t, ptr [[RETVAL]], align 16
454*207e5cccSFangrui Song // CHECK:   ret %struct.poly64x2x4_t [[TMP6]]
// Wrapper under test: forwards `a` to vld4q_dup_p64 and returns the poly64x2x4_t by value.
// Per the expectations above: a call to @llvm.aarch64.neon.ld4r.v2i64.p0(ptr %a), whose result is stored and then copied (64 bytes, align 16) into the return slot.
455*207e5cccSFangrui Song poly64x2x4_t test_vld4q_dup_p64(poly64_t  *a) {
456*207e5cccSFangrui Song   return vld4q_dup_p64(a);
457*207e5cccSFangrui Song }
458*207e5cccSFangrui Song 
459*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.float64x1x4_t @test_vld4_dup_f64(ptr noundef %a) #0 {
460*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x1x4_t, align 8
461*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.float64x1x4_t, align 8
462*207e5cccSFangrui Song // CHECK:   [[VLD4:%.*]] = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4r.v1f64.p0(ptr %a)
463*207e5cccSFangrui Song // CHECK:   store { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4]], ptr [[__RET]]
464*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
465*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load %struct.float64x1x4_t, ptr [[RETVAL]], align 8
466*207e5cccSFangrui Song // CHECK:   ret %struct.float64x1x4_t [[TMP6]]
// Wrapper under test: forwards `a` to vld4_dup_f64 and returns the float64x1x4_t by value.
// Per the expectations above: a call to @llvm.aarch64.neon.ld4r.v1f64.p0(ptr %a), whose result is stored and then copied (32 bytes, align 8) into the return slot.
467*207e5cccSFangrui Song float64x1x4_t test_vld4_dup_f64(float64_t  *a) {
468*207e5cccSFangrui Song   return vld4_dup_f64(a);
469*207e5cccSFangrui Song }
470*207e5cccSFangrui Song 
471*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.poly64x1x4_t @test_vld4_dup_p64(ptr noundef %a) #0 {
472*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x1x4_t, align 8
473*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.poly64x1x4_t, align 8
474*207e5cccSFangrui Song // CHECK:   [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4r.v1i64.p0(ptr %a)
475*207e5cccSFangrui Song // CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], ptr [[__RET]]
476*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
477*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load %struct.poly64x1x4_t, ptr [[RETVAL]], align 8
478*207e5cccSFangrui Song // CHECK:   ret %struct.poly64x1x4_t [[TMP6]]
// Wrapper under test: forwards `a` to vld4_dup_p64 and returns the poly64x1x4_t by value.
// Per the expectations above: a call to @llvm.aarch64.neon.ld4r.v1i64.p0(ptr %a), whose result is stored and then copied (32 bytes, align 8) into the return slot.
479*207e5cccSFangrui Song poly64x1x4_t test_vld4_dup_p64(poly64_t  *a) {
480*207e5cccSFangrui Song   return vld4_dup_p64(a);
481*207e5cccSFangrui Song }
482*207e5cccSFangrui Song 
483*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} <16 x i8> @test_vld1q_lane_u8(ptr noundef %a, <16 x i8> noundef %b) #0 {
484*207e5cccSFangrui Song // CHECK:   [[TMP0:%.*]] = load i8, ptr %a
485*207e5cccSFangrui Song // CHECK:   [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15
486*207e5cccSFangrui Song // CHECK:   ret <16 x i8> [[VLD1_LANE]]
// Wrapper under test: calls vld1q_lane_u8(a, b, 15), i.e. the highest index of the <16 x i8> vector.
// Per the expectations above: one i8 is loaded from %a and inserted directly into %b at index 15 (no bitcast round-trip).
487*207e5cccSFangrui Song uint8x16_t test_vld1q_lane_u8(uint8_t  *a, uint8x16_t b) {
488*207e5cccSFangrui Song   return vld1q_lane_u8(a, b, 15);
489*207e5cccSFangrui Song }
490*207e5cccSFangrui Song 
491*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} <8 x i16> @test_vld1q_lane_u16(ptr noundef %a, <8 x i16> noundef %b) #0 {
492*207e5cccSFangrui Song // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
493*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
494*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load i16, ptr %a
495*207e5cccSFangrui Song // CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
496*207e5cccSFangrui Song // CHECK:   ret <8 x i16> [[VLD1_LANE]]
// Wrapper under test: calls vld1q_lane_u16(a, b, 7), i.e. the highest index of the <8 x i16> vector.
// Per the expectations above: %b is round-tripped through <16 x i8>, one i16 is loaded from %a, and it is inserted at index 7.
497*207e5cccSFangrui Song uint16x8_t test_vld1q_lane_u16(uint16_t  *a, uint16x8_t b) {
498*207e5cccSFangrui Song   return vld1q_lane_u16(a, b, 7);
499*207e5cccSFangrui Song }
500*207e5cccSFangrui Song 
501*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} <4 x i32> @test_vld1q_lane_u32(ptr noundef %a, <4 x i32> noundef %b) #0 {
502*207e5cccSFangrui Song // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
503*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
504*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load i32, ptr %a
505*207e5cccSFangrui Song // CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP4]], i32 3
506*207e5cccSFangrui Song // CHECK:   ret <4 x i32> [[VLD1_LANE]]
// Wrapper under test: calls vld1q_lane_u32(a, b, 3), i.e. the highest index of the <4 x i32> vector.
// Per the expectations above: %b is round-tripped through <16 x i8>, one i32 is loaded from %a, and it is inserted at index 3.
507*207e5cccSFangrui Song uint32x4_t test_vld1q_lane_u32(uint32_t  *a, uint32x4_t b) {
508*207e5cccSFangrui Song   return vld1q_lane_u32(a, b, 3);
509*207e5cccSFangrui Song }
510*207e5cccSFangrui Song 
511*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} <2 x i64> @test_vld1q_lane_u64(ptr noundef %a, <2 x i64> noundef %b) #0 {
512*207e5cccSFangrui Song // CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
513*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
514*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load i64, ptr %a
515*207e5cccSFangrui Song // CHECK:   [[VLD1_LANE:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[TMP4]], i32 1
516*207e5cccSFangrui Song // CHECK:   ret <2 x i64> [[VLD1_LANE]]
// Wrapper under test: calls vld1q_lane_u64(a, b, 1), i.e. the highest index of the <2 x i64> vector.
// Per the expectations above: %b is round-tripped through <16 x i8>, one i64 is loaded from %a, and it is inserted at index 1.
517*207e5cccSFangrui Song uint64x2_t test_vld1q_lane_u64(uint64_t  *a, uint64x2_t b) {
518*207e5cccSFangrui Song   return vld1q_lane_u64(a, b, 1);
519*207e5cccSFangrui Song }
520*207e5cccSFangrui Song 
521*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} <16 x i8> @test_vld1q_lane_s8(ptr noundef %a, <16 x i8> noundef %b) #0 {
522*207e5cccSFangrui Song // CHECK:   [[TMP0:%.*]] = load i8, ptr %a
523*207e5cccSFangrui Song // CHECK:   [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15
524*207e5cccSFangrui Song // CHECK:   ret <16 x i8> [[VLD1_LANE]]
// Wrapper under test: calls vld1q_lane_s8(a, b, 15), i.e. the highest index of the <16 x i8> vector.
// Per the expectations above: one i8 is loaded from %a and inserted directly into %b at index 15 (no bitcast round-trip).
525*207e5cccSFangrui Song int8x16_t test_vld1q_lane_s8(int8_t  *a, int8x16_t b) {
526*207e5cccSFangrui Song   return vld1q_lane_s8(a, b, 15);
527*207e5cccSFangrui Song }
528*207e5cccSFangrui Song 
529*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} <8 x i16> @test_vld1q_lane_s16(ptr noundef %a, <8 x i16> noundef %b) #0 {
530*207e5cccSFangrui Song // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
531*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
532*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load i16, ptr %a
533*207e5cccSFangrui Song // CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
534*207e5cccSFangrui Song // CHECK:   ret <8 x i16> [[VLD1_LANE]]
// Wrapper under test: calls vld1q_lane_s16(a, b, 7), i.e. the highest index of the <8 x i16> vector.
// Per the expectations above: %b is round-tripped through <16 x i8>, one i16 is loaded from %a, and it is inserted at index 7.
535*207e5cccSFangrui Song int16x8_t test_vld1q_lane_s16(int16_t  *a, int16x8_t b) {
536*207e5cccSFangrui Song   return vld1q_lane_s16(a, b, 7);
537*207e5cccSFangrui Song }
538*207e5cccSFangrui Song 
539*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} <4 x i32> @test_vld1q_lane_s32(ptr noundef %a, <4 x i32> noundef %b) #0 {
540*207e5cccSFangrui Song // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
541*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
542*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load i32, ptr %a
543*207e5cccSFangrui Song // CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP4]], i32 3
544*207e5cccSFangrui Song // CHECK:   ret <4 x i32> [[VLD1_LANE]]
// Wrapper under test: calls vld1q_lane_s32(a, b, 3), i.e. the highest index of the <4 x i32> vector.
// Per the expectations above: %b is round-tripped through <16 x i8>, one i32 is loaded from %a, and it is inserted at index 3.
545*207e5cccSFangrui Song int32x4_t test_vld1q_lane_s32(int32_t  *a, int32x4_t b) {
546*207e5cccSFangrui Song   return vld1q_lane_s32(a, b, 3);
547*207e5cccSFangrui Song }
548*207e5cccSFangrui Song 
549*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} <2 x i64> @test_vld1q_lane_s64(ptr noundef %a, <2 x i64> noundef %b) #0 {
550*207e5cccSFangrui Song // CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
551*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
552*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load i64, ptr %a
553*207e5cccSFangrui Song // CHECK:   [[VLD1_LANE:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[TMP4]], i32 1
554*207e5cccSFangrui Song // CHECK:   ret <2 x i64> [[VLD1_LANE]]
// Wrapper under test: calls vld1q_lane_s64(a, b, 1), i.e. the highest index of the <2 x i64> vector.
// Per the expectations above: %b is round-tripped through <16 x i8>, one i64 is loaded from %a, and it is inserted at index 1.
555*207e5cccSFangrui Song int64x2_t test_vld1q_lane_s64(int64_t  *a, int64x2_t b) {
556*207e5cccSFangrui Song   return vld1q_lane_s64(a, b, 1);
557*207e5cccSFangrui Song }
558*207e5cccSFangrui Song 
559*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} <8 x half> @test_vld1q_lane_f16(ptr noundef %a, <8 x half> noundef %b) #0 {
560*207e5cccSFangrui Song // CHECK:   [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
561*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
562*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load half, ptr %a
563*207e5cccSFangrui Song // CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x half> [[TMP2]], half [[TMP4]], i32 7
564*207e5cccSFangrui Song // CHECK:   ret <8 x half> [[VLD1_LANE]]
// Wrapper under test: calls vld1q_lane_f16(a, b, 7), i.e. the highest index of the <8 x half> vector.
// Per the expectations above: %b is round-tripped through <16 x i8>, one half is loaded from %a, and it is inserted at index 7.
565*207e5cccSFangrui Song float16x8_t test_vld1q_lane_f16(float16_t  *a, float16x8_t b) {
566*207e5cccSFangrui Song   return vld1q_lane_f16(a, b, 7);
567*207e5cccSFangrui Song }
568*207e5cccSFangrui Song 
569*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} <4 x float> @test_vld1q_lane_f32(ptr noundef %a, <4 x float> noundef %b) #0 {
570*207e5cccSFangrui Song // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
571*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
572*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load float, ptr %a
573*207e5cccSFangrui Song // CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP4]], i32 3
574*207e5cccSFangrui Song // CHECK:   ret <4 x float> [[VLD1_LANE]]
// Wrapper under test: calls vld1q_lane_f32(a, b, 3), i.e. the highest index of the <4 x float> vector.
// Per the expectations above: %b is round-tripped through <16 x i8>, one float is loaded from %a, and it is inserted at index 3.
575*207e5cccSFangrui Song float32x4_t test_vld1q_lane_f32(float32_t  *a, float32x4_t b) {
576*207e5cccSFangrui Song   return vld1q_lane_f32(a, b, 3);
577*207e5cccSFangrui Song }
578*207e5cccSFangrui Song 
579*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} <2 x double> @test_vld1q_lane_f64(ptr noundef %a, <2 x double> noundef %b) #0 {
580*207e5cccSFangrui Song // CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
581*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
582*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load double, ptr %a
583*207e5cccSFangrui Song // CHECK:   [[VLD1_LANE:%.*]] = insertelement <2 x double> [[TMP2]], double [[TMP4]], i32 1
584*207e5cccSFangrui Song // CHECK:   ret <2 x double> [[VLD1_LANE]]
// Wrapper under test: calls vld1q_lane_f64(a, b, 1), i.e. the highest index of the <2 x double> vector.
// Per the expectations above: %b is round-tripped through <16 x i8>, one double is loaded from %a, and it is inserted at index 1.
585*207e5cccSFangrui Song float64x2_t test_vld1q_lane_f64(float64_t  *a, float64x2_t b) {
586*207e5cccSFangrui Song   return vld1q_lane_f64(a, b, 1);
587*207e5cccSFangrui Song }
588*207e5cccSFangrui Song 
589*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} <16 x i8> @test_vld1q_lane_p8(ptr noundef %a, <16 x i8> noundef %b) #0 {
590*207e5cccSFangrui Song // CHECK:   [[TMP0:%.*]] = load i8, ptr %a
591*207e5cccSFangrui Song // CHECK:   [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15
592*207e5cccSFangrui Song // CHECK:   ret <16 x i8> [[VLD1_LANE]]
// Wrapper under test: calls vld1q_lane_p8(a, b, 15), i.e. the highest index of the <16 x i8> vector.
// Per the expectations above: one i8 is loaded from %a and inserted directly into %b at index 15 (no bitcast round-trip).
593*207e5cccSFangrui Song poly8x16_t test_vld1q_lane_p8(poly8_t  *a, poly8x16_t b) {
594*207e5cccSFangrui Song   return vld1q_lane_p8(a, b, 15);
595*207e5cccSFangrui Song }
596*207e5cccSFangrui Song 
597*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} <8 x i16> @test_vld1q_lane_p16(ptr noundef %a, <8 x i16> noundef %b) #0 {
598*207e5cccSFangrui Song // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
599*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
600*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load i16, ptr %a
601*207e5cccSFangrui Song // CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
602*207e5cccSFangrui Song // CHECK:   ret <8 x i16> [[VLD1_LANE]]
// Wrapper under test: calls vld1q_lane_p16(a, b, 7), i.e. the highest index of the <8 x i16> vector.
// Per the expectations above: %b is round-tripped through <16 x i8>, one i16 is loaded from %a, and it is inserted at index 7.
603*207e5cccSFangrui Song poly16x8_t test_vld1q_lane_p16(poly16_t  *a, poly16x8_t b) {
604*207e5cccSFangrui Song   return vld1q_lane_p16(a, b, 7);
605*207e5cccSFangrui Song }
606*207e5cccSFangrui Song 
607*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} <2 x i64> @test_vld1q_lane_p64(ptr noundef %a, <2 x i64> noundef %b) #0 {
608*207e5cccSFangrui Song // CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
609*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
610*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load i64, ptr %a
611*207e5cccSFangrui Song // CHECK:   [[VLD1_LANE:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[TMP4]], i32 1
612*207e5cccSFangrui Song // CHECK:   ret <2 x i64> [[VLD1_LANE]]
// Wrapper under test: calls vld1q_lane_p64(a, b, 1), i.e. the highest index of the <2 x i64> vector.
// Per the expectations above: %b is round-tripped through <16 x i8>, one i64 is loaded from %a, and it is inserted at index 1.
613*207e5cccSFangrui Song poly64x2_t test_vld1q_lane_p64(poly64_t  *a, poly64x2_t b) {
614*207e5cccSFangrui Song   return vld1q_lane_p64(a, b, 1);
615*207e5cccSFangrui Song }
616*207e5cccSFangrui Song 
617*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} <8 x i8> @test_vld1_lane_u8(ptr noundef %a, <8 x i8> noundef %b) #0 {
618*207e5cccSFangrui Song // CHECK:   [[TMP0:%.*]] = load i8, ptr %a
619*207e5cccSFangrui Song // CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7
620*207e5cccSFangrui Song // CHECK:   ret <8 x i8> [[VLD1_LANE]]
// Wrapper under test: calls vld1_lane_u8(a, b, 7), i.e. the highest index of the <8 x i8> vector.
// Per the expectations above: one i8 is loaded from %a and inserted directly into %b at index 7 (no bitcast round-trip).
621*207e5cccSFangrui Song uint8x8_t test_vld1_lane_u8(uint8_t  *a, uint8x8_t b) {
622*207e5cccSFangrui Song   return vld1_lane_u8(a, b, 7);
623*207e5cccSFangrui Song }
624*207e5cccSFangrui Song 
625*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} <4 x i16> @test_vld1_lane_u16(ptr noundef %a, <4 x i16> noundef %b) #0 {
626*207e5cccSFangrui Song // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
627*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
628*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load i16, ptr %a
629*207e5cccSFangrui Song // CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
630*207e5cccSFangrui Song // CHECK:   ret <4 x i16> [[VLD1_LANE]]
// Wrapper under test: calls vld1_lane_u16(a, b, 3), i.e. the highest index of the <4 x i16> vector.
// Per the expectations above: %b is round-tripped through <8 x i8>, one i16 is loaded from %a, and it is inserted at index 3.
631*207e5cccSFangrui Song uint16x4_t test_vld1_lane_u16(uint16_t  *a, uint16x4_t b) {
632*207e5cccSFangrui Song   return vld1_lane_u16(a, b, 3);
633*207e5cccSFangrui Song }
634*207e5cccSFangrui Song 
635*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} <2 x i32> @test_vld1_lane_u32(ptr noundef %a, <2 x i32> noundef %b) #0 {
636*207e5cccSFangrui Song // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
637*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
638*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load i32, ptr %a
639*207e5cccSFangrui Song // CHECK:   [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[TMP4]], i32 1
640*207e5cccSFangrui Song // CHECK:   ret <2 x i32> [[VLD1_LANE]]
// Wrapper under test: calls vld1_lane_u32(a, b, 1), i.e. the highest index of the <2 x i32> vector.
// Per the expectations above: %b is round-tripped through <8 x i8>, one i32 is loaded from %a, and it is inserted at index 1.
641*207e5cccSFangrui Song uint32x2_t test_vld1_lane_u32(uint32_t  *a, uint32x2_t b) {
642*207e5cccSFangrui Song   return vld1_lane_u32(a, b, 1);
643*207e5cccSFangrui Song }
644*207e5cccSFangrui Song 
645*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} <1 x i64> @test_vld1_lane_u64(ptr noundef %a, <1 x i64> noundef %b) #0 {
646*207e5cccSFangrui Song // CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
647*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
648*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load i64, ptr %a
649*207e5cccSFangrui Song // CHECK:   [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0
650*207e5cccSFangrui Song // CHECK:   ret <1 x i64> [[VLD1_LANE]]
// Wrapper under test: calls vld1_lane_u64(a, b, 0), the only index of the <1 x i64> vector.
// Per the expectations above: %b is round-tripped through <8 x i8>, one i64 is loaded from %a, and it is inserted at index 0.
651*207e5cccSFangrui Song uint64x1_t test_vld1_lane_u64(uint64_t  *a, uint64x1_t b) {
652*207e5cccSFangrui Song   return vld1_lane_u64(a, b, 0);
653*207e5cccSFangrui Song }
654*207e5cccSFangrui Song 
655*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} <8 x i8> @test_vld1_lane_s8(ptr noundef %a, <8 x i8> noundef %b) #0 {
656*207e5cccSFangrui Song // CHECK:   [[TMP0:%.*]] = load i8, ptr %a
657*207e5cccSFangrui Song // CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7
658*207e5cccSFangrui Song // CHECK:   ret <8 x i8> [[VLD1_LANE]]
// Wrapper under test: calls vld1_lane_s8(a, b, 7), i.e. the highest index of the <8 x i8> vector.
// Per the expectations above: one i8 is loaded from %a and inserted directly into %b at index 7 (no bitcast round-trip).
659*207e5cccSFangrui Song int8x8_t test_vld1_lane_s8(int8_t  *a, int8x8_t b) {
660*207e5cccSFangrui Song   return vld1_lane_s8(a, b, 7);
661*207e5cccSFangrui Song }
662*207e5cccSFangrui Song 
663*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} <4 x i16> @test_vld1_lane_s16(ptr noundef %a, <4 x i16> noundef %b) #0 {
664*207e5cccSFangrui Song // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
665*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
666*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load i16, ptr %a
667*207e5cccSFangrui Song // CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
668*207e5cccSFangrui Song // CHECK:   ret <4 x i16> [[VLD1_LANE]]
// Wrapper under test: calls vld1_lane_s16(a, b, 3), i.e. the highest index of the <4 x i16> vector.
// Per the expectations above: %b is round-tripped through <8 x i8>, one i16 is loaded from %a, and it is inserted at index 3.
669*207e5cccSFangrui Song int16x4_t test_vld1_lane_s16(int16_t  *a, int16x4_t b) {
670*207e5cccSFangrui Song   return vld1_lane_s16(a, b, 3);
671*207e5cccSFangrui Song }
672*207e5cccSFangrui Song 
673*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} <2 x i32> @test_vld1_lane_s32(ptr noundef %a, <2 x i32> noundef %b) #0 {
674*207e5cccSFangrui Song // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
675*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
676*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load i32, ptr %a
677*207e5cccSFangrui Song // CHECK:   [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[TMP4]], i32 1
678*207e5cccSFangrui Song // CHECK:   ret <2 x i32> [[VLD1_LANE]]
// Wrapper under test: calls vld1_lane_s32(a, b, 1), i.e. the highest index of the <2 x i32> vector.
// Per the expectations above: %b is round-tripped through <8 x i8>, one i32 is loaded from %a, and it is inserted at index 1.
679*207e5cccSFangrui Song int32x2_t test_vld1_lane_s32(int32_t  *a, int32x2_t b) {
680*207e5cccSFangrui Song   return vld1_lane_s32(a, b, 1);
681*207e5cccSFangrui Song }
682*207e5cccSFangrui Song 
683*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} <1 x i64> @test_vld1_lane_s64(ptr noundef %a, <1 x i64> noundef %b) #0 {
684*207e5cccSFangrui Song // CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
685*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
686*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load i64, ptr %a
687*207e5cccSFangrui Song // CHECK:   [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0
688*207e5cccSFangrui Song // CHECK:   ret <1 x i64> [[VLD1_LANE]]
// Wrapper under test: calls vld1_lane_s64(a, b, 0), the only index of the <1 x i64> vector.
// Per the expectations above: %b is round-tripped through <8 x i8>, one i64 is loaded from %a, and it is inserted at index 0.
689*207e5cccSFangrui Song int64x1_t test_vld1_lane_s64(int64_t  *a, int64x1_t b) {
690*207e5cccSFangrui Song   return vld1_lane_s64(a, b, 0);
691*207e5cccSFangrui Song }
692*207e5cccSFangrui Song 
693*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} <4 x half> @test_vld1_lane_f16(ptr noundef %a, <4 x half> noundef %b) #0 {
694*207e5cccSFangrui Song // CHECK:   [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
695*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
696*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load half, ptr %a
697*207e5cccSFangrui Song // CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x half> [[TMP2]], half [[TMP4]], i32 3
698*207e5cccSFangrui Song // CHECK:   ret <4 x half> [[VLD1_LANE]]
// Wrapper under test: calls vld1_lane_f16(a, b, 3), i.e. the highest index of the <4 x half> vector.
// Per the expectations above: %b is round-tripped through <8 x i8>, one half is loaded from %a, and it is inserted at index 3.
699*207e5cccSFangrui Song float16x4_t test_vld1_lane_f16(float16_t  *a, float16x4_t b) {
700*207e5cccSFangrui Song   return vld1_lane_f16(a, b, 3);
701*207e5cccSFangrui Song }
702*207e5cccSFangrui Song 
703*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} <2 x float> @test_vld1_lane_f32(ptr noundef %a, <2 x float> noundef %b) #0 {
704*207e5cccSFangrui Song // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
705*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
706*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load float, ptr %a
707*207e5cccSFangrui Song // CHECK:   [[VLD1_LANE:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP4]], i32 1
708*207e5cccSFangrui Song // CHECK:   ret <2 x float> [[VLD1_LANE]]
// Wrapper under test: calls vld1_lane_f32(a, b, 1), i.e. the highest index of the <2 x float> vector.
// Per the expectations above: %b is round-tripped through <8 x i8>, one float is loaded from %a, and it is inserted at index 1.
709*207e5cccSFangrui Song float32x2_t test_vld1_lane_f32(float32_t  *a, float32x2_t b) {
710*207e5cccSFangrui Song   return vld1_lane_f32(a, b, 1);
711*207e5cccSFangrui Song }
712*207e5cccSFangrui Song 
713*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} <1 x double> @test_vld1_lane_f64(ptr noundef %a, <1 x double> noundef %b) #0 {
714*207e5cccSFangrui Song // CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
715*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
716*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load double, ptr %a
717*207e5cccSFangrui Song // CHECK:   [[VLD1_LANE:%.*]] = insertelement <1 x double> [[TMP2]], double [[TMP4]], i32 0
718*207e5cccSFangrui Song // CHECK:   ret <1 x double> [[VLD1_LANE]]
// Wrapper under test: calls vld1_lane_f64(a, b, 0), the only index of the <1 x double> vector.
// Per the expectations above: %b is round-tripped through <8 x i8>, one double is loaded from %a, and it is inserted at index 0.
719*207e5cccSFangrui Song float64x1_t test_vld1_lane_f64(float64_t  *a, float64x1_t b) {
720*207e5cccSFangrui Song   return vld1_lane_f64(a, b, 0);
721*207e5cccSFangrui Song }
722*207e5cccSFangrui Song 
723*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} <8 x i8> @test_vld1_lane_p8(ptr noundef %a, <8 x i8> noundef %b) #0 {
724*207e5cccSFangrui Song // CHECK:   [[TMP0:%.*]] = load i8, ptr %a
725*207e5cccSFangrui Song // CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7
726*207e5cccSFangrui Song // CHECK:   ret <8 x i8> [[VLD1_LANE]]
// Wrapper under test: calls vld1_lane_p8(a, b, 7), i.e. the highest index of the <8 x i8> vector.
// Per the expectations above: one i8 is loaded from %a and inserted directly into %b at index 7 (no bitcast round-trip).
727*207e5cccSFangrui Song poly8x8_t test_vld1_lane_p8(poly8_t  *a, poly8x8_t b) {
728*207e5cccSFangrui Song   return vld1_lane_p8(a, b, 7);
729*207e5cccSFangrui Song }
730*207e5cccSFangrui Song 
731*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} <4 x i16> @test_vld1_lane_p16(ptr noundef %a, <4 x i16> noundef %b) #0 {
732*207e5cccSFangrui Song // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
733*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
734*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load i16, ptr %a
735*207e5cccSFangrui Song // CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
736*207e5cccSFangrui Song // CHECK:   ret <4 x i16> [[VLD1_LANE]]
// Wrapper under test: calls vld1_lane_p16(a, b, 3), i.e. the highest index of the <4 x i16> vector.
// Per the expectations above: %b is round-tripped through <8 x i8>, one i16 is loaded from %a, and it is inserted at index 3.
737*207e5cccSFangrui Song poly16x4_t test_vld1_lane_p16(poly16_t  *a, poly16x4_t b) {
738*207e5cccSFangrui Song   return vld1_lane_p16(a, b, 3);
739*207e5cccSFangrui Song }
740*207e5cccSFangrui Song 
741*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} <1 x i64> @test_vld1_lane_p64(ptr noundef %a, <1 x i64> noundef %b) #0 {
742*207e5cccSFangrui Song // CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
743*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
744*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load i64, ptr %a
745*207e5cccSFangrui Song // CHECK:   [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0
746*207e5cccSFangrui Song // CHECK:   ret <1 x i64> [[VLD1_LANE]]
// Wrapper under test: calls vld1_lane_p64(a, b, 0), the only index of the <1 x i64> vector.
// Per the expectations above: %b is round-tripped through <8 x i8>, one i64 is loaded from %a, and it is inserted at index 0.
747*207e5cccSFangrui Song poly64x1_t test_vld1_lane_p64(poly64_t  *a, poly64x1_t b) {
748*207e5cccSFangrui Song   return vld1_lane_p64(a, b, 0);
749*207e5cccSFangrui Song }
750*207e5cccSFangrui Song 
751*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.int8x16x2_t @test_vld2q_lane_s8(ptr noundef %ptr, [2 x <16 x i8>] alignstack(16) %src.coerce) #0 {
752*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16
753*207e5cccSFangrui Song // CHECK:   [[SRC:%.*]] = alloca %struct.int8x16x2_t, align 16
754*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x2_t, align 16
755*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x2_t, align 16
756*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[SRC]], i32 0, i32 0
757*207e5cccSFangrui Song // CHECK:   store [2 x <16 x i8>] [[SRC]].coerce, ptr [[COERCE_DIVE]], align 16
758*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[SRC]], i64 32, i1 false)
759*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[__S1]], i32 0, i32 0
760*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
761*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
762*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[__S1]], i32 0, i32 0
763*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
764*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
765*207e5cccSFangrui Song // CHECK:   [[VLD2_LANE:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2lane.v16i8.p0(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, ptr %ptr)
766*207e5cccSFangrui Song // CHECK:   store { <16 x i8>, <16 x i8> } [[VLD2_LANE]], ptr [[__RET]]
767*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
768*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = load %struct.int8x16x2_t, ptr [[RETVAL]], align 16
769*207e5cccSFangrui Song // CHECK:   ret %struct.int8x16x2_t [[TMP8]]
// Wrapper under test: calls vld2q_lane_s8(ptr, src, 15) and returns the int8x16x2_t by value.
// Per the expectations above: the coerced [2 x <16 x i8>] argument is stored to an alloca and copied (32 bytes), both vectors are reloaded,
// and @llvm.aarch64.neon.ld2lane.v16i8.p0 is called with them, lane index i64 15 and %ptr; the result is stored and copied (32 bytes) into the return slot.
770*207e5cccSFangrui Song int8x16x2_t test_vld2q_lane_s8(int8_t const * ptr, int8x16x2_t src) {
771*207e5cccSFangrui Song   return vld2q_lane_s8(ptr, src, 15);
772*207e5cccSFangrui Song }
773*207e5cccSFangrui Song 
774*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.uint8x16x2_t @test_vld2q_lane_u8(ptr noundef %ptr, [2 x <16 x i8>] alignstack(16) %src.coerce) #0 {
775*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16
776*207e5cccSFangrui Song // CHECK:   [[SRC:%.*]] = alloca %struct.uint8x16x2_t, align 16
777*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x2_t, align 16
778*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align 16
779*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[SRC]], i32 0, i32 0
780*207e5cccSFangrui Song // CHECK:   store [2 x <16 x i8>] [[SRC]].coerce, ptr [[COERCE_DIVE]], align 16
781*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[SRC]], i64 32, i1 false)
782*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[__S1]], i32 0, i32 0
783*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
784*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
785*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[__S1]], i32 0, i32 0
786*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
787*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
788*207e5cccSFangrui Song // CHECK:   [[VLD2_LANE:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2lane.v16i8.p0(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, ptr %ptr)
789*207e5cccSFangrui Song // CHECK:   store { <16 x i8>, <16 x i8> } [[VLD2_LANE]], ptr [[__RET]]
790*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
791*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = load %struct.uint8x16x2_t, ptr [[RETVAL]], align 16
792*207e5cccSFangrui Song // CHECK:   ret %struct.uint8x16x2_t [[TMP8]]
// Wrapper under test: calls vld2q_lane_u8(ptr, src, 15) and returns the uint8x16x2_t by value.
// Per the expectations above: the coerced [2 x <16 x i8>] argument is stored to an alloca and copied (32 bytes), both vectors are reloaded,
// and @llvm.aarch64.neon.ld2lane.v16i8.p0 is called with them, lane index i64 15 and %ptr; the result is stored and copied (32 bytes) into the return slot.
793*207e5cccSFangrui Song uint8x16x2_t test_vld2q_lane_u8(uint8_t const * ptr, uint8x16x2_t src) {
794*207e5cccSFangrui Song   return vld2q_lane_u8(ptr, src, 15);
795*207e5cccSFangrui Song }
796*207e5cccSFangrui Song 
797*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.poly8x16x2_t @test_vld2q_lane_p8(ptr noundef %ptr, [2 x <16 x i8>] alignstack(16) %src.coerce) #0 {
798*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16
799*207e5cccSFangrui Song // CHECK:   [[SRC:%.*]] = alloca %struct.poly8x16x2_t, align 16
800*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x2_t, align 16
801*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x2_t, align 16
802*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[SRC]], i32 0, i32 0
803*207e5cccSFangrui Song // CHECK:   store [2 x <16 x i8>] [[SRC]].coerce, ptr [[COERCE_DIVE]], align 16
804*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[SRC]], i64 32, i1 false)
805*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[__S1]], i32 0, i32 0
806*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
807*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
808*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[__S1]], i32 0, i32 0
809*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
810*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
811*207e5cccSFangrui Song // CHECK:   [[VLD2_LANE:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2lane.v16i8.p0(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, ptr %ptr)
812*207e5cccSFangrui Song // CHECK:   store { <16 x i8>, <16 x i8> } [[VLD2_LANE]], ptr [[__RET]]
813*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
814*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = load %struct.poly8x16x2_t, ptr [[RETVAL]], align 16
815*207e5cccSFangrui Song // CHECK:   ret %struct.poly8x16x2_t [[TMP8]]
// Codegen test driver: forwards ptr and src to vld2q_lane_p8 with lane index
// 15 and returns the result. The FileCheck directives above expect the two
// <16 x i8> members of src to be loaded and passed, unconverted, to
// @llvm.aarch64.neon.ld2lane.v16i8.p0 together with i64 15 and %ptr.
816*207e5cccSFangrui Song poly8x16x2_t test_vld2q_lane_p8(poly8_t const * ptr, poly8x16x2_t src) {
817*207e5cccSFangrui Song   return vld2q_lane_p8(ptr, src, 15);
818*207e5cccSFangrui Song }
819*207e5cccSFangrui Song 
820*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.int8x16x3_t @test_vld3q_lane_s8(ptr noundef %ptr, [3 x <16 x i8>] alignstack(16) %src.coerce) #0 {
821*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x16x3_t, align 16
822*207e5cccSFangrui Song // CHECK:   [[SRC:%.*]] = alloca %struct.int8x16x3_t, align 16
823*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x3_t, align 16
824*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x3_t, align 16
825*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[SRC]], i32 0, i32 0
826*207e5cccSFangrui Song // CHECK:   store [3 x <16 x i8>] [[SRC]].coerce, ptr [[COERCE_DIVE]], align 16
827*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[SRC]], i64 48, i1 false)
828*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[__S1]], i32 0, i32 0
829*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
830*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
831*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[__S1]], i32 0, i32 0
832*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
833*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
834*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[__S1]], i32 0, i32 0
835*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
836*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
837*207e5cccSFangrui Song // CHECK:   [[VLD3_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3lane.v16i8.p0(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, ptr %ptr)
838*207e5cccSFangrui Song // CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], ptr [[__RET]]
839*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
840*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = load %struct.int8x16x3_t, ptr [[RETVAL]], align 16
841*207e5cccSFangrui Song // CHECK:   ret %struct.int8x16x3_t [[TMP9]]
// Codegen test driver: forwards ptr and src to vld3q_lane_s8 with lane index
// 15 and returns the result. The FileCheck directives above expect three
// <16 x i8> operands loaded from the copy of src, a call to
// @llvm.aarch64.neon.ld3lane.v16i8.p0 with i64 15, and 48-byte memcpys for the
// three-vector struct.
842*207e5cccSFangrui Song int8x16x3_t test_vld3q_lane_s8(int8_t const * ptr, int8x16x3_t src) {
843*207e5cccSFangrui Song   return vld3q_lane_s8(ptr, src, 15);
844*207e5cccSFangrui Song }
845*207e5cccSFangrui Song 
846*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.uint8x16x3_t @test_vld3q_lane_u8(ptr noundef %ptr, [3 x <16 x i8>] alignstack(16) %src.coerce) #0 {
847*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x16x3_t, align 16
848*207e5cccSFangrui Song // CHECK:   [[SRC:%.*]] = alloca %struct.uint8x16x3_t, align 16
849*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x3_t, align 16
850*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x3_t, align 16
851*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[SRC]], i32 0, i32 0
852*207e5cccSFangrui Song // CHECK:   store [3 x <16 x i8>] [[SRC]].coerce, ptr [[COERCE_DIVE]], align 16
853*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[SRC]], i64 48, i1 false)
854*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[__S1]], i32 0, i32 0
855*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
856*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
857*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[__S1]], i32 0, i32 0
858*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
859*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
860*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[__S1]], i32 0, i32 0
861*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
862*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
863*207e5cccSFangrui Song // CHECK:   [[VLD3_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3lane.v16i8.p0(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, ptr %ptr)
864*207e5cccSFangrui Song // CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], ptr [[__RET]]
865*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
866*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = load %struct.uint8x16x3_t, ptr [[RETVAL]], align 16
867*207e5cccSFangrui Song // CHECK:   ret %struct.uint8x16x3_t [[TMP9]]
// Codegen test driver: forwards ptr and src to vld3q_lane_u8 with lane index
// 15 and returns the result. The FileCheck directives above expect the same
// IR shape as the signed variant: three <16 x i8> operands and a call to
// @llvm.aarch64.neon.ld3lane.v16i8.p0 with i64 15, using %struct.uint8x16x3_t.
868*207e5cccSFangrui Song uint8x16x3_t test_vld3q_lane_u8(uint8_t const * ptr, uint8x16x3_t src) {
869*207e5cccSFangrui Song   return vld3q_lane_u8(ptr, src, 15);
870*207e5cccSFangrui Song }
871*207e5cccSFangrui Song 
872*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.uint16x8x2_t @test_vld2q_lane_u16(ptr noundef %a, [2 x <8 x i16>] alignstack(16) %b.coerce) #0 {
873*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16
874*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
875*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16
876*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
877*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[B]], i32 0, i32 0
878*207e5cccSFangrui Song // CHECK:   store [2 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
879*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
880*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[__S1]], i32 0, i32 0
881*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
882*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
883*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
884*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[__S1]], i32 0, i32 0
885*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
886*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
887*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
888*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
889*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
890*207e5cccSFangrui Song // CHECK:   [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0(<8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i64 7, ptr %a)
891*207e5cccSFangrui Song // CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], ptr [[__RET]]
892*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
893*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = load %struct.uint16x8x2_t, ptr [[RETVAL]], align 16
894*207e5cccSFangrui Song // CHECK:   ret %struct.uint16x8x2_t [[TMP13]]
// Codegen test driver: forwards a and b to vld2q_lane_u16 with lane index 7
// and returns the result. The FileCheck directives above expect each loaded
// <8 x i16> to be bitcast to <16 x i8> and back before the call to
// @llvm.aarch64.neon.ld2lane.v8i16.p0 with i64 7 and %a.
895*207e5cccSFangrui Song uint16x8x2_t test_vld2q_lane_u16(uint16_t  *a, uint16x8x2_t b) {
896*207e5cccSFangrui Song   return vld2q_lane_u16(a, b, 7);
897*207e5cccSFangrui Song }
898*207e5cccSFangrui Song 
899*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.uint32x4x2_t @test_vld2q_lane_u32(ptr noundef %a, [2 x <4 x i32>] alignstack(16) %b.coerce) #0 {
900*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16
901*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
902*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16
903*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
904*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[B]], i32 0, i32 0
905*207e5cccSFangrui Song // CHECK:   store [2 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
906*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
907*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[__S1]], i32 0, i32 0
908*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
909*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
910*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
911*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[__S1]], i32 0, i32 0
912*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
913*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
914*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
915*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
916*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
917*207e5cccSFangrui Song // CHECK:   [[VLD2_LANE:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0(<4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i64 3, ptr %a)
918*207e5cccSFangrui Song // CHECK:   store { <4 x i32>, <4 x i32> } [[VLD2_LANE]], ptr [[__RET]]
919*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
920*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = load %struct.uint32x4x2_t, ptr [[RETVAL]], align 16
921*207e5cccSFangrui Song // CHECK:   ret %struct.uint32x4x2_t [[TMP13]]
// Codegen test driver: forwards a and b to vld2q_lane_u32 with lane index 3
// and returns the result. The FileCheck directives above expect each loaded
// <4 x i32> to be bitcast to <16 x i8> and back before the call to
// @llvm.aarch64.neon.ld2lane.v4i32.p0 with i64 3 and %a.
922*207e5cccSFangrui Song uint32x4x2_t test_vld2q_lane_u32(uint32_t  *a, uint32x4x2_t b) {
923*207e5cccSFangrui Song   return vld2q_lane_u32(a, b, 3);
924*207e5cccSFangrui Song }
925*207e5cccSFangrui Song 
926*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.uint64x2x2_t @test_vld2q_lane_u64(ptr noundef %a, [2 x <2 x i64>] alignstack(16) %b.coerce) #0 {
927*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x2x2_t, align 16
928*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.uint64x2x2_t, align 16
929*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.uint64x2x2_t, align 16
930*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.uint64x2x2_t, align 16
931*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x2x2_t, ptr [[B]], i32 0, i32 0
932*207e5cccSFangrui Song // CHECK:   store [2 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
933*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
934*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x2x2_t, ptr [[__S1]], i32 0, i32 0
935*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
936*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
937*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
938*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x2x2_t, ptr [[__S1]], i32 0, i32 0
939*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
940*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
941*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
942*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
943*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
944*207e5cccSFangrui Song // CHECK:   [[VLD2_LANE:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2lane.v2i64.p0(<2 x i64> [[TMP8]], <2 x i64> [[TMP9]], i64 1, ptr %a)
945*207e5cccSFangrui Song // CHECK:   store { <2 x i64>, <2 x i64> } [[VLD2_LANE]], ptr [[__RET]]
946*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
947*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = load %struct.uint64x2x2_t, ptr [[RETVAL]], align 16
948*207e5cccSFangrui Song // CHECK:   ret %struct.uint64x2x2_t [[TMP13]]
// Codegen test driver: forwards a and b to vld2q_lane_u64 with lane index 1
// and returns the result. The FileCheck directives above expect each loaded
// <2 x i64> to be bitcast to <16 x i8> and back before the call to
// @llvm.aarch64.neon.ld2lane.v2i64.p0 with i64 1 and %a.
949*207e5cccSFangrui Song uint64x2x2_t test_vld2q_lane_u64(uint64_t  *a, uint64x2x2_t b) {
950*207e5cccSFangrui Song   return vld2q_lane_u64(a, b, 1);
951*207e5cccSFangrui Song }
952*207e5cccSFangrui Song 
953*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.int16x8x2_t @test_vld2q_lane_s16(ptr noundef %a, [2 x <8 x i16>] alignstack(16) %b.coerce) #0 {
954*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16
955*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
956*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16
957*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
958*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[B]], i32 0, i32 0
959*207e5cccSFangrui Song // CHECK:   store [2 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
960*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
961*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[__S1]], i32 0, i32 0
962*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
963*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
964*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
965*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[__S1]], i32 0, i32 0
966*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
967*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
968*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
969*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
970*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
971*207e5cccSFangrui Song // CHECK:   [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0(<8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i64 7, ptr %a)
972*207e5cccSFangrui Song // CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], ptr [[__RET]]
973*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
974*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = load %struct.int16x8x2_t, ptr [[RETVAL]], align 16
975*207e5cccSFangrui Song // CHECK:   ret %struct.int16x8x2_t [[TMP13]]
// Codegen test driver: forwards a and b to vld2q_lane_s16 with lane index 7
// and returns the result. The FileCheck directives above expect the same
// intrinsic as the unsigned 16-bit case, @llvm.aarch64.neon.ld2lane.v8i16.p0
// with i64 7, but with %struct.int16x8x2_t as the aggregate type.
976*207e5cccSFangrui Song int16x8x2_t test_vld2q_lane_s16(int16_t  *a, int16x8x2_t b) {
977*207e5cccSFangrui Song   return vld2q_lane_s16(a, b, 7);
978*207e5cccSFangrui Song }
979*207e5cccSFangrui Song 
980*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.int32x4x2_t @test_vld2q_lane_s32(ptr noundef %a, [2 x <4 x i32>] alignstack(16) %b.coerce) #0 {
981*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16
982*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
983*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16
984*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
985*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[B]], i32 0, i32 0
986*207e5cccSFangrui Song // CHECK:   store [2 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
987*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
988*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[__S1]], i32 0, i32 0
989*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
990*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
991*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
992*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[__S1]], i32 0, i32 0
993*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
994*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
995*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
996*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
997*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
998*207e5cccSFangrui Song // CHECK:   [[VLD2_LANE:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0(<4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i64 3, ptr %a)
999*207e5cccSFangrui Song // CHECK:   store { <4 x i32>, <4 x i32> } [[VLD2_LANE]], ptr [[__RET]]
1000*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
1001*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = load %struct.int32x4x2_t, ptr [[RETVAL]], align 16
1002*207e5cccSFangrui Song // CHECK:   ret %struct.int32x4x2_t [[TMP13]]
// Codegen test driver: forwards a and b to vld2q_lane_s32 with lane index 3
// and returns the result. The FileCheck directives above expect a call to
// @llvm.aarch64.neon.ld2lane.v4i32.p0 with i64 3 and %a, operating on
// %struct.int32x4x2_t.
1003*207e5cccSFangrui Song int32x4x2_t test_vld2q_lane_s32(int32_t  *a, int32x4x2_t b) {
1004*207e5cccSFangrui Song   return vld2q_lane_s32(a, b, 3);
1005*207e5cccSFangrui Song }
1006*207e5cccSFangrui Song 
1007*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.int64x2x2_t @test_vld2q_lane_s64(ptr noundef %a, [2 x <2 x i64>] alignstack(16) %b.coerce) #0 {
1008*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x2x2_t, align 16
1009*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.int64x2x2_t, align 16
1010*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.int64x2x2_t, align 16
1011*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.int64x2x2_t, align 16
1012*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x2x2_t, ptr [[B]], i32 0, i32 0
1013*207e5cccSFangrui Song // CHECK:   store [2 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
1014*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
1015*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x2x2_t, ptr [[__S1]], i32 0, i32 0
1016*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
1017*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
1018*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
1019*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x2x2_t, ptr [[__S1]], i32 0, i32 0
1020*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
1021*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
1022*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
1023*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
1024*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
1025*207e5cccSFangrui Song // CHECK:   [[VLD2_LANE:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2lane.v2i64.p0(<2 x i64> [[TMP8]], <2 x i64> [[TMP9]], i64 1, ptr %a)
1026*207e5cccSFangrui Song // CHECK:   store { <2 x i64>, <2 x i64> } [[VLD2_LANE]], ptr [[__RET]]
1027*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
1028*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = load %struct.int64x2x2_t, ptr [[RETVAL]], align 16
1029*207e5cccSFangrui Song // CHECK:   ret %struct.int64x2x2_t [[TMP13]]
// Codegen test driver: forwards a and b to vld2q_lane_s64 with lane index 1
// and returns the result. The FileCheck directives above expect a call to
// @llvm.aarch64.neon.ld2lane.v2i64.p0 with i64 1 and %a, operating on
// %struct.int64x2x2_t.
1030*207e5cccSFangrui Song int64x2x2_t test_vld2q_lane_s64(int64_t  *a, int64x2x2_t b) {
1031*207e5cccSFangrui Song   return vld2q_lane_s64(a, b, 1);
1032*207e5cccSFangrui Song }
1033*207e5cccSFangrui Song 
1034*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.float16x8x2_t @test_vld2q_lane_f16(ptr noundef %a, [2 x <8 x half>] alignstack(16) %b.coerce) #0 {
1035*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x8x2_t, align 16
1036*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
1037*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16
1038*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
1039*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[B]], i32 0, i32 0
1040*207e5cccSFangrui Song // CHECK:   store [2 x <8 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
1041*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
1042*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[__S1]], i32 0, i32 0
1043*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL]], i64 0, i64 0
1044*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
1045*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
1046*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[__S1]], i32 0, i32 0
1047*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL1]], i64 0, i64 1
1048*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
1049*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
1050*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
1051*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
1052*207e5cccSFangrui Song // CHECK:   [[VLD2_LANE:%.*]] = call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2lane.v8f16.p0(<8 x half> [[TMP8]], <8 x half> [[TMP9]], i64 7, ptr %a)
1053*207e5cccSFangrui Song // CHECK:   store { <8 x half>, <8 x half> } [[VLD2_LANE]], ptr [[__RET]]
1054*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
1055*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = load %struct.float16x8x2_t, ptr [[RETVAL]], align 16
1056*207e5cccSFangrui Song // CHECK:   ret %struct.float16x8x2_t [[TMP13]]
// Codegen test driver: forwards a and b to vld2q_lane_f16 with lane index 7
// and returns the result. The FileCheck directives above expect each loaded
// <8 x half> to be bitcast to <16 x i8> and back before the call to
// @llvm.aarch64.neon.ld2lane.v8f16.p0 with i64 7 and %a.
1057*207e5cccSFangrui Song float16x8x2_t test_vld2q_lane_f16(float16_t  *a, float16x8x2_t b) {
1058*207e5cccSFangrui Song   return vld2q_lane_f16(a, b, 7);
1059*207e5cccSFangrui Song }
1060*207e5cccSFangrui Song 
1061*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.float32x4x2_t @test_vld2q_lane_f32(ptr noundef %a, [2 x <4 x float>] alignstack(16) %b.coerce) #0 {
1062*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16
1063*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
1064*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16
1065*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
1066*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[B]], i32 0, i32 0
1067*207e5cccSFangrui Song // CHECK:   store [2 x <4 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
1068*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
1069*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[__S1]], i32 0, i32 0
1070*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL]], i64 0, i64 0
1071*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
1072*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
1073*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[__S1]], i32 0, i32 0
1074*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL1]], i64 0, i64 1
1075*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
1076*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
1077*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
1078*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
1079*207e5cccSFangrui Song // CHECK:   [[VLD2_LANE:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2lane.v4f32.p0(<4 x float> [[TMP8]], <4 x float> [[TMP9]], i64 3, ptr %a)
1080*207e5cccSFangrui Song // CHECK:   store { <4 x float>, <4 x float> } [[VLD2_LANE]], ptr [[__RET]]
1081*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
1082*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = load %struct.float32x4x2_t, ptr [[RETVAL]], align 16
1083*207e5cccSFangrui Song // CHECK:   ret %struct.float32x4x2_t [[TMP13]]
// Codegen test driver: forwards a and b to vld2q_lane_f32 with lane index 3
// and returns the result. The FileCheck directives above expect each loaded
// <4 x float> to be bitcast to <16 x i8> and back before the call to
// @llvm.aarch64.neon.ld2lane.v4f32.p0 with i64 3 and %a.
1084*207e5cccSFangrui Song float32x4x2_t test_vld2q_lane_f32(float32_t  *a, float32x4x2_t b) {
1085*207e5cccSFangrui Song   return vld2q_lane_f32(a, b, 3);
1086*207e5cccSFangrui Song }
1087*207e5cccSFangrui Song 
1088*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.float64x2x2_t @test_vld2q_lane_f64(ptr noundef %a, [2 x <2 x double>] alignstack(16) %b.coerce) #0 {
1089*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x2x2_t, align 16
1090*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.float64x2x2_t, align 16
1091*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.float64x2x2_t, align 16
1092*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.float64x2x2_t, align 16
1093*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x2x2_t, ptr [[B]], i32 0, i32 0
1094*207e5cccSFangrui Song // CHECK:   store [2 x <2 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
1095*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
1096*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x2x2_t, ptr [[__S1]], i32 0, i32 0
1097*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x double>], ptr [[VAL]], i64 0, i64 0
1098*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16
1099*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <2 x double> [[TMP4]] to <16 x i8>
1100*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x2x2_t, ptr [[__S1]], i32 0, i32 0
1101*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x double>], ptr [[VAL1]], i64 0, i64 1
1102*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
1103*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <2 x double> [[TMP6]] to <16 x i8>
1104*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
1105*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x double>
1106*207e5cccSFangrui Song // CHECK:   [[VLD2_LANE:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2lane.v2f64.p0(<2 x double> [[TMP8]], <2 x double> [[TMP9]], i64 1, ptr %a)
1107*207e5cccSFangrui Song // CHECK:   store { <2 x double>, <2 x double> } [[VLD2_LANE]], ptr [[__RET]]
1108*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
1109*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = load %struct.float64x2x2_t, ptr [[RETVAL]], align 16
1110*207e5cccSFangrui Song // CHECK:   ret %struct.float64x2x2_t [[TMP13]]
// Codegen test driver: forwards a and b to vld2q_lane_f64 with lane index 1
// and returns the result. The FileCheck directives above expect each loaded
// <2 x double> to be bitcast to <16 x i8> and back before the call to
// @llvm.aarch64.neon.ld2lane.v2f64.p0 with i64 1 and %a.
1111*207e5cccSFangrui Song float64x2x2_t test_vld2q_lane_f64(float64_t  *a, float64x2x2_t b) {
1112*207e5cccSFangrui Song   return vld2q_lane_f64(a, b, 1);
1113*207e5cccSFangrui Song }
1114*207e5cccSFangrui Song 
1115*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.poly16x8x2_t @test_vld2q_lane_p16(ptr noundef %a, [2 x <8 x i16>] alignstack(16) %b.coerce) #0 {
1116*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16
1117*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
1118*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16
1119*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
1120*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[B]], i32 0, i32 0
1121*207e5cccSFangrui Song // CHECK:   store [2 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
1122*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
1123*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[__S1]], i32 0, i32 0
1124*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
1125*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
1126*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
1127*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[__S1]], i32 0, i32 0
1128*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
1129*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
1130*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
1131*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
1132*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
1133*207e5cccSFangrui Song // CHECK:   [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0(<8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i64 7, ptr %a)
1134*207e5cccSFangrui Song // CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], ptr [[__RET]]
1135*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
1136*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = load %struct.poly16x8x2_t, ptr [[RETVAL]], align 16
1137*207e5cccSFangrui Song // CHECK:   ret %struct.poly16x8x2_t [[TMP13]]
// Exercises vld2q_lane_p16 with lane index 7. The expectations above require a
// call to @llvm.aarch64.neon.ld2lane.v8i16.p0 taking two <8 x i16> vector
// operands, `i64 7` as the lane operand and `%a` as the pointer, with the
// result returned as %struct.poly16x8x2_t.
1138*207e5cccSFangrui Song poly16x8x2_t test_vld2q_lane_p16(poly16_t  *a, poly16x8x2_t b) {
1139*207e5cccSFangrui Song   return vld2q_lane_p16(a, b, 7);
1140*207e5cccSFangrui Song }
1141*207e5cccSFangrui Song 
1142*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.poly64x2x2_t @test_vld2q_lane_p64(ptr noundef %a, [2 x <2 x i64>] alignstack(16) %b.coerce) #0 {
1143*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x2x2_t, align 16
1144*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.poly64x2x2_t, align 16
1145*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.poly64x2x2_t, align 16
1146*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.poly64x2x2_t, align 16
1147*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x2x2_t, ptr [[B]], i32 0, i32 0
1148*207e5cccSFangrui Song // CHECK:   store [2 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
1149*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
1150*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x2x2_t, ptr [[__S1]], i32 0, i32 0
1151*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
1152*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
1153*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
1154*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x2x2_t, ptr [[__S1]], i32 0, i32 0
1155*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
1156*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
1157*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
1158*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
1159*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
1160*207e5cccSFangrui Song // CHECK:   [[VLD2_LANE:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2lane.v2i64.p0(<2 x i64> [[TMP8]], <2 x i64> [[TMP9]], i64 1, ptr %a)
1161*207e5cccSFangrui Song // CHECK:   store { <2 x i64>, <2 x i64> } [[VLD2_LANE]], ptr [[__RET]]
1162*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
1163*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = load %struct.poly64x2x2_t, ptr [[RETVAL]], align 16
1164*207e5cccSFangrui Song // CHECK:   ret %struct.poly64x2x2_t [[TMP13]]
// Exercises vld2q_lane_p64 with lane index 1. The expectations above require a
// call to @llvm.aarch64.neon.ld2lane.v2i64.p0 taking two <2 x i64> vector
// operands, `i64 1` as the lane operand and `%a` as the pointer, with the
// result returned as %struct.poly64x2x2_t.
1165*207e5cccSFangrui Song poly64x2x2_t test_vld2q_lane_p64(poly64_t  *a, poly64x2x2_t b) {
1166*207e5cccSFangrui Song   return vld2q_lane_p64(a, b, 1);
1167*207e5cccSFangrui Song }
1168*207e5cccSFangrui Song 
1169*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.uint8x8x2_t @test_vld2_lane_u8(ptr noundef %a, [2 x <8 x i8>] alignstack(8) %b.coerce) #0 {
1170*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8
1171*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
1172*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
1173*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
1174*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[B]], i32 0, i32 0
1175*207e5cccSFangrui Song // CHECK:   store [2 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
1176*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
1177*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[__S1]], i32 0, i32 0
1178*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
1179*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
1180*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[__S1]], i32 0, i32 0
1181*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
1182*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
1183*207e5cccSFangrui Song // CHECK:   [[VLD2_LANE:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, ptr %a)
1184*207e5cccSFangrui Song // CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2_LANE]], ptr [[__RET]]
1185*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
1186*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = load %struct.uint8x8x2_t, ptr [[RETVAL]], align 8
1187*207e5cccSFangrui Song // CHECK:   ret %struct.uint8x8x2_t [[TMP8]]
// Exercises vld2_lane_u8 with lane index 7. The expectations above require a
// call to @llvm.aarch64.neon.ld2lane.v8i8.p0 with `i64 7` as the lane operand
// and `%a` as the pointer. They pass the two loaded <8 x i8> values straight to
// the intrinsic, with no bitcast steps listed, and return the result as
// %struct.uint8x8x2_t.
1188*207e5cccSFangrui Song uint8x8x2_t test_vld2_lane_u8(uint8_t  *a, uint8x8x2_t b) {
1189*207e5cccSFangrui Song   return vld2_lane_u8(a, b, 7);
1190*207e5cccSFangrui Song }
1191*207e5cccSFangrui Song 
1192*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.uint16x4x2_t @test_vld2_lane_u16(ptr noundef %a, [2 x <4 x i16>] alignstack(8) %b.coerce) #0 {
1193*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8
1194*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
1195*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
1196*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
1197*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[B]], i32 0, i32 0
1198*207e5cccSFangrui Song // CHECK:   store [2 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
1199*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
1200*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[__S1]], i32 0, i32 0
1201*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
1202*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
1203*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
1204*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[__S1]], i32 0, i32 0
1205*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
1206*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
1207*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
1208*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
1209*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
1210*207e5cccSFangrui Song // CHECK:   [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0(<4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i64 3, ptr %a)
1211*207e5cccSFangrui Song // CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], ptr [[__RET]]
1212*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
1213*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = load %struct.uint16x4x2_t, ptr [[RETVAL]], align 8
1214*207e5cccSFangrui Song // CHECK:   ret %struct.uint16x4x2_t [[TMP13]]
// Exercises vld2_lane_u16 with lane index 3. The expectations above require a
// call to @llvm.aarch64.neon.ld2lane.v4i16.p0 taking two <4 x i16> vector
// operands, `i64 3` as the lane operand and `%a` as the pointer, with the
// result returned as %struct.uint16x4x2_t.
1215*207e5cccSFangrui Song uint16x4x2_t test_vld2_lane_u16(uint16_t  *a, uint16x4x2_t b) {
1216*207e5cccSFangrui Song   return vld2_lane_u16(a, b, 3);
1217*207e5cccSFangrui Song }
1218*207e5cccSFangrui Song 
1219*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.uint32x2x2_t @test_vld2_lane_u32(ptr noundef %a, [2 x <2 x i32>] alignstack(8) %b.coerce) #0 {
1220*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8
1221*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
1222*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
1223*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
1224*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[B]], i32 0, i32 0
1225*207e5cccSFangrui Song // CHECK:   store [2 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
1226*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
1227*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[__S1]], i32 0, i32 0
1228*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
1229*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
1230*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
1231*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[__S1]], i32 0, i32 0
1232*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
1233*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
1234*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
1235*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
1236*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
1237*207e5cccSFangrui Song // CHECK:   [[VLD2_LANE:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2lane.v2i32.p0(<2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i64 1, ptr %a)
1238*207e5cccSFangrui Song // CHECK:   store { <2 x i32>, <2 x i32> } [[VLD2_LANE]], ptr [[__RET]]
1239*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
1240*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = load %struct.uint32x2x2_t, ptr [[RETVAL]], align 8
1241*207e5cccSFangrui Song // CHECK:   ret %struct.uint32x2x2_t [[TMP13]]
// Exercises vld2_lane_u32 with lane index 1. The expectations above require a
// call to @llvm.aarch64.neon.ld2lane.v2i32.p0 taking two <2 x i32> vector
// operands, `i64 1` as the lane operand and `%a` as the pointer, with the
// result returned as %struct.uint32x2x2_t.
1242*207e5cccSFangrui Song uint32x2x2_t test_vld2_lane_u32(uint32_t  *a, uint32x2x2_t b) {
1243*207e5cccSFangrui Song   return vld2_lane_u32(a, b, 1);
1244*207e5cccSFangrui Song }
1245*207e5cccSFangrui Song 
1246*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.uint64x1x2_t @test_vld2_lane_u64(ptr noundef %a, [2 x <1 x i64>] alignstack(8) %b.coerce) #0 {
1247*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x1x2_t, align 8
1248*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.uint64x1x2_t, align 8
1249*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8
1250*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.uint64x1x2_t, align 8
1251*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x1x2_t, ptr [[B]], i32 0, i32 0
1252*207e5cccSFangrui Song // CHECK:   store [2 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
1253*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
1254*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x1x2_t, ptr [[__S1]], i32 0, i32 0
1255*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
1256*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
1257*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
1258*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x1x2_t, ptr [[__S1]], i32 0, i32 0
1259*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
1260*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
1261*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
1262*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
1263*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
1264*207e5cccSFangrui Song // CHECK:   [[VLD2_LANE:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2lane.v1i64.p0(<1 x i64> [[TMP8]], <1 x i64> [[TMP9]], i64 0, ptr %a)
1265*207e5cccSFangrui Song // CHECK:   store { <1 x i64>, <1 x i64> } [[VLD2_LANE]], ptr [[__RET]]
1266*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
1267*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = load %struct.uint64x1x2_t, ptr [[RETVAL]], align 8
1268*207e5cccSFangrui Song // CHECK:   ret %struct.uint64x1x2_t [[TMP13]]
// Exercises vld2_lane_u64 with lane index 0. The expectations above require a
// call to @llvm.aarch64.neon.ld2lane.v1i64.p0 taking two <1 x i64> vector
// operands, `i64 0` as the lane operand and `%a` as the pointer, with the
// result returned as %struct.uint64x1x2_t.
1269*207e5cccSFangrui Song uint64x1x2_t test_vld2_lane_u64(uint64_t  *a, uint64x1x2_t b) {
1270*207e5cccSFangrui Song   return vld2_lane_u64(a, b, 0);
1271*207e5cccSFangrui Song }
1272*207e5cccSFangrui Song 
1273*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.int8x8x2_t @test_vld2_lane_s8(ptr noundef %a, [2 x <8 x i8>] alignstack(8) %b.coerce) #0 {
1274*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
1275*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
1276*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
1277*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
1278*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[B]], i32 0, i32 0
1279*207e5cccSFangrui Song // CHECK:   store [2 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
1280*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
1281*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[__S1]], i32 0, i32 0
1282*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
1283*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
1284*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[__S1]], i32 0, i32 0
1285*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
1286*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
1287*207e5cccSFangrui Song // CHECK:   [[VLD2_LANE:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, ptr %a)
1288*207e5cccSFangrui Song // CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2_LANE]], ptr [[__RET]]
1289*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
1290*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = load %struct.int8x8x2_t, ptr [[RETVAL]], align 8
1291*207e5cccSFangrui Song // CHECK:   ret %struct.int8x8x2_t [[TMP8]]
// Exercises vld2_lane_s8 with lane index 7. The expectations above require a
// call to @llvm.aarch64.neon.ld2lane.v8i8.p0 with `i64 7` as the lane operand
// and `%a` as the pointer. They pass the two loaded <8 x i8> values straight to
// the intrinsic, with no bitcast steps listed, and return the result as
// %struct.int8x8x2_t.
1292*207e5cccSFangrui Song int8x8x2_t test_vld2_lane_s8(int8_t  *a, int8x8x2_t b) {
1293*207e5cccSFangrui Song   return vld2_lane_s8(a, b, 7);
1294*207e5cccSFangrui Song }
1295*207e5cccSFangrui Song 
1296*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.int16x4x2_t @test_vld2_lane_s16(ptr noundef %a, [2 x <4 x i16>] alignstack(8) %b.coerce) #0 {
1297*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8
1298*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
1299*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
1300*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
1301*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[B]], i32 0, i32 0
1302*207e5cccSFangrui Song // CHECK:   store [2 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
1303*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
1304*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[__S1]], i32 0, i32 0
1305*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
1306*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
1307*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
1308*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[__S1]], i32 0, i32 0
1309*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
1310*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
1311*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
1312*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
1313*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
1314*207e5cccSFangrui Song // CHECK:   [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0(<4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i64 3, ptr %a)
1315*207e5cccSFangrui Song // CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], ptr [[__RET]]
1316*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
1317*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = load %struct.int16x4x2_t, ptr [[RETVAL]], align 8
1318*207e5cccSFangrui Song // CHECK:   ret %struct.int16x4x2_t [[TMP13]]
// Exercises vld2_lane_s16 with lane index 3. The expectations above require a
// call to @llvm.aarch64.neon.ld2lane.v4i16.p0 taking two <4 x i16> vector
// operands, `i64 3` as the lane operand and `%a` as the pointer, with the
// result returned as %struct.int16x4x2_t.
1319*207e5cccSFangrui Song int16x4x2_t test_vld2_lane_s16(int16_t  *a, int16x4x2_t b) {
1320*207e5cccSFangrui Song   return vld2_lane_s16(a, b, 3);
1321*207e5cccSFangrui Song }
1322*207e5cccSFangrui Song 
1323*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.int32x2x2_t @test_vld2_lane_s32(ptr noundef %a, [2 x <2 x i32>] alignstack(8) %b.coerce) #0 {
1324*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8
1325*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
1326*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8
1327*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
1328*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[B]], i32 0, i32 0
1329*207e5cccSFangrui Song // CHECK:   store [2 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
1330*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
1331*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[__S1]], i32 0, i32 0
1332*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
1333*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
1334*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
1335*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[__S1]], i32 0, i32 0
1336*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
1337*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
1338*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
1339*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
1340*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
1341*207e5cccSFangrui Song // CHECK:   [[VLD2_LANE:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2lane.v2i32.p0(<2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i64 1, ptr %a)
1342*207e5cccSFangrui Song // CHECK:   store { <2 x i32>, <2 x i32> } [[VLD2_LANE]], ptr [[__RET]]
1343*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
1344*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = load %struct.int32x2x2_t, ptr [[RETVAL]], align 8
1345*207e5cccSFangrui Song // CHECK:   ret %struct.int32x2x2_t [[TMP13]]
// Exercises vld2_lane_s32 with lane index 1. The expectations above require a
// call to @llvm.aarch64.neon.ld2lane.v2i32.p0 taking two <2 x i32> vector
// operands, `i64 1` as the lane operand and `%a` as the pointer, with the
// result returned as %struct.int32x2x2_t.
1346*207e5cccSFangrui Song int32x2x2_t test_vld2_lane_s32(int32_t  *a, int32x2x2_t b) {
1347*207e5cccSFangrui Song   return vld2_lane_s32(a, b, 1);
1348*207e5cccSFangrui Song }
1349*207e5cccSFangrui Song 
1350*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.int64x1x2_t @test_vld2_lane_s64(ptr noundef %a, [2 x <1 x i64>] alignstack(8) %b.coerce) #0 {
1351*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x1x2_t, align 8
1352*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.int64x1x2_t, align 8
1353*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8
1354*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.int64x1x2_t, align 8
1355*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x1x2_t, ptr [[B]], i32 0, i32 0
1356*207e5cccSFangrui Song // CHECK:   store [2 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
1357*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
1358*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x1x2_t, ptr [[__S1]], i32 0, i32 0
1359*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
1360*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
1361*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
1362*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x1x2_t, ptr [[__S1]], i32 0, i32 0
1363*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
1364*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
1365*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
1366*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
1367*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
1368*207e5cccSFangrui Song // CHECK:   [[VLD2_LANE:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2lane.v1i64.p0(<1 x i64> [[TMP8]], <1 x i64> [[TMP9]], i64 0, ptr %a)
1369*207e5cccSFangrui Song // CHECK:   store { <1 x i64>, <1 x i64> } [[VLD2_LANE]], ptr [[__RET]]
1370*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
1371*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = load %struct.int64x1x2_t, ptr [[RETVAL]], align 8
1372*207e5cccSFangrui Song // CHECK:   ret %struct.int64x1x2_t [[TMP13]]
// Exercises vld2_lane_s64 with lane index 0. The expectations above require a
// call to @llvm.aarch64.neon.ld2lane.v1i64.p0 taking two <1 x i64> vector
// operands, `i64 0` as the lane operand and `%a` as the pointer, with the
// result returned as %struct.int64x1x2_t.
1373*207e5cccSFangrui Song int64x1x2_t test_vld2_lane_s64(int64_t  *a, int64x1x2_t b) {
1374*207e5cccSFangrui Song   return vld2_lane_s64(a, b, 0);
1375*207e5cccSFangrui Song }
1376*207e5cccSFangrui Song 
1377*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.float16x4x2_t @test_vld2_lane_f16(ptr noundef %a, [2 x <4 x half>] alignstack(8) %b.coerce) #0 {
1378*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x4x2_t, align 8
1379*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
1380*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
1381*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
1382*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[B]], i32 0, i32 0
1383*207e5cccSFangrui Song // CHECK:   store [2 x <4 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
1384*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
1385*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[__S1]], i32 0, i32 0
1386*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL]], i64 0, i64 0
1387*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
1388*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
1389*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[__S1]], i32 0, i32 0
1390*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL1]], i64 0, i64 1
1391*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
1392*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
1393*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
1394*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
1395*207e5cccSFangrui Song // CHECK:   [[VLD2_LANE:%.*]] = call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2lane.v4f16.p0(<4 x half> [[TMP8]], <4 x half> [[TMP9]], i64 3, ptr %a)
1396*207e5cccSFangrui Song // CHECK:   store { <4 x half>, <4 x half> } [[VLD2_LANE]], ptr [[__RET]]
1397*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
1398*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = load %struct.float16x4x2_t, ptr [[RETVAL]], align 8
1399*207e5cccSFangrui Song // CHECK:   ret %struct.float16x4x2_t [[TMP13]]
// Exercises vld2_lane_f16 with lane index 3. The expectations above require a
// call to @llvm.aarch64.neon.ld2lane.v4f16.p0 taking two <4 x half> vector
// operands, `i64 3` as the lane operand and `%a` as the pointer, with the
// result returned as %struct.float16x4x2_t.
1400*207e5cccSFangrui Song float16x4x2_t test_vld2_lane_f16(float16_t  *a, float16x4x2_t b) {
1401*207e5cccSFangrui Song   return vld2_lane_f16(a, b, 3);
1402*207e5cccSFangrui Song }
1403*207e5cccSFangrui Song 
1404*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.float32x2x2_t @test_vld2_lane_f32(ptr noundef %a, [2 x <2 x float>] alignstack(8) %b.coerce) #0 {
1405*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8
1406*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
1407*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
1408*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
1409*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[B]], i32 0, i32 0
1410*207e5cccSFangrui Song // CHECK:   store [2 x <2 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
1411*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
1412*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[__S1]], i32 0, i32 0
1413*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL]], i64 0, i64 0
1414*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
1415*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
1416*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[__S1]], i32 0, i32 0
1417*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL1]], i64 0, i64 1
1418*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
1419*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
1420*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
1421*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
1422*207e5cccSFangrui Song // CHECK:   [[VLD2_LANE:%.*]] = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2lane.v2f32.p0(<2 x float> [[TMP8]], <2 x float> [[TMP9]], i64 1, ptr %a)
1423*207e5cccSFangrui Song // CHECK:   store { <2 x float>, <2 x float> } [[VLD2_LANE]], ptr [[__RET]]
1424*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
1425*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = load %struct.float32x2x2_t, ptr [[RETVAL]], align 8
1426*207e5cccSFangrui Song // CHECK:   ret %struct.float32x2x2_t [[TMP13]]
// Codegen test for vld2_lane_f32: passes the pointer and the two-vector struct
// through unchanged and selects lane 1, the highest lane of a <2 x float>.
// The FileCheck expectations above pin the resulting IR, including the
// @llvm.aarch64.neon.ld2lane.v2f32.p0 call, so the body must stay as written.
1427*207e5cccSFangrui Song float32x2x2_t test_vld2_lane_f32(float32_t  *a, float32x2x2_t b) {
1428*207e5cccSFangrui Song   return vld2_lane_f32(a, b, 1);
1429*207e5cccSFangrui Song }
1430*207e5cccSFangrui Song 
1431*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.float64x1x2_t @test_vld2_lane_f64(ptr noundef %a, [2 x <1 x double>] alignstack(8) %b.coerce) #0 {
1432*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x1x2_t, align 8
1433*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.float64x1x2_t, align 8
1434*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.float64x1x2_t, align 8
1435*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.float64x1x2_t, align 8
1436*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x1x2_t, ptr [[B]], i32 0, i32 0
1437*207e5cccSFangrui Song // CHECK:   store [2 x <1 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
1438*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
1439*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x1x2_t, ptr [[__S1]], i32 0, i32 0
1440*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x double>], ptr [[VAL]], i64 0, i64 0
1441*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8
1442*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <1 x double> [[TMP4]] to <8 x i8>
1443*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x1x2_t, ptr [[__S1]], i32 0, i32 0
1444*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x double>], ptr [[VAL1]], i64 0, i64 1
1445*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
1446*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <1 x double> [[TMP6]] to <8 x i8>
1447*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
1448*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x double>
1449*207e5cccSFangrui Song // CHECK:   [[VLD2_LANE:%.*]] = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2lane.v1f64.p0(<1 x double> [[TMP8]], <1 x double> [[TMP9]], i64 0, ptr %a)
1450*207e5cccSFangrui Song // CHECK:   store { <1 x double>, <1 x double> } [[VLD2_LANE]], ptr [[__RET]]
1451*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
1452*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = load %struct.float64x1x2_t, ptr [[RETVAL]], align 8
1453*207e5cccSFangrui Song // CHECK:   ret %struct.float64x1x2_t [[TMP13]]
// Codegen test for vld2_lane_f64: selects lane 0, the only lane of a
// <1 x double>. The FileCheck expectations above pin the resulting IR,
// including the @llvm.aarch64.neon.ld2lane.v1f64.p0 call, so the body must
// stay as written.
1454*207e5cccSFangrui Song float64x1x2_t test_vld2_lane_f64(float64_t  *a, float64x1x2_t b) {
1455*207e5cccSFangrui Song   return vld2_lane_f64(a, b, 0);
1456*207e5cccSFangrui Song }
1457*207e5cccSFangrui Song 
1458*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.poly8x8x2_t @test_vld2_lane_p8(ptr noundef %a, [2 x <8 x i8>] alignstack(8) %b.coerce) #0 {
1459*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8
1460*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
1461*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8
1462*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
1463*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[B]], i32 0, i32 0
1464*207e5cccSFangrui Song // CHECK:   store [2 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
1465*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
1466*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[__S1]], i32 0, i32 0
1467*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
1468*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
1469*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[__S1]], i32 0, i32 0
1470*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
1471*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
1472*207e5cccSFangrui Song // CHECK:   [[VLD2_LANE:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, ptr %a)
1473*207e5cccSFangrui Song // CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2_LANE]], ptr [[__RET]]
1474*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
1475*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = load %struct.poly8x8x2_t, ptr [[RETVAL]], align 8
1476*207e5cccSFangrui Song // CHECK:   ret %struct.poly8x8x2_t [[TMP8]]
// Codegen test for vld2_lane_p8: selects lane 7, the highest lane of an
// <8 x i8>. Per the FileCheck expectations above, the loaded <8 x i8> values
// feed @llvm.aarch64.neon.ld2lane.v8i8.p0 directly, with no intermediate
// bitcasts expected; the body must stay as written.
1477*207e5cccSFangrui Song poly8x8x2_t test_vld2_lane_p8(poly8_t  *a, poly8x8x2_t b) {
1478*207e5cccSFangrui Song   return vld2_lane_p8(a, b, 7);
1479*207e5cccSFangrui Song }
1480*207e5cccSFangrui Song 
1481*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.poly16x4x2_t @test_vld2_lane_p16(ptr noundef %a, [2 x <4 x i16>] alignstack(8) %b.coerce) #0 {
1482*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8
1483*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
1484*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
1485*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
1486*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[B]], i32 0, i32 0
1487*207e5cccSFangrui Song // CHECK:   store [2 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
1488*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
1489*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[__S1]], i32 0, i32 0
1490*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
1491*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
1492*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
1493*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[__S1]], i32 0, i32 0
1494*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
1495*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
1496*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
1497*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
1498*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
1499*207e5cccSFangrui Song // CHECK:   [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0(<4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i64 3, ptr %a)
1500*207e5cccSFangrui Song // CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], ptr [[__RET]]
1501*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
1502*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = load %struct.poly16x4x2_t, ptr [[RETVAL]], align 8
1503*207e5cccSFangrui Song // CHECK:   ret %struct.poly16x4x2_t [[TMP13]]
// Codegen test for vld2_lane_p16: selects lane 3, the highest lane of a
// <4 x i16>. The FileCheck expectations above pin the resulting IR, including
// the @llvm.aarch64.neon.ld2lane.v4i16.p0 call, so the body must stay as
// written.
1504*207e5cccSFangrui Song poly16x4x2_t test_vld2_lane_p16(poly16_t  *a, poly16x4x2_t b) {
1505*207e5cccSFangrui Song   return vld2_lane_p16(a, b, 3);
1506*207e5cccSFangrui Song }
1507*207e5cccSFangrui Song 
1508*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.poly64x1x2_t @test_vld2_lane_p64(ptr noundef %a, [2 x <1 x i64>] alignstack(8) %b.coerce) #0 {
1509*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x1x2_t, align 8
1510*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.poly64x1x2_t, align 8
1511*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.poly64x1x2_t, align 8
1512*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.poly64x1x2_t, align 8
1513*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x1x2_t, ptr [[B]], i32 0, i32 0
1514*207e5cccSFangrui Song // CHECK:   store [2 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
1515*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
1516*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x1x2_t, ptr [[__S1]], i32 0, i32 0
1517*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
1518*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
1519*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
1520*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x1x2_t, ptr [[__S1]], i32 0, i32 0
1521*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
1522*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
1523*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
1524*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
1525*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
1526*207e5cccSFangrui Song // CHECK:   [[VLD2_LANE:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2lane.v1i64.p0(<1 x i64> [[TMP8]], <1 x i64> [[TMP9]], i64 0, ptr %a)
1527*207e5cccSFangrui Song // CHECK:   store { <1 x i64>, <1 x i64> } [[VLD2_LANE]], ptr [[__RET]]
1528*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
1529*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = load %struct.poly64x1x2_t, ptr [[RETVAL]], align 8
1530*207e5cccSFangrui Song // CHECK:   ret %struct.poly64x1x2_t [[TMP13]]
// Codegen test for vld2_lane_p64: selects lane 0, the only lane of a
// <1 x i64>. The FileCheck expectations above pin the resulting IR, including
// the @llvm.aarch64.neon.ld2lane.v1i64.p0 call, so the body must stay as
// written.
1531*207e5cccSFangrui Song poly64x1x2_t test_vld2_lane_p64(poly64_t  *a, poly64x1x2_t b) {
1532*207e5cccSFangrui Song   return vld2_lane_p64(a, b, 0);
1533*207e5cccSFangrui Song }
1534*207e5cccSFangrui Song 
1535*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.uint16x8x3_t @test_vld3q_lane_u16(ptr noundef %a, [3 x <8 x i16>] alignstack(16) %b.coerce) #0 {
1536*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x8x3_t, align 16
1537*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
1538*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16
1539*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
1540*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[B]], i32 0, i32 0
1541*207e5cccSFangrui Song // CHECK:   store [3 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
1542*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
1543*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
1544*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
1545*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
1546*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
1547*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
1548*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
1549*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
1550*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
1551*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
1552*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
1553*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
1554*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
1555*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
1556*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
1557*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
1558*207e5cccSFangrui Song // CHECK:   [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i64 7, ptr %a)
1559*207e5cccSFangrui Song // CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], ptr [[__RET]]
1560*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
1561*207e5cccSFangrui Song // CHECK:   [[TMP16:%.*]] = load %struct.uint16x8x3_t, ptr [[RETVAL]], align 16
1562*207e5cccSFangrui Song // CHECK:   ret %struct.uint16x8x3_t [[TMP16]]
// Codegen test for vld3q_lane_u16: passes the three-vector struct through
// unchanged and selects lane 7, the highest lane of an <8 x i16>. The
// FileCheck expectations above pin the resulting IR, including the
// @llvm.aarch64.neon.ld3lane.v8i16.p0 call, so the body must stay as written.
1563*207e5cccSFangrui Song uint16x8x3_t test_vld3q_lane_u16(uint16_t  *a, uint16x8x3_t b) {
1564*207e5cccSFangrui Song   return vld3q_lane_u16(a, b, 7);
1565*207e5cccSFangrui Song }
1566*207e5cccSFangrui Song 
1567*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.uint32x4x3_t @test_vld3q_lane_u32(ptr noundef %a, [3 x <4 x i32>] alignstack(16) %b.coerce) #0 {
1568*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x4x3_t, align 16
1569*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
1570*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16
1571*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
1572*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[B]], i32 0, i32 0
1573*207e5cccSFangrui Song // CHECK:   store [3 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
1574*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
1575*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
1576*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
1577*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
1578*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
1579*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
1580*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
1581*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
1582*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
1583*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
1584*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2
1585*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
1586*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
1587*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
1588*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
1589*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
1590*207e5cccSFangrui Song // CHECK:   [[VLD3_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i64 3, ptr %a)
1591*207e5cccSFangrui Song // CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3_LANE]], ptr [[__RET]]
1592*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
1593*207e5cccSFangrui Song // CHECK:   [[TMP16:%.*]] = load %struct.uint32x4x3_t, ptr [[RETVAL]], align 16
1594*207e5cccSFangrui Song // CHECK:   ret %struct.uint32x4x3_t [[TMP16]]
// Codegen test for vld3q_lane_u32: selects lane 3, the highest lane of a
// <4 x i32>. The FileCheck expectations above pin the resulting IR, including
// the @llvm.aarch64.neon.ld3lane.v4i32.p0 call, so the body must stay as
// written.
1595*207e5cccSFangrui Song uint32x4x3_t test_vld3q_lane_u32(uint32_t  *a, uint32x4x3_t b) {
1596*207e5cccSFangrui Song   return vld3q_lane_u32(a, b, 3);
1597*207e5cccSFangrui Song }
1598*207e5cccSFangrui Song 
1599*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.uint64x2x3_t @test_vld3q_lane_u64(ptr noundef %a, [3 x <2 x i64>] alignstack(16) %b.coerce) #0 {
1600*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x2x3_t, align 16
1601*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.uint64x2x3_t, align 16
1602*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.uint64x2x3_t, align 16
1603*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.uint64x2x3_t, align 16
1604*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[B]], i32 0, i32 0
1605*207e5cccSFangrui Song // CHECK:   store [3 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
1606*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
1607*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[__S1]], i32 0, i32 0
1608*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
1609*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
1610*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
1611*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[__S1]], i32 0, i32 0
1612*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
1613*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
1614*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
1615*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[__S1]], i32 0, i32 0
1616*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
1617*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
1618*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
1619*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
1620*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
1621*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
1622*207e5cccSFangrui Song // CHECK:   [[VLD3_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3lane.v2i64.p0(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]], <2 x i64> [[TMP12]], i64 1, ptr %a)
1623*207e5cccSFangrui Song // CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], ptr [[__RET]]
1624*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
1625*207e5cccSFangrui Song // CHECK:   [[TMP16:%.*]] = load %struct.uint64x2x3_t, ptr [[RETVAL]], align 16
1626*207e5cccSFangrui Song // CHECK:   ret %struct.uint64x2x3_t [[TMP16]]
// Codegen test for vld3q_lane_u64: selects lane 1, the highest lane of a
// <2 x i64>. The FileCheck expectations above pin the resulting IR, including
// the @llvm.aarch64.neon.ld3lane.v2i64.p0 call, so the body must stay as
// written.
1627*207e5cccSFangrui Song uint64x2x3_t test_vld3q_lane_u64(uint64_t  *a, uint64x2x3_t b) {
1628*207e5cccSFangrui Song   return vld3q_lane_u64(a, b, 1);
1629*207e5cccSFangrui Song }
1630*207e5cccSFangrui Song 
1631*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.int16x8x3_t @test_vld3q_lane_s16(ptr noundef %a, [3 x <8 x i16>] alignstack(16) %b.coerce) #0 {
1632*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x8x3_t, align 16
1633*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
1634*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16
1635*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
1636*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[B]], i32 0, i32 0
1637*207e5cccSFangrui Song // CHECK:   store [3 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
1638*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
1639*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
1640*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
1641*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
1642*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
1643*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
1644*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
1645*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
1646*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
1647*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
1648*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
1649*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
1650*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
1651*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
1652*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
1653*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
1654*207e5cccSFangrui Song // CHECK:   [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i64 7, ptr %a)
1655*207e5cccSFangrui Song // CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], ptr [[__RET]]
1656*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
1657*207e5cccSFangrui Song // CHECK:   [[TMP16:%.*]] = load %struct.int16x8x3_t, ptr [[RETVAL]], align 16
1658*207e5cccSFangrui Song // CHECK:   ret %struct.int16x8x3_t [[TMP16]]
// Codegen test for vld3q_lane_s16: selects lane 7, the highest lane of an
// <8 x i16>. The FileCheck expectations above pin the resulting IR, including
// the @llvm.aarch64.neon.ld3lane.v8i16.p0 call, so the body must stay as
// written.
1659*207e5cccSFangrui Song int16x8x3_t test_vld3q_lane_s16(int16_t  *a, int16x8x3_t b) {
1660*207e5cccSFangrui Song   return vld3q_lane_s16(a, b, 7);
1661*207e5cccSFangrui Song }
1662*207e5cccSFangrui Song 
1663*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.int32x4x3_t @test_vld3q_lane_s32(ptr noundef %a, [3 x <4 x i32>] alignstack(16) %b.coerce) #0 {
1664*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x4x3_t, align 16
1665*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
1666*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16
1667*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
1668*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[B]], i32 0, i32 0
1669*207e5cccSFangrui Song // CHECK:   store [3 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
1670*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
1671*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
1672*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
1673*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
1674*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
1675*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
1676*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
1677*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
1678*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
1679*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
1680*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2
1681*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
1682*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
1683*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
1684*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
1685*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
1686*207e5cccSFangrui Song // CHECK:   [[VLD3_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i64 3, ptr %a)
1687*207e5cccSFangrui Song // CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3_LANE]], ptr [[__RET]]
1688*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
1689*207e5cccSFangrui Song // CHECK:   [[TMP16:%.*]] = load %struct.int32x4x3_t, ptr [[RETVAL]], align 16
1690*207e5cccSFangrui Song // CHECK:   ret %struct.int32x4x3_t [[TMP16]]
// Codegen test for vld3q_lane_s32: selects lane 3, the highest lane of a
// <4 x i32>. The FileCheck expectations above pin the resulting IR, including
// the @llvm.aarch64.neon.ld3lane.v4i32.p0 call, so the body must stay as
// written.
1691*207e5cccSFangrui Song int32x4x3_t test_vld3q_lane_s32(int32_t  *a, int32x4x3_t b) {
1692*207e5cccSFangrui Song   return vld3q_lane_s32(a, b, 3);
1693*207e5cccSFangrui Song }
1694*207e5cccSFangrui Song 
1695*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.int64x2x3_t @test_vld3q_lane_s64(ptr noundef %a, [3 x <2 x i64>] alignstack(16) %b.coerce) #0 {
1696*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x2x3_t, align 16
1697*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.int64x2x3_t, align 16
1698*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.int64x2x3_t, align 16
1699*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.int64x2x3_t, align 16
1700*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[B]], i32 0, i32 0
1701*207e5cccSFangrui Song // CHECK:   store [3 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
1702*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
1703*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[__S1]], i32 0, i32 0
1704*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
1705*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
1706*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
1707*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[__S1]], i32 0, i32 0
1708*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
1709*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
1710*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
1711*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[__S1]], i32 0, i32 0
1712*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
1713*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
1714*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
1715*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
1716*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
1717*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
1718*207e5cccSFangrui Song // CHECK:   [[VLD3_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3lane.v2i64.p0(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]], <2 x i64> [[TMP12]], i64 1, ptr %a)
1719*207e5cccSFangrui Song // CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], ptr [[__RET]]
1720*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
1721*207e5cccSFangrui Song // CHECK:   [[TMP16:%.*]] = load %struct.int64x2x3_t, ptr [[RETVAL]], align 16
1722*207e5cccSFangrui Song // CHECK:   ret %struct.int64x2x3_t [[TMP16]]
// Codegen test for vld3q_lane_s64: selects lane 1, the highest lane of a
// <2 x i64>. The FileCheck expectations above pin the resulting IR, including
// the @llvm.aarch64.neon.ld3lane.v2i64.p0 call, so the body must stay as
// written.
1723*207e5cccSFangrui Song int64x2x3_t test_vld3q_lane_s64(int64_t  *a, int64x2x3_t b) {
1724*207e5cccSFangrui Song   return vld3q_lane_s64(a, b, 1);
1725*207e5cccSFangrui Song }
1726*207e5cccSFangrui Song 
1727*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.float16x8x3_t @test_vld3q_lane_f16(ptr noundef %a, [3 x <8 x half>] alignstack(16) %b.coerce) #0 {
1728*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x8x3_t, align 16
1729*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
1730*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16
1731*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
1732*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[B]], i32 0, i32 0
1733*207e5cccSFangrui Song // CHECK:   store [3 x <8 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
1734*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
1735*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
1736*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL]], i64 0, i64 0
1737*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
1738*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
1739*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
1740*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL1]], i64 0, i64 1
1741*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
1742*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
1743*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
1744*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL3]], i64 0, i64 2
1745*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
1746*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
1747*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
1748*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
1749*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x half>
1750*207e5cccSFangrui Song // CHECK:   [[VLD3_LANE:%.*]] = call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3lane.v8f16.p0(<8 x half> [[TMP10]], <8 x half> [[TMP11]], <8 x half> [[TMP12]], i64 7, ptr %a)
1751*207e5cccSFangrui Song // CHECK:   store { <8 x half>, <8 x half>, <8 x half> } [[VLD3_LANE]], ptr [[__RET]]
1752*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
1753*207e5cccSFangrui Song // CHECK:   [[TMP16:%.*]] = load %struct.float16x8x3_t, ptr [[RETVAL]], align 16
1754*207e5cccSFangrui Song // CHECK:   ret %struct.float16x8x3_t [[TMP16]]
// Calls vld3q_lane_f16 with the constant lane index 7 (the last element of an
// <8 x half> vector). The FileCheck directives above expect the three vectors
// of `b` to be round-tripped through <16 x i8> bitcasts and then passed to
// @llvm.aarch64.neon.ld3lane.v8f16.p0 with `i64 7` and `ptr %a`.
1755*207e5cccSFangrui Song float16x8x3_t test_vld3q_lane_f16(float16_t  *a, float16x8x3_t b) {
1756*207e5cccSFangrui Song   return vld3q_lane_f16(a, b, 7);
1757*207e5cccSFangrui Song }
1758*207e5cccSFangrui Song 
1759*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.float32x4x3_t @test_vld3q_lane_f32(ptr noundef %a, [3 x <4 x float>] alignstack(16) %b.coerce) #0 {
1760*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x4x3_t, align 16
1761*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
1762*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16
1763*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
1764*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[B]], i32 0, i32 0
1765*207e5cccSFangrui Song // CHECK:   store [3 x <4 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
1766*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
1767*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
1768*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL]], i64 0, i64 0
1769*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
1770*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
1771*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
1772*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL1]], i64 0, i64 1
1773*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
1774*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
1775*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
1776*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL3]], i64 0, i64 2
1777*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
1778*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
1779*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
1780*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
1781*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float>
1782*207e5cccSFangrui Song // CHECK:   [[VLD3_LANE:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3lane.v4f32.p0(<4 x float> [[TMP10]], <4 x float> [[TMP11]], <4 x float> [[TMP12]], i64 3, ptr %a)
1783*207e5cccSFangrui Song // CHECK:   store { <4 x float>, <4 x float>, <4 x float> } [[VLD3_LANE]], ptr [[__RET]]
1784*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
1785*207e5cccSFangrui Song // CHECK:   [[TMP16:%.*]] = load %struct.float32x4x3_t, ptr [[RETVAL]], align 16
1786*207e5cccSFangrui Song // CHECK:   ret %struct.float32x4x3_t [[TMP16]]
// Calls vld3q_lane_f32 with the constant lane index 3 (the last element of a
// <4 x float> vector). The FileCheck directives above expect a call to
// @llvm.aarch64.neon.ld3lane.v4f32.p0 with `i64 3` as the lane operand and
// `ptr %a` as the address.
1787*207e5cccSFangrui Song float32x4x3_t test_vld3q_lane_f32(float32_t  *a, float32x4x3_t b) {
1788*207e5cccSFangrui Song   return vld3q_lane_f32(a, b, 3);
1789*207e5cccSFangrui Song }
1790*207e5cccSFangrui Song 
1791*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.float64x2x3_t @test_vld3q_lane_f64(ptr noundef %a, [3 x <2 x double>] alignstack(16) %b.coerce) #0 {
1792*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x2x3_t, align 16
1793*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.float64x2x3_t, align 16
1794*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.float64x2x3_t, align 16
1795*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.float64x2x3_t, align 16
1796*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[B]], i32 0, i32 0
1797*207e5cccSFangrui Song // CHECK:   store [3 x <2 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
1798*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
1799*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[__S1]], i32 0, i32 0
1800*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL]], i64 0, i64 0
1801*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16
1802*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <2 x double> [[TMP4]] to <16 x i8>
1803*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[__S1]], i32 0, i32 0
1804*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL1]], i64 0, i64 1
1805*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
1806*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <2 x double> [[TMP6]] to <16 x i8>
1807*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[__S1]], i32 0, i32 0
1808*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL3]], i64 0, i64 2
1809*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = load <2 x double>, ptr [[ARRAYIDX4]], align 16
1810*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <2 x double> [[TMP8]] to <16 x i8>
1811*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
1812*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x double>
1813*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x double>
1814*207e5cccSFangrui Song // CHECK:   [[VLD3_LANE:%.*]] = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3lane.v2f64.p0(<2 x double> [[TMP10]], <2 x double> [[TMP11]], <2 x double> [[TMP12]], i64 1, ptr %a)
1815*207e5cccSFangrui Song // CHECK:   store { <2 x double>, <2 x double>, <2 x double> } [[VLD3_LANE]], ptr [[__RET]]
1816*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
1817*207e5cccSFangrui Song // CHECK:   [[TMP16:%.*]] = load %struct.float64x2x3_t, ptr [[RETVAL]], align 16
1818*207e5cccSFangrui Song // CHECK:   ret %struct.float64x2x3_t [[TMP16]]
// Calls vld3q_lane_f64 with the constant lane index 1 (the last element of a
// <2 x double> vector). The FileCheck directives above expect a call to
// @llvm.aarch64.neon.ld3lane.v2f64.p0 with `i64 1` as the lane operand and
// `ptr %a` as the address.
1819*207e5cccSFangrui Song float64x2x3_t test_vld3q_lane_f64(float64_t  *a, float64x2x3_t b) {
1820*207e5cccSFangrui Song   return vld3q_lane_f64(a, b, 1);
1821*207e5cccSFangrui Song }
1822*207e5cccSFangrui Song 
1823*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.poly8x16x3_t @test_vld3q_lane_p8(ptr noundef %a, [3 x <16 x i8>] alignstack(16) %b.coerce) #0 {
1824*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x16x3_t, align 16
1825*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16
1826*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x3_t, align 16
1827*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x3_t, align 16
1828*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[B]], i32 0, i32 0
1829*207e5cccSFangrui Song // CHECK:   store [3 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
1830*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
1831*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[__S1]], i32 0, i32 0
1832*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
1833*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
1834*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[__S1]], i32 0, i32 0
1835*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
1836*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
1837*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[__S1]], i32 0, i32 0
1838*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
1839*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
1840*207e5cccSFangrui Song // CHECK:   [[VLD3_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3lane.v16i8.p0(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, ptr %a)
1841*207e5cccSFangrui Song // CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], ptr [[__RET]]
1842*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
1843*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = load %struct.poly8x16x3_t, ptr [[RETVAL]], align 16
1844*207e5cccSFangrui Song // CHECK:   ret %struct.poly8x16x3_t [[TMP9]]
// Calls vld3q_lane_p8 with the constant lane index 15 (the last element of a
// <16 x i8> vector). The FileCheck directives above expect the three
// <16 x i8> vectors loaded from the copy of `b` to be passed, without any
// bitcasts, to @llvm.aarch64.neon.ld3lane.v16i8.p0 with `i64 15` and `ptr %a`.
1845*207e5cccSFangrui Song poly8x16x3_t test_vld3q_lane_p8(poly8_t  *a, poly8x16x3_t b) {
1846*207e5cccSFangrui Song   return vld3q_lane_p8(a, b, 15);
1847*207e5cccSFangrui Song }
1848*207e5cccSFangrui Song 
1849*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.poly16x8x3_t @test_vld3q_lane_p16(ptr noundef %a, [3 x <8 x i16>] alignstack(16) %b.coerce) #0 {
1850*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x8x3_t, align 16
1851*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
1852*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16
1853*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
1854*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[B]], i32 0, i32 0
1855*207e5cccSFangrui Song // CHECK:   store [3 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
1856*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
1857*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
1858*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
1859*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
1860*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
1861*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
1862*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
1863*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
1864*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
1865*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
1866*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
1867*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
1868*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
1869*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
1870*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
1871*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
1872*207e5cccSFangrui Song // CHECK:   [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i64 7, ptr %a)
1873*207e5cccSFangrui Song // CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], ptr [[__RET]]
1874*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
1875*207e5cccSFangrui Song // CHECK:   [[TMP16:%.*]] = load %struct.poly16x8x3_t, ptr [[RETVAL]], align 16
1876*207e5cccSFangrui Song // CHECK:   ret %struct.poly16x8x3_t [[TMP16]]
// Calls vld3q_lane_p16 with the constant lane index 7 (the last element of an
// <8 x i16> vector). The FileCheck directives above expect a call to
// @llvm.aarch64.neon.ld3lane.v8i16.p0 with `i64 7` as the lane operand and
// `ptr %a` as the address.
1877*207e5cccSFangrui Song poly16x8x3_t test_vld3q_lane_p16(poly16_t  *a, poly16x8x3_t b) {
1878*207e5cccSFangrui Song   return vld3q_lane_p16(a, b, 7);
1879*207e5cccSFangrui Song }
1880*207e5cccSFangrui Song 
1881*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.poly64x2x3_t @test_vld3q_lane_p64(ptr noundef %a, [3 x <2 x i64>] alignstack(16) %b.coerce) #0 {
1882*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x2x3_t, align 16
1883*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.poly64x2x3_t, align 16
1884*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.poly64x2x3_t, align 16
1885*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.poly64x2x3_t, align 16
1886*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[B]], i32 0, i32 0
1887*207e5cccSFangrui Song // CHECK:   store [3 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
1888*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
1889*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[__S1]], i32 0, i32 0
1890*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
1891*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
1892*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
1893*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[__S1]], i32 0, i32 0
1894*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
1895*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
1896*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
1897*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[__S1]], i32 0, i32 0
1898*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
1899*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
1900*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
1901*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
1902*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
1903*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
1904*207e5cccSFangrui Song // CHECK:   [[VLD3_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3lane.v2i64.p0(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]], <2 x i64> [[TMP12]], i64 1, ptr %a)
1905*207e5cccSFangrui Song // CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], ptr [[__RET]]
1906*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
1907*207e5cccSFangrui Song // CHECK:   [[TMP16:%.*]] = load %struct.poly64x2x3_t, ptr [[RETVAL]], align 16
1908*207e5cccSFangrui Song // CHECK:   ret %struct.poly64x2x3_t [[TMP16]]
// Calls vld3q_lane_p64 with the constant lane index 1 (the last element of a
// <2 x i64> vector). The FileCheck directives above expect a call to
// @llvm.aarch64.neon.ld3lane.v2i64.p0 with `i64 1` as the lane operand and
// `ptr %a` as the address.
1909*207e5cccSFangrui Song poly64x2x3_t test_vld3q_lane_p64(poly64_t  *a, poly64x2x3_t b) {
1910*207e5cccSFangrui Song   return vld3q_lane_p64(a, b, 1);
1911*207e5cccSFangrui Song }
1912*207e5cccSFangrui Song 
1913*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.uint8x8x3_t @test_vld3_lane_u8(ptr noundef %a, [3 x <8 x i8>] alignstack(8) %b.coerce) #0 {
1914*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x8x3_t, align 8
1915*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
1916*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8
1917*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
1918*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[B]], i32 0, i32 0
1919*207e5cccSFangrui Song // CHECK:   store [3 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
1920*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
1921*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
1922*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
1923*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
1924*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
1925*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
1926*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
1927*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
1928*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
1929*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
1930*207e5cccSFangrui Song // CHECK:   [[VLD3_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, ptr %a)
1931*207e5cccSFangrui Song // CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], ptr [[__RET]]
1932*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
1933*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = load %struct.uint8x8x3_t, ptr [[RETVAL]], align 8
1934*207e5cccSFangrui Song // CHECK:   ret %struct.uint8x8x3_t [[TMP9]]
// Calls vld3_lane_u8 with the constant lane index 7 (the last element of an
// <8 x i8> vector). The FileCheck directives above expect 8-byte-aligned
// temporaries and the three <8 x i8> vectors of `b` passed, without any
// bitcasts, to @llvm.aarch64.neon.ld3lane.v8i8.p0 with `i64 7` and `ptr %a`.
1935*207e5cccSFangrui Song uint8x8x3_t test_vld3_lane_u8(uint8_t  *a, uint8x8x3_t b) {
1936*207e5cccSFangrui Song   return vld3_lane_u8(a, b, 7);
1937*207e5cccSFangrui Song }
1938*207e5cccSFangrui Song 
1939*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.uint16x4x3_t @test_vld3_lane_u16(ptr noundef %a,  [3 x <4 x i16>] alignstack(8) %b.coerce) #0 {
1940*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x4x3_t, align 8
1941*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
1942*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8
1943*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
1944*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[B]], i32 0, i32 0
1945*207e5cccSFangrui Song // CHECK:   store [3 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
1946*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
1947*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
1948*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
1949*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
1950*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
1951*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
1952*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
1953*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
1954*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
1955*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
1956*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
1957*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
1958*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
1959*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
1960*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
1961*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
1962*207e5cccSFangrui Song // CHECK:   [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0(<4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i64 3, ptr %a)
1963*207e5cccSFangrui Song // CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], ptr [[__RET]]
1964*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
1965*207e5cccSFangrui Song // CHECK:   [[TMP16:%.*]] = load %struct.uint16x4x3_t, ptr [[RETVAL]], align 8
1966*207e5cccSFangrui Song // CHECK:   ret %struct.uint16x4x3_t [[TMP16]]
// Calls vld3_lane_u16 with the constant lane index 3 (the last element of a
// <4 x i16> vector). The FileCheck directives above expect a call to
// @llvm.aarch64.neon.ld3lane.v4i16.p0 with `i64 3` as the lane operand and
// `ptr %a` as the address.
1967*207e5cccSFangrui Song uint16x4x3_t test_vld3_lane_u16(uint16_t  *a, uint16x4x3_t b) {
1968*207e5cccSFangrui Song   return vld3_lane_u16(a, b, 3);
1969*207e5cccSFangrui Song }
1970*207e5cccSFangrui Song 
1971*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.uint32x2x3_t @test_vld3_lane_u32(ptr noundef %a, [3 x <2 x i32>] alignstack(8) %b.coerce) #0 {
1972*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x2x3_t, align 8
1973*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
1974*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8
1975*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
1976*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[B]], i32 0, i32 0
1977*207e5cccSFangrui Song // CHECK:   store [3 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
1978*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
1979*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
1980*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
1981*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
1982*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
1983*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
1984*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
1985*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
1986*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
1987*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
1988*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2
1989*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
1990*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
1991*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
1992*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
1993*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
1994*207e5cccSFangrui Song // CHECK:   [[VLD3_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3lane.v2i32.p0(<2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i64 1, ptr %a)
1995*207e5cccSFangrui Song // CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE]], ptr [[__RET]]
1996*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
1997*207e5cccSFangrui Song // CHECK:   [[TMP16:%.*]] = load %struct.uint32x2x3_t, ptr [[RETVAL]], align 8
1998*207e5cccSFangrui Song // CHECK:   ret %struct.uint32x2x3_t [[TMP16]]
// Calls vld3_lane_u32 with the constant lane index 1 (the last element of a
// <2 x i32> vector). The FileCheck directives above expect a call to
// @llvm.aarch64.neon.ld3lane.v2i32.p0 with `i64 1` as the lane operand and
// `ptr %a` as the address.
1999*207e5cccSFangrui Song uint32x2x3_t test_vld3_lane_u32(uint32_t  *a, uint32x2x3_t b) {
2000*207e5cccSFangrui Song   return vld3_lane_u32(a, b, 1);
2001*207e5cccSFangrui Song }
2002*207e5cccSFangrui Song 
2003*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.uint64x1x3_t @test_vld3_lane_u64(ptr noundef %a, [3 x <1 x i64>] alignstack(8) %b.coerce) #0 {
2004*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x1x3_t, align 8
2005*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.uint64x1x3_t, align 8
2006*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8
2007*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.uint64x1x3_t, align 8
2008*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[B]], i32 0, i32 0
2009*207e5cccSFangrui Song // CHECK:   store [3 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
2010*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
2011*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[__S1]], i32 0, i32 0
2012*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
2013*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
2014*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
2015*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[__S1]], i32 0, i32 0
2016*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
2017*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
2018*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
2019*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[__S1]], i32 0, i32 0
2020*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
2021*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
2022*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
2023*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
2024*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
2025*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
2026*207e5cccSFangrui Song // CHECK:   [[VLD3_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3lane.v1i64.p0(<1 x i64> [[TMP10]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], i64 0, ptr %a)
2027*207e5cccSFangrui Song // CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], ptr [[__RET]]
2028*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
2029*207e5cccSFangrui Song // CHECK:   [[TMP16:%.*]] = load %struct.uint64x1x3_t, ptr [[RETVAL]], align 8
2030*207e5cccSFangrui Song // CHECK:   ret %struct.uint64x1x3_t [[TMP16]]
// Calls vld3_lane_u64 with the constant lane index 0 (the only element of a
// <1 x i64> vector). The FileCheck directives above expect a call to
// @llvm.aarch64.neon.ld3lane.v1i64.p0 with `i64 0` as the lane operand and
// `ptr %a` as the address.
2031*207e5cccSFangrui Song uint64x1x3_t test_vld3_lane_u64(uint64_t  *a, uint64x1x3_t b) {
2032*207e5cccSFangrui Song   return vld3_lane_u64(a, b, 0);
2033*207e5cccSFangrui Song }
2034*207e5cccSFangrui Song 
2035*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.int8x8x3_t @test_vld3_lane_s8(ptr noundef %a, [3 x <8 x i8>] alignstack(8) %b.coerce) #0 {
2036*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x8x3_t, align 8
2037*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
2038*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8
2039*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
2040*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[B]], i32 0, i32 0
2041*207e5cccSFangrui Song // CHECK:   store [3 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
2042*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
2043*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
2044*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
2045*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
2046*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
2047*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
2048*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
2049*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
2050*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
2051*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
2052*207e5cccSFangrui Song // CHECK:   [[VLD3_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, ptr %a)
2053*207e5cccSFangrui Song // CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], ptr [[__RET]]
2054*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
2055*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = load %struct.int8x8x3_t, ptr [[RETVAL]], align 8
2056*207e5cccSFangrui Song // CHECK:   ret %struct.int8x8x3_t [[TMP9]]
// Test wrapper: forwards `a` and `b` to vld3_lane_s8 with constant lane
// index 7 and returns the intrinsic's result unchanged. The FileCheck
// directives above expect the three <8 x i8> vectors loaded from the copy of
// `b` to be passed directly (no bitcasts) to
// @llvm.aarch64.neon.ld3lane.v8i8.p0 with lane operand `i64 7`.
2057*207e5cccSFangrui Song int8x8x3_t test_vld3_lane_s8(int8_t  *a, int8x8x3_t b) {
2058*207e5cccSFangrui Song   return vld3_lane_s8(a, b, 7);
2059*207e5cccSFangrui Song }
2060*207e5cccSFangrui Song 
2061*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.int16x4x3_t @test_vld3_lane_s16(ptr noundef %a, [3 x <4 x i16>] alignstack(8) %b.coerce) #0 {
2062*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x4x3_t, align 8
2063*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
2064*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8
2065*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
2066*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[B]], i32 0, i32 0
2067*207e5cccSFangrui Song // CHECK:   store [3 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
2068*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
2069*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
2070*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
2071*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
2072*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
2073*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
2074*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
2075*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
2076*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
2077*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
2078*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
2079*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
2080*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
2081*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
2082*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
2083*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
2084*207e5cccSFangrui Song // CHECK:   [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0(<4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i64 3, ptr %a)
2085*207e5cccSFangrui Song // CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], ptr [[__RET]]
2086*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
2087*207e5cccSFangrui Song // CHECK:   [[TMP16:%.*]] = load %struct.int16x4x3_t, ptr [[RETVAL]], align 8
2088*207e5cccSFangrui Song // CHECK:   ret %struct.int16x4x3_t [[TMP16]]
// Test wrapper: forwards `a` and `b` to vld3_lane_s16 with constant lane
// index 3 and returns the intrinsic's result unchanged. The FileCheck
// directives above expect each <4 x i16> vector to be bitcast to <8 x i8>
// and back before the call to @llvm.aarch64.neon.ld3lane.v4i16.p0, whose
// lane operand is `i64 3`.
2089*207e5cccSFangrui Song int16x4x3_t test_vld3_lane_s16(int16_t  *a, int16x4x3_t b) {
2090*207e5cccSFangrui Song   return vld3_lane_s16(a, b, 3);
2091*207e5cccSFangrui Song }
2092*207e5cccSFangrui Song 
2093*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.int32x2x3_t @test_vld3_lane_s32(ptr noundef %a, [3 x <2 x i32>] alignstack(8) %b.coerce) #0 {
2094*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x2x3_t, align 8
2095*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
2096*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8
2097*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
2098*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[B]], i32 0, i32 0
2099*207e5cccSFangrui Song // CHECK:   store [3 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
2100*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
2101*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
2102*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
2103*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
2104*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
2105*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
2106*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
2107*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
2108*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
2109*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
2110*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2
2111*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
2112*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
2113*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
2114*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
2115*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
2116*207e5cccSFangrui Song // CHECK:   [[VLD3_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3lane.v2i32.p0(<2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i64 1, ptr %a)
2117*207e5cccSFangrui Song // CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE]], ptr [[__RET]]
2118*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
2119*207e5cccSFangrui Song // CHECK:   [[TMP16:%.*]] = load %struct.int32x2x3_t, ptr [[RETVAL]], align 8
2120*207e5cccSFangrui Song // CHECK:   ret %struct.int32x2x3_t [[TMP16]]
// Test wrapper: forwards `a` and `b` to vld3_lane_s32 with constant lane
// index 1 and returns the intrinsic's result unchanged. The FileCheck
// directives above expect each <2 x i32> vector to be bitcast to <8 x i8>
// and back before the call to @llvm.aarch64.neon.ld3lane.v2i32.p0, whose
// lane operand is `i64 1`.
2121*207e5cccSFangrui Song int32x2x3_t test_vld3_lane_s32(int32_t  *a, int32x2x3_t b) {
2122*207e5cccSFangrui Song   return vld3_lane_s32(a, b, 1);
2123*207e5cccSFangrui Song }
2124*207e5cccSFangrui Song 
2125*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.int64x1x3_t @test_vld3_lane_s64(ptr noundef %a, [3 x <1 x i64>] alignstack(8) %b.coerce) #0 {
2126*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x1x3_t, align 8
2127*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.int64x1x3_t, align 8
2128*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8
2129*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.int64x1x3_t, align 8
2130*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[B]], i32 0, i32 0
2131*207e5cccSFangrui Song // CHECK:   store [3 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
2132*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
2133*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[__S1]], i32 0, i32 0
2134*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
2135*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
2136*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
2137*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[__S1]], i32 0, i32 0
2138*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
2139*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
2140*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
2141*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[__S1]], i32 0, i32 0
2142*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
2143*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
2144*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
2145*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
2146*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
2147*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
2148*207e5cccSFangrui Song // CHECK:   [[VLD3_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3lane.v1i64.p0(<1 x i64> [[TMP10]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], i64 0, ptr %a)
2149*207e5cccSFangrui Song // CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], ptr [[__RET]]
2150*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
2151*207e5cccSFangrui Song // CHECK:   [[TMP16:%.*]] = load %struct.int64x1x3_t, ptr [[RETVAL]], align 8
2152*207e5cccSFangrui Song // CHECK:   ret %struct.int64x1x3_t [[TMP16]]
// Test wrapper: forwards `a` and `b` to vld3_lane_s64 with constant lane
// index 0 and returns the intrinsic's result unchanged. The FileCheck
// directives above expect each <1 x i64> vector to be bitcast to <8 x i8>
// and back before the call to @llvm.aarch64.neon.ld3lane.v1i64.p0, whose
// lane operand is `i64 0`.
2153*207e5cccSFangrui Song int64x1x3_t test_vld3_lane_s64(int64_t  *a, int64x1x3_t b) {
2154*207e5cccSFangrui Song   return vld3_lane_s64(a, b, 0);
2155*207e5cccSFangrui Song }
2156*207e5cccSFangrui Song 
2157*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.float16x4x3_t @test_vld3_lane_f16(ptr noundef %a, [3 x <4 x half>] alignstack(8) %b.coerce) #0 {
2158*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x4x3_t, align 8
2159*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
2160*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
2161*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
2162*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[B]], i32 0, i32 0
2163*207e5cccSFangrui Song // CHECK:   store [3 x <4 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
2164*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
2165*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
2166*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL]], i64 0, i64 0
2167*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
2168*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
2169*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
2170*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL1]], i64 0, i64 1
2171*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
2172*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
2173*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
2174*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL3]], i64 0, i64 2
2175*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
2176*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8>
2177*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
2178*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
2179*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x half>
2180*207e5cccSFangrui Song // CHECK:   [[VLD3_LANE:%.*]] = call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3lane.v4f16.p0(<4 x half> [[TMP10]], <4 x half> [[TMP11]], <4 x half> [[TMP12]], i64 3, ptr %a)
2181*207e5cccSFangrui Song // CHECK:   store { <4 x half>, <4 x half>, <4 x half> } [[VLD3_LANE]], ptr [[__RET]]
2182*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
2183*207e5cccSFangrui Song // CHECK:   [[TMP16:%.*]] = load %struct.float16x4x3_t, ptr [[RETVAL]], align 8
2184*207e5cccSFangrui Song // CHECK:   ret %struct.float16x4x3_t [[TMP16]]
// Test wrapper: forwards `a` and `b` to vld3_lane_f16 with constant lane
// index 3 and returns the intrinsic's result unchanged. The FileCheck
// directives above expect each <4 x half> vector to be bitcast to <8 x i8>
// and back before the call to @llvm.aarch64.neon.ld3lane.v4f16.p0, whose
// lane operand is `i64 3`.
2185*207e5cccSFangrui Song float16x4x3_t test_vld3_lane_f16(float16_t  *a, float16x4x3_t b) {
2186*207e5cccSFangrui Song   return vld3_lane_f16(a, b, 3);
2187*207e5cccSFangrui Song }
2188*207e5cccSFangrui Song 
2189*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.float32x2x3_t @test_vld3_lane_f32(ptr noundef %a, [3 x <2 x float>] alignstack(8) %b.coerce) #0 {
2190*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x2x3_t, align 8
2191*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
2192*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8
2193*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
2194*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[B]], i32 0, i32 0
2195*207e5cccSFangrui Song // CHECK:   store [3 x <2 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
2196*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
2197*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
2198*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL]], i64 0, i64 0
2199*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
2200*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
2201*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
2202*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL1]], i64 0, i64 1
2203*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
2204*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
2205*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
2206*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL3]], i64 0, i64 2
2207*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
2208*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8>
2209*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
2210*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
2211*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float>
2212*207e5cccSFangrui Song // CHECK:   [[VLD3_LANE:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3lane.v2f32.p0(<2 x float> [[TMP10]], <2 x float> [[TMP11]], <2 x float> [[TMP12]], i64 1, ptr %a)
2213*207e5cccSFangrui Song // CHECK:   store { <2 x float>, <2 x float>, <2 x float> } [[VLD3_LANE]], ptr [[__RET]]
2214*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
2215*207e5cccSFangrui Song // CHECK:   [[TMP16:%.*]] = load %struct.float32x2x3_t, ptr [[RETVAL]], align 8
2216*207e5cccSFangrui Song // CHECK:   ret %struct.float32x2x3_t [[TMP16]]
// Test wrapper: forwards `a` and `b` to vld3_lane_f32 with constant lane
// index 1 and returns the intrinsic's result unchanged. The FileCheck
// directives above expect each <2 x float> vector to be bitcast to <8 x i8>
// and back before the call to @llvm.aarch64.neon.ld3lane.v2f32.p0, whose
// lane operand is `i64 1`.
2217*207e5cccSFangrui Song float32x2x3_t test_vld3_lane_f32(float32_t  *a, float32x2x3_t b) {
2218*207e5cccSFangrui Song   return vld3_lane_f32(a, b, 1);
2219*207e5cccSFangrui Song }
2220*207e5cccSFangrui Song 
2221*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.float64x1x3_t @test_vld3_lane_f64(ptr noundef %a, [3 x <1 x double>] alignstack(8) %b.coerce) #0 {
2222*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x1x3_t, align 8
2223*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.float64x1x3_t, align 8
2224*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.float64x1x3_t, align 8
2225*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.float64x1x3_t, align 8
2226*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[B]], i32 0, i32 0
2227*207e5cccSFangrui Song // CHECK:   store [3 x <1 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
2228*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
2229*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[__S1]], i32 0, i32 0
2230*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL]], i64 0, i64 0
2231*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8
2232*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <1 x double> [[TMP4]] to <8 x i8>
2233*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[__S1]], i32 0, i32 0
2234*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL1]], i64 0, i64 1
2235*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
2236*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <1 x double> [[TMP6]] to <8 x i8>
2237*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[__S1]], i32 0, i32 0
2238*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL3]], i64 0, i64 2
2239*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = load <1 x double>, ptr [[ARRAYIDX4]], align 8
2240*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <1 x double> [[TMP8]] to <8 x i8>
2241*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
2242*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x double>
2243*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x double>
2244*207e5cccSFangrui Song // CHECK:   [[VLD3_LANE:%.*]] = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3lane.v1f64.p0(<1 x double> [[TMP10]], <1 x double> [[TMP11]], <1 x double> [[TMP12]], i64 0, ptr %a)
2245*207e5cccSFangrui Song // CHECK:   store { <1 x double>, <1 x double>, <1 x double> } [[VLD3_LANE]], ptr [[__RET]]
2246*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
2247*207e5cccSFangrui Song // CHECK:   [[TMP16:%.*]] = load %struct.float64x1x3_t, ptr [[RETVAL]], align 8
2248*207e5cccSFangrui Song // CHECK:   ret %struct.float64x1x3_t [[TMP16]]
// Test wrapper: forwards `a` and `b` to vld3_lane_f64 with constant lane
// index 0 and returns the intrinsic's result unchanged. The FileCheck
// directives above expect each <1 x double> vector to be bitcast to <8 x i8>
// and back before the call to @llvm.aarch64.neon.ld3lane.v1f64.p0, whose
// lane operand is `i64 0`.
2249*207e5cccSFangrui Song float64x1x3_t test_vld3_lane_f64(float64_t  *a, float64x1x3_t b) {
2250*207e5cccSFangrui Song   return vld3_lane_f64(a, b, 0);
2251*207e5cccSFangrui Song }
2252*207e5cccSFangrui Song 
2253*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.poly8x8x3_t @test_vld3_lane_p8(ptr noundef %a, [3 x <8 x i8>] alignstack(8) %b.coerce) #0 {
2254*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x8x3_t, align 8
2255*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
2256*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8
2257*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
2258*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[B]], i32 0, i32 0
2259*207e5cccSFangrui Song // CHECK:   store [3 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
2260*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
2261*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
2262*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
2263*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
2264*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
2265*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
2266*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
2267*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
2268*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
2269*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
2270*207e5cccSFangrui Song // CHECK:   [[VLD3_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, ptr %a)
2271*207e5cccSFangrui Song // CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], ptr [[__RET]]
2272*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
2273*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = load %struct.poly8x8x3_t, ptr [[RETVAL]], align 8
2274*207e5cccSFangrui Song // CHECK:   ret %struct.poly8x8x3_t [[TMP9]]
// Test wrapper: forwards `a` and `b` to vld3_lane_p8 with constant lane
// index 7 and returns the intrinsic's result unchanged. The FileCheck
// directives above expect the three <8 x i8> vectors loaded from the copy of
// `b` to be passed directly (no bitcasts) to
// @llvm.aarch64.neon.ld3lane.v8i8.p0 with lane operand `i64 7`.
2275*207e5cccSFangrui Song poly8x8x3_t test_vld3_lane_p8(poly8_t  *a, poly8x8x3_t b) {
2276*207e5cccSFangrui Song   return vld3_lane_p8(a, b, 7);
2277*207e5cccSFangrui Song }
2278*207e5cccSFangrui Song 
2279*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.poly16x4x3_t @test_vld3_lane_p16(ptr noundef %a, [3 x <4 x i16>] alignstack(8) %b.coerce) #0 {
2280*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x4x3_t, align 8
2281*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
2282*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8
2283*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
2284*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[B]], i32 0, i32 0
2285*207e5cccSFangrui Song // CHECK:   store [3 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
2286*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
2287*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
2288*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
2289*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
2290*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
2291*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
2292*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
2293*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
2294*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
2295*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
2296*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
2297*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
2298*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
2299*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
2300*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
2301*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
2302*207e5cccSFangrui Song // CHECK:   [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0(<4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i64 3, ptr %a)
2303*207e5cccSFangrui Song // CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], ptr [[__RET]]
2304*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
2305*207e5cccSFangrui Song // CHECK:   [[TMP16:%.*]] = load %struct.poly16x4x3_t, ptr [[RETVAL]], align 8
2306*207e5cccSFangrui Song // CHECK:   ret %struct.poly16x4x3_t [[TMP16]]
// Test wrapper: forwards `a` and `b` to vld3_lane_p16 with constant lane
// index 3 and returns the intrinsic's result unchanged. The FileCheck
// directives above expect each <4 x i16> vector to be bitcast to <8 x i8>
// and back before the call to @llvm.aarch64.neon.ld3lane.v4i16.p0, whose
// lane operand is `i64 3`.
2307*207e5cccSFangrui Song poly16x4x3_t test_vld3_lane_p16(poly16_t  *a, poly16x4x3_t b) {
2308*207e5cccSFangrui Song   return vld3_lane_p16(a, b, 3);
2309*207e5cccSFangrui Song }
2310*207e5cccSFangrui Song 
2311*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.poly64x1x3_t @test_vld3_lane_p64(ptr noundef %a, [3 x <1 x i64>] alignstack(8) %b.coerce) #0 {
2312*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x1x3_t, align 8
2313*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.poly64x1x3_t, align 8
2314*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.poly64x1x3_t, align 8
2315*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.poly64x1x3_t, align 8
2316*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[B]], i32 0, i32 0
2317*207e5cccSFangrui Song // CHECK:   store [3 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
2318*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
2319*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[__S1]], i32 0, i32 0
2320*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
2321*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
2322*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
2323*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[__S1]], i32 0, i32 0
2324*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
2325*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
2326*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
2327*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[__S1]], i32 0, i32 0
2328*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
2329*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
2330*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
2331*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
2332*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
2333*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
2334*207e5cccSFangrui Song // CHECK:   [[VLD3_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3lane.v1i64.p0(<1 x i64> [[TMP10]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], i64 0, ptr %a)
2335*207e5cccSFangrui Song // CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], ptr [[__RET]]
2336*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
2337*207e5cccSFangrui Song // CHECK:   [[TMP16:%.*]] = load %struct.poly64x1x3_t, ptr [[RETVAL]], align 8
2338*207e5cccSFangrui Song // CHECK:   ret %struct.poly64x1x3_t [[TMP16]]
// Codegen test driver for vld3_lane_p64: forwards `a` and `b` unchanged and
// requests lane 0 (the only element index of a <1 x i64> vector).
// The FileCheck directives above expect a call to
// @llvm.aarch64.neon.ld3lane.v1i64.p0 with lane operand `i64 0` and `ptr %a`.
// The body stays a single return statement because those directives pin the
// exact IR sequence that is emitted.
2339*207e5cccSFangrui Song poly64x1x3_t test_vld3_lane_p64(poly64_t  *a, poly64x1x3_t b) {
2340*207e5cccSFangrui Song   return vld3_lane_p64(a, b, 0);
2341*207e5cccSFangrui Song }
2342*207e5cccSFangrui Song 
2343*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.uint8x16x4_t @test_vld4q_lane_u8(ptr noundef %a, [4 x <16 x i8>] alignstack(16) %b.coerce) #0 {
2344*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x16x4_t, align 16
2345*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16
2346*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x4_t, align 16
2347*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x4_t, align 16
2348*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[B]], i32 0, i32 0
2349*207e5cccSFangrui Song // CHECK:   store [4 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
2350*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
2351*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
2352*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
2353*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
2354*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
2355*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
2356*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
2357*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
2358*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
2359*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
2360*207e5cccSFangrui Song // CHECK:   [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
2361*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i64 0, i64 3
2362*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
2363*207e5cccSFangrui Song // CHECK:   [[VLD4_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4lane.v16i8.p0(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i64 15, ptr %a)
2364*207e5cccSFangrui Song // CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], ptr [[__RET]]
2365*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
2366*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = load %struct.uint8x16x4_t, ptr [[RETVAL]], align 16
2367*207e5cccSFangrui Song // CHECK:   ret %struct.uint8x16x4_t [[TMP10]]
// Codegen test driver for vld4q_lane_u8: forwards `a` and `b` unchanged and
// requests lane 15, the last element index of a <16 x i8> vector.
// The FileCheck directives above expect the four <16 x i8> members of `b` to
// be loaded and passed directly (no bitcasts) to
// @llvm.aarch64.neon.ld4lane.v16i8.p0 with lane operand `i64 15`.
// The body stays a single return statement because those directives pin the
// exact IR sequence that is emitted.
2368*207e5cccSFangrui Song uint8x16x4_t test_vld4q_lane_u8(uint8_t  *a, uint8x16x4_t b) {
2369*207e5cccSFangrui Song   return vld4q_lane_u8(a, b, 15);
2370*207e5cccSFangrui Song }
2371*207e5cccSFangrui Song 
2372*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.uint16x8x4_t @test_vld4q_lane_u16(ptr noundef %a, [4 x <8 x i16>] alignstack(16) %b.coerce) #0 {
2373*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x8x4_t, align 16
2374*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
2375*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16
2376*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
2377*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[B]], i32 0, i32 0
2378*207e5cccSFangrui Song // CHECK:   store [4 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
2379*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
2380*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
2381*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
2382*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
2383*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
2384*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
2385*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
2386*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
2387*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
2388*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
2389*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
2390*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
2391*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
2392*207e5cccSFangrui Song // CHECK:   [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
2393*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i64 0, i64 3
2394*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
2395*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
2396*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
2397*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
2398*207e5cccSFangrui Song // CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
2399*207e5cccSFangrui Song // CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
2400*207e5cccSFangrui Song // CHECK:   [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0(<8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i64 7, ptr %a)
2401*207e5cccSFangrui Song // CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], ptr [[__RET]]
2402*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
2403*207e5cccSFangrui Song // CHECK:   [[TMP19:%.*]] = load %struct.uint16x8x4_t, ptr [[RETVAL]], align 16
2404*207e5cccSFangrui Song // CHECK:   ret %struct.uint16x8x4_t [[TMP19]]
// Codegen test driver for vld4q_lane_u16: forwards `a` and `b` unchanged and
// requests lane 7, the last element index of an <8 x i16> vector.
// The FileCheck directives above expect each member of `b` to be bitcast to
// <16 x i8> and back to <8 x i16> before the call to
// @llvm.aarch64.neon.ld4lane.v8i16.p0 with lane operand `i64 7`.
// The body stays a single return statement because those directives pin the
// exact IR sequence that is emitted.
2405*207e5cccSFangrui Song uint16x8x4_t test_vld4q_lane_u16(uint16_t  *a, uint16x8x4_t b) {
2406*207e5cccSFangrui Song   return vld4q_lane_u16(a, b, 7);
2407*207e5cccSFangrui Song }
2408*207e5cccSFangrui Song 
2409*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.uint32x4x4_t @test_vld4q_lane_u32(ptr noundef %a, [4 x <4 x i32>] alignstack(16) %b.coerce) #0 {
2410*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x4x4_t, align 16
2411*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
2412*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16
2413*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
2414*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[B]], i32 0, i32 0
2415*207e5cccSFangrui Song // CHECK:   store [4 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
2416*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
2417*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
2418*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
2419*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
2420*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
2421*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
2422*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
2423*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
2424*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
2425*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
2426*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2
2427*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
2428*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
2429*207e5cccSFangrui Song // CHECK:   [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
2430*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i64 0, i64 3
2431*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
2432*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
2433*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
2434*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
2435*207e5cccSFangrui Song // CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
2436*207e5cccSFangrui Song // CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32>
2437*207e5cccSFangrui Song // CHECK:   [[VLD4_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0(<4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i64 3, ptr %a)
2438*207e5cccSFangrui Song // CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4_LANE]], ptr [[__RET]]
2439*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
2440*207e5cccSFangrui Song // CHECK:   [[TMP19:%.*]] = load %struct.uint32x4x4_t, ptr [[RETVAL]], align 16
2441*207e5cccSFangrui Song // CHECK:   ret %struct.uint32x4x4_t [[TMP19]]
// Codegen test driver for vld4q_lane_u32: forwards `a` and `b` unchanged and
// requests lane 3, the last element index of a <4 x i32> vector.
// The FileCheck directives above expect each member of `b` to be bitcast to
// <16 x i8> and back to <4 x i32> before the call to
// @llvm.aarch64.neon.ld4lane.v4i32.p0 with lane operand `i64 3`.
// The body stays a single return statement because those directives pin the
// exact IR sequence that is emitted.
2442*207e5cccSFangrui Song uint32x4x4_t test_vld4q_lane_u32(uint32_t  *a, uint32x4x4_t b) {
2443*207e5cccSFangrui Song   return vld4q_lane_u32(a, b, 3);
2444*207e5cccSFangrui Song }
2445*207e5cccSFangrui Song 
2446*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.uint64x2x4_t @test_vld4q_lane_u64(ptr noundef %a, [4 x <2 x i64>] alignstack(16) %b.coerce) #0 {
2447*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x2x4_t, align 16
2448*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.uint64x2x4_t, align 16
2449*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.uint64x2x4_t, align 16
2450*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.uint64x2x4_t, align 16
2451*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[B]], i32 0, i32 0
2452*207e5cccSFangrui Song // CHECK:   store [4 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
2453*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
2454*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0
2455*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
2456*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
2457*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
2458*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0
2459*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
2460*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
2461*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
2462*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0
2463*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
2464*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
2465*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
2466*207e5cccSFangrui Song // CHECK:   [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0
2467*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL5]], i64 0, i64 3
2468*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16
2469*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <2 x i64> [[TMP10]] to <16 x i8>
2470*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
2471*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
2472*207e5cccSFangrui Song // CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
2473*207e5cccSFangrui Song // CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <2 x i64>
2474*207e5cccSFangrui Song // CHECK:   [[VLD4_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4lane.v2i64.p0(<2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], <2 x i64> [[TMP15]], i64 1, ptr %a)
2475*207e5cccSFangrui Song // CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], ptr [[__RET]]
2476*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
2477*207e5cccSFangrui Song // CHECK:   [[TMP19:%.*]] = load %struct.uint64x2x4_t, ptr [[RETVAL]], align 16
2478*207e5cccSFangrui Song // CHECK:   ret %struct.uint64x2x4_t [[TMP19]]
// Codegen test driver for vld4q_lane_u64: forwards `a` and `b` unchanged and
// requests lane 1, the last element index of a <2 x i64> vector.
// The FileCheck directives above expect each member of `b` to be bitcast to
// <16 x i8> and back to <2 x i64> before the call to
// @llvm.aarch64.neon.ld4lane.v2i64.p0 with lane operand `i64 1`.
// The body stays a single return statement because those directives pin the
// exact IR sequence that is emitted.
2479*207e5cccSFangrui Song uint64x2x4_t test_vld4q_lane_u64(uint64_t  *a, uint64x2x4_t b) {
2480*207e5cccSFangrui Song   return vld4q_lane_u64(a, b, 1);
2481*207e5cccSFangrui Song }
2482*207e5cccSFangrui Song 
2483*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.int8x16x4_t @test_vld4q_lane_s8(ptr noundef %a, [4 x <16 x i8>] alignstack(16) %b.coerce) #0 {
2484*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x16x4_t, align 16
2485*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.int8x16x4_t, align 16
2486*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x4_t, align 16
2487*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x4_t, align 16
2488*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[B]], i32 0, i32 0
2489*207e5cccSFangrui Song // CHECK:   store [4 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
2490*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
2491*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
2492*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
2493*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
2494*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
2495*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
2496*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
2497*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
2498*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
2499*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
2500*207e5cccSFangrui Song // CHECK:   [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
2501*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i64 0, i64 3
2502*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
2503*207e5cccSFangrui Song // CHECK:   [[VLD4_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4lane.v16i8.p0(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i64 15, ptr %a)
2504*207e5cccSFangrui Song // CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], ptr [[__RET]]
2505*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
2506*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = load %struct.int8x16x4_t, ptr [[RETVAL]], align 16
2507*207e5cccSFangrui Song // CHECK:   ret %struct.int8x16x4_t [[TMP10]]
// Codegen test driver for vld4q_lane_s8: forwards `a` and `b` unchanged and
// requests lane 15, the last element index of a <16 x i8> vector.
// The FileCheck directives above expect the four <16 x i8> members of `b` to
// be loaded and passed directly (no bitcasts) to
// @llvm.aarch64.neon.ld4lane.v16i8.p0 with lane operand `i64 15`.
// The body stays a single return statement because those directives pin the
// exact IR sequence that is emitted.
2508*207e5cccSFangrui Song int8x16x4_t test_vld4q_lane_s8(int8_t  *a, int8x16x4_t b) {
2509*207e5cccSFangrui Song   return vld4q_lane_s8(a, b, 15);
2510*207e5cccSFangrui Song }
2511*207e5cccSFangrui Song 
2512*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.int16x8x4_t @test_vld4q_lane_s16(ptr noundef %a, [4 x <8 x i16>] alignstack(16) %b.coerce) #0 {
2513*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x8x4_t, align 16
2514*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
2515*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16
2516*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
2517*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[B]], i32 0, i32 0
2518*207e5cccSFangrui Song // CHECK:   store [4 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
2519*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
2520*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
2521*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
2522*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
2523*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
2524*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
2525*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
2526*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
2527*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
2528*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
2529*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
2530*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
2531*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
2532*207e5cccSFangrui Song // CHECK:   [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
2533*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i64 0, i64 3
2534*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
2535*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
2536*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
2537*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
2538*207e5cccSFangrui Song // CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
2539*207e5cccSFangrui Song // CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
2540*207e5cccSFangrui Song // CHECK:   [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0(<8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i64 7, ptr %a)
2541*207e5cccSFangrui Song // CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], ptr [[__RET]]
2542*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
2543*207e5cccSFangrui Song // CHECK:   [[TMP19:%.*]] = load %struct.int16x8x4_t, ptr [[RETVAL]], align 16
2544*207e5cccSFangrui Song // CHECK:   ret %struct.int16x8x4_t [[TMP19]]
// Codegen test driver for vld4q_lane_s16: forwards `a` and `b` unchanged and
// requests lane 7, the last element index of an <8 x i16> vector.
// The FileCheck directives above expect each member of `b` to be bitcast to
// <16 x i8> and back to <8 x i16> before the call to
// @llvm.aarch64.neon.ld4lane.v8i16.p0 with lane operand `i64 7`.
// The body stays a single return statement because those directives pin the
// exact IR sequence that is emitted.
2545*207e5cccSFangrui Song int16x8x4_t test_vld4q_lane_s16(int16_t  *a, int16x8x4_t b) {
2546*207e5cccSFangrui Song   return vld4q_lane_s16(a, b, 7);
2547*207e5cccSFangrui Song }
2548*207e5cccSFangrui Song 
2549*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.int32x4x4_t @test_vld4q_lane_s32(ptr noundef %a, [4 x <4 x i32>] alignstack(16) %b.coerce) #0 {
2550*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x4x4_t, align 16
2551*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
2552*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16
2553*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
2554*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[B]], i32 0, i32 0
2555*207e5cccSFangrui Song // CHECK:   store [4 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
2556*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
2557*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
2558*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
2559*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
2560*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
2561*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
2562*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
2563*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
2564*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
2565*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
2566*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2
2567*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
2568*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
2569*207e5cccSFangrui Song // CHECK:   [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
2570*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i64 0, i64 3
2571*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
2572*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
2573*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
2574*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
2575*207e5cccSFangrui Song // CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
2576*207e5cccSFangrui Song // CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32>
2577*207e5cccSFangrui Song // CHECK:   [[VLD4_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0(<4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i64 3, ptr %a)
2578*207e5cccSFangrui Song // CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4_LANE]], ptr [[__RET]]
2579*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
2580*207e5cccSFangrui Song // CHECK:   [[TMP19:%.*]] = load %struct.int32x4x4_t, ptr [[RETVAL]], align 16
2581*207e5cccSFangrui Song // CHECK:   ret %struct.int32x4x4_t [[TMP19]]
// Codegen test driver for vld4q_lane_s32: forwards `a` and `b` unchanged and
// requests lane 3, the last element index of a <4 x i32> vector.
// The FileCheck directives above expect each member of `b` to be bitcast to
// <16 x i8> and back to <4 x i32> before the call to
// @llvm.aarch64.neon.ld4lane.v4i32.p0 with lane operand `i64 3`.
// The body stays a single return statement because those directives pin the
// exact IR sequence that is emitted.
2582*207e5cccSFangrui Song int32x4x4_t test_vld4q_lane_s32(int32_t  *a, int32x4x4_t b) {
2583*207e5cccSFangrui Song   return vld4q_lane_s32(a, b, 3);
2584*207e5cccSFangrui Song }
2585*207e5cccSFangrui Song 
2586*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.int64x2x4_t @test_vld4q_lane_s64(ptr noundef %a, [4 x <2 x i64>] alignstack(16) %b.coerce) #0 {
2587*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x2x4_t, align 16
2588*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.int64x2x4_t, align 16
2589*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.int64x2x4_t, align 16
2590*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.int64x2x4_t, align 16
2591*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[B]], i32 0, i32 0
2592*207e5cccSFangrui Song // CHECK:   store [4 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
2593*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
2594*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0
2595*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
2596*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
2597*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
2598*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0
2599*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
2600*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
2601*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
2602*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0
2603*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
2604*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
2605*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
2606*207e5cccSFangrui Song // CHECK:   [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0
2607*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL5]], i64 0, i64 3
2608*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16
2609*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <2 x i64> [[TMP10]] to <16 x i8>
2610*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
2611*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
2612*207e5cccSFangrui Song // CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
2613*207e5cccSFangrui Song // CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <2 x i64>
2614*207e5cccSFangrui Song // CHECK:   [[VLD4_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4lane.v2i64.p0(<2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], <2 x i64> [[TMP15]], i64 1, ptr %a)
2615*207e5cccSFangrui Song // CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], ptr [[__RET]]
2616*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
2617*207e5cccSFangrui Song // CHECK:   [[TMP19:%.*]] = load %struct.int64x2x4_t, ptr [[RETVAL]], align 16
2618*207e5cccSFangrui Song // CHECK:   ret %struct.int64x2x4_t [[TMP19]]
// Codegen test driver for vld4q_lane_s64: forwards `a` and `b` unchanged and
// requests lane 1, the last element index of a <2 x i64> vector.
// The FileCheck directives above expect each member of `b` to be bitcast to
// <16 x i8> and back to <2 x i64> before the call to
// @llvm.aarch64.neon.ld4lane.v2i64.p0 with lane operand `i64 1`.
// The body stays a single return statement because those directives pin the
// exact IR sequence that is emitted.
2619*207e5cccSFangrui Song int64x2x4_t test_vld4q_lane_s64(int64_t  *a, int64x2x4_t b) {
2620*207e5cccSFangrui Song   return vld4q_lane_s64(a, b, 1);
2621*207e5cccSFangrui Song }
2622*207e5cccSFangrui Song 
2623*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.float16x8x4_t @test_vld4q_lane_f16(ptr noundef %a, [4 x <8 x half>] alignstack(16) %b.coerce) #0 {
2624*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x8x4_t, align 16
2625*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
2626*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16
2627*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
2628*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[B]], i32 0, i32 0
2629*207e5cccSFangrui Song // CHECK:   store [4 x <8 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
2630*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
2631*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
2632*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL]], i64 0, i64 0
2633*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
2634*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
2635*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
2636*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL1]], i64 0, i64 1
2637*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
2638*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
2639*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
2640*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL3]], i64 0, i64 2
2641*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
2642*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
2643*207e5cccSFangrui Song // CHECK:   [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
2644*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL5]], i64 0, i64 3
2645*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = load <8 x half>, ptr [[ARRAYIDX6]], align 16
2646*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <8 x half> [[TMP10]] to <16 x i8>
2647*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
2648*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
2649*207e5cccSFangrui Song // CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x half>
2650*207e5cccSFangrui Song // CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x half>
2651*207e5cccSFangrui Song // CHECK:   [[VLD4_LANE:%.*]] = call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4lane.v8f16.p0(<8 x half> [[TMP12]], <8 x half> [[TMP13]], <8 x half> [[TMP14]], <8 x half> [[TMP15]], i64 7, ptr %a)
2652*207e5cccSFangrui Song // CHECK:   store { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4_LANE]], ptr [[__RET]]
2653*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
2654*207e5cccSFangrui Song // CHECK:   [[TMP19:%.*]] = load %struct.float16x8x4_t, ptr [[RETVAL]], align 16
2655*207e5cccSFangrui Song // CHECK:   ret %struct.float16x8x4_t [[TMP19]]
// Codegen test wrapper: forwards `a` and `b` to vld4q_lane_f16 with the
// constant lane index 7 and returns the resulting float16x8x4_t unchanged.
// The FileCheck directives preceding this function expect a call to
// @llvm.aarch64.neon.ld4lane.v8f16.p0 whose lane operand is `i64 7` and whose
// pointer operand is %a; they pin the IR for this exact body.
2656*207e5cccSFangrui Song float16x8x4_t test_vld4q_lane_f16(float16_t  *a, float16x8x4_t b) {
2657*207e5cccSFangrui Song   return vld4q_lane_f16(a, b, 7);
2658*207e5cccSFangrui Song }
2659*207e5cccSFangrui Song 
2660*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.float32x4x4_t @test_vld4q_lane_f32(ptr noundef %a, [4 x <4 x float>] alignstack(16) %b.coerce) #0 {
2661*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x4x4_t, align 16
2662*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
2663*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16
2664*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
2665*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[B]], i32 0, i32 0
2666*207e5cccSFangrui Song // CHECK:   store [4 x <4 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
2667*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
2668*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
2669*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL]], i64 0, i64 0
2670*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
2671*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
2672*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
2673*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL1]], i64 0, i64 1
2674*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
2675*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
2676*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
2677*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL3]], i64 0, i64 2
2678*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
2679*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
2680*207e5cccSFangrui Song // CHECK:   [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
2681*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL5]], i64 0, i64 3
2682*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = load <4 x float>, ptr [[ARRAYIDX6]], align 16
2683*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <4 x float> [[TMP10]] to <16 x i8>
2684*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
2685*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
2686*207e5cccSFangrui Song // CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float>
2687*207e5cccSFangrui Song // CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x float>
2688*207e5cccSFangrui Song // CHECK:   [[VLD4_LANE:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4lane.v4f32.p0(<4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], <4 x float> [[TMP15]], i64 3, ptr %a)
2689*207e5cccSFangrui Song // CHECK:   store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4_LANE]], ptr [[__RET]]
2690*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
2691*207e5cccSFangrui Song // CHECK:   [[TMP19:%.*]] = load %struct.float32x4x4_t, ptr [[RETVAL]], align 16
2692*207e5cccSFangrui Song // CHECK:   ret %struct.float32x4x4_t [[TMP19]]
// Codegen test wrapper: forwards `a` and `b` to vld4q_lane_f32 with the
// constant lane index 3 and returns the resulting float32x4x4_t unchanged.
// The FileCheck directives preceding this function expect a call to
// @llvm.aarch64.neon.ld4lane.v4f32.p0 whose lane operand is `i64 3` and whose
// pointer operand is %a; they pin the IR for this exact body.
2693*207e5cccSFangrui Song float32x4x4_t test_vld4q_lane_f32(float32_t  *a, float32x4x4_t b) {
2694*207e5cccSFangrui Song   return vld4q_lane_f32(a, b, 3);
2695*207e5cccSFangrui Song }
2696*207e5cccSFangrui Song 
2697*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.float64x2x4_t @test_vld4q_lane_f64(ptr noundef %a, [4 x <2 x double>] alignstack(16) %b.coerce) #0 {
2698*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x2x4_t, align 16
2699*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.float64x2x4_t, align 16
2700*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.float64x2x4_t, align 16
2701*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.float64x2x4_t, align 16
2702*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[B]], i32 0, i32 0
2703*207e5cccSFangrui Song // CHECK:   store [4 x <2 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
2704*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
2705*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0
2706*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL]], i64 0, i64 0
2707*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16
2708*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <2 x double> [[TMP4]] to <16 x i8>
2709*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0
2710*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL1]], i64 0, i64 1
2711*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
2712*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <2 x double> [[TMP6]] to <16 x i8>
2713*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0
2714*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL3]], i64 0, i64 2
2715*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = load <2 x double>, ptr [[ARRAYIDX4]], align 16
2716*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <2 x double> [[TMP8]] to <16 x i8>
2717*207e5cccSFangrui Song // CHECK:   [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0
2718*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL5]], i64 0, i64 3
2719*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = load <2 x double>, ptr [[ARRAYIDX6]], align 16
2720*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <2 x double> [[TMP10]] to <16 x i8>
2721*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
2722*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x double>
2723*207e5cccSFangrui Song // CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x double>
2724*207e5cccSFangrui Song // CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <2 x double>
2725*207e5cccSFangrui Song // CHECK:   [[VLD4_LANE:%.*]] = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4lane.v2f64.p0(<2 x double> [[TMP12]], <2 x double> [[TMP13]], <2 x double> [[TMP14]], <2 x double> [[TMP15]], i64 1, ptr %a)
2726*207e5cccSFangrui Song // CHECK:   store { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4_LANE]], ptr [[__RET]]
2727*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
2728*207e5cccSFangrui Song // CHECK:   [[TMP19:%.*]] = load %struct.float64x2x4_t, ptr [[RETVAL]], align 16
2729*207e5cccSFangrui Song // CHECK:   ret %struct.float64x2x4_t [[TMP19]]
// Codegen test wrapper: forwards `a` and `b` to vld4q_lane_f64 with the
// constant lane index 1 and returns the resulting float64x2x4_t unchanged.
// The FileCheck directives preceding this function expect a call to
// @llvm.aarch64.neon.ld4lane.v2f64.p0 whose lane operand is `i64 1` and whose
// pointer operand is %a; they pin the IR for this exact body.
2730*207e5cccSFangrui Song float64x2x4_t test_vld4q_lane_f64(float64_t  *a, float64x2x4_t b) {
2731*207e5cccSFangrui Song   return vld4q_lane_f64(a, b, 1);
2732*207e5cccSFangrui Song }
2733*207e5cccSFangrui Song 
2734*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.poly8x16x4_t @test_vld4q_lane_p8(ptr noundef %a, [4 x <16 x i8>] alignstack(16) %b.coerce) #0 {
2735*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x16x4_t, align 16
2736*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16
2737*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x4_t, align 16
2738*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x4_t, align 16
2739*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[B]], i32 0, i32 0
2740*207e5cccSFangrui Song // CHECK:   store [4 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
2741*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
2742*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
2743*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
2744*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
2745*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
2746*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
2747*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
2748*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
2749*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
2750*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
2751*207e5cccSFangrui Song // CHECK:   [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
2752*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i64 0, i64 3
2753*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
2754*207e5cccSFangrui Song // CHECK:   [[VLD4_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4lane.v16i8.p0(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i64 15, ptr %a)
2755*207e5cccSFangrui Song // CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], ptr [[__RET]]
2756*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
2757*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = load %struct.poly8x16x4_t, ptr [[RETVAL]], align 16
2758*207e5cccSFangrui Song // CHECK:   ret %struct.poly8x16x4_t [[TMP10]]
// Codegen test wrapper: forwards `a` and `b` to vld4q_lane_p8 with the
// constant lane index 15 and returns the resulting poly8x16x4_t unchanged.
// The FileCheck directives preceding this function expect a call to
// @llvm.aarch64.neon.ld4lane.v16i8.p0 whose lane operand is `i64 15`; unlike
// the wider-element variants, the expected IR feeds the loaded <16 x i8>
// values to the intrinsic directly, with no intermediate bitcasts.
2759*207e5cccSFangrui Song poly8x16x4_t test_vld4q_lane_p8(poly8_t  *a, poly8x16x4_t b) {
2760*207e5cccSFangrui Song   return vld4q_lane_p8(a, b, 15);
2761*207e5cccSFangrui Song }
2762*207e5cccSFangrui Song 
2763*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.poly16x8x4_t @test_vld4q_lane_p16(ptr noundef %a, [4 x <8 x i16>] alignstack(16) %b.coerce) #0 {
2764*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x8x4_t, align 16
2765*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
2766*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16
2767*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
2768*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[B]], i32 0, i32 0
2769*207e5cccSFangrui Song // CHECK:   store [4 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
2770*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
2771*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
2772*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
2773*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
2774*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
2775*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
2776*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
2777*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
2778*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
2779*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
2780*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
2781*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
2782*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
2783*207e5cccSFangrui Song // CHECK:   [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
2784*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i64 0, i64 3
2785*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
2786*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
2787*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
2788*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
2789*207e5cccSFangrui Song // CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
2790*207e5cccSFangrui Song // CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
2791*207e5cccSFangrui Song // CHECK:   [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0(<8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i64 7, ptr %a)
2792*207e5cccSFangrui Song // CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], ptr [[__RET]]
2793*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
2794*207e5cccSFangrui Song // CHECK:   [[TMP19:%.*]] = load %struct.poly16x8x4_t, ptr [[RETVAL]], align 16
2795*207e5cccSFangrui Song // CHECK:   ret %struct.poly16x8x4_t [[TMP19]]
// Codegen test wrapper: forwards `a` and `b` to vld4q_lane_p16 with the
// constant lane index 7 and returns the resulting poly16x8x4_t unchanged.
// The FileCheck directives preceding this function expect a call to
// @llvm.aarch64.neon.ld4lane.v8i16.p0 whose lane operand is `i64 7` and whose
// pointer operand is %a; they pin the IR for this exact body.
2796*207e5cccSFangrui Song poly16x8x4_t test_vld4q_lane_p16(poly16_t  *a, poly16x8x4_t b) {
2797*207e5cccSFangrui Song   return vld4q_lane_p16(a, b, 7);
2798*207e5cccSFangrui Song }
2799*207e5cccSFangrui Song 
2800*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.poly64x2x4_t @test_vld4q_lane_p64(ptr noundef %a, [4 x <2 x i64>] alignstack(16) %b.coerce) #0 {
2801*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x2x4_t, align 16
2802*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.poly64x2x4_t, align 16
2803*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.poly64x2x4_t, align 16
2804*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.poly64x2x4_t, align 16
2805*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[B]], i32 0, i32 0
2806*207e5cccSFangrui Song // CHECK:   store [4 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
2807*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
2808*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0
2809*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
2810*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
2811*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
2812*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0
2813*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
2814*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
2815*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
2816*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0
2817*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
2818*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
2819*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
2820*207e5cccSFangrui Song // CHECK:   [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0
2821*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL5]], i64 0, i64 3
2822*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16
2823*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <2 x i64> [[TMP10]] to <16 x i8>
2824*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
2825*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
2826*207e5cccSFangrui Song // CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
2827*207e5cccSFangrui Song // CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <2 x i64>
2828*207e5cccSFangrui Song // CHECK:   [[VLD4_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4lane.v2i64.p0(<2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], <2 x i64> [[TMP15]], i64 1, ptr %a)
2829*207e5cccSFangrui Song // CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], ptr [[__RET]]
2830*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
2831*207e5cccSFangrui Song // CHECK:   [[TMP19:%.*]] = load %struct.poly64x2x4_t, ptr [[RETVAL]], align 16
2832*207e5cccSFangrui Song // CHECK:   ret %struct.poly64x2x4_t [[TMP19]]
// Codegen test wrapper: forwards `a` and `b` to vld4q_lane_p64 with the
// constant lane index 1 and returns the resulting poly64x2x4_t unchanged.
// The FileCheck directives preceding this function expect a call to
// @llvm.aarch64.neon.ld4lane.v2i64.p0 whose lane operand is `i64 1` and whose
// pointer operand is %a; they pin the IR for this exact body.
2833*207e5cccSFangrui Song poly64x2x4_t test_vld4q_lane_p64(poly64_t  *a, poly64x2x4_t b) {
2834*207e5cccSFangrui Song   return vld4q_lane_p64(a, b, 1);
2835*207e5cccSFangrui Song }
2836*207e5cccSFangrui Song 
2837*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.uint8x8x4_t @test_vld4_lane_u8(ptr noundef %a, [4 x <8 x i8>] alignstack(8) %b.coerce) #0 {
2838*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x8x4_t, align 8
2839*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
2840*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8
2841*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
2842*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[B]], i32 0, i32 0
2843*207e5cccSFangrui Song // CHECK:   store [4 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
2844*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
2845*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
2846*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
2847*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
2848*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
2849*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
2850*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
2851*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
2852*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
2853*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
2854*207e5cccSFangrui Song // CHECK:   [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
2855*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i64 0, i64 3
2856*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
2857*207e5cccSFangrui Song // CHECK:   [[VLD4_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i64 7, ptr %a)
2858*207e5cccSFangrui Song // CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], ptr [[__RET]]
2859*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
2860*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = load %struct.uint8x8x4_t, ptr [[RETVAL]], align 8
2861*207e5cccSFangrui Song // CHECK:   ret %struct.uint8x8x4_t [[TMP10]]
// Codegen test wrapper: forwards `a` and `b` to vld4_lane_u8 with the
// constant lane index 7 and returns the resulting uint8x8x4_t unchanged.
// The FileCheck directives preceding this function expect a call to
// @llvm.aarch64.neon.ld4lane.v8i8.p0 whose lane operand is `i64 7`, with
// 8-byte-aligned allocas and 32-byte memcpys for the four-vector struct.
2862*207e5cccSFangrui Song uint8x8x4_t test_vld4_lane_u8(uint8_t  *a, uint8x8x4_t b) {
2863*207e5cccSFangrui Song   return vld4_lane_u8(a, b, 7);
2864*207e5cccSFangrui Song }
2865*207e5cccSFangrui Song 
2866*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.uint16x4x4_t @test_vld4_lane_u16(ptr noundef %a, [4 x <4 x i16>] alignstack(8) %b.coerce) #0 {
2867*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x4x4_t, align 8
2868*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
2869*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8
2870*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
2871*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[B]], i32 0, i32 0
2872*207e5cccSFangrui Song // CHECK:   store [4 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
2873*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
2874*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
2875*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
2876*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
2877*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
2878*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
2879*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
2880*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
2881*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
2882*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
2883*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
2884*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
2885*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
2886*207e5cccSFangrui Song // CHECK:   [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
2887*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i64 0, i64 3
2888*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
2889*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
2890*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
2891*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
2892*207e5cccSFangrui Song // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
2893*207e5cccSFangrui Song // CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
2894*207e5cccSFangrui Song // CHECK:   [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0(<4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i64 3, ptr %a)
2895*207e5cccSFangrui Song // CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], ptr [[__RET]]
2896*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
2897*207e5cccSFangrui Song // CHECK:   [[TMP19:%.*]] = load %struct.uint16x4x4_t, ptr [[RETVAL]], align 8
2898*207e5cccSFangrui Song // CHECK:   ret %struct.uint16x4x4_t [[TMP19]]
// Codegen test wrapper: forwards `a` and `b` to vld4_lane_u16 with the
// constant lane index 3 and returns the resulting uint16x4x4_t unchanged.
// The FileCheck directives preceding this function expect a call to
// @llvm.aarch64.neon.ld4lane.v4i16.p0 whose lane operand is `i64 3` and whose
// pointer operand is %a; they pin the IR for this exact body.
2899*207e5cccSFangrui Song uint16x4x4_t test_vld4_lane_u16(uint16_t  *a, uint16x4x4_t b) {
2900*207e5cccSFangrui Song   return vld4_lane_u16(a, b, 3);
2901*207e5cccSFangrui Song }
2902*207e5cccSFangrui Song 
2903*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.uint32x2x4_t @test_vld4_lane_u32(ptr noundef %a, [4 x <2 x i32>] alignstack(8) %b.coerce) #0 {
2904*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x2x4_t, align 8
2905*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
2906*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8
2907*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
2908*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[B]], i32 0, i32 0
2909*207e5cccSFangrui Song // CHECK:   store [4 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
2910*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
2911*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
2912*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
2913*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
2914*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
2915*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
2916*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
2917*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
2918*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
2919*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
2920*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2
2921*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
2922*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
2923*207e5cccSFangrui Song // CHECK:   [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
2924*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i64 0, i64 3
2925*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
2926*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8>
2927*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
2928*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
2929*207e5cccSFangrui Song // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
2930*207e5cccSFangrui Song // CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32>
2931*207e5cccSFangrui Song // CHECK:   [[VLD4_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4lane.v2i32.p0(<2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i64 1, ptr %a)
2932*207e5cccSFangrui Song // CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE]], ptr [[__RET]]
2933*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
2934*207e5cccSFangrui Song // CHECK:   [[TMP19:%.*]] = load %struct.uint32x2x4_t, ptr [[RETVAL]], align 8
2935*207e5cccSFangrui Song // CHECK:   ret %struct.uint32x2x4_t [[TMP19]]
// Calls vld4_lane_u32 on `a` and `b` with constant lane index 1 and returns the
// resulting uint32x2x4_t. The FileCheck patterns above expect a call to
// @llvm.aarch64.neon.ld4lane.v2i32.p0 with lane operand `i64 1` and pointer %a.
// They also match the alloca/memcpy sequence around that call, so edits to this
// body likely require regenerating them.
2936*207e5cccSFangrui Song uint32x2x4_t test_vld4_lane_u32(uint32_t  *a, uint32x2x4_t b) {
2937*207e5cccSFangrui Song   return vld4_lane_u32(a, b, 1);
2938*207e5cccSFangrui Song }
2939*207e5cccSFangrui Song 
2940*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.uint64x1x4_t @test_vld4_lane_u64(ptr noundef %a, [4 x <1 x i64>] alignstack(8) %b.coerce) #0 {
2941*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x1x4_t, align 8
2942*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.uint64x1x4_t, align 8
2943*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8
2944*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.uint64x1x4_t, align 8
2945*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[B]], i32 0, i32 0
2946*207e5cccSFangrui Song // CHECK:   store [4 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
2947*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
2948*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
2949*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
2950*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
2951*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
2952*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
2953*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
2954*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
2955*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
2956*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
2957*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
2958*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
2959*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
2960*207e5cccSFangrui Song // CHECK:   [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
2961*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], i64 0, i64 3
2962*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
2963*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
2964*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
2965*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
2966*207e5cccSFangrui Song // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
2967*207e5cccSFangrui Song // CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
2968*207e5cccSFangrui Song // CHECK:   [[VLD4_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4lane.v1i64.p0(<1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], <1 x i64> [[TMP15]], i64 0, ptr %a)
2969*207e5cccSFangrui Song // CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], ptr [[__RET]]
2970*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
2971*207e5cccSFangrui Song // CHECK:   [[TMP19:%.*]] = load %struct.uint64x1x4_t, ptr [[RETVAL]], align 8
2972*207e5cccSFangrui Song // CHECK:   ret %struct.uint64x1x4_t [[TMP19]]
// Calls vld4_lane_u64 on `a` and `b` with constant lane index 0 and returns the
// resulting uint64x1x4_t. The FileCheck patterns above expect a call to
// @llvm.aarch64.neon.ld4lane.v1i64.p0 with lane operand `i64 0` and pointer %a.
// They also match the alloca/memcpy sequence around that call, so edits to this
// body likely require regenerating them.
2973*207e5cccSFangrui Song uint64x1x4_t test_vld4_lane_u64(uint64_t  *a, uint64x1x4_t b) {
2974*207e5cccSFangrui Song   return vld4_lane_u64(a, b, 0);
2975*207e5cccSFangrui Song }
2976*207e5cccSFangrui Song 
2977*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.int8x8x4_t @test_vld4_lane_s8(ptr noundef %a, [4 x <8 x i8>] alignstack(8) %b.coerce) #0 {
2978*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x8x4_t, align 8
2979*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
2980*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8
2981*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
2982*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[B]], i32 0, i32 0
2983*207e5cccSFangrui Song // CHECK:   store [4 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
2984*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
2985*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
2986*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
2987*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
2988*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
2989*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
2990*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
2991*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
2992*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
2993*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
2994*207e5cccSFangrui Song // CHECK:   [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
2995*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i64 0, i64 3
2996*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
2997*207e5cccSFangrui Song // CHECK:   [[VLD4_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i64 7, ptr %a)
2998*207e5cccSFangrui Song // CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], ptr [[__RET]]
2999*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
3000*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = load %struct.int8x8x4_t, ptr [[RETVAL]], align 8
3001*207e5cccSFangrui Song // CHECK:   ret %struct.int8x8x4_t [[TMP10]]
// Calls vld4_lane_s8 on `a` and `b` with constant lane index 7 and returns the
// resulting int8x8x4_t. The FileCheck patterns above expect a call to
// @llvm.aarch64.neon.ld4lane.v8i8.p0 with lane operand `i64 7` and pointer %a.
// Unlike the wider-element variants, the expected IR feeds the loaded <8 x i8>
// vectors to the intrinsic directly, with no bitcast round-trip.
3002*207e5cccSFangrui Song int8x8x4_t test_vld4_lane_s8(int8_t  *a, int8x8x4_t b) {
3003*207e5cccSFangrui Song   return vld4_lane_s8(a, b, 7);
3004*207e5cccSFangrui Song }
3005*207e5cccSFangrui Song 
3006*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.int16x4x4_t @test_vld4_lane_s16(ptr noundef %a, [4 x <4 x i16>] alignstack(8) %b.coerce) #0 {
3007*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x4x4_t, align 8
3008*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
3009*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8
3010*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
3011*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[B]], i32 0, i32 0
3012*207e5cccSFangrui Song // CHECK:   store [4 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
3013*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
3014*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
3015*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
3016*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
3017*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
3018*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
3019*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
3020*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
3021*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
3022*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
3023*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
3024*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
3025*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
3026*207e5cccSFangrui Song // CHECK:   [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
3027*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i64 0, i64 3
3028*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
3029*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
3030*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
3031*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
3032*207e5cccSFangrui Song // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
3033*207e5cccSFangrui Song // CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
3034*207e5cccSFangrui Song // CHECK:   [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0(<4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i64 3, ptr %a)
3035*207e5cccSFangrui Song // CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], ptr [[__RET]]
3036*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
3037*207e5cccSFangrui Song // CHECK:   [[TMP19:%.*]] = load %struct.int16x4x4_t, ptr [[RETVAL]], align 8
3038*207e5cccSFangrui Song // CHECK:   ret %struct.int16x4x4_t [[TMP19]]
// Calls vld4_lane_s16 on `a` and `b` with constant lane index 3 and returns the
// resulting int16x4x4_t. The FileCheck patterns above expect a call to
// @llvm.aarch64.neon.ld4lane.v4i16.p0 with lane operand `i64 3` and pointer %a.
// They also match the alloca/memcpy sequence around that call, so edits to this
// body likely require regenerating them.
3039*207e5cccSFangrui Song int16x4x4_t test_vld4_lane_s16(int16_t  *a, int16x4x4_t b) {
3040*207e5cccSFangrui Song   return vld4_lane_s16(a, b, 3);
3041*207e5cccSFangrui Song }
3042*207e5cccSFangrui Song 
3043*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.int32x2x4_t @test_vld4_lane_s32(ptr noundef %a, [4 x <2 x i32>] alignstack(8) %b.coerce) #0 {
3044*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x2x4_t, align 8
3045*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
3046*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8
3047*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
3048*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[B]], i32 0, i32 0
3049*207e5cccSFangrui Song // CHECK:   store [4 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
3050*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
3051*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
3052*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
3053*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
3054*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
3055*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
3056*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
3057*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
3058*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
3059*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
3060*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2
3061*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
3062*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
3063*207e5cccSFangrui Song // CHECK:   [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
3064*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i64 0, i64 3
3065*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
3066*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8>
3067*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
3068*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
3069*207e5cccSFangrui Song // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
3070*207e5cccSFangrui Song // CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32>
3071*207e5cccSFangrui Song // CHECK:   [[VLD4_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4lane.v2i32.p0(<2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i64 1, ptr %a)
3072*207e5cccSFangrui Song // CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE]], ptr [[__RET]]
3073*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
3074*207e5cccSFangrui Song // CHECK:   [[TMP19:%.*]] = load %struct.int32x2x4_t, ptr [[RETVAL]], align 8
3075*207e5cccSFangrui Song // CHECK:   ret %struct.int32x2x4_t [[TMP19]]
// Calls vld4_lane_s32 on `a` and `b` with constant lane index 1 and returns the
// resulting int32x2x4_t. The FileCheck patterns above expect a call to
// @llvm.aarch64.neon.ld4lane.v2i32.p0 with lane operand `i64 1` and pointer %a.
// They also match the alloca/memcpy sequence around that call, so edits to this
// body likely require regenerating them.
3076*207e5cccSFangrui Song int32x2x4_t test_vld4_lane_s32(int32_t  *a, int32x2x4_t b) {
3077*207e5cccSFangrui Song   return vld4_lane_s32(a, b, 1);
3078*207e5cccSFangrui Song }
3079*207e5cccSFangrui Song 
3080*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.int64x1x4_t @test_vld4_lane_s64(ptr noundef %a, [4 x <1 x i64>] alignstack(8) %b.coerce) #0 {
3081*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x1x4_t, align 8
3082*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.int64x1x4_t, align 8
3083*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8
3084*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.int64x1x4_t, align 8
3085*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[B]], i32 0, i32 0
3086*207e5cccSFangrui Song // CHECK:   store [4 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
3087*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
3088*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
3089*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
3090*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
3091*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
3092*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
3093*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
3094*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
3095*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
3096*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
3097*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
3098*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
3099*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
3100*207e5cccSFangrui Song // CHECK:   [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
3101*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], i64 0, i64 3
3102*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
3103*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
3104*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
3105*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
3106*207e5cccSFangrui Song // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
3107*207e5cccSFangrui Song // CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
3108*207e5cccSFangrui Song // CHECK:   [[VLD4_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4lane.v1i64.p0(<1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], <1 x i64> [[TMP15]], i64 0, ptr %a)
3109*207e5cccSFangrui Song // CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], ptr [[__RET]]
3110*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
3111*207e5cccSFangrui Song // CHECK:   [[TMP19:%.*]] = load %struct.int64x1x4_t, ptr [[RETVAL]], align 8
3112*207e5cccSFangrui Song // CHECK:   ret %struct.int64x1x4_t [[TMP19]]
// Calls vld4_lane_s64 on `a` and `b` with constant lane index 0 and returns the
// resulting int64x1x4_t. The FileCheck patterns above expect a call to
// @llvm.aarch64.neon.ld4lane.v1i64.p0 with lane operand `i64 0` and pointer %a.
// They also match the alloca/memcpy sequence around that call, so edits to this
// body likely require regenerating them.
3113*207e5cccSFangrui Song int64x1x4_t test_vld4_lane_s64(int64_t  *a, int64x1x4_t b) {
3114*207e5cccSFangrui Song   return vld4_lane_s64(a, b, 0);
3115*207e5cccSFangrui Song }
3116*207e5cccSFangrui Song 
3117*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.float16x4x4_t @test_vld4_lane_f16(ptr noundef %a, [4 x <4 x half>] alignstack(8) %b.coerce) #0 {
3118*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x4x4_t, align 8
3119*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
3120*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
3121*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
3122*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[B]], i32 0, i32 0
3123*207e5cccSFangrui Song // CHECK:   store [4 x <4 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
3124*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
3125*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
3126*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL]], i64 0, i64 0
3127*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
3128*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
3129*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
3130*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL1]], i64 0, i64 1
3131*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
3132*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
3133*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
3134*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL3]], i64 0, i64 2
3135*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
3136*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8>
3137*207e5cccSFangrui Song // CHECK:   [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
3138*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL5]], i64 0, i64 3
3139*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = load <4 x half>, ptr [[ARRAYIDX6]], align 8
3140*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <4 x half> [[TMP10]] to <8 x i8>
3141*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
3142*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
3143*207e5cccSFangrui Song // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x half>
3144*207e5cccSFangrui Song // CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x half>
3145*207e5cccSFangrui Song // CHECK:   [[VLD4_LANE:%.*]] = call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4lane.v4f16.p0(<4 x half> [[TMP12]], <4 x half> [[TMP13]], <4 x half> [[TMP14]], <4 x half> [[TMP15]], i64 3, ptr %a)
3146*207e5cccSFangrui Song // CHECK:   store { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4_LANE]], ptr [[__RET]]
3147*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
3148*207e5cccSFangrui Song // CHECK:   [[TMP19:%.*]] = load %struct.float16x4x4_t, ptr [[RETVAL]], align 8
3149*207e5cccSFangrui Song // CHECK:   ret %struct.float16x4x4_t [[TMP19]]
// Calls vld4_lane_f16 on `a` and `b` with constant lane index 3 and returns the
// resulting float16x4x4_t. The FileCheck patterns above expect a call to
// @llvm.aarch64.neon.ld4lane.v4f16.p0 with lane operand `i64 3` and pointer %a.
// They also match the alloca/memcpy sequence around that call, so edits to this
// body likely require regenerating them.
3150*207e5cccSFangrui Song float16x4x4_t test_vld4_lane_f16(float16_t  *a, float16x4x4_t b) {
3151*207e5cccSFangrui Song   return vld4_lane_f16(a, b, 3);
3152*207e5cccSFangrui Song }
3153*207e5cccSFangrui Song 
3154*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.float32x2x4_t @test_vld4_lane_f32(ptr noundef %a, [4 x <2 x float>] alignstack(8) %b.coerce) #0 {
3155*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x2x4_t, align 8
3156*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
3157*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8
3158*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
3159*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[B]], i32 0, i32 0
3160*207e5cccSFangrui Song // CHECK:   store [4 x <2 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
3161*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
3162*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
3163*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL]], i64 0, i64 0
3164*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
3165*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
3166*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
3167*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL1]], i64 0, i64 1
3168*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
3169*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
3170*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
3171*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL3]], i64 0, i64 2
3172*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
3173*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8>
3174*207e5cccSFangrui Song // CHECK:   [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
3175*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL5]], i64 0, i64 3
3176*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = load <2 x float>, ptr [[ARRAYIDX6]], align 8
3177*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <2 x float> [[TMP10]] to <8 x i8>
3178*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
3179*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
3180*207e5cccSFangrui Song // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float>
3181*207e5cccSFangrui Song // CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x float>
3182*207e5cccSFangrui Song // CHECK:   [[VLD4_LANE:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4lane.v2f32.p0(<2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], <2 x float> [[TMP15]], i64 1, ptr %a)
3183*207e5cccSFangrui Song // CHECK:   store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_LANE]], ptr [[__RET]]
3184*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
3185*207e5cccSFangrui Song // CHECK:   [[TMP19:%.*]] = load %struct.float32x2x4_t, ptr [[RETVAL]], align 8
3186*207e5cccSFangrui Song // CHECK:   ret %struct.float32x2x4_t [[TMP19]]
// Calls vld4_lane_f32 on `a` and `b` with constant lane index 1 and returns the
// resulting float32x2x4_t. The FileCheck patterns above expect a call to
// @llvm.aarch64.neon.ld4lane.v2f32.p0 with lane operand `i64 1` and pointer %a.
// They also match the alloca/memcpy sequence around that call, so edits to this
// body likely require regenerating them.
3187*207e5cccSFangrui Song float32x2x4_t test_vld4_lane_f32(float32_t  *a, float32x2x4_t b) {
3188*207e5cccSFangrui Song   return vld4_lane_f32(a, b, 1);
3189*207e5cccSFangrui Song }
3190*207e5cccSFangrui Song 
3191*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.float64x1x4_t @test_vld4_lane_f64(ptr noundef %a, [4 x <1 x double>] alignstack(8) %b.coerce) #0 {
3192*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x1x4_t, align 8
3193*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.float64x1x4_t, align 8
3194*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.float64x1x4_t, align 8
3195*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.float64x1x4_t, align 8
3196*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[B]], i32 0, i32 0
3197*207e5cccSFangrui Song // CHECK:   store [4 x <1 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
3198*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
3199*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0
3200*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL]], i64 0, i64 0
3201*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8
3202*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <1 x double> [[TMP4]] to <8 x i8>
3203*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0
3204*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL1]], i64 0, i64 1
3205*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
3206*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <1 x double> [[TMP6]] to <8 x i8>
3207*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0
3208*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL3]], i64 0, i64 2
3209*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = load <1 x double>, ptr [[ARRAYIDX4]], align 8
3210*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <1 x double> [[TMP8]] to <8 x i8>
3211*207e5cccSFangrui Song // CHECK:   [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0
3212*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL5]], i64 0, i64 3
3213*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = load <1 x double>, ptr [[ARRAYIDX6]], align 8
3214*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <1 x double> [[TMP10]] to <8 x i8>
3215*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
3216*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x double>
3217*207e5cccSFangrui Song // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x double>
3218*207e5cccSFangrui Song // CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x double>
3219*207e5cccSFangrui Song // CHECK:   [[VLD4_LANE:%.*]] = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4lane.v1f64.p0(<1 x double> [[TMP12]], <1 x double> [[TMP13]], <1 x double> [[TMP14]], <1 x double> [[TMP15]], i64 0, ptr %a)
3220*207e5cccSFangrui Song // CHECK:   store { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4_LANE]], ptr [[__RET]]
3221*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
3222*207e5cccSFangrui Song // CHECK:   [[TMP19:%.*]] = load %struct.float64x1x4_t, ptr [[RETVAL]], align 8
3223*207e5cccSFangrui Song // CHECK:   ret %struct.float64x1x4_t [[TMP19]]
// Calls vld4_lane_f64 on `a` and `b` with constant lane index 0 and returns the
// resulting float64x1x4_t. The FileCheck patterns above expect a call to
// @llvm.aarch64.neon.ld4lane.v1f64.p0 with lane operand `i64 0` and pointer %a.
// They also match the alloca/memcpy sequence around that call, so edits to this
// body likely require regenerating them.
3224*207e5cccSFangrui Song float64x1x4_t test_vld4_lane_f64(float64_t  *a, float64x1x4_t b) {
3225*207e5cccSFangrui Song   return vld4_lane_f64(a, b, 0);
3226*207e5cccSFangrui Song }
3227*207e5cccSFangrui Song 
3228*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.poly8x8x4_t @test_vld4_lane_p8(ptr noundef %a, [4 x <8 x i8>] alignstack(8) %b.coerce) #0 {
3229*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x8x4_t, align 8
3230*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
3231*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8
3232*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
3233*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[B]], i32 0, i32 0
3234*207e5cccSFangrui Song // CHECK:   store [4 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
3235*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
3236*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
3237*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
3238*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
3239*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
3240*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
3241*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
3242*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
3243*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
3244*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
3245*207e5cccSFangrui Song // CHECK:   [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
3246*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i64 0, i64 3
3247*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
3248*207e5cccSFangrui Song // CHECK:   [[VLD4_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i64 7, ptr %a)
3249*207e5cccSFangrui Song // CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], ptr [[__RET]]
3250*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
3251*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = load %struct.poly8x8x4_t, ptr [[RETVAL]], align 8
3252*207e5cccSFangrui Song // CHECK:   ret %struct.poly8x8x4_t [[TMP10]]
// Exercises vld4_lane_p8 with lane index 7. The expected IR above passes the
// four <8 x i8> members of b directly (no bitcasts) to
// @llvm.aarch64.neon.ld4lane.v8i8.p0 with lane operand i64 7.
3253*207e5cccSFangrui Song poly8x8x4_t test_vld4_lane_p8(poly8_t  *a, poly8x8x4_t b) {
3254*207e5cccSFangrui Song   return vld4_lane_p8(a, b, 7);
3255*207e5cccSFangrui Song }
3256*207e5cccSFangrui Song 
3257*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.poly16x4x4_t @test_vld4_lane_p16(ptr noundef %a, [4 x <4 x i16>] alignstack(8) %b.coerce) #0 {
3258*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x4x4_t, align 8
3259*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
3260*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8
3261*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
3262*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[B]], i32 0, i32 0
3263*207e5cccSFangrui Song // CHECK:   store [4 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
3264*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
3265*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
3266*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
3267*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
3268*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
3269*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
3270*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
3271*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
3272*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
3273*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
3274*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
3275*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
3276*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
3277*207e5cccSFangrui Song // CHECK:   [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
3278*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i64 0, i64 3
3279*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
3280*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
3281*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
3282*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
3283*207e5cccSFangrui Song // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
3284*207e5cccSFangrui Song // CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
3285*207e5cccSFangrui Song // CHECK:   [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0(<4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i64 3, ptr %a)
3286*207e5cccSFangrui Song // CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], ptr [[__RET]]
3287*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
3288*207e5cccSFangrui Song // CHECK:   [[TMP19:%.*]] = load %struct.poly16x4x4_t, ptr [[RETVAL]], align 8
3289*207e5cccSFangrui Song // CHECK:   ret %struct.poly16x4x4_t [[TMP19]]
// Exercises vld4_lane_p16 with lane index 3. In the expected IR above each
// <4 x i16> member of b is bitcast to <8 x i8> and back before the call to
// @llvm.aarch64.neon.ld4lane.v4i16.p0 with lane operand i64 3.
3290*207e5cccSFangrui Song poly16x4x4_t test_vld4_lane_p16(poly16_t  *a, poly16x4x4_t b) {
3291*207e5cccSFangrui Song   return vld4_lane_p16(a, b, 3);
3292*207e5cccSFangrui Song }
3293*207e5cccSFangrui Song 
3294*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} %struct.poly64x1x4_t @test_vld4_lane_p64(ptr noundef %a, [4 x <1 x i64>] alignstack(8) %b.coerce) #0 {
3295*207e5cccSFangrui Song // CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x1x4_t, align 8
3296*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.poly64x1x4_t, align 8
3297*207e5cccSFangrui Song // CHECK:   [[__RET:%.*]] = alloca %struct.poly64x1x4_t, align 8
3298*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.poly64x1x4_t, align 8
3299*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[B]], i32 0, i32 0
3300*207e5cccSFangrui Song // CHECK:   store [4 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
3301*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
3302*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0
3303*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
3304*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
3305*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
3306*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0
3307*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
3308*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
3309*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
3310*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0
3311*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
3312*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
3313*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
3314*207e5cccSFangrui Song // CHECK:   [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0
3315*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], i64 0, i64 3
3316*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
3317*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
3318*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
3319*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
3320*207e5cccSFangrui Song // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
3321*207e5cccSFangrui Song // CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
3322*207e5cccSFangrui Song // CHECK:   [[VLD4_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4lane.v1i64.p0(<1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], <1 x i64> [[TMP15]], i64 0, ptr %a)
3323*207e5cccSFangrui Song // CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], ptr [[__RET]]
3324*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
3325*207e5cccSFangrui Song // CHECK:   [[TMP19:%.*]] = load %struct.poly64x1x4_t, ptr [[RETVAL]], align 8
3326*207e5cccSFangrui Song // CHECK:   ret %struct.poly64x1x4_t [[TMP19]]
// Exercises vld4_lane_p64 with lane index 0. In the expected IR above each
// <1 x i64> member of b is bitcast to <8 x i8> and back before the call to
// @llvm.aarch64.neon.ld4lane.v1i64.p0 with lane operand i64 0.
3327*207e5cccSFangrui Song poly64x1x4_t test_vld4_lane_p64(poly64_t  *a, poly64x1x4_t b) {
3328*207e5cccSFangrui Song   return vld4_lane_p64(a, b, 0);
3329*207e5cccSFangrui Song }
3330*207e5cccSFangrui Song 
3331*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst1q_lane_u8(ptr noundef %a, <16 x i8> noundef %b) #0 {
3332*207e5cccSFangrui Song // CHECK:   [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15
3333*207e5cccSFangrui Song // CHECK:   store i8 [[TMP0]], ptr %a
3334*207e5cccSFangrui Song // CHECK:   ret void
// Exercises vst1q_lane_u8 with lane index 15. The expected IR above is an
// extractelement of <16 x i8> %b at index 15 followed by an i8 store to %a.
3335*207e5cccSFangrui Song void test_vst1q_lane_u8(uint8_t  *a, uint8x16_t b) {
3336*207e5cccSFangrui Song   vst1q_lane_u8(a, b, 15);
3337*207e5cccSFangrui Song }
3338*207e5cccSFangrui Song 
3339*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst1q_lane_u16(ptr noundef %a, <8 x i16> noundef %b) #0 {
3340*207e5cccSFangrui Song // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
3341*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
3342*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
3343*207e5cccSFangrui Song // CHECK:   store i16 [[TMP3]], ptr %a
3344*207e5cccSFangrui Song // CHECK:   ret void
// Exercises vst1q_lane_u16 with lane index 7. The expected IR above bitcasts
// %b to <16 x i8> and back to <8 x i16>, extracts element 7, and stores the
// i16 to %a.
3345*207e5cccSFangrui Song void test_vst1q_lane_u16(uint16_t  *a, uint16x8_t b) {
3346*207e5cccSFangrui Song   vst1q_lane_u16(a, b, 7);
3347*207e5cccSFangrui Song }
3348*207e5cccSFangrui Song 
3349*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst1q_lane_u32(ptr noundef %a, <4 x i32> noundef %b) #0 {
3350*207e5cccSFangrui Song // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
3351*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
3352*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
3353*207e5cccSFangrui Song // CHECK:   store i32 [[TMP3]], ptr %a
3354*207e5cccSFangrui Song // CHECK:   ret void
// Exercises vst1q_lane_u32 with lane index 3. The expected IR above bitcasts
// %b to <16 x i8> and back to <4 x i32>, extracts element 3, and stores the
// i32 to %a.
3355*207e5cccSFangrui Song void test_vst1q_lane_u32(uint32_t  *a, uint32x4_t b) {
3356*207e5cccSFangrui Song   vst1q_lane_u32(a, b, 3);
3357*207e5cccSFangrui Song }
3358*207e5cccSFangrui Song 
3359*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst1q_lane_u64(ptr noundef %a, <2 x i64> noundef %b) #0 {
3360*207e5cccSFangrui Song // CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
3361*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
3362*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
3363*207e5cccSFangrui Song // CHECK:   store i64 [[TMP3]], ptr %a
3364*207e5cccSFangrui Song // CHECK:   ret void
// Exercises vst1q_lane_u64 with lane index 1. The expected IR above bitcasts
// %b to <16 x i8> and back to <2 x i64>, extracts element 1, and stores the
// i64 to %a.
3365*207e5cccSFangrui Song void test_vst1q_lane_u64(uint64_t  *a, uint64x2_t b) {
3366*207e5cccSFangrui Song   vst1q_lane_u64(a, b, 1);
3367*207e5cccSFangrui Song }
3368*207e5cccSFangrui Song 
3369*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst1q_lane_s8(ptr noundef %a, <16 x i8> noundef %b) #0 {
3370*207e5cccSFangrui Song // CHECK:   [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15
3371*207e5cccSFangrui Song // CHECK:   store i8 [[TMP0]], ptr %a
3372*207e5cccSFangrui Song // CHECK:   ret void
// Signed 8-bit variant: vst1q_lane_s8 with lane index 15. The expected IR
// above extracts element 15 of <16 x i8> %b and stores the i8 to %a.
3373*207e5cccSFangrui Song void test_vst1q_lane_s8(int8_t  *a, int8x16_t b) {
3374*207e5cccSFangrui Song   vst1q_lane_s8(a, b, 15);
3375*207e5cccSFangrui Song }
3376*207e5cccSFangrui Song 
3377*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst1q_lane_s16(ptr noundef %a, <8 x i16> noundef %b) #0 {
3378*207e5cccSFangrui Song // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
3379*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
3380*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
3381*207e5cccSFangrui Song // CHECK:   store i16 [[TMP3]], ptr %a
3382*207e5cccSFangrui Song // CHECK:   ret void
// Signed 16-bit variant: vst1q_lane_s16 with lane index 7. The expected IR
// above round-trips %b through <16 x i8>, extracts element 7 of <8 x i16>,
// and stores the i16 to %a.
3383*207e5cccSFangrui Song void test_vst1q_lane_s16(int16_t  *a, int16x8_t b) {
3384*207e5cccSFangrui Song   vst1q_lane_s16(a, b, 7);
3385*207e5cccSFangrui Song }
3386*207e5cccSFangrui Song 
3387*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst1q_lane_s32(ptr noundef %a, <4 x i32> noundef %b) #0 {
3388*207e5cccSFangrui Song // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
3389*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
3390*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
3391*207e5cccSFangrui Song // CHECK:   store i32 [[TMP3]], ptr %a
3392*207e5cccSFangrui Song // CHECK:   ret void
// Signed 32-bit variant: vst1q_lane_s32 with lane index 3. The expected IR
// above round-trips %b through <16 x i8>, extracts element 3 of <4 x i32>,
// and stores the i32 to %a.
3393*207e5cccSFangrui Song void test_vst1q_lane_s32(int32_t  *a, int32x4_t b) {
3394*207e5cccSFangrui Song   vst1q_lane_s32(a, b, 3);
3395*207e5cccSFangrui Song }
3396*207e5cccSFangrui Song 
3397*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst1q_lane_s64(ptr noundef %a, <2 x i64> noundef %b) #0 {
3398*207e5cccSFangrui Song // CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
3399*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
3400*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
3401*207e5cccSFangrui Song // CHECK:   store i64 [[TMP3]], ptr %a
3402*207e5cccSFangrui Song // CHECK:   ret void
// Signed 64-bit variant: vst1q_lane_s64 with lane index 1. The expected IR
// above round-trips %b through <16 x i8>, extracts element 1 of <2 x i64>,
// and stores the i64 to %a.
3403*207e5cccSFangrui Song void test_vst1q_lane_s64(int64_t  *a, int64x2_t b) {
3404*207e5cccSFangrui Song   vst1q_lane_s64(a, b, 1);
3405*207e5cccSFangrui Song }
3406*207e5cccSFangrui Song 
3407*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst1q_lane_f16(ptr noundef %a, <8 x half> noundef %b) #0 {
3408*207e5cccSFangrui Song // CHECK:   [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
3409*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
3410*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = extractelement <8 x half> [[TMP2]], i32 7
3411*207e5cccSFangrui Song // CHECK:   store half [[TMP3]], ptr %a
3412*207e5cccSFangrui Song // CHECK:   ret void
// Half-precision variant: vst1q_lane_f16 with lane index 7. The expected IR
// above round-trips %b through <16 x i8>, extracts element 7 of <8 x half>,
// and stores the half to %a.
3413*207e5cccSFangrui Song void test_vst1q_lane_f16(float16_t  *a, float16x8_t b) {
3414*207e5cccSFangrui Song   vst1q_lane_f16(a, b, 7);
3415*207e5cccSFangrui Song }
3416*207e5cccSFangrui Song 
3417*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst1q_lane_f32(ptr noundef %a, <4 x float> noundef %b) #0 {
3418*207e5cccSFangrui Song // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
3419*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
3420*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
3421*207e5cccSFangrui Song // CHECK:   store float [[TMP3]], ptr %a
3422*207e5cccSFangrui Song // CHECK:   ret void
// Single-precision variant: vst1q_lane_f32 with lane index 3. The expected
// IR above round-trips %b through <16 x i8>, extracts element 3 of
// <4 x float>, and stores the float to %a.
3423*207e5cccSFangrui Song void test_vst1q_lane_f32(float32_t  *a, float32x4_t b) {
3424*207e5cccSFangrui Song   vst1q_lane_f32(a, b, 3);
3425*207e5cccSFangrui Song }
3426*207e5cccSFangrui Song 
3427*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst1q_lane_f64(ptr noundef %a, <2 x double> noundef %b) #0 {
3428*207e5cccSFangrui Song // CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
3429*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
3430*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i32 1
3431*207e5cccSFangrui Song // CHECK:   store double [[TMP3]], ptr %a
3432*207e5cccSFangrui Song // CHECK:   ret void
// Double-precision variant: vst1q_lane_f64 with lane index 1. The expected
// IR above round-trips %b through <16 x i8>, extracts element 1 of
// <2 x double>, and stores the double to %a.
3433*207e5cccSFangrui Song void test_vst1q_lane_f64(float64_t  *a, float64x2_t b) {
3434*207e5cccSFangrui Song   vst1q_lane_f64(a, b, 1);
3435*207e5cccSFangrui Song }
3436*207e5cccSFangrui Song 
3437*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst1q_lane_p8(ptr noundef %a, <16 x i8> noundef %b) #0 {
3438*207e5cccSFangrui Song // CHECK:   [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15
3439*207e5cccSFangrui Song // CHECK:   store i8 [[TMP0]], ptr %a
3440*207e5cccSFangrui Song // CHECK:   ret void
// Polynomial 8-bit variant: vst1q_lane_p8 with lane index 15. The expected
// IR above extracts element 15 of <16 x i8> %b and stores the i8 to %a.
3441*207e5cccSFangrui Song void test_vst1q_lane_p8(poly8_t  *a, poly8x16_t b) {
3442*207e5cccSFangrui Song   vst1q_lane_p8(a, b, 15);
3443*207e5cccSFangrui Song }
3444*207e5cccSFangrui Song 
3445*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst1q_lane_p16(ptr noundef %a, <8 x i16> noundef %b) #0 {
3446*207e5cccSFangrui Song // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
3447*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
3448*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
3449*207e5cccSFangrui Song // CHECK:   store i16 [[TMP3]], ptr %a
3450*207e5cccSFangrui Song // CHECK:   ret void
// Polynomial 16-bit variant: vst1q_lane_p16 with lane index 7. The expected
// IR above round-trips %b through <16 x i8>, extracts element 7 of
// <8 x i16>, and stores the i16 to %a.
3451*207e5cccSFangrui Song void test_vst1q_lane_p16(poly16_t  *a, poly16x8_t b) {
3452*207e5cccSFangrui Song   vst1q_lane_p16(a, b, 7);
3453*207e5cccSFangrui Song }
3454*207e5cccSFangrui Song 
3455*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst1q_lane_p64(ptr noundef %a, <2 x i64> noundef %b) #0 {
3456*207e5cccSFangrui Song // CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
3457*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
3458*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
3459*207e5cccSFangrui Song // CHECK:   store i64 [[TMP3]], ptr %a
3460*207e5cccSFangrui Song // CHECK:   ret void
// Polynomial 64-bit variant: vst1q_lane_p64 with lane index 1. The expected
// IR above round-trips %b through <16 x i8>, extracts element 1 of
// <2 x i64>, and stores the i64 to %a.
3461*207e5cccSFangrui Song void test_vst1q_lane_p64(poly64_t  *a, poly64x2_t b) {
3462*207e5cccSFangrui Song   vst1q_lane_p64(a, b, 1);
3463*207e5cccSFangrui Song }
3464*207e5cccSFangrui Song 
3465*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst1_lane_u8(ptr noundef %a, <8 x i8> noundef %b) #0 {
3466*207e5cccSFangrui Song // CHECK:   [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7
3467*207e5cccSFangrui Song // CHECK:   store i8 [[TMP0]], ptr %a
3468*207e5cccSFangrui Song // CHECK:   ret void
// Exercises vst1_lane_u8 with lane index 7. The expected IR above is an
// extractelement of <8 x i8> %b at index 7 followed by an i8 store to %a.
3469*207e5cccSFangrui Song void test_vst1_lane_u8(uint8_t  *a, uint8x8_t b) {
3470*207e5cccSFangrui Song   vst1_lane_u8(a, b, 7);
3471*207e5cccSFangrui Song }
3472*207e5cccSFangrui Song 
3473*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst1_lane_u16(ptr noundef %a, <4 x i16> noundef %b) #0 {
3474*207e5cccSFangrui Song // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
3475*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
3476*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
3477*207e5cccSFangrui Song // CHECK:   store i16 [[TMP3]], ptr %a
3478*207e5cccSFangrui Song // CHECK:   ret void
// Exercises vst1_lane_u16 with lane index 3. The expected IR above bitcasts
// %b to <8 x i8> and back to <4 x i16>, extracts element 3, and stores the
// i16 to %a.
3479*207e5cccSFangrui Song void test_vst1_lane_u16(uint16_t  *a, uint16x4_t b) {
3480*207e5cccSFangrui Song   vst1_lane_u16(a, b, 3);
3481*207e5cccSFangrui Song }
3482*207e5cccSFangrui Song 
3483*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst1_lane_u32(ptr noundef %a, <2 x i32> noundef %b) #0 {
3484*207e5cccSFangrui Song // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
3485*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
3486*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
3487*207e5cccSFangrui Song // CHECK:   store i32 [[TMP3]], ptr %a
3488*207e5cccSFangrui Song // CHECK:   ret void
// Exercises vst1_lane_u32 with lane index 1. The expected IR above bitcasts
// %b to <8 x i8> and back to <2 x i32>, extracts element 1, and stores the
// i32 to %a.
3489*207e5cccSFangrui Song void test_vst1_lane_u32(uint32_t  *a, uint32x2_t b) {
3490*207e5cccSFangrui Song   vst1_lane_u32(a, b, 1);
3491*207e5cccSFangrui Song }
3492*207e5cccSFangrui Song 
3493*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst1_lane_u64(ptr noundef %a, <1 x i64> noundef %b) #0 {
3494*207e5cccSFangrui Song // CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
3495*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
3496*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
3497*207e5cccSFangrui Song // CHECK:   store i64 [[TMP3]], ptr %a
3498*207e5cccSFangrui Song // CHECK:   ret void
// Exercises vst1_lane_u64 with lane index 0 (the vector is <1 x i64> in the
// expected IR above). The IR bitcasts %b to <8 x i8> and back, extracts
// element 0, and stores the i64 to %a.
3499*207e5cccSFangrui Song void test_vst1_lane_u64(uint64_t  *a, uint64x1_t b) {
3500*207e5cccSFangrui Song   vst1_lane_u64(a, b, 0);
3501*207e5cccSFangrui Song }
3502*207e5cccSFangrui Song 
3503*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst1_lane_s8(ptr noundef %a, <8 x i8> noundef %b) #0 {
3504*207e5cccSFangrui Song // CHECK:   [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7
3505*207e5cccSFangrui Song // CHECK:   store i8 [[TMP0]], ptr %a
3506*207e5cccSFangrui Song // CHECK:   ret void
// Signed 8-bit variant: vst1_lane_s8 with lane index 7. The expected IR
// above extracts element 7 of <8 x i8> %b and stores the i8 to %a.
3507*207e5cccSFangrui Song void test_vst1_lane_s8(int8_t  *a, int8x8_t b) {
3508*207e5cccSFangrui Song   vst1_lane_s8(a, b, 7);
3509*207e5cccSFangrui Song }
3510*207e5cccSFangrui Song 
3511*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst1_lane_s16(ptr noundef %a, <4 x i16> noundef %b) #0 {
3512*207e5cccSFangrui Song // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
3513*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
3514*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
3515*207e5cccSFangrui Song // CHECK:   store i16 [[TMP3]], ptr %a
3516*207e5cccSFangrui Song // CHECK:   ret void
// Signed 16-bit variant: vst1_lane_s16 with lane index 3. The expected IR
// above round-trips %b through <8 x i8>, extracts element 3 of <4 x i16>,
// and stores the i16 to %a.
3517*207e5cccSFangrui Song void test_vst1_lane_s16(int16_t  *a, int16x4_t b) {
3518*207e5cccSFangrui Song   vst1_lane_s16(a, b, 3);
3519*207e5cccSFangrui Song }
3520*207e5cccSFangrui Song 
3521*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst1_lane_s32(ptr noundef %a, <2 x i32> noundef %b) #0 {
3522*207e5cccSFangrui Song // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
3523*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
3524*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
3525*207e5cccSFangrui Song // CHECK:   store i32 [[TMP3]], ptr %a
3526*207e5cccSFangrui Song // CHECK:   ret void
// Signed 32-bit variant: vst1_lane_s32 with lane index 1. The expected IR
// above round-trips %b through <8 x i8>, extracts element 1 of <2 x i32>,
// and stores the i32 to %a.
3527*207e5cccSFangrui Song void test_vst1_lane_s32(int32_t  *a, int32x2_t b) {
3528*207e5cccSFangrui Song   vst1_lane_s32(a, b, 1);
3529*207e5cccSFangrui Song }
3530*207e5cccSFangrui Song 
3531*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst1_lane_s64(ptr noundef %a, <1 x i64> noundef %b) #0 {
3532*207e5cccSFangrui Song // CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
3533*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
3534*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
3535*207e5cccSFangrui Song // CHECK:   store i64 [[TMP3]], ptr %a
3536*207e5cccSFangrui Song // CHECK:   ret void
// Signed 64-bit variant: vst1_lane_s64 with lane index 0 (the vector is
// <1 x i64> in the expected IR above). The IR round-trips %b through
// <8 x i8>, extracts element 0, and stores the i64 to %a.
3537*207e5cccSFangrui Song void test_vst1_lane_s64(int64_t  *a, int64x1_t b) {
3538*207e5cccSFangrui Song   vst1_lane_s64(a, b, 0);
3539*207e5cccSFangrui Song }
3540*207e5cccSFangrui Song 
3541*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst1_lane_f16(ptr noundef %a, <4 x half> noundef %b) #0 {
3542*207e5cccSFangrui Song // CHECK:   [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
3543*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
3544*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = extractelement <4 x half> [[TMP2]], i32 3
3545*207e5cccSFangrui Song // CHECK:   store half [[TMP3]], ptr %a
3546*207e5cccSFangrui Song // CHECK:   ret void
// Half-precision variant: vst1_lane_f16 with lane index 3. The expected IR
// above round-trips %b through <8 x i8>, extracts element 3 of <4 x half>,
// and stores the half to %a.
3547*207e5cccSFangrui Song void test_vst1_lane_f16(float16_t  *a, float16x4_t b) {
3548*207e5cccSFangrui Song   vst1_lane_f16(a, b, 3);
3549*207e5cccSFangrui Song }
3550*207e5cccSFangrui Song 
3551*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst1_lane_f32(ptr noundef %a, <2 x float> noundef %b) #0 {
3552*207e5cccSFangrui Song // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
3553*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
3554*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
3555*207e5cccSFangrui Song // CHECK:   store float [[TMP3]], ptr %a
3556*207e5cccSFangrui Song // CHECK:   ret void
// Single-precision variant: vst1_lane_f32 with lane index 1. The expected
// IR above round-trips %b through <8 x i8>, extracts element 1 of
// <2 x float>, and stores the float to %a.
3557*207e5cccSFangrui Song void test_vst1_lane_f32(float32_t  *a, float32x2_t b) {
3558*207e5cccSFangrui Song   vst1_lane_f32(a, b, 1);
3559*207e5cccSFangrui Song }
3560*207e5cccSFangrui Song 
3561*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst1_lane_f64(ptr noundef %a, <1 x double> noundef %b) #0 {
3562*207e5cccSFangrui Song // CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
3563*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
3564*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = extractelement <1 x double> [[TMP2]], i32 0
3565*207e5cccSFangrui Song // CHECK:   store double [[TMP3]], ptr %a
3566*207e5cccSFangrui Song // CHECK:   ret void
// Double-precision variant: vst1_lane_f64 with lane index 0 (the vector is
// <1 x double> in the expected IR above). The IR round-trips %b through
// <8 x i8>, extracts element 0, and stores the double to %a.
3567*207e5cccSFangrui Song void test_vst1_lane_f64(float64_t  *a, float64x1_t b) {
3568*207e5cccSFangrui Song   vst1_lane_f64(a, b, 0);
3569*207e5cccSFangrui Song }
3570*207e5cccSFangrui Song 
3571*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst1_lane_p8(ptr noundef %a, <8 x i8> noundef %b) #0 {
3572*207e5cccSFangrui Song // CHECK:   [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7
3573*207e5cccSFangrui Song // CHECK:   store i8 [[TMP0]], ptr %a
3574*207e5cccSFangrui Song // CHECK:   ret void
// Polynomial 8-bit variant: vst1_lane_p8 with lane index 7. The expected IR
// above extracts element 7 of <8 x i8> %b and stores the i8 to %a.
3575*207e5cccSFangrui Song void test_vst1_lane_p8(poly8_t  *a, poly8x8_t b) {
3576*207e5cccSFangrui Song   vst1_lane_p8(a, b, 7);
3577*207e5cccSFangrui Song }
3578*207e5cccSFangrui Song 
3579*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst1_lane_p16(ptr noundef %a, <4 x i16> noundef %b) #0 {
3580*207e5cccSFangrui Song // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
3581*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
3582*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
3583*207e5cccSFangrui Song // CHECK:   store i16 [[TMP3]], ptr %a
3584*207e5cccSFangrui Song // CHECK:   ret void
// Polynomial 16-bit variant: vst1_lane_p16 with lane index 3. The expected
// IR above round-trips %b through <8 x i8>, extracts element 3 of
// <4 x i16>, and stores the i16 to %a.
3585*207e5cccSFangrui Song void test_vst1_lane_p16(poly16_t  *a, poly16x4_t b) {
3586*207e5cccSFangrui Song   vst1_lane_p16(a, b, 3);
3587*207e5cccSFangrui Song }
3588*207e5cccSFangrui Song 
3589*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst1_lane_p64(ptr noundef %a, <1 x i64> noundef %b) #0 {
3590*207e5cccSFangrui Song // CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
3591*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
3592*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
3593*207e5cccSFangrui Song // CHECK:   store i64 [[TMP3]], ptr %a
3594*207e5cccSFangrui Song // CHECK:   ret void
// Polynomial 64-bit variant: vst1_lane_p64 with lane index 0 (the vector is
// <1 x i64> in the expected IR above). The IR round-trips %b through
// <8 x i8>, extracts element 0, and stores the i64 to %a.
3595*207e5cccSFangrui Song void test_vst1_lane_p64(poly64_t  *a, poly64x1_t b) {
3596*207e5cccSFangrui Song   vst1_lane_p64(a, b, 0);
3597*207e5cccSFangrui Song }
3598*207e5cccSFangrui Song 
3599*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst2q_lane_u8(ptr noundef %a, [2 x <16 x i8>] alignstack(16) %b.coerce) #0 {
3600*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.uint8x16x2_t, align 16
3601*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align 16
3602*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[B]], i32 0, i32 0
3603*207e5cccSFangrui Song // CHECK:   store [2 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
3604*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
3605*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[__S1]], i32 0, i32 0
3606*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
3607*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
3608*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[__S1]], i32 0, i32 0
3609*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
3610*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
3611*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st2lane.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i64 15, ptr %a)
3612*207e5cccSFangrui Song // CHECK:   ret void
// Exercises vst2q_lane_u8 with lane index 15. The expected IR above loads
// the two <16 x i8> members of b (no bitcasts) and calls
// @llvm.aarch64.neon.st2lane.v16i8.p0 with lane operand i64 15 and ptr %a.
3613*207e5cccSFangrui Song void test_vst2q_lane_u8(uint8_t  *a, uint8x16x2_t b) {
3614*207e5cccSFangrui Song   vst2q_lane_u8(a, b, 15);
3615*207e5cccSFangrui Song }
3616*207e5cccSFangrui Song 
3617*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst2q_lane_u16(ptr noundef %a, [2 x <8 x i16>] alignstack(16) %b.coerce) #0 {
3618*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
3619*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
3620*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[B]], i32 0, i32 0
3621*207e5cccSFangrui Song // CHECK:   store [2 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
3622*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
3623*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[__S1]], i32 0, i32 0
3624*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
3625*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
3626*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
3627*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[__S1]], i32 0, i32 0
3628*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
3629*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
3630*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
3631*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
3632*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
3633*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st2lane.v8i16.p0(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i64 7, ptr %a)
3634*207e5cccSFangrui Song // CHECK:   ret void
// Exercises vst2q_lane_u16 with lane index 7 (last lane of an 8-element
// vector). Per the FileCheck expectations above, each loaded <8 x i16> is
// bitcast to <16 x i8> and back before the
// @llvm.aarch64.neon.st2lane.v8i16.p0 call with lane i64 7.
3635*207e5cccSFangrui Song void test_vst2q_lane_u16(uint16_t  *a, uint16x8x2_t b) {
3636*207e5cccSFangrui Song   vst2q_lane_u16(a, b, 7);
3637*207e5cccSFangrui Song }
3638*207e5cccSFangrui Song 
3639*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst2q_lane_u32(ptr noundef %a, [2 x <4 x i32>] alignstack(16) %b.coerce) #0 {
3640*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
3641*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
3642*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[B]], i32 0, i32 0
3643*207e5cccSFangrui Song // CHECK:   store [2 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
3644*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
3645*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[__S1]], i32 0, i32 0
3646*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
3647*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
3648*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
3649*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[__S1]], i32 0, i32 0
3650*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
3651*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
3652*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
3653*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
3654*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
3655*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st2lane.v4i32.p0(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i64 3, ptr %a)
3656*207e5cccSFangrui Song // CHECK:   ret void
// Exercises vst2q_lane_u32 with lane index 3 (last lane of a 4-element
// vector). The FileCheck expectations above require a round-trip of each
// <4 x i32> through <16 x i8>, then a call to
// @llvm.aarch64.neon.st2lane.v4i32.p0 with lane i64 3.
3657*207e5cccSFangrui Song void test_vst2q_lane_u32(uint32_t  *a, uint32x4x2_t b) {
3658*207e5cccSFangrui Song   vst2q_lane_u32(a, b, 3);
3659*207e5cccSFangrui Song }
3660*207e5cccSFangrui Song 
3661*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst2q_lane_u64(ptr noundef %a, [2 x <2 x i64>] alignstack(16) %b.coerce) #0 {
3662*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.uint64x2x2_t, align 16
3663*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.uint64x2x2_t, align 16
3664*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x2x2_t, ptr [[B]], i32 0, i32 0
3665*207e5cccSFangrui Song // CHECK:   store [2 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
3666*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
3667*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x2x2_t, ptr [[__S1]], i32 0, i32 0
3668*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
3669*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
3670*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
3671*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x2x2_t, ptr [[__S1]], i32 0, i32 0
3672*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
3673*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
3674*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
3675*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
3676*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
3677*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st2lane.v2i64.p0(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i64 1, ptr %a)
3678*207e5cccSFangrui Song // CHECK:   ret void
// Exercises vst2q_lane_u64 with lane index 1 (last lane of a 2-element
// vector). The FileCheck expectations above require a round-trip of each
// <2 x i64> through <16 x i8>, then a call to
// @llvm.aarch64.neon.st2lane.v2i64.p0 with lane i64 1.
3679*207e5cccSFangrui Song void test_vst2q_lane_u64(uint64_t  *a, uint64x2x2_t b) {
3680*207e5cccSFangrui Song   vst2q_lane_u64(a, b, 1);
3681*207e5cccSFangrui Song }
3682*207e5cccSFangrui Song 
3683*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst2q_lane_s8(ptr noundef %a, [2 x <16 x i8>] alignstack(16) %b.coerce) #0 {
3684*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.int8x16x2_t, align 16
3685*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x2_t, align 16
3686*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[B]], i32 0, i32 0
3687*207e5cccSFangrui Song // CHECK:   store [2 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
3688*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
3689*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[__S1]], i32 0, i32 0
3690*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
3691*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
3692*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[__S1]], i32 0, i32 0
3693*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
3694*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
3695*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st2lane.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i64 15, ptr %a)
3696*207e5cccSFangrui Song // CHECK:   ret void
// Signed 8-bit variant: calls vst2q_lane_s8 with lane index 15. The
// FileCheck expectations above show the int8x16x2_t argument coerced to
// [2 x <16 x i8>] and stored via @llvm.aarch64.neon.st2lane.v16i8.p0 with
// lane i64 15, with no intermediate bitcasts.
3697*207e5cccSFangrui Song void test_vst2q_lane_s8(int8_t  *a, int8x16x2_t b) {
3698*207e5cccSFangrui Song   vst2q_lane_s8(a, b, 15);
3699*207e5cccSFangrui Song }
3700*207e5cccSFangrui Song 
3701*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst2q_lane_s16(ptr noundef %a, [2 x <8 x i16>] alignstack(16) %b.coerce) #0 {
3702*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
3703*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
3704*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[B]], i32 0, i32 0
3705*207e5cccSFangrui Song // CHECK:   store [2 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
3706*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
3707*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[__S1]], i32 0, i32 0
3708*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
3709*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
3710*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
3711*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[__S1]], i32 0, i32 0
3712*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
3713*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
3714*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
3715*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
3716*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
3717*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st2lane.v8i16.p0(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i64 7, ptr %a)
3718*207e5cccSFangrui Song // CHECK:   ret void
// Signed 16-bit variant: calls vst2q_lane_s16 with lane index 7. The
// FileCheck expectations above show both <8 x i16> halves bitcast to
// <16 x i8> and back, then passed to @llvm.aarch64.neon.st2lane.v8i16.p0
// with lane i64 7.
3719*207e5cccSFangrui Song void test_vst2q_lane_s16(int16_t  *a, int16x8x2_t b) {
3720*207e5cccSFangrui Song   vst2q_lane_s16(a, b, 7);
3721*207e5cccSFangrui Song }
3722*207e5cccSFangrui Song 
3723*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst2q_lane_s32(ptr noundef %a, [2 x <4 x i32>] alignstack(16) %b.coerce) #0 {
3724*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
3725*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
3726*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[B]], i32 0, i32 0
3727*207e5cccSFangrui Song // CHECK:   store [2 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
3728*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
3729*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[__S1]], i32 0, i32 0
3730*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
3731*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
3732*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
3733*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[__S1]], i32 0, i32 0
3734*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
3735*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
3736*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
3737*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
3738*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
3739*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st2lane.v4i32.p0(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i64 3, ptr %a)
3740*207e5cccSFangrui Song // CHECK:   ret void
// Signed 32-bit variant: calls vst2q_lane_s32 with lane index 3. The
// FileCheck expectations above show both <4 x i32> halves bitcast to
// <16 x i8> and back, then passed to @llvm.aarch64.neon.st2lane.v4i32.p0
// with lane i64 3.
3741*207e5cccSFangrui Song void test_vst2q_lane_s32(int32_t  *a, int32x4x2_t b) {
3742*207e5cccSFangrui Song   vst2q_lane_s32(a, b, 3);
3743*207e5cccSFangrui Song }
3744*207e5cccSFangrui Song 
3745*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst2q_lane_s64(ptr noundef %a, [2 x <2 x i64>] alignstack(16) %b.coerce) #0 {
3746*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.int64x2x2_t, align 16
3747*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.int64x2x2_t, align 16
3748*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x2x2_t, ptr [[B]], i32 0, i32 0
3749*207e5cccSFangrui Song // CHECK:   store [2 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
3750*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
3751*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x2x2_t, ptr [[__S1]], i32 0, i32 0
3752*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
3753*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
3754*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
3755*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x2x2_t, ptr [[__S1]], i32 0, i32 0
3756*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
3757*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
3758*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
3759*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
3760*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
3761*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st2lane.v2i64.p0(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i64 1, ptr %a)
3762*207e5cccSFangrui Song // CHECK:   ret void
// Signed 64-bit variant: calls vst2q_lane_s64 with lane index 1. The
// FileCheck expectations above show both <2 x i64> halves bitcast to
// <16 x i8> and back, then passed to @llvm.aarch64.neon.st2lane.v2i64.p0
// with lane i64 1.
3763*207e5cccSFangrui Song void test_vst2q_lane_s64(int64_t  *a, int64x2x2_t b) {
3764*207e5cccSFangrui Song   vst2q_lane_s64(a, b, 1);
3765*207e5cccSFangrui Song }
3766*207e5cccSFangrui Song 
3767*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst2q_lane_f16(ptr noundef %a, [2 x <8 x half>] alignstack(16) %b.coerce) #0 {
3768*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
3769*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
3770*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[B]], i32 0, i32 0
3771*207e5cccSFangrui Song // CHECK:   store [2 x <8 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
3772*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
3773*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[__S1]], i32 0, i32 0
3774*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL]], i64 0, i64 0
3775*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
3776*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
3777*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[__S1]], i32 0, i32 0
3778*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL1]], i64 0, i64 1
3779*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
3780*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
3781*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half>
3782*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half>
3783*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st2lane.v8f16.p0(<8 x half> [[TMP7]], <8 x half> [[TMP8]], i64 7, ptr %a)
3784*207e5cccSFangrui Song // CHECK:   ret void
// Half-precision variant: calls vst2q_lane_f16 with lane index 7. The
// FileCheck expectations above show the argument coerced to
// [2 x <8 x half>], each half bitcast through <16 x i8>, and a final call to
// @llvm.aarch64.neon.st2lane.v8f16.p0 with lane i64 7.
3785*207e5cccSFangrui Song void test_vst2q_lane_f16(float16_t  *a, float16x8x2_t b) {
3786*207e5cccSFangrui Song   vst2q_lane_f16(a, b, 7);
3787*207e5cccSFangrui Song }
3788*207e5cccSFangrui Song 
3789*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst2q_lane_f32(ptr noundef %a, [2 x <4 x float>] alignstack(16) %b.coerce) #0 {
3790*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
3791*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
3792*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[B]], i32 0, i32 0
3793*207e5cccSFangrui Song // CHECK:   store [2 x <4 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
3794*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
3795*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[__S1]], i32 0, i32 0
3796*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL]], i64 0, i64 0
3797*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
3798*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
3799*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[__S1]], i32 0, i32 0
3800*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL1]], i64 0, i64 1
3801*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
3802*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
3803*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
3804*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
3805*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st2lane.v4f32.p0(<4 x float> [[TMP7]], <4 x float> [[TMP8]], i64 3, ptr %a)
3806*207e5cccSFangrui Song // CHECK:   ret void
// Single-precision variant: calls vst2q_lane_f32 with lane index 3. The
// FileCheck expectations above show the argument coerced to
// [2 x <4 x float>], each half bitcast through <16 x i8>, and a final call to
// @llvm.aarch64.neon.st2lane.v4f32.p0 with lane i64 3.
3807*207e5cccSFangrui Song void test_vst2q_lane_f32(float32_t  *a, float32x4x2_t b) {
3808*207e5cccSFangrui Song   vst2q_lane_f32(a, b, 3);
3809*207e5cccSFangrui Song }
3810*207e5cccSFangrui Song 
3811*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst2q_lane_f64(ptr noundef %a, [2 x <2 x double>] alignstack(16) %b.coerce) #0 {
3812*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.float64x2x2_t, align 16
3813*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.float64x2x2_t, align 16
3814*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x2x2_t, ptr [[B]], i32 0, i32 0
3815*207e5cccSFangrui Song // CHECK:   store [2 x <2 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
3816*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
3817*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x2x2_t, ptr [[__S1]], i32 0, i32 0
3818*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x double>], ptr [[VAL]], i64 0, i64 0
3819*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16
3820*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
3821*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x2x2_t, ptr [[__S1]], i32 0, i32 0
3822*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x double>], ptr [[VAL1]], i64 0, i64 1
3823*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
3824*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
3825*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
3826*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
3827*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st2lane.v2f64.p0(<2 x double> [[TMP7]], <2 x double> [[TMP8]], i64 1, ptr %a)
3828*207e5cccSFangrui Song // CHECK:   ret void
// Double-precision variant: calls vst2q_lane_f64 with lane index 1. The
// FileCheck expectations above show the argument coerced to
// [2 x <2 x double>], each half bitcast through <16 x i8>, and a final call
// to @llvm.aarch64.neon.st2lane.v2f64.p0 with lane i64 1.
3829*207e5cccSFangrui Song void test_vst2q_lane_f64(float64_t  *a, float64x2x2_t b) {
3830*207e5cccSFangrui Song   vst2q_lane_f64(a, b, 1);
3831*207e5cccSFangrui Song }
3832*207e5cccSFangrui Song 
3833*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst2q_lane_p8(ptr noundef %a, [2 x <16 x i8>] alignstack(16) %b.coerce) #0 {
3834*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.poly8x16x2_t, align 16
3835*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x2_t, align 16
3836*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[B]], i32 0, i32 0
3837*207e5cccSFangrui Song // CHECK:   store [2 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
3838*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
3839*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[__S1]], i32 0, i32 0
3840*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
3841*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
3842*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[__S1]], i32 0, i32 0
3843*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
3844*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
3845*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st2lane.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i64 15, ptr %a)
3846*207e5cccSFangrui Song // CHECK:   ret void
// Polynomial 8-bit variant: calls vst2q_lane_p8 with lane index 15. The
// FileCheck expectations above show poly8x16x2_t handled as
// [2 x <16 x i8>] and stored via @llvm.aarch64.neon.st2lane.v16i8.p0 with
// lane i64 15, with no intermediate bitcasts.
3847*207e5cccSFangrui Song void test_vst2q_lane_p8(poly8_t  *a, poly8x16x2_t b) {
3848*207e5cccSFangrui Song   vst2q_lane_p8(a, b, 15);
3849*207e5cccSFangrui Song }
3850*207e5cccSFangrui Song 
3851*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst2q_lane_p16(ptr noundef %a, [2 x <8 x i16>] alignstack(16) %b.coerce) #0 {
3852*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
3853*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
3854*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[B]], i32 0, i32 0
3855*207e5cccSFangrui Song // CHECK:   store [2 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
3856*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
3857*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[__S1]], i32 0, i32 0
3858*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
3859*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
3860*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
3861*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[__S1]], i32 0, i32 0
3862*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
3863*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
3864*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
3865*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
3866*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
3867*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st2lane.v8i16.p0(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i64 7, ptr %a)
3868*207e5cccSFangrui Song // CHECK:   ret void
// Polynomial 16-bit variant: calls vst2q_lane_p16 with lane index 7. The
// FileCheck expectations above show poly16x8x2_t handled as
// [2 x <8 x i16>], each half bitcast through <16 x i8>, and a final call to
// @llvm.aarch64.neon.st2lane.v8i16.p0 with lane i64 7.
3869*207e5cccSFangrui Song void test_vst2q_lane_p16(poly16_t  *a, poly16x8x2_t b) {
3870*207e5cccSFangrui Song   vst2q_lane_p16(a, b, 7);
3871*207e5cccSFangrui Song }
3872*207e5cccSFangrui Song 
3873*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst2q_lane_p64(ptr noundef %a, [2 x <2 x i64>] alignstack(16) %b.coerce) #0 {
3874*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.poly64x2x2_t, align 16
3875*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.poly64x2x2_t, align 16
3876*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x2x2_t, ptr [[B]], i32 0, i32 0
3877*207e5cccSFangrui Song // CHECK:   store [2 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
3878*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
3879*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x2x2_t, ptr [[__S1]], i32 0, i32 0
3880*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
3881*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
3882*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
3883*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x2x2_t, ptr [[__S1]], i32 0, i32 0
3884*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
3885*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
3886*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
3887*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
3888*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
3889*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st2lane.v2i64.p0(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i64 1, ptr %a)
3890*207e5cccSFangrui Song // CHECK:   ret void
// Polynomial 64-bit variant: calls vst2q_lane_p64 with lane index 1. The
// FileCheck expectations above show poly64x2x2_t handled as
// [2 x <2 x i64>], each half bitcast through <16 x i8>, and a final call to
// @llvm.aarch64.neon.st2lane.v2i64.p0 with lane i64 1.
3891*207e5cccSFangrui Song void test_vst2q_lane_p64(poly64_t  *a, poly64x2x2_t b) {
3892*207e5cccSFangrui Song   vst2q_lane_p64(a, b, 1);
3893*207e5cccSFangrui Song }
3894*207e5cccSFangrui Song 
3895*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst2_lane_u8(ptr noundef %a, [2 x <8 x i8>] alignstack(8) %b.coerce) #0 {
3896*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
3897*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
3898*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[B]], i32 0, i32 0
3899*207e5cccSFangrui Song // CHECK:   store [2 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
3900*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
3901*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[__S1]], i32 0, i32 0
3902*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
3903*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
3904*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[__S1]], i32 0, i32 0
3905*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
3906*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
3907*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st2lane.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i64 7, ptr %a)
3908*207e5cccSFangrui Song // CHECK:   ret void
// Non-q form operating on 8-element vectors: calls vst2_lane_u8 with lane
// index 7 (last lane of <8 x i8>). The FileCheck expectations above use
// 8-byte alignment and a 16-byte memcpy, ending in a call to
// @llvm.aarch64.neon.st2lane.v8i8.p0 with lane i64 7.
3909*207e5cccSFangrui Song void test_vst2_lane_u8(uint8_t  *a, uint8x8x2_t b) {
3910*207e5cccSFangrui Song   vst2_lane_u8(a, b, 7);
3911*207e5cccSFangrui Song }
3912*207e5cccSFangrui Song 
3913*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst2_lane_u16(ptr noundef %a, [2 x <4 x i16>] alignstack(8) %b.coerce) #0 {
3914*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
3915*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
3916*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[B]], i32 0, i32 0
3917*207e5cccSFangrui Song // CHECK:   store [2 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
3918*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
3919*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[__S1]], i32 0, i32 0
3920*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
3921*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
3922*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
3923*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[__S1]], i32 0, i32 0
3924*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
3925*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
3926*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
3927*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
3928*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
3929*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st2lane.v4i16.p0(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i64 3, ptr %a)
3930*207e5cccSFangrui Song // CHECK:   ret void
// Non-q form on 4-element vectors: calls vst2_lane_u16 with lane index 3
// (last lane of <4 x i16>). The FileCheck expectations above bitcast each
// half through <8 x i8> before calling
// @llvm.aarch64.neon.st2lane.v4i16.p0 with lane i64 3.
3931*207e5cccSFangrui Song void test_vst2_lane_u16(uint16_t  *a, uint16x4x2_t b) {
3932*207e5cccSFangrui Song   vst2_lane_u16(a, b, 3);
3933*207e5cccSFangrui Song }
3934*207e5cccSFangrui Song 
3935*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst2_lane_u32(ptr noundef %a, [2 x <2 x i32>] alignstack(8) %b.coerce) #0 {
3936*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
3937*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
3938*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[B]], i32 0, i32 0
3939*207e5cccSFangrui Song // CHECK:   store [2 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
3940*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
3941*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[__S1]], i32 0, i32 0
3942*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
3943*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
3944*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
3945*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[__S1]], i32 0, i32 0
3946*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
3947*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
3948*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
3949*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
3950*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
3951*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st2lane.v2i32.p0(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i64 1, ptr %a)
3952*207e5cccSFangrui Song // CHECK:   ret void
// Exercises vst2_lane_u32 with constant lane index 1 (the highest lane of a
// 2-element vector). Per the FileCheck lines above, both <2 x i32> members of
// b are loaded from a local copy, round-tripped through <8 x i8> bitcasts, and
// passed to llvm.aarch64.neon.st2lane.v2i32.p0 with lane operand i64 1 and
// the pointer argument %a.
3953*207e5cccSFangrui Song void test_vst2_lane_u32(uint32_t  *a, uint32x2x2_t b) {
3954*207e5cccSFangrui Song   vst2_lane_u32(a, b, 1);
3955*207e5cccSFangrui Song }
3956*207e5cccSFangrui Song 
3957*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst2_lane_u64(ptr noundef %a, [2 x <1 x i64>] alignstack(8) %b.coerce) #0 {
3958*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.uint64x1x2_t, align 8
3959*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.uint64x1x2_t, align 8
3960*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x1x2_t, ptr [[B]], i32 0, i32 0
3961*207e5cccSFangrui Song // CHECK:   store [2 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
3962*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
3963*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x1x2_t, ptr [[__S1]], i32 0, i32 0
3964*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
3965*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
3966*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
3967*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x1x2_t, ptr [[__S1]], i32 0, i32 0
3968*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
3969*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
3970*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
3971*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
3972*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
3973*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st2lane.v1i64.p0(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i64 0, ptr %a)
3974*207e5cccSFangrui Song // CHECK:   ret void
// Exercises vst2_lane_u64 with constant lane index 0 (the only lane of a
// 1-element vector). Per the FileCheck lines above, both <1 x i64> members of
// b are loaded from a local copy, round-tripped through <8 x i8> bitcasts, and
// passed to llvm.aarch64.neon.st2lane.v1i64.p0 with lane operand i64 0 and
// the pointer argument %a.
3975*207e5cccSFangrui Song void test_vst2_lane_u64(uint64_t  *a, uint64x1x2_t b) {
3976*207e5cccSFangrui Song   vst2_lane_u64(a, b, 0);
3977*207e5cccSFangrui Song }
3978*207e5cccSFangrui Song 
3979*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst2_lane_s8(ptr noundef %a, [2 x <8 x i8>] alignstack(8) %b.coerce) #0 {
3980*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
3981*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
3982*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[B]], i32 0, i32 0
3983*207e5cccSFangrui Song // CHECK:   store [2 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
3984*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
3985*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[__S1]], i32 0, i32 0
3986*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
3987*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
3988*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[__S1]], i32 0, i32 0
3989*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
3990*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
3991*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st2lane.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i64 7, ptr %a)
3992*207e5cccSFangrui Song // CHECK:   ret void
// Exercises vst2_lane_s8 with constant lane index 7 (the highest lane of an
// 8-element vector). Per the FileCheck lines above, the two <8 x i8> members
// of b are loaded from a local copy and passed directly (no bitcasts are
// expected for the 8-bit element case) to llvm.aarch64.neon.st2lane.v8i8.p0
// with lane operand i64 7 and the pointer argument %a.
3993*207e5cccSFangrui Song void test_vst2_lane_s8(int8_t  *a, int8x8x2_t b) {
3994*207e5cccSFangrui Song   vst2_lane_s8(a, b, 7);
3995*207e5cccSFangrui Song }
3996*207e5cccSFangrui Song 
3997*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst2_lane_s16(ptr noundef %a, [2 x <4 x i16>] alignstack(8) %b.coerce) #0 {
3998*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
3999*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
4000*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[B]], i32 0, i32 0
4001*207e5cccSFangrui Song // CHECK:   store [2 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4002*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
4003*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[__S1]], i32 0, i32 0
4004*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
4005*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
4006*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
4007*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[__S1]], i32 0, i32 0
4008*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
4009*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
4010*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
4011*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
4012*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
4013*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st2lane.v4i16.p0(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i64 3, ptr %a)
4014*207e5cccSFangrui Song // CHECK:   ret void
// Exercises vst2_lane_s16 with constant lane index 3 (the highest lane of a
// 4-element vector). Per the FileCheck lines above, both <4 x i16> members of
// b are loaded from a local copy, round-tripped through <8 x i8> bitcasts, and
// passed to llvm.aarch64.neon.st2lane.v4i16.p0 with lane operand i64 3 and
// the pointer argument %a.
4015*207e5cccSFangrui Song void test_vst2_lane_s16(int16_t  *a, int16x4x2_t b) {
4016*207e5cccSFangrui Song   vst2_lane_s16(a, b, 3);
4017*207e5cccSFangrui Song }
4018*207e5cccSFangrui Song 
4019*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst2_lane_s32(ptr noundef %a, [2 x <2 x i32>] alignstack(8) %b.coerce) #0 {
4020*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
4021*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
4022*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[B]], i32 0, i32 0
4023*207e5cccSFangrui Song // CHECK:   store [2 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4024*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
4025*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[__S1]], i32 0, i32 0
4026*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
4027*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
4028*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
4029*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[__S1]], i32 0, i32 0
4030*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
4031*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
4032*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
4033*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
4034*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
4035*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st2lane.v2i32.p0(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i64 1, ptr %a)
4036*207e5cccSFangrui Song // CHECK:   ret void
// Exercises vst2_lane_s32 with constant lane index 1 (the highest lane of a
// 2-element vector). Per the FileCheck lines above, both <2 x i32> members of
// b are loaded from a local copy, round-tripped through <8 x i8> bitcasts, and
// passed to llvm.aarch64.neon.st2lane.v2i32.p0 with lane operand i64 1 and
// the pointer argument %a.
4037*207e5cccSFangrui Song void test_vst2_lane_s32(int32_t  *a, int32x2x2_t b) {
4038*207e5cccSFangrui Song   vst2_lane_s32(a, b, 1);
4039*207e5cccSFangrui Song }
4040*207e5cccSFangrui Song 
4041*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst2_lane_s64(ptr noundef %a, [2 x <1 x i64>] alignstack(8) %b.coerce) #0 {
4042*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.int64x1x2_t, align 8
4043*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.int64x1x2_t, align 8
4044*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x1x2_t, ptr [[B]], i32 0, i32 0
4045*207e5cccSFangrui Song // CHECK:   store [2 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4046*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
4047*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x1x2_t, ptr [[__S1]], i32 0, i32 0
4048*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
4049*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
4050*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
4051*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x1x2_t, ptr [[__S1]], i32 0, i32 0
4052*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
4053*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
4054*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
4055*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
4056*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
4057*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st2lane.v1i64.p0(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i64 0, ptr %a)
4058*207e5cccSFangrui Song // CHECK:   ret void
// Exercises vst2_lane_s64 with constant lane index 0 (the only lane of a
// 1-element vector). Per the FileCheck lines above, both <1 x i64> members of
// b are loaded from a local copy, round-tripped through <8 x i8> bitcasts, and
// passed to llvm.aarch64.neon.st2lane.v1i64.p0 with lane operand i64 0 and
// the pointer argument %a.
4059*207e5cccSFangrui Song void test_vst2_lane_s64(int64_t  *a, int64x1x2_t b) {
4060*207e5cccSFangrui Song   vst2_lane_s64(a, b, 0);
4061*207e5cccSFangrui Song }
4062*207e5cccSFangrui Song 
4063*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst2_lane_f16(ptr noundef %a, [2 x <4 x half>] alignstack(8) %b.coerce) #0 {
4064*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
4065*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
4066*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[B]], i32 0, i32 0
4067*207e5cccSFangrui Song // CHECK:   store [2 x <4 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4068*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
4069*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[__S1]], i32 0, i32 0
4070*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL]], i64 0, i64 0
4071*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
4072*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
4073*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[__S1]], i32 0, i32 0
4074*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL1]], i64 0, i64 1
4075*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
4076*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
4077*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half>
4078*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half>
4079*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st2lane.v4f16.p0(<4 x half> [[TMP7]], <4 x half> [[TMP8]], i64 3, ptr %a)
4080*207e5cccSFangrui Song // CHECK:   ret void
// Exercises vst2_lane_f16 with constant lane index 3 (the highest lane of a
// 4-element vector). Per the FileCheck lines above, both <4 x half> members of
// b are loaded from a local copy, round-tripped through <8 x i8> bitcasts, and
// passed to llvm.aarch64.neon.st2lane.v4f16.p0 with lane operand i64 3 and
// the pointer argument %a.
4081*207e5cccSFangrui Song void test_vst2_lane_f16(float16_t  *a, float16x4x2_t b) {
4082*207e5cccSFangrui Song   vst2_lane_f16(a, b, 3);
4083*207e5cccSFangrui Song }
4084*207e5cccSFangrui Song 
4085*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst2_lane_f32(ptr noundef %a, [2 x <2 x float>] alignstack(8) %b.coerce) #0 {
4086*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
4087*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
4088*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[B]], i32 0, i32 0
4089*207e5cccSFangrui Song // CHECK:   store [2 x <2 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4090*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
4091*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[__S1]], i32 0, i32 0
4092*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL]], i64 0, i64 0
4093*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
4094*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
4095*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[__S1]], i32 0, i32 0
4096*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL1]], i64 0, i64 1
4097*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
4098*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
4099*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
4100*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
4101*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st2lane.v2f32.p0(<2 x float> [[TMP7]], <2 x float> [[TMP8]], i64 1, ptr %a)
4102*207e5cccSFangrui Song // CHECK:   ret void
// Exercises vst2_lane_f32 with constant lane index 1 (the highest lane of a
// 2-element vector). Per the FileCheck lines above, both <2 x float> members
// of b are loaded from a local copy, round-tripped through <8 x i8> bitcasts,
// and passed to llvm.aarch64.neon.st2lane.v2f32.p0 with lane operand i64 1
// and the pointer argument %a.
4103*207e5cccSFangrui Song void test_vst2_lane_f32(float32_t  *a, float32x2x2_t b) {
4104*207e5cccSFangrui Song   vst2_lane_f32(a, b, 1);
4105*207e5cccSFangrui Song }
4106*207e5cccSFangrui Song 
4107*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst2_lane_f64(ptr noundef %a, [2 x <1 x double>] alignstack(8) %b.coerce) #0 {
4108*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.float64x1x2_t, align 8
4109*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.float64x1x2_t, align 8
4110*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x1x2_t, ptr [[B]], i32 0, i32 0
4111*207e5cccSFangrui Song // CHECK:   store [2 x <1 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4112*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
4113*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x1x2_t, ptr [[__S1]], i32 0, i32 0
4114*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x double>], ptr [[VAL]], i64 0, i64 0
4115*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8
4116*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
4117*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x1x2_t, ptr [[__S1]], i32 0, i32 0
4118*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x double>], ptr [[VAL1]], i64 0, i64 1
4119*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
4120*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
4121*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
4122*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
4123*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st2lane.v1f64.p0(<1 x double> [[TMP7]], <1 x double> [[TMP8]], i64 0, ptr %a)
4124*207e5cccSFangrui Song // CHECK:   ret void
// Exercises vst2_lane_f64 with constant lane index 0 (the only lane of a
// 1-element vector). Per the FileCheck lines above, both <1 x double> members
// of b are loaded from a local copy, round-tripped through <8 x i8> bitcasts,
// and passed to llvm.aarch64.neon.st2lane.v1f64.p0 with lane operand i64 0
// and the pointer argument %a.
4125*207e5cccSFangrui Song void test_vst2_lane_f64(float64_t  *a, float64x1x2_t b) {
4126*207e5cccSFangrui Song   vst2_lane_f64(a, b, 0);
4127*207e5cccSFangrui Song }
4128*207e5cccSFangrui Song 
4129*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst2_lane_p8(ptr noundef %a, [2 x <8 x i8>] alignstack(8) %b.coerce) #0 {
4130*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
4131*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
4132*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[B]], i32 0, i32 0
4133*207e5cccSFangrui Song // CHECK:   store [2 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4134*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
4135*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[__S1]], i32 0, i32 0
4136*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
4137*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
4138*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[__S1]], i32 0, i32 0
4139*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
4140*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
4141*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st2lane.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i64 7, ptr %a)
4142*207e5cccSFangrui Song // CHECK:   ret void
// Exercises vst2_lane_p8 with constant lane index 7 (the highest lane of an
// 8-element vector). Per the FileCheck lines above, the two <8 x i8> members
// of b are loaded from a local copy and passed directly (no bitcasts are
// expected for the 8-bit element case) to llvm.aarch64.neon.st2lane.v8i8.p0
// with lane operand i64 7 and the pointer argument %a.
4143*207e5cccSFangrui Song void test_vst2_lane_p8(poly8_t  *a, poly8x8x2_t b) {
4144*207e5cccSFangrui Song   vst2_lane_p8(a, b, 7);
4145*207e5cccSFangrui Song }
4146*207e5cccSFangrui Song 
4147*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst2_lane_p16(ptr noundef %a, [2 x <4 x i16>] alignstack(8) %b.coerce) #0 {
4148*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
4149*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
4150*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[B]], i32 0, i32 0
4151*207e5cccSFangrui Song // CHECK:   store [2 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4152*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
4153*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[__S1]], i32 0, i32 0
4154*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
4155*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
4156*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
4157*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[__S1]], i32 0, i32 0
4158*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
4159*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
4160*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
4161*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
4162*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
4163*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st2lane.v4i16.p0(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i64 3, ptr %a)
4164*207e5cccSFangrui Song // CHECK:   ret void
// Exercises vst2_lane_p16 with constant lane index 3 (the highest lane of a
// 4-element vector). Per the FileCheck lines above, both <4 x i16> members of
// b are loaded from a local copy, round-tripped through <8 x i8> bitcasts, and
// passed to llvm.aarch64.neon.st2lane.v4i16.p0 with lane operand i64 3 and
// the pointer argument %a.
4165*207e5cccSFangrui Song void test_vst2_lane_p16(poly16_t  *a, poly16x4x2_t b) {
4166*207e5cccSFangrui Song   vst2_lane_p16(a, b, 3);
4167*207e5cccSFangrui Song }
4168*207e5cccSFangrui Song 
4169*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst2_lane_p64(ptr noundef %a, [2 x <1 x i64>] alignstack(8) %b.coerce) #0 {
4170*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.poly64x1x2_t, align 8
4171*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.poly64x1x2_t, align 8
4172*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x1x2_t, ptr [[B]], i32 0, i32 0
4173*207e5cccSFangrui Song // CHECK:   store [2 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4174*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
4175*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x1x2_t, ptr [[__S1]], i32 0, i32 0
4176*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
4177*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
4178*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
4179*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x1x2_t, ptr [[__S1]], i32 0, i32 0
4180*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
4181*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
4182*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
4183*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
4184*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
4185*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st2lane.v1i64.p0(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i64 0, ptr %a)
4186*207e5cccSFangrui Song // CHECK:   ret void
// Exercises vst2_lane_p64 with constant lane index 0 (the only lane of a
// 1-element vector). Per the FileCheck lines above, both <1 x i64> members of
// b are loaded from a local copy, round-tripped through <8 x i8> bitcasts, and
// passed to llvm.aarch64.neon.st2lane.v1i64.p0 with lane operand i64 0 and
// the pointer argument %a.
4187*207e5cccSFangrui Song void test_vst2_lane_p64(poly64_t  *a, poly64x1x2_t b) {
4188*207e5cccSFangrui Song   vst2_lane_p64(a, b, 0);
4189*207e5cccSFangrui Song }
4190*207e5cccSFangrui Song 
4191*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst3q_lane_u8(ptr noundef %a, [3 x <16 x i8>] alignstack(16) %b.coerce) #0 {
4192*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.uint8x16x3_t, align 16
4193*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x3_t, align 16
4194*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[B]], i32 0, i32 0
4195*207e5cccSFangrui Song // CHECK:   store [3 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
4196*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
4197*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[__S1]], i32 0, i32 0
4198*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
4199*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
4200*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[__S1]], i32 0, i32 0
4201*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
4202*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
4203*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[__S1]], i32 0, i32 0
4204*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
4205*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
4206*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st3lane.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, ptr %a)
4207*207e5cccSFangrui Song // CHECK:   ret void
// Exercises vst3q_lane_u8 with constant lane index 15 (the highest lane of a
// 16-element vector). Per the FileCheck lines above, the three <16 x i8>
// members of b are loaded from a 16-byte-aligned local copy and passed
// directly (no bitcasts are expected for the 8-bit element case) to
// llvm.aarch64.neon.st3lane.v16i8.p0 with lane operand i64 15 and the pointer
// argument %a.
4208*207e5cccSFangrui Song void test_vst3q_lane_u8(uint8_t  *a, uint8x16x3_t b) {
4209*207e5cccSFangrui Song   vst3q_lane_u8(a, b, 15);
4210*207e5cccSFangrui Song }
4211*207e5cccSFangrui Song 
4212*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst3q_lane_u16(ptr noundef %a, [3 x <8 x i16>] alignstack(16) %b.coerce) #0 {
4213*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
4214*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
4215*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[B]], i32 0, i32 0
4216*207e5cccSFangrui Song // CHECK:   store [3 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
4217*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
4218*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
4219*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
4220*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
4221*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
4222*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
4223*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
4224*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
4225*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
4226*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
4227*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
4228*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
4229*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
4230*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
4231*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
4232*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
4233*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st3lane.v8i16.p0(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i64 7, ptr %a)
4234*207e5cccSFangrui Song // CHECK:   ret void
// Exercises vst3q_lane_u16 with constant lane index 7 (the highest lane of an
// 8-element vector). Per the FileCheck lines above, the three <8 x i16>
// members of b are loaded from a 16-byte-aligned local copy, round-tripped
// through <16 x i8> bitcasts, and passed to
// llvm.aarch64.neon.st3lane.v8i16.p0 with lane operand i64 7 and the pointer
// argument %a.
4235*207e5cccSFangrui Song void test_vst3q_lane_u16(uint16_t  *a, uint16x8x3_t b) {
4236*207e5cccSFangrui Song   vst3q_lane_u16(a, b, 7);
4237*207e5cccSFangrui Song }
4238*207e5cccSFangrui Song 
4239*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst3q_lane_u32(ptr noundef %a, [3 x <4 x i32>] alignstack(16) %b.coerce) #0 {
4240*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
4241*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
4242*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[B]], i32 0, i32 0
4243*207e5cccSFangrui Song // CHECK:   store [3 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
4244*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
4245*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
4246*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
4247*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
4248*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
4249*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
4250*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
4251*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
4252*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
4253*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
4254*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2
4255*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
4256*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
4257*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
4258*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
4259*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
4260*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st3lane.v4i32.p0(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i64 3, ptr %a)
4261*207e5cccSFangrui Song // CHECK:   ret void
// Exercises vst3q_lane_u32 with constant lane index 3 (the highest lane of a
// 4-element vector). Per the FileCheck lines above, the three <4 x i32>
// members of b are loaded from a 16-byte-aligned local copy, round-tripped
// through <16 x i8> bitcasts, and passed to
// llvm.aarch64.neon.st3lane.v4i32.p0 with lane operand i64 3 and the pointer
// argument %a.
4262*207e5cccSFangrui Song void test_vst3q_lane_u32(uint32_t  *a, uint32x4x3_t b) {
4263*207e5cccSFangrui Song   vst3q_lane_u32(a, b, 3);
4264*207e5cccSFangrui Song }
4265*207e5cccSFangrui Song 
4266*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst3q_lane_u64(ptr noundef %a, [3 x <2 x i64>] alignstack(16) %b.coerce) #0 {
4267*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.uint64x2x3_t, align 16
4268*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.uint64x2x3_t, align 16
4269*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[B]], i32 0, i32 0
4270*207e5cccSFangrui Song // CHECK:   store [3 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
4271*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
4272*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[__S1]], i32 0, i32 0
4273*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
4274*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
4275*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
4276*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[__S1]], i32 0, i32 0
4277*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
4278*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
4279*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
4280*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[__S1]], i32 0, i32 0
4281*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
4282*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
4283*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
4284*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
4285*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
4286*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
4287*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st3lane.v2i64.p0(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i64 1, ptr %a)
4288*207e5cccSFangrui Song // CHECK:   ret void
4289*207e5cccSFangrui Song void test_vst3q_lane_u64(uint64_t  *a, uint64x2x3_t b) { // Exercises vst3q_lane_u64; the expected IR above calls @llvm.aarch64.neon.st3lane.v2i64.p0.
4290*207e5cccSFangrui Song   vst3q_lane_u64(a, b, 1); // Lane 1 is expected as 'i64 1' and 'a' as 'ptr %a' in that call.
4291*207e5cccSFangrui Song }
4292*207e5cccSFangrui Song 
4293*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst3q_lane_s8(ptr noundef %a, [3 x <16 x i8>] alignstack(16) %b.coerce) #0 {
4294*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.int8x16x3_t, align 16
4295*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x3_t, align 16
4296*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[B]], i32 0, i32 0
4297*207e5cccSFangrui Song // CHECK:   store [3 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
4298*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
4299*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[__S1]], i32 0, i32 0
4300*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
4301*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
4302*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[__S1]], i32 0, i32 0
4303*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
4304*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
4305*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[__S1]], i32 0, i32 0
4306*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
4307*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
4308*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st3lane.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, ptr %a)
4309*207e5cccSFangrui Song // CHECK:   ret void
4310*207e5cccSFangrui Song void test_vst3q_lane_s8(int8_t  *a, int8x16x3_t b) { // Exercises vst3q_lane_s8; the expected IR above feeds the three <16 x i8> loads straight into @llvm.aarch64.neon.st3lane.v16i8.p0 (no bitcasts).
4311*207e5cccSFangrui Song   vst3q_lane_s8(a, b, 15); // Lane 15 is expected as 'i64 15' and 'a' as 'ptr %a' in that call.
4312*207e5cccSFangrui Song }
4313*207e5cccSFangrui Song 
4314*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst3q_lane_s16(ptr noundef %a, [3 x <8 x i16>] alignstack(16) %b.coerce) #0 {
4315*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
4316*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
4317*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[B]], i32 0, i32 0
4318*207e5cccSFangrui Song // CHECK:   store [3 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
4319*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
4320*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
4321*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
4322*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
4323*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
4324*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
4325*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
4326*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
4327*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
4328*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
4329*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
4330*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
4331*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
4332*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
4333*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
4334*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
4335*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st3lane.v8i16.p0(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i64 7, ptr %a)
4336*207e5cccSFangrui Song // CHECK:   ret void
4337*207e5cccSFangrui Song void test_vst3q_lane_s16(int16_t  *a, int16x8x3_t b) { // Exercises vst3q_lane_s16; the expected IR above calls @llvm.aarch64.neon.st3lane.v8i16.p0.
4338*207e5cccSFangrui Song   vst3q_lane_s16(a, b, 7); // Lane 7 is expected as 'i64 7' and 'a' as 'ptr %a' in that call.
4339*207e5cccSFangrui Song }
4340*207e5cccSFangrui Song 
4341*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst3q_lane_s32(ptr noundef %a, [3 x <4 x i32>] alignstack(16) %b.coerce) #0 {
4342*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
4343*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
4344*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[B]], i32 0, i32 0
4345*207e5cccSFangrui Song // CHECK:   store [3 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
4346*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
4347*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
4348*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
4349*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
4350*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
4351*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
4352*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
4353*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
4354*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
4355*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
4356*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2
4357*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
4358*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
4359*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
4360*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
4361*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
4362*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st3lane.v4i32.p0(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i64 3, ptr %a)
4363*207e5cccSFangrui Song // CHECK:   ret void
4364*207e5cccSFangrui Song void test_vst3q_lane_s32(int32_t  *a, int32x4x3_t b) { // Exercises vst3q_lane_s32; the expected IR above calls @llvm.aarch64.neon.st3lane.v4i32.p0.
4365*207e5cccSFangrui Song   vst3q_lane_s32(a, b, 3); // Lane 3 is expected as 'i64 3' and 'a' as 'ptr %a' in that call.
4366*207e5cccSFangrui Song }
4367*207e5cccSFangrui Song 
4368*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst3q_lane_s64(ptr noundef %a, [3 x <2 x i64>] alignstack(16) %b.coerce) #0 {
4369*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.int64x2x3_t, align 16
4370*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.int64x2x3_t, align 16
4371*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[B]], i32 0, i32 0
4372*207e5cccSFangrui Song // CHECK:   store [3 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
4373*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
4374*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[__S1]], i32 0, i32 0
4375*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
4376*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
4377*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
4378*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[__S1]], i32 0, i32 0
4379*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
4380*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
4381*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
4382*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[__S1]], i32 0, i32 0
4383*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
4384*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
4385*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
4386*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
4387*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
4388*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
4389*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st3lane.v2i64.p0(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i64 1, ptr %a)
4390*207e5cccSFangrui Song // CHECK:   ret void
4391*207e5cccSFangrui Song void test_vst3q_lane_s64(int64_t  *a, int64x2x3_t b) { // Exercises vst3q_lane_s64; the expected IR above calls @llvm.aarch64.neon.st3lane.v2i64.p0.
4392*207e5cccSFangrui Song   vst3q_lane_s64(a, b, 1); // Lane 1 is expected as 'i64 1' and 'a' as 'ptr %a' in that call.
4393*207e5cccSFangrui Song }
4394*207e5cccSFangrui Song 
4395*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst3q_lane_f16(ptr noundef %a, [3 x <8 x half>] alignstack(16) %b.coerce) #0 {
4396*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
4397*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
4398*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[B]], i32 0, i32 0
4399*207e5cccSFangrui Song // CHECK:   store [3 x <8 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
4400*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
4401*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
4402*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL]], i64 0, i64 0
4403*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
4404*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
4405*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
4406*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL1]], i64 0, i64 1
4407*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
4408*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
4409*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
4410*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL3]], i64 0, i64 2
4411*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
4412*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
4413*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half>
4414*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half>
4415*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x half>
4416*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st3lane.v8f16.p0(<8 x half> [[TMP9]], <8 x half> [[TMP10]], <8 x half> [[TMP11]], i64 7, ptr %a)
4417*207e5cccSFangrui Song // CHECK:   ret void
4418*207e5cccSFangrui Song void test_vst3q_lane_f16(float16_t  *a, float16x8x3_t b) { // Exercises vst3q_lane_f16; the expected IR above calls @llvm.aarch64.neon.st3lane.v8f16.p0.
4419*207e5cccSFangrui Song   vst3q_lane_f16(a, b, 7); // Lane 7 is expected as 'i64 7' and 'a' as 'ptr %a' in that call.
4420*207e5cccSFangrui Song }
4421*207e5cccSFangrui Song 
4422*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst3q_lane_f32(ptr noundef %a, [3 x <4 x float>] alignstack(16) %b.coerce) #0 {
4423*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
4424*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
4425*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[B]], i32 0, i32 0
4426*207e5cccSFangrui Song // CHECK:   store [3 x <4 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
4427*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
4428*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
4429*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL]], i64 0, i64 0
4430*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
4431*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
4432*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
4433*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL1]], i64 0, i64 1
4434*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
4435*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
4436*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
4437*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL3]], i64 0, i64 2
4438*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
4439*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
4440*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
4441*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
4442*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
4443*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st3lane.v4f32.p0(<4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x float> [[TMP11]], i64 3, ptr %a)
4444*207e5cccSFangrui Song // CHECK:   ret void
4445*207e5cccSFangrui Song void test_vst3q_lane_f32(float32_t  *a, float32x4x3_t b) { // Exercises vst3q_lane_f32; the expected IR above calls @llvm.aarch64.neon.st3lane.v4f32.p0.
4446*207e5cccSFangrui Song   vst3q_lane_f32(a, b, 3); // Lane 3 is expected as 'i64 3' and 'a' as 'ptr %a' in that call.
4447*207e5cccSFangrui Song }
4448*207e5cccSFangrui Song 
4449*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst3q_lane_f64(ptr noundef %a, [3 x <2 x double>] alignstack(16) %b.coerce) #0 {
4450*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.float64x2x3_t, align 16
4451*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.float64x2x3_t, align 16
4452*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[B]], i32 0, i32 0
4453*207e5cccSFangrui Song // CHECK:   store [3 x <2 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
4454*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
4455*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[__S1]], i32 0, i32 0
4456*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL]], i64 0, i64 0
4457*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16
4458*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
4459*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[__S1]], i32 0, i32 0
4460*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL1]], i64 0, i64 1
4461*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
4462*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
4463*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[__S1]], i32 0, i32 0
4464*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL3]], i64 0, i64 2
4465*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = load <2 x double>, ptr [[ARRAYIDX4]], align 16
4466*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <2 x double> [[TMP7]] to <16 x i8>
4467*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
4468*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
4469*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x double>
4470*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st3lane.v2f64.p0(<2 x double> [[TMP9]], <2 x double> [[TMP10]], <2 x double> [[TMP11]], i64 1, ptr %a)
4471*207e5cccSFangrui Song // CHECK:   ret void
4472*207e5cccSFangrui Song void test_vst3q_lane_f64(float64_t  *a, float64x2x3_t b) { // Exercises vst3q_lane_f64; the expected IR above calls @llvm.aarch64.neon.st3lane.v2f64.p0.
4473*207e5cccSFangrui Song   vst3q_lane_f64(a, b, 1); // Lane 1 is expected as 'i64 1' and 'a' as 'ptr %a' in that call.
4474*207e5cccSFangrui Song }
4475*207e5cccSFangrui Song 
4476*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst3q_lane_p8(ptr noundef %a, [3 x <16 x i8>] alignstack(16) %b.coerce) #0 {
4477*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16
4478*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x3_t, align 16
4479*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[B]], i32 0, i32 0
4480*207e5cccSFangrui Song // CHECK:   store [3 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
4481*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
4482*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[__S1]], i32 0, i32 0
4483*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
4484*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
4485*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[__S1]], i32 0, i32 0
4486*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
4487*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
4488*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[__S1]], i32 0, i32 0
4489*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
4490*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
4491*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st3lane.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, ptr %a)
4492*207e5cccSFangrui Song // CHECK:   ret void
4493*207e5cccSFangrui Song void test_vst3q_lane_p8(poly8_t  *a, poly8x16x3_t b) { // Exercises vst3q_lane_p8; the expected IR above feeds the three <16 x i8> loads straight into @llvm.aarch64.neon.st3lane.v16i8.p0 (no bitcasts).
4494*207e5cccSFangrui Song   vst3q_lane_p8(a, b, 15); // Lane 15 is expected as 'i64 15' and 'a' as 'ptr %a' in that call.
4495*207e5cccSFangrui Song }
4496*207e5cccSFangrui Song 
4497*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst3q_lane_p16(ptr noundef %a, [3 x <8 x i16>] alignstack(16) %b.coerce) #0 {
4498*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
4499*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
4500*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[B]], i32 0, i32 0
4501*207e5cccSFangrui Song // CHECK:   store [3 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
4502*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
4503*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
4504*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
4505*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
4506*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
4507*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
4508*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
4509*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
4510*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
4511*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
4512*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
4513*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
4514*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
4515*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
4516*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
4517*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
4518*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st3lane.v8i16.p0(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i64 7, ptr %a)
4519*207e5cccSFangrui Song // CHECK:   ret void
4520*207e5cccSFangrui Song void test_vst3q_lane_p16(poly16_t  *a, poly16x8x3_t b) { // Exercises vst3q_lane_p16; the expected IR above calls @llvm.aarch64.neon.st3lane.v8i16.p0.
4521*207e5cccSFangrui Song   vst3q_lane_p16(a, b, 7); // Lane 7 is expected as 'i64 7' and 'a' as 'ptr %a' in that call.
4522*207e5cccSFangrui Song }
4523*207e5cccSFangrui Song 
4524*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst3q_lane_p64(ptr noundef %a, [3 x <2 x i64>] alignstack(16) %b.coerce) #0 {
4525*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.poly64x2x3_t, align 16
4526*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.poly64x2x3_t, align 16
4527*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[B]], i32 0, i32 0
4528*207e5cccSFangrui Song // CHECK:   store [3 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
4529*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
4530*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[__S1]], i32 0, i32 0
4531*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
4532*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
4533*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
4534*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[__S1]], i32 0, i32 0
4535*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
4536*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
4537*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
4538*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[__S1]], i32 0, i32 0
4539*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
4540*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
4541*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
4542*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
4543*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
4544*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
4545*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st3lane.v2i64.p0(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i64 1, ptr %a)
4546*207e5cccSFangrui Song // CHECK:   ret void
4547*207e5cccSFangrui Song void test_vst3q_lane_p64(poly64_t  *a, poly64x2x3_t b) { // Exercises vst3q_lane_p64; the expected IR above calls @llvm.aarch64.neon.st3lane.v2i64.p0.
4548*207e5cccSFangrui Song   vst3q_lane_p64(a, b, 1); // Lane 1 is expected as 'i64 1' and 'a' as 'ptr %a' in that call.
4549*207e5cccSFangrui Song }
4550*207e5cccSFangrui Song 
4551*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst3_lane_u8(ptr noundef %a, [3 x <8 x i8>] alignstack(8) %b.coerce) #0 {
4552*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
4553*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
4554*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[B]], i32 0, i32 0
4555*207e5cccSFangrui Song // CHECK:   store [3 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4556*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
4557*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
4558*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
4559*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
4560*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
4561*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
4562*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
4563*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
4564*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
4565*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
4566*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st3lane.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, ptr %a)
4567*207e5cccSFangrui Song // CHECK:   ret void
4568*207e5cccSFangrui Song void test_vst3_lane_u8(uint8_t  *a, uint8x8x3_t b) { // Exercises vst3_lane_u8; the expected IR above feeds the three <8 x i8> loads straight into @llvm.aarch64.neon.st3lane.v8i8.p0 (no bitcasts).
4569*207e5cccSFangrui Song   vst3_lane_u8(a, b, 7); // Lane 7 is expected as 'i64 7' and 'a' as 'ptr %a' in that call.
4570*207e5cccSFangrui Song }
4571*207e5cccSFangrui Song 
4572*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst3_lane_u16(ptr noundef %a, [3 x <4 x i16>] alignstack(8) %b.coerce) #0 {
4573*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
4574*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
4575*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[B]], i32 0, i32 0
4576*207e5cccSFangrui Song // CHECK:   store [3 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4577*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
4578*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
4579*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
4580*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
4581*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
4582*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
4583*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
4584*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
4585*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
4586*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
4587*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
4588*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
4589*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
4590*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
4591*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
4592*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
4593*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st3lane.v4i16.p0(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i64 3, ptr %a)
4594*207e5cccSFangrui Song // CHECK:   ret void
// Test driver: calls vst3_lane_u16 on `a` and `b` with constant lane index 3,
// the highest index of a <4 x i16> vector. The FileCheck expectations above
// require the three vectors loaded from the copy of `b` to be bitcast through
// <8 x i8> and back, then passed with `i64 3` and `%a` to
// @llvm.aarch64.neon.st3lane.v4i16.p0.
4595*207e5cccSFangrui Song void test_vst3_lane_u16(uint16_t  *a, uint16x4x3_t b) {
4596*207e5cccSFangrui Song   vst3_lane_u16(a, b, 3);
4597*207e5cccSFangrui Song }
4598*207e5cccSFangrui Song 
4599*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst3_lane_u32(ptr noundef %a, [3 x <2 x i32>] alignstack(8) %b.coerce) #0 {
4600*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
4601*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
4602*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[B]], i32 0, i32 0
4603*207e5cccSFangrui Song // CHECK:   store [3 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4604*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
4605*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
4606*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
4607*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
4608*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
4609*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
4610*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
4611*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
4612*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
4613*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
4614*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2
4615*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
4616*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
4617*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
4618*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
4619*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
4620*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st3lane.v2i32.p0(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i64 1, ptr %a)
4621*207e5cccSFangrui Song // CHECK:   ret void
// Test driver: calls vst3_lane_u32 on `a` and `b` with constant lane index 1,
// the highest index of a <2 x i32> vector. The FileCheck expectations above
// require the three vectors loaded from the copy of `b` to be bitcast through
// <8 x i8> and back, then passed with `i64 1` and `%a` to
// @llvm.aarch64.neon.st3lane.v2i32.p0.
4622*207e5cccSFangrui Song void test_vst3_lane_u32(uint32_t  *a, uint32x2x3_t b) {
4623*207e5cccSFangrui Song   vst3_lane_u32(a, b, 1);
4624*207e5cccSFangrui Song }
4625*207e5cccSFangrui Song 
4626*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst3_lane_u64(ptr noundef %a, [3 x <1 x i64>] alignstack(8) %b.coerce) #0 {
4627*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.uint64x1x3_t, align 8
4628*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.uint64x1x3_t, align 8
4629*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[B]], i32 0, i32 0
4630*207e5cccSFangrui Song // CHECK:   store [3 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4631*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
4632*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[__S1]], i32 0, i32 0
4633*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
4634*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
4635*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
4636*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[__S1]], i32 0, i32 0
4637*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
4638*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
4639*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
4640*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[__S1]], i32 0, i32 0
4641*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
4642*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
4643*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
4644*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
4645*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
4646*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
4647*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st3lane.v1i64.p0(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i64 0, ptr %a)
4648*207e5cccSFangrui Song // CHECK:   ret void
// Test driver: calls vst3_lane_u64 on `a` and `b` with constant lane index 0,
// the only index of a <1 x i64> vector. The FileCheck expectations above
// require the three vectors loaded from the copy of `b` to be bitcast through
// <8 x i8> and back, then passed with `i64 0` and `%a` to
// @llvm.aarch64.neon.st3lane.v1i64.p0.
4649*207e5cccSFangrui Song void test_vst3_lane_u64(uint64_t  *a, uint64x1x3_t b) {
4650*207e5cccSFangrui Song   vst3_lane_u64(a, b, 0);
4651*207e5cccSFangrui Song }
4652*207e5cccSFangrui Song 
4653*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst3_lane_s8(ptr noundef %a, [3 x <8 x i8>] alignstack(8) %b.coerce) #0 {
4654*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
4655*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
4656*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[B]], i32 0, i32 0
4657*207e5cccSFangrui Song // CHECK:   store [3 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4658*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
4659*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
4660*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
4661*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
4662*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
4663*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
4664*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
4665*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
4666*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
4667*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
4668*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st3lane.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, ptr %a)
4669*207e5cccSFangrui Song // CHECK:   ret void
// Test driver: calls vst3_lane_s8 on `a` and `b` with constant lane index 7,
// the highest index of an <8 x i8> vector. The FileCheck expectations above
// require the three <8 x i8> vectors loaded from the copy of `b` to be passed
// directly (no bitcasts are expected for this element type) with `i64 7` and
// `%a` to @llvm.aarch64.neon.st3lane.v8i8.p0.
4670*207e5cccSFangrui Song void test_vst3_lane_s8(int8_t  *a, int8x8x3_t b) {
4671*207e5cccSFangrui Song   vst3_lane_s8(a, b, 7);
4672*207e5cccSFangrui Song }
4673*207e5cccSFangrui Song 
4674*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst3_lane_s16(ptr noundef %a, [3 x <4 x i16>] alignstack(8) %b.coerce) #0 {
4675*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
4676*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
4677*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[B]], i32 0, i32 0
4678*207e5cccSFangrui Song // CHECK:   store [3 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4679*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
4680*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
4681*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
4682*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
4683*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
4684*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
4685*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
4686*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
4687*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
4688*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
4689*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
4690*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
4691*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
4692*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
4693*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
4694*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
4695*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st3lane.v4i16.p0(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i64 3, ptr %a)
4696*207e5cccSFangrui Song // CHECK:   ret void
// Test driver: calls vst3_lane_s16 on `a` and `b` with constant lane index 3,
// the highest index of a <4 x i16> vector. The FileCheck expectations above
// require the three vectors loaded from the copy of `b` to be bitcast through
// <8 x i8> and back, then passed with `i64 3` and `%a` to
// @llvm.aarch64.neon.st3lane.v4i16.p0.
4697*207e5cccSFangrui Song void test_vst3_lane_s16(int16_t  *a, int16x4x3_t b) {
4698*207e5cccSFangrui Song   vst3_lane_s16(a, b, 3);
4699*207e5cccSFangrui Song }
4700*207e5cccSFangrui Song 
4701*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst3_lane_s32(ptr noundef %a, [3 x <2 x i32>] alignstack(8) %b.coerce) #0 {
4702*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
4703*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
4704*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[B]], i32 0, i32 0
4705*207e5cccSFangrui Song // CHECK:   store [3 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4706*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
4707*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
4708*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
4709*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
4710*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
4711*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
4712*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
4713*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
4714*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
4715*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
4716*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2
4717*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
4718*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
4719*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
4720*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
4721*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
4722*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st3lane.v2i32.p0(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i64 1, ptr %a)
4723*207e5cccSFangrui Song // CHECK:   ret void
// Test driver: calls vst3_lane_s32 on `a` and `b` with constant lane index 1,
// the highest index of a <2 x i32> vector. The FileCheck expectations above
// require the three vectors loaded from the copy of `b` to be bitcast through
// <8 x i8> and back, then passed with `i64 1` and `%a` to
// @llvm.aarch64.neon.st3lane.v2i32.p0.
4724*207e5cccSFangrui Song void test_vst3_lane_s32(int32_t  *a, int32x2x3_t b) {
4725*207e5cccSFangrui Song   vst3_lane_s32(a, b, 1);
4726*207e5cccSFangrui Song }
4727*207e5cccSFangrui Song 
4728*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst3_lane_s64(ptr noundef %a, [3 x <1 x i64>] alignstack(8) %b.coerce) #0 {
4729*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.int64x1x3_t, align 8
4730*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.int64x1x3_t, align 8
4731*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[B]], i32 0, i32 0
4732*207e5cccSFangrui Song // CHECK:   store [3 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4733*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
4734*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[__S1]], i32 0, i32 0
4735*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
4736*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
4737*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
4738*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[__S1]], i32 0, i32 0
4739*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
4740*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
4741*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
4742*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[__S1]], i32 0, i32 0
4743*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
4744*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
4745*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
4746*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
4747*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
4748*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
4749*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st3lane.v1i64.p0(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i64 0, ptr %a)
4750*207e5cccSFangrui Song // CHECK:   ret void
// Test driver: calls vst3_lane_s64 on `a` and `b` with constant lane index 0,
// the only index of a <1 x i64> vector. The FileCheck expectations above
// require the three vectors loaded from the copy of `b` to be bitcast through
// <8 x i8> and back, then passed with `i64 0` and `%a` to
// @llvm.aarch64.neon.st3lane.v1i64.p0.
4751*207e5cccSFangrui Song void test_vst3_lane_s64(int64_t  *a, int64x1x3_t b) {
4752*207e5cccSFangrui Song   vst3_lane_s64(a, b, 0);
4753*207e5cccSFangrui Song }
4754*207e5cccSFangrui Song 
4755*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst3_lane_f16(ptr noundef %a, [3 x <4 x half>] alignstack(8) %b.coerce) #0 {
4756*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
4757*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
4758*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[B]], i32 0, i32 0
4759*207e5cccSFangrui Song // CHECK:   store [3 x <4 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4760*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
4761*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
4762*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL]], i64 0, i64 0
4763*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
4764*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
4765*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
4766*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL1]], i64 0, i64 1
4767*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
4768*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
4769*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
4770*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL3]], i64 0, i64 2
4771*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
4772*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
4773*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half>
4774*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half>
4775*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x half>
4776*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st3lane.v4f16.p0(<4 x half> [[TMP9]], <4 x half> [[TMP10]], <4 x half> [[TMP11]], i64 3, ptr %a)
4777*207e5cccSFangrui Song // CHECK:   ret void
// Test driver: calls vst3_lane_f16 on `a` and `b` with constant lane index 3,
// the highest index of a <4 x half> vector. The FileCheck expectations above
// require the three vectors loaded from the copy of `b` to be bitcast through
// <8 x i8> and back, then passed with `i64 3` and `%a` to
// @llvm.aarch64.neon.st3lane.v4f16.p0.
4778*207e5cccSFangrui Song void test_vst3_lane_f16(float16_t  *a, float16x4x3_t b) {
4779*207e5cccSFangrui Song   vst3_lane_f16(a, b, 3);
4780*207e5cccSFangrui Song }
4781*207e5cccSFangrui Song 
4782*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst3_lane_f32(ptr noundef %a, [3 x <2 x float>] alignstack(8) %b.coerce) #0 {
4783*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
4784*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
4785*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[B]], i32 0, i32 0
4786*207e5cccSFangrui Song // CHECK:   store [3 x <2 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4787*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
4788*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
4789*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL]], i64 0, i64 0
4790*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
4791*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
4792*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
4793*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL1]], i64 0, i64 1
4794*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
4795*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
4796*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
4797*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL3]], i64 0, i64 2
4798*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
4799*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
4800*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
4801*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
4802*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
4803*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st3lane.v2f32.p0(<2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x float> [[TMP11]], i64 1, ptr %a)
4804*207e5cccSFangrui Song // CHECK:   ret void
// Test driver: calls vst3_lane_f32 on `a` and `b` with constant lane index 1,
// the highest index of a <2 x float> vector. The FileCheck expectations above
// require the three vectors loaded from the copy of `b` to be bitcast through
// <8 x i8> and back, then passed with `i64 1` and `%a` to
// @llvm.aarch64.neon.st3lane.v2f32.p0.
4805*207e5cccSFangrui Song void test_vst3_lane_f32(float32_t  *a, float32x2x3_t b) {
4806*207e5cccSFangrui Song   vst3_lane_f32(a, b, 1);
4807*207e5cccSFangrui Song }
4808*207e5cccSFangrui Song 
4809*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst3_lane_f64(ptr noundef %a, [3 x <1 x double>] alignstack(8) %b.coerce) #0 {
4810*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.float64x1x3_t, align 8
4811*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.float64x1x3_t, align 8
4812*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[B]], i32 0, i32 0
4813*207e5cccSFangrui Song // CHECK:   store [3 x <1 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4814*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
4815*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[__S1]], i32 0, i32 0
4816*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL]], i64 0, i64 0
4817*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8
4818*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
4819*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[__S1]], i32 0, i32 0
4820*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL1]], i64 0, i64 1
4821*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
4822*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
4823*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[__S1]], i32 0, i32 0
4824*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL3]], i64 0, i64 2
4825*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = load <1 x double>, ptr [[ARRAYIDX4]], align 8
4826*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <1 x double> [[TMP7]] to <8 x i8>
4827*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
4828*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
4829*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x double>
4830*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st3lane.v1f64.p0(<1 x double> [[TMP9]], <1 x double> [[TMP10]], <1 x double> [[TMP11]], i64 0, ptr %a)
4831*207e5cccSFangrui Song // CHECK:   ret void
// Test driver: calls vst3_lane_f64 on `a` and `b` with constant lane index 0,
// the only index of a <1 x double> vector. The FileCheck expectations above
// require the three vectors loaded from the copy of `b` to be bitcast through
// <8 x i8> and back, then passed with `i64 0` and `%a` to
// @llvm.aarch64.neon.st3lane.v1f64.p0.
4832*207e5cccSFangrui Song void test_vst3_lane_f64(float64_t  *a, float64x1x3_t b) {
4833*207e5cccSFangrui Song   vst3_lane_f64(a, b, 0);
4834*207e5cccSFangrui Song }
4835*207e5cccSFangrui Song 
4836*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst3_lane_p8(ptr noundef %a, [3 x <8 x i8>] alignstack(8) %b.coerce) #0 {
4837*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
4838*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
4839*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[B]], i32 0, i32 0
4840*207e5cccSFangrui Song // CHECK:   store [3 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4841*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
4842*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
4843*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
4844*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
4845*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
4846*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
4847*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
4848*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
4849*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
4850*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
4851*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st3lane.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, ptr %a)
4852*207e5cccSFangrui Song // CHECK:   ret void
// Test driver: calls vst3_lane_p8 on `a` and `b` with constant lane index 7,
// the highest index of an <8 x i8> vector. The FileCheck expectations above
// require the three <8 x i8> vectors loaded from the copy of `b` to be passed
// directly (no bitcasts are expected for this element type) with `i64 7` and
// `%a` to @llvm.aarch64.neon.st3lane.v8i8.p0.
4853*207e5cccSFangrui Song void test_vst3_lane_p8(poly8_t  *a, poly8x8x3_t b) {
4854*207e5cccSFangrui Song   vst3_lane_p8(a, b, 7);
4855*207e5cccSFangrui Song }
4856*207e5cccSFangrui Song 
4857*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst3_lane_p16(ptr noundef %a, [3 x <4 x i16>] alignstack(8) %b.coerce) #0 {
4858*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
4859*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
4860*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[B]], i32 0, i32 0
4861*207e5cccSFangrui Song // CHECK:   store [3 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4862*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
4863*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
4864*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
4865*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
4866*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
4867*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
4868*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
4869*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
4870*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
4871*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
4872*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
4873*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
4874*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
4875*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
4876*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
4877*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
4878*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st3lane.v4i16.p0(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i64 3, ptr %a)
4879*207e5cccSFangrui Song // CHECK:   ret void
// Test driver: calls vst3_lane_p16 on `a` and `b` with constant lane index 3,
// the highest index of a <4 x i16> vector. The FileCheck expectations above
// require the three vectors loaded from the copy of `b` to be bitcast through
// <8 x i8> and back, then passed with `i64 3` and `%a` to
// @llvm.aarch64.neon.st3lane.v4i16.p0.
4880*207e5cccSFangrui Song void test_vst3_lane_p16(poly16_t  *a, poly16x4x3_t b) {
4881*207e5cccSFangrui Song   vst3_lane_p16(a, b, 3);
4882*207e5cccSFangrui Song }
4883*207e5cccSFangrui Song 
4884*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst3_lane_p64(ptr noundef %a, [3 x <1 x i64>] alignstack(8) %b.coerce) #0 {
4885*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.poly64x1x3_t, align 8
4886*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.poly64x1x3_t, align 8
4887*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[B]], i32 0, i32 0
4888*207e5cccSFangrui Song // CHECK:   store [3 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4889*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
4890*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[__S1]], i32 0, i32 0
4891*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
4892*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
4893*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
4894*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[__S1]], i32 0, i32 0
4895*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
4896*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
4897*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
4898*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[__S1]], i32 0, i32 0
4899*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
4900*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
4901*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
4902*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
4903*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
4904*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
4905*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st3lane.v1i64.p0(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i64 0, ptr %a)
4906*207e5cccSFangrui Song // CHECK:   ret void
// Test driver: calls vst3_lane_p64 on `a` and `b` with constant lane index 0,
// the only index of a <1 x i64> vector. The FileCheck expectations above
// require the three vectors loaded from the copy of `b` to be bitcast through
// <8 x i8> and back, then passed with `i64 0` and `%a` to
// @llvm.aarch64.neon.st3lane.v1i64.p0.
4907*207e5cccSFangrui Song void test_vst3_lane_p64(poly64_t  *a, poly64x1x3_t b) {
4908*207e5cccSFangrui Song   vst3_lane_p64(a, b, 0);
4909*207e5cccSFangrui Song }
4910*207e5cccSFangrui Song 
4911*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst4q_lane_u8(ptr noundef %a, [4 x <16 x i8>] alignstack(16) %b.coerce) #0 {
4912*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16
4913*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x4_t, align 16
4914*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[B]], i32 0, i32 0
4915*207e5cccSFangrui Song // CHECK:   store [4 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
4916*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
4917*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
4918*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
4919*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
4920*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
4921*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
4922*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
4923*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
4924*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
4925*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
4926*207e5cccSFangrui Song // CHECK:   [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
4927*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i64 0, i64 3
4928*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
4929*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st4lane.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, ptr %a)
4930*207e5cccSFangrui Song // CHECK:   ret void
// Codegen test for vst4q_lane_u8 with lane index 15, the last lane of a
// <16 x i8> vector. The FileCheck directives above expect the four vectors
// loaded from the by-value struct copy to be passed directly (no bitcasts)
// to @llvm.aarch64.neon.st4lane.v16i8.p0 with lane operand i64 15 and the
// destination pointer %a.
4931*207e5cccSFangrui Song void test_vst4q_lane_u8(uint8_t  *a, uint8x16x4_t b) {
4932*207e5cccSFangrui Song   vst4q_lane_u8(a, b, 15);
4933*207e5cccSFangrui Song }
4934*207e5cccSFangrui Song 
4935*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst4q_lane_u16(ptr noundef %a, [4 x <8 x i16>] alignstack(16) %b.coerce) #0 {
4936*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
4937*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
4938*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[B]], i32 0, i32 0
4939*207e5cccSFangrui Song // CHECK:   store [4 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
4940*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
4941*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
4942*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
4943*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
4944*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
4945*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
4946*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
4947*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
4948*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
4949*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
4950*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
4951*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
4952*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
4953*207e5cccSFangrui Song // CHECK:   [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
4954*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i64 0, i64 3
4955*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
4956*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
4957*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
4958*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
4959*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
4960*207e5cccSFangrui Song // CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
4961*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st4lane.v8i16.p0(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i64 7, ptr %a)
4962*207e5cccSFangrui Song // CHECK:   ret void
// Codegen test for vst4q_lane_u16 with lane index 7, the last lane of an
// <8 x i16> vector. The FileCheck directives above expect each of the four
// loaded vectors to be bitcast to <16 x i8> and back to <8 x i16>, followed
// by a call to @llvm.aarch64.neon.st4lane.v8i16.p0 with lane operand i64 7
// and the destination pointer %a.
4963*207e5cccSFangrui Song void test_vst4q_lane_u16(uint16_t  *a, uint16x8x4_t b) {
4964*207e5cccSFangrui Song   vst4q_lane_u16(a, b, 7);
4965*207e5cccSFangrui Song }
4966*207e5cccSFangrui Song 
4967*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst4q_lane_u32(ptr noundef %a, [4 x <4 x i32>] alignstack(16) %b.coerce) #0 {
4968*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
4969*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
4970*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[B]], i32 0, i32 0
4971*207e5cccSFangrui Song // CHECK:   store [4 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
4972*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
4973*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
4974*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
4975*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
4976*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
4977*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
4978*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
4979*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
4980*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
4981*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
4982*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2
4983*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
4984*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
4985*207e5cccSFangrui Song // CHECK:   [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
4986*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i64 0, i64 3
4987*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
4988*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
4989*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
4990*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
4991*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
4992*207e5cccSFangrui Song // CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
4993*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st4lane.v4i32.p0(<4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], i64 3, ptr %a)
4994*207e5cccSFangrui Song // CHECK:   ret void
// Codegen test for vst4q_lane_u32 with lane index 3, the last lane of a
// <4 x i32> vector. The FileCheck directives above expect each of the four
// loaded vectors to be bitcast to <16 x i8> and back to <4 x i32>, followed
// by a call to @llvm.aarch64.neon.st4lane.v4i32.p0 with lane operand i64 3
// and the destination pointer %a.
4995*207e5cccSFangrui Song void test_vst4q_lane_u32(uint32_t  *a, uint32x4x4_t b) {
4996*207e5cccSFangrui Song   vst4q_lane_u32(a, b, 3);
4997*207e5cccSFangrui Song }
4998*207e5cccSFangrui Song 
4999*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst4q_lane_u64(ptr noundef %a, [4 x <2 x i64>] alignstack(16) %b.coerce) #0 {
5000*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.uint64x2x4_t, align 16
5001*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.uint64x2x4_t, align 16
5002*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[B]], i32 0, i32 0
5003*207e5cccSFangrui Song // CHECK:   store [4 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
5004*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
5005*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0
5006*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
5007*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
5008*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
5009*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0
5010*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
5011*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
5012*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
5013*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0
5014*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
5015*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
5016*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
5017*207e5cccSFangrui Song // CHECK:   [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0
5018*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL5]], i64 0, i64 3
5019*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16
5020*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
5021*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
5022*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
5023*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
5024*207e5cccSFangrui Song // CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
5025*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st4lane.v2i64.p0(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i64 1, ptr %a)
5026*207e5cccSFangrui Song // CHECK:   ret void
// Codegen test for vst4q_lane_u64 with lane index 1, the last lane of a
// <2 x i64> vector. The FileCheck directives above expect each of the four
// loaded vectors to be bitcast to <16 x i8> and back to <2 x i64>, followed
// by a call to @llvm.aarch64.neon.st4lane.v2i64.p0 with lane operand i64 1
// and the destination pointer %a.
5027*207e5cccSFangrui Song void test_vst4q_lane_u64(uint64_t  *a, uint64x2x4_t b) {
5028*207e5cccSFangrui Song   vst4q_lane_u64(a, b, 1);
5029*207e5cccSFangrui Song }
5030*207e5cccSFangrui Song 
5031*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst4q_lane_s8(ptr noundef %a, [4 x <16 x i8>] alignstack(16) %b.coerce) #0 {
5032*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.int8x16x4_t, align 16
5033*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x4_t, align 16
5034*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[B]], i32 0, i32 0
5035*207e5cccSFangrui Song // CHECK:   store [4 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
5036*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
5037*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
5038*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
5039*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
5040*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
5041*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
5042*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
5043*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
5044*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
5045*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
5046*207e5cccSFangrui Song // CHECK:   [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
5047*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i64 0, i64 3
5048*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
5049*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st4lane.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, ptr %a)
5050*207e5cccSFangrui Song // CHECK:   ret void
// Codegen test for vst4q_lane_s8 with lane index 15, the last lane of a
// <16 x i8> vector. The FileCheck directives above expect the four vectors
// loaded from the by-value struct copy to be passed directly (no bitcasts)
// to @llvm.aarch64.neon.st4lane.v16i8.p0 with lane operand i64 15 and the
// destination pointer %a.
5051*207e5cccSFangrui Song void test_vst4q_lane_s8(int8_t  *a, int8x16x4_t b) {
5052*207e5cccSFangrui Song   vst4q_lane_s8(a, b, 15);
5053*207e5cccSFangrui Song }
5054*207e5cccSFangrui Song 
5055*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst4q_lane_s16(ptr noundef %a, [4 x <8 x i16>] alignstack(16) %b.coerce) #0 {
5056*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
5057*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
5058*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[B]], i32 0, i32 0
5059*207e5cccSFangrui Song // CHECK:   store [4 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
5060*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
5061*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
5062*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
5063*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
5064*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
5065*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
5066*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
5067*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
5068*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
5069*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
5070*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
5071*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
5072*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
5073*207e5cccSFangrui Song // CHECK:   [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
5074*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i64 0, i64 3
5075*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
5076*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
5077*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
5078*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
5079*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
5080*207e5cccSFangrui Song // CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
5081*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st4lane.v8i16.p0(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i64 7, ptr %a)
5082*207e5cccSFangrui Song // CHECK:   ret void
// Codegen test for vst4q_lane_s16 with lane index 7, the last lane of an
// <8 x i16> vector. The FileCheck directives above expect each of the four
// loaded vectors to be bitcast to <16 x i8> and back to <8 x i16>, followed
// by a call to @llvm.aarch64.neon.st4lane.v8i16.p0 with lane operand i64 7
// and the destination pointer %a.
5083*207e5cccSFangrui Song void test_vst4q_lane_s16(int16_t  *a, int16x8x4_t b) {
5084*207e5cccSFangrui Song   vst4q_lane_s16(a, b, 7);
5085*207e5cccSFangrui Song }
5086*207e5cccSFangrui Song 
5087*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst4q_lane_s32(ptr noundef %a, [4 x <4 x i32>] alignstack(16) %b.coerce) #0 {
5088*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
5089*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
5090*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[B]], i32 0, i32 0
5091*207e5cccSFangrui Song // CHECK:   store [4 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
5092*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
5093*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
5094*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
5095*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
5096*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
5097*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
5098*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
5099*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
5100*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
5101*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
5102*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2
5103*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
5104*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
5105*207e5cccSFangrui Song // CHECK:   [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
5106*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i64 0, i64 3
5107*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
5108*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
5109*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
5110*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
5111*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
5112*207e5cccSFangrui Song // CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
5113*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st4lane.v4i32.p0(<4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], i64 3, ptr %a)
5114*207e5cccSFangrui Song // CHECK:   ret void
// Codegen test for vst4q_lane_s32 with lane index 3, the last lane of a
// <4 x i32> vector. The FileCheck directives above expect each of the four
// loaded vectors to be bitcast to <16 x i8> and back to <4 x i32>, followed
// by a call to @llvm.aarch64.neon.st4lane.v4i32.p0 with lane operand i64 3
// and the destination pointer %a.
5115*207e5cccSFangrui Song void test_vst4q_lane_s32(int32_t  *a, int32x4x4_t b) {
5116*207e5cccSFangrui Song   vst4q_lane_s32(a, b, 3);
5117*207e5cccSFangrui Song }
5118*207e5cccSFangrui Song 
5119*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst4q_lane_s64(ptr noundef %a, [4 x <2 x i64>] alignstack(16) %b.coerce) #0 {
5120*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.int64x2x4_t, align 16
5121*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.int64x2x4_t, align 16
5122*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[B]], i32 0, i32 0
5123*207e5cccSFangrui Song // CHECK:   store [4 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
5124*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
5125*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0
5126*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
5127*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
5128*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
5129*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0
5130*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
5131*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
5132*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
5133*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0
5134*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
5135*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
5136*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
5137*207e5cccSFangrui Song // CHECK:   [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0
5138*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL5]], i64 0, i64 3
5139*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16
5140*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
5141*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
5142*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
5143*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
5144*207e5cccSFangrui Song // CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
5145*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st4lane.v2i64.p0(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i64 1, ptr %a)
5146*207e5cccSFangrui Song // CHECK:   ret void
// Codegen test for vst4q_lane_s64 with lane index 1, the last lane of a
// <2 x i64> vector. The FileCheck directives above expect each of the four
// loaded vectors to be bitcast to <16 x i8> and back to <2 x i64>, followed
// by a call to @llvm.aarch64.neon.st4lane.v2i64.p0 with lane operand i64 1
// and the destination pointer %a.
5147*207e5cccSFangrui Song void test_vst4q_lane_s64(int64_t  *a, int64x2x4_t b) {
5148*207e5cccSFangrui Song   vst4q_lane_s64(a, b, 1);
5149*207e5cccSFangrui Song }
5150*207e5cccSFangrui Song 
5151*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst4q_lane_f16(ptr noundef %a, [4 x <8 x half>] alignstack(16) %b.coerce) #0 {
5152*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
5153*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
5154*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[B]], i32 0, i32 0
5155*207e5cccSFangrui Song // CHECK:   store [4 x <8 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
5156*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
5157*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
5158*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL]], i64 0, i64 0
5159*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
5160*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
5161*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
5162*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL1]], i64 0, i64 1
5163*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
5164*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
5165*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
5166*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL3]], i64 0, i64 2
5167*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
5168*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
5169*207e5cccSFangrui Song // CHECK:   [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
5170*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL5]], i64 0, i64 3
5171*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = load <8 x half>, ptr [[ARRAYIDX6]], align 16
5172*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8>
5173*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half>
5174*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half>
5175*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x half>
5176*207e5cccSFangrui Song // CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x half>
5177*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st4lane.v8f16.p0(<8 x half> [[TMP11]], <8 x half> [[TMP12]], <8 x half> [[TMP13]], <8 x half> [[TMP14]], i64 7, ptr %a)
5178*207e5cccSFangrui Song // CHECK:   ret void
// Codegen test for vst4q_lane_f16 with lane index 7, the last lane of an
// <8 x half> vector. The FileCheck directives above expect each of the four
// loaded vectors to be bitcast to <16 x i8> and back to <8 x half>, followed
// by a call to @llvm.aarch64.neon.st4lane.v8f16.p0 with lane operand i64 7
// and the destination pointer %a.
5179*207e5cccSFangrui Song void test_vst4q_lane_f16(float16_t  *a, float16x8x4_t b) {
5180*207e5cccSFangrui Song   vst4q_lane_f16(a, b, 7);
5181*207e5cccSFangrui Song }
5182*207e5cccSFangrui Song 
5183*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst4q_lane_f32(ptr noundef %a, [4 x <4 x float>] alignstack(16) %b.coerce) #0 {
5184*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
5185*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
5186*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[B]], i32 0, i32 0
5187*207e5cccSFangrui Song // CHECK:   store [4 x <4 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
5188*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
5189*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
5190*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL]], i64 0, i64 0
5191*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
5192*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
5193*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
5194*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL1]], i64 0, i64 1
5195*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
5196*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
5197*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
5198*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL3]], i64 0, i64 2
5199*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
5200*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
5201*207e5cccSFangrui Song // CHECK:   [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
5202*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL5]], i64 0, i64 3
5203*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = load <4 x float>, ptr [[ARRAYIDX6]], align 16
5204*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <4 x float> [[TMP9]] to <16 x i8>
5205*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
5206*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
5207*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
5208*207e5cccSFangrui Song // CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x float>
5209*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st4lane.v4f32.p0(<4 x float> [[TMP11]], <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], i64 3, ptr %a)
5210*207e5cccSFangrui Song // CHECK:   ret void
// Codegen test for vst4q_lane_f32 with lane index 3, the last lane of a
// <4 x float> vector. The FileCheck directives above expect each of the four
// loaded vectors to be bitcast to <16 x i8> and back to <4 x float>, followed
// by a call to @llvm.aarch64.neon.st4lane.v4f32.p0 with lane operand i64 3
// and the destination pointer %a.
5211*207e5cccSFangrui Song void test_vst4q_lane_f32(float32_t  *a, float32x4x4_t b) {
5212*207e5cccSFangrui Song   vst4q_lane_f32(a, b, 3);
5213*207e5cccSFangrui Song }
5214*207e5cccSFangrui Song 
5215*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst4q_lane_f64(ptr noundef %a, [4 x <2 x double>] alignstack(16) %b.coerce) #0 {
5216*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.float64x2x4_t, align 16
5217*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.float64x2x4_t, align 16
5218*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[B]], i32 0, i32 0
5219*207e5cccSFangrui Song // CHECK:   store [4 x <2 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
5220*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
5221*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0
5222*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL]], i64 0, i64 0
5223*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16
5224*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
5225*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0
5226*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL1]], i64 0, i64 1
5227*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
5228*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
5229*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0
5230*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL3]], i64 0, i64 2
5231*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = load <2 x double>, ptr [[ARRAYIDX4]], align 16
5232*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <2 x double> [[TMP7]] to <16 x i8>
5233*207e5cccSFangrui Song // CHECK:   [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0
5234*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL5]], i64 0, i64 3
5235*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = load <2 x double>, ptr [[ARRAYIDX6]], align 16
5236*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <2 x double> [[TMP9]] to <16 x i8>
5237*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
5238*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
5239*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x double>
5240*207e5cccSFangrui Song // CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x double>
5241*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st4lane.v2f64.p0(<2 x double> [[TMP11]], <2 x double> [[TMP12]], <2 x double> [[TMP13]], <2 x double> [[TMP14]], i64 1, ptr %a)
5242*207e5cccSFangrui Song // CHECK:   ret void
// Codegen test: calls vst4q_lane_f64(a, b, 1). The FileCheck expectations
// above require b to arrive as [4 x <2 x double>], its four elements to be
// loaded from a 64-byte copy, and the call to be emitted as
// @llvm.aarch64.neon.st4lane.v2f64.p0 with lane operand i64 1 and ptr %a.
5243*207e5cccSFangrui Song void test_vst4q_lane_f64(float64_t  *a, float64x2x4_t b) {
5244*207e5cccSFangrui Song   vst4q_lane_f64(a, b, 1);
5245*207e5cccSFangrui Song }
5246*207e5cccSFangrui Song 
5247*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst4q_lane_p8(ptr noundef %a, [4 x <16 x i8>] alignstack(16) %b.coerce) #0 {
5248*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16
5249*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x4_t, align 16
5250*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[B]], i32 0, i32 0
5251*207e5cccSFangrui Song // CHECK:   store [4 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
5252*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
5253*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
5254*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
5255*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
5256*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
5257*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
5258*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
5259*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
5260*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
5261*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
5262*207e5cccSFangrui Song // CHECK:   [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
5263*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i64 0, i64 3
5264*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
5265*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st4lane.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, ptr %a)
5266*207e5cccSFangrui Song // CHECK:   ret void
// Codegen test: calls vst4q_lane_p8(a, b, 15). The FileCheck expectations
// above require b to arrive as [4 x <16 x i8>], its four elements to be
// loaded and passed directly (no bitcasts are expected for this i8 variant),
// and the call to be emitted as @llvm.aarch64.neon.st4lane.v16i8.p0 with
// lane operand i64 15 and ptr %a.
5267*207e5cccSFangrui Song void test_vst4q_lane_p8(poly8_t  *a, poly8x16x4_t b) {
5268*207e5cccSFangrui Song   vst4q_lane_p8(a, b, 15);
5269*207e5cccSFangrui Song }
5270*207e5cccSFangrui Song 
5271*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst4q_lane_p16(ptr noundef %a, [4 x <8 x i16>] alignstack(16) %b.coerce) #0 {
5272*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
5273*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
5274*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[B]], i32 0, i32 0
5275*207e5cccSFangrui Song // CHECK:   store [4 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
5276*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
5277*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
5278*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
5279*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
5280*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
5281*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
5282*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
5283*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
5284*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
5285*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
5286*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
5287*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
5288*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
5289*207e5cccSFangrui Song // CHECK:   [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
5290*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i64 0, i64 3
5291*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
5292*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
5293*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
5294*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
5295*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
5296*207e5cccSFangrui Song // CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
5297*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st4lane.v8i16.p0(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i64 7, ptr %a)
5298*207e5cccSFangrui Song // CHECK:   ret void
// Codegen test: calls vst4q_lane_p16(a, b, 7). The FileCheck expectations
// above require b to arrive as [4 x <8 x i16>], each element to be bitcast
// to <16 x i8> and back to <8 x i16>, and the call to be emitted as
// @llvm.aarch64.neon.st4lane.v8i16.p0 with lane operand i64 7 and ptr %a.
5299*207e5cccSFangrui Song void test_vst4q_lane_p16(poly16_t  *a, poly16x8x4_t b) {
5300*207e5cccSFangrui Song   vst4q_lane_p16(a, b, 7);
5301*207e5cccSFangrui Song }
5302*207e5cccSFangrui Song 
5303*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst4q_lane_p64(ptr noundef %a, [4 x <2 x i64>] alignstack(16) %b.coerce) #0 {
5304*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.poly64x2x4_t, align 16
5305*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.poly64x2x4_t, align 16
5306*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[B]], i32 0, i32 0
5307*207e5cccSFangrui Song // CHECK:   store [4 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
5308*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
5309*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0
5310*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
5311*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
5312*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
5313*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0
5314*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
5315*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
5316*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
5317*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0
5318*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
5319*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
5320*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
5321*207e5cccSFangrui Song // CHECK:   [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0
5322*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL5]], i64 0, i64 3
5323*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16
5324*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
5325*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
5326*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
5327*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
5328*207e5cccSFangrui Song // CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
5329*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st4lane.v2i64.p0(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i64 1, ptr %a)
5330*207e5cccSFangrui Song // CHECK:   ret void
// Codegen test: calls vst4q_lane_p64(a, b, 1). The FileCheck expectations
// above require b to arrive as [4 x <2 x i64>], each element to be bitcast
// to <16 x i8> and back to <2 x i64>, and the call to be emitted as
// @llvm.aarch64.neon.st4lane.v2i64.p0 with lane operand i64 1 and ptr %a.
5331*207e5cccSFangrui Song void test_vst4q_lane_p64(poly64_t  *a, poly64x2x4_t b) {
5332*207e5cccSFangrui Song   vst4q_lane_p64(a, b, 1);
5333*207e5cccSFangrui Song }
5334*207e5cccSFangrui Song 
5335*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst4_lane_u8(ptr noundef %a, [4 x <8 x i8>] alignstack(8) %b.coerce) #0 {
5336*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
5337*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
5338*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[B]], i32 0, i32 0
5339*207e5cccSFangrui Song // CHECK:   store [4 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
5340*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
5341*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
5342*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
5343*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
5344*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
5345*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
5346*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
5347*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
5348*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
5349*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
5350*207e5cccSFangrui Song // CHECK:   [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
5351*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i64 0, i64 3
5352*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
5353*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st4lane.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, ptr %a)
5354*207e5cccSFangrui Song // CHECK:   ret void
// Codegen test: calls vst4_lane_u8(a, b, 7). The FileCheck expectations
// above require b to arrive as [4 x <8 x i8>] (8-byte aligned, 32-byte copy),
// its four elements to be loaded and passed directly without bitcasts, and
// the call to be emitted as @llvm.aarch64.neon.st4lane.v8i8.p0 with lane
// operand i64 7 and ptr %a.
5355*207e5cccSFangrui Song void test_vst4_lane_u8(uint8_t  *a, uint8x8x4_t b) {
5356*207e5cccSFangrui Song   vst4_lane_u8(a, b, 7);
5357*207e5cccSFangrui Song }
5358*207e5cccSFangrui Song 
5359*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst4_lane_u16(ptr noundef %a, [4 x <4 x i16>] alignstack(8) %b.coerce) #0 {
5360*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
5361*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
5362*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[B]], i32 0, i32 0
5363*207e5cccSFangrui Song // CHECK:   store [4 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
5364*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
5365*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
5366*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
5367*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
5368*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
5369*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
5370*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
5371*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
5372*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
5373*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
5374*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
5375*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
5376*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
5377*207e5cccSFangrui Song // CHECK:   [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
5378*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i64 0, i64 3
5379*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
5380*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
5381*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
5382*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
5383*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
5384*207e5cccSFangrui Song // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
5385*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st4lane.v4i16.p0(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i64 3, ptr %a)
5386*207e5cccSFangrui Song // CHECK:   ret void
// Codegen test: calls vst4_lane_u16(a, b, 3). The FileCheck expectations
// above require b to arrive as [4 x <4 x i16>], each element to be bitcast
// to <8 x i8> and back to <4 x i16>, and the call to be emitted as
// @llvm.aarch64.neon.st4lane.v4i16.p0 with lane operand i64 3 and ptr %a.
5387*207e5cccSFangrui Song void test_vst4_lane_u16(uint16_t  *a, uint16x4x4_t b) {
5388*207e5cccSFangrui Song   vst4_lane_u16(a, b, 3);
5389*207e5cccSFangrui Song }
5390*207e5cccSFangrui Song 
5391*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst4_lane_u32(ptr noundef %a, [4 x <2 x i32>] alignstack(8) %b.coerce) #0 {
5392*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
5393*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
5394*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[B]], i32 0, i32 0
5395*207e5cccSFangrui Song // CHECK:   store [4 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
5396*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
5397*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
5398*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
5399*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
5400*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
5401*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
5402*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
5403*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
5404*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
5405*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
5406*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2
5407*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
5408*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
5409*207e5cccSFangrui Song // CHECK:   [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
5410*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i64 0, i64 3
5411*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
5412*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
5413*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
5414*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
5415*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
5416*207e5cccSFangrui Song // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
5417*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st4lane.v2i32.p0(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], i64 1, ptr %a)
5418*207e5cccSFangrui Song // CHECK:   ret void
// Codegen test: calls vst4_lane_u32(a, b, 1). The FileCheck expectations
// above require b to arrive as [4 x <2 x i32>], each element to be bitcast
// to <8 x i8> and back to <2 x i32>, and the call to be emitted as
// @llvm.aarch64.neon.st4lane.v2i32.p0 with lane operand i64 1 and ptr %a.
5419*207e5cccSFangrui Song void test_vst4_lane_u32(uint32_t  *a, uint32x2x4_t b) {
5420*207e5cccSFangrui Song   vst4_lane_u32(a, b, 1);
5421*207e5cccSFangrui Song }
5422*207e5cccSFangrui Song 
5423*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst4_lane_u64(ptr noundef %a, [4 x <1 x i64>] alignstack(8) %b.coerce) #0 {
5424*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.uint64x1x4_t, align 8
5425*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.uint64x1x4_t, align 8
5426*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[B]], i32 0, i32 0
5427*207e5cccSFangrui Song // CHECK:   store [4 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
5428*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
5429*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
5430*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
5431*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
5432*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
5433*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
5434*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
5435*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
5436*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
5437*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
5438*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
5439*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
5440*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
5441*207e5cccSFangrui Song // CHECK:   [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
5442*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], i64 0, i64 3
5443*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
5444*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
5445*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
5446*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
5447*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
5448*207e5cccSFangrui Song // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
5449*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st4lane.v1i64.p0(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i64 0, ptr %a)
5450*207e5cccSFangrui Song // CHECK:   ret void
// Codegen test: calls vst4_lane_u64(a, b, 0). The FileCheck expectations
// above require b to arrive as [4 x <1 x i64>], each element to be bitcast
// to <8 x i8> and back to <1 x i64>, and the call to be emitted as
// @llvm.aarch64.neon.st4lane.v1i64.p0 with lane operand i64 0 and ptr %a.
5451*207e5cccSFangrui Song void test_vst4_lane_u64(uint64_t  *a, uint64x1x4_t b) {
5452*207e5cccSFangrui Song   vst4_lane_u64(a, b, 0);
5453*207e5cccSFangrui Song }
5454*207e5cccSFangrui Song 
5455*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst4_lane_s8(ptr noundef %a, [4 x <8 x i8>] alignstack(8) %b.coerce) #0 {
5456*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
5457*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
5458*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[B]], i32 0, i32 0
5459*207e5cccSFangrui Song // CHECK:   store [4 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
5460*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
5461*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
5462*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
5463*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
5464*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
5465*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
5466*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
5467*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
5468*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
5469*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
5470*207e5cccSFangrui Song // CHECK:   [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
5471*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i64 0, i64 3
5472*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
5473*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st4lane.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, ptr %a)
5474*207e5cccSFangrui Song // CHECK:   ret void
// Codegen test: calls vst4_lane_s8(a, b, 7). The FileCheck expectations
// above require b to arrive as [4 x <8 x i8>], its four elements to be
// loaded and passed directly without bitcasts, and the call to be emitted as
// @llvm.aarch64.neon.st4lane.v8i8.p0 with lane operand i64 7 and ptr %a.
5475*207e5cccSFangrui Song void test_vst4_lane_s8(int8_t  *a, int8x8x4_t b) {
5476*207e5cccSFangrui Song   vst4_lane_s8(a, b, 7);
5477*207e5cccSFangrui Song }
5478*207e5cccSFangrui Song 
5479*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst4_lane_s16(ptr noundef %a, [4 x <4 x i16>] alignstack(8) %b.coerce) #0 {
5480*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
5481*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
5482*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[B]], i32 0, i32 0
5483*207e5cccSFangrui Song // CHECK:   store [4 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
5484*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
5485*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
5486*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
5487*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
5488*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
5489*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
5490*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
5491*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
5492*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
5493*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
5494*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
5495*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
5496*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
5497*207e5cccSFangrui Song // CHECK:   [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
5498*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i64 0, i64 3
5499*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
5500*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
5501*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
5502*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
5503*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
5504*207e5cccSFangrui Song // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
5505*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st4lane.v4i16.p0(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i64 3, ptr %a)
5506*207e5cccSFangrui Song // CHECK:   ret void
// Codegen test: calls vst4_lane_s16(a, b, 3). The FileCheck expectations
// above require b to arrive as [4 x <4 x i16>], each element to be bitcast
// to <8 x i8> and back to <4 x i16>, and the call to be emitted as
// @llvm.aarch64.neon.st4lane.v4i16.p0 with lane operand i64 3 and ptr %a.
5507*207e5cccSFangrui Song void test_vst4_lane_s16(int16_t  *a, int16x4x4_t b) {
5508*207e5cccSFangrui Song   vst4_lane_s16(a, b, 3);
5509*207e5cccSFangrui Song }
5510*207e5cccSFangrui Song 
5511*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst4_lane_s32(ptr noundef %a, [4 x <2 x i32>] alignstack(8) %b.coerce) #0 {
5512*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
5513*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
5514*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[B]], i32 0, i32 0
5515*207e5cccSFangrui Song // CHECK:   store [4 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
5516*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
5517*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
5518*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
5519*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
5520*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
5521*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
5522*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
5523*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
5524*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
5525*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
5526*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2
5527*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
5528*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
5529*207e5cccSFangrui Song // CHECK:   [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
5530*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i64 0, i64 3
5531*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
5532*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
5533*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
5534*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
5535*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
5536*207e5cccSFangrui Song // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
5537*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st4lane.v2i32.p0(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], i64 1, ptr %a)
5538*207e5cccSFangrui Song // CHECK:   ret void
// Codegen test wrapper: calls vst4_lane_s32 on pointer `a` and the by-value
// int32x2x4_t aggregate `b` with constant lane index 1. The expectations above
// require the four <2 x i32> members to be loaded from a local copy of `b`,
// round-tripped through <8 x i8> bitcasts, and passed to
// @llvm.aarch64.neon.st4lane.v2i32.p0 with `i64 1` as the lane operand.
5539*207e5cccSFangrui Song void test_vst4_lane_s32(int32_t  *a, int32x2x4_t b) {
5540*207e5cccSFangrui Song   vst4_lane_s32(a, b, 1);
5541*207e5cccSFangrui Song }
5542*207e5cccSFangrui Song 
5543*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst4_lane_s64(ptr noundef %a, [4 x <1 x i64>] alignstack(8) %b.coerce) #0 {
5544*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.int64x1x4_t, align 8
5545*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.int64x1x4_t, align 8
5546*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[B]], i32 0, i32 0
5547*207e5cccSFangrui Song // CHECK:   store [4 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
5548*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
5549*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
5550*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
5551*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
5552*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
5553*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
5554*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
5555*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
5556*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
5557*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
5558*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
5559*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
5560*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
5561*207e5cccSFangrui Song // CHECK:   [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
5562*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], i64 0, i64 3
5563*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
5564*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
5565*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
5566*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
5567*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
5568*207e5cccSFangrui Song // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
5569*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st4lane.v1i64.p0(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i64 0, ptr %a)
5570*207e5cccSFangrui Song // CHECK:   ret void
// Codegen test wrapper: calls vst4_lane_s64 on pointer `a` and the by-value
// int64x1x4_t aggregate `b` with constant lane index 0. The expectations above
// require the four <1 x i64> members to be loaded from a local copy of `b`,
// round-tripped through <8 x i8> bitcasts, and passed to
// @llvm.aarch64.neon.st4lane.v1i64.p0 with `i64 0` as the lane operand.
5571*207e5cccSFangrui Song void test_vst4_lane_s64(int64_t  *a, int64x1x4_t b) {
5572*207e5cccSFangrui Song   vst4_lane_s64(a, b, 0);
5573*207e5cccSFangrui Song }
5574*207e5cccSFangrui Song 
5575*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst4_lane_f16(ptr noundef %a, [4 x <4 x half>] alignstack(8) %b.coerce) #0 {
5576*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
5577*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
5578*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[B]], i32 0, i32 0
5579*207e5cccSFangrui Song // CHECK:   store [4 x <4 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
5580*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
5581*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
5582*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL]], i64 0, i64 0
5583*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
5584*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
5585*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
5586*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL1]], i64 0, i64 1
5587*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
5588*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
5589*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
5590*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL3]], i64 0, i64 2
5591*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
5592*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
5593*207e5cccSFangrui Song // CHECK:   [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
5594*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL5]], i64 0, i64 3
5595*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = load <4 x half>, ptr [[ARRAYIDX6]], align 8
5596*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <8 x i8>
5597*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half>
5598*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half>
5599*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x half>
5600*207e5cccSFangrui Song // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x half>
5601*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st4lane.v4f16.p0(<4 x half> [[TMP11]], <4 x half> [[TMP12]], <4 x half> [[TMP13]], <4 x half> [[TMP14]], i64 3, ptr %a)
5602*207e5cccSFangrui Song // CHECK:   ret void
// Codegen test wrapper: calls vst4_lane_f16 on pointer `a` and the by-value
// float16x4x4_t aggregate `b` with constant lane index 3. The expectations
// above require the four <4 x half> members to be loaded from a local copy of
// `b`, round-tripped through <8 x i8> bitcasts, and passed to
// @llvm.aarch64.neon.st4lane.v4f16.p0 with `i64 3` as the lane operand.
5603*207e5cccSFangrui Song void test_vst4_lane_f16(float16_t  *a, float16x4x4_t b) {
5604*207e5cccSFangrui Song   vst4_lane_f16(a, b, 3);
5605*207e5cccSFangrui Song }
5606*207e5cccSFangrui Song 
5607*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst4_lane_f32(ptr noundef %a, [4 x <2 x float>] alignstack(8) %b.coerce) #0 {
5608*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
5609*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
5610*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[B]], i32 0, i32 0
5611*207e5cccSFangrui Song // CHECK:   store [4 x <2 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
5612*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
5613*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
5614*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL]], i64 0, i64 0
5615*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
5616*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
5617*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
5618*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL1]], i64 0, i64 1
5619*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
5620*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
5621*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
5622*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL3]], i64 0, i64 2
5623*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
5624*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
5625*207e5cccSFangrui Song // CHECK:   [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
5626*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL5]], i64 0, i64 3
5627*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = load <2 x float>, ptr [[ARRAYIDX6]], align 8
5628*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <2 x float> [[TMP9]] to <8 x i8>
5629*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
5630*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
5631*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
5632*207e5cccSFangrui Song // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x float>
5633*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st4lane.v2f32.p0(<2 x float> [[TMP11]], <2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], i64 1, ptr %a)
5634*207e5cccSFangrui Song // CHECK:   ret void
// Codegen test wrapper: calls vst4_lane_f32 on pointer `a` and the by-value
// float32x2x4_t aggregate `b` with constant lane index 1. The expectations
// above require the four <2 x float> members to be loaded from a local copy of
// `b`, round-tripped through <8 x i8> bitcasts, and passed to
// @llvm.aarch64.neon.st4lane.v2f32.p0 with `i64 1` as the lane operand.
5635*207e5cccSFangrui Song void test_vst4_lane_f32(float32_t  *a, float32x2x4_t b) {
5636*207e5cccSFangrui Song   vst4_lane_f32(a, b, 1);
5637*207e5cccSFangrui Song }
5638*207e5cccSFangrui Song 
5639*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst4_lane_f64(ptr noundef %a, [4 x <1 x double>] alignstack(8) %b.coerce) #0 {
5640*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.float64x1x4_t, align 8
5641*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.float64x1x4_t, align 8
5642*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[B]], i32 0, i32 0
5643*207e5cccSFangrui Song // CHECK:   store [4 x <1 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
5644*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
5645*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0
5646*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL]], i64 0, i64 0
5647*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8
5648*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
5649*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0
5650*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL1]], i64 0, i64 1
5651*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
5652*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
5653*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0
5654*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL3]], i64 0, i64 2
5655*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = load <1 x double>, ptr [[ARRAYIDX4]], align 8
5656*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <1 x double> [[TMP7]] to <8 x i8>
5657*207e5cccSFangrui Song // CHECK:   [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0
5658*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL5]], i64 0, i64 3
5659*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = load <1 x double>, ptr [[ARRAYIDX6]], align 8
5660*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <1 x double> [[TMP9]] to <8 x i8>
5661*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
5662*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
5663*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x double>
5664*207e5cccSFangrui Song // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x double>
5665*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st4lane.v1f64.p0(<1 x double> [[TMP11]], <1 x double> [[TMP12]], <1 x double> [[TMP13]], <1 x double> [[TMP14]], i64 0, ptr %a)
5666*207e5cccSFangrui Song // CHECK:   ret void
// Codegen test wrapper: calls vst4_lane_f64 on pointer `a` and the by-value
// float64x1x4_t aggregate `b` with constant lane index 0. The expectations
// above require the four <1 x double> members to be loaded from a local copy of
// `b`, round-tripped through <8 x i8> bitcasts, and passed to
// @llvm.aarch64.neon.st4lane.v1f64.p0 with `i64 0` as the lane operand.
5667*207e5cccSFangrui Song void test_vst4_lane_f64(float64_t  *a, float64x1x4_t b) {
5668*207e5cccSFangrui Song   vst4_lane_f64(a, b, 0);
5669*207e5cccSFangrui Song }
5670*207e5cccSFangrui Song 
5671*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst4_lane_p8(ptr noundef %a, [4 x <8 x i8>] alignstack(8) %b.coerce) #0 {
5672*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
5673*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
5674*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[B]], i32 0, i32 0
5675*207e5cccSFangrui Song // CHECK:   store [4 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
5676*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
5677*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
5678*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
5679*207e5cccSFangrui Song // CHECK:   [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
5680*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
5681*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
5682*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
5683*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
5684*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
5685*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
5686*207e5cccSFangrui Song // CHECK:   [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
5687*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i64 0, i64 3
5688*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
5689*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st4lane.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, ptr %a)
5690*207e5cccSFangrui Song // CHECK:   ret void
// Codegen test wrapper: calls vst4_lane_p8 on pointer `a` and the by-value
// poly8x8x4_t aggregate `b` with constant lane index 7. The expectations above
// require the four <8 x i8> members to be loaded from a local copy of `b` and
// passed directly (no bitcasts are expected for this element type) to
// @llvm.aarch64.neon.st4lane.v8i8.p0 with `i64 7` as the lane operand.
5691*207e5cccSFangrui Song void test_vst4_lane_p8(poly8_t  *a, poly8x8x4_t b) {
5692*207e5cccSFangrui Song   vst4_lane_p8(a, b, 7);
5693*207e5cccSFangrui Song }
5694*207e5cccSFangrui Song 
5695*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst4_lane_p16(ptr noundef %a, [4 x <4 x i16>] alignstack(8) %b.coerce) #0 {
5696*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
5697*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
5698*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[B]], i32 0, i32 0
5699*207e5cccSFangrui Song // CHECK:   store [4 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
5700*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
5701*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
5702*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
5703*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
5704*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
5705*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
5706*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
5707*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
5708*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
5709*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
5710*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
5711*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
5712*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
5713*207e5cccSFangrui Song // CHECK:   [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
5714*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i64 0, i64 3
5715*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
5716*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
5717*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
5718*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
5719*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
5720*207e5cccSFangrui Song // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
5721*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st4lane.v4i16.p0(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i64 3, ptr %a)
5722*207e5cccSFangrui Song // CHECK:   ret void
// Codegen test wrapper: calls vst4_lane_p16 on pointer `a` and the by-value
// poly16x4x4_t aggregate `b` with constant lane index 3. The expectations above
// require the four <4 x i16> members to be loaded from a local copy of `b`,
// round-tripped through <8 x i8> bitcasts, and passed to
// @llvm.aarch64.neon.st4lane.v4i16.p0 with `i64 3` as the lane operand.
5723*207e5cccSFangrui Song void test_vst4_lane_p16(poly16_t  *a, poly16x4x4_t b) {
5724*207e5cccSFangrui Song   vst4_lane_p16(a, b, 3);
5725*207e5cccSFangrui Song }
5726*207e5cccSFangrui Song 
5727*207e5cccSFangrui Song // CHECK-LABEL: define{{.*}} void @test_vst4_lane_p64(ptr noundef %a, [4 x <1 x i64>] alignstack(8) %b.coerce) #0 {
5728*207e5cccSFangrui Song // CHECK:   [[B:%.*]] = alloca %struct.poly64x1x4_t, align 8
5729*207e5cccSFangrui Song // CHECK:   [[__S1:%.*]] = alloca %struct.poly64x1x4_t, align 8
5730*207e5cccSFangrui Song // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[B]], i32 0, i32 0
5731*207e5cccSFangrui Song // CHECK:   store [4 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
5732*207e5cccSFangrui Song // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
5733*207e5cccSFangrui Song // CHECK:   [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0
5734*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
5735*207e5cccSFangrui Song // CHECK:   [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
5736*207e5cccSFangrui Song // CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
5737*207e5cccSFangrui Song // CHECK:   [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0
5738*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
5739*207e5cccSFangrui Song // CHECK:   [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
5740*207e5cccSFangrui Song // CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
5741*207e5cccSFangrui Song // CHECK:   [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0
5742*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
5743*207e5cccSFangrui Song // CHECK:   [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
5744*207e5cccSFangrui Song // CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
5745*207e5cccSFangrui Song // CHECK:   [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0
5746*207e5cccSFangrui Song // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], i64 0, i64 3
5747*207e5cccSFangrui Song // CHECK:   [[TMP9:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
5748*207e5cccSFangrui Song // CHECK:   [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
5749*207e5cccSFangrui Song // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
5750*207e5cccSFangrui Song // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
5751*207e5cccSFangrui Song // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
5752*207e5cccSFangrui Song // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
5753*207e5cccSFangrui Song // CHECK:   call void @llvm.aarch64.neon.st4lane.v1i64.p0(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i64 0, ptr %a)
5754*207e5cccSFangrui Song // CHECK:   ret void
// Codegen test wrapper: calls vst4_lane_p64 on pointer `a` and the by-value
// poly64x1x4_t aggregate `b` with constant lane index 0. The expectations above
// require the four <1 x i64> members to be loaded from a local copy of `b`,
// round-tripped through <8 x i8> bitcasts, and passed to
// @llvm.aarch64.neon.st4lane.v1i64.p0 with `i64 0` as the lane operand.
5755*207e5cccSFangrui Song void test_vst4_lane_p64(poly64_t  *a, poly64x1x4_t b) {
5756*207e5cccSFangrui Song   vst4_lane_p64(a, b, 0);
5757*207e5cccSFangrui Song }
5758