xref: /llvm-project/llvm/test/CodeGen/ARM/bf16-intrinsics-ld-st.ll (revision a7697c86559e9d57c9c0e2b5f2daaa5cec4e5119)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=armv8.6a-arm-none-eabi -mattr=+bf16,+neon,+fullfp16 < %s | FileCheck %s
3; FIXME: Remove fullfp16 once bfloat arguments and returns lowering stops
4; depending on it.
5
; Plain load of a <4 x bfloat> vector from %ptr (align 2), returned directly.
; The checks expect a single vld1.16 into d0.
6define arm_aapcs_vfpcc <4 x bfloat> @test_vld1_bf16(ptr nocapture readonly %ptr) {
7; CHECK-LABEL: test_vld1_bf16:
8; CHECK:       @ %bb.0: @ %entry
9; CHECK-NEXT:    vld1.16 {d0}, [r0]
10; CHECK-NEXT:    bx lr
11entry:
12  %0 = load <4 x bfloat>, ptr %ptr, align 2
13  ret <4 x bfloat> %0
14}
15
; Plain load of an <8 x bfloat> vector from %ptr (align 2), returned directly.
; The checks expect a single vld1.16 into the register pair d0, d1.
16define arm_aapcs_vfpcc <8 x bfloat> @test_vld1q_bf16(ptr nocapture readonly %ptr) {
17; CHECK-LABEL: test_vld1q_bf16:
18; CHECK:       @ %bb.0: @ %entry
19; CHECK-NEXT:    vld1.16 {d0, d1}, [r0]
20; CHECK-NEXT:    bx lr
21entry:
22  %0 = load <8 x bfloat>, ptr %ptr, align 2
23  ret <8 x bfloat> %0
24}
25
; Loads one bfloat from %ptr and inserts it into element 0 of %src.
; The checks expect the scalar load + insert to become a single-lane
; vld1.16 {d0[0]} with a :16 address alignment qualifier.
26define arm_aapcs_vfpcc <4 x bfloat> @test_vld1_lane_bf16(ptr nocapture readonly %ptr, <4 x bfloat> %src) {
27; CHECK-LABEL: test_vld1_lane_bf16:
28; CHECK:       @ %bb.0: @ %entry
29; CHECK-NEXT:    vld1.16 {d0[0]}, [r0:16]
30; CHECK-NEXT:    bx lr
31entry:
32  %0 = load bfloat, ptr %ptr, align 2
33  %vld1_lane = insertelement <4 x bfloat> %src, bfloat %0, i32 0
34  ret <4 x bfloat> %vld1_lane
35}
36
; Loads one bfloat from %ptr and inserts it into element 7 of the 8-element
; %src. The checks expect a single-lane vld1.16 targeting d1[3], i.e. element 7
; of the vector is addressed as element 3 of d1.
37define arm_aapcs_vfpcc <8 x bfloat> @test_vld1q_lane_bf16(ptr nocapture readonly %ptr, <8 x bfloat> %src) {
38; CHECK-LABEL: test_vld1q_lane_bf16:
39; CHECK:       @ %bb.0: @ %entry
40; CHECK-NEXT:    vld1.16 {d1[3]}, [r0:16]
41; CHECK-NEXT:    bx lr
42entry:
43  %0 = load bfloat, ptr %ptr, align 2
44  %vld1_lane = insertelement <8 x bfloat> %src, bfloat %0, i32 7
45  ret <8 x bfloat> %vld1_lane
46}
47
; Loads one bfloat from %ptr and splats it across a <4 x bfloat> vector
; (insert into element 0 of undef, then shuffle with an all-zero mask).
; The checks expect this to fold into an all-lanes vld1.16 {d0[]}.
48define arm_aapcs_vfpcc <4 x bfloat> @test_vld1_dup_bf16(ptr nocapture readonly %ptr) {
49; CHECK-LABEL: test_vld1_dup_bf16:
50; CHECK:       @ %bb.0: @ %entry
51; CHECK-NEXT:    vld1.16 {d0[]}, [r0:16]
52; CHECK-NEXT:    bx lr
53entry:
54  %0 = load bfloat, ptr %ptr, align 2
55  %1 = insertelement <4 x bfloat> undef, bfloat %0, i32 0
56  %lane = shufflevector <4 x bfloat> %1, <4 x bfloat> undef, <4 x i32> zeroinitializer
57  ret <4 x bfloat> %lane
58}
59
; Calls @llvm.arm.neon.vld1x2.v4bf16.p0 on %ptr, bitcasts both <4 x bfloat>
; results to <2 x i32> and returns them as a [2 x <2 x i32>] aggregate.
; The checks expect a single vld1.16 {d0, d1}.
60define arm_aapcs_vfpcc [2 x <2 x i32>] @test_vld1_bf16_x2(ptr %ptr) {
61; CHECK-LABEL: test_vld1_bf16_x2:
62; CHECK:       @ %bb.0: @ %entry
63; CHECK-NEXT:    vld1.16 {d0, d1}, [r0]
64; CHECK-NEXT:    bx lr
65entry:
66  %vld1xN = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x2.v4bf16.p0(ptr %ptr)
67  %vld1xN.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld1xN, 0
68  %vld1xN.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld1xN, 1
69  %0 = bitcast <4 x bfloat> %vld1xN.fca.0.extract to <2 x i32>
70  %1 = bitcast <4 x bfloat> %vld1xN.fca.1.extract to <2 x i32>
71  %.fca.0.insert = insertvalue [2 x <2 x i32>] undef, <2 x i32> %0, 0
72  %.fca.1.insert = insertvalue [2 x <2 x i32>] %.fca.0.insert, <2 x i32> %1, 1
73  ret [2 x <2 x i32>] %.fca.1.insert
74}
75
; Calls @llvm.arm.neon.vld1x2.v8bf16.p0 on %ptr, bitcasts both <8 x bfloat>
; results to <4 x i32> and returns them as a [2 x <4 x i32>] aggregate.
; The checks expect a single vld1.16 {d0, d1, d2, d3}.
76define arm_aapcs_vfpcc [2 x <4 x i32>] @test_vld1q_bf16_x2(ptr %ptr) {
77; CHECK-LABEL: test_vld1q_bf16_x2:
78; CHECK:       @ %bb.0: @ %entry
79; CHECK-NEXT:    vld1.16 {d0, d1, d2, d3}, [r0]
80; CHECK-NEXT:    bx lr
81entry:
82  %vld1xN = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x2.v8bf16.p0(ptr %ptr)
83  %vld1xN.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld1xN, 0
84  %vld1xN.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld1xN, 1
85  %0 = bitcast <8 x bfloat> %vld1xN.fca.0.extract to <4 x i32>
86  %1 = bitcast <8 x bfloat> %vld1xN.fca.1.extract to <4 x i32>
87  %.fca.0.insert = insertvalue [2 x <4 x i32>] undef, <4 x i32> %0, 0
88  %.fca.1.insert = insertvalue [2 x <4 x i32>] %.fca.0.insert, <4 x i32> %1, 1
89  ret [2 x <4 x i32>] %.fca.1.insert
90}
91
; Calls @llvm.arm.neon.vld1x3.v4bf16.p0 on %ptr, bitcasts the three
; <4 x bfloat> results to <2 x i32> and returns them as a [3 x <2 x i32>]
; aggregate. The checks expect a single vld1.16 {d0, d1, d2}.
92define arm_aapcs_vfpcc [3 x <2 x i32>] @test_vld1_bf16_x3(ptr %ptr) {
93; CHECK-LABEL: test_vld1_bf16_x3:
94; CHECK:       @ %bb.0: @ %entry
95; CHECK-NEXT:    vld1.16 {d0, d1, d2}, [r0]
96; CHECK-NEXT:    bx lr
97entry:
98  %vld1xN = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x3.v4bf16.p0(ptr %ptr)
99  %vld1xN.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 0
100  %vld1xN.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 1
101  %vld1xN.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 2
102  %0 = bitcast <4 x bfloat> %vld1xN.fca.0.extract to <2 x i32>
103  %1 = bitcast <4 x bfloat> %vld1xN.fca.1.extract to <2 x i32>
104  %2 = bitcast <4 x bfloat> %vld1xN.fca.2.extract to <2 x i32>
105  %.fca.0.insert = insertvalue [3 x <2 x i32>] undef, <2 x i32> %0, 0
106  %.fca.1.insert = insertvalue [3 x <2 x i32>] %.fca.0.insert, <2 x i32> %1, 1
107  %.fca.2.insert = insertvalue [3 x <2 x i32>] %.fca.1.insert, <2 x i32> %2, 2
108  ret [3 x <2 x i32>] %.fca.2.insert
109}
110
; Calls @llvm.arm.neon.vld1x3.v8bf16.p0 on %ptr, bitcasts the three
; <8 x bfloat> results to <4 x i32> and returns them as a [3 x <4 x i32>]
; aggregate. The checks expect two vld1.16 instructions of three D registers
; each: the first with address writeback ([r0]!), the second without.
111define arm_aapcs_vfpcc [3 x <4 x i32>] @test_vld1q_bf16_x3(ptr %ptr) {
112; CHECK-LABEL: test_vld1q_bf16_x3:
113; CHECK:       @ %bb.0: @ %entry
114; CHECK-NEXT:    vld1.16 {d0, d1, d2}, [r0]!
115; CHECK-NEXT:    vld1.16 {d3, d4, d5}, [r0]
116; CHECK-NEXT:    bx lr
117entry:
118  %vld1xN = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x3.v8bf16.p0(ptr %ptr)
119  %vld1xN.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 0
120  %vld1xN.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 1
121  %vld1xN.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 2
122  %0 = bitcast <8 x bfloat> %vld1xN.fca.0.extract to <4 x i32>
123  %1 = bitcast <8 x bfloat> %vld1xN.fca.1.extract to <4 x i32>
124  %2 = bitcast <8 x bfloat> %vld1xN.fca.2.extract to <4 x i32>
125  %.fca.0.insert = insertvalue [3 x <4 x i32>] undef, <4 x i32> %0, 0
126  %.fca.1.insert = insertvalue [3 x <4 x i32>] %.fca.0.insert, <4 x i32> %1, 1
127  %.fca.2.insert = insertvalue [3 x <4 x i32>] %.fca.1.insert, <4 x i32> %2, 2
128  ret [3 x <4 x i32>] %.fca.2.insert
129}
130
; Calls @llvm.arm.neon.vld1x4.v4bf16.p0 on %ptr, bitcasts the four
; <4 x bfloat> results to <2 x i32> and returns them as a [4 x <2 x i32>]
; aggregate. The checks expect a single vld1.16 {d0, d1, d2, d3}.
131define arm_aapcs_vfpcc [4 x <2 x i32>] @test_vld1_bf16_x4(ptr %ptr) {
132; CHECK-LABEL: test_vld1_bf16_x4:
133; CHECK:       @ %bb.0: @ %entry
134; CHECK-NEXT:    vld1.16 {d0, d1, d2, d3}, [r0]
135; CHECK-NEXT:    bx lr
136entry:
137  %vld1xN = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x4.v4bf16.p0(ptr %ptr)
138  %vld1xN.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 0
139  %vld1xN.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 1
140  %vld1xN.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 2
141  %vld1xN.fca.3.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 3
142  %0 = bitcast <4 x bfloat> %vld1xN.fca.0.extract to <2 x i32>
143  %1 = bitcast <4 x bfloat> %vld1xN.fca.1.extract to <2 x i32>
144  %2 = bitcast <4 x bfloat> %vld1xN.fca.2.extract to <2 x i32>
145  %3 = bitcast <4 x bfloat> %vld1xN.fca.3.extract to <2 x i32>
146  %.fca.0.insert = insertvalue [4 x <2 x i32>] undef, <2 x i32> %0, 0
147  %.fca.1.insert = insertvalue [4 x <2 x i32>] %.fca.0.insert, <2 x i32> %1, 1
148  %.fca.2.insert = insertvalue [4 x <2 x i32>] %.fca.1.insert, <2 x i32> %2, 2
149  %.fca.3.insert = insertvalue [4 x <2 x i32>] %.fca.2.insert, <2 x i32> %3, 3
150  ret [4 x <2 x i32>] %.fca.3.insert
151}
152
; Calls @llvm.arm.neon.vld1x4.v8bf16.p0 on %ptr, bitcasts the four
; <8 x bfloat> results to <4 x i32> and returns them as a [4 x <4 x i32>]
; aggregate. The checks expect two vld1.16 instructions of four D registers
; each: the first with address writeback ([r0]!), the second without.
153define arm_aapcs_vfpcc [4 x <4 x i32>] @test_vld1q_bf16_x4(ptr %ptr) {
154; CHECK-LABEL: test_vld1q_bf16_x4:
155; CHECK:       @ %bb.0: @ %entry
156; CHECK-NEXT:    vld1.16 {d0, d1, d2, d3}, [r0]!
157; CHECK-NEXT:    vld1.16 {d4, d5, d6, d7}, [r0]
158; CHECK-NEXT:    bx lr
159entry:
160  %vld1xN = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x4.v8bf16.p0(ptr %ptr)
161  %vld1xN.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 0
162  %vld1xN.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 1
163  %vld1xN.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 2
164  %vld1xN.fca.3.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 3
165  %0 = bitcast <8 x bfloat> %vld1xN.fca.0.extract to <4 x i32>
166  %1 = bitcast <8 x bfloat> %vld1xN.fca.1.extract to <4 x i32>
167  %2 = bitcast <8 x bfloat> %vld1xN.fca.2.extract to <4 x i32>
168  %3 = bitcast <8 x bfloat> %vld1xN.fca.3.extract to <4 x i32>
169  %.fca.0.insert = insertvalue [4 x <4 x i32>] undef, <4 x i32> %0, 0
170  %.fca.1.insert = insertvalue [4 x <4 x i32>] %.fca.0.insert, <4 x i32> %1, 1
171  %.fca.2.insert = insertvalue [4 x <4 x i32>] %.fca.1.insert, <4 x i32> %2, 2
172  %.fca.3.insert = insertvalue [4 x <4 x i32>] %.fca.2.insert, <4 x i32> %3, 3
173  ret [4 x <4 x i32>] %.fca.3.insert
174}
175
; Loads one bfloat from %ptr and splats it across an <8 x bfloat> vector
; (insert into element 0 of undef, then shuffle with an all-zero mask).
; The checks expect this to fold into an all-lanes vld1.16 {d0[], d1[]}.
176define arm_aapcs_vfpcc <8 x bfloat> @test_vld1q_dup_bf16(ptr nocapture readonly %ptr) {
177; CHECK-LABEL: test_vld1q_dup_bf16:
178; CHECK:       @ %bb.0: @ %entry
179; CHECK-NEXT:    vld1.16 {d0[], d1[]}, [r0:16]
180; CHECK-NEXT:    bx lr
181entry:
182  %0 = load bfloat, ptr %ptr, align 2
183  %1 = insertelement <8 x bfloat> undef, bfloat %0, i32 0
184  %lane = shufflevector <8 x bfloat> %1, <8 x bfloat> undef, <8 x i32> zeroinitializer
185  ret <8 x bfloat> %lane
186}
187
; Calls @llvm.arm.neon.vld2.v4bf16.p0(%ptr, i32 2), bitcasts both
; <4 x bfloat> results to <2 x i32> and returns them as a [2 x <2 x i32>]
; aggregate. The checks expect a single vld2.16 {d0, d1}.
; NOTE(review): the trailing i32 2 is presumably the alignment operand - confirm.
188define arm_aapcs_vfpcc [2 x <2 x i32>] @test_vld2_bf16(ptr %ptr) {
189; CHECK-LABEL: test_vld2_bf16:
190; CHECK:       @ %bb.0: @ %entry
191; CHECK-NEXT:    vld2.16 {d0, d1}, [r0]
192; CHECK-NEXT:    bx lr
193entry:
194  %vld2_v = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2.v4bf16.p0(ptr %ptr, i32 2)
195  %vld2_v.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2_v, 0
196  %vld2_v.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2_v, 1
197  %0 = bitcast <4 x bfloat> %vld2_v.fca.0.extract to <2 x i32>
198  %1 = bitcast <4 x bfloat> %vld2_v.fca.1.extract to <2 x i32>
199  %.fca.0.insert = insertvalue [2 x <2 x i32>] undef, <2 x i32> %0, 0
200  %.fca.1.insert = insertvalue [2 x <2 x i32>] %.fca.0.insert, <2 x i32> %1, 1
201  ret [2 x <2 x i32>] %.fca.1.insert
202}
203
; Calls @llvm.arm.neon.vld2.v8bf16.p0(%ptr, i32 2), bitcasts both
; <8 x bfloat> results to <4 x i32> and returns them as a [2 x <4 x i32>]
; aggregate. The checks expect a single vld2.16 {d0, d1, d2, d3}.
; NOTE(review): the trailing i32 2 is presumably the alignment operand - confirm.
204define arm_aapcs_vfpcc [2 x <4 x i32>] @test_vld2q_bf16(ptr %ptr) {
205; CHECK-LABEL: test_vld2q_bf16:
206; CHECK:       @ %bb.0: @ %entry
207; CHECK-NEXT:    vld2.16 {d0, d1, d2, d3}, [r0]
208; CHECK-NEXT:    bx lr
209entry:
210  %vld2q_v = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2.v8bf16.p0(ptr %ptr, i32 2)
211  %vld2q_v.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2q_v, 0
212  %vld2q_v.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2q_v, 1
213  %0 = bitcast <8 x bfloat> %vld2q_v.fca.0.extract to <4 x i32>
214  %1 = bitcast <8 x bfloat> %vld2q_v.fca.1.extract to <4 x i32>
215  %.fca.0.insert = insertvalue [2 x <4 x i32>] undef, <4 x i32> %0, 0
216  %.fca.1.insert = insertvalue [2 x <4 x i32>] %.fca.0.insert, <4 x i32> %1, 1
217  ret [2 x <4 x i32>] %.fca.1.insert
218}
219
; Unpacks the two <2 x i32> members of %src.coerce, bitcasts them to
; <4 x bfloat> and passes them to @llvm.arm.neon.vld2lane.v4bf16.p0 with
; operands i32 1, i32 2. The two results are bitcast back to <2 x i32> and
; returned as a [2 x <2 x i32>] aggregate. The checks expect a single-lane
; vld2.16 {d0[1], d1[1]}, preceded by register "kill" annotations.
; NOTE(review): i32 1 matches the lane index in the checks; the trailing i32 2
; is presumably the alignment operand - confirm.
220define arm_aapcs_vfpcc [2 x <2 x i32>] @test_vld2_lane_bf16(ptr %ptr, [2 x <2 x i32>] %src.coerce) {
221; CHECK-LABEL: test_vld2_lane_bf16:
222; CHECK:       @ %bb.0: @ %entry
223; CHECK-NEXT:    @ kill: def $d1 killed $d1 killed $q0 def $q0
224; CHECK-NEXT:    @ kill: def $d0 killed $d0 killed $q0 def $q0
225; CHECK-NEXT:    vld2.16 {d0[1], d1[1]}, [r0]
226; CHECK-NEXT:    bx lr
227entry:
228  %src.coerce.fca.0.extract = extractvalue [2 x <2 x i32>] %src.coerce, 0
229  %src.coerce.fca.1.extract = extractvalue [2 x <2 x i32>] %src.coerce, 1
230  %0 = bitcast <2 x i32> %src.coerce.fca.0.extract to <4 x bfloat>
231  %1 = bitcast <2 x i32> %src.coerce.fca.1.extract to <4 x bfloat>
232  %vld2_lane_v = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2lane.v4bf16.p0(ptr %ptr, <4 x bfloat> %0, <4 x bfloat> %1, i32 1, i32 2)
233  %vld2_lane_v.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2_lane_v, 0
234  %vld2_lane_v.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2_lane_v, 1
235  %2 = bitcast <4 x bfloat> %vld2_lane_v.fca.0.extract to <2 x i32>
236  %3 = bitcast <4 x bfloat> %vld2_lane_v.fca.1.extract to <2 x i32>
237  %.fca.0.insert = insertvalue [2 x <2 x i32>] undef, <2 x i32> %2, 0
238  %.fca.1.insert = insertvalue [2 x <2 x i32>] %.fca.0.insert, <2 x i32> %3, 1
239  ret [2 x <2 x i32>] %.fca.1.insert
240}
241
; Unpacks the two <4 x i32> members of %src.coerce, bitcasts them to
; <8 x bfloat> and passes them to @llvm.arm.neon.vld2lane.v8bf16.p0 with
; operands i32 7, i32 2. The two results are bitcast back to <4 x i32> and
; returned as a [2 x <4 x i32>] aggregate. The checks expect a single-lane
; vld2.16 {d1[3], d3[3]}, preceded by register "kill" annotations.
; NOTE(review): i32 7 looks like the lane index (shown as element 3 of d1/d3 in
; the checks); the trailing i32 2 is presumably the alignment operand - confirm.
242define arm_aapcs_vfpcc [2 x <4 x i32>] @test_vld2q_lane_bf16(ptr %ptr, [2 x <4 x i32>] %src.coerce) {
243; CHECK-LABEL: test_vld2q_lane_bf16:
244; CHECK:       @ %bb.0: @ %entry
245; CHECK-NEXT:    @ kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
246; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
247; CHECK-NEXT:    vld2.16 {d1[3], d3[3]}, [r0]
248; CHECK-NEXT:    bx lr
249entry:
250  %src.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %src.coerce, 0
251  %src.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %src.coerce, 1
252  %0 = bitcast <4 x i32> %src.coerce.fca.0.extract to <8 x bfloat>
253  %1 = bitcast <4 x i32> %src.coerce.fca.1.extract to <8 x bfloat>
254  %vld2q_lane_v = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2lane.v8bf16.p0(ptr %ptr, <8 x bfloat> %0, <8 x bfloat> %1, i32 7, i32 2)
255  %vld2q_lane_v.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2q_lane_v, 0
256  %vld2q_lane_v.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2q_lane_v, 1
257  %2 = bitcast <8 x bfloat> %vld2q_lane_v.fca.0.extract to <4 x i32>
258  %3 = bitcast <8 x bfloat> %vld2q_lane_v.fca.1.extract to <4 x i32>
259  %.fca.0.insert = insertvalue [2 x <4 x i32>] undef, <4 x i32> %2, 0
260  %.fca.1.insert = insertvalue [2 x <4 x i32>] %.fca.0.insert, <4 x i32> %3, 1
261  ret [2 x <4 x i32>] %.fca.1.insert
262}
263
; Calls @llvm.arm.neon.vld3.v4bf16.p0(%ptr, i32 2), bitcasts the three
; <4 x bfloat> results to <2 x i32> and returns them as a [3 x <2 x i32>]
; aggregate. The checks expect a single vld3.16 {d0, d1, d2}.
; NOTE(review): the trailing i32 2 is presumably the alignment operand - confirm.
264define arm_aapcs_vfpcc [3 x <2 x i32>] @test_vld3_bf16(ptr %ptr) {
265; CHECK-LABEL: test_vld3_bf16:
266; CHECK:       @ %bb.0: @ %entry
267; CHECK-NEXT:    vld3.16 {d0, d1, d2}, [r0]
268; CHECK-NEXT:    bx lr
269entry:
270  %vld3_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3.v4bf16.p0(ptr %ptr, i32 2)
271  %vld3_v.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_v, 0
272  %vld3_v.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_v, 1
273  %vld3_v.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_v, 2
274  %0 = bitcast <4 x bfloat> %vld3_v.fca.0.extract to <2 x i32>
275  %1 = bitcast <4 x bfloat> %vld3_v.fca.1.extract to <2 x i32>
276  %2 = bitcast <4 x bfloat> %vld3_v.fca.2.extract to <2 x i32>
277  %.fca.0.insert = insertvalue [3 x <2 x i32>] undef, <2 x i32> %0, 0
278  %.fca.1.insert = insertvalue [3 x <2 x i32>] %.fca.0.insert, <2 x i32> %1, 1
279  %.fca.2.insert = insertvalue [3 x <2 x i32>] %.fca.1.insert, <2 x i32> %2, 2
280  ret [3 x <2 x i32>] %.fca.2.insert
281}
282
; Calls @llvm.arm.neon.vld3.v8bf16.p0(%ptr, i32 2), bitcasts the three
; <8 x bfloat> results to <4 x i32> and returns them as a [3 x <4 x i32>]
; aggregate. The checks expect two vld3.16 instructions: the first fills the
; even registers {d0, d2, d4} with address writeback ([r0]!), the second fills
; the odd registers {d1, d3, d5} without writeback.
; NOTE(review): the trailing i32 2 is presumably the alignment operand - confirm.
283define arm_aapcs_vfpcc [3 x <4 x i32>] @test_vld3q_bf16(ptr %ptr) {
284; CHECK-LABEL: test_vld3q_bf16:
285; CHECK:       @ %bb.0: @ %entry
286; CHECK-NEXT:    vld3.16 {d0, d2, d4}, [r0]!
287; CHECK-NEXT:    vld3.16 {d1, d3, d5}, [r0]
288; CHECK-NEXT:    bx lr
289entry:
290  %vld3q_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3.v8bf16.p0(ptr %ptr, i32 2)
291  %vld3q_v.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3q_v, 0
292  %vld3q_v.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3q_v, 1
293  %vld3q_v.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3q_v, 2
294  %0 = bitcast <8 x bfloat> %vld3q_v.fca.0.extract to <4 x i32>
295  %1 = bitcast <8 x bfloat> %vld3q_v.fca.1.extract to <4 x i32>
296  %2 = bitcast <8 x bfloat> %vld3q_v.fca.2.extract to <4 x i32>
297  %.fca.0.insert = insertvalue [3 x <4 x i32>] undef, <4 x i32> %0, 0
298  %.fca.1.insert = insertvalue [3 x <4 x i32>] %.fca.0.insert, <4 x i32> %1, 1
299  %.fca.2.insert = insertvalue [3 x <4 x i32>] %.fca.1.insert, <4 x i32> %2, 2
300  ret [3 x <4 x i32>] %.fca.2.insert
301}
302
; Unpacks the three <2 x i32> members of %src.coerce, bitcasts them to
; <4 x bfloat> and passes them to @llvm.arm.neon.vld3lane.v4bf16.p0 with
; operands i32 1, i32 2. The three results are bitcast back to <2 x i32> and
; returned as a [3 x <2 x i32>] aggregate. The checks expect a single-lane
; vld3.16 {d0[1], d1[1], d2[1]}, preceded by register "kill" annotations.
; NOTE(review): i32 1 matches the lane index in the checks; the trailing i32 2
; is presumably the alignment operand - confirm.
303define arm_aapcs_vfpcc [3 x <2 x i32>] @test_vld3_lane_bf16(ptr %ptr, [3 x <2 x i32>] %src.coerce) {
304; CHECK-LABEL: test_vld3_lane_bf16:
305; CHECK:       @ %bb.0: @ %entry
306; CHECK-NEXT:    @ kill: def $d2 killed $d2 killed $q0_q1 def $q0_q1
307; CHECK-NEXT:    @ kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
308; CHECK-NEXT:    @ kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
309; CHECK-NEXT:    vld3.16 {d0[1], d1[1], d2[1]}, [r0]
310; CHECK-NEXT:    bx lr
311entry:
312  %src.coerce.fca.0.extract = extractvalue [3 x <2 x i32>] %src.coerce, 0
313  %src.coerce.fca.1.extract = extractvalue [3 x <2 x i32>] %src.coerce, 1
314  %src.coerce.fca.2.extract = extractvalue [3 x <2 x i32>] %src.coerce, 2
315  %0 = bitcast <2 x i32> %src.coerce.fca.0.extract to <4 x bfloat>
316  %1 = bitcast <2 x i32> %src.coerce.fca.1.extract to <4 x bfloat>
317  %2 = bitcast <2 x i32> %src.coerce.fca.2.extract to <4 x bfloat>
318  %vld3_lane_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3lane.v4bf16.p0(ptr %ptr, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, i32 1, i32 2)
319  %vld3_lane_v.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_lane_v, 0
320  %vld3_lane_v.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_lane_v, 1
321  %vld3_lane_v.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_lane_v, 2
322  %3 = bitcast <4 x bfloat> %vld3_lane_v.fca.0.extract to <2 x i32>
323  %4 = bitcast <4 x bfloat> %vld3_lane_v.fca.1.extract to <2 x i32>
324  %5 = bitcast <4 x bfloat> %vld3_lane_v.fca.2.extract to <2 x i32>
325  %.fca.0.insert = insertvalue [3 x <2 x i32>] undef, <2 x i32> %3, 0
326  %.fca.1.insert = insertvalue [3 x <2 x i32>] %.fca.0.insert, <2 x i32> %4, 1
327  %.fca.2.insert = insertvalue [3 x <2 x i32>] %.fca.1.insert, <2 x i32> %5, 2
328  ret [3 x <2 x i32>] %.fca.2.insert
329}
330
; Unpacks the three <4 x i32> members of %src.coerce, bitcasts them to
; <8 x bfloat> and passes them to @llvm.arm.neon.vld3lane.v8bf16.p0 with
; operands i32 7, i32 2. The three results are bitcast back to <4 x i32> and
; returned as a [3 x <4 x i32>] aggregate. The checks expect a single-lane
; vld3.16 {d1[3], d3[3], d5[3]}, preceded by register "kill" annotations.
; NOTE(review): i32 7 looks like the lane index (shown as element 3 of the odd
; D registers in the checks); the trailing i32 2 is presumably the alignment
; operand - confirm.
331define arm_aapcs_vfpcc [3 x <4 x i32>] @test_vld3q_lane_bf16(ptr %ptr, [3 x <4 x i32>] %src.coerce) {
332; CHECK-LABEL: test_vld3q_lane_bf16:
333; CHECK:       @ %bb.0: @ %entry
334; CHECK-NEXT:    @ kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
335; CHECK-NEXT:    @ kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
336; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
337; CHECK-NEXT:    vld3.16 {d1[3], d3[3], d5[3]}, [r0]
338; CHECK-NEXT:    bx lr
339entry:
340  %src.coerce.fca.0.extract = extractvalue [3 x <4 x i32>] %src.coerce, 0
341  %src.coerce.fca.1.extract = extractvalue [3 x <4 x i32>] %src.coerce, 1
342  %src.coerce.fca.2.extract = extractvalue [3 x <4 x i32>] %src.coerce, 2
343  %0 = bitcast <4 x i32> %src.coerce.fca.0.extract to <8 x bfloat>
344  %1 = bitcast <4 x i32> %src.coerce.fca.1.extract to <8 x bfloat>
345  %2 = bitcast <4 x i32> %src.coerce.fca.2.extract to <8 x bfloat>
346  %vld3q_lane_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3lane.v8bf16.p0(ptr %ptr, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, i32 7, i32 2)
347  %vld3q_lane_v.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3q_lane_v, 0
348  %vld3q_lane_v.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3q_lane_v, 1
349  %vld3q_lane_v.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3q_lane_v, 2
350  %3 = bitcast <8 x bfloat> %vld3q_lane_v.fca.0.extract to <4 x i32>
351  %4 = bitcast <8 x bfloat> %vld3q_lane_v.fca.1.extract to <4 x i32>
352  %5 = bitcast <8 x bfloat> %vld3q_lane_v.fca.2.extract to <4 x i32>
353  %.fca.0.insert = insertvalue [3 x <4 x i32>] undef, <4 x i32> %3, 0
354  %.fca.1.insert = insertvalue [3 x <4 x i32>] %.fca.0.insert, <4 x i32> %4, 1
355  %.fca.2.insert = insertvalue [3 x <4 x i32>] %.fca.1.insert, <4 x i32> %5, 2
356  ret [3 x <4 x i32>] %.fca.2.insert
357}
358
; Calls @llvm.arm.neon.vld4.v4bf16.p0(%ptr, i32 2), bitcasts the four
; <4 x bfloat> results to <2 x i32> and returns them as a [4 x <2 x i32>]
; aggregate. The checks expect a single vld4.16 {d0, d1, d2, d3}.
; NOTE(review): the trailing i32 2 is presumably the alignment operand - confirm.
359define arm_aapcs_vfpcc [4 x <2 x i32>] @test_vld4_bf16(ptr %ptr) {
360; CHECK-LABEL: test_vld4_bf16:
361; CHECK:       @ %bb.0: @ %entry
362; CHECK-NEXT:    vld4.16 {d0, d1, d2, d3}, [r0]
363; CHECK-NEXT:    bx lr
364entry:
365  %vld4_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4.v4bf16.p0(ptr %ptr, i32 2)
366  %vld4_v.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_v, 0
367  %vld4_v.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_v, 1
368  %vld4_v.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_v, 2
369  %vld4_v.fca.3.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_v, 3
370  %0 = bitcast <4 x bfloat> %vld4_v.fca.0.extract to <2 x i32>
371  %1 = bitcast <4 x bfloat> %vld4_v.fca.1.extract to <2 x i32>
372  %2 = bitcast <4 x bfloat> %vld4_v.fca.2.extract to <2 x i32>
373  %3 = bitcast <4 x bfloat> %vld4_v.fca.3.extract to <2 x i32>
374  %.fca.0.insert = insertvalue [4 x <2 x i32>] undef, <2 x i32> %0, 0
375  %.fca.1.insert = insertvalue [4 x <2 x i32>] %.fca.0.insert, <2 x i32> %1, 1
376  %.fca.2.insert = insertvalue [4 x <2 x i32>] %.fca.1.insert, <2 x i32> %2, 2
377  %.fca.3.insert = insertvalue [4 x <2 x i32>] %.fca.2.insert, <2 x i32> %3, 3
378  ret [4 x <2 x i32>] %.fca.3.insert
379}
380
; Calls @llvm.arm.neon.vld4.v8bf16.p0(%ptr, i32 2), bitcasts the four
; <8 x bfloat> results to <4 x i32> and returns them as a [4 x <4 x i32>]
; aggregate. The checks expect two vld4.16 instructions: the first fills the
; even registers {d0, d2, d4, d6} with address writeback ([r0]!), the second
; fills the odd registers {d1, d3, d5, d7} without writeback.
; NOTE(review): the trailing i32 2 is presumably the alignment operand - confirm.
381define arm_aapcs_vfpcc [4 x <4 x i32>] @test_vld4q_bf16(ptr %ptr) {
382; CHECK-LABEL: test_vld4q_bf16:
383; CHECK:       @ %bb.0: @ %entry
384; CHECK-NEXT:    vld4.16 {d0, d2, d4, d6}, [r0]!
385; CHECK-NEXT:    vld4.16 {d1, d3, d5, d7}, [r0]
386; CHECK-NEXT:    bx lr
387entry:
388  %vld4q_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4.v8bf16.p0(ptr %ptr, i32 2)
389  %vld4q_v.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_v, 0
390  %vld4q_v.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_v, 1
391  %vld4q_v.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_v, 2
392  %vld4q_v.fca.3.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_v, 3
393  %0 = bitcast <8 x bfloat> %vld4q_v.fca.0.extract to <4 x i32>
394  %1 = bitcast <8 x bfloat> %vld4q_v.fca.1.extract to <4 x i32>
395  %2 = bitcast <8 x bfloat> %vld4q_v.fca.2.extract to <4 x i32>
396  %3 = bitcast <8 x bfloat> %vld4q_v.fca.3.extract to <4 x i32>
397  %.fca.0.insert = insertvalue [4 x <4 x i32>] undef, <4 x i32> %0, 0
398  %.fca.1.insert = insertvalue [4 x <4 x i32>] %.fca.0.insert, <4 x i32> %1, 1
399  %.fca.2.insert = insertvalue [4 x <4 x i32>] %.fca.1.insert, <4 x i32> %2, 2
400  %.fca.3.insert = insertvalue [4 x <4 x i32>] %.fca.2.insert, <4 x i32> %3, 3
401  ret [4 x <4 x i32>] %.fca.3.insert
402}
403
; Unpacks the four <2 x i32> members of %src.coerce, bitcasts them to
; <4 x bfloat> and passes them to @llvm.arm.neon.vld4lane.v4bf16.p0 with
; operands i32 1, i32 2. The four results are bitcast back to <2 x i32> and
; returned as a [4 x <2 x i32>] aggregate. The checks expect a single-lane
; vld4.16 {d0[1], d1[1], d2[1], d3[1]}, preceded by register "kill"
; annotations.
; NOTE(review): i32 1 matches the lane index in the checks; the trailing i32 2
; is presumably the alignment operand - confirm.
404define arm_aapcs_vfpcc [4 x <2 x i32>] @test_vld4_lane_bf16(ptr %ptr, [4 x <2 x i32>] %src.coerce) {
405; CHECK-LABEL: test_vld4_lane_bf16:
406; CHECK:       @ %bb.0: @ %entry
407; CHECK-NEXT:    @ kill: def $d3 killed $d3 killed $q0_q1 def $q0_q1
408; CHECK-NEXT:    @ kill: def $d2 killed $d2 killed $q0_q1 def $q0_q1
409; CHECK-NEXT:    @ kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
410; CHECK-NEXT:    @ kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
411; CHECK-NEXT:    vld4.16 {d0[1], d1[1], d2[1], d3[1]}, [r0]
412; CHECK-NEXT:    bx lr
413entry:
414  %src.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %src.coerce, 0
415  %src.coerce.fca.1.extract = extractvalue [4 x <2 x i32>] %src.coerce, 1
416  %src.coerce.fca.2.extract = extractvalue [4 x <2 x i32>] %src.coerce, 2
417  %src.coerce.fca.3.extract = extractvalue [4 x <2 x i32>] %src.coerce, 3
418  %0 = bitcast <2 x i32> %src.coerce.fca.0.extract to <4 x bfloat>
419  %1 = bitcast <2 x i32> %src.coerce.fca.1.extract to <4 x bfloat>
420  %2 = bitcast <2 x i32> %src.coerce.fca.2.extract to <4 x bfloat>
421  %3 = bitcast <2 x i32> %src.coerce.fca.3.extract to <4 x bfloat>
422  %vld4_lane_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4lane.v4bf16.p0(ptr %ptr, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, <4 x bfloat> %3, i32 1, i32 2)
423  %vld4_lane_v.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_lane_v, 0
424  %vld4_lane_v.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_lane_v, 1
425  %vld4_lane_v.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_lane_v, 2
426  %vld4_lane_v.fca.3.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_lane_v, 3
427  %4 = bitcast <4 x bfloat> %vld4_lane_v.fca.0.extract to <2 x i32>
428  %5 = bitcast <4 x bfloat> %vld4_lane_v.fca.1.extract to <2 x i32>
429  %6 = bitcast <4 x bfloat> %vld4_lane_v.fca.2.extract to <2 x i32>
430  %7 = bitcast <4 x bfloat> %vld4_lane_v.fca.3.extract to <2 x i32>
431  %.fca.0.insert = insertvalue [4 x <2 x i32>] undef, <2 x i32> %4, 0
432  %.fca.1.insert = insertvalue [4 x <2 x i32>] %.fca.0.insert, <2 x i32> %5, 1
433  %.fca.2.insert = insertvalue [4 x <2 x i32>] %.fca.1.insert, <2 x i32> %6, 2
434  %.fca.3.insert = insertvalue [4 x <2 x i32>] %.fca.2.insert, <2 x i32> %7, 3
435  ret [4 x <2 x i32>] %.fca.3.insert
436}
437
; Unpacks the four <4 x i32> members of %src.coerce, bitcasts them to
; <8 x bfloat> and passes them to @llvm.arm.neon.vld4lane.v8bf16.p0 with
; operands i32 7, i32 2. The four results are bitcast back to <4 x i32> and
; returned as a [4 x <4 x i32>] aggregate. The checks expect a single-lane
; vld4.16 {d1[3], d3[3], d5[3], d7[3]}, preceded by register "kill"
; annotations.
; NOTE(review): i32 7 looks like the lane index (shown as element 3 of the odd
; D registers in the checks); the trailing i32 2 is presumably the alignment
; operand - confirm.
438define arm_aapcs_vfpcc [4 x <4 x i32>] @test_vld4q_lane_bf16(ptr %ptr, [4 x <4 x i32>] %src.coerce) {
439; CHECK-LABEL: test_vld4q_lane_bf16:
440; CHECK:       @ %bb.0: @ %entry
441; CHECK-NEXT:    @ kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
442; CHECK-NEXT:    @ kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
443; CHECK-NEXT:    @ kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
444; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
445; CHECK-NEXT:    vld4.16 {d1[3], d3[3], d5[3], d7[3]}, [r0]
446; CHECK-NEXT:    bx lr
447entry:
448  %src.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %src.coerce, 0
449  %src.coerce.fca.1.extract = extractvalue [4 x <4 x i32>] %src.coerce, 1
450  %src.coerce.fca.2.extract = extractvalue [4 x <4 x i32>] %src.coerce, 2
451  %src.coerce.fca.3.extract = extractvalue [4 x <4 x i32>] %src.coerce, 3
452  %0 = bitcast <4 x i32> %src.coerce.fca.0.extract to <8 x bfloat>
453  %1 = bitcast <4 x i32> %src.coerce.fca.1.extract to <8 x bfloat>
454  %2 = bitcast <4 x i32> %src.coerce.fca.2.extract to <8 x bfloat>
455  %3 = bitcast <4 x i32> %src.coerce.fca.3.extract to <8 x bfloat>
456  %vld4q_lane_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4lane.v8bf16.p0(ptr %ptr, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, <8 x bfloat> %3, i32 7, i32 2)
457  %vld4q_lane_v.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_lane_v, 0
458  %vld4q_lane_v.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_lane_v, 1
459  %vld4q_lane_v.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_lane_v, 2
460  %vld4q_lane_v.fca.3.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_lane_v, 3
461  %4 = bitcast <8 x bfloat> %vld4q_lane_v.fca.0.extract to <4 x i32>
462  %5 = bitcast <8 x bfloat> %vld4q_lane_v.fca.1.extract to <4 x i32>
463  %6 = bitcast <8 x bfloat> %vld4q_lane_v.fca.2.extract to <4 x i32>
464  %7 = bitcast <8 x bfloat> %vld4q_lane_v.fca.3.extract to <4 x i32>
465  %.fca.0.insert = insertvalue [4 x <4 x i32>] undef, <4 x i32> %4, 0
466  %.fca.1.insert = insertvalue [4 x <4 x i32>] %.fca.0.insert, <4 x i32> %5, 1
467  %.fca.2.insert = insertvalue [4 x <4 x i32>] %.fca.1.insert, <4 x i32> %6, 2
468  %.fca.3.insert = insertvalue [4 x <4 x i32>] %.fca.2.insert, <4 x i32> %7, 3
469  ret [4 x <4 x i32>] %.fca.3.insert
470}
471
; Calls @llvm.arm.neon.vld2dup.v4bf16.p0(%ptr, i32 2), bitcasts both
; <4 x bfloat> results to <2 x i32> and returns them as a [2 x <2 x i32>]
; aggregate. The checks expect a single all-lanes vld2.16 {d0[], d1[]}.
; NOTE(review): the trailing i32 2 is presumably the alignment operand - confirm.
472define arm_aapcs_vfpcc [2 x <2 x i32>] @test_vld2_dup_bf16(ptr %ptr) {
473; CHECK-LABEL: test_vld2_dup_bf16:
474; CHECK:       @ %bb.0: @ %entry
475; CHECK-NEXT:    vld2.16 {d0[], d1[]}, [r0]
476; CHECK-NEXT:    bx lr
477entry:
478  %vld2_dup_v = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2dup.v4bf16.p0(ptr %ptr, i32 2)
479  %vld2_dup_v.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2_dup_v, 0
480  %vld2_dup_v.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2_dup_v, 1
481  %0 = bitcast <4 x bfloat> %vld2_dup_v.fca.0.extract to <2 x i32>
482  %1 = bitcast <4 x bfloat> %vld2_dup_v.fca.1.extract to <2 x i32>
483  %.fca.0.insert = insertvalue [2 x <2 x i32>] undef, <2 x i32> %0, 0
484  %.fca.1.insert = insertvalue [2 x <2 x i32>] %.fca.0.insert, <2 x i32> %1, 1
485  ret [2 x <2 x i32>] %.fca.1.insert
486}
487
; Calls @llvm.arm.neon.vld2dup.v8bf16.p0(%ptr, i32 2), bitcasts both
; <8 x bfloat> results to <4 x i32> and returns them as a [2 x <4 x i32>]
; aggregate. The checks expect two all-lanes vld2.16 instructions from the
; same address (no writeback): one into {d0[], d2[]}, one into {d1[], d3[]}.
; NOTE(review): the trailing i32 2 is presumably the alignment operand - confirm.
488define arm_aapcs_vfpcc [2 x <4 x i32>] @test_vld2q_dup_bf16(ptr %ptr) {
489; CHECK-LABEL: test_vld2q_dup_bf16:
490; CHECK:       @ %bb.0: @ %entry
491; CHECK-NEXT:    vld2.16 {d0[], d2[]}, [r0]
492; CHECK-NEXT:    vld2.16 {d1[], d3[]}, [r0]
493; CHECK-NEXT:    bx lr
494entry:
495  %vld2q_dup_v = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2dup.v8bf16.p0(ptr %ptr, i32 2)
496  %vld2q_dup_v.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2q_dup_v, 0
497  %vld2q_dup_v.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2q_dup_v, 1
498  %0 = bitcast <8 x bfloat> %vld2q_dup_v.fca.0.extract to <4 x i32>
499  %1 = bitcast <8 x bfloat> %vld2q_dup_v.fca.1.extract to <4 x i32>
500  %.fca.0.insert = insertvalue [2 x <4 x i32>] undef, <4 x i32> %0, 0
501  %.fca.1.insert = insertvalue [2 x <4 x i32>] %.fca.0.insert, <4 x i32> %1, 1
502  ret [2 x <4 x i32>] %.fca.1.insert
503}
504
; Calls @llvm.arm.neon.vld3dup.v4bf16.p0(%ptr, i32 2), bitcasts the three
; <4 x bfloat> results to <2 x i32> and returns them as a [3 x <2 x i32>]
; aggregate. The checks expect a single all-lanes
; vld3.16 {d0[], d1[], d2[]}.
; NOTE(review): the trailing i32 2 is presumably the alignment operand - confirm.
505define arm_aapcs_vfpcc [3 x <2 x i32>] @test_vld3_dup_bf16(ptr %ptr) {
506; CHECK-LABEL: test_vld3_dup_bf16:
507; CHECK:       @ %bb.0: @ %entry
508; CHECK-NEXT:    vld3.16 {d0[], d1[], d2[]}, [r0]
509; CHECK-NEXT:    bx lr
510entry:
511  %vld3_dup_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3dup.v4bf16.p0(ptr %ptr, i32 2)
512  %vld3_dup_v.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_dup_v, 0
513  %vld3_dup_v.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_dup_v, 1
514  %vld3_dup_v.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_dup_v, 2
515  %0 = bitcast <4 x bfloat> %vld3_dup_v.fca.0.extract to <2 x i32>
516  %1 = bitcast <4 x bfloat> %vld3_dup_v.fca.1.extract to <2 x i32>
517  %2 = bitcast <4 x bfloat> %vld3_dup_v.fca.2.extract to <2 x i32>
518  %.fca.0.insert = insertvalue [3 x <2 x i32>] undef, <2 x i32> %0, 0
519  %.fca.1.insert = insertvalue [3 x <2 x i32>] %.fca.0.insert, <2 x i32> %1, 1
520  %.fca.2.insert = insertvalue [3 x <2 x i32>] %.fca.1.insert, <2 x i32> %2, 2
521  ret [3 x <2 x i32>] %.fca.2.insert
522}
523
; Calls @llvm.arm.neon.vld3dup.v8bf16.p0(%ptr, i32 2), bitcasts the three
; <8 x bfloat> results to <4 x i32> and returns them as a [3 x <4 x i32>]
; aggregate. The checks expect two all-lanes vld3.16 instructions from the
; same address (no writeback): one into {d0[], d2[], d4[]}, one into
; {d1[], d3[], d5[]}.
; NOTE(review): the trailing i32 2 is presumably the alignment operand - confirm.
524define arm_aapcs_vfpcc [3 x <4 x i32>] @test_vld3q_dup_bf16(ptr %ptr) {
525; CHECK-LABEL: test_vld3q_dup_bf16:
526; CHECK:       @ %bb.0: @ %entry
527; CHECK-NEXT:    vld3.16 {d0[], d2[], d4[]}, [r0]
528; CHECK-NEXT:    vld3.16 {d1[], d3[], d5[]}, [r0]
529; CHECK-NEXT:    bx lr
530entry:
531  %vld3q_dup_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3dup.v8bf16.p0(ptr %ptr, i32 2)
532  %vld3q_dup_v.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3q_dup_v, 0
533  %vld3q_dup_v.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3q_dup_v, 1
534  %vld3q_dup_v.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3q_dup_v, 2
535  %0 = bitcast <8 x bfloat> %vld3q_dup_v.fca.0.extract to <4 x i32>
536  %1 = bitcast <8 x bfloat> %vld3q_dup_v.fca.1.extract to <4 x i32>
537  %2 = bitcast <8 x bfloat> %vld3q_dup_v.fca.2.extract to <4 x i32>
538  %.fca.0.insert = insertvalue [3 x <4 x i32>] undef, <4 x i32> %0, 0
539  %.fca.1.insert = insertvalue [3 x <4 x i32>] %.fca.0.insert, <4 x i32> %1, 1
540  %.fca.2.insert = insertvalue [3 x <4 x i32>] %.fca.1.insert, <4 x i32> %2, 2
541  ret [3 x <4 x i32>] %.fca.2.insert
542}
543
; vld4_dup on 64-bit vectors: calls llvm.arm.neon.vld4dup.v4bf16.p0 on %ptr
; (trailing i32 operand 2), bitcasts each of the four <4 x bfloat> results to
; <2 x i32> and returns them as a [4 x <2 x i32>] aggregate. Expected to
; select a single all-lanes vld4.16 into d0-d3.
544define arm_aapcs_vfpcc [4 x <2 x i32>] @test_vld4_dup_bf16(ptr %ptr) {
545; CHECK-LABEL: test_vld4_dup_bf16:
546; CHECK:       @ %bb.0: @ %entry
547; CHECK-NEXT:    vld4.16 {d0[], d1[], d2[], d3[]}, [r0]
548; CHECK-NEXT:    bx lr
549entry:
550  %vld4_dup_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4dup.v4bf16.p0(ptr %ptr, i32 2)
551  %vld4_dup_v.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_dup_v, 0
552  %vld4_dup_v.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_dup_v, 1
553  %vld4_dup_v.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_dup_v, 2
554  %vld4_dup_v.fca.3.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_dup_v, 3
555  %0 = bitcast <4 x bfloat> %vld4_dup_v.fca.0.extract to <2 x i32>
556  %1 = bitcast <4 x bfloat> %vld4_dup_v.fca.1.extract to <2 x i32>
557  %2 = bitcast <4 x bfloat> %vld4_dup_v.fca.2.extract to <2 x i32>
558  %3 = bitcast <4 x bfloat> %vld4_dup_v.fca.3.extract to <2 x i32>
559  %.fca.0.insert = insertvalue [4 x <2 x i32>] undef, <2 x i32> %0, 0
560  %.fca.1.insert = insertvalue [4 x <2 x i32>] %.fca.0.insert, <2 x i32> %1, 1
561  %.fca.2.insert = insertvalue [4 x <2 x i32>] %.fca.1.insert, <2 x i32> %2, 2
562  %.fca.3.insert = insertvalue [4 x <2 x i32>] %.fca.2.insert, <2 x i32> %3, 3
563  ret [4 x <2 x i32>] %.fca.3.insert
564}
565
; vld4q_dup on 128-bit vectors: calls llvm.arm.neon.vld4dup.v8bf16.p0 on %ptr
; (trailing i32 operand 2) and returns the four <8 x bfloat> results bitcast
; to <4 x i32>. Expected to select two all-lanes vld4.16 loads from the same
; [r0] address, one filling d0/d2/d4/d6 and the other d1/d3/d5/d7.
566define arm_aapcs_vfpcc [4 x <4 x i32>] @test_vld4q_dup_bf16(ptr %ptr) {
567; CHECK-LABEL: test_vld4q_dup_bf16:
568; CHECK:       @ %bb.0: @ %entry
569; CHECK-NEXT:    vld4.16 {d0[], d2[], d4[], d6[]}, [r0]
570; CHECK-NEXT:    vld4.16 {d1[], d3[], d5[], d7[]}, [r0]
571; CHECK-NEXT:    bx lr
572entry:
573  %vld4q_dup_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4dup.v8bf16.p0(ptr %ptr, i32 2)
574  %vld4q_dup_v.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_dup_v, 0
575  %vld4q_dup_v.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_dup_v, 1
576  %vld4q_dup_v.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_dup_v, 2
577  %vld4q_dup_v.fca.3.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_dup_v, 3
578  %0 = bitcast <8 x bfloat> %vld4q_dup_v.fca.0.extract to <4 x i32>
579  %1 = bitcast <8 x bfloat> %vld4q_dup_v.fca.1.extract to <4 x i32>
580  %2 = bitcast <8 x bfloat> %vld4q_dup_v.fca.2.extract to <4 x i32>
581  %3 = bitcast <8 x bfloat> %vld4q_dup_v.fca.3.extract to <4 x i32>
582  %.fca.0.insert = insertvalue [4 x <4 x i32>] undef, <4 x i32> %0, 0
583  %.fca.1.insert = insertvalue [4 x <4 x i32>] %.fca.0.insert, <4 x i32> %1, 1
584  %.fca.2.insert = insertvalue [4 x <4 x i32>] %.fca.1.insert, <4 x i32> %2, 2
585  %.fca.3.insert = insertvalue [4 x <4 x i32>] %.fca.2.insert, <4 x i32> %3, 3
586  ret [4 x <4 x i32>] %.fca.3.insert
587}
588
; vst1 on a 64-bit vector: stores the <4 x bfloat> argument through
; llvm.arm.neon.vst1.p0.v4bf16 (trailing i32 operand 2). Expected to select a
; single vst1.16 of d0.
589define arm_aapcs_vfpcc void @test_vst1_bf16(ptr %ptr, <4 x bfloat> %val) {
590; CHECK-LABEL: test_vst1_bf16:
591; CHECK:       @ %bb.0: @ %entry
592; CHECK-NEXT:    vst1.16 {d0}, [r0]
593; CHECK-NEXT:    bx lr
594entry:
595  tail call void @llvm.arm.neon.vst1.p0.v4bf16(ptr %ptr, <4 x bfloat> %val, i32 2)
596  ret void
597}
598
; vst1q on a 128-bit vector: stores the <8 x bfloat> argument through
; llvm.arm.neon.vst1.p0.v8bf16 (trailing i32 operand 2). Expected to select a
; single vst1.16 of the d0/d1 pair.
599define arm_aapcs_vfpcc void @test_vst1q_bf16(ptr %ptr, <8 x bfloat> %val) {
600; CHECK-LABEL: test_vst1q_bf16:
601; CHECK:       @ %bb.0: @ %entry
602; CHECK-NEXT:    vst1.16 {d0, d1}, [r0]
603; CHECK-NEXT:    bx lr
604entry:
605  tail call void @llvm.arm.neon.vst1.p0.v8bf16(ptr %ptr, <8 x bfloat> %val, i32 2)
606  ret void
607}
608
; vst1_lane on a 64-bit vector, written as plain IR rather than an intrinsic:
; extracts element 1 of the <4 x bfloat> argument and stores it with a scalar
; bfloat store (align 2). Expected output is a vmovx.f16 on s0 followed by a
; vstr.16 to [r0].
609define arm_aapcs_vfpcc void @test_vst1_lane_bf16(ptr nocapture %ptr, <4 x bfloat> %val) {
610; CHECK-LABEL: test_vst1_lane_bf16:
611; CHECK:       @ %bb.0: @ %entry
612; CHECK-NEXT:    vmovx.f16 s0, s0
613; CHECK-NEXT:    vstr.16 s0, [r0]
614; CHECK-NEXT:    bx lr
615entry:
616  %0 = extractelement <4 x bfloat> %val, i32 1
617  store bfloat %0, ptr %ptr, align 2
618  ret void
619}
620
; vst1q_lane on a 128-bit vector, written as plain IR rather than an
; intrinsic: extracts element 7 of the <8 x bfloat> argument and stores it
; with a scalar bfloat store (align 2). Expected output is a vmovx.f16 from s3
; into s0 followed by a vstr.16 to [r0].
621define arm_aapcs_vfpcc void @test_vst1q_lane_bf16(ptr nocapture %ptr, <8 x bfloat> %val) {
622; CHECK-LABEL: test_vst1q_lane_bf16:
623; CHECK:       @ %bb.0: @ %entry
624; CHECK-NEXT:    vmovx.f16 s0, s3
625; CHECK-NEXT:    vstr.16 s0, [r0]
626; CHECK-NEXT:    bx lr
627entry:
628  %0 = extractelement <8 x bfloat> %val, i32 7
629  store bfloat %0, ptr %ptr, align 2
630  ret void
631}
632
; vst1_x2 on 64-bit vectors: unpacks the [2 x <2 x i32>] argument, bitcasts
; both halves to <4 x bfloat> and stores them via
; llvm.arm.neon.vst1x2.p0.v4bf16. Expected to select one vst1.16 of {d0, d1};
; the expected output also contains llc's "@ kill:" annotation lines.
633define arm_aapcs_vfpcc void @test_vst1_bf16_x2(ptr nocapture %ptr, [2 x <2 x i32>] %val.coerce) {
634; CHECK-LABEL: test_vst1_bf16_x2:
635; CHECK:       @ %bb.0: @ %entry
636; CHECK-NEXT:    @ kill: def $d1 killed $d1 killed $q0 def $q0
637; CHECK-NEXT:    @ kill: def $d0 killed $d0 killed $q0 def $q0
638; CHECK-NEXT:    vst1.16 {d0, d1}, [r0]
639; CHECK-NEXT:    bx lr
640entry:
641  %val.coerce.fca.0.extract = extractvalue [2 x <2 x i32>] %val.coerce, 0
642  %val.coerce.fca.1.extract = extractvalue [2 x <2 x i32>] %val.coerce, 1
643  %0 = bitcast <2 x i32> %val.coerce.fca.0.extract to <4 x bfloat>
644  %1 = bitcast <2 x i32> %val.coerce.fca.1.extract to <4 x bfloat>
645  tail call void @llvm.arm.neon.vst1x2.p0.v4bf16(ptr %ptr, <4 x bfloat> %0, <4 x bfloat> %1)
646  ret void
647}
648
; vst1q_x2 on 128-bit vectors: unpacks the [2 x <4 x i32>] argument, bitcasts
; both halves to <8 x bfloat> and stores them via
; llvm.arm.neon.vst1x2.p0.v8bf16. Expected to select one vst1.16 of
; {d0, d1, d2, d3}, preceded by llc's "@ kill:" annotation lines.
649define arm_aapcs_vfpcc void @test_vst1q_bf16_x2(ptr nocapture %ptr, [2 x <4 x i32>] %val.coerce) {
650; CHECK-LABEL: test_vst1q_bf16_x2:
651; CHECK:       @ %bb.0: @ %entry
652; CHECK-NEXT:    @ kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
653; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
654; CHECK-NEXT:    vst1.16 {d0, d1, d2, d3}, [r0]
655; CHECK-NEXT:    bx lr
656entry:
657  %val.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %val.coerce, 0
658  %val.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %val.coerce, 1
659  %0 = bitcast <4 x i32> %val.coerce.fca.0.extract to <8 x bfloat>
660  %1 = bitcast <4 x i32> %val.coerce.fca.1.extract to <8 x bfloat>
661  tail call void @llvm.arm.neon.vst1x2.p0.v8bf16(ptr %ptr, <8 x bfloat> %0, <8 x bfloat> %1)
662  ret void
663}
664
; vst1_x3 on 64-bit vectors: unpacks the [3 x <2 x i32>] argument, bitcasts
; each element to <4 x bfloat> and stores all three via
; llvm.arm.neon.vst1x3.p0.v4bf16. Expected to select one vst1.16 of
; {d0, d1, d2}, preceded by llc's "@ kill:" annotation lines.
665define arm_aapcs_vfpcc void @test_vst1_bf16_x3(ptr nocapture %ptr, [3 x <2 x i32>] %val.coerce) {
666; CHECK-LABEL: test_vst1_bf16_x3:
667; CHECK:       @ %bb.0: @ %entry
668; CHECK-NEXT:    @ kill: def $d2 killed $d2 killed $q0_q1 def $q0_q1
669; CHECK-NEXT:    @ kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
670; CHECK-NEXT:    @ kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
671; CHECK-NEXT:    vst1.16 {d0, d1, d2}, [r0]
672; CHECK-NEXT:    bx lr
673entry:
674  %val.coerce.fca.0.extract = extractvalue [3 x <2 x i32>] %val.coerce, 0
675  %val.coerce.fca.1.extract = extractvalue [3 x <2 x i32>] %val.coerce, 1
676  %val.coerce.fca.2.extract = extractvalue [3 x <2 x i32>] %val.coerce, 2
677  %0 = bitcast <2 x i32> %val.coerce.fca.0.extract to <4 x bfloat>
678  %1 = bitcast <2 x i32> %val.coerce.fca.1.extract to <4 x bfloat>
679  %2 = bitcast <2 x i32> %val.coerce.fca.2.extract to <4 x bfloat>
680  tail call void @llvm.arm.neon.vst1x3.p0.v4bf16(ptr %ptr, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2)
681  ret void
682}
683
; vst1q_x3 on 128-bit vectors: unpacks the [3 x <4 x i32>] argument, bitcasts
; each element to <8 x bfloat> and stores all three via
; llvm.arm.neon.vst1x3.p0.v8bf16. Expected to split into two vst1.16
; instructions: {d0, d1, d2} with writeback ([r0]!) and then {d3, d4, d5}.
684define arm_aapcs_vfpcc void @test_vst1q_bf16_x3(ptr nocapture %ptr, [3 x <4 x i32>] %val.coerce) {
685; CHECK-LABEL: test_vst1q_bf16_x3:
686; CHECK:       @ %bb.0: @ %entry
687; CHECK-NEXT:    @ kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
688; CHECK-NEXT:    @ kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
689; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
690; CHECK-NEXT:    vst1.16 {d0, d1, d2}, [r0]!
691; CHECK-NEXT:    vst1.16 {d3, d4, d5}, [r0]
692; CHECK-NEXT:    bx lr
693entry:
694  %val.coerce.fca.0.extract = extractvalue [3 x <4 x i32>] %val.coerce, 0
695  %val.coerce.fca.1.extract = extractvalue [3 x <4 x i32>] %val.coerce, 1
696  %val.coerce.fca.2.extract = extractvalue [3 x <4 x i32>] %val.coerce, 2
697  %0 = bitcast <4 x i32> %val.coerce.fca.0.extract to <8 x bfloat>
698  %1 = bitcast <4 x i32> %val.coerce.fca.1.extract to <8 x bfloat>
699  %2 = bitcast <4 x i32> %val.coerce.fca.2.extract to <8 x bfloat>
700  tail call void @llvm.arm.neon.vst1x3.p0.v8bf16(ptr %ptr, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2)
701  ret void
702}
703
; vst1_x4 on 64-bit vectors: unpacks the [4 x <2 x i32>] argument, bitcasts
; each element to <4 x bfloat> and stores all four via
; llvm.arm.neon.vst1x4.p0.v4bf16. Expected to select one vst1.16 of
; {d0, d1, d2, d3}, preceded by llc's "@ kill:" annotation lines.
704define arm_aapcs_vfpcc void @test_vst1_bf16_x4(ptr nocapture %ptr, [4 x <2 x i32>] %val.coerce) {
705; CHECK-LABEL: test_vst1_bf16_x4:
706; CHECK:       @ %bb.0: @ %entry
707; CHECK-NEXT:    @ kill: def $d3 killed $d3 killed $q0_q1 def $q0_q1
708; CHECK-NEXT:    @ kill: def $d2 killed $d2 killed $q0_q1 def $q0_q1
709; CHECK-NEXT:    @ kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
710; CHECK-NEXT:    @ kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
711; CHECK-NEXT:    vst1.16 {d0, d1, d2, d3}, [r0]
712; CHECK-NEXT:    bx lr
713entry:
714  %val.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %val.coerce, 0
715  %val.coerce.fca.1.extract = extractvalue [4 x <2 x i32>] %val.coerce, 1
716  %val.coerce.fca.2.extract = extractvalue [4 x <2 x i32>] %val.coerce, 2
717  %val.coerce.fca.3.extract = extractvalue [4 x <2 x i32>] %val.coerce, 3
718  %0 = bitcast <2 x i32> %val.coerce.fca.0.extract to <4 x bfloat>
719  %1 = bitcast <2 x i32> %val.coerce.fca.1.extract to <4 x bfloat>
720  %2 = bitcast <2 x i32> %val.coerce.fca.2.extract to <4 x bfloat>
721  %3 = bitcast <2 x i32> %val.coerce.fca.3.extract to <4 x bfloat>
722  tail call void @llvm.arm.neon.vst1x4.p0.v4bf16(ptr %ptr, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, <4 x bfloat> %3)
723  ret void
724}
725
; vst1q_x4 on 128-bit vectors: unpacks the [4 x <4 x i32>] argument, bitcasts
; each element to <8 x bfloat> and stores all four via
; llvm.arm.neon.vst1x4.p0.v8bf16. Expected to split into two vst1.16
; instructions: {d0, d1, d2, d3} with writeback ([r0]!) and then
; {d4, d5, d6, d7}.
726define arm_aapcs_vfpcc void @test_vst1q_bf16_x4(ptr nocapture %ptr, [4 x <4 x i32>] %val.coerce) {
727; CHECK-LABEL: test_vst1q_bf16_x4:
728; CHECK:       @ %bb.0: @ %entry
729; CHECK-NEXT:    @ kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
730; CHECK-NEXT:    @ kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
731; CHECK-NEXT:    @ kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
732; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
733; CHECK-NEXT:    vst1.16 {d0, d1, d2, d3}, [r0]!
734; CHECK-NEXT:    vst1.16 {d4, d5, d6, d7}, [r0]
735; CHECK-NEXT:    bx lr
736entry:
737  %val.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %val.coerce, 0
738  %val.coerce.fca.1.extract = extractvalue [4 x <4 x i32>] %val.coerce, 1
739  %val.coerce.fca.2.extract = extractvalue [4 x <4 x i32>] %val.coerce, 2
740  %val.coerce.fca.3.extract = extractvalue [4 x <4 x i32>] %val.coerce, 3
741  %0 = bitcast <4 x i32> %val.coerce.fca.0.extract to <8 x bfloat>
742  %1 = bitcast <4 x i32> %val.coerce.fca.1.extract to <8 x bfloat>
743  %2 = bitcast <4 x i32> %val.coerce.fca.2.extract to <8 x bfloat>
744  %3 = bitcast <4 x i32> %val.coerce.fca.3.extract to <8 x bfloat>
745  tail call void @llvm.arm.neon.vst1x4.p0.v8bf16(ptr %ptr, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, <8 x bfloat> %3)
746  ret void
747}
748
; vst2 on 64-bit vectors: unpacks the [2 x <2 x i32>] argument, bitcasts both
; halves to <4 x bfloat> and stores them via llvm.arm.neon.vst2.p0.v4bf16
; (trailing i32 operand 2). Expected to select one vst2.16 of {d0, d1},
; preceded by llc's "@ kill:" annotation lines.
749define arm_aapcs_vfpcc void @test_vst2_bf16(ptr %ptr, [2 x <2 x i32>] %val.coerce) {
750; CHECK-LABEL: test_vst2_bf16:
751; CHECK:       @ %bb.0: @ %entry
752; CHECK-NEXT:    @ kill: def $d1 killed $d1 killed $q0 def $q0
753; CHECK-NEXT:    @ kill: def $d0 killed $d0 killed $q0 def $q0
754; CHECK-NEXT:    vst2.16 {d0, d1}, [r0]
755; CHECK-NEXT:    bx lr
756entry:
757  %val.coerce.fca.0.extract = extractvalue [2 x <2 x i32>] %val.coerce, 0
758  %val.coerce.fca.1.extract = extractvalue [2 x <2 x i32>] %val.coerce, 1
759  %0 = bitcast <2 x i32> %val.coerce.fca.0.extract to <4 x bfloat>
760  %1 = bitcast <2 x i32> %val.coerce.fca.1.extract to <4 x bfloat>
761  tail call void @llvm.arm.neon.vst2.p0.v4bf16(ptr %ptr, <4 x bfloat> %0, <4 x bfloat> %1, i32 2)
762  ret void
763}
764
; vst2q on 128-bit vectors: unpacks the [2 x <4 x i32>] argument, bitcasts
; both halves to <8 x bfloat> and stores them via
; llvm.arm.neon.vst2.p0.v8bf16 (trailing i32 operand 2). Expected to select
; one vst2.16 of {d0, d1, d2, d3}, preceded by llc's "@ kill:" annotations.
765define arm_aapcs_vfpcc void @test_vst2q_bf16(ptr %ptr, [2 x <4 x i32>] %val.coerce) {
766; CHECK-LABEL: test_vst2q_bf16:
767; CHECK:       @ %bb.0: @ %entry
768; CHECK-NEXT:    @ kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
769; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
770; CHECK-NEXT:    vst2.16 {d0, d1, d2, d3}, [r0]
771; CHECK-NEXT:    bx lr
772entry:
773  %val.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %val.coerce, 0
774  %val.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %val.coerce, 1
775  %0 = bitcast <4 x i32> %val.coerce.fca.0.extract to <8 x bfloat>
776  %1 = bitcast <4 x i32> %val.coerce.fca.1.extract to <8 x bfloat>
777  tail call void @llvm.arm.neon.vst2.p0.v8bf16(ptr %ptr, <8 x bfloat> %0, <8 x bfloat> %1, i32 2)
778  ret void
779}
780
; vst2_lane on 64-bit vectors: unpacks the [2 x <2 x i32>] argument, bitcasts
; both halves to <4 x bfloat> and calls llvm.arm.neon.vst2lane.p0.v4bf16 with
; i32 operands 1 and 2. Expected to select a vst2.16 of lane 1 from d0 and d1.
781define arm_aapcs_vfpcc void @test_vst2_lane_bf16(ptr %ptr, [2 x <2 x i32>] %val.coerce) {
782; CHECK-LABEL: test_vst2_lane_bf16:
783; CHECK:       @ %bb.0: @ %entry
784; CHECK-NEXT:    @ kill: def $d1 killed $d1 killed $q0 def $q0
785; CHECK-NEXT:    @ kill: def $d0 killed $d0 killed $q0 def $q0
786; CHECK-NEXT:    vst2.16 {d0[1], d1[1]}, [r0]
787; CHECK-NEXT:    bx lr
788entry:
789  %val.coerce.fca.0.extract = extractvalue [2 x <2 x i32>] %val.coerce, 0
790  %val.coerce.fca.1.extract = extractvalue [2 x <2 x i32>] %val.coerce, 1
791  %0 = bitcast <2 x i32> %val.coerce.fca.0.extract to <4 x bfloat>
792  %1 = bitcast <2 x i32> %val.coerce.fca.1.extract to <4 x bfloat>
793  tail call void @llvm.arm.neon.vst2lane.p0.v4bf16(ptr %ptr, <4 x bfloat> %0, <4 x bfloat> %1, i32 1, i32 2)
794  ret void
795}
796
; vst2q_lane on 128-bit vectors: unpacks the [2 x <4 x i32>] argument,
; bitcasts both halves to <8 x bfloat> and calls
; llvm.arm.neon.vst2lane.p0.v8bf16 with i32 operands 7 and 2. Expected to
; select a vst2.16 of lane 3 from d1 and d3.
797define arm_aapcs_vfpcc void @test_vst2q_lane_bf16(ptr %ptr, [2 x <4 x i32>] %val.coerce) {
798; CHECK-LABEL: test_vst2q_lane_bf16:
799; CHECK:       @ %bb.0: @ %entry
800; CHECK-NEXT:    @ kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
801; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
802; CHECK-NEXT:    vst2.16 {d1[3], d3[3]}, [r0]
803; CHECK-NEXT:    bx lr
804entry:
805  %val.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %val.coerce, 0
806  %val.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %val.coerce, 1
807  %0 = bitcast <4 x i32> %val.coerce.fca.0.extract to <8 x bfloat>
808  %1 = bitcast <4 x i32> %val.coerce.fca.1.extract to <8 x bfloat>
809  tail call void @llvm.arm.neon.vst2lane.p0.v8bf16(ptr %ptr, <8 x bfloat> %0, <8 x bfloat> %1, i32 7, i32 2)
810  ret void
811}
812
; vst3 on 64-bit vectors: unpacks the [3 x <2 x i32>] argument, bitcasts each
; element to <4 x bfloat> and stores all three via
; llvm.arm.neon.vst3.p0.v4bf16 (trailing i32 operand 2). Expected to select
; one vst3.16 of {d0, d1, d2}, preceded by llc's "@ kill:" annotations.
813define arm_aapcs_vfpcc void @test_vst3_bf16(ptr %ptr, [3 x <2 x i32>] %val.coerce) {
814; CHECK-LABEL: test_vst3_bf16:
815; CHECK:       @ %bb.0: @ %entry
816; CHECK-NEXT:    @ kill: def $d2 killed $d2 killed $q0_q1 def $q0_q1
817; CHECK-NEXT:    @ kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
818; CHECK-NEXT:    @ kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
819; CHECK-NEXT:    vst3.16 {d0, d1, d2}, [r0]
820; CHECK-NEXT:    bx lr
821entry:
822  %val.coerce.fca.0.extract = extractvalue [3 x <2 x i32>] %val.coerce, 0
823  %val.coerce.fca.1.extract = extractvalue [3 x <2 x i32>] %val.coerce, 1
824  %val.coerce.fca.2.extract = extractvalue [3 x <2 x i32>] %val.coerce, 2
825  %0 = bitcast <2 x i32> %val.coerce.fca.0.extract to <4 x bfloat>
826  %1 = bitcast <2 x i32> %val.coerce.fca.1.extract to <4 x bfloat>
827  %2 = bitcast <2 x i32> %val.coerce.fca.2.extract to <4 x bfloat>
828  tail call void @llvm.arm.neon.vst3.p0.v4bf16(ptr %ptr, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, i32 2)
829  ret void
830}
831
; vst3q on 128-bit vectors: unpacks the [3 x <4 x i32>] argument, bitcasts
; each element to <8 x bfloat> and stores all three via
; llvm.arm.neon.vst3.p0.v8bf16 (trailing i32 operand 2). Expected to split
; into two vst3.16 instructions: {d0, d2, d4} with writeback ([r0]!) and then
; {d1, d3, d5}.
832define arm_aapcs_vfpcc void @test_vst3q_bf16(ptr %ptr, [3 x <4 x i32>] %val.coerce) {
833; CHECK-LABEL: test_vst3q_bf16:
834; CHECK:       @ %bb.0: @ %entry
835; CHECK-NEXT:    @ kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
836; CHECK-NEXT:    @ kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
837; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
838; CHECK-NEXT:    vst3.16 {d0, d2, d4}, [r0]!
839; CHECK-NEXT:    vst3.16 {d1, d3, d5}, [r0]
840; CHECK-NEXT:    bx lr
841entry:
842  %val.coerce.fca.0.extract = extractvalue [3 x <4 x i32>] %val.coerce, 0
843  %val.coerce.fca.1.extract = extractvalue [3 x <4 x i32>] %val.coerce, 1
844  %val.coerce.fca.2.extract = extractvalue [3 x <4 x i32>] %val.coerce, 2
845  %0 = bitcast <4 x i32> %val.coerce.fca.0.extract to <8 x bfloat>
846  %1 = bitcast <4 x i32> %val.coerce.fca.1.extract to <8 x bfloat>
847  %2 = bitcast <4 x i32> %val.coerce.fca.2.extract to <8 x bfloat>
848  tail call void @llvm.arm.neon.vst3.p0.v8bf16(ptr %ptr, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, i32 2)
849  ret void
850}
851
; vst3_lane on 64-bit vectors: unpacks the [3 x <2 x i32>] argument, bitcasts
; each element to <4 x bfloat> and calls llvm.arm.neon.vst3lane.p0.v4bf16
; with i32 operands 1 and 2. Expected to select a vst3.16 of lane 1 from d0,
; d1 and d2.
852define arm_aapcs_vfpcc void @test_vst3_lane_bf16(ptr %ptr, [3 x <2 x i32>] %val.coerce) {
853; CHECK-LABEL: test_vst3_lane_bf16:
854; CHECK:       @ %bb.0: @ %entry
855; CHECK-NEXT:    @ kill: def $d2 killed $d2 killed $q0_q1 def $q0_q1
856; CHECK-NEXT:    @ kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
857; CHECK-NEXT:    @ kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
858; CHECK-NEXT:    vst3.16 {d0[1], d1[1], d2[1]}, [r0]
859; CHECK-NEXT:    bx lr
860entry:
861  %val.coerce.fca.0.extract = extractvalue [3 x <2 x i32>] %val.coerce, 0
862  %val.coerce.fca.1.extract = extractvalue [3 x <2 x i32>] %val.coerce, 1
863  %val.coerce.fca.2.extract = extractvalue [3 x <2 x i32>] %val.coerce, 2
864  %0 = bitcast <2 x i32> %val.coerce.fca.0.extract to <4 x bfloat>
865  %1 = bitcast <2 x i32> %val.coerce.fca.1.extract to <4 x bfloat>
866  %2 = bitcast <2 x i32> %val.coerce.fca.2.extract to <4 x bfloat>
867  tail call void @llvm.arm.neon.vst3lane.p0.v4bf16(ptr %ptr, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, i32 1, i32 2)
868  ret void
869}
870
; vst3q_lane on 128-bit vectors: unpacks the [3 x <4 x i32>] argument,
; bitcasts each element to <8 x bfloat> and calls
; llvm.arm.neon.vst3lane.p0.v8bf16 with i32 operands 7 and 2. Expected to
; select a vst3.16 of lane 3 from d1, d3 and d5.
871define arm_aapcs_vfpcc void @test_vst3q_lane_bf16(ptr %ptr, [3 x <4 x i32>] %val.coerce) {
872; CHECK-LABEL: test_vst3q_lane_bf16:
873; CHECK:       @ %bb.0: @ %entry
874; CHECK-NEXT:    @ kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
875; CHECK-NEXT:    @ kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
876; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
877; CHECK-NEXT:    vst3.16 {d1[3], d3[3], d5[3]}, [r0]
878; CHECK-NEXT:    bx lr
879entry:
880  %val.coerce.fca.0.extract = extractvalue [3 x <4 x i32>] %val.coerce, 0
881  %val.coerce.fca.1.extract = extractvalue [3 x <4 x i32>] %val.coerce, 1
882  %val.coerce.fca.2.extract = extractvalue [3 x <4 x i32>] %val.coerce, 2
883  %0 = bitcast <4 x i32> %val.coerce.fca.0.extract to <8 x bfloat>
884  %1 = bitcast <4 x i32> %val.coerce.fca.1.extract to <8 x bfloat>
885  %2 = bitcast <4 x i32> %val.coerce.fca.2.extract to <8 x bfloat>
886  tail call void @llvm.arm.neon.vst3lane.p0.v8bf16(ptr %ptr, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, i32 7, i32 2)
887  ret void
888}
889
; vst4 on 64-bit vectors: unpacks the [4 x <2 x i32>] argument, bitcasts each
; element to <4 x bfloat> and stores all four via
; llvm.arm.neon.vst4.p0.v4bf16 (trailing i32 operand 2). Expected to select
; one vst4.16 of {d0, d1, d2, d3}, preceded by llc's "@ kill:" annotations.
890define arm_aapcs_vfpcc void @test_vst4_bf16(ptr %ptr, [4 x <2 x i32>] %val.coerce) {
891; CHECK-LABEL: test_vst4_bf16:
892; CHECK:       @ %bb.0: @ %entry
893; CHECK-NEXT:    @ kill: def $d3 killed $d3 killed $q0_q1 def $q0_q1
894; CHECK-NEXT:    @ kill: def $d2 killed $d2 killed $q0_q1 def $q0_q1
895; CHECK-NEXT:    @ kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
896; CHECK-NEXT:    @ kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
897; CHECK-NEXT:    vst4.16 {d0, d1, d2, d3}, [r0]
898; CHECK-NEXT:    bx lr
899entry:
900  %val.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %val.coerce, 0
901  %val.coerce.fca.1.extract = extractvalue [4 x <2 x i32>] %val.coerce, 1
902  %val.coerce.fca.2.extract = extractvalue [4 x <2 x i32>] %val.coerce, 2
903  %val.coerce.fca.3.extract = extractvalue [4 x <2 x i32>] %val.coerce, 3
904  %0 = bitcast <2 x i32> %val.coerce.fca.0.extract to <4 x bfloat>
905  %1 = bitcast <2 x i32> %val.coerce.fca.1.extract to <4 x bfloat>
906  %2 = bitcast <2 x i32> %val.coerce.fca.2.extract to <4 x bfloat>
907  %3 = bitcast <2 x i32> %val.coerce.fca.3.extract to <4 x bfloat>
908  tail call void @llvm.arm.neon.vst4.p0.v4bf16(ptr %ptr, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, <4 x bfloat> %3, i32 2)
909  ret void
910}
911
; vst4q on 128-bit vectors: unpacks the [4 x <4 x i32>] argument, bitcasts
; each element to <8 x bfloat> and stores all four via
; llvm.arm.neon.vst4.p0.v8bf16 (trailing i32 operand 2). Expected to split
; into two vst4.16 instructions: {d0, d2, d4, d6} with writeback ([r0]!) and
; then {d1, d3, d5, d7}.
912define arm_aapcs_vfpcc void @test_vst4q_bf16(ptr %ptr, [4 x <4 x i32>] %val.coerce) {
913; CHECK-LABEL: test_vst4q_bf16:
914; CHECK:       @ %bb.0: @ %entry
915; CHECK-NEXT:    @ kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
916; CHECK-NEXT:    @ kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
917; CHECK-NEXT:    @ kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
918; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
919; CHECK-NEXT:    vst4.16 {d0, d2, d4, d6}, [r0]!
920; CHECK-NEXT:    vst4.16 {d1, d3, d5, d7}, [r0]
921; CHECK-NEXT:    bx lr
922entry:
923  %val.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %val.coerce, 0
924  %val.coerce.fca.1.extract = extractvalue [4 x <4 x i32>] %val.coerce, 1
925  %val.coerce.fca.2.extract = extractvalue [4 x <4 x i32>] %val.coerce, 2
926  %val.coerce.fca.3.extract = extractvalue [4 x <4 x i32>] %val.coerce, 3
927  %0 = bitcast <4 x i32> %val.coerce.fca.0.extract to <8 x bfloat>
928  %1 = bitcast <4 x i32> %val.coerce.fca.1.extract to <8 x bfloat>
929  %2 = bitcast <4 x i32> %val.coerce.fca.2.extract to <8 x bfloat>
930  %3 = bitcast <4 x i32> %val.coerce.fca.3.extract to <8 x bfloat>
931  tail call void @llvm.arm.neon.vst4.p0.v8bf16(ptr %ptr, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, <8 x bfloat> %3, i32 2)
932  ret void
933}
934
; vst4_lane on 64-bit vectors: unpacks the [4 x <2 x i32>] argument, bitcasts
; each element to <4 x bfloat> and calls llvm.arm.neon.vst4lane.p0.v4bf16
; with i32 operands 1 and 2. Expected to select a vst4.16 of lane 1 from
; d0, d1, d2 and d3.
935define arm_aapcs_vfpcc void @test_vst4_lane_bf16(ptr %ptr, [4 x <2 x i32>] %val.coerce) {
936; CHECK-LABEL: test_vst4_lane_bf16:
937; CHECK:       @ %bb.0: @ %entry
938; CHECK-NEXT:    @ kill: def $d3 killed $d3 killed $q0_q1 def $q0_q1
939; CHECK-NEXT:    @ kill: def $d2 killed $d2 killed $q0_q1 def $q0_q1
940; CHECK-NEXT:    @ kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
941; CHECK-NEXT:    @ kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
942; CHECK-NEXT:    vst4.16 {d0[1], d1[1], d2[1], d3[1]}, [r0]
943; CHECK-NEXT:    bx lr
944entry:
945  %val.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %val.coerce, 0
946  %val.coerce.fca.1.extract = extractvalue [4 x <2 x i32>] %val.coerce, 1
947  %val.coerce.fca.2.extract = extractvalue [4 x <2 x i32>] %val.coerce, 2
948  %val.coerce.fca.3.extract = extractvalue [4 x <2 x i32>] %val.coerce, 3
949  %0 = bitcast <2 x i32> %val.coerce.fca.0.extract to <4 x bfloat>
950  %1 = bitcast <2 x i32> %val.coerce.fca.1.extract to <4 x bfloat>
951  %2 = bitcast <2 x i32> %val.coerce.fca.2.extract to <4 x bfloat>
952  %3 = bitcast <2 x i32> %val.coerce.fca.3.extract to <4 x bfloat>
953  tail call void @llvm.arm.neon.vst4lane.p0.v4bf16(ptr %ptr, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, <4 x bfloat> %3, i32 1, i32 2)
954  ret void
955}
956
; vst4q_lane on 128-bit vectors: unpacks the [4 x <4 x i32>] argument,
; bitcasts each element to <8 x bfloat> and calls
; llvm.arm.neon.vst4lane.p0.v8bf16 with i32 operands 7 and 2. Expected to
; select a vst4.16 of lane 3 from d1, d3, d5 and d7.
957define arm_aapcs_vfpcc void @test_vst4q_lane_bf16(ptr %ptr, [4 x <4 x i32>] %val.coerce) {
958; CHECK-LABEL: test_vst4q_lane_bf16:
959; CHECK:       @ %bb.0: @ %entry
960; CHECK-NEXT:    @ kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
961; CHECK-NEXT:    @ kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
962; CHECK-NEXT:    @ kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
963; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
964; CHECK-NEXT:    vst4.16 {d1[3], d3[3], d5[3], d7[3]}, [r0]
965; CHECK-NEXT:    bx lr
966entry:
967  %val.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %val.coerce, 0
968  %val.coerce.fca.1.extract = extractvalue [4 x <4 x i32>] %val.coerce, 1
969  %val.coerce.fca.2.extract = extractvalue [4 x <4 x i32>] %val.coerce, 2
970  %val.coerce.fca.3.extract = extractvalue [4 x <4 x i32>] %val.coerce, 3
971  %0 = bitcast <4 x i32> %val.coerce.fca.0.extract to <8 x bfloat>
972  %1 = bitcast <4 x i32> %val.coerce.fca.1.extract to <8 x bfloat>
973  %2 = bitcast <4 x i32> %val.coerce.fca.2.extract to <8 x bfloat>
974  %3 = bitcast <4 x i32> %val.coerce.fca.3.extract to <8 x bfloat>
975  tail call void @llvm.arm.neon.vst4lane.p0.v8bf16(ptr %ptr, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, <8 x bfloat> %3, i32 7, i32 2)
976  ret void
977}
978
; Declarations of the vld2/vld3/vld4 intrinsics for <4 x bfloat> and
; <8 x bfloat>. Each takes (ptr, i32) and returns a struct of 2, 3 or 4 vectors.
979declare { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2.v4bf16.p0(ptr, i32)
980declare { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2.v8bf16.p0(ptr, i32)
981declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3.v4bf16.p0(ptr, i32)
982declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3.v8bf16.p0(ptr, i32)
983declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4.v4bf16.p0(ptr, i32)
984declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4.v8bf16.p0(ptr, i32)
985
; Declarations of the vld2dup/vld3dup/vld4dup intrinsics for <4 x bfloat> and
; <8 x bfloat>. Each takes (ptr, i32) and returns a struct of 2, 3 or 4 vectors.
986declare { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2dup.v4bf16.p0(ptr, i32)
987declare { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2dup.v8bf16.p0(ptr, i32)
988declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3dup.v4bf16.p0(ptr, i32)
989declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3dup.v8bf16.p0(ptr, i32)
990declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4dup.v4bf16.p0(ptr, i32)
991declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4dup.v8bf16.p0(ptr, i32)
992
; Declarations of the vld1x2/vld1x3/vld1x4 intrinsics for <4 x bfloat> and
; <8 x bfloat>. Each takes only a ptr and returns a struct of 2, 3 or 4 vectors.
993declare { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x2.v4bf16.p0(ptr)
994declare { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x2.v8bf16.p0(ptr)
995declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x3.v4bf16.p0(ptr)
996declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x3.v8bf16.p0(ptr)
997declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x4.v4bf16.p0(ptr)
998declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x4.v8bf16.p0(ptr)
999
; Declarations of the vld2lane/vld3lane/vld4lane intrinsics for <4 x bfloat>
; and <8 x bfloat>. Each takes a ptr, one input vector per result element, and
; two trailing i32 operands.
1000declare { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2lane.v4bf16.p0(ptr, <4 x bfloat>, <4 x bfloat>, i32, i32)
1001declare { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2lane.v8bf16.p0(ptr, <8 x bfloat>, <8 x bfloat>, i32, i32)
1002declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3lane.v4bf16.p0(ptr, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i32, i32)
1003declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3lane.v8bf16.p0(ptr, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i32, i32)
1004declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4lane.v4bf16.p0(ptr, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i32, i32)
1005declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4lane.v8bf16.p0(ptr, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i32, i32)
1006
; Declarations of the vst1/vst2/vst3/vst4 intrinsics for <4 x bfloat> and
; <8 x bfloat>. Each takes a ptr, 1 to 4 vectors to store, and a trailing i32
; operand.
1007declare void @llvm.arm.neon.vst1.p0.v4bf16(ptr, <4 x bfloat>, i32)
1008declare void @llvm.arm.neon.vst1.p0.v8bf16(ptr, <8 x bfloat>, i32)
1009declare void @llvm.arm.neon.vst2.p0.v4bf16(ptr, <4 x bfloat>, <4 x bfloat>, i32)
1010declare void @llvm.arm.neon.vst2.p0.v8bf16(ptr, <8 x bfloat>, <8 x bfloat>, i32)
1011declare void @llvm.arm.neon.vst3.p0.v4bf16(ptr, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i32)
1012declare void @llvm.arm.neon.vst3.p0.v8bf16(ptr, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i32)
1013declare void @llvm.arm.neon.vst4.p0.v4bf16(ptr, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i32)
1014declare void @llvm.arm.neon.vst4.p0.v8bf16(ptr, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i32)
1015
; Declarations of the vst1x2/vst1x3/vst1x4 intrinsics for <4 x bfloat> and
; <8 x bfloat>. Each takes a nocapture ptr and 2 to 4 vectors to store, with
; no trailing i32 operand.
1016declare void @llvm.arm.neon.vst1x2.p0.v4bf16(ptr nocapture, <4 x bfloat>, <4 x bfloat>)
1017declare void @llvm.arm.neon.vst1x2.p0.v8bf16(ptr nocapture, <8 x bfloat>, <8 x bfloat>)
1018declare void @llvm.arm.neon.vst1x3.p0.v4bf16(ptr nocapture, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>)
1019declare void @llvm.arm.neon.vst1x3.p0.v8bf16(ptr nocapture, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>)
1020declare void @llvm.arm.neon.vst1x4.p0.v4bf16(ptr nocapture, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>)
1021declare void @llvm.arm.neon.vst1x4.p0.v8bf16(ptr nocapture, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>)
1022
; Declarations of the vst2lane/vst3lane/vst4lane intrinsics for <4 x bfloat>
; and <8 x bfloat>. Each takes a ptr, 2 to 4 source vectors, and two trailing
; i32 operands.
1023declare void @llvm.arm.neon.vst2lane.p0.v4bf16(ptr, <4 x bfloat>, <4 x bfloat>, i32, i32)
1024declare void @llvm.arm.neon.vst2lane.p0.v8bf16(ptr, <8 x bfloat>, <8 x bfloat>, i32, i32)
1025declare void @llvm.arm.neon.vst3lane.p0.v4bf16(ptr, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i32, i32)
1026declare void @llvm.arm.neon.vst3lane.p0.v8bf16(ptr, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i32, i32)
1027declare void @llvm.arm.neon.vst4lane.p0.v4bf16(ptr, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i32, i32)
1028declare void @llvm.arm.neon.vst4lane.p0.v8bf16(ptr, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i32, i32)
1029