xref: /llvm-project/clang/test/CodeGen/AArch64/bf16-ldst-intrinsics.c (revision 207e5ccceec8d3cc3f32723e78f2a142bc61b07d)
1*207e5cccSFangrui Song // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
2*207e5cccSFangrui Song // RUN: %clang_cc1 -triple aarch64 -target-feature +neon -target-feature +bf16 \
3*207e5cccSFangrui Song // RUN:  -O2 -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK64
4*207e5cccSFangrui Song // RUN: %clang_cc1 -triple armv8.6a-arm-none-eabi -target-feature +neon -target-feature +bf16 -mfloat-abi hard \
5*207e5cccSFangrui Song // RUN:  -O2 -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK32
6*207e5cccSFangrui Song 
7*207e5cccSFangrui Song // REQUIRES: arm-registered-target,aarch64-registered-target
8*207e5cccSFangrui Song 
9*207e5cccSFangrui Song #include "arm_neon.h"
10*207e5cccSFangrui Song 
11*207e5cccSFangrui Song // CHECK-LABEL: @test_vld1_bf16(
12*207e5cccSFangrui Song // CHECK-NEXT:  entry:
13*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = load <4 x bfloat>, ptr [[PTR:%.*]], align 2
14*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x bfloat> [[TMP1]]
15*207e5cccSFangrui Song //
16*207e5cccSFangrui Song bfloat16x4_t test_vld1_bf16(bfloat16_t const *ptr) {
17*207e5cccSFangrui Song   return vld1_bf16(ptr);
18*207e5cccSFangrui Song }
19*207e5cccSFangrui Song 
20*207e5cccSFangrui Song // CHECK-LABEL: @test_vld1q_bf16(
21*207e5cccSFangrui Song // CHECK-NEXT:  entry:
22*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = load <8 x bfloat>, ptr [[PTR:%.*]], align 2
23*207e5cccSFangrui Song // CHECK-NEXT:    ret <8 x bfloat> [[TMP1]]
24*207e5cccSFangrui Song //
25*207e5cccSFangrui Song bfloat16x8_t test_vld1q_bf16(bfloat16_t const *ptr) {
26*207e5cccSFangrui Song   return vld1q_bf16(ptr);
27*207e5cccSFangrui Song }
28*207e5cccSFangrui Song 
29*207e5cccSFangrui Song // CHECK-LABEL: @test_vld1_lane_bf16(
30*207e5cccSFangrui Song // CHECK-NEXT:  entry:
31*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = load bfloat, ptr [[PTR:%.*]], align 2
32*207e5cccSFangrui Song // CHECK-NEXT:    [[VLD1_LANE:%.*]] = insertelement <4 x bfloat> [[SRC:%.*]], bfloat [[TMP0]], i64 0
33*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x bfloat> [[VLD1_LANE]]
34*207e5cccSFangrui Song //
35*207e5cccSFangrui Song bfloat16x4_t test_vld1_lane_bf16(bfloat16_t const *ptr, bfloat16x4_t src) {
36*207e5cccSFangrui Song   return vld1_lane_bf16(ptr, src, 0);
37*207e5cccSFangrui Song }
38*207e5cccSFangrui Song 
39*207e5cccSFangrui Song // CHECK-LABEL: @test_vld1q_lane_bf16(
40*207e5cccSFangrui Song // CHECK-NEXT:  entry:
41*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = load bfloat, ptr [[PTR:%.*]], align 2
42*207e5cccSFangrui Song // CHECK-NEXT:    [[VLD1_LANE:%.*]] = insertelement <8 x bfloat> [[SRC:%.*]], bfloat [[TMP0]], i64 7
43*207e5cccSFangrui Song // CHECK-NEXT:    ret <8 x bfloat> [[VLD1_LANE]]
44*207e5cccSFangrui Song //
45*207e5cccSFangrui Song bfloat16x8_t test_vld1q_lane_bf16(bfloat16_t const *ptr, bfloat16x8_t src) {
46*207e5cccSFangrui Song   return vld1q_lane_bf16(ptr, src, 7);
47*207e5cccSFangrui Song }
48*207e5cccSFangrui Song 
49*207e5cccSFangrui Song // CHECK-LABEL: @test_vld1_dup_bf16(
50*207e5cccSFangrui Song // CHECK-NEXT:  entry:
51*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = load bfloat, ptr [[PTR:%.*]], align 2
52*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x bfloat> poison, bfloat [[TMP0]], i64 0
53*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x bfloat> [[TMP1]], <4 x bfloat> poison, <4 x i32> zeroinitializer
54*207e5cccSFangrui Song // CHECK-NEXT:    ret <4 x bfloat> [[LANE]]
55*207e5cccSFangrui Song //
56*207e5cccSFangrui Song bfloat16x4_t test_vld1_dup_bf16(bfloat16_t const *ptr) {
57*207e5cccSFangrui Song   return vld1_dup_bf16(ptr);
58*207e5cccSFangrui Song }
59*207e5cccSFangrui Song 
60*207e5cccSFangrui Song // CHECK64-LABEL: @test_vld1_bf16_x2(
61*207e5cccSFangrui Song // CHECK64-NEXT:  entry:
62*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD1XN:%.*]] = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x2.v4bf16.p0(ptr [[PTR:%.*]])
63*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD1XN_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 0
64*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD1XN_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 1
65*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X2_T:%.*]] poison, <4 x bfloat> [[VLD1XN_FCA_0_EXTRACT]], 0, 0
66*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X2_T]] [[DOTFCA_0_0_INSERT]], <4 x bfloat> [[VLD1XN_FCA_1_EXTRACT]], 0, 1
67*207e5cccSFangrui Song // CHECK64-NEXT:    ret [[STRUCT_BFLOAT16X4X2_T]] [[DOTFCA_0_1_INSERT]]
68*207e5cccSFangrui Song //
69*207e5cccSFangrui Song // CHECK32-LABEL: @test_vld1_bf16_x2(
70*207e5cccSFangrui Song // CHECK32-NEXT:  entry:
71*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD1XN:%.*]] = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x2.v4bf16.p0(ptr [[PTR:%.*]])
72*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD1XN_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 0
73*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD1XN_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 1
74*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP0:%.*]] = bitcast <4 x bfloat> [[VLD1XN_FCA_0_EXTRACT]] to <2 x i32>
75*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP1:%.*]] = bitcast <4 x bfloat> [[VLD1XN_FCA_1_EXTRACT]] to <2 x i32>
76*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_0_INSERT:%.*]] = insertvalue [2 x <2 x i32>] poison, <2 x i32> [[TMP0]], 0
77*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x <2 x i32>] [[DOTFCA_0_INSERT]], <2 x i32> [[TMP1]], 1
78*207e5cccSFangrui Song // CHECK32-NEXT:    ret [2 x <2 x i32>] [[DOTFCA_1_INSERT]]
79*207e5cccSFangrui Song //
80*207e5cccSFangrui Song bfloat16x4x2_t test_vld1_bf16_x2(bfloat16_t const *ptr) {
81*207e5cccSFangrui Song   return vld1_bf16_x2(ptr);
82*207e5cccSFangrui Song }
83*207e5cccSFangrui Song 
84*207e5cccSFangrui Song // CHECK64-LABEL: @test_vld1q_bf16_x2(
85*207e5cccSFangrui Song // CHECK64-NEXT:  entry:
86*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD1XN:%.*]] = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x2.v8bf16.p0(ptr [[PTR:%.*]])
87*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD1XN_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 0
88*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD1XN_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 1
89*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X2_T:%.*]] poison, <8 x bfloat> [[VLD1XN_FCA_0_EXTRACT]], 0, 0
90*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x bfloat> [[VLD1XN_FCA_1_EXTRACT]], 0, 1
91*207e5cccSFangrui Song // CHECK64-NEXT:    ret [[STRUCT_BFLOAT16X8X2_T]] [[DOTFCA_0_1_INSERT]]
92*207e5cccSFangrui Song //
93*207e5cccSFangrui Song // CHECK32-LABEL: @test_vld1q_bf16_x2(
94*207e5cccSFangrui Song // CHECK32-NEXT:  entry:
95*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD1XN:%.*]] = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x2.v8bf16.p0(ptr [[PTR:%.*]])
96*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD1XN_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 0
97*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD1XN_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 1
98*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP0:%.*]] = bitcast <8 x bfloat> [[VLD1XN_FCA_0_EXTRACT]] to <4 x i32>
99*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP1:%.*]] = bitcast <8 x bfloat> [[VLD1XN_FCA_1_EXTRACT]] to <4 x i32>
100*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_0_INSERT:%.*]] = insertvalue [2 x <4 x i32>] poison, <4 x i32> [[TMP0]], 0
101*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x <4 x i32>] [[DOTFCA_0_INSERT]], <4 x i32> [[TMP1]], 1
102*207e5cccSFangrui Song // CHECK32-NEXT:    ret [2 x <4 x i32>] [[DOTFCA_1_INSERT]]
103*207e5cccSFangrui Song //
104*207e5cccSFangrui Song bfloat16x8x2_t test_vld1q_bf16_x2(bfloat16_t const *ptr) {
105*207e5cccSFangrui Song   return vld1q_bf16_x2(ptr);
106*207e5cccSFangrui Song }
107*207e5cccSFangrui Song 
108*207e5cccSFangrui Song // CHECK64-LABEL: @test_vld1_bf16_x3(
109*207e5cccSFangrui Song // CHECK64-NEXT:  entry:
110*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD1XN:%.*]] = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x3.v4bf16.p0(ptr [[PTR:%.*]])
111*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD1XN_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 0
112*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD1XN_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 1
113*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD1XN_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 2
114*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X3_T:%.*]] poison, <4 x bfloat> [[VLD1XN_FCA_0_EXTRACT]], 0, 0
115*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X3_T]] [[DOTFCA_0_0_INSERT]], <4 x bfloat> [[VLD1XN_FCA_1_EXTRACT]], 0, 1
116*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X3_T]] [[DOTFCA_0_1_INSERT]], <4 x bfloat> [[VLD1XN_FCA_2_EXTRACT]], 0, 2
117*207e5cccSFangrui Song // CHECK64-NEXT:    ret [[STRUCT_BFLOAT16X4X3_T]] [[DOTFCA_0_2_INSERT]]
118*207e5cccSFangrui Song //
119*207e5cccSFangrui Song // CHECK32-LABEL: @test_vld1_bf16_x3(
120*207e5cccSFangrui Song // CHECK32-NEXT:  entry:
121*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD1XN:%.*]] = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x3.v4bf16.p0(ptr [[PTR:%.*]])
122*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD1XN_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 0
123*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD1XN_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 1
124*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD1XN_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 2
125*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP0:%.*]] = bitcast <4 x bfloat> [[VLD1XN_FCA_0_EXTRACT]] to <2 x i32>
126*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP1:%.*]] = bitcast <4 x bfloat> [[VLD1XN_FCA_1_EXTRACT]] to <2 x i32>
127*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP2:%.*]] = bitcast <4 x bfloat> [[VLD1XN_FCA_2_EXTRACT]] to <2 x i32>
128*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_0_INSERT:%.*]] = insertvalue [3 x <2 x i32>] poison, <2 x i32> [[TMP0]], 0
129*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_1_INSERT:%.*]] = insertvalue [3 x <2 x i32>] [[DOTFCA_0_INSERT]], <2 x i32> [[TMP1]], 1
130*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_2_INSERT:%.*]] = insertvalue [3 x <2 x i32>] [[DOTFCA_1_INSERT]], <2 x i32> [[TMP2]], 2
131*207e5cccSFangrui Song // CHECK32-NEXT:    ret [3 x <2 x i32>] [[DOTFCA_2_INSERT]]
132*207e5cccSFangrui Song //
133*207e5cccSFangrui Song bfloat16x4x3_t test_vld1_bf16_x3(bfloat16_t const *ptr) {
134*207e5cccSFangrui Song   return vld1_bf16_x3(ptr);
135*207e5cccSFangrui Song }
136*207e5cccSFangrui Song 
137*207e5cccSFangrui Song // CHECK64-LABEL: @test_vld1q_bf16_x3(
138*207e5cccSFangrui Song // CHECK64-NEXT:  entry:
139*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD1XN:%.*]] = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x3.v8bf16.p0(ptr [[PTR:%.*]])
140*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD1XN_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 0
141*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD1XN_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 1
142*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD1XN_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 2
143*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X3_T:%.*]] poison, <8 x bfloat> [[VLD1XN_FCA_0_EXTRACT]], 0, 0
144*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X3_T]] [[DOTFCA_0_0_INSERT]], <8 x bfloat> [[VLD1XN_FCA_1_EXTRACT]], 0, 1
145*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X3_T]] [[DOTFCA_0_1_INSERT]], <8 x bfloat> [[VLD1XN_FCA_2_EXTRACT]], 0, 2
146*207e5cccSFangrui Song // CHECK64-NEXT:    ret [[STRUCT_BFLOAT16X8X3_T]] [[DOTFCA_0_2_INSERT]]
147*207e5cccSFangrui Song //
148*207e5cccSFangrui Song // CHECK32-LABEL: @test_vld1q_bf16_x3(
149*207e5cccSFangrui Song // CHECK32-NEXT:  entry:
150*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD1XN:%.*]] = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x3.v8bf16.p0(ptr [[PTR:%.*]])
151*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD1XN_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 0
152*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD1XN_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 1
153*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD1XN_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 2
154*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP0:%.*]] = bitcast <8 x bfloat> [[VLD1XN_FCA_0_EXTRACT]] to <4 x i32>
155*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP1:%.*]] = bitcast <8 x bfloat> [[VLD1XN_FCA_1_EXTRACT]] to <4 x i32>
156*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP2:%.*]] = bitcast <8 x bfloat> [[VLD1XN_FCA_2_EXTRACT]] to <4 x i32>
157*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_0_INSERT:%.*]] = insertvalue [3 x <4 x i32>] poison, <4 x i32> [[TMP0]], 0
158*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_1_INSERT:%.*]] = insertvalue [3 x <4 x i32>] [[DOTFCA_0_INSERT]], <4 x i32> [[TMP1]], 1
159*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_2_INSERT:%.*]] = insertvalue [3 x <4 x i32>] [[DOTFCA_1_INSERT]], <4 x i32> [[TMP2]], 2
160*207e5cccSFangrui Song // CHECK32-NEXT:    ret [3 x <4 x i32>] [[DOTFCA_2_INSERT]]
161*207e5cccSFangrui Song //
162*207e5cccSFangrui Song bfloat16x8x3_t test_vld1q_bf16_x3(bfloat16_t const *ptr) {
163*207e5cccSFangrui Song   return vld1q_bf16_x3(ptr);
164*207e5cccSFangrui Song }
165*207e5cccSFangrui Song 
166*207e5cccSFangrui Song // CHECK64-LABEL: @test_vld1_bf16_x4(
167*207e5cccSFangrui Song // CHECK64-NEXT:  entry:
168*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD1XN:%.*]] = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x4.v4bf16.p0(ptr [[PTR:%.*]])
169*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD1XN_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 0
170*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD1XN_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 1
171*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD1XN_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 2
172*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD1XN_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 3
173*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X4_T:%.*]] poison, <4 x bfloat> [[VLD1XN_FCA_0_EXTRACT]], 0, 0
174*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X4_T]] [[DOTFCA_0_0_INSERT]], <4 x bfloat> [[VLD1XN_FCA_1_EXTRACT]], 0, 1
175*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X4_T]] [[DOTFCA_0_1_INSERT]], <4 x bfloat> [[VLD1XN_FCA_2_EXTRACT]], 0, 2
176*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X4_T]] [[DOTFCA_0_2_INSERT]], <4 x bfloat> [[VLD1XN_FCA_3_EXTRACT]], 0, 3
177*207e5cccSFangrui Song // CHECK64-NEXT:    ret [[STRUCT_BFLOAT16X4X4_T]] [[DOTFCA_0_3_INSERT]]
178*207e5cccSFangrui Song //
179*207e5cccSFangrui Song // CHECK32-LABEL: @test_vld1_bf16_x4(
180*207e5cccSFangrui Song // CHECK32-NEXT:  entry:
181*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD1XN:%.*]] = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x4.v4bf16.p0(ptr [[PTR:%.*]])
182*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD1XN_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 0
183*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD1XN_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 1
184*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD1XN_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 2
185*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD1XN_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 3
186*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP0:%.*]] = bitcast <4 x bfloat> [[VLD1XN_FCA_0_EXTRACT]] to <2 x i32>
187*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP1:%.*]] = bitcast <4 x bfloat> [[VLD1XN_FCA_1_EXTRACT]] to <2 x i32>
188*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP2:%.*]] = bitcast <4 x bfloat> [[VLD1XN_FCA_2_EXTRACT]] to <2 x i32>
189*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP3:%.*]] = bitcast <4 x bfloat> [[VLD1XN_FCA_3_EXTRACT]] to <2 x i32>
190*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x <2 x i32>] poison, <2 x i32> [[TMP0]], 0
191*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x <2 x i32>] [[DOTFCA_0_INSERT]], <2 x i32> [[TMP1]], 1
192*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x <2 x i32>] [[DOTFCA_1_INSERT]], <2 x i32> [[TMP2]], 2
193*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x <2 x i32>] [[DOTFCA_2_INSERT]], <2 x i32> [[TMP3]], 3
194*207e5cccSFangrui Song // CHECK32-NEXT:    ret [4 x <2 x i32>] [[DOTFCA_3_INSERT]]
195*207e5cccSFangrui Song //
196*207e5cccSFangrui Song bfloat16x4x4_t test_vld1_bf16_x4(bfloat16_t const *ptr) {
197*207e5cccSFangrui Song   return vld1_bf16_x4(ptr);
198*207e5cccSFangrui Song }
199*207e5cccSFangrui Song 
200*207e5cccSFangrui Song // CHECK64-LABEL: @test_vld1q_bf16_x4(
201*207e5cccSFangrui Song // CHECK64-NEXT:  entry:
202*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD1XN:%.*]] = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x4.v8bf16.p0(ptr [[PTR:%.*]])
203*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD1XN_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 0
204*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD1XN_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 1
205*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD1XN_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 2
206*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD1XN_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 3
207*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X4_T:%.*]] poison, <8 x bfloat> [[VLD1XN_FCA_0_EXTRACT]], 0, 0
208*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X4_T]] [[DOTFCA_0_0_INSERT]], <8 x bfloat> [[VLD1XN_FCA_1_EXTRACT]], 0, 1
209*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X4_T]] [[DOTFCA_0_1_INSERT]], <8 x bfloat> [[VLD1XN_FCA_2_EXTRACT]], 0, 2
210*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X4_T]] [[DOTFCA_0_2_INSERT]], <8 x bfloat> [[VLD1XN_FCA_3_EXTRACT]], 0, 3
211*207e5cccSFangrui Song // CHECK64-NEXT:    ret [[STRUCT_BFLOAT16X8X4_T]] [[DOTFCA_0_3_INSERT]]
212*207e5cccSFangrui Song //
213*207e5cccSFangrui Song // CHECK32-LABEL: @test_vld1q_bf16_x4(
214*207e5cccSFangrui Song // CHECK32-NEXT:  entry:
215*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD1XN:%.*]] = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x4.v8bf16.p0(ptr [[PTR:%.*]])
216*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD1XN_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 0
217*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD1XN_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 1
218*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD1XN_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 2
219*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD1XN_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 3
220*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP0:%.*]] = bitcast <8 x bfloat> [[VLD1XN_FCA_0_EXTRACT]] to <4 x i32>
221*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP1:%.*]] = bitcast <8 x bfloat> [[VLD1XN_FCA_1_EXTRACT]] to <4 x i32>
222*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP2:%.*]] = bitcast <8 x bfloat> [[VLD1XN_FCA_2_EXTRACT]] to <4 x i32>
223*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP3:%.*]] = bitcast <8 x bfloat> [[VLD1XN_FCA_3_EXTRACT]] to <4 x i32>
224*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x <4 x i32>] poison, <4 x i32> [[TMP0]], 0
225*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x <4 x i32>] [[DOTFCA_0_INSERT]], <4 x i32> [[TMP1]], 1
226*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x <4 x i32>] [[DOTFCA_1_INSERT]], <4 x i32> [[TMP2]], 2
227*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x <4 x i32>] [[DOTFCA_2_INSERT]], <4 x i32> [[TMP3]], 3
228*207e5cccSFangrui Song // CHECK32-NEXT:    ret [4 x <4 x i32>] [[DOTFCA_3_INSERT]]
229*207e5cccSFangrui Song //
230*207e5cccSFangrui Song bfloat16x8x4_t test_vld1q_bf16_x4(bfloat16_t const *ptr) {
231*207e5cccSFangrui Song   return vld1q_bf16_x4(ptr);
232*207e5cccSFangrui Song }
233*207e5cccSFangrui Song 
234*207e5cccSFangrui Song // CHECK-LABEL: @test_vld1q_dup_bf16(
235*207e5cccSFangrui Song // CHECK-NEXT:  entry:
236*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = load bfloat, ptr [[PTR:%.*]], align 2
237*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x bfloat> poison, bfloat [[TMP0]], i64 0
238*207e5cccSFangrui Song // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x bfloat> [[TMP1]], <8 x bfloat> poison, <8 x i32> zeroinitializer
239*207e5cccSFangrui Song // CHECK-NEXT:    ret <8 x bfloat> [[LANE]]
240*207e5cccSFangrui Song //
241*207e5cccSFangrui Song bfloat16x8_t test_vld1q_dup_bf16(bfloat16_t const *ptr) {
242*207e5cccSFangrui Song   return vld1q_dup_bf16(ptr);
243*207e5cccSFangrui Song }
244*207e5cccSFangrui Song 
245*207e5cccSFangrui Song // CHECK64-LABEL: @test_vld2_bf16(
246*207e5cccSFangrui Song // CHECK64-NEXT:  entry:
247*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD2:%.*]] = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2.v4bf16.p0(ptr [[PTR:%.*]])
248*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD2_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat> } [[VLD2]], 0
249*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD2_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat> } [[VLD2]], 1
250*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X2_T:%.*]] poison, <4 x bfloat> [[VLD2_FCA_0_EXTRACT]], 0, 0
251*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X2_T]] [[DOTFCA_0_0_INSERT]], <4 x bfloat> [[VLD2_FCA_1_EXTRACT]], 0, 1
252*207e5cccSFangrui Song // CHECK64-NEXT:    ret [[STRUCT_BFLOAT16X4X2_T]] [[DOTFCA_0_1_INSERT]]
253*207e5cccSFangrui Song //
254*207e5cccSFangrui Song // CHECK32-LABEL: @test_vld2_bf16(
255*207e5cccSFangrui Song // CHECK32-NEXT:  entry:
256*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD2_V:%.*]] = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2.v4bf16.p0(ptr [[PTR:%.*]], i32 2)
257*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD2_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat> } [[VLD2_V]], 0
258*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD2_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat> } [[VLD2_V]], 1
259*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP1:%.*]] = bitcast <4 x bfloat> [[VLD2_V_FCA_0_EXTRACT]] to <2 x i32>
260*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP2:%.*]] = bitcast <4 x bfloat> [[VLD2_V_FCA_1_EXTRACT]] to <2 x i32>
261*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_0_INSERT:%.*]] = insertvalue [2 x <2 x i32>] poison, <2 x i32> [[TMP1]], 0
262*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x <2 x i32>] [[DOTFCA_0_INSERT]], <2 x i32> [[TMP2]], 1
263*207e5cccSFangrui Song // CHECK32-NEXT:    ret [2 x <2 x i32>] [[DOTFCA_1_INSERT]]
264*207e5cccSFangrui Song //
265*207e5cccSFangrui Song bfloat16x4x2_t test_vld2_bf16(bfloat16_t const *ptr) {
266*207e5cccSFangrui Song   return vld2_bf16(ptr);
267*207e5cccSFangrui Song }
268*207e5cccSFangrui Song 
269*207e5cccSFangrui Song // CHECK64-LABEL: @test_vld2q_bf16(
270*207e5cccSFangrui Song // CHECK64-NEXT:  entry:
271*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD2:%.*]] = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2.v8bf16.p0(ptr [[PTR:%.*]])
272*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD2_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat> } [[VLD2]], 0
273*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD2_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat> } [[VLD2]], 1
274*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X2_T:%.*]] poison, <8 x bfloat> [[VLD2_FCA_0_EXTRACT]], 0, 0
275*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x bfloat> [[VLD2_FCA_1_EXTRACT]], 0, 1
276*207e5cccSFangrui Song // CHECK64-NEXT:    ret [[STRUCT_BFLOAT16X8X2_T]] [[DOTFCA_0_1_INSERT]]
277*207e5cccSFangrui Song //
278*207e5cccSFangrui Song // CHECK32-LABEL: @test_vld2q_bf16(
279*207e5cccSFangrui Song // CHECK32-NEXT:  entry:
280*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD2Q_V:%.*]] = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2.v8bf16.p0(ptr [[PTR:%.*]], i32 2)
281*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD2Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat> } [[VLD2Q_V]], 0
282*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD2Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat> } [[VLD2Q_V]], 1
283*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP1:%.*]] = bitcast <8 x bfloat> [[VLD2Q_V_FCA_0_EXTRACT]] to <4 x i32>
284*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP2:%.*]] = bitcast <8 x bfloat> [[VLD2Q_V_FCA_1_EXTRACT]] to <4 x i32>
285*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_0_INSERT:%.*]] = insertvalue [2 x <4 x i32>] poison, <4 x i32> [[TMP1]], 0
286*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x <4 x i32>] [[DOTFCA_0_INSERT]], <4 x i32> [[TMP2]], 1
287*207e5cccSFangrui Song // CHECK32-NEXT:    ret [2 x <4 x i32>] [[DOTFCA_1_INSERT]]
288*207e5cccSFangrui Song //
289*207e5cccSFangrui Song bfloat16x8x2_t test_vld2q_bf16(bfloat16_t const *ptr) {
290*207e5cccSFangrui Song   return vld2q_bf16(ptr);
291*207e5cccSFangrui Song }
292*207e5cccSFangrui Song 
293*207e5cccSFangrui Song // CHECK64-LABEL: @test_vld2_lane_bf16(
294*207e5cccSFangrui Song // CHECK64-NEXT:  entry:
295*207e5cccSFangrui Song // CHECK64-NEXT:    [[SRC_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x bfloat>] [[SRC_COERCE:%.*]], 0
296*207e5cccSFangrui Song // CHECK64-NEXT:    [[SRC_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x bfloat>] [[SRC_COERCE]], 1
297*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD2_LANE:%.*]] = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2lane.v4bf16.p0(<4 x bfloat> [[SRC_COERCE_FCA_0_EXTRACT]], <4 x bfloat> [[SRC_COERCE_FCA_1_EXTRACT]], i64 1, ptr [[PTR:%.*]])
298*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD2_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat> } [[VLD2_LANE]], 0
299*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD2_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat> } [[VLD2_LANE]], 1
300*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X2_T:%.*]] poison, <4 x bfloat> [[VLD2_LANE_FCA_0_EXTRACT]], 0, 0
301*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X2_T]] [[DOTFCA_0_0_INSERT]], <4 x bfloat> [[VLD2_LANE_FCA_1_EXTRACT]], 0, 1
302*207e5cccSFangrui Song // CHECK64-NEXT:    ret [[STRUCT_BFLOAT16X4X2_T]] [[DOTFCA_0_1_INSERT]]
303*207e5cccSFangrui Song //
304*207e5cccSFangrui Song // CHECK32-LABEL: @test_vld2_lane_bf16(
305*207e5cccSFangrui Song // CHECK32-NEXT:  entry:
306*207e5cccSFangrui Song // CHECK32-NEXT:    [[SRC_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <2 x i32>] [[SRC_COERCE:%.*]], 0
307*207e5cccSFangrui Song // CHECK32-NEXT:    [[SRC_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <2 x i32>] [[SRC_COERCE]], 1
308*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SRC_COERCE_FCA_0_EXTRACT]] to <4 x bfloat>
309*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SRC_COERCE_FCA_1_EXTRACT]] to <4 x bfloat>
310*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD2_LANE_V:%.*]] = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2lane.v4bf16.p0(ptr [[PTR:%.*]], <4 x bfloat> [[TMP0]], <4 x bfloat> [[TMP1]], i32 1, i32 2)
311*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD2_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat> } [[VLD2_LANE_V]], 0
312*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD2_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat> } [[VLD2_LANE_V]], 1
313*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP3:%.*]] = bitcast <4 x bfloat> [[VLD2_LANE_V_FCA_0_EXTRACT]] to <2 x i32>
314*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP4:%.*]] = bitcast <4 x bfloat> [[VLD2_LANE_V_FCA_1_EXTRACT]] to <2 x i32>
315*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_0_INSERT:%.*]] = insertvalue [2 x <2 x i32>] poison, <2 x i32> [[TMP3]], 0
316*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x <2 x i32>] [[DOTFCA_0_INSERT]], <2 x i32> [[TMP4]], 1
317*207e5cccSFangrui Song // CHECK32-NEXT:    ret [2 x <2 x i32>] [[DOTFCA_1_INSERT]]
318*207e5cccSFangrui Song //
319*207e5cccSFangrui Song bfloat16x4x2_t test_vld2_lane_bf16(bfloat16_t const *ptr, bfloat16x4x2_t src) {
320*207e5cccSFangrui Song   return vld2_lane_bf16(ptr, src, 1);
321*207e5cccSFangrui Song }
322*207e5cccSFangrui Song 
323*207e5cccSFangrui Song // CHECK64-LABEL: @test_vld2q_lane_bf16(
324*207e5cccSFangrui Song // CHECK64-NEXT:  entry:
325*207e5cccSFangrui Song // CHECK64-NEXT:    [[SRC_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x bfloat>] [[SRC_COERCE:%.*]], 0
326*207e5cccSFangrui Song // CHECK64-NEXT:    [[SRC_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x bfloat>] [[SRC_COERCE]], 1
327*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD2_LANE:%.*]] = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2lane.v8bf16.p0(<8 x bfloat> [[SRC_COERCE_FCA_0_EXTRACT]], <8 x bfloat> [[SRC_COERCE_FCA_1_EXTRACT]], i64 7, ptr [[PTR:%.*]])
328*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD2_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat> } [[VLD2_LANE]], 0
329*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD2_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat> } [[VLD2_LANE]], 1
330*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X2_T:%.*]] poison, <8 x bfloat> [[VLD2_LANE_FCA_0_EXTRACT]], 0, 0
331*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x bfloat> [[VLD2_LANE_FCA_1_EXTRACT]], 0, 1
332*207e5cccSFangrui Song // CHECK64-NEXT:    ret [[STRUCT_BFLOAT16X8X2_T]] [[DOTFCA_0_1_INSERT]]
333*207e5cccSFangrui Song //
334*207e5cccSFangrui Song // CHECK32-LABEL: @test_vld2q_lane_bf16(
335*207e5cccSFangrui Song // CHECK32-NEXT:  entry:
336*207e5cccSFangrui Song // CHECK32-NEXT:    [[SRC_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[SRC_COERCE:%.*]], 0
337*207e5cccSFangrui Song // CHECK32-NEXT:    [[SRC_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[SRC_COERCE]], 1
338*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[SRC_COERCE_FCA_0_EXTRACT]] to <8 x bfloat>
339*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[SRC_COERCE_FCA_1_EXTRACT]] to <8 x bfloat>
340*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD2Q_LANE_V:%.*]] = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2lane.v8bf16.p0(ptr [[PTR:%.*]], <8 x bfloat> [[TMP0]], <8 x bfloat> [[TMP1]], i32 7, i32 2)
341*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD2Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat> } [[VLD2Q_LANE_V]], 0
342*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD2Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat> } [[VLD2Q_LANE_V]], 1
343*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP3:%.*]] = bitcast <8 x bfloat> [[VLD2Q_LANE_V_FCA_0_EXTRACT]] to <4 x i32>
344*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP4:%.*]] = bitcast <8 x bfloat> [[VLD2Q_LANE_V_FCA_1_EXTRACT]] to <4 x i32>
345*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_0_INSERT:%.*]] = insertvalue [2 x <4 x i32>] poison, <4 x i32> [[TMP3]], 0
346*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x <4 x i32>] [[DOTFCA_0_INSERT]], <4 x i32> [[TMP4]], 1
347*207e5cccSFangrui Song // CHECK32-NEXT:    ret [2 x <4 x i32>] [[DOTFCA_1_INSERT]]
348*207e5cccSFangrui Song //
349*207e5cccSFangrui Song bfloat16x8x2_t test_vld2q_lane_bf16(bfloat16_t const *ptr, bfloat16x8x2_t src) {
350*207e5cccSFangrui Song   return vld2q_lane_bf16(ptr, src, 7);
351*207e5cccSFangrui Song }
352*207e5cccSFangrui Song 
353*207e5cccSFangrui Song // CHECK64-LABEL: @test_vld3_bf16(
354*207e5cccSFangrui Song // CHECK64-NEXT:  entry:
355*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD3:%.*]] = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3.v4bf16.p0(ptr [[PTR:%.*]])
356*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD3_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3]], 0
357*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD3_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3]], 1
358*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD3_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3]], 2
359*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X3_T:%.*]] poison, <4 x bfloat> [[VLD3_FCA_0_EXTRACT]], 0, 0
360*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X3_T]] [[DOTFCA_0_0_INSERT]], <4 x bfloat> [[VLD3_FCA_1_EXTRACT]], 0, 1
361*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X3_T]] [[DOTFCA_0_1_INSERT]], <4 x bfloat> [[VLD3_FCA_2_EXTRACT]], 0, 2
362*207e5cccSFangrui Song // CHECK64-NEXT:    ret [[STRUCT_BFLOAT16X4X3_T]] [[DOTFCA_0_2_INSERT]]
363*207e5cccSFangrui Song //
364*207e5cccSFangrui Song // CHECK32-LABEL: @test_vld3_bf16(
365*207e5cccSFangrui Song // CHECK32-NEXT:  entry:
366*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD3_V:%.*]] = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3.v4bf16.p0(ptr [[PTR:%.*]], i32 2)
367*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD3_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3_V]], 0
368*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD3_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3_V]], 1
369*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD3_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3_V]], 2
370*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP1:%.*]] = bitcast <4 x bfloat> [[VLD3_V_FCA_0_EXTRACT]] to <2 x i32>
371*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP2:%.*]] = bitcast <4 x bfloat> [[VLD3_V_FCA_1_EXTRACT]] to <2 x i32>
372*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP3:%.*]] = bitcast <4 x bfloat> [[VLD3_V_FCA_2_EXTRACT]] to <2 x i32>
373*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_0_INSERT:%.*]] = insertvalue [3 x <2 x i32>] poison, <2 x i32> [[TMP1]], 0
374*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_1_INSERT:%.*]] = insertvalue [3 x <2 x i32>] [[DOTFCA_0_INSERT]], <2 x i32> [[TMP2]], 1
375*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_2_INSERT:%.*]] = insertvalue [3 x <2 x i32>] [[DOTFCA_1_INSERT]], <2 x i32> [[TMP3]], 2
376*207e5cccSFangrui Song // CHECK32-NEXT:    ret [3 x <2 x i32>] [[DOTFCA_2_INSERT]]
377*207e5cccSFangrui Song //
378*207e5cccSFangrui Song bfloat16x4x3_t test_vld3_bf16(bfloat16_t const *ptr) {
379*207e5cccSFangrui Song   return vld3_bf16(ptr);
380*207e5cccSFangrui Song }
381*207e5cccSFangrui Song 
382*207e5cccSFangrui Song // CHECK64-LABEL: @test_vld3q_bf16(
383*207e5cccSFangrui Song // CHECK64-NEXT:  entry:
384*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD3:%.*]] = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3.v8bf16.p0(ptr [[PTR:%.*]])
385*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD3_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3]], 0
386*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD3_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3]], 1
387*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD3_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3]], 2
388*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X3_T:%.*]] poison, <8 x bfloat> [[VLD3_FCA_0_EXTRACT]], 0, 0
389*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X3_T]] [[DOTFCA_0_0_INSERT]], <8 x bfloat> [[VLD3_FCA_1_EXTRACT]], 0, 1
390*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X3_T]] [[DOTFCA_0_1_INSERT]], <8 x bfloat> [[VLD3_FCA_2_EXTRACT]], 0, 2
391*207e5cccSFangrui Song // CHECK64-NEXT:    ret [[STRUCT_BFLOAT16X8X3_T]] [[DOTFCA_0_2_INSERT]]
392*207e5cccSFangrui Song //
393*207e5cccSFangrui Song // CHECK32-LABEL: @test_vld3q_bf16(
394*207e5cccSFangrui Song // CHECK32-NEXT:  entry:
395*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD3Q_V:%.*]] = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3.v8bf16.p0(ptr [[PTR:%.*]], i32 2)
396*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD3Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3Q_V]], 0
397*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD3Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3Q_V]], 1
398*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD3Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3Q_V]], 2
399*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP1:%.*]] = bitcast <8 x bfloat> [[VLD3Q_V_FCA_0_EXTRACT]] to <4 x i32>
400*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP2:%.*]] = bitcast <8 x bfloat> [[VLD3Q_V_FCA_1_EXTRACT]] to <4 x i32>
401*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP3:%.*]] = bitcast <8 x bfloat> [[VLD3Q_V_FCA_2_EXTRACT]] to <4 x i32>
402*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_0_INSERT:%.*]] = insertvalue [3 x <4 x i32>] poison, <4 x i32> [[TMP1]], 0
403*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_1_INSERT:%.*]] = insertvalue [3 x <4 x i32>] [[DOTFCA_0_INSERT]], <4 x i32> [[TMP2]], 1
404*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_2_INSERT:%.*]] = insertvalue [3 x <4 x i32>] [[DOTFCA_1_INSERT]], <4 x i32> [[TMP3]], 2
405*207e5cccSFangrui Song // CHECK32-NEXT:    ret [3 x <4 x i32>] [[DOTFCA_2_INSERT]]
406*207e5cccSFangrui Song //
407*207e5cccSFangrui Song bfloat16x8x3_t test_vld3q_bf16(bfloat16_t const *ptr) {
408*207e5cccSFangrui Song   return vld3q_bf16(ptr);
409*207e5cccSFangrui Song }
410*207e5cccSFangrui Song 
411*207e5cccSFangrui Song // CHECK64-LABEL: @test_vld3_lane_bf16(
412*207e5cccSFangrui Song // CHECK64-NEXT:  entry:
413*207e5cccSFangrui Song // CHECK64-NEXT:    [[SRC_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <4 x bfloat>] [[SRC_COERCE:%.*]], 0
414*207e5cccSFangrui Song // CHECK64-NEXT:    [[SRC_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <4 x bfloat>] [[SRC_COERCE]], 1
415*207e5cccSFangrui Song // CHECK64-NEXT:    [[SRC_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <4 x bfloat>] [[SRC_COERCE]], 2
416*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD3_LANE:%.*]] = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3lane.v4bf16.p0(<4 x bfloat> [[SRC_COERCE_FCA_0_EXTRACT]], <4 x bfloat> [[SRC_COERCE_FCA_1_EXTRACT]], <4 x bfloat> [[SRC_COERCE_FCA_2_EXTRACT]], i64 1, ptr [[PTR:%.*]])
417*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD3_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3_LANE]], 0
418*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD3_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3_LANE]], 1
419*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD3_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3_LANE]], 2
420*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X3_T:%.*]] poison, <4 x bfloat> [[VLD3_LANE_FCA_0_EXTRACT]], 0, 0
421*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X3_T]] [[DOTFCA_0_0_INSERT]], <4 x bfloat> [[VLD3_LANE_FCA_1_EXTRACT]], 0, 1
422*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X3_T]] [[DOTFCA_0_1_INSERT]], <4 x bfloat> [[VLD3_LANE_FCA_2_EXTRACT]], 0, 2
423*207e5cccSFangrui Song // CHECK64-NEXT:    ret [[STRUCT_BFLOAT16X4X3_T]] [[DOTFCA_0_2_INSERT]]
424*207e5cccSFangrui Song //
425*207e5cccSFangrui Song // CHECK32-LABEL: @test_vld3_lane_bf16(
426*207e5cccSFangrui Song // CHECK32-NEXT:  entry:
427*207e5cccSFangrui Song // CHECK32-NEXT:    [[SRC_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[SRC_COERCE:%.*]], 0
428*207e5cccSFangrui Song // CHECK32-NEXT:    [[SRC_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[SRC_COERCE]], 1
429*207e5cccSFangrui Song // CHECK32-NEXT:    [[SRC_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[SRC_COERCE]], 2
430*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SRC_COERCE_FCA_0_EXTRACT]] to <4 x bfloat>
431*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SRC_COERCE_FCA_1_EXTRACT]] to <4 x bfloat>
432*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SRC_COERCE_FCA_2_EXTRACT]] to <4 x bfloat>
433*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD3_LANE_V:%.*]] = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3lane.v4bf16.p0(ptr [[PTR:%.*]], <4 x bfloat> [[TMP0]], <4 x bfloat> [[TMP1]], <4 x bfloat> [[TMP2]], i32 1, i32 2)
434*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD3_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3_LANE_V]], 0
435*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD3_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3_LANE_V]], 1
436*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD3_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3_LANE_V]], 2
437*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP4:%.*]] = bitcast <4 x bfloat> [[VLD3_LANE_V_FCA_0_EXTRACT]] to <2 x i32>
438*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP5:%.*]] = bitcast <4 x bfloat> [[VLD3_LANE_V_FCA_1_EXTRACT]] to <2 x i32>
439*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP6:%.*]] = bitcast <4 x bfloat> [[VLD3_LANE_V_FCA_2_EXTRACT]] to <2 x i32>
440*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_0_INSERT:%.*]] = insertvalue [3 x <2 x i32>] poison, <2 x i32> [[TMP4]], 0
441*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_1_INSERT:%.*]] = insertvalue [3 x <2 x i32>] [[DOTFCA_0_INSERT]], <2 x i32> [[TMP5]], 1
442*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_2_INSERT:%.*]] = insertvalue [3 x <2 x i32>] [[DOTFCA_1_INSERT]], <2 x i32> [[TMP6]], 2
443*207e5cccSFangrui Song // CHECK32-NEXT:    ret [3 x <2 x i32>] [[DOTFCA_2_INSERT]]
444*207e5cccSFangrui Song //
445*207e5cccSFangrui Song bfloat16x4x3_t test_vld3_lane_bf16(bfloat16_t const *ptr, bfloat16x4x3_t src) {
446*207e5cccSFangrui Song   return vld3_lane_bf16(ptr, src, 1);
447*207e5cccSFangrui Song }
448*207e5cccSFangrui Song 
449*207e5cccSFangrui Song // CHECK64-LABEL: @test_vld3q_lane_bf16(
450*207e5cccSFangrui Song // CHECK64-NEXT:  entry:
451*207e5cccSFangrui Song // CHECK64-NEXT:    [[SRC_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x bfloat>] [[SRC_COERCE:%.*]], 0
452*207e5cccSFangrui Song // CHECK64-NEXT:    [[SRC_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x bfloat>] [[SRC_COERCE]], 1
453*207e5cccSFangrui Song // CHECK64-NEXT:    [[SRC_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x bfloat>] [[SRC_COERCE]], 2
454*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD3_LANE:%.*]] = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3lane.v8bf16.p0(<8 x bfloat> [[SRC_COERCE_FCA_0_EXTRACT]], <8 x bfloat> [[SRC_COERCE_FCA_1_EXTRACT]], <8 x bfloat> [[SRC_COERCE_FCA_2_EXTRACT]], i64 7, ptr [[PTR:%.*]])
455*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD3_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3_LANE]], 0
456*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD3_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3_LANE]], 1
457*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD3_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3_LANE]], 2
458*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X3_T:%.*]] poison, <8 x bfloat> [[VLD3_LANE_FCA_0_EXTRACT]], 0, 0
459*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X3_T]] [[DOTFCA_0_0_INSERT]], <8 x bfloat> [[VLD3_LANE_FCA_1_EXTRACT]], 0, 1
460*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X3_T]] [[DOTFCA_0_1_INSERT]], <8 x bfloat> [[VLD3_LANE_FCA_2_EXTRACT]], 0, 2
461*207e5cccSFangrui Song // CHECK64-NEXT:    ret [[STRUCT_BFLOAT16X8X3_T]] [[DOTFCA_0_2_INSERT]]
462*207e5cccSFangrui Song //
463*207e5cccSFangrui Song // CHECK32-LABEL: @test_vld3q_lane_bf16(
464*207e5cccSFangrui Song // CHECK32-NEXT:  entry:
465*207e5cccSFangrui Song // CHECK32-NEXT:    [[SRC_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[SRC_COERCE:%.*]], 0
466*207e5cccSFangrui Song // CHECK32-NEXT:    [[SRC_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[SRC_COERCE]], 1
467*207e5cccSFangrui Song // CHECK32-NEXT:    [[SRC_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[SRC_COERCE]], 2
468*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[SRC_COERCE_FCA_0_EXTRACT]] to <8 x bfloat>
469*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[SRC_COERCE_FCA_1_EXTRACT]] to <8 x bfloat>
470*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[SRC_COERCE_FCA_2_EXTRACT]] to <8 x bfloat>
471*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD3Q_LANE_V:%.*]] = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3lane.v8bf16.p0(ptr [[PTR:%.*]], <8 x bfloat> [[TMP0]], <8 x bfloat> [[TMP1]], <8 x bfloat> [[TMP2]], i32 7, i32 2)
472*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD3Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3Q_LANE_V]], 0
473*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD3Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3Q_LANE_V]], 1
474*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD3Q_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3Q_LANE_V]], 2
475*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP4:%.*]] = bitcast <8 x bfloat> [[VLD3Q_LANE_V_FCA_0_EXTRACT]] to <4 x i32>
476*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP5:%.*]] = bitcast <8 x bfloat> [[VLD3Q_LANE_V_FCA_1_EXTRACT]] to <4 x i32>
477*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP6:%.*]] = bitcast <8 x bfloat> [[VLD3Q_LANE_V_FCA_2_EXTRACT]] to <4 x i32>
478*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_0_INSERT:%.*]] = insertvalue [3 x <4 x i32>] poison, <4 x i32> [[TMP4]], 0
479*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_1_INSERT:%.*]] = insertvalue [3 x <4 x i32>] [[DOTFCA_0_INSERT]], <4 x i32> [[TMP5]], 1
480*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_2_INSERT:%.*]] = insertvalue [3 x <4 x i32>] [[DOTFCA_1_INSERT]], <4 x i32> [[TMP6]], 2
481*207e5cccSFangrui Song // CHECK32-NEXT:    ret [3 x <4 x i32>] [[DOTFCA_2_INSERT]]
482*207e5cccSFangrui Song //
483*207e5cccSFangrui Song bfloat16x8x3_t test_vld3q_lane_bf16(bfloat16_t const *ptr, bfloat16x8x3_t src) {
484*207e5cccSFangrui Song   return vld3q_lane_bf16(ptr, src, 7);
485*207e5cccSFangrui Song   // return vld3q_lane_bf16(ptr, src, 8);
486*207e5cccSFangrui Song }
487*207e5cccSFangrui Song 
488*207e5cccSFangrui Song // CHECK64-LABEL: @test_vld4_bf16(
489*207e5cccSFangrui Song // CHECK64-NEXT:  entry:
490*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD4:%.*]] = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4.v4bf16.p0(ptr [[PTR:%.*]])
491*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD4_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4]], 0
492*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD4_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4]], 1
493*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD4_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4]], 2
494*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD4_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4]], 3
495*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X4_T:%.*]] poison, <4 x bfloat> [[VLD4_FCA_0_EXTRACT]], 0, 0
496*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X4_T]] [[DOTFCA_0_0_INSERT]], <4 x bfloat> [[VLD4_FCA_1_EXTRACT]], 0, 1
497*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X4_T]] [[DOTFCA_0_1_INSERT]], <4 x bfloat> [[VLD4_FCA_2_EXTRACT]], 0, 2
498*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X4_T]] [[DOTFCA_0_2_INSERT]], <4 x bfloat> [[VLD4_FCA_3_EXTRACT]], 0, 3
499*207e5cccSFangrui Song // CHECK64-NEXT:    ret [[STRUCT_BFLOAT16X4X4_T]] [[DOTFCA_0_3_INSERT]]
500*207e5cccSFangrui Song //
501*207e5cccSFangrui Song // CHECK32-LABEL: @test_vld4_bf16(
502*207e5cccSFangrui Song // CHECK32-NEXT:  entry:
503*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD4_V:%.*]] = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4.v4bf16.p0(ptr [[PTR:%.*]], i32 2)
504*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD4_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4_V]], 0
505*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD4_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4_V]], 1
506*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD4_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4_V]], 2
507*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD4_V_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4_V]], 3
508*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP1:%.*]] = bitcast <4 x bfloat> [[VLD4_V_FCA_0_EXTRACT]] to <2 x i32>
509*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP2:%.*]] = bitcast <4 x bfloat> [[VLD4_V_FCA_1_EXTRACT]] to <2 x i32>
510*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP3:%.*]] = bitcast <4 x bfloat> [[VLD4_V_FCA_2_EXTRACT]] to <2 x i32>
511*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP4:%.*]] = bitcast <4 x bfloat> [[VLD4_V_FCA_3_EXTRACT]] to <2 x i32>
512*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x <2 x i32>] poison, <2 x i32> [[TMP1]], 0
513*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x <2 x i32>] [[DOTFCA_0_INSERT]], <2 x i32> [[TMP2]], 1
514*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x <2 x i32>] [[DOTFCA_1_INSERT]], <2 x i32> [[TMP3]], 2
515*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x <2 x i32>] [[DOTFCA_2_INSERT]], <2 x i32> [[TMP4]], 3
516*207e5cccSFangrui Song // CHECK32-NEXT:    ret [4 x <2 x i32>] [[DOTFCA_3_INSERT]]
517*207e5cccSFangrui Song //
518*207e5cccSFangrui Song bfloat16x4x4_t test_vld4_bf16(bfloat16_t const *ptr) {
519*207e5cccSFangrui Song   return vld4_bf16(ptr);
520*207e5cccSFangrui Song }
521*207e5cccSFangrui Song 
522*207e5cccSFangrui Song // CHECK64-LABEL: @test_vld4q_bf16(
523*207e5cccSFangrui Song // CHECK64-NEXT:  entry:
524*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD4:%.*]] = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4.v8bf16.p0(ptr [[PTR:%.*]])
525*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD4_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4]], 0
526*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD4_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4]], 1
527*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD4_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4]], 2
528*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD4_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4]], 3
529*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X4_T:%.*]] poison, <8 x bfloat> [[VLD4_FCA_0_EXTRACT]], 0, 0
530*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X4_T]] [[DOTFCA_0_0_INSERT]], <8 x bfloat> [[VLD4_FCA_1_EXTRACT]], 0, 1
531*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X4_T]] [[DOTFCA_0_1_INSERT]], <8 x bfloat> [[VLD4_FCA_2_EXTRACT]], 0, 2
532*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X4_T]] [[DOTFCA_0_2_INSERT]], <8 x bfloat> [[VLD4_FCA_3_EXTRACT]], 0, 3
533*207e5cccSFangrui Song // CHECK64-NEXT:    ret [[STRUCT_BFLOAT16X8X4_T]] [[DOTFCA_0_3_INSERT]]
534*207e5cccSFangrui Song //
535*207e5cccSFangrui Song // CHECK32-LABEL: @test_vld4q_bf16(
536*207e5cccSFangrui Song // CHECK32-NEXT:  entry:
537*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD4Q_V:%.*]] = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4.v8bf16.p0(ptr [[PTR:%.*]], i32 2)
538*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD4Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4Q_V]], 0
539*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD4Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4Q_V]], 1
540*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD4Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4Q_V]], 2
541*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD4Q_V_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4Q_V]], 3
542*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP1:%.*]] = bitcast <8 x bfloat> [[VLD4Q_V_FCA_0_EXTRACT]] to <4 x i32>
543*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP2:%.*]] = bitcast <8 x bfloat> [[VLD4Q_V_FCA_1_EXTRACT]] to <4 x i32>
544*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP3:%.*]] = bitcast <8 x bfloat> [[VLD4Q_V_FCA_2_EXTRACT]] to <4 x i32>
545*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP4:%.*]] = bitcast <8 x bfloat> [[VLD4Q_V_FCA_3_EXTRACT]] to <4 x i32>
546*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x <4 x i32>] poison, <4 x i32> [[TMP1]], 0
547*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x <4 x i32>] [[DOTFCA_0_INSERT]], <4 x i32> [[TMP2]], 1
548*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x <4 x i32>] [[DOTFCA_1_INSERT]], <4 x i32> [[TMP3]], 2
549*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x <4 x i32>] [[DOTFCA_2_INSERT]], <4 x i32> [[TMP4]], 3
550*207e5cccSFangrui Song // CHECK32-NEXT:    ret [4 x <4 x i32>] [[DOTFCA_3_INSERT]]
551*207e5cccSFangrui Song //
552*207e5cccSFangrui Song bfloat16x8x4_t test_vld4q_bf16(bfloat16_t const *ptr) {
553*207e5cccSFangrui Song   return vld4q_bf16(ptr);
554*207e5cccSFangrui Song }
555*207e5cccSFangrui Song 
556*207e5cccSFangrui Song // CHECK64-LABEL: @test_vld4_lane_bf16(
557*207e5cccSFangrui Song // CHECK64-NEXT:  entry:
558*207e5cccSFangrui Song // CHECK64-NEXT:    [[SRC_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <4 x bfloat>] [[SRC_COERCE:%.*]], 0
559*207e5cccSFangrui Song // CHECK64-NEXT:    [[SRC_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <4 x bfloat>] [[SRC_COERCE]], 1
560*207e5cccSFangrui Song // CHECK64-NEXT:    [[SRC_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <4 x bfloat>] [[SRC_COERCE]], 2
561*207e5cccSFangrui Song // CHECK64-NEXT:    [[SRC_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <4 x bfloat>] [[SRC_COERCE]], 3
562*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD4_LANE:%.*]] = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4lane.v4bf16.p0(<4 x bfloat> [[SRC_COERCE_FCA_0_EXTRACT]], <4 x bfloat> [[SRC_COERCE_FCA_1_EXTRACT]], <4 x bfloat> [[SRC_COERCE_FCA_2_EXTRACT]], <4 x bfloat> [[SRC_COERCE_FCA_3_EXTRACT]], i64 1, ptr [[PTR:%.*]])
563*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD4_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4_LANE]], 0
564*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD4_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4_LANE]], 1
565*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD4_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4_LANE]], 2
566*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD4_LANE_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4_LANE]], 3
567*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X4_T:%.*]] poison, <4 x bfloat> [[VLD4_LANE_FCA_0_EXTRACT]], 0, 0
568*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X4_T]] [[DOTFCA_0_0_INSERT]], <4 x bfloat> [[VLD4_LANE_FCA_1_EXTRACT]], 0, 1
569*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X4_T]] [[DOTFCA_0_1_INSERT]], <4 x bfloat> [[VLD4_LANE_FCA_2_EXTRACT]], 0, 2
570*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X4_T]] [[DOTFCA_0_2_INSERT]], <4 x bfloat> [[VLD4_LANE_FCA_3_EXTRACT]], 0, 3
571*207e5cccSFangrui Song // CHECK64-NEXT:    ret [[STRUCT_BFLOAT16X4X4_T]] [[DOTFCA_0_3_INSERT]]
572*207e5cccSFangrui Song //
573*207e5cccSFangrui Song // CHECK32-LABEL: @test_vld4_lane_bf16(
574*207e5cccSFangrui Song // CHECK32-NEXT:  entry:
575*207e5cccSFangrui Song // CHECK32-NEXT:    [[SRC_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[SRC_COERCE:%.*]], 0
576*207e5cccSFangrui Song // CHECK32-NEXT:    [[SRC_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[SRC_COERCE]], 1
577*207e5cccSFangrui Song // CHECK32-NEXT:    [[SRC_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[SRC_COERCE]], 2
578*207e5cccSFangrui Song // CHECK32-NEXT:    [[SRC_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[SRC_COERCE]], 3
579*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SRC_COERCE_FCA_0_EXTRACT]] to <4 x bfloat>
580*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SRC_COERCE_FCA_1_EXTRACT]] to <4 x bfloat>
581*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SRC_COERCE_FCA_2_EXTRACT]] to <4 x bfloat>
582*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[SRC_COERCE_FCA_3_EXTRACT]] to <4 x bfloat>
583*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD4_LANE_V:%.*]] = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4lane.v4bf16.p0(ptr [[PTR:%.*]], <4 x bfloat> [[TMP0]], <4 x bfloat> [[TMP1]], <4 x bfloat> [[TMP2]], <4 x bfloat> [[TMP3]], i32 1, i32 2)
584*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD4_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4_LANE_V]], 0
585*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD4_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4_LANE_V]], 1
586*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD4_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4_LANE_V]], 2
587*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD4_LANE_V_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4_LANE_V]], 3
588*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP5:%.*]] = bitcast <4 x bfloat> [[VLD4_LANE_V_FCA_0_EXTRACT]] to <2 x i32>
589*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP6:%.*]] = bitcast <4 x bfloat> [[VLD4_LANE_V_FCA_1_EXTRACT]] to <2 x i32>
590*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP7:%.*]] = bitcast <4 x bfloat> [[VLD4_LANE_V_FCA_2_EXTRACT]] to <2 x i32>
591*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP8:%.*]] = bitcast <4 x bfloat> [[VLD4_LANE_V_FCA_3_EXTRACT]] to <2 x i32>
592*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x <2 x i32>] poison, <2 x i32> [[TMP5]], 0
593*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x <2 x i32>] [[DOTFCA_0_INSERT]], <2 x i32> [[TMP6]], 1
594*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x <2 x i32>] [[DOTFCA_1_INSERT]], <2 x i32> [[TMP7]], 2
595*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x <2 x i32>] [[DOTFCA_2_INSERT]], <2 x i32> [[TMP8]], 3
596*207e5cccSFangrui Song // CHECK32-NEXT:    ret [4 x <2 x i32>] [[DOTFCA_3_INSERT]]
597*207e5cccSFangrui Song //
598*207e5cccSFangrui Song bfloat16x4x4_t test_vld4_lane_bf16(bfloat16_t const *ptr, bfloat16x4x4_t src) {
599*207e5cccSFangrui Song   return vld4_lane_bf16(ptr, src, 1);
600*207e5cccSFangrui Song }
601*207e5cccSFangrui Song 
602*207e5cccSFangrui Song // CHECK64-LABEL: @test_vld4q_lane_bf16(
603*207e5cccSFangrui Song // CHECK64-NEXT:  entry:
604*207e5cccSFangrui Song // CHECK64-NEXT:    [[SRC_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x bfloat>] [[SRC_COERCE:%.*]], 0
605*207e5cccSFangrui Song // CHECK64-NEXT:    [[SRC_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x bfloat>] [[SRC_COERCE]], 1
606*207e5cccSFangrui Song // CHECK64-NEXT:    [[SRC_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x bfloat>] [[SRC_COERCE]], 2
607*207e5cccSFangrui Song // CHECK64-NEXT:    [[SRC_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x bfloat>] [[SRC_COERCE]], 3
608*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD4_LANE:%.*]] = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4lane.v8bf16.p0(<8 x bfloat> [[SRC_COERCE_FCA_0_EXTRACT]], <8 x bfloat> [[SRC_COERCE_FCA_1_EXTRACT]], <8 x bfloat> [[SRC_COERCE_FCA_2_EXTRACT]], <8 x bfloat> [[SRC_COERCE_FCA_3_EXTRACT]], i64 7, ptr [[PTR:%.*]])
609*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD4_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4_LANE]], 0
610*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD4_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4_LANE]], 1
611*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD4_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4_LANE]], 2
612*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD4_LANE_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4_LANE]], 3
613*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X4_T:%.*]] poison, <8 x bfloat> [[VLD4_LANE_FCA_0_EXTRACT]], 0, 0
614*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X4_T]] [[DOTFCA_0_0_INSERT]], <8 x bfloat> [[VLD4_LANE_FCA_1_EXTRACT]], 0, 1
615*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X4_T]] [[DOTFCA_0_1_INSERT]], <8 x bfloat> [[VLD4_LANE_FCA_2_EXTRACT]], 0, 2
616*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X4_T]] [[DOTFCA_0_2_INSERT]], <8 x bfloat> [[VLD4_LANE_FCA_3_EXTRACT]], 0, 3
617*207e5cccSFangrui Song // CHECK64-NEXT:    ret [[STRUCT_BFLOAT16X8X4_T]] [[DOTFCA_0_3_INSERT]]
618*207e5cccSFangrui Song //
619*207e5cccSFangrui Song // CHECK32-LABEL: @test_vld4q_lane_bf16(
620*207e5cccSFangrui Song // CHECK32-NEXT:  entry:
621*207e5cccSFangrui Song // CHECK32-NEXT:    [[SRC_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[SRC_COERCE:%.*]], 0
622*207e5cccSFangrui Song // CHECK32-NEXT:    [[SRC_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[SRC_COERCE]], 1
623*207e5cccSFangrui Song // CHECK32-NEXT:    [[SRC_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[SRC_COERCE]], 2
624*207e5cccSFangrui Song // CHECK32-NEXT:    [[SRC_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[SRC_COERCE]], 3
625*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[SRC_COERCE_FCA_0_EXTRACT]] to <8 x bfloat>
626*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[SRC_COERCE_FCA_1_EXTRACT]] to <8 x bfloat>
627*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[SRC_COERCE_FCA_2_EXTRACT]] to <8 x bfloat>
628*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[SRC_COERCE_FCA_3_EXTRACT]] to <8 x bfloat>
629*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD4Q_LANE_V:%.*]] = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4lane.v8bf16.p0(ptr [[PTR:%.*]], <8 x bfloat> [[TMP0]], <8 x bfloat> [[TMP1]], <8 x bfloat> [[TMP2]], <8 x bfloat> [[TMP3]], i32 7, i32 2)
630*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD4Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4Q_LANE_V]], 0
631*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD4Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4Q_LANE_V]], 1
632*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD4Q_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4Q_LANE_V]], 2
633*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD4Q_LANE_V_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4Q_LANE_V]], 3
634*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP5:%.*]] = bitcast <8 x bfloat> [[VLD4Q_LANE_V_FCA_0_EXTRACT]] to <4 x i32>
635*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP6:%.*]] = bitcast <8 x bfloat> [[VLD4Q_LANE_V_FCA_1_EXTRACT]] to <4 x i32>
636*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP7:%.*]] = bitcast <8 x bfloat> [[VLD4Q_LANE_V_FCA_2_EXTRACT]] to <4 x i32>
637*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP8:%.*]] = bitcast <8 x bfloat> [[VLD4Q_LANE_V_FCA_3_EXTRACT]] to <4 x i32>
638*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x <4 x i32>] poison, <4 x i32> [[TMP5]], 0
639*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x <4 x i32>] [[DOTFCA_0_INSERT]], <4 x i32> [[TMP6]], 1
640*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x <4 x i32>] [[DOTFCA_1_INSERT]], <4 x i32> [[TMP7]], 2
641*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x <4 x i32>] [[DOTFCA_2_INSERT]], <4 x i32> [[TMP8]], 3
642*207e5cccSFangrui Song // CHECK32-NEXT:    ret [4 x <4 x i32>] [[DOTFCA_3_INSERT]]
643*207e5cccSFangrui Song //
644*207e5cccSFangrui Song bfloat16x8x4_t test_vld4q_lane_bf16(bfloat16_t const *ptr, bfloat16x8x4_t src) {
645*207e5cccSFangrui Song   return vld4q_lane_bf16(ptr, src, 7);
646*207e5cccSFangrui Song }
647*207e5cccSFangrui Song 
648*207e5cccSFangrui Song // CHECK64-LABEL: @test_vld2_dup_bf16(
649*207e5cccSFangrui Song // CHECK64-NEXT:  entry:
650*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD2:%.*]] = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2r.v4bf16.p0(ptr [[PTR:%.*]])
651*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD2_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat> } [[VLD2]], 0
652*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD2_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat> } [[VLD2]], 1
653*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X2_T:%.*]] poison, <4 x bfloat> [[VLD2_FCA_0_EXTRACT]], 0, 0
654*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X2_T]] [[DOTFCA_0_0_INSERT]], <4 x bfloat> [[VLD2_FCA_1_EXTRACT]], 0, 1
655*207e5cccSFangrui Song // CHECK64-NEXT:    ret [[STRUCT_BFLOAT16X4X2_T]] [[DOTFCA_0_1_INSERT]]
656*207e5cccSFangrui Song //
657*207e5cccSFangrui Song // CHECK32-LABEL: @test_vld2_dup_bf16(
658*207e5cccSFangrui Song // CHECK32-NEXT:  entry:
659*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD2_DUP_V:%.*]] = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2dup.v4bf16.p0(ptr [[PTR:%.*]], i32 2)
660*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD2_DUP_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat> } [[VLD2_DUP_V]], 0
661*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD2_DUP_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat> } [[VLD2_DUP_V]], 1
662*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP1:%.*]] = bitcast <4 x bfloat> [[VLD2_DUP_V_FCA_0_EXTRACT]] to <2 x i32>
663*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP2:%.*]] = bitcast <4 x bfloat> [[VLD2_DUP_V_FCA_1_EXTRACT]] to <2 x i32>
664*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_0_INSERT:%.*]] = insertvalue [2 x <2 x i32>] poison, <2 x i32> [[TMP1]], 0
665*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x <2 x i32>] [[DOTFCA_0_INSERT]], <2 x i32> [[TMP2]], 1
666*207e5cccSFangrui Song // CHECK32-NEXT:    ret [2 x <2 x i32>] [[DOTFCA_1_INSERT]]
667*207e5cccSFangrui Song //
668*207e5cccSFangrui Song bfloat16x4x2_t test_vld2_dup_bf16(bfloat16_t const *ptr) {
669*207e5cccSFangrui Song   return vld2_dup_bf16(ptr);
670*207e5cccSFangrui Song }
671*207e5cccSFangrui Song 
672*207e5cccSFangrui Song // CHECK64-LABEL: @test_vld2q_dup_bf16(
673*207e5cccSFangrui Song // CHECK64-NEXT:  entry:
674*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD2:%.*]] = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2r.v8bf16.p0(ptr [[PTR:%.*]])
675*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD2_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat> } [[VLD2]], 0
676*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD2_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat> } [[VLD2]], 1
677*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X2_T:%.*]] poison, <8 x bfloat> [[VLD2_FCA_0_EXTRACT]], 0, 0
678*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x bfloat> [[VLD2_FCA_1_EXTRACT]], 0, 1
679*207e5cccSFangrui Song // CHECK64-NEXT:    ret [[STRUCT_BFLOAT16X8X2_T]] [[DOTFCA_0_1_INSERT]]
680*207e5cccSFangrui Song //
681*207e5cccSFangrui Song // CHECK32-LABEL: @test_vld2q_dup_bf16(
682*207e5cccSFangrui Song // CHECK32-NEXT:  entry:
683*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD2Q_DUP_V:%.*]] = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2dup.v8bf16.p0(ptr [[PTR:%.*]], i32 2)
684*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD2Q_DUP_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat> } [[VLD2Q_DUP_V]], 0
685*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD2Q_DUP_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat> } [[VLD2Q_DUP_V]], 1
686*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP1:%.*]] = bitcast <8 x bfloat> [[VLD2Q_DUP_V_FCA_0_EXTRACT]] to <4 x i32>
687*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP2:%.*]] = bitcast <8 x bfloat> [[VLD2Q_DUP_V_FCA_1_EXTRACT]] to <4 x i32>
688*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_0_INSERT:%.*]] = insertvalue [2 x <4 x i32>] poison, <4 x i32> [[TMP1]], 0
689*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x <4 x i32>] [[DOTFCA_0_INSERT]], <4 x i32> [[TMP2]], 1
690*207e5cccSFangrui Song // CHECK32-NEXT:    ret [2 x <4 x i32>] [[DOTFCA_1_INSERT]]
691*207e5cccSFangrui Song //
692*207e5cccSFangrui Song bfloat16x8x2_t test_vld2q_dup_bf16(bfloat16_t const *ptr) {
693*207e5cccSFangrui Song   return vld2q_dup_bf16(ptr);
694*207e5cccSFangrui Song }
695*207e5cccSFangrui Song 
696*207e5cccSFangrui Song // CHECK64-LABEL: @test_vld3_dup_bf16(
697*207e5cccSFangrui Song // CHECK64-NEXT:  entry:
698*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD3:%.*]] = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3r.v4bf16.p0(ptr [[PTR:%.*]])
699*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD3_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3]], 0
700*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD3_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3]], 1
701*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD3_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3]], 2
702*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X3_T:%.*]] poison, <4 x bfloat> [[VLD3_FCA_0_EXTRACT]], 0, 0
703*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X3_T]] [[DOTFCA_0_0_INSERT]], <4 x bfloat> [[VLD3_FCA_1_EXTRACT]], 0, 1
704*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X3_T]] [[DOTFCA_0_1_INSERT]], <4 x bfloat> [[VLD3_FCA_2_EXTRACT]], 0, 2
705*207e5cccSFangrui Song // CHECK64-NEXT:    ret [[STRUCT_BFLOAT16X4X3_T]] [[DOTFCA_0_2_INSERT]]
706*207e5cccSFangrui Song //
707*207e5cccSFangrui Song // CHECK32-LABEL: @test_vld3_dup_bf16(
708*207e5cccSFangrui Song // CHECK32-NEXT:  entry:
709*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD3_DUP_V:%.*]] = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3dup.v4bf16.p0(ptr [[PTR:%.*]], i32 2)
710*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD3_DUP_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3_DUP_V]], 0
711*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD3_DUP_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3_DUP_V]], 1
712*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD3_DUP_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3_DUP_V]], 2
713*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP1:%.*]] = bitcast <4 x bfloat> [[VLD3_DUP_V_FCA_0_EXTRACT]] to <2 x i32>
714*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP2:%.*]] = bitcast <4 x bfloat> [[VLD3_DUP_V_FCA_1_EXTRACT]] to <2 x i32>
715*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP3:%.*]] = bitcast <4 x bfloat> [[VLD3_DUP_V_FCA_2_EXTRACT]] to <2 x i32>
716*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_0_INSERT:%.*]] = insertvalue [3 x <2 x i32>] poison, <2 x i32> [[TMP1]], 0
717*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_1_INSERT:%.*]] = insertvalue [3 x <2 x i32>] [[DOTFCA_0_INSERT]], <2 x i32> [[TMP2]], 1
718*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_2_INSERT:%.*]] = insertvalue [3 x <2 x i32>] [[DOTFCA_1_INSERT]], <2 x i32> [[TMP3]], 2
719*207e5cccSFangrui Song // CHECK32-NEXT:    ret [3 x <2 x i32>] [[DOTFCA_2_INSERT]]
720*207e5cccSFangrui Song //
721*207e5cccSFangrui Song bfloat16x4x3_t test_vld3_dup_bf16(bfloat16_t const *ptr) {
722*207e5cccSFangrui Song   return vld3_dup_bf16(ptr);
723*207e5cccSFangrui Song }
724*207e5cccSFangrui Song 
725*207e5cccSFangrui Song // CHECK64-LABEL: @test_vld3q_dup_bf16(
726*207e5cccSFangrui Song // CHECK64-NEXT:  entry:
727*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD3:%.*]] = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3r.v8bf16.p0(ptr [[PTR:%.*]])
728*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD3_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3]], 0
729*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD3_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3]], 1
730*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD3_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3]], 2
731*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X3_T:%.*]] poison, <8 x bfloat> [[VLD3_FCA_0_EXTRACT]], 0, 0
732*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X3_T]] [[DOTFCA_0_0_INSERT]], <8 x bfloat> [[VLD3_FCA_1_EXTRACT]], 0, 1
733*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X3_T]] [[DOTFCA_0_1_INSERT]], <8 x bfloat> [[VLD3_FCA_2_EXTRACT]], 0, 2
734*207e5cccSFangrui Song // CHECK64-NEXT:    ret [[STRUCT_BFLOAT16X8X3_T]] [[DOTFCA_0_2_INSERT]]
735*207e5cccSFangrui Song //
736*207e5cccSFangrui Song // CHECK32-LABEL: @test_vld3q_dup_bf16(
737*207e5cccSFangrui Song // CHECK32-NEXT:  entry:
738*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD3Q_DUP_V:%.*]] = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3dup.v8bf16.p0(ptr [[PTR:%.*]], i32 2)
739*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD3Q_DUP_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3Q_DUP_V]], 0
740*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD3Q_DUP_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3Q_DUP_V]], 1
741*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD3Q_DUP_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3Q_DUP_V]], 2
742*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP1:%.*]] = bitcast <8 x bfloat> [[VLD3Q_DUP_V_FCA_0_EXTRACT]] to <4 x i32>
743*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP2:%.*]] = bitcast <8 x bfloat> [[VLD3Q_DUP_V_FCA_1_EXTRACT]] to <4 x i32>
744*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP3:%.*]] = bitcast <8 x bfloat> [[VLD3Q_DUP_V_FCA_2_EXTRACT]] to <4 x i32>
745*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_0_INSERT:%.*]] = insertvalue [3 x <4 x i32>] poison, <4 x i32> [[TMP1]], 0
746*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_1_INSERT:%.*]] = insertvalue [3 x <4 x i32>] [[DOTFCA_0_INSERT]], <4 x i32> [[TMP2]], 1
747*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_2_INSERT:%.*]] = insertvalue [3 x <4 x i32>] [[DOTFCA_1_INSERT]], <4 x i32> [[TMP3]], 2
748*207e5cccSFangrui Song // CHECK32-NEXT:    ret [3 x <4 x i32>] [[DOTFCA_2_INSERT]]
749*207e5cccSFangrui Song //
750*207e5cccSFangrui Song bfloat16x8x3_t test_vld3q_dup_bf16(bfloat16_t const *ptr) {
751*207e5cccSFangrui Song   return vld3q_dup_bf16(ptr);
752*207e5cccSFangrui Song }
753*207e5cccSFangrui Song 
754*207e5cccSFangrui Song // CHECK64-LABEL: @test_vld4_dup_bf16(
755*207e5cccSFangrui Song // CHECK64-NEXT:  entry:
756*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD4:%.*]] = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4r.v4bf16.p0(ptr [[PTR:%.*]])
757*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD4_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4]], 0
758*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD4_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4]], 1
759*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD4_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4]], 2
760*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD4_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4]], 3
761*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X4_T:%.*]] poison, <4 x bfloat> [[VLD4_FCA_0_EXTRACT]], 0, 0
762*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X4_T]] [[DOTFCA_0_0_INSERT]], <4 x bfloat> [[VLD4_FCA_1_EXTRACT]], 0, 1
763*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X4_T]] [[DOTFCA_0_1_INSERT]], <4 x bfloat> [[VLD4_FCA_2_EXTRACT]], 0, 2
764*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X4_T]] [[DOTFCA_0_2_INSERT]], <4 x bfloat> [[VLD4_FCA_3_EXTRACT]], 0, 3
765*207e5cccSFangrui Song // CHECK64-NEXT:    ret [[STRUCT_BFLOAT16X4X4_T]] [[DOTFCA_0_3_INSERT]]
766*207e5cccSFangrui Song //
767*207e5cccSFangrui Song // CHECK32-LABEL: @test_vld4_dup_bf16(
768*207e5cccSFangrui Song // CHECK32-NEXT:  entry:
769*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD4_DUP_V:%.*]] = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4dup.v4bf16.p0(ptr [[PTR:%.*]], i32 2)
770*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD4_DUP_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4_DUP_V]], 0
771*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD4_DUP_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4_DUP_V]], 1
772*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD4_DUP_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4_DUP_V]], 2
773*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD4_DUP_V_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4_DUP_V]], 3
774*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP1:%.*]] = bitcast <4 x bfloat> [[VLD4_DUP_V_FCA_0_EXTRACT]] to <2 x i32>
775*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP2:%.*]] = bitcast <4 x bfloat> [[VLD4_DUP_V_FCA_1_EXTRACT]] to <2 x i32>
776*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP3:%.*]] = bitcast <4 x bfloat> [[VLD4_DUP_V_FCA_2_EXTRACT]] to <2 x i32>
777*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP4:%.*]] = bitcast <4 x bfloat> [[VLD4_DUP_V_FCA_3_EXTRACT]] to <2 x i32>
778*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x <2 x i32>] poison, <2 x i32> [[TMP1]], 0
779*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x <2 x i32>] [[DOTFCA_0_INSERT]], <2 x i32> [[TMP2]], 1
780*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x <2 x i32>] [[DOTFCA_1_INSERT]], <2 x i32> [[TMP3]], 2
781*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x <2 x i32>] [[DOTFCA_2_INSERT]], <2 x i32> [[TMP4]], 3
782*207e5cccSFangrui Song // CHECK32-NEXT:    ret [4 x <2 x i32>] [[DOTFCA_3_INSERT]]
783*207e5cccSFangrui Song //
784*207e5cccSFangrui Song bfloat16x4x4_t test_vld4_dup_bf16(bfloat16_t const *ptr) {
785*207e5cccSFangrui Song   return vld4_dup_bf16(ptr);
786*207e5cccSFangrui Song }
787*207e5cccSFangrui Song 
788*207e5cccSFangrui Song // CHECK64-LABEL: @test_vld4q_dup_bf16(
789*207e5cccSFangrui Song // CHECK64-NEXT:  entry:
790*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD4:%.*]] = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4r.v8bf16.p0(ptr [[PTR:%.*]])
791*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD4_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4]], 0
792*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD4_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4]], 1
793*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD4_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4]], 2
794*207e5cccSFangrui Song // CHECK64-NEXT:    [[VLD4_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4]], 3
795*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X4_T:%.*]] poison, <8 x bfloat> [[VLD4_FCA_0_EXTRACT]], 0, 0
796*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X4_T]] [[DOTFCA_0_0_INSERT]], <8 x bfloat> [[VLD4_FCA_1_EXTRACT]], 0, 1
797*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X4_T]] [[DOTFCA_0_1_INSERT]], <8 x bfloat> [[VLD4_FCA_2_EXTRACT]], 0, 2
798*207e5cccSFangrui Song // CHECK64-NEXT:    [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X4_T]] [[DOTFCA_0_2_INSERT]], <8 x bfloat> [[VLD4_FCA_3_EXTRACT]], 0, 3
799*207e5cccSFangrui Song // CHECK64-NEXT:    ret [[STRUCT_BFLOAT16X8X4_T]] [[DOTFCA_0_3_INSERT]]
800*207e5cccSFangrui Song //
801*207e5cccSFangrui Song // CHECK32-LABEL: @test_vld4q_dup_bf16(
802*207e5cccSFangrui Song // CHECK32-NEXT:  entry:
803*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD4Q_DUP_V:%.*]] = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4dup.v8bf16.p0(ptr [[PTR:%.*]], i32 2)
804*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD4Q_DUP_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4Q_DUP_V]], 0
805*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD4Q_DUP_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4Q_DUP_V]], 1
806*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD4Q_DUP_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4Q_DUP_V]], 2
807*207e5cccSFangrui Song // CHECK32-NEXT:    [[VLD4Q_DUP_V_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4Q_DUP_V]], 3
808*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP1:%.*]] = bitcast <8 x bfloat> [[VLD4Q_DUP_V_FCA_0_EXTRACT]] to <4 x i32>
809*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP2:%.*]] = bitcast <8 x bfloat> [[VLD4Q_DUP_V_FCA_1_EXTRACT]] to <4 x i32>
810*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP3:%.*]] = bitcast <8 x bfloat> [[VLD4Q_DUP_V_FCA_2_EXTRACT]] to <4 x i32>
811*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP4:%.*]] = bitcast <8 x bfloat> [[VLD4Q_DUP_V_FCA_3_EXTRACT]] to <4 x i32>
812*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x <4 x i32>] poison, <4 x i32> [[TMP1]], 0
813*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x <4 x i32>] [[DOTFCA_0_INSERT]], <4 x i32> [[TMP2]], 1
814*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x <4 x i32>] [[DOTFCA_1_INSERT]], <4 x i32> [[TMP3]], 2
815*207e5cccSFangrui Song // CHECK32-NEXT:    [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x <4 x i32>] [[DOTFCA_2_INSERT]], <4 x i32> [[TMP4]], 3
816*207e5cccSFangrui Song // CHECK32-NEXT:    ret [4 x <4 x i32>] [[DOTFCA_3_INSERT]]
817*207e5cccSFangrui Song //
818*207e5cccSFangrui Song bfloat16x8x4_t test_vld4q_dup_bf16(bfloat16_t const *ptr) {
819*207e5cccSFangrui Song   return vld4q_dup_bf16(ptr);
820*207e5cccSFangrui Song }
821*207e5cccSFangrui Song 
822*207e5cccSFangrui Song // CHECK64-LABEL: @test_vst1_bf16(
823*207e5cccSFangrui Song // CHECK64-NEXT:  entry:
824*207e5cccSFangrui Song // CHECK64-NEXT:    store <4 x bfloat> [[VAL:%.*]], ptr [[PTR:%.*]], align 2
825*207e5cccSFangrui Song // CHECK64-NEXT:    ret void
826*207e5cccSFangrui Song //
827*207e5cccSFangrui Song // CHECK32-LABEL: @test_vst1_bf16(
828*207e5cccSFangrui Song // CHECK32-NEXT:  entry:
829*207e5cccSFangrui Song // CHECK32-NEXT:    tail call void @llvm.arm.neon.vst1.p0.v4bf16(ptr [[PTR:%.*]], <4 x bfloat> [[VAL:%.*]], i32 2)
830*207e5cccSFangrui Song // CHECK32-NEXT:    ret void
831*207e5cccSFangrui Song //
832*207e5cccSFangrui Song void test_vst1_bf16(bfloat16_t *ptr, bfloat16x4_t val) {
833*207e5cccSFangrui Song   vst1_bf16(ptr, val);
834*207e5cccSFangrui Song }
835*207e5cccSFangrui Song 
836*207e5cccSFangrui Song // CHECK64-LABEL: @test_vst1q_bf16(
837*207e5cccSFangrui Song // CHECK64-NEXT:  entry:
838*207e5cccSFangrui Song // CHECK64-NEXT:    store <8 x bfloat> [[VAL:%.*]], ptr [[PTR:%.*]], align 2
839*207e5cccSFangrui Song // CHECK64-NEXT:    ret void
840*207e5cccSFangrui Song //
841*207e5cccSFangrui Song // CHECK32-LABEL: @test_vst1q_bf16(
842*207e5cccSFangrui Song // CHECK32-NEXT:  entry:
843*207e5cccSFangrui Song // CHECK32-NEXT:    tail call void @llvm.arm.neon.vst1.p0.v8bf16(ptr [[PTR:%.*]], <8 x bfloat> [[VAL:%.*]], i32 2)
844*207e5cccSFangrui Song // CHECK32-NEXT:    ret void
845*207e5cccSFangrui Song //
846*207e5cccSFangrui Song void test_vst1q_bf16(bfloat16_t *ptr, bfloat16x8_t val) {
847*207e5cccSFangrui Song   vst1q_bf16(ptr, val);
848*207e5cccSFangrui Song }
849*207e5cccSFangrui Song 
850*207e5cccSFangrui Song // CHECK-LABEL: @test_vst1_lane_bf16(
851*207e5cccSFangrui Song // CHECK-NEXT:  entry:
852*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = extractelement <4 x bfloat> [[VAL:%.*]], i64 1
853*207e5cccSFangrui Song // CHECK-NEXT:    store bfloat [[TMP0]], ptr [[PTR:%.*]], align 2
854*207e5cccSFangrui Song // CHECK-NEXT:    ret void
855*207e5cccSFangrui Song //
856*207e5cccSFangrui Song void test_vst1_lane_bf16(bfloat16_t *ptr, bfloat16x4_t val) {
857*207e5cccSFangrui Song   vst1_lane_bf16(ptr, val, 1);
858*207e5cccSFangrui Song }
859*207e5cccSFangrui Song 
860*207e5cccSFangrui Song // CHECK-LABEL: @test_vst1q_lane_bf16(
861*207e5cccSFangrui Song // CHECK-NEXT:  entry:
862*207e5cccSFangrui Song // CHECK-NEXT:    [[TMP0:%.*]] = extractelement <8 x bfloat> [[VAL:%.*]], i64 7
863*207e5cccSFangrui Song // CHECK-NEXT:    store bfloat [[TMP0]], ptr [[PTR:%.*]], align 2
864*207e5cccSFangrui Song // CHECK-NEXT:    ret void
865*207e5cccSFangrui Song //
866*207e5cccSFangrui Song void test_vst1q_lane_bf16(bfloat16_t *ptr, bfloat16x8_t val) {
867*207e5cccSFangrui Song   vst1q_lane_bf16(ptr, val, 7);
868*207e5cccSFangrui Song }
869*207e5cccSFangrui Song 
870*207e5cccSFangrui Song // CHECK64-LABEL: @test_vst1_bf16_x2(
871*207e5cccSFangrui Song // CHECK64-NEXT:  entry:
872*207e5cccSFangrui Song // CHECK64-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x bfloat>] [[VAL_COERCE:%.*]], 0
873*207e5cccSFangrui Song // CHECK64-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x bfloat>] [[VAL_COERCE]], 1
874*207e5cccSFangrui Song // CHECK64-NEXT:    tail call void @llvm.aarch64.neon.st1x2.v4bf16.p0(<4 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], ptr [[PTR:%.*]])
875*207e5cccSFangrui Song // CHECK64-NEXT:    ret void
876*207e5cccSFangrui Song //
877*207e5cccSFangrui Song // CHECK32-LABEL: @test_vst1_bf16_x2(
878*207e5cccSFangrui Song // CHECK32-NEXT:  entry:
879*207e5cccSFangrui Song // CHECK32-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <2 x i32>] [[VAL_COERCE:%.*]], 0
880*207e5cccSFangrui Song // CHECK32-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <2 x i32>] [[VAL_COERCE]], 1
881*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <4 x bfloat>
882*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <4 x bfloat>
883*207e5cccSFangrui Song // CHECK32-NEXT:    tail call void @llvm.arm.neon.vst1x2.p0.v4bf16(ptr [[PTR:%.*]], <4 x bfloat> [[TMP0]], <4 x bfloat> [[TMP1]])
884*207e5cccSFangrui Song // CHECK32-NEXT:    ret void
885*207e5cccSFangrui Song //
886*207e5cccSFangrui Song void test_vst1_bf16_x2(bfloat16_t *ptr, bfloat16x4x2_t val) {
887*207e5cccSFangrui Song   vst1_bf16_x2(ptr, val);
888*207e5cccSFangrui Song }
889*207e5cccSFangrui Song 
890*207e5cccSFangrui Song // CHECK64-LABEL: @test_vst1q_bf16_x2(
891*207e5cccSFangrui Song // CHECK64-NEXT:  entry:
892*207e5cccSFangrui Song // CHECK64-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x bfloat>] [[VAL_COERCE:%.*]], 0
893*207e5cccSFangrui Song // CHECK64-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x bfloat>] [[VAL_COERCE]], 1
894*207e5cccSFangrui Song // CHECK64-NEXT:    tail call void @llvm.aarch64.neon.st1x2.v8bf16.p0(<8 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], ptr [[PTR:%.*]])
895*207e5cccSFangrui Song // CHECK64-NEXT:    ret void
896*207e5cccSFangrui Song //
897*207e5cccSFangrui Song // CHECK32-LABEL: @test_vst1q_bf16_x2(
898*207e5cccSFangrui Song // CHECK32-NEXT:  entry:
899*207e5cccSFangrui Song // CHECK32-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[VAL_COERCE:%.*]], 0
900*207e5cccSFangrui Song // CHECK32-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[VAL_COERCE]], 1
901*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <8 x bfloat>
902*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <8 x bfloat>
903*207e5cccSFangrui Song // CHECK32-NEXT:    tail call void @llvm.arm.neon.vst1x2.p0.v8bf16(ptr [[PTR:%.*]], <8 x bfloat> [[TMP0]], <8 x bfloat> [[TMP1]])
904*207e5cccSFangrui Song // CHECK32-NEXT:    ret void
905*207e5cccSFangrui Song //
906*207e5cccSFangrui Song void test_vst1q_bf16_x2(bfloat16_t *ptr, bfloat16x8x2_t val) {
907*207e5cccSFangrui Song   vst1q_bf16_x2(ptr, val);
908*207e5cccSFangrui Song }
909*207e5cccSFangrui Song 
910*207e5cccSFangrui Song // CHECK64-LABEL: @test_vst1_bf16_x3(
911*207e5cccSFangrui Song // CHECK64-NEXT:  entry:
912*207e5cccSFangrui Song // CHECK64-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <4 x bfloat>] [[VAL_COERCE:%.*]], 0
913*207e5cccSFangrui Song // CHECK64-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <4 x bfloat>] [[VAL_COERCE]], 1
914*207e5cccSFangrui Song // CHECK64-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <4 x bfloat>] [[VAL_COERCE]], 2
915*207e5cccSFangrui Song // CHECK64-NEXT:    tail call void @llvm.aarch64.neon.st1x3.v4bf16.p0(<4 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_2_EXTRACT]], ptr [[PTR:%.*]])
916*207e5cccSFangrui Song // CHECK64-NEXT:    ret void
917*207e5cccSFangrui Song //
918*207e5cccSFangrui Song // CHECK32-LABEL: @test_vst1_bf16_x3(
919*207e5cccSFangrui Song // CHECK32-NEXT:  entry:
920*207e5cccSFangrui Song // CHECK32-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[VAL_COERCE:%.*]], 0
921*207e5cccSFangrui Song // CHECK32-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[VAL_COERCE]], 1
922*207e5cccSFangrui Song // CHECK32-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[VAL_COERCE]], 2
923*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <4 x bfloat>
924*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <4 x bfloat>
925*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_2_EXTRACT]] to <4 x bfloat>
926*207e5cccSFangrui Song // CHECK32-NEXT:    tail call void @llvm.arm.neon.vst1x3.p0.v4bf16(ptr [[PTR:%.*]], <4 x bfloat> [[TMP0]], <4 x bfloat> [[TMP1]], <4 x bfloat> [[TMP2]])
927*207e5cccSFangrui Song // CHECK32-NEXT:    ret void
928*207e5cccSFangrui Song //
929*207e5cccSFangrui Song void test_vst1_bf16_x3(bfloat16_t *ptr, bfloat16x4x3_t val) {
930*207e5cccSFangrui Song   vst1_bf16_x3(ptr, val);
931*207e5cccSFangrui Song }
932*207e5cccSFangrui Song 
933*207e5cccSFangrui Song // CHECK64-LABEL: @test_vst1q_bf16_x3(
934*207e5cccSFangrui Song // CHECK64-NEXT:  entry:
935*207e5cccSFangrui Song // CHECK64-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x bfloat>] [[VAL_COERCE:%.*]], 0
936*207e5cccSFangrui Song // CHECK64-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x bfloat>] [[VAL_COERCE]], 1
937*207e5cccSFangrui Song // CHECK64-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x bfloat>] [[VAL_COERCE]], 2
938*207e5cccSFangrui Song // CHECK64-NEXT:    tail call void @llvm.aarch64.neon.st1x3.v8bf16.p0(<8 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_2_EXTRACT]], ptr [[PTR:%.*]])
939*207e5cccSFangrui Song // CHECK64-NEXT:    ret void
940*207e5cccSFangrui Song //
941*207e5cccSFangrui Song // CHECK32-LABEL: @test_vst1q_bf16_x3(
942*207e5cccSFangrui Song // CHECK32-NEXT:  entry:
943*207e5cccSFangrui Song // CHECK32-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[VAL_COERCE:%.*]], 0
944*207e5cccSFangrui Song // CHECK32-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[VAL_COERCE]], 1
945*207e5cccSFangrui Song // CHECK32-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[VAL_COERCE]], 2
946*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <8 x bfloat>
947*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <8 x bfloat>
948*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_2_EXTRACT]] to <8 x bfloat>
949*207e5cccSFangrui Song // CHECK32-NEXT:    tail call void @llvm.arm.neon.vst1x3.p0.v8bf16(ptr [[PTR:%.*]], <8 x bfloat> [[TMP0]], <8 x bfloat> [[TMP1]], <8 x bfloat> [[TMP2]])
950*207e5cccSFangrui Song // CHECK32-NEXT:    ret void
951*207e5cccSFangrui Song //
952*207e5cccSFangrui Song void test_vst1q_bf16_x3(bfloat16_t *ptr, bfloat16x8x3_t val) {
953*207e5cccSFangrui Song   vst1q_bf16_x3(ptr, val);
954*207e5cccSFangrui Song }
955*207e5cccSFangrui Song 
956*207e5cccSFangrui Song // CHECK64-LABEL: @test_vst1_bf16_x4(
957*207e5cccSFangrui Song // CHECK64-NEXT:  entry:
958*207e5cccSFangrui Song // CHECK64-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <4 x bfloat>] [[VAL_COERCE:%.*]], 0
959*207e5cccSFangrui Song // CHECK64-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <4 x bfloat>] [[VAL_COERCE]], 1
960*207e5cccSFangrui Song // CHECK64-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <4 x bfloat>] [[VAL_COERCE]], 2
961*207e5cccSFangrui Song // CHECK64-NEXT:    [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <4 x bfloat>] [[VAL_COERCE]], 3
962*207e5cccSFangrui Song // CHECK64-NEXT:    tail call void @llvm.aarch64.neon.st1x4.v4bf16.p0(<4 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_2_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_3_EXTRACT]], ptr [[PTR:%.*]])
963*207e5cccSFangrui Song // CHECK64-NEXT:    ret void
964*207e5cccSFangrui Song //
965*207e5cccSFangrui Song // CHECK32-LABEL: @test_vst1_bf16_x4(
966*207e5cccSFangrui Song // CHECK32-NEXT:  entry:
967*207e5cccSFangrui Song // CHECK32-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[VAL_COERCE:%.*]], 0
968*207e5cccSFangrui Song // CHECK32-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[VAL_COERCE]], 1
969*207e5cccSFangrui Song // CHECK32-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[VAL_COERCE]], 2
970*207e5cccSFangrui Song // CHECK32-NEXT:    [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[VAL_COERCE]], 3
971*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <4 x bfloat>
972*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <4 x bfloat>
973*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_2_EXTRACT]] to <4 x bfloat>
974*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_3_EXTRACT]] to <4 x bfloat>
975*207e5cccSFangrui Song // CHECK32-NEXT:    tail call void @llvm.arm.neon.vst1x4.p0.v4bf16(ptr [[PTR:%.*]], <4 x bfloat> [[TMP0]], <4 x bfloat> [[TMP1]], <4 x bfloat> [[TMP2]], <4 x bfloat> [[TMP3]])
976*207e5cccSFangrui Song // CHECK32-NEXT:    ret void
977*207e5cccSFangrui Song //
978*207e5cccSFangrui Song void test_vst1_bf16_x4(bfloat16_t *ptr, bfloat16x4x4_t val) {
979*207e5cccSFangrui Song   vst1_bf16_x4(ptr, val);
980*207e5cccSFangrui Song }
981*207e5cccSFangrui Song 
982*207e5cccSFangrui Song // CHECK64-LABEL: @test_vst1q_bf16_x4(
983*207e5cccSFangrui Song // CHECK64-NEXT:  entry:
984*207e5cccSFangrui Song // CHECK64-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x bfloat>] [[VAL_COERCE:%.*]], 0
985*207e5cccSFangrui Song // CHECK64-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x bfloat>] [[VAL_COERCE]], 1
986*207e5cccSFangrui Song // CHECK64-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x bfloat>] [[VAL_COERCE]], 2
987*207e5cccSFangrui Song // CHECK64-NEXT:    [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x bfloat>] [[VAL_COERCE]], 3
988*207e5cccSFangrui Song // CHECK64-NEXT:    tail call void @llvm.aarch64.neon.st1x4.v8bf16.p0(<8 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_2_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_3_EXTRACT]], ptr [[PTR:%.*]])
989*207e5cccSFangrui Song // CHECK64-NEXT:    ret void
990*207e5cccSFangrui Song //
991*207e5cccSFangrui Song // CHECK32-LABEL: @test_vst1q_bf16_x4(
992*207e5cccSFangrui Song // CHECK32-NEXT:  entry:
993*207e5cccSFangrui Song // CHECK32-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[VAL_COERCE:%.*]], 0
994*207e5cccSFangrui Song // CHECK32-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[VAL_COERCE]], 1
995*207e5cccSFangrui Song // CHECK32-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[VAL_COERCE]], 2
996*207e5cccSFangrui Song // CHECK32-NEXT:    [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[VAL_COERCE]], 3
997*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <8 x bfloat>
998*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <8 x bfloat>
999*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_2_EXTRACT]] to <8 x bfloat>
1000*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_3_EXTRACT]] to <8 x bfloat>
1001*207e5cccSFangrui Song // CHECK32-NEXT:    tail call void @llvm.arm.neon.vst1x4.p0.v8bf16(ptr [[PTR:%.*]], <8 x bfloat> [[TMP0]], <8 x bfloat> [[TMP1]], <8 x bfloat> [[TMP2]], <8 x bfloat> [[TMP3]])
1002*207e5cccSFangrui Song // CHECK32-NEXT:    ret void
1003*207e5cccSFangrui Song //
1004*207e5cccSFangrui Song void test_vst1q_bf16_x4(bfloat16_t *ptr, bfloat16x8x4_t val) {
1005*207e5cccSFangrui Song   vst1q_bf16_x4(ptr, val);
1006*207e5cccSFangrui Song }
1007*207e5cccSFangrui Song 
1008*207e5cccSFangrui Song // CHECK64-LABEL: @test_vst2_bf16(
1009*207e5cccSFangrui Song // CHECK64-NEXT:  entry:
1010*207e5cccSFangrui Song // CHECK64-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x bfloat>] [[VAL_COERCE:%.*]], 0
1011*207e5cccSFangrui Song // CHECK64-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x bfloat>] [[VAL_COERCE]], 1
1012*207e5cccSFangrui Song // CHECK64-NEXT:    tail call void @llvm.aarch64.neon.st2.v4bf16.p0(<4 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], ptr [[PTR:%.*]])
1013*207e5cccSFangrui Song // CHECK64-NEXT:    ret void
1014*207e5cccSFangrui Song //
1015*207e5cccSFangrui Song // CHECK32-LABEL: @test_vst2_bf16(
1016*207e5cccSFangrui Song // CHECK32-NEXT:  entry:
1017*207e5cccSFangrui Song // CHECK32-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <2 x i32>] [[VAL_COERCE:%.*]], 0
1018*207e5cccSFangrui Song // CHECK32-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <2 x i32>] [[VAL_COERCE]], 1
1019*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <4 x bfloat>
1020*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <4 x bfloat>
1021*207e5cccSFangrui Song // CHECK32-NEXT:    tail call void @llvm.arm.neon.vst2.p0.v4bf16(ptr [[PTR:%.*]], <4 x bfloat> [[TMP0]], <4 x bfloat> [[TMP1]], i32 2)
1022*207e5cccSFangrui Song // CHECK32-NEXT:    ret void
1023*207e5cccSFangrui Song //
1024*207e5cccSFangrui Song void test_vst2_bf16(bfloat16_t *ptr, bfloat16x4x2_t val) {
1025*207e5cccSFangrui Song   vst2_bf16(ptr, val);
1026*207e5cccSFangrui Song }
1027*207e5cccSFangrui Song 
1028*207e5cccSFangrui Song // CHECK64-LABEL: @test_vst2q_bf16(
1029*207e5cccSFangrui Song // CHECK64-NEXT:  entry:
1030*207e5cccSFangrui Song // CHECK64-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x bfloat>] [[VAL_COERCE:%.*]], 0
1031*207e5cccSFangrui Song // CHECK64-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x bfloat>] [[VAL_COERCE]], 1
1032*207e5cccSFangrui Song // CHECK64-NEXT:    tail call void @llvm.aarch64.neon.st2.v8bf16.p0(<8 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], ptr [[PTR:%.*]])
1033*207e5cccSFangrui Song // CHECK64-NEXT:    ret void
1034*207e5cccSFangrui Song //
1035*207e5cccSFangrui Song // CHECK32-LABEL: @test_vst2q_bf16(
1036*207e5cccSFangrui Song // CHECK32-NEXT:  entry:
1037*207e5cccSFangrui Song // CHECK32-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[VAL_COERCE:%.*]], 0
1038*207e5cccSFangrui Song // CHECK32-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[VAL_COERCE]], 1
1039*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <8 x bfloat>
1040*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <8 x bfloat>
1041*207e5cccSFangrui Song // CHECK32-NEXT:    tail call void @llvm.arm.neon.vst2.p0.v8bf16(ptr [[PTR:%.*]], <8 x bfloat> [[TMP0]], <8 x bfloat> [[TMP1]], i32 2)
1042*207e5cccSFangrui Song // CHECK32-NEXT:    ret void
1043*207e5cccSFangrui Song //
1044*207e5cccSFangrui Song void test_vst2q_bf16(bfloat16_t *ptr, bfloat16x8x2_t val) {
1045*207e5cccSFangrui Song   vst2q_bf16(ptr, val);
1046*207e5cccSFangrui Song }
1047*207e5cccSFangrui Song 
1048*207e5cccSFangrui Song // CHECK64-LABEL: @test_vst2_lane_bf16(
1049*207e5cccSFangrui Song // CHECK64-NEXT:  entry:
1050*207e5cccSFangrui Song // CHECK64-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x bfloat>] [[VAL_COERCE:%.*]], 0
1051*207e5cccSFangrui Song // CHECK64-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x bfloat>] [[VAL_COERCE]], 1
1052*207e5cccSFangrui Song // CHECK64-NEXT:    tail call void @llvm.aarch64.neon.st2lane.v4bf16.p0(<4 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], i64 1, ptr [[PTR:%.*]])
1053*207e5cccSFangrui Song // CHECK64-NEXT:    ret void
1054*207e5cccSFangrui Song //
1055*207e5cccSFangrui Song // CHECK32-LABEL: @test_vst2_lane_bf16(
1056*207e5cccSFangrui Song // CHECK32-NEXT:  entry:
1057*207e5cccSFangrui Song // CHECK32-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <2 x i32>] [[VAL_COERCE:%.*]], 0
1058*207e5cccSFangrui Song // CHECK32-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <2 x i32>] [[VAL_COERCE]], 1
1059*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <4 x bfloat>
1060*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <4 x bfloat>
1061*207e5cccSFangrui Song // CHECK32-NEXT:    tail call void @llvm.arm.neon.vst2lane.p0.v4bf16(ptr [[PTR:%.*]], <4 x bfloat> [[TMP0]], <4 x bfloat> [[TMP1]], i32 1, i32 2)
1062*207e5cccSFangrui Song // CHECK32-NEXT:    ret void
1063*207e5cccSFangrui Song //
1064*207e5cccSFangrui Song void test_vst2_lane_bf16(bfloat16_t *ptr, bfloat16x4x2_t val) {
1065*207e5cccSFangrui Song   vst2_lane_bf16(ptr, val, 1);
1066*207e5cccSFangrui Song }
1067*207e5cccSFangrui Song 
1068*207e5cccSFangrui Song // CHECK64-LABEL: @test_vst2q_lane_bf16(
1069*207e5cccSFangrui Song // CHECK64-NEXT:  entry:
1070*207e5cccSFangrui Song // CHECK64-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x bfloat>] [[VAL_COERCE:%.*]], 0
1071*207e5cccSFangrui Song // CHECK64-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x bfloat>] [[VAL_COERCE]], 1
1072*207e5cccSFangrui Song // CHECK64-NEXT:    tail call void @llvm.aarch64.neon.st2lane.v8bf16.p0(<8 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], i64 7, ptr [[PTR:%.*]])
1073*207e5cccSFangrui Song // CHECK64-NEXT:    ret void
1074*207e5cccSFangrui Song //
1075*207e5cccSFangrui Song // CHECK32-LABEL: @test_vst2q_lane_bf16(
1076*207e5cccSFangrui Song // CHECK32-NEXT:  entry:
1077*207e5cccSFangrui Song // CHECK32-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[VAL_COERCE:%.*]], 0
1078*207e5cccSFangrui Song // CHECK32-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[VAL_COERCE]], 1
1079*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <8 x bfloat>
1080*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <8 x bfloat>
1081*207e5cccSFangrui Song // CHECK32-NEXT:    tail call void @llvm.arm.neon.vst2lane.p0.v8bf16(ptr [[PTR:%.*]], <8 x bfloat> [[TMP0]], <8 x bfloat> [[TMP1]], i32 7, i32 2)
1082*207e5cccSFangrui Song // CHECK32-NEXT:    ret void
1083*207e5cccSFangrui Song //
1084*207e5cccSFangrui Song void test_vst2q_lane_bf16(bfloat16_t *ptr, bfloat16x8x2_t val) {
1085*207e5cccSFangrui Song   vst2q_lane_bf16(ptr, val, 7);
1086*207e5cccSFangrui Song }
1087*207e5cccSFangrui Song 
1088*207e5cccSFangrui Song // CHECK64-LABEL: @test_vst3_bf16(
1089*207e5cccSFangrui Song // CHECK64-NEXT:  entry:
1090*207e5cccSFangrui Song // CHECK64-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <4 x bfloat>] [[VAL_COERCE:%.*]], 0
1091*207e5cccSFangrui Song // CHECK64-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <4 x bfloat>] [[VAL_COERCE]], 1
1092*207e5cccSFangrui Song // CHECK64-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <4 x bfloat>] [[VAL_COERCE]], 2
1093*207e5cccSFangrui Song // CHECK64-NEXT:    tail call void @llvm.aarch64.neon.st3.v4bf16.p0(<4 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_2_EXTRACT]], ptr [[PTR:%.*]])
1094*207e5cccSFangrui Song // CHECK64-NEXT:    ret void
1095*207e5cccSFangrui Song //
1096*207e5cccSFangrui Song // CHECK32-LABEL: @test_vst3_bf16(
1097*207e5cccSFangrui Song // CHECK32-NEXT:  entry:
1098*207e5cccSFangrui Song // CHECK32-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[VAL_COERCE:%.*]], 0
1099*207e5cccSFangrui Song // CHECK32-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[VAL_COERCE]], 1
1100*207e5cccSFangrui Song // CHECK32-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[VAL_COERCE]], 2
1101*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <4 x bfloat>
1102*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <4 x bfloat>
1103*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_2_EXTRACT]] to <4 x bfloat>
1104*207e5cccSFangrui Song // CHECK32-NEXT:    tail call void @llvm.arm.neon.vst3.p0.v4bf16(ptr [[PTR:%.*]], <4 x bfloat> [[TMP0]], <4 x bfloat> [[TMP1]], <4 x bfloat> [[TMP2]], i32 2)
1105*207e5cccSFangrui Song // CHECK32-NEXT:    ret void
1106*207e5cccSFangrui Song //
1107*207e5cccSFangrui Song void test_vst3_bf16(bfloat16_t *ptr, bfloat16x4x3_t val) {
1108*207e5cccSFangrui Song   vst3_bf16(ptr, val);
1109*207e5cccSFangrui Song }
1110*207e5cccSFangrui Song 
1111*207e5cccSFangrui Song // CHECK64-LABEL: @test_vst3q_bf16(
1112*207e5cccSFangrui Song // CHECK64-NEXT:  entry:
1113*207e5cccSFangrui Song // CHECK64-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x bfloat>] [[VAL_COERCE:%.*]], 0
1114*207e5cccSFangrui Song // CHECK64-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x bfloat>] [[VAL_COERCE]], 1
1115*207e5cccSFangrui Song // CHECK64-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x bfloat>] [[VAL_COERCE]], 2
1116*207e5cccSFangrui Song // CHECK64-NEXT:    tail call void @llvm.aarch64.neon.st3.v8bf16.p0(<8 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_2_EXTRACT]], ptr [[PTR:%.*]])
1117*207e5cccSFangrui Song // CHECK64-NEXT:    ret void
1118*207e5cccSFangrui Song //
1119*207e5cccSFangrui Song // CHECK32-LABEL: @test_vst3q_bf16(
1120*207e5cccSFangrui Song // CHECK32-NEXT:  entry:
1121*207e5cccSFangrui Song // CHECK32-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[VAL_COERCE:%.*]], 0
1122*207e5cccSFangrui Song // CHECK32-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[VAL_COERCE]], 1
1123*207e5cccSFangrui Song // CHECK32-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[VAL_COERCE]], 2
1124*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <8 x bfloat>
1125*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <8 x bfloat>
1126*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_2_EXTRACT]] to <8 x bfloat>
1127*207e5cccSFangrui Song // CHECK32-NEXT:    tail call void @llvm.arm.neon.vst3.p0.v8bf16(ptr [[PTR:%.*]], <8 x bfloat> [[TMP0]], <8 x bfloat> [[TMP1]], <8 x bfloat> [[TMP2]], i32 2)
1128*207e5cccSFangrui Song // CHECK32-NEXT:    ret void
1129*207e5cccSFangrui Song //
1130*207e5cccSFangrui Song void test_vst3q_bf16(bfloat16_t *ptr, bfloat16x8x3_t val) {
1131*207e5cccSFangrui Song   vst3q_bf16(ptr, val);
1132*207e5cccSFangrui Song }
1133*207e5cccSFangrui Song 
1134*207e5cccSFangrui Song // CHECK64-LABEL: @test_vst3_lane_bf16(
1135*207e5cccSFangrui Song // CHECK64-NEXT:  entry:
1136*207e5cccSFangrui Song // CHECK64-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <4 x bfloat>] [[VAL_COERCE:%.*]], 0
1137*207e5cccSFangrui Song // CHECK64-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <4 x bfloat>] [[VAL_COERCE]], 1
1138*207e5cccSFangrui Song // CHECK64-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <4 x bfloat>] [[VAL_COERCE]], 2
1139*207e5cccSFangrui Song // CHECK64-NEXT:    tail call void @llvm.aarch64.neon.st3lane.v4bf16.p0(<4 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_2_EXTRACT]], i64 1, ptr [[PTR:%.*]])
1140*207e5cccSFangrui Song // CHECK64-NEXT:    ret void
1141*207e5cccSFangrui Song //
1142*207e5cccSFangrui Song // CHECK32-LABEL: @test_vst3_lane_bf16(
1143*207e5cccSFangrui Song // CHECK32-NEXT:  entry:
1144*207e5cccSFangrui Song // CHECK32-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[VAL_COERCE:%.*]], 0
1145*207e5cccSFangrui Song // CHECK32-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[VAL_COERCE]], 1
1146*207e5cccSFangrui Song // CHECK32-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[VAL_COERCE]], 2
1147*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <4 x bfloat>
1148*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <4 x bfloat>
1149*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_2_EXTRACT]] to <4 x bfloat>
1150*207e5cccSFangrui Song // CHECK32-NEXT:    tail call void @llvm.arm.neon.vst3lane.p0.v4bf16(ptr [[PTR:%.*]], <4 x bfloat> [[TMP0]], <4 x bfloat> [[TMP1]], <4 x bfloat> [[TMP2]], i32 1, i32 2)
1151*207e5cccSFangrui Song // CHECK32-NEXT:    ret void
1152*207e5cccSFangrui Song //
1153*207e5cccSFangrui Song void test_vst3_lane_bf16(bfloat16_t *ptr, bfloat16x4x3_t val) {
1154*207e5cccSFangrui Song   vst3_lane_bf16(ptr, val, 1);
1155*207e5cccSFangrui Song }
1156*207e5cccSFangrui Song 
1157*207e5cccSFangrui Song // CHECK64-LABEL: @test_vst3q_lane_bf16(
1158*207e5cccSFangrui Song // CHECK64-NEXT:  entry:
1159*207e5cccSFangrui Song // CHECK64-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x bfloat>] [[VAL_COERCE:%.*]], 0
1160*207e5cccSFangrui Song // CHECK64-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x bfloat>] [[VAL_COERCE]], 1
1161*207e5cccSFangrui Song // CHECK64-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x bfloat>] [[VAL_COERCE]], 2
1162*207e5cccSFangrui Song // CHECK64-NEXT:    tail call void @llvm.aarch64.neon.st3lane.v8bf16.p0(<8 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_2_EXTRACT]], i64 7, ptr [[PTR:%.*]])
1163*207e5cccSFangrui Song // CHECK64-NEXT:    ret void
1164*207e5cccSFangrui Song //
1165*207e5cccSFangrui Song // CHECK32-LABEL: @test_vst3q_lane_bf16(
1166*207e5cccSFangrui Song // CHECK32-NEXT:  entry:
1167*207e5cccSFangrui Song // CHECK32-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[VAL_COERCE:%.*]], 0
1168*207e5cccSFangrui Song // CHECK32-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[VAL_COERCE]], 1
1169*207e5cccSFangrui Song // CHECK32-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[VAL_COERCE]], 2
1170*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <8 x bfloat>
1171*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <8 x bfloat>
1172*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_2_EXTRACT]] to <8 x bfloat>
1173*207e5cccSFangrui Song // CHECK32-NEXT:    tail call void @llvm.arm.neon.vst3lane.p0.v8bf16(ptr [[PTR:%.*]], <8 x bfloat> [[TMP0]], <8 x bfloat> [[TMP1]], <8 x bfloat> [[TMP2]], i32 7, i32 2)
1174*207e5cccSFangrui Song // CHECK32-NEXT:    ret void
1175*207e5cccSFangrui Song //
1176*207e5cccSFangrui Song void test_vst3q_lane_bf16(bfloat16_t *ptr, bfloat16x8x3_t val) {
1177*207e5cccSFangrui Song   vst3q_lane_bf16(ptr, val, 7);
1178*207e5cccSFangrui Song }
1179*207e5cccSFangrui Song 
1180*207e5cccSFangrui Song // CHECK64-LABEL: @test_vst4_bf16(
1181*207e5cccSFangrui Song // CHECK64-NEXT:  entry:
1182*207e5cccSFangrui Song // CHECK64-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <4 x bfloat>] [[VAL_COERCE:%.*]], 0
1183*207e5cccSFangrui Song // CHECK64-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <4 x bfloat>] [[VAL_COERCE]], 1
1184*207e5cccSFangrui Song // CHECK64-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <4 x bfloat>] [[VAL_COERCE]], 2
1185*207e5cccSFangrui Song // CHECK64-NEXT:    [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <4 x bfloat>] [[VAL_COERCE]], 3
1186*207e5cccSFangrui Song // CHECK64-NEXT:    tail call void @llvm.aarch64.neon.st4.v4bf16.p0(<4 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_2_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_3_EXTRACT]], ptr [[PTR:%.*]])
1187*207e5cccSFangrui Song // CHECK64-NEXT:    ret void
1188*207e5cccSFangrui Song //
1189*207e5cccSFangrui Song // CHECK32-LABEL: @test_vst4_bf16(
1190*207e5cccSFangrui Song // CHECK32-NEXT:  entry:
1191*207e5cccSFangrui Song // CHECK32-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[VAL_COERCE:%.*]], 0
1192*207e5cccSFangrui Song // CHECK32-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[VAL_COERCE]], 1
1193*207e5cccSFangrui Song // CHECK32-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[VAL_COERCE]], 2
1194*207e5cccSFangrui Song // CHECK32-NEXT:    [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[VAL_COERCE]], 3
1195*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <4 x bfloat>
1196*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <4 x bfloat>
1197*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_2_EXTRACT]] to <4 x bfloat>
1198*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_3_EXTRACT]] to <4 x bfloat>
1199*207e5cccSFangrui Song // CHECK32-NEXT:    tail call void @llvm.arm.neon.vst4.p0.v4bf16(ptr [[PTR:%.*]], <4 x bfloat> [[TMP0]], <4 x bfloat> [[TMP1]], <4 x bfloat> [[TMP2]], <4 x bfloat> [[TMP3]], i32 2)
1200*207e5cccSFangrui Song // CHECK32-NEXT:    ret void
1201*207e5cccSFangrui Song //
1202*207e5cccSFangrui Song void test_vst4_bf16(bfloat16_t *ptr, bfloat16x4x4_t val) {
1203*207e5cccSFangrui Song   vst4_bf16(ptr, val);
1204*207e5cccSFangrui Song }
1205*207e5cccSFangrui Song 
1206*207e5cccSFangrui Song // CHECK64-LABEL: @test_vst4q_bf16(
1207*207e5cccSFangrui Song // CHECK64-NEXT:  entry:
1208*207e5cccSFangrui Song // CHECK64-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x bfloat>] [[VAL_COERCE:%.*]], 0
1209*207e5cccSFangrui Song // CHECK64-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x bfloat>] [[VAL_COERCE]], 1
1210*207e5cccSFangrui Song // CHECK64-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x bfloat>] [[VAL_COERCE]], 2
1211*207e5cccSFangrui Song // CHECK64-NEXT:    [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x bfloat>] [[VAL_COERCE]], 3
1212*207e5cccSFangrui Song // CHECK64-NEXT:    tail call void @llvm.aarch64.neon.st4.v8bf16.p0(<8 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_2_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_3_EXTRACT]], ptr [[PTR:%.*]])
1213*207e5cccSFangrui Song // CHECK64-NEXT:    ret void
1214*207e5cccSFangrui Song //
1215*207e5cccSFangrui Song // CHECK32-LABEL: @test_vst4q_bf16(
1216*207e5cccSFangrui Song // CHECK32-NEXT:  entry:
1217*207e5cccSFangrui Song // CHECK32-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[VAL_COERCE:%.*]], 0
1218*207e5cccSFangrui Song // CHECK32-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[VAL_COERCE]], 1
1219*207e5cccSFangrui Song // CHECK32-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[VAL_COERCE]], 2
1220*207e5cccSFangrui Song // CHECK32-NEXT:    [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[VAL_COERCE]], 3
1221*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <8 x bfloat>
1222*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <8 x bfloat>
1223*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_2_EXTRACT]] to <8 x bfloat>
1224*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_3_EXTRACT]] to <8 x bfloat>
1225*207e5cccSFangrui Song // CHECK32-NEXT:    tail call void @llvm.arm.neon.vst4.p0.v8bf16(ptr [[PTR:%.*]], <8 x bfloat> [[TMP0]], <8 x bfloat> [[TMP1]], <8 x bfloat> [[TMP2]], <8 x bfloat> [[TMP3]], i32 2)
1226*207e5cccSFangrui Song // CHECK32-NEXT:    ret void
1227*207e5cccSFangrui Song //
1228*207e5cccSFangrui Song void test_vst4q_bf16(bfloat16_t *ptr, bfloat16x8x4_t val) {
1229*207e5cccSFangrui Song   vst4q_bf16(ptr, val);
1230*207e5cccSFangrui Song }
1231*207e5cccSFangrui Song 
1232*207e5cccSFangrui Song // CHECK64-LABEL: @test_vst4_lane_bf16(
1233*207e5cccSFangrui Song // CHECK64-NEXT:  entry:
1234*207e5cccSFangrui Song // CHECK64-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <4 x bfloat>] [[VAL_COERCE:%.*]], 0
1235*207e5cccSFangrui Song // CHECK64-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <4 x bfloat>] [[VAL_COERCE]], 1
1236*207e5cccSFangrui Song // CHECK64-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <4 x bfloat>] [[VAL_COERCE]], 2
1237*207e5cccSFangrui Song // CHECK64-NEXT:    [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <4 x bfloat>] [[VAL_COERCE]], 3
1238*207e5cccSFangrui Song // CHECK64-NEXT:    tail call void @llvm.aarch64.neon.st4lane.v4bf16.p0(<4 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_2_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_3_EXTRACT]], i64 1, ptr [[PTR:%.*]])
1239*207e5cccSFangrui Song // CHECK64-NEXT:    ret void
1240*207e5cccSFangrui Song //
1241*207e5cccSFangrui Song // CHECK32-LABEL: @test_vst4_lane_bf16(
1242*207e5cccSFangrui Song // CHECK32-NEXT:  entry:
1243*207e5cccSFangrui Song // CHECK32-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[VAL_COERCE:%.*]], 0
1244*207e5cccSFangrui Song // CHECK32-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[VAL_COERCE]], 1
1245*207e5cccSFangrui Song // CHECK32-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[VAL_COERCE]], 2
1246*207e5cccSFangrui Song // CHECK32-NEXT:    [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[VAL_COERCE]], 3
1247*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <4 x bfloat>
1248*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <4 x bfloat>
1249*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_2_EXTRACT]] to <4 x bfloat>
1250*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_3_EXTRACT]] to <4 x bfloat>
1251*207e5cccSFangrui Song // CHECK32-NEXT:    tail call void @llvm.arm.neon.vst4lane.p0.v4bf16(ptr [[PTR:%.*]], <4 x bfloat> [[TMP0]], <4 x bfloat> [[TMP1]], <4 x bfloat> [[TMP2]], <4 x bfloat> [[TMP3]], i32 1, i32 2)
1252*207e5cccSFangrui Song // CHECK32-NEXT:    ret void
1253*207e5cccSFangrui Song //
1254*207e5cccSFangrui Song void test_vst4_lane_bf16(bfloat16_t *ptr, bfloat16x4x4_t val) {
1255*207e5cccSFangrui Song   vst4_lane_bf16(ptr, val, 1);
1256*207e5cccSFangrui Song }
1257*207e5cccSFangrui Song 
1258*207e5cccSFangrui Song // CHECK64-LABEL: @test_vst4q_lane_bf16(
1259*207e5cccSFangrui Song // CHECK64-NEXT:  entry:
1260*207e5cccSFangrui Song // CHECK64-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x bfloat>] [[VAL_COERCE:%.*]], 0
1261*207e5cccSFangrui Song // CHECK64-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x bfloat>] [[VAL_COERCE]], 1
1262*207e5cccSFangrui Song // CHECK64-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x bfloat>] [[VAL_COERCE]], 2
1263*207e5cccSFangrui Song // CHECK64-NEXT:    [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x bfloat>] [[VAL_COERCE]], 3
1264*207e5cccSFangrui Song // CHECK64-NEXT:    tail call void @llvm.aarch64.neon.st4lane.v8bf16.p0(<8 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_2_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_3_EXTRACT]], i64 7, ptr [[PTR:%.*]])
1265*207e5cccSFangrui Song // CHECK64-NEXT:    ret void
1266*207e5cccSFangrui Song //
1267*207e5cccSFangrui Song // CHECK32-LABEL: @test_vst4q_lane_bf16(
1268*207e5cccSFangrui Song // CHECK32-NEXT:  entry:
1269*207e5cccSFangrui Song // CHECK32-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[VAL_COERCE:%.*]], 0
1270*207e5cccSFangrui Song // CHECK32-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[VAL_COERCE]], 1
1271*207e5cccSFangrui Song // CHECK32-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[VAL_COERCE]], 2
1272*207e5cccSFangrui Song // CHECK32-NEXT:    [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[VAL_COERCE]], 3
1273*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <8 x bfloat>
1274*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <8 x bfloat>
1275*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_2_EXTRACT]] to <8 x bfloat>
1276*207e5cccSFangrui Song // CHECK32-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_3_EXTRACT]] to <8 x bfloat>
1277*207e5cccSFangrui Song // CHECK32-NEXT:    tail call void @llvm.arm.neon.vst4lane.p0.v8bf16(ptr [[PTR:%.*]], <8 x bfloat> [[TMP0]], <8 x bfloat> [[TMP1]], <8 x bfloat> [[TMP2]], <8 x bfloat> [[TMP3]], i32 7, i32 2)
1278*207e5cccSFangrui Song // CHECK32-NEXT:    ret void
1279*207e5cccSFangrui Song //
1280*207e5cccSFangrui Song void test_vst4q_lane_bf16(bfloat16_t *ptr, bfloat16x8x4_t val) {
1281*207e5cccSFangrui Song   vst4q_lane_bf16(ptr, val, 7);
1282*207e5cccSFangrui Song }
1283