xref: /llvm-project/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_fdot.c (revision 804b81d39f2d50743fd2090aed72dad29f5fb388)
1 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
2 
3 // RUN: %clang_cc1        -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -target-feature +fp8dot2 -target-feature +fp8dot4 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg | FileCheck %s
4 // RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -target-feature +fp8dot2 -target-feature +fp8dot4 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg | FileCheck %s -check-prefix CHECK-CXX
5 
6 // RUN: %clang_cc1        -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -target-feature +fp8dot2 -target-feature +fp8dot4 -O3 -Werror -Wall -S -o /dev/null %s
7 
8 // REQUIRES: aarch64-registered-target
9 
10 #include <arm_neon.h>
11 
12 // CHECK-LABEL: define dso_local <4 x half> @test_vdot_f16(
13 // CHECK-SAME: <4 x half> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
14 // CHECK-NEXT:  [[ENTRY:.*:]]
15 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[VD]] to <8 x i8>
16 // CHECK-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
17 // CHECK-NEXT:    [[FDOT21_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fp8.fdot2.v4f16.v8i8(<4 x half> [[VD]], <8 x i8> [[VN]], <8 x i8> [[VM]])
18 // CHECK-NEXT:    ret <4 x half> [[FDOT21_I]]
19 //
20 // CHECK-CXX-LABEL: define dso_local noundef <4 x half> @_Z13test_vdot_f1613__Float16x4_t13__Mfloat8x8_tS0_m(
21 // CHECK-CXX-SAME: <4 x half> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
22 // CHECK-CXX-NEXT:  [[ENTRY:.*:]]
23 // CHECK-CXX-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[VD]] to <8 x i8>
24 // CHECK-CXX-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
25 // CHECK-CXX-NEXT:    [[FDOT21_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fp8.fdot2.v4f16.v8i8(<4 x half> [[VD]], <8 x i8> [[VN]], <8 x i8> [[VM]])
26 // CHECK-CXX-NEXT:    ret <4 x half> [[FDOT21_I]]
27 //
// Forwards vd, vn, vm and fpmr unchanged to vdot_f16_mf8_fpm and returns its
// result. The autogenerated assertions above expect a write of fpmr through
// llvm.aarch64.set.fpmr followed by the fdot2 intrinsic taking a <4 x half>
// accumulator and two <8 x i8> operands.
28 float16x4_t test_vdot_f16(float16x4_t vd, mfloat8x8_t vn, mfloat8x8_t vm, fpm_t fpmr) {
29   return vdot_f16_mf8_fpm(vd, vn, vm, fpmr);
30 }
31 
32 // CHECK-LABEL: define dso_local <8 x half> @test_vdotq_f16(
33 // CHECK-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
34 // CHECK-NEXT:  [[ENTRY:.*:]]
35 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[VD]] to <16 x i8>
36 // CHECK-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
37 // CHECK-NEXT:    [[FDOT21_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.fdot2.v8f16.v16i8(<8 x half> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
38 // CHECK-NEXT:    ret <8 x half> [[FDOT21_I]]
39 //
40 // CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z14test_vdotq_f1613__Float16x8_t14__Mfloat8x16_tS0_m(
41 // CHECK-CXX-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
42 // CHECK-CXX-NEXT:  [[ENTRY:.*:]]
43 // CHECK-CXX-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[VD]] to <16 x i8>
44 // CHECK-CXX-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
45 // CHECK-CXX-NEXT:    [[FDOT21_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.fdot2.v8f16.v16i8(<8 x half> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
46 // CHECK-CXX-NEXT:    ret <8 x half> [[FDOT21_I]]
47 //
// Forwards vd, vn, vm and fpmr unchanged to vdotq_f16_mf8_fpm and returns its
// result. The autogenerated assertions above expect a write of fpmr through
// llvm.aarch64.set.fpmr followed by the fdot2 intrinsic taking a <8 x half>
// accumulator and two <16 x i8> operands.
48 float16x8_t test_vdotq_f16(float16x8_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpmr) {
49   return vdotq_f16_mf8_fpm(vd, vn, vm, fpmr);
50 }
51 
52 // CHECK-LABEL: define dso_local <4 x half> @test_vdot_lane_f16(
53 // CHECK-SAME: <4 x half> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
54 // CHECK-NEXT:  [[ENTRY:.*:]]
55 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[VD]] to <8 x i8>
56 // CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> poison, <8 x i8> [[VM]], i64 0)
57 // CHECK-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
58 // CHECK-NEXT:    [[FDOT2_LANE:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
59 // CHECK-NEXT:    [[FDOT2_LANE1:%.*]] = call <4 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v4f16.v8i8(<4 x half> [[FDOT2_LANE]], <8 x i8> [[VN]], <16 x i8> [[TMP1]], i32 3)
60 // CHECK-NEXT:    ret <4 x half> [[FDOT2_LANE1]]
61 //
62 // CHECK-CXX-LABEL: define dso_local noundef <4 x half> @_Z18test_vdot_lane_f1613__Float16x4_t13__Mfloat8x8_tS0_m(
63 // CHECK-CXX-SAME: <4 x half> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
64 // CHECK-CXX-NEXT:  [[ENTRY:.*:]]
65 // CHECK-CXX-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[VD]] to <8 x i8>
66 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> poison, <8 x i8> [[VM]], i64 0)
67 // CHECK-CXX-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
68 // CHECK-CXX-NEXT:    [[FDOT2_LANE:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
69 // CHECK-CXX-NEXT:    [[FDOT2_LANE1:%.*]] = call <4 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v4f16.v8i8(<4 x half> [[FDOT2_LANE]], <8 x i8> [[VN]], <16 x i8> [[TMP1]], i32 3)
70 // CHECK-CXX-NEXT:    ret <4 x half> [[FDOT2_LANE1]]
71 //
// Calls vdot_lane_f16_mf8_fpm with the constant lane index 3 and returns its
// result. The autogenerated assertions above expect the <8 x i8> vm to be
// widened into a <16 x i8> vector (llvm.vector.insert at index 0 of poison)
// before the fdot2.lane intrinsic is called with i32 3, after fpmr is written
// through llvm.aarch64.set.fpmr.
// NOTE(review): 3 is presumably the highest valid lane for this variant --
// confirm against the ACLE specification.
72 float16x4_t test_vdot_lane_f16(float16x4_t vd, mfloat8x8_t vn, mfloat8x8_t vm, fpm_t fpmr) {
73   return vdot_lane_f16_mf8_fpm(vd, vn, vm, 3, fpmr);
74 }
75 
76 // CHECK-LABEL: define dso_local <4 x half> @test_vdot_laneq_f16(
77 // CHECK-SAME: <4 x half> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
78 // CHECK-NEXT:  [[ENTRY:.*:]]
79 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[VD]] to <8 x i8>
80 // CHECK-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
81 // CHECK-NEXT:    [[FDOT2_LANE:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
82 // CHECK-NEXT:    [[FDOT2_LANE1:%.*]] = call <4 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v4f16.v8i8(<4 x half> [[FDOT2_LANE]], <8 x i8> [[VN]], <16 x i8> [[VM]], i32 7)
83 // CHECK-NEXT:    ret <4 x half> [[FDOT2_LANE1]]
84 //
85 // CHECK-CXX-LABEL: define dso_local noundef <4 x half> @_Z19test_vdot_laneq_f1613__Float16x4_t13__Mfloat8x8_t14__Mfloat8x16_tm(
86 // CHECK-CXX-SAME: <4 x half> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
87 // CHECK-CXX-NEXT:  [[ENTRY:.*:]]
88 // CHECK-CXX-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[VD]] to <8 x i8>
89 // CHECK-CXX-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
90 // CHECK-CXX-NEXT:    [[FDOT2_LANE:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
91 // CHECK-CXX-NEXT:    [[FDOT2_LANE1:%.*]] = call <4 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v4f16.v8i8(<4 x half> [[FDOT2_LANE]], <8 x i8> [[VN]], <16 x i8> [[VM]], i32 7)
92 // CHECK-CXX-NEXT:    ret <4 x half> [[FDOT2_LANE1]]
93 //
// Calls vdot_laneq_f16_mf8_fpm with the constant lane index 7 and returns its
// result. Here vm is already a 16-element vector, so the autogenerated
// assertions above expect it to be passed directly to the fdot2.lane intrinsic
// (with i32 7) after fpmr is written through llvm.aarch64.set.fpmr.
// NOTE(review): 7 is presumably the highest valid lane for this variant --
// confirm against the ACLE specification.
94 float16x4_t test_vdot_laneq_f16(float16x4_t vd, mfloat8x8_t vn, mfloat8x16_t vm, fpm_t fpmr) {
95   return vdot_laneq_f16_mf8_fpm(vd, vn, vm, 7, fpmr);
96 }
97 
98 // CHECK-LABEL: define dso_local <8 x half> @test_vdotq_lane_f16(
99 // CHECK-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
100 // CHECK-NEXT:  [[ENTRY:.*:]]
101 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[VD]] to <16 x i8>
102 // CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> poison, <8 x i8> [[VM]], i64 0)
103 // CHECK-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
104 // CHECK-NEXT:    [[FDOT2_LANE:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
105 // CHECK-NEXT:    [[FDOT2_LANE1:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v8f16.v16i8(<8 x half> [[FDOT2_LANE]], <16 x i8> [[VN]], <16 x i8> [[TMP1]], i32 3)
106 // CHECK-NEXT:    ret <8 x half> [[FDOT2_LANE1]]
107 //
108 // CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z19test_vdotq_lane_f1613__Float16x8_t14__Mfloat8x16_t13__Mfloat8x8_tm(
109 // CHECK-CXX-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
110 // CHECK-CXX-NEXT:  [[ENTRY:.*:]]
111 // CHECK-CXX-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[VD]] to <16 x i8>
112 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> poison, <8 x i8> [[VM]], i64 0)
113 // CHECK-CXX-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
114 // CHECK-CXX-NEXT:    [[FDOT2_LANE:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
115 // CHECK-CXX-NEXT:    [[FDOT2_LANE1:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v8f16.v16i8(<8 x half> [[FDOT2_LANE]], <16 x i8> [[VN]], <16 x i8> [[TMP1]], i32 3)
116 // CHECK-CXX-NEXT:    ret <8 x half> [[FDOT2_LANE1]]
117 //
// Calls vdotq_lane_f16_mf8_fpm with the constant lane index 3 and returns its
// result. The autogenerated assertions above expect the <8 x i8> vm to be
// widened into a <16 x i8> vector (llvm.vector.insert at index 0 of poison)
// before the <8 x half> fdot2.lane intrinsic is called with i32 3, after fpmr
// is written through llvm.aarch64.set.fpmr.
// NOTE(review): 3 is presumably the highest valid lane for this variant --
// confirm against the ACLE specification.
118 float16x8_t test_vdotq_lane_f16(float16x8_t vd, mfloat8x16_t vn, mfloat8x8_t vm, fpm_t fpmr) {
119   return vdotq_lane_f16_mf8_fpm(vd, vn, vm, 3, fpmr);
120 }
121 
122 // CHECK-LABEL: define dso_local <8 x half> @test_vdotq_laneq_f16(
123 // CHECK-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
124 // CHECK-NEXT:  [[ENTRY:.*:]]
125 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[VD]] to <16 x i8>
126 // CHECK-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
127 // CHECK-NEXT:    [[FDOT2_LANE:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
128 // CHECK-NEXT:    [[FDOT2_LANE1:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v8f16.v16i8(<8 x half> [[FDOT2_LANE]], <16 x i8> [[VN]], <16 x i8> [[VM]], i32 7)
129 // CHECK-NEXT:    ret <8 x half> [[FDOT2_LANE1]]
130 //
131 // CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z20test_vdotq_laneq_f1613__Float16x8_t14__Mfloat8x16_tS0_m(
132 // CHECK-CXX-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
133 // CHECK-CXX-NEXT:  [[ENTRY:.*:]]
134 // CHECK-CXX-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[VD]] to <16 x i8>
135 // CHECK-CXX-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
136 // CHECK-CXX-NEXT:    [[FDOT2_LANE:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
137 // CHECK-CXX-NEXT:    [[FDOT2_LANE1:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v8f16.v16i8(<8 x half> [[FDOT2_LANE]], <16 x i8> [[VN]], <16 x i8> [[VM]], i32 7)
138 // CHECK-CXX-NEXT:    ret <8 x half> [[FDOT2_LANE1]]
139 //
// Calls vdotq_laneq_f16_mf8_fpm with the constant lane index 7 and returns its
// result. Both vn and vm are 16-element vectors, so the autogenerated
// assertions above expect vm to be passed directly to the <8 x half>
// fdot2.lane intrinsic (with i32 7) after fpmr is written through
// llvm.aarch64.set.fpmr.
// NOTE(review): 7 is presumably the highest valid lane for this variant --
// confirm against the ACLE specification.
140 float16x8_t test_vdotq_laneq_f16(float16x8_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpmr) {
141   return vdotq_laneq_f16_mf8_fpm(vd, vn, vm, 7, fpmr);
142 }
143 
144 // CHECK-LABEL: define dso_local <2 x float> @test_vdot_f32(
145 // CHECK-SAME: <2 x float> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
146 // CHECK-NEXT:  [[ENTRY:.*:]]
147 // CHECK-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
148 // CHECK-NEXT:    [[FDOT4_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fp8.fdot4.v2f32.v8i8(<2 x float> [[VD]], <8 x i8> [[VN]], <8 x i8> [[VM]])
149 // CHECK-NEXT:    ret <2 x float> [[FDOT4_I]]
150 //
151 // CHECK-CXX-LABEL: define dso_local noundef <2 x float> @_Z13test_vdot_f3213__Float32x2_t13__Mfloat8x8_tS0_m(
152 // CHECK-CXX-SAME: <2 x float> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
153 // CHECK-CXX-NEXT:  [[ENTRY:.*:]]
154 // CHECK-CXX-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
155 // CHECK-CXX-NEXT:    [[FDOT4_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fp8.fdot4.v2f32.v8i8(<2 x float> [[VD]], <8 x i8> [[VN]], <8 x i8> [[VM]])
156 // CHECK-CXX-NEXT:    ret <2 x float> [[FDOT4_I]]
157 //
// Forwards vd, vn, vm and fpmr unchanged to vdot_f32_mf8_fpm and returns its
// result. The autogenerated assertions above expect a write of fpmr through
// llvm.aarch64.set.fpmr followed by the fdot4 intrinsic taking a <2 x float>
// accumulator and two <8 x i8> operands.
158 float32x2_t test_vdot_f32(float32x2_t vd, mfloat8x8_t vn, mfloat8x8_t vm, fpm_t fpmr) {
159   return vdot_f32_mf8_fpm(vd, vn, vm, fpmr);
160 }
161 
162 // CHECK-LABEL: define dso_local <4 x float> @test_vdotq_f32(
163 // CHECK-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
164 // CHECK-NEXT:  [[ENTRY:.*:]]
165 // CHECK-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
166 // CHECK-NEXT:    [[FDOT4_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fp8.fdot4.v4f32.v16i8(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
167 // CHECK-NEXT:    ret <4 x float> [[FDOT4_I]]
168 //
169 // CHECK-CXX-LABEL: define dso_local noundef <4 x float> @_Z14test_vdotq_f3213__Float32x4_t14__Mfloat8x16_tS0_m(
170 // CHECK-CXX-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
171 // CHECK-CXX-NEXT:  [[ENTRY:.*:]]
172 // CHECK-CXX-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
173 // CHECK-CXX-NEXT:    [[FDOT4_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fp8.fdot4.v4f32.v16i8(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
174 // CHECK-CXX-NEXT:    ret <4 x float> [[FDOT4_I]]
175 //
// Forwards vd, vn, vm and fpmr unchanged to vdotq_f32_mf8_fpm and returns its
// result. The autogenerated assertions above expect a write of fpmr through
// llvm.aarch64.set.fpmr followed by the fdot4 intrinsic taking a <4 x float>
// accumulator and two <16 x i8> operands.
176 float32x4_t test_vdotq_f32(float32x4_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpmr) {
177   return vdotq_f32_mf8_fpm(vd, vn, vm, fpmr);
178 }
179 
180 // CHECK-LABEL: define dso_local <2 x float> @test_vdot_lane_f32(
181 // CHECK-SAME: <2 x float> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
182 // CHECK-NEXT:  [[ENTRY:.*:]]
183 // CHECK-NEXT:    [[TMP0:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> poison, <8 x i8> [[VM]], i64 0)
184 // CHECK-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
185 // CHECK-NEXT:    [[FDOT4_LANE:%.*]] = call <2 x float> @llvm.aarch64.neon.fp8.fdot4.lane.v2f32.v8i8(<2 x float> [[VD]], <8 x i8> [[VN]], <16 x i8> [[TMP0]], i32 1)
186 // CHECK-NEXT:    ret <2 x float> [[FDOT4_LANE]]
187 //
188 // CHECK-CXX-LABEL: define dso_local noundef <2 x float> @_Z18test_vdot_lane_f3213__Float32x2_t13__Mfloat8x8_tS0_m(
189 // CHECK-CXX-SAME: <2 x float> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
190 // CHECK-CXX-NEXT:  [[ENTRY:.*:]]
191 // CHECK-CXX-NEXT:    [[TMP0:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> poison, <8 x i8> [[VM]], i64 0)
192 // CHECK-CXX-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
193 // CHECK-CXX-NEXT:    [[FDOT4_LANE:%.*]] = call <2 x float> @llvm.aarch64.neon.fp8.fdot4.lane.v2f32.v8i8(<2 x float> [[VD]], <8 x i8> [[VN]], <16 x i8> [[TMP0]], i32 1)
194 // CHECK-CXX-NEXT:    ret <2 x float> [[FDOT4_LANE]]
195 //
// Calls vdot_lane_f32_mf8_fpm with the constant lane index 1 and returns its
// result. The autogenerated assertions above expect the <8 x i8> vm to be
// widened into a <16 x i8> vector (llvm.vector.insert at index 0 of poison)
// before the fdot4.lane intrinsic is called with i32 1, after fpmr is written
// through llvm.aarch64.set.fpmr.
// NOTE(review): 1 is presumably the highest valid lane for this variant --
// confirm against the ACLE specification.
196 float32x2_t test_vdot_lane_f32(float32x2_t vd, mfloat8x8_t vn, mfloat8x8_t vm, fpm_t fpmr) {
197   return vdot_lane_f32_mf8_fpm(vd, vn, vm, 1, fpmr);
198 }
199 
200 // CHECK-LABEL: define dso_local <2 x float> @test_vdot_laneq_f32(
201 // CHECK-SAME: <2 x float> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
202 // CHECK-NEXT:  [[ENTRY:.*:]]
203 // CHECK-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
204 // CHECK-NEXT:    [[FDOT4_LANE:%.*]] = call <2 x float> @llvm.aarch64.neon.fp8.fdot4.lane.v2f32.v8i8(<2 x float> [[VD]], <8 x i8> [[VN]], <16 x i8> [[VM]], i32 3)
205 // CHECK-NEXT:    ret <2 x float> [[FDOT4_LANE]]
206 //
207 // CHECK-CXX-LABEL: define dso_local noundef <2 x float> @_Z19test_vdot_laneq_f3213__Float32x2_t13__Mfloat8x8_t14__Mfloat8x16_tm(
208 // CHECK-CXX-SAME: <2 x float> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
209 // CHECK-CXX-NEXT:  [[ENTRY:.*:]]
210 // CHECK-CXX-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
211 // CHECK-CXX-NEXT:    [[FDOT4_LANE:%.*]] = call <2 x float> @llvm.aarch64.neon.fp8.fdot4.lane.v2f32.v8i8(<2 x float> [[VD]], <8 x i8> [[VN]], <16 x i8> [[VM]], i32 3)
212 // CHECK-CXX-NEXT:    ret <2 x float> [[FDOT4_LANE]]
213 //
// Calls vdot_laneq_f32_mf8_fpm with the constant lane index 3 and returns its
// result. Here vm is already a 16-element vector, so the autogenerated
// assertions above expect it to be passed directly to the fdot4.lane intrinsic
// (with i32 3) after fpmr is written through llvm.aarch64.set.fpmr.
// NOTE(review): 3 is presumably the highest valid lane for this variant --
// confirm against the ACLE specification.
214 float32x2_t test_vdot_laneq_f32(float32x2_t vd, mfloat8x8_t vn, mfloat8x16_t vm, fpm_t fpmr) {
215   return vdot_laneq_f32_mf8_fpm(vd, vn, vm, 3, fpmr);
216 }
217 
218 // CHECK-LABEL: define dso_local <4 x float> @test_vdotq_lane_f32(
219 // CHECK-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
220 // CHECK-NEXT:  [[ENTRY:.*:]]
221 // CHECK-NEXT:    [[TMP0:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> poison, <8 x i8> [[VM]], i64 0)
222 // CHECK-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
223 // CHECK-NEXT:    [[FDOT4_LANE:%.*]] = call <4 x float> @llvm.aarch64.neon.fp8.fdot4.lane.v4f32.v16i8(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[TMP0]], i32 1)
224 // CHECK-NEXT:    ret <4 x float> [[FDOT4_LANE]]
225 //
226 // CHECK-CXX-LABEL: define dso_local noundef <4 x float> @_Z19test_vdotq_lane_f3213__Float32x4_t14__Mfloat8x16_t13__Mfloat8x8_tm(
227 // CHECK-CXX-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
228 // CHECK-CXX-NEXT:  [[ENTRY:.*:]]
229 // CHECK-CXX-NEXT:    [[TMP0:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> poison, <8 x i8> [[VM]], i64 0)
230 // CHECK-CXX-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
231 // CHECK-CXX-NEXT:    [[FDOT4_LANE:%.*]] = call <4 x float> @llvm.aarch64.neon.fp8.fdot4.lane.v4f32.v16i8(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[TMP0]], i32 1)
232 // CHECK-CXX-NEXT:    ret <4 x float> [[FDOT4_LANE]]
233 //
// Calls vdotq_lane_f32_mf8_fpm with the constant lane index 1 and returns its
// result. The autogenerated assertions above expect the <8 x i8> vm to be
// widened into a <16 x i8> vector (llvm.vector.insert at index 0 of poison)
// before the <4 x float> fdot4.lane intrinsic is called with i32 1, after fpmr
// is written through llvm.aarch64.set.fpmr.
// NOTE(review): 1 is presumably the highest valid lane for this variant --
// confirm against the ACLE specification.
234 float32x4_t test_vdotq_lane_f32(float32x4_t vd, mfloat8x16_t vn, mfloat8x8_t vm, fpm_t fpmr) {
235   return vdotq_lane_f32_mf8_fpm(vd, vn, vm, 1, fpmr);
236 }
237 
238 // CHECK-LABEL: define dso_local <4 x float> @test_vdotq_laneq_f32(
239 // CHECK-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
240 // CHECK-NEXT:  [[ENTRY:.*:]]
241 // CHECK-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
242 // CHECK-NEXT:    [[FDOT4_LANE:%.*]] = call <4 x float> @llvm.aarch64.neon.fp8.fdot4.lane.v4f32.v16i8(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]], i32 3)
243 // CHECK-NEXT:    ret <4 x float> [[FDOT4_LANE]]
244 //
245 // CHECK-CXX-LABEL: define dso_local noundef <4 x float> @_Z20test_vdotq_laneq_f3213__Float32x4_t14__Mfloat8x16_tS0_m(
246 // CHECK-CXX-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
247 // CHECK-CXX-NEXT:  [[ENTRY:.*:]]
248 // CHECK-CXX-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
249 // CHECK-CXX-NEXT:    [[FDOT4_LANE:%.*]] = call <4 x float> @llvm.aarch64.neon.fp8.fdot4.lane.v4f32.v16i8(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]], i32 3)
250 // CHECK-CXX-NEXT:    ret <4 x float> [[FDOT4_LANE]]
251 //
// Calls vdotq_laneq_f32_mf8_fpm with the constant lane index 3 and returns its
// result. Both vn and vm are 16-element vectors, so the autogenerated
// assertions above expect vm to be passed directly to the <4 x float>
// fdot4.lane intrinsic (with i32 3) after fpmr is written through
// llvm.aarch64.set.fpmr.
// NOTE(review): 3 is presumably the highest valid lane for this variant --
// confirm against the ACLE specification.
252 float32x4_t test_vdotq_laneq_f32(float32x4_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpmr) {
253   return vdotq_laneq_f32_mf8_fpm(vd, vn, vm, 3, fpmr);
254 }
255