xref: /llvm-project/llvm/test/CodeGen/AArch64/sme2-intrinsics-fp-dots.ll (revision 62baf21daa377c4ec1a641b26931063c1117d262)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -force-streaming -verify-machineinstrs < %s | FileCheck %s
3
; Exercises the llvm.aarch64.sme.fdot.* za32 intrinsics in three operand forms
; (multi x multi, multi x single, multi x indexed lane), each with half and
; bfloat inputs and with vg1x2 and vg1x4 vector groups.
; Every test calls its intrinsic twice, with %slice and with %slice + 7.
; The assertion lines are generated; regenerate them with the script named in
; the NOTE line instead of editing them by hand.
4target triple="aarch64-linux-gnu"
5
6
7; == Multi, multi (16-bit float) ==
8
; Multi x multi FDOT, half inputs, two-vector groups (vg1x2).
; The intrinsic is called once with %slice and once with %slice + 7; the
; expected output uses a single w8 base with immediate offsets 0 and 7.
; The expected output also copies z1/z2 into z6/z7 and z3/z4 into z4/z5
; before the two fdot instructions.
; %unused is never referenced in the body.
; NOTE(review): presumably %unused only occupies z0 so the operands arrive in
; z1-z4 and have to be moved into register pairs - confirm.
9define void @fdot_multi_za32_f16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3) #0 {
10; CHECK-LABEL: fdot_multi_za32_f16_vg1x2:
11; CHECK:       // %bb.0:
12; CHECK-NEXT:    mov z5.d, z4.d
13; CHECK-NEXT:    mov z7.d, z2.d
14; CHECK-NEXT:    mov w8, w0
15; CHECK-NEXT:    mov z4.d, z3.d
16; CHECK-NEXT:    mov z6.d, z1.d
17; CHECK-NEXT:    fdot za.s[w8, 0, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
18; CHECK-NEXT:    fdot za.s[w8, 7, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
19; CHECK-NEXT:    ret
20  call void @llvm.aarch64.sme.fdot.za32.vg1x2.nxv8f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3)
; Second call: same vector operands, slice index advanced by 7.
21  %slice2 = add i32 %slice, 7
22  call void @llvm.aarch64.sme.fdot.za32.vg1x2.nxv8f16(i32 %slice2, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3)
23  ret void
24}
25
; Multi x multi FDOT, half inputs, four-vector groups (vg1x4).
; The parameter list is split over two lines; the generated assertion block
; sits between them, so the signature resumes after the expected "ret" line.
; The expected output builds z28-z31 from z1-z4 and z24-z26 from z5-z7 with
; register moves, and loads z27 from [x1] with ld1h.
; NOTE(review): presumably %zn7 is passed indirectly (pointer in x1) because
; the vector argument registers are exhausted - confirm against the ABI.
; %unused is never referenced in the body.
26define void @fdot_multi_za32_f16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
27; CHECK-LABEL: fdot_multi_za32_f16_vg1x4:
28; CHECK:       // %bb.0:
29; CHECK-NEXT:    mov z26.d, z7.d
30; CHECK-NEXT:    mov z31.d, z4.d
31; CHECK-NEXT:    mov w8, w0
32; CHECK-NEXT:    ptrue p0.h
33; CHECK-NEXT:    mov z25.d, z6.d
34; CHECK-NEXT:    mov z30.d, z3.d
35; CHECK-NEXT:    mov z24.d, z5.d
36; CHECK-NEXT:    mov z29.d, z2.d
37; CHECK-NEXT:    ld1h { z27.h }, p0/z, [x1]
38; CHECK-NEXT:    mov z28.d, z1.d
39; CHECK-NEXT:    fdot za.s[w8, 0, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
40; CHECK-NEXT:    fdot za.s[w8, 7, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
41; CHECK-NEXT:    ret
42                                        <vscale x 8 x half> %zn4, <vscale x 8 x half> %zn5, <vscale x 8 x half> %zn6, <vscale x 8 x half> %zn7) #0 {
43  call void @llvm.aarch64.sme.fdot.za32.vg1x4.nxv8f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
44                                                      <vscale x 8 x half> %zn4, <vscale x 8 x half> %zn5, <vscale x 8 x half> %zn6, <vscale x 8 x half> %zn7)
; Second call: same vector operands, slice index advanced by 7.
45  %slice2 = add i32 %slice, 7
46  call void @llvm.aarch64.sme.fdot.za32.vg1x4.nxv8f16(i32 %slice2, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
47                                                      <vscale x 8 x half> %zn4, <vscale x 8 x half> %zn5, <vscale x 8 x half> %zn6, <vscale x 8 x half> %zn7)
48  ret void
49}
50
51
52; == Multi, multi (16-bit bfloat) ==
53
; Multi x multi dot product with bfloat inputs, two-vector groups (vg1x2).
; The IR calls the fdot intrinsic with the nxv8bf16 type suffix; the expected
; output is the bfdot instruction.
; Called once with %slice and once with %slice + 7; the expected output uses a
; single w8 base with immediate offsets 0 and 7.
; %unused is never referenced in the body.
54define void @bfdot_multi_za32_bf16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3) #0 {
55; CHECK-LABEL: bfdot_multi_za32_bf16_vg1x2:
56; CHECK:       // %bb.0:
57; CHECK-NEXT:    mov z5.d, z4.d
58; CHECK-NEXT:    mov z7.d, z2.d
59; CHECK-NEXT:    mov w8, w0
60; CHECK-NEXT:    mov z4.d, z3.d
61; CHECK-NEXT:    mov z6.d, z1.d
62; CHECK-NEXT:    bfdot za.s[w8, 0, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
63; CHECK-NEXT:    bfdot za.s[w8, 7, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
64; CHECK-NEXT:    ret
65  call void @llvm.aarch64.sme.fdot.za32.vg1x2.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3)
; Second call: same vector operands, slice index advanced by 7.
66  %slice2 = add i32 %slice, 7
67  call void @llvm.aarch64.sme.fdot.za32.vg1x2.nxv8bf16(i32 %slice2, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3)
68  ret void
69}
70
; Multi x multi dot product with bfloat inputs, four-vector groups (vg1x4).
; The expected output is the bfdot instruction.
; NOTE(review): this function is named fdot_..., while the other bfloat tests
; in this file are named bfdot_...; renaming it would also require
; regenerating the label assertion below.
; The parameter list is split over two lines; the generated assertion block
; sits between them, so the signature resumes after the expected "ret" line.
; The expected output builds z28-z31 from z1-z4 and z24-z26 from z5-z7 with
; register moves, and loads z27 from [x1] with ld1h.
; NOTE(review): presumably %zn7 is passed indirectly (pointer in x1) because
; the vector argument registers are exhausted - confirm against the ABI.
71define void @fdot_multi_za32_bf16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
72; CHECK-LABEL: fdot_multi_za32_bf16_vg1x4:
73; CHECK:       // %bb.0:
74; CHECK-NEXT:    mov z26.d, z7.d
75; CHECK-NEXT:    mov z31.d, z4.d
76; CHECK-NEXT:    mov w8, w0
77; CHECK-NEXT:    ptrue p0.h
78; CHECK-NEXT:    mov z25.d, z6.d
79; CHECK-NEXT:    mov z30.d, z3.d
80; CHECK-NEXT:    mov z24.d, z5.d
81; CHECK-NEXT:    mov z29.d, z2.d
82; CHECK-NEXT:    ld1h { z27.h }, p0/z, [x1]
83; CHECK-NEXT:    mov z28.d, z1.d
84; CHECK-NEXT:    bfdot za.s[w8, 0, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
85; CHECK-NEXT:    bfdot za.s[w8, 7, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
86; CHECK-NEXT:    ret
87                                        <vscale x 8 x bfloat> %zn4, <vscale x 8 x bfloat> %zn5, <vscale x 8 x bfloat> %zn6, <vscale x 8 x bfloat> %zn7) #0 {
88  call void @llvm.aarch64.sme.fdot.za32.vg1x4.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
89                                                       <vscale x 8 x bfloat> %zn4, <vscale x 8 x bfloat> %zn5, <vscale x 8 x bfloat> %zn6, <vscale x 8 x bfloat> %zn7)
; Second call: same vector operands, slice index advanced by 7.
90  %slice2 = add i32 %slice, 7
91  call void @llvm.aarch64.sme.fdot.za32.vg1x4.nxv8bf16(i32 %slice2, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
92                                                        <vscale x 8 x bfloat> %zn4, <vscale x 8 x bfloat> %zn5, <vscale x 8 x bfloat> %zn6, <vscale x 8 x bfloat> %zn7)
93  ret void
94}
95
96
97; == Multi, single (16-bit float) ==
98
; Multi x single FDOT, half inputs, two-vector group (vg1x2).
; %zn0/%zn1 form the vector group and %zn2 is the single-vector operand.
; The expected output uses { z1.h, z2.h } and z3.h directly: only "kill"
; annotations appear before the fdot instructions, no vector moves.
; Called once with %slice and once with %slice + 7; the expected output uses a
; single w8 base with immediate offsets 0 and 7.
; %unused is never referenced in the body.
99define void @fdot_single_za32_f16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2) #0 {
100; CHECK-LABEL: fdot_single_za32_f16_vg1x2:
101; CHECK:       // %bb.0:
102; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
103; CHECK-NEXT:    mov w8, w0
104; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
105; CHECK-NEXT:    fdot za.s[w8, 0, vgx2], { z1.h, z2.h }, z3.h
106; CHECK-NEXT:    fdot za.s[w8, 7, vgx2], { z1.h, z2.h }, z3.h
107; CHECK-NEXT:    ret
108  call void @llvm.aarch64.sme.fdot.single.za32.vg1x2.nxv8f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2)
; Second call: same vector operands, slice index advanced by 7.
109  %slice2 = add i32 %slice, 7
110  call void @llvm.aarch64.sme.fdot.single.za32.vg1x2.nxv8f16(i32 %slice2, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2)
111  ret void
112}
113
; Multi x single FDOT, half inputs, four-vector group (vg1x4).
; %zn0-%zn3 form the vector group and %zn4 is the single-vector operand.
; The expected output uses { z1.h - z4.h } and z5.h directly: only "kill"
; annotations appear before the fdot instructions, no vector moves.
; Called once with %slice and once with %slice + 7; the expected output uses a
; single w8 base with immediate offsets 0 and 7.
; %unused is never referenced in the body.
114define void @fdot_single_za32_f16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, <vscale x 8 x half> %zn4) #0 {
115; CHECK-LABEL: fdot_single_za32_f16_vg1x4:
116; CHECK:       // %bb.0:
117; CHECK-NEXT:    // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
118; CHECK-NEXT:    mov w8, w0
119; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
120; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
121; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
122; CHECK-NEXT:    fdot za.s[w8, 0, vgx4], { z1.h - z4.h }, z5.h
123; CHECK-NEXT:    fdot za.s[w8, 7, vgx4], { z1.h - z4.h }, z5.h
124; CHECK-NEXT:    ret
125  call void @llvm.aarch64.sme.fdot.single.za32.vg1x4.nxv8f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, <vscale x 8 x half> %zn4)
; Second call: same vector operands, slice index advanced by 7.
126  %slice2 = add i32 %slice, 7
127  call void @llvm.aarch64.sme.fdot.single.za32.vg1x4.nxv8f16(i32 %slice2, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, <vscale x 8 x half> %zn4)
128  ret void
129}
130
131
132; == Multi, single (16-bit bfloat) ==
133
; Multi x single dot product with bfloat inputs, two-vector group (vg1x2).
; %zn0/%zn1 form the vector group and %zn2 is the single-vector operand.
; The IR calls the fdot.single intrinsic with the nxv8bf16 type suffix; the
; expected output is the bfdot instruction on { z1.h, z2.h } and z3.h, with
; no vector moves.
; Called once with %slice and once with %slice + 7; the expected output uses a
; single w8 base with immediate offsets 0 and 7.
; %unused is never referenced in the body.
134define void @bfdot_single_za32_bf16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2) #0 {
135; CHECK-LABEL: bfdot_single_za32_bf16_vg1x2:
136; CHECK:       // %bb.0:
137; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
138; CHECK-NEXT:    mov w8, w0
139; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
140; CHECK-NEXT:    bfdot za.s[w8, 0, vgx2], { z1.h, z2.h }, z3.h
141; CHECK-NEXT:    bfdot za.s[w8, 7, vgx2], { z1.h, z2.h }, z3.h
142; CHECK-NEXT:    ret
143  call void @llvm.aarch64.sme.fdot.single.za32.vg1x2.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2)
; Second call: same vector operands, slice index advanced by 7.
144  %slice2 = add i32 %slice, 7
145  call void @llvm.aarch64.sme.fdot.single.za32.vg1x2.nxv8bf16(i32 %slice2, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2)
146  ret void
147}
148
; Multi x single dot product with bfloat inputs, four-vector group (vg1x4).
; %zn0-%zn3 form the vector group and %zn4 is the single-vector operand.
; The IR calls the fdot.single intrinsic with the nxv8bf16 type suffix; the
; expected output is the bfdot instruction on { z1.h - z4.h } and z5.h, with
; no vector moves.
; Called once with %slice and once with %slice + 7; the expected output uses a
; single w8 base with immediate offsets 0 and 7.
; %unused is never referenced in the body.
149define void @bfdot_single_za32_bf16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, <vscale x 8 x bfloat> %zn4) #0 {
150; CHECK-LABEL: bfdot_single_za32_bf16_vg1x4:
151; CHECK:       // %bb.0:
152; CHECK-NEXT:    // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
153; CHECK-NEXT:    mov w8, w0
154; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
155; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
156; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
157; CHECK-NEXT:    bfdot za.s[w8, 0, vgx4], { z1.h - z4.h }, z5.h
158; CHECK-NEXT:    bfdot za.s[w8, 7, vgx4], { z1.h - z4.h }, z5.h
159; CHECK-NEXT:    ret
160  call void @llvm.aarch64.sme.fdot.single.za32.vg1x4.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, <vscale x 8 x bfloat> %zn4)
; Second call: same vector operands, slice index advanced by 7.
161  %slice2 = add i32 %slice, 7
162  call void @llvm.aarch64.sme.fdot.single.za32.vg1x4.nxv8bf16(i32 %slice2, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, <vscale x 8 x bfloat> %zn4)
163  ret void
164}
165
166
167; == Multi, indexed (16-bit float) ==
168
; Multi x indexed FDOT, half inputs, two-vector group (vg1x2).
; %zn0/%zn1 form the vector group; %zn2 is the indexed operand, and the
; trailing constant i32 3 shows up as the [3] lane index in the expected output.
; The expected output copies z1/z2 into z4/z5 before the fdot instructions and
; reads the indexed operand from z3.
; Called once with %slice and once with %slice + 7; the expected output uses a
; single w8 base with immediate offsets 0 and 7.
; %unused is never referenced in the body.
169define void @fdot_lane_za32_f16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2) #0 {
170; CHECK-LABEL: fdot_lane_za32_f16_vg1x2:
171; CHECK:       // %bb.0:
172; CHECK-NEXT:    mov z5.d, z2.d
173; CHECK-NEXT:    mov w8, w0
174; CHECK-NEXT:    mov z4.d, z1.d
175; CHECK-NEXT:    fdot za.s[w8, 0, vgx2], { z4.h, z5.h }, z3.h[3]
176; CHECK-NEXT:    fdot za.s[w8, 7, vgx2], { z4.h, z5.h }, z3.h[3]
177; CHECK-NEXT:    ret
178  call void @llvm.aarch64.sme.fdot.lane.za32.vg1x2.nxv8f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, i32 3)
; Second call: same operands and lane index, slice index advanced by 7.
179  %slice2 = add i32 %slice, 7
180  call void @llvm.aarch64.sme.fdot.lane.za32.vg1x2.nxv8f16(i32 %slice2, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, i32 3)
181  ret void
182}
183
; Multi x indexed FDOT, half inputs, four-vector group (vg1x4).
; %zn0-%zn3 form the vector group; %zn4 is the indexed operand, and the
; trailing constant i32 3 shows up as the [3] lane index in the expected output.
; The expected output copies z1-z4 into z24-z27 before the fdot instructions
; and reads the indexed operand from z5.
; Called once with %slice and once with %slice + 7; the expected output uses a
; single w8 base with immediate offsets 0 and 7.
; %unused is never referenced in the body.
184define void @fdot_lane_za32_f16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, <vscale x 8 x half> %zn4) #0 {
185; CHECK-LABEL: fdot_lane_za32_f16_vg1x4:
186; CHECK:       // %bb.0:
187; CHECK-NEXT:    mov z27.d, z4.d
188; CHECK-NEXT:    mov w8, w0
189; CHECK-NEXT:    mov z26.d, z3.d
190; CHECK-NEXT:    mov z25.d, z2.d
191; CHECK-NEXT:    mov z24.d, z1.d
192; CHECK-NEXT:    fdot za.s[w8, 0, vgx4], { z24.h - z27.h }, z5.h[3]
193; CHECK-NEXT:    fdot za.s[w8, 7, vgx4], { z24.h - z27.h }, z5.h[3]
194; CHECK-NEXT:    ret
195  call void @llvm.aarch64.sme.fdot.lane.za32.vg1x4.nxv8f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
196                                                           <vscale x 8 x half> %zn4, i32 3)
; Second call: same operands and lane index, slice index advanced by 7.
197  %slice2 = add i32 %slice, 7
198  call void @llvm.aarch64.sme.fdot.lane.za32.vg1x4.nxv8f16(i32 %slice2, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
199                                                           <vscale x 8 x half> %zn4, i32 3)
200  ret void
201}
202
203
204; == Multi, indexed (16-bit bfloat) ==
205
; Multi x indexed dot product with bfloat inputs, two-vector group (vg1x2).
; %zn0/%zn1 form the vector group; %zn2 is the indexed operand, and the
; trailing constant i32 3 shows up as the [3] lane index in the expected output.
; The IR calls the fdot.lane intrinsic with the nxv8bf16 type suffix; the
; expected output is the bfdot instruction, preceded by copies of z1/z2 into
; z4/z5.
; Called once with %slice and once with %slice + 7; the expected output uses a
; single w8 base with immediate offsets 0 and 7.
; %unused is never referenced in the body.
206define void @bfdot_lane_za32_bf16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2) #0 {
207; CHECK-LABEL: bfdot_lane_za32_bf16_vg1x2:
208; CHECK:       // %bb.0:
209; CHECK-NEXT:    mov z5.d, z2.d
210; CHECK-NEXT:    mov w8, w0
211; CHECK-NEXT:    mov z4.d, z1.d
212; CHECK-NEXT:    bfdot za.s[w8, 0, vgx2], { z4.h, z5.h }, z3.h[3]
213; CHECK-NEXT:    bfdot za.s[w8, 7, vgx2], { z4.h, z5.h }, z3.h[3]
214; CHECK-NEXT:    ret
215  call void @llvm.aarch64.sme.fdot.lane.za32.vg1x2.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, i32 3)
; Second call: same operands and lane index, slice index advanced by 7.
216  %slice2 = add i32 %slice, 7
217  call void @llvm.aarch64.sme.fdot.lane.za32.vg1x2.nxv8bf16(i32 %slice2, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, i32 3)
218  ret void
219}
220
; Multi x indexed dot product with bfloat inputs, four-vector group (vg1x4).
; %zn0-%zn3 form the vector group; %zn4 is the indexed operand, and the
; trailing constant i32 3 shows up as the [3] lane index in the expected output.
; The IR calls the fdot.lane intrinsic with the nxv8bf16 type suffix; the
; expected output is the bfdot instruction, preceded by copies of z1-z4 into
; z24-z27.
; Called once with %slice and once with %slice + 7; the expected output uses a
; single w8 base with immediate offsets 0 and 7.
; %unused is never referenced in the body.
221define void @bfdot_lane_za32_bf16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, <vscale x 8 x bfloat> %zn4) #0 {
222; CHECK-LABEL: bfdot_lane_za32_bf16_vg1x4:
223; CHECK:       // %bb.0:
224; CHECK-NEXT:    mov z27.d, z4.d
225; CHECK-NEXT:    mov w8, w0
226; CHECK-NEXT:    mov z26.d, z3.d
227; CHECK-NEXT:    mov z25.d, z2.d
228; CHECK-NEXT:    mov z24.d, z1.d
229; CHECK-NEXT:    bfdot za.s[w8, 0, vgx4], { z24.h - z27.h }, z5.h[3]
230; CHECK-NEXT:    bfdot za.s[w8, 7, vgx4], { z24.h - z27.h }, z5.h[3]
231; CHECK-NEXT:    ret
232  call void @llvm.aarch64.sme.fdot.lane.za32.vg1x4.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
233                                                             <vscale x 8 x bfloat> %zn4, i32 3)
; Second call: same operands and lane index, slice index advanced by 7.
234  %slice2 = add i32 %slice, 7
235  call void @llvm.aarch64.sme.fdot.lane.za32.vg1x4.nxv8bf16(i32 %slice2, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
236                                                             <vscale x 8 x bfloat> %zn4, i32 3)
237  ret void
238}
239
240
; Attribute group #0 is attached to every test function in this file: it marks
; them nounwind and enables the sme2 target feature.
241attributes #0 = { nounwind "target-features"="+sme2" }
242
243
; Declarations of the intrinsics called by the test functions in this file,
; grouped in the same order as the test sections. Each group declares the
; vg1x2 variant followed by the vg1x4 variant; the first i32 parameter is the
; slice index, and the lane forms take a trailing i32 lane index.
244; == Multi, multi (16-bit float) ==
245
246declare void @llvm.aarch64.sme.fdot.za32.vg1x2.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>)
247declare void @llvm.aarch64.sme.fdot.za32.vg1x4.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>,
248                                                       <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>)
249
250; == Multi, multi (16-bit bfloat) ==
251
252declare void @llvm.aarch64.sme.fdot.za32.vg1x2.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
253declare void @llvm.aarch64.sme.fdot.za32.vg1x4.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>,
254                                                         <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
255
256; == Multi, single (16-bit float) ==
257
258declare void @llvm.aarch64.sme.fdot.single.za32.vg1x2.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>)
259declare void @llvm.aarch64.sme.fdot.single.za32.vg1x4.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>)
260
261; == Multi, single (16-bit bfloat) ==
262
263declare void @llvm.aarch64.sme.fdot.single.za32.vg1x2.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
264declare void @llvm.aarch64.sme.fdot.single.za32.vg1x4.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
265
266; == Multi, indexed (16-bit float) ==
267
268declare void @llvm.aarch64.sme.fdot.lane.za32.vg1x2.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, i32)
269declare void @llvm.aarch64.sme.fdot.lane.za32.vg1x4.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, i32)
270
271; == Multi, indexed (16-bit bfloat) ==
272
273declare void @llvm.aarch64.sme.fdot.lane.za32.vg1x2.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32)
274declare void @llvm.aarch64.sme.fdot.lane.za32.vg1x4.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32)
275
275