; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -mattr=+bf16 -force-streaming -verify-machineinstrs < %s | FileCheck %s

;
; BF/F/S/UMLAL x1 (SINGLE)
;

define void @multi_vector_add_single_vg2x1_bf16(i32 %slice, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) {
; CHECK-LABEL: multi_vector_add_single_vg2x1_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    bfmlal za.s[w8, 0:1], z0.h, z1.h
; CHECK-NEXT:    bfmlal za.s[w8, 14:15], z0.h, z1.h
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
  %slice.14 = add i32 %slice, 14
  call void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8bf16(i32 %slice.14, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
  ret void
}

define void @multi_vector_add_single_vg2x1_f16(i32 %slice, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm) {
; CHECK-LABEL: multi_vector_add_single_vg2x1_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    fmlal za.s[w8, 0:1], z0.h, z1.h
; CHECK-NEXT:    fmlal za.s[w8, 14:15], z0.h, z1.h
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8f16(i32 %slice, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm)
  %slice.14 = add i32 %slice, 14
  call void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8f16(i32 %slice.14, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm)
  ret void
}

define void @multi_vector_add_single_vg2x1_s16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_add_single_vg2x1_s16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    smlal za.s[w8, 0:1], z0.h, z1.h
; CHECK-NEXT:    smlal za.s[w8, 14:15], z0.h, z1.h
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.smlal.single.vg2x1.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
  %slice.14 = add i32 %slice, 14
  call void @llvm.aarch64.sme.smlal.single.vg2x1.nxv8i16(i32 %slice.14, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
  ret void
}

define void @multi_vector_add_single_vg2x1_u16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_add_single_vg2x1_u16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    umlal za.s[w8, 0:1], z0.h, z1.h
; CHECK-NEXT:    umlal za.s[w8, 14:15], z0.h, z1.h
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.umlal.single.vg2x1.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
  %slice.14 = add i32 %slice, 14
  call void @llvm.aarch64.sme.umlal.single.vg2x1.nxv8i16(i32 %slice.14, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
  ret void
}

;
; BF/F/S/UMLSL x1 (SINGLE)
;

define void @multi_vector_sub_single_vg2x1_bf16(i32 %slice, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) {
; CHECK-LABEL: multi_vector_sub_single_vg2x1_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    bfmlsl za.s[w8, 0:1], z0.h, z1.h
; CHECK-NEXT:    bfmlsl za.s[w8, 14:15], z0.h, z1.h
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmlsl.single.vg2x1.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
  %slice.14 = add i32 %slice, 14
  call void @llvm.aarch64.sme.fmlsl.single.vg2x1.nxv8bf16(i32 %slice.14, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
  ret void
}

define void @multi_vector_sub_single_vg2x1_f16(i32 %slice, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm) {
; CHECK-LABEL: multi_vector_sub_single_vg2x1_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    fmlsl za.s[w8, 0:1], z0.h, z1.h
; CHECK-NEXT:    fmlsl za.s[w8, 14:15], z0.h, z1.h
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmlsl.single.vg2x1.nxv8f16(i32 %slice, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm)
  %slice.14 = add i32 %slice, 14
  call void @llvm.aarch64.sme.fmlsl.single.vg2x1.nxv8f16(i32 %slice.14, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm)
  ret void
}

define void @multi_vector_sub_single_vg2x1_s16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_sub_single_vg2x1_s16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    smlsl za.s[w8, 0:1], z0.h, z1.h
; CHECK-NEXT:    smlsl za.s[w8, 14:15], z0.h, z1.h
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.smlsl.single.vg2x1.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
  %slice.14 = add i32 %slice, 14
  call void @llvm.aarch64.sme.smlsl.single.vg2x1.nxv8i16(i32 %slice.14, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
  ret void
}

define void @multi_vector_sub_single_vg2x1_u16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_sub_single_vg2x1_u16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    umlsl za.s[w8, 0:1], z0.h, z1.h
; CHECK-NEXT:    umlsl za.s[w8, 14:15], z0.h, z1.h
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.umlsl.single.vg2x1.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
  %slice.14 = add i32 %slice, 14
  call void @llvm.aarch64.sme.umlsl.single.vg2x1.nxv8i16(i32 %slice.14, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
  ret void
}

;
; BF/F/S/UMLAL x2 (SINGLE)
;

define void @multi_vector_add_single_vg2x2_bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm) {
; CHECK-LABEL: multi_vector_add_single_vg2x2_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    bfmlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h
; CHECK-NEXT:    bfmlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmlal.single.vg2x2.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.fmlal.single.vg2x2.nxv8bf16(i32 %slice.6, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm)
  ret void
}

define void @multi_vector_add_single_vg2x2_f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm) {
; CHECK-LABEL: multi_vector_add_single_vg2x2_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    fmlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h
; CHECK-NEXT:    fmlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmlal.single.vg2x2.nxv8f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.fmlal.single.vg2x2.nxv8f16(i32 %slice.6, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm)
  ret void
}

define void @multi_vector_add_single_vg2x2_s16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_add_single_vg2x2_s16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    smlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h
; CHECK-NEXT:    smlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.smlal.single.vg2x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.smlal.single.vg2x2.nxv8i16(i32 %slice.6, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm)
  ret void
}

define void @multi_vector_add_single_vg2x2_u16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_add_single_vg2x2_u16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    umlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h
; CHECK-NEXT:    umlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.umlal.single.vg2x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.umlal.single.vg2x2.nxv8i16(i32 %slice.6, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm)
  ret void
}

;
; BF/F/S/UMLSL x2 (SINGLE)
;

define void @multi_vector_sub_single_vg2x2_bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm) {
; CHECK-LABEL: multi_vector_sub_single_vg2x2_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    bfmlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h
; CHECK-NEXT:    bfmlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmlsl.single.vg2x2.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.fmlsl.single.vg2x2.nxv8bf16(i32 %slice.6, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm)
  ret void
}

define void @multi_vector_sub_single_vg2x2_f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm) {
; CHECK-LABEL: multi_vector_sub_single_vg2x2_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    fmlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h
; CHECK-NEXT:    fmlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmlsl.single.vg2x2.nxv8f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.fmlsl.single.vg2x2.nxv8f16(i32 %slice.6, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm)
  ret void
}

define void @multi_vector_sub_single_vg2x2_s16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_sub_single_vg2x2_s16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    smlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h
; CHECK-NEXT:    smlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.smlsl.single.vg2x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.smlsl.single.vg2x2.nxv8i16(i32 %slice.6, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm)
  ret void
}

define void @multi_vector_sub_single_vg2x2_u16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_sub_single_vg2x2_u16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    umlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h
; CHECK-NEXT:    umlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.umlsl.single.vg2x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.umlsl.single.vg2x2.nxv8i16(i32 %slice.6, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm)
  ret void
}

;
; BF/F/S/UMLAL x4 (SINGLE)
;

define void @multi_vector_add_single_vg2x4_bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, <vscale x 8 x bfloat> %zm) {
; CHECK-LABEL: multi_vector_add_single_vg2x4_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    bfmlal za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h
; CHECK-NEXT:    bfmlal za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmlal.single.vg2x4.nxv8bf16(i32 %slice,
              <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
              <vscale x 8 x bfloat> %zm)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.fmlal.single.vg2x4.nxv8bf16(i32 %slice.6,
              <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
              <vscale x 8 x bfloat> %zm)
  ret void
}

; NOTE(review): this test previously passed %zn2 twice to the intrinsic and
; never used %zn3, which is why its autogenerated checks contained a spurious
; "mov z3.d, z2.d" unlike the bf16/s16/u16 siblings. Fixed to pass %zn3; the
; CHECK lines below were hand-updated to the sibling pattern — regenerate with
; utils/update_llc_test_checks.py to confirm.
define void @multi_vector_add_single_vg2x4_f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, <vscale x 8 x half> %zm) {
; CHECK-LABEL: multi_vector_add_single_vg2x4_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    fmlal za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h
; CHECK-NEXT:    fmlal za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmlal.single.vg2x4.nxv8f16(i32 %slice,
              <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
              <vscale x 8 x half> %zm)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.fmlal.single.vg2x4.nxv8f16(i32 %slice.6,
              <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
              <vscale x 8 x half> %zm)
  ret void
}

define void @multi_vector_add_single_vg2x4_s16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_add_single_vg2x4_s16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    smlal za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h
; CHECK-NEXT:    smlal za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.smlal.single.vg2x4.nxv8i16(i32 %slice,
              <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
              <vscale x 8 x i16> %zm)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.smlal.single.vg2x4.nxv8i16(i32 %slice.6,
              <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
              <vscale x 8 x i16> %zm)
  ret void
}

define void @multi_vector_add_single_vg2x4_u16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_add_single_vg2x4_u16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    umlal za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h
; CHECK-NEXT:    umlal za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.umlal.single.vg2x4.nxv8i16(i32 %slice,
              <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
              <vscale x 8 x i16> %zm)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.umlal.single.vg2x4.nxv8i16(i32 %slice.6,
              <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
              <vscale x 8 x i16> %zm)
  ret void
}

;
; BF/F/S/UMLSL x4 (SINGLE)
;

define void @multi_vector_sub_single_vg2x4_bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, <vscale x 8 x bfloat> %zm) {
; CHECK-LABEL: multi_vector_sub_single_vg2x4_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    bfmlsl za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h
; CHECK-NEXT:    bfmlsl za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmlsl.single.vg2x4.nxv8bf16(i32 %slice,
              <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
              <vscale x 8 x bfloat> %zm)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.fmlsl.single.vg2x4.nxv8bf16(i32 %slice.6,
              <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
              <vscale x 8 x bfloat> %zm)
  ret void
}

define void @multi_vector_sub_single_vg2x4_f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, <vscale x 8 x half> %zm) {
; CHECK-LABEL: multi_vector_sub_single_vg2x4_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    fmlsl za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h
; CHECK-NEXT:    fmlsl za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmlsl.single.vg2x4.nxv8f16(i32 %slice,
              <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
              <vscale x 8 x half> %zm)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.fmlsl.single.vg2x4.nxv8f16(i32 %slice.6,
              <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
              <vscale x 8 x half> %zm)
  ret void
}

define void @multi_vector_sub_single_vg2x4_s16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_sub_single_vg2x4_s16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    smlsl za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h
; CHECK-NEXT:    smlsl za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.smlsl.single.vg2x4.nxv8i16(i32 %slice,
              <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
              <vscale x 8 x i16> %zm)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.smlsl.single.vg2x4.nxv8i16(i32 %slice.6,
              <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
              <vscale x 8 x i16> %zm)
  ret void
}

define void @multi_vector_sub_single_vg2x4_u16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_sub_single_vg2x4_u16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    umlsl za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h
; CHECK-NEXT:    umlsl za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.umlsl.single.vg2x4.nxv8i16(i32 %slice,
              <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
              <vscale x 8 x i16> %zm)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.umlsl.single.vg2x4.nxv8i16(i32 %slice.6,
              <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
              <vscale x 8 x i16> %zm)
  ret void
}

;
; BF/F/S/UMLAL x2 (MULTI)
;

define void @multi_vector_add_multi_vg2x2_bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm0, <vscale x 8 x bfloat> %zm1) {
; CHECK-LABEL: multi_vector_add_multi_vg2x2_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    bfmlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
; CHECK-NEXT:    bfmlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmlal.vg2x2.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1,
              <vscale x 8 x bfloat> %zm0, <vscale x 8 x bfloat> %zm1)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.fmlal.vg2x2.nxv8bf16(i32 %slice.6, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1,
              <vscale x 8 x bfloat> %zm0, <vscale x 8 x bfloat> %zm1)
  ret void
}

define void @multi_vector_add_multi_vg2x2_f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm0, <vscale x 8 x half> %zm1) {
; CHECK-LABEL: multi_vector_add_multi_vg2x2_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    fmlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
; CHECK-NEXT:    fmlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmlal.vg2x2.nxv8f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1,
              <vscale x 8 x half> %zm0, <vscale x 8 x half> %zm1)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.fmlal.vg2x2.nxv8f16(i32 %slice.6, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1,
              <vscale x 8 x half> %zm0, <vscale x 8 x half> %zm1)
  ret void
}

define void @multi_vector_add_multi_vg2x2_s16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1) {
; CHECK-LABEL: multi_vector_add_multi_vg2x2_s16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    smlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
; CHECK-NEXT:    smlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.smlal.vg2x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1,
              <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.smlal.vg2x2.nxv8i16(i32 %slice.6, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1,
              <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
  ret void
}

define void @multi_vector_add_multi_vg2x2_u16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1) {
; CHECK-LABEL: multi_vector_add_multi_vg2x2_u16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    umlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
; CHECK-NEXT:    umlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.umlal.vg2x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1,
              <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.umlal.vg2x2.nxv8i16(i32 %slice.6, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1,
              <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
  ret void
}

;
; BF/F/S/UMLSL x2 (MULTI)
;

define void @multi_vector_sub_multi_vg2x2_bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm0, <vscale x 8 x bfloat> %zm1) {
; CHECK-LABEL: multi_vector_sub_multi_vg2x2_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    bfmlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
; CHECK-NEXT:    bfmlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmlsl.vg2x2.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1,
              <vscale x 8 x bfloat> %zm0, <vscale x 8 x bfloat> %zm1)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.fmlsl.vg2x2.nxv8bf16(i32 %slice.6, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1,
              <vscale x 8 x bfloat> %zm0, <vscale x 8 x bfloat> %zm1)
  ret void
}

define void @multi_vector_sub_multi_vg2x2_f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm0, <vscale x 8 x half> %zm1) {
; CHECK-LABEL: multi_vector_sub_multi_vg2x2_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    fmlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
; CHECK-NEXT:    fmlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmlsl.vg2x2.nxv8f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1,
              <vscale x 8 x half> %zm0, <vscale x 8 x half> %zm1)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.fmlsl.vg2x2.nxv8f16(i32 %slice.6, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1,
              <vscale x 8 x half> %zm0, <vscale x 8 x half> %zm1)
  ret void
}

define void @multi_vector_sub_multi_vg2x2_s16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1) {
; CHECK-LABEL: multi_vector_sub_multi_vg2x2_s16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    smlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
; CHECK-NEXT:    smlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.smlsl.vg2x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1,
              <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.smlsl.vg2x2.nxv8i16(i32 %slice.6, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1,
              <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
  ret void
}

define void @multi_vector_sub_multi_vg2x2_u16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1) {
; CHECK-LABEL: multi_vector_sub_multi_vg2x2_u16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    umlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
; CHECK-NEXT:    umlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.umlsl.vg2x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1,
              <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.umlsl.vg2x2.nxv8i16(i32 %slice.6, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1,
              <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
  ret void
}

;
; BF/F/S/UMLAL x4 (MULTI)
;

define void @multi_vector_add_multi_vg2x4_bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
; CHECK-LABEL: multi_vector_add_multi_vg2x4_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    bfmlal za.s[w8, 0:1, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
; CHECK-NEXT:    bfmlal za.s[w8, 6:7, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
; CHECK-NEXT:    ret
                                               <vscale x 8 x bfloat> %zm0, <vscale x 8 x bfloat> %zm1, <vscale x 8 x bfloat> %zm2, <vscale x 8 x bfloat> %zm3) {
  call void @llvm.aarch64.sme.fmlal.vg2x4.nxv8bf16(i32 %slice,
              <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
              <vscale x 8 x bfloat> %zm0, <vscale x 8 x bfloat> %zm1, <vscale x 8 x bfloat> %zm2, <vscale x 8 x bfloat> %zm3)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.fmlal.vg2x4.nxv8bf16(i32 %slice.6,
              <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
              <vscale x 8 x bfloat> %zm0, <vscale x 8 x bfloat> %zm1, <vscale x 8 x bfloat> %zm2, <vscale x 8 x bfloat> %zm3)
  ret void
}

define void @multi_vector_add_multi_vg2x4_f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
; CHECK-LABEL: multi_vector_add_multi_vg2x4_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    fmlal za.s[w8, 0:1, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
; CHECK-NEXT:    fmlal za.s[w8, 6:7, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
; CHECK-NEXT:    ret
                                              <vscale x 8 x half> %zm0, <vscale x 8 x half> %zm1, <vscale x 8 x half> %zm2, <vscale x 8 x half> %zm3) {
  call void @llvm.aarch64.sme.fmlal.vg2x4.nxv8f16(i32 %slice,
              <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
              <vscale x 8 x half> %zm0, <vscale x 8 x half> %zm1, <vscale x 8 x half> %zm2, <vscale x 8 x half> %zm3)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.fmlal.vg2x4.nxv8f16(i32 %slice.6,
              <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
              <vscale x 8 x half> %zm0, <vscale x 8 x half> %zm1, <vscale x 8 x half> %zm2, <vscale x 8 x half> %zm3)
  ret void
}

define void @multi_vector_add_multi_vg2x4_s16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
; CHECK-LABEL: multi_vector_add_multi_vg2x4_s16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    smlal za.s[w8, 0:1, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
; CHECK-NEXT:    smlal za.s[w8, 6:7, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
; CHECK-NEXT:    ret
                                              <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3) {
  call void @llvm.aarch64.sme.smlal.vg2x4.nxv8i16(i32 %slice,
              <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
              <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.smlal.vg2x4.nxv8i16(i32 %slice.6,
              <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
              <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
  ret void
}

define void @multi_vector_add_multi_vg2x4_u16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
; CHECK-LABEL: multi_vector_add_multi_vg2x4_u16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def
$z0_z1_z2_z3 672; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 673; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 674; CHECK-NEXT: umlal za.s[w8, 0:1, vgx4], { z0.h - z3.h }, { z4.h - z7.h } 675; CHECK-NEXT: umlal za.s[w8, 6:7, vgx4], { z0.h - z3.h }, { z4.h - z7.h } 676; CHECK-NEXT: ret 677 <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3) { 678 call void @llvm.aarch64.sme.umlal.vg2x4.nxv8i16(i32 %slice, 679 <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, 680 <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3) 681 %slice.6 = add i32 %slice, 6 682 call void @llvm.aarch64.sme.umlal.vg2x4.nxv8i16(i32 %slice.6, 683 <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, 684 <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3) 685 ret void 686} 687 688; 689; BF/F/S/UMLSL x4 (MULTI) 690; 691 692define void @multi_vector_sub_multi_vg2x4_bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, 693; CHECK-LABEL: multi_vector_sub_multi_vg2x4_bf16: 694; CHECK: // %bb.0: 695; CHECK-NEXT: // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 696; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 697; CHECK-NEXT: mov w8, w0 698; CHECK-NEXT: // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 699; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 700; CHECK-NEXT: // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 701; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 702; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 703; CHECK-NEXT: // kill: def $z0 killed $z0 
killed $z0_z1_z2_z3 def $z0_z1_z2_z3 704; CHECK-NEXT: bfmlsl za.s[w8, 0:1, vgx4], { z0.h - z3.h }, { z4.h - z7.h } 705; CHECK-NEXT: bfmlsl za.s[w8, 6:7, vgx4], { z0.h - z3.h }, { z4.h - z7.h } 706; CHECK-NEXT: ret 707 <vscale x 8 x bfloat> %zm0, <vscale x 8 x bfloat> %zm1, <vscale x 8 x bfloat> %zm2, <vscale x 8 x bfloat> %zm3) { 708 call void @llvm.aarch64.sme.fmlsl.vg2x4.nxv8bf16(i32 %slice, 709 <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, 710 <vscale x 8 x bfloat> %zm0, <vscale x 8 x bfloat> %zm1, <vscale x 8 x bfloat> %zm2, <vscale x 8 x bfloat> %zm3) 711 %slice.6 = add i32 %slice, 6 712 call void @llvm.aarch64.sme.fmlsl.vg2x4.nxv8bf16(i32 %slice.6, 713 <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, 714 <vscale x 8 x bfloat> %zm0, <vscale x 8 x bfloat> %zm1, <vscale x 8 x bfloat> %zm2, <vscale x 8 x bfloat> %zm3) 715 ret void 716} 717 718define void @multi_vector_sub_multi_vg2x4_f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, 719; CHECK-LABEL: multi_vector_sub_multi_vg2x4_f16: 720; CHECK: // %bb.0: 721; CHECK-NEXT: // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 722; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 723; CHECK-NEXT: mov w8, w0 724; CHECK-NEXT: // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 725; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 726; CHECK-NEXT: // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 727; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 728; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 729; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 730; CHECK-NEXT: fmlsl za.s[w8, 0:1, vgx4], { z0.h - z3.h }, { z4.h - z7.h } 731; CHECK-NEXT: 
fmlsl za.s[w8, 6:7, vgx4], { z0.h - z3.h }, { z4.h - z7.h } 732; CHECK-NEXT: ret 733 <vscale x 8 x half> %zm0, <vscale x 8 x half> %zm1, <vscale x 8 x half> %zm2, <vscale x 8 x half> %zm3) { 734 call void @llvm.aarch64.sme.fmlsl.vg2x4.nxv8f16(i32 %slice, 735 <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, 736 <vscale x 8 x half> %zm0, <vscale x 8 x half> %zm1, <vscale x 8 x half> %zm2, <vscale x 8 x half> %zm3) 737 %slice.6 = add i32 %slice, 6 738 call void @llvm.aarch64.sme.fmlsl.vg2x4.nxv8f16(i32 %slice.6, 739 <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, 740 <vscale x 8 x half> %zm0, <vscale x 8 x half> %zm1, <vscale x 8 x half> %zm2, <vscale x 8 x half> %zm3) 741 ret void 742} 743 744define void @multi_vector_sub_multi_vg2x4_s16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, 745; CHECK-LABEL: multi_vector_sub_multi_vg2x4_s16: 746; CHECK: // %bb.0: 747; CHECK-NEXT: // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 748; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 749; CHECK-NEXT: mov w8, w0 750; CHECK-NEXT: // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 751; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 752; CHECK-NEXT: // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 753; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 754; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 755; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 756; CHECK-NEXT: smlsl za.s[w8, 0:1, vgx4], { z0.h - z3.h }, { z4.h - z7.h } 757; CHECK-NEXT: smlsl za.s[w8, 6:7, vgx4], { z0.h - z3.h }, { z4.h - z7.h } 758; CHECK-NEXT: ret 759 <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> 
%zm3) { 760 call void @llvm.aarch64.sme.smlsl.vg2x4.nxv8i16(i32 %slice, 761 <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, 762 <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3) 763 %slice.6 = add i32 %slice, 6 764 call void @llvm.aarch64.sme.smlsl.vg2x4.nxv8i16(i32 %slice.6, 765 <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, 766 <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3) 767 ret void 768} 769 770define void @multi_vector_sub_multi_vg2x4_u16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, 771; CHECK-LABEL: multi_vector_sub_multi_vg2x4_u16: 772; CHECK: // %bb.0: 773; CHECK-NEXT: // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 774; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 775; CHECK-NEXT: mov w8, w0 776; CHECK-NEXT: // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 777; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 778; CHECK-NEXT: // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 779; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 780; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 781; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 782; CHECK-NEXT: umlsl za.s[w8, 0:1, vgx4], { z0.h - z3.h }, { z4.h - z7.h } 783; CHECK-NEXT: umlsl za.s[w8, 6:7, vgx4], { z0.h - z3.h }, { z4.h - z7.h } 784; CHECK-NEXT: ret 785 <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3) { 786 call void @llvm.aarch64.sme.umlsl.vg2x4.nxv8i16(i32 %slice, 787 <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, 788 <vscale x 8 x i16> 
%zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3) 789 %slice.6 = add i32 %slice, 6 790 call void @llvm.aarch64.sme.umlsl.vg2x4.nxv8i16(i32 %slice.6, 791 <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, 792 <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3) 793 ret void 794} 795 796; 797; BF/F/S/UMLAL x1 (INDEXED) 798; 799 800define void @multi_vector_add_lane_vg2x1_f16(i32 %slice, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm) { 801; CHECK-LABEL: multi_vector_add_lane_vg2x1_f16: 802; CHECK: // %bb.0: 803; CHECK-NEXT: mov w8, w0 804; CHECK-NEXT: fmlal za.s[w8, 0:1], z0.h, z1.h[0] 805; CHECK-NEXT: fmlal za.s[w8, 14:15], z0.h, z1.h[7] 806; CHECK-NEXT: ret 807 call void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8f16(i32 %slice, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm, i32 0) 808 %slice.14 = add i32 %slice, 14 809 call void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8f16(i32 %slice.14, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm, i32 7) 810 ret void 811} 812 813define void @multi_vector_add_lane_vg2x1_bf16(i32 %slice, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) { 814; CHECK-LABEL: multi_vector_add_lane_vg2x1_bf16: 815; CHECK: // %bb.0: 816; CHECK-NEXT: mov w8, w0 817; CHECK-NEXT: bfmlal za.s[w8, 0:1], z0.h, z1.h[0] 818; CHECK-NEXT: bfmlal za.s[w8, 14:15], z0.h, z1.h[7] 819; CHECK-NEXT: ret 820 call void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm, i32 0) 821 %slice.14 = add i32 %slice, 14 822 call void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8bf16(i32 %slice.14, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm, i32 7) 823 ret void 824} 825 826define void @multi_vector_add_lane_vg2x1_s16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) { 827; CHECK-LABEL: multi_vector_add_lane_vg2x1_s16: 828; CHECK: // %bb.0: 829; CHECK-NEXT: mov w8, w0 830; 
CHECK-NEXT: smlal za.s[w8, 0:1], z0.h, z1.h[0] 831; CHECK-NEXT: smlal za.s[w8, 14:15], z0.h, z1.h[7] 832; CHECK-NEXT: ret 833 call void @llvm.aarch64.sme.smlal.lane.vg2x1.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm, i32 0) 834 %slice.14 = add i32 %slice, 14 835 call void @llvm.aarch64.sme.smlal.lane.vg2x1.nxv8i16(i32 %slice.14, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm, i32 7) 836 ret void 837} 838 839define void @multi_vector_add_lane_vg2x1_u16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) { 840; CHECK-LABEL: multi_vector_add_lane_vg2x1_u16: 841; CHECK: // %bb.0: 842; CHECK-NEXT: mov w8, w0 843; CHECK-NEXT: umlal za.s[w8, 0:1], z0.h, z1.h[0] 844; CHECK-NEXT: umlal za.s[w8, 14:15], z0.h, z1.h[7] 845; CHECK-NEXT: ret 846 call void @llvm.aarch64.sme.umlal.lane.vg2x1.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm, i32 0) 847 %slice.14 = add i32 %slice, 14 848 call void @llvm.aarch64.sme.umlal.lane.vg2x1.nxv8i16(i32 %slice.14, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm, i32 7) 849 ret void 850} 851 852; 853; BF/F/S/UMLSL x1 (INDEXED) 854; 855 856define void @multi_vector_sub_lane_vg2x1_f16(i32 %slice, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm) { 857; CHECK-LABEL: multi_vector_sub_lane_vg2x1_f16: 858; CHECK: // %bb.0: 859; CHECK-NEXT: mov w8, w0 860; CHECK-NEXT: fmlsl za.s[w8, 0:1], z0.h, z1.h[0] 861; CHECK-NEXT: fmlsl za.s[w8, 14:15], z0.h, z1.h[7] 862; CHECK-NEXT: ret 863 call void @llvm.aarch64.sme.fmlsl.lane.vg2x1.nxv8f16(i32 %slice, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm, i32 0) 864 %slice.14 = add i32 %slice, 14 865 call void @llvm.aarch64.sme.fmlsl.lane.vg2x1.nxv8f16(i32 %slice.14, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm, i32 7) 866 ret void 867} 868 869define void @multi_vector_sub_lane_vg2x1_bf16(i32 %slice, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) { 870; CHECK-LABEL: multi_vector_sub_lane_vg2x1_bf16: 871; CHECK: // %bb.0: 872; CHECK-NEXT: mov w8, w0 873; 
CHECK-NEXT: bfmlsl za.s[w8, 0:1], z0.h, z1.h[0] 874; CHECK-NEXT: bfmlsl za.s[w8, 14:15], z0.h, z1.h[7] 875; CHECK-NEXT: ret 876 call void @llvm.aarch64.sme.fmlsl.lane.vg2x1.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm, i32 0) 877 %slice.14 = add i32 %slice, 14 878 call void @llvm.aarch64.sme.fmlsl.lane.vg2x1.nxv8bf16(i32 %slice.14, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm, i32 7) 879 ret void 880} 881 882define void @multi_vector_sub_lane_vg2x1_s16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) { 883; CHECK-LABEL: multi_vector_sub_lane_vg2x1_s16: 884; CHECK: // %bb.0: 885; CHECK-NEXT: mov w8, w0 886; CHECK-NEXT: smlsl za.s[w8, 0:1], z0.h, z1.h[0] 887; CHECK-NEXT: smlsl za.s[w8, 14:15], z0.h, z1.h[7] 888; CHECK-NEXT: ret 889 call void @llvm.aarch64.sme.smlsl.lane.vg2x1.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm, i32 0) 890 %slice.14 = add i32 %slice, 14 891 call void @llvm.aarch64.sme.smlsl.lane.vg2x1.nxv8i16(i32 %slice.14, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm, i32 7) 892 ret void 893} 894 895define void @multi_vector_sub_lane_vg2x1_u16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) { 896; CHECK-LABEL: multi_vector_sub_lane_vg2x1_u16: 897; CHECK: // %bb.0: 898; CHECK-NEXT: mov w8, w0 899; CHECK-NEXT: umlsl za.s[w8, 0:1], z0.h, z1.h[0] 900; CHECK-NEXT: umlsl za.s[w8, 14:15], z0.h, z1.h[7] 901; CHECK-NEXT: ret 902 call void @llvm.aarch64.sme.umlsl.lane.vg2x1.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm, i32 0) 903 %slice.14 = add i32 %slice, 14 904 call void @llvm.aarch64.sme.umlsl.lane.vg2x1.nxv8i16(i32 %slice.14, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm, i32 7) 905 ret void 906} 907 908; 909; BF/F/S/UMLAL x2 (INDEXED) 910; 911 912define void @multi_vector_add_lane_vg2x2_f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm) { 913; CHECK-LABEL: multi_vector_add_lane_vg2x2_f16: 914; CHECK: // %bb.0: 915; 
CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 916; CHECK-NEXT: mov w8, w0 917; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 918; CHECK-NEXT: fmlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h[0] 919; CHECK-NEXT: fmlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h[7] 920; CHECK-NEXT: ret 921 call void @llvm.aarch64.sme.fmlal.lane.vg2x2.nxv8f16(i32 %slice, 922 <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm, i32 0) 923 %slice.6 = add i32 %slice, 6 924 call void @llvm.aarch64.sme.fmlal.lane.vg2x2.nxv8f16(i32 %slice.6, 925 <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm, i32 7) 926 ret void 927} 928 929define void @multi_vector_add_lane_vg2x2_bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm) { 930; CHECK-LABEL: multi_vector_add_lane_vg2x2_bf16: 931; CHECK: // %bb.0: 932; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 933; CHECK-NEXT: mov w8, w0 934; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 935; CHECK-NEXT: bfmlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h[0] 936; CHECK-NEXT: bfmlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h[7] 937; CHECK-NEXT: ret 938 call void @llvm.aarch64.sme.fmlal.lane.vg2x2.nxv8bf16(i32 %slice, 939 <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm, i32 0) 940 %slice.6 = add i32 %slice, 6 941 call void @llvm.aarch64.sme.fmlal.lane.vg2x2.nxv8bf16(i32 %slice.6, 942 <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm, i32 7) 943 ret void 944} 945 946define void @multi_vector_add_lane_vg2x2_s16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) { 947; CHECK-LABEL: multi_vector_add_lane_vg2x2_s16: 948; CHECK: // %bb.0: 949; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 950; CHECK-NEXT: mov w8, w0 951; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 
952; CHECK-NEXT: smlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h[0] 953; CHECK-NEXT: smlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h[7] 954; CHECK-NEXT: ret 955 call void @llvm.aarch64.sme.smlal.lane.vg2x2.nxv8i16(i32 %slice, 956 <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm, i32 0) 957 %slice.6 = add i32 %slice, 6 958 call void @llvm.aarch64.sme.smlal.lane.vg2x2.nxv8i16(i32 %slice.6, 959 <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm, i32 7) 960 ret void 961} 962 963define void @multi_vector_add_lane_vg2x2_u16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) { 964; CHECK-LABEL: multi_vector_add_lane_vg2x2_u16: 965; CHECK: // %bb.0: 966; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 967; CHECK-NEXT: mov w8, w0 968; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 969; CHECK-NEXT: umlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h[0] 970; CHECK-NEXT: umlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h[7] 971; CHECK-NEXT: ret 972 call void @llvm.aarch64.sme.umlal.lane.vg2x2.nxv8i16(i32 %slice, 973 <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm, i32 0) 974 %slice.6 = add i32 %slice, 6 975 call void @llvm.aarch64.sme.umlal.lane.vg2x2.nxv8i16(i32 %slice.6, 976 <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm, i32 7) 977 ret void 978} 979 980; 981; BF/F/S/UMLSL x2 (INDEXED) 982; 983 984define void @multi_vector_sub_lane_vg2x2_f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm) { 985; CHECK-LABEL: multi_vector_sub_lane_vg2x2_f16: 986; CHECK: // %bb.0: 987; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 988; CHECK-NEXT: mov w8, w0 989; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 990; CHECK-NEXT: fmlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h[0] 991; CHECK-NEXT: fmlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h[7] 992; CHECK-NEXT: 
ret 993 call void @llvm.aarch64.sme.fmlsl.lane.vg2x2.nxv8f16(i32 %slice, 994 <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm, i32 0) 995 %slice.6 = add i32 %slice, 6 996 call void @llvm.aarch64.sme.fmlsl.lane.vg2x2.nxv8f16(i32 %slice.6, 997 <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm, i32 7) 998 ret void 999} 1000 1001define void @multi_vector_sub_lane_vg2x2_bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm) { 1002; CHECK-LABEL: multi_vector_sub_lane_vg2x2_bf16: 1003; CHECK: // %bb.0: 1004; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 1005; CHECK-NEXT: mov w8, w0 1006; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 1007; CHECK-NEXT: bfmlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h[0] 1008; CHECK-NEXT: bfmlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h[7] 1009; CHECK-NEXT: ret 1010 call void @llvm.aarch64.sme.fmlsl.lane.vg2x2.nxv8bf16(i32 %slice, 1011 <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm, i32 0) 1012 %slice.6 = add i32 %slice, 6 1013 call void @llvm.aarch64.sme.fmlsl.lane.vg2x2.nxv8bf16(i32 %slice.6, 1014 <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm, i32 7) 1015 ret void 1016} 1017 1018define void @multi_vector_sub_lane_vg2x2_s16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) { 1019; CHECK-LABEL: multi_vector_sub_lane_vg2x2_s16: 1020; CHECK: // %bb.0: 1021; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 1022; CHECK-NEXT: mov w8, w0 1023; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 1024; CHECK-NEXT: smlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h[0] 1025; CHECK-NEXT: smlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h[7] 1026; CHECK-NEXT: ret 1027 call void @llvm.aarch64.sme.smlsl.lane.vg2x2.nxv8i16(i32 %slice, 1028 <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, 
<vscale x 8 x i16> %zm, i32 0) 1029 %slice.6 = add i32 %slice, 6 1030 call void @llvm.aarch64.sme.smlsl.lane.vg2x2.nxv8i16(i32 %slice.6, 1031 <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm, i32 7) 1032 ret void 1033} 1034 1035define void @multi_vector_sub_lane_vg2x2_u16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) { 1036; CHECK-LABEL: multi_vector_sub_lane_vg2x2_u16: 1037; CHECK: // %bb.0: 1038; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 1039; CHECK-NEXT: mov w8, w0 1040; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 1041; CHECK-NEXT: umlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h[0] 1042; CHECK-NEXT: umlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h[7] 1043; CHECK-NEXT: ret 1044 call void @llvm.aarch64.sme.umlsl.lane.vg2x2.nxv8i16(i32 %slice, 1045 <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm, i32 0) 1046 %slice.6 = add i32 %slice, 6 1047 call void @llvm.aarch64.sme.umlsl.lane.vg2x2.nxv8i16(i32 %slice.6, 1048 <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm, i32 7) 1049 ret void 1050} 1051 1052; 1053; BF/F/S/UMLAL x4 (INDEXED) 1054; 1055 1056define void @multi_vector_add_lane_vg2x4_f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, <vscale x 8 x half> %zm) { 1057; CHECK-LABEL: multi_vector_add_lane_vg2x4_f16: 1058; CHECK: // %bb.0: 1059; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 1060; CHECK-NEXT: mov w8, w0 1061; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 1062; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 1063; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 1064; CHECK-NEXT: fmlal za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h[0] 1065; CHECK-NEXT: fmlal za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h[7] 1066; CHECK-NEXT: 
ret 1067 call void @llvm.aarch64.sme.fmlal.lane.vg2x4.nxv8f16(i32 %slice, 1068 <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, 1069 <vscale x 8 x half> %zm, i32 0) 1070 %slice.6 = add i32 %slice, 6 1071 call void @llvm.aarch64.sme.fmlal.lane.vg2x4.nxv8f16(i32 %slice.6, 1072 <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, 1073 <vscale x 8 x half> %zm, i32 7) 1074 ret void 1075} 1076 1077define void @multi_vector_add_lane_vg2x4_bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, <vscale x 8 x bfloat> %zm) { 1078; CHECK-LABEL: multi_vector_add_lane_vg2x4_bf16: 1079; CHECK: // %bb.0: 1080; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 1081; CHECK-NEXT: mov w8, w0 1082; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 1083; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 1084; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 1085; CHECK-NEXT: bfmlal za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h[0] 1086; CHECK-NEXT: bfmlal za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h[7] 1087; CHECK-NEXT: ret 1088 call void @llvm.aarch64.sme.fmlal.lane.vg2x4.nxv8bf16(i32 %slice, 1089 <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, 1090 <vscale x 8 x bfloat> %zm, i32 0) 1091 %slice.6 = add i32 %slice, 6 1092 call void @llvm.aarch64.sme.fmlal.lane.vg2x4.nxv8bf16(i32 %slice.6, 1093 <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, 1094 <vscale x 8 x bfloat> %zm, i32 7) 1095 ret void 1096} 1097 1098define void @multi_vector_add_lane_vg2x4_s16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> 
%zm) { 1099; CHECK-LABEL: multi_vector_add_lane_vg2x4_s16: 1100; CHECK: // %bb.0: 1101; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 1102; CHECK-NEXT: mov w8, w0 1103; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 1104; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 1105; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 1106; CHECK-NEXT: smlal za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h[0] 1107; CHECK-NEXT: smlal za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h[7] 1108; CHECK-NEXT: ret 1109 call void @llvm.aarch64.sme.smlal.lane.vg2x4.nxv8i16(i32 %slice, 1110 <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, 1111 <vscale x 8 x i16> %zm, i32 0) 1112 %slice.6 = add i32 %slice, 6 1113 call void @llvm.aarch64.sme.smlal.lane.vg2x4.nxv8i16(i32 %slice.6, 1114 <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, 1115 <vscale x 8 x i16> %zm, i32 7) 1116 ret void 1117} 1118 1119define void @multi_vector_add_lane_vg2x4_u16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm) { 1120; CHECK-LABEL: multi_vector_add_lane_vg2x4_u16: 1121; CHECK: // %bb.0: 1122; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 1123; CHECK-NEXT: mov w8, w0 1124; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 1125; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 1126; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 1127; CHECK-NEXT: umlal za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h[0] 1128; CHECK-NEXT: umlal za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h[7] 1129; CHECK-NEXT: ret 1130 call void @llvm.aarch64.sme.umlal.lane.vg2x4.nxv8i16(i32 %slice, 1131 <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, 
  <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
  <vscale x 8 x i16> %zm, i32 0)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.umlal.lane.vg2x4.nxv8i16(i32 %slice.6,
  <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
  <vscale x 8 x i16> %zm, i32 7)
  ret void
}

;
; BF/F/S/UMLSL x4 (INDEXED)
;

; NOTE(review): the CHECK lines in this file are autogenerated by
; utils/update_llc_test_checks.py (see the header NOTE). Edit only the IR and
; regenerate the assertions; do not hand-edit CHECK lines.

; Indexed (lane) FMLSL, four-vector group: each intrinsic call must select to a
; single fmlsl with a vgx4 ZA slice. Tested at the minimum (0) and maximum (7)
; lane immediates; the constant slice offset of 6 is expected to fold into the
; 6:7 slice range of the second instruction rather than a separate add.
define void @multi_vector_sub_lane_vg2x4_f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, <vscale x 8 x half> %zm) {
; CHECK-LABEL: multi_vector_sub_lane_vg2x4_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    fmlsl za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h[0]
; CHECK-NEXT:    fmlsl za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h[7]
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmlsl.lane.vg2x4.nxv8f16(i32 %slice,
  <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
  <vscale x 8 x half> %zm, i32 0)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.fmlsl.lane.vg2x4.nxv8f16(i32 %slice.6,
  <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
  <vscale x 8 x half> %zm, i32 7)
  ret void
}

; Same pattern for bfloat: the bf16 overload reuses the fmlsl intrinsic name
; (nxv8bf16) but must select to the bfmlsl instruction.
define void @multi_vector_sub_lane_vg2x4_bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, <vscale x 8 x bfloat> %zm) {
; CHECK-LABEL: multi_vector_sub_lane_vg2x4_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    bfmlsl za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h[0]
; CHECK-NEXT:    bfmlsl za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h[7]
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmlsl.lane.vg2x4.nxv8bf16(i32 %slice,
  <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
  <vscale x 8 x bfloat> %zm, i32 0)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.fmlsl.lane.vg2x4.nxv8bf16(i32 %slice.6,
  <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
  <vscale x 8 x bfloat> %zm, i32 7)
  ret void
}

; Signed-integer variant: smlsl.lane intrinsic -> smlsl instruction.
define void @multi_vector_sub_lane_vg2x4_s16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_sub_lane_vg2x4_s16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    smlsl za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h[0]
; CHECK-NEXT:    smlsl za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h[7]
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.smlsl.lane.vg2x4.nxv8i16(i32 %slice,
  <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
  <vscale x 8 x i16> %zm, i32 0)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.smlsl.lane.vg2x4.nxv8i16(i32 %slice.6,
  <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
  <vscale x 8 x i16> %zm, i32 7)
  ret void
}

; Unsigned-integer variant: umlsl.lane intrinsic -> umlsl instruction.
define void @multi_vector_sub_lane_vg2x4_u16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_sub_lane_vg2x4_u16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    umlsl za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h[0]
; CHECK-NEXT:    umlsl za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h[7]
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.umlsl.lane.vg2x4.nxv8i16(i32 %slice,
  <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
  <vscale x 8 x i16> %zm, i32 0)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.umlsl.lane.vg2x4.nxv8i16(i32 %slice.6,
  <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
  <vscale x 8 x i16> %zm, i32 7)
  ret void
}

declare void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
declare void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>)
declare void @llvm.aarch64.sme.smlal.single.vg2x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.umlal.single.vg2x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>)

1233declare void @llvm.aarch64.sme.fmlsl.single.vg2x1.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>) 1234declare void @llvm.aarch64.sme.fmlsl.single.vg2x1.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>) 1235declare void @llvm.aarch64.sme.smlsl.single.vg2x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>) 1236declare void @llvm.aarch64.sme.umlsl.single.vg2x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>) 1237 1238declare void @llvm.aarch64.sme.fmlal.single.vg2x2.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>) 1239declare void @llvm.aarch64.sme.fmlal.single.vg2x2.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>) 1240declare void @llvm.aarch64.sme.smlal.single.vg2x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>) 1241declare void @llvm.aarch64.sme.umlal.single.vg2x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>) 1242 1243declare void @llvm.aarch64.sme.fmlsl.single.vg2x2.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>) 1244declare void @llvm.aarch64.sme.fmlsl.single.vg2x2.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>) 1245declare void @llvm.aarch64.sme.smlsl.single.vg2x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>) 1246declare void @llvm.aarch64.sme.umlsl.single.vg2x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>) 1247 1248declare void @llvm.aarch64.sme.fmlal.single.vg2x4.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, 1249 <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>) 1250declare void @llvm.aarch64.sme.fmlal.single.vg2x4.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, 1251 <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>) 1252declare void @llvm.aarch64.sme.smlal.single.vg2x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, 1253 <vscale x 8 x i16>, <vscale 
x 8 x i16>, <vscale x 8 x i16>) 1254declare void @llvm.aarch64.sme.umlal.single.vg2x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, 1255 <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>) 1256 1257declare void @llvm.aarch64.sme.fmlsl.single.vg2x4.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, 1258 <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>) 1259declare void @llvm.aarch64.sme.fmlsl.single.vg2x4.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, 1260 <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>) 1261declare void @llvm.aarch64.sme.smlsl.single.vg2x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, 1262 <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>) 1263declare void @llvm.aarch64.sme.umlsl.single.vg2x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, 1264 <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>) 1265 1266declare void @llvm.aarch64.sme.fmlal.vg2x2.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>) 1267declare void @llvm.aarch64.sme.fmlal.vg2x2.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>) 1268declare void @llvm.aarch64.sme.smlal.vg2x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>) 1269declare void @llvm.aarch64.sme.umlal.vg2x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>) 1270 1271declare void @llvm.aarch64.sme.fmlsl.vg2x2.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>) 1272declare void @llvm.aarch64.sme.fmlsl.vg2x2.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>) 1273declare void @llvm.aarch64.sme.smlsl.vg2x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>) 1274declare void 
@llvm.aarch64.sme.umlsl.vg2x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>) 1275 1276declare void @llvm.aarch64.sme.fmlal.vg2x4.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, 1277 <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>) 1278declare void @llvm.aarch64.sme.fmlal.vg2x4.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, 1279 <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>) 1280declare void @llvm.aarch64.sme.smlal.vg2x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, 1281 <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>) 1282declare void @llvm.aarch64.sme.umlal.vg2x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, 1283 <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>) 1284 1285declare void @llvm.aarch64.sme.fmlsl.vg2x4.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, 1286 <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>) 1287declare void @llvm.aarch64.sme.fmlsl.vg2x4.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, 1288 <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>) 1289declare void @llvm.aarch64.sme.smlsl.vg2x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, 1290 <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>) 1291declare void @llvm.aarch64.sme.umlsl.vg2x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, 1292 <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>) 
1293 1294declare void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32) 1295declare void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, i32) 1296declare void @llvm.aarch64.sme.smlal.lane.vg2x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, i32) 1297declare void @llvm.aarch64.sme.umlal.lane.vg2x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, i32) 1298 1299declare void @llvm.aarch64.sme.fmlsl.lane.vg2x1.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32) 1300declare void @llvm.aarch64.sme.fmlsl.lane.vg2x1.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, i32) 1301declare void @llvm.aarch64.sme.smlsl.lane.vg2x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, i32) 1302declare void @llvm.aarch64.sme.umlsl.lane.vg2x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, i32) 1303 1304declare void @llvm.aarch64.sme.fmlal.lane.vg2x2.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32) 1305declare void @llvm.aarch64.sme.fmlal.lane.vg2x2.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, i32) 1306declare void @llvm.aarch64.sme.smlal.lane.vg2x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32) 1307declare void @llvm.aarch64.sme.umlal.lane.vg2x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32) 1308 1309declare void @llvm.aarch64.sme.fmlsl.lane.vg2x2.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32) 1310declare void @llvm.aarch64.sme.fmlsl.lane.vg2x2.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, i32) 1311declare void @llvm.aarch64.sme.smlsl.lane.vg2x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32) 1312declare void @llvm.aarch64.sme.umlsl.lane.vg2x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32) 1313 
1314declare void @llvm.aarch64.sme.fmlal.lane.vg2x4.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32) 1315declare void @llvm.aarch64.sme.fmlal.lane.vg2x4.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, i32) 1316declare void @llvm.aarch64.sme.smlal.lane.vg2x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32) 1317declare void @llvm.aarch64.sme.umlal.lane.vg2x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32) 1318 1319declare void @llvm.aarch64.sme.fmlsl.lane.vg2x4.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32) 1320declare void @llvm.aarch64.sme.fmlsl.lane.vg2x4.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, i32) 1321declare void @llvm.aarch64.sme.smlsl.lane.vg2x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32) 1322declare void @llvm.aarch64.sme.umlsl.lane.vg2x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32) 1323