; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -mattr=+sme-i16i64 -force-streaming -verify-machineinstrs < %s | FileCheck %s

;
; SMLALL
;

; Single x1

define void @multi_vector_mul_add_single_long_vg4x1_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: multi_vector_mul_add_single_long_vg4x1_s8:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: smlall za.s[w8, 0:3], z1.b, z2.b
; CHECK-NEXT: smlall za.s[w8, 12:15], z1.b, z2.b
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.smla.za32.single.vg4x1.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
  %slice.12 = add i32 %slice, 12
  call void @llvm.aarch64.sme.smla.za32.single.vg4x1.nxv16i8(i32 %slice.12, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
  ret void
}

define void @multi_vector_mul_add_single_long_vg4x1_s16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_mul_add_single_long_vg4x1_s16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: smlall za.d[w8, 0:3], z1.h, z2.h
; CHECK-NEXT: smlall za.d[w8, 12:15], z1.h, z2.h
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.smla.za64.single.vg4x1.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
  %slice.12 = add i32 %slice, 12
  call void @llvm.aarch64.sme.smla.za64.single.vg4x1.nxv8i16(i32 %slice.12, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
  ret void
}

; Single x2

define void @multi_vector_mul_add_single_long_vg4x2_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: multi_vector_mul_add_single_long_vg4x2_s8:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
; CHECK-NEXT: smlall za.s[w8, 0:3, vgx2], { z1.b, z2.b }, z3.b
; CHECK-NEXT: smlall za.s[w8, 4:7, vgx2], { z1.b, z2.b }, z3.b
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.smla.za32.single.vg4x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.smla.za32.single.vg4x2.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm)
  ret void
}

define void @multi_vector_mul_add_single_long_vg4x2_s16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_mul_add_single_long_vg4x2_s16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
; CHECK-NEXT: smlall za.d[w8, 0:3, vgx2], { z1.h, z2.h }, z3.h
; CHECK-NEXT: smlall za.d[w8, 4:7, vgx2], { z1.h, z2.h }, z3.h
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.smla.za64.single.vg4x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.smla.za64.single.vg4x2.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm)
  ret void
}

; Single x4

define void @multi_vector_mul_add_single_long_vg4x4_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: multi_vector_mul_add_single_long_vg4x4_s8:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: smlall za.s[w8, 0:3, vgx4], { z1.b - z4.b }, z5.b
; CHECK-NEXT: smlall za.s[w8, 4:7, vgx4], { z1.b - z4.b }, z5.b
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.smla.za32.single.vg4x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.smla.za32.single.vg4x4.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm)
  ret void
}

define void @multi_vector_mul_add_single_long_vg4x4_s16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_mul_add_single_long_vg4x4_s16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: smlall za.d[w8, 0:3, vgx4], { z1.h - z4.h }, z5.h
; CHECK-NEXT: smlall za.d[w8, 4:7, vgx4], { z1.h - z4.h }, z5.h
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.smla.za64.single.vg4x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.smla.za64.single.vg4x4.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm)
  ret void
}

; Multi x2

define void @multi_vector_mul_add_multi_long_vg4x2_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1) {
; CHECK-LABEL: multi_vector_mul_add_multi_long_vg4x2_s8:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z5.d, z4.d
; CHECK-NEXT: mov z7.d, z2.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z3.d
; CHECK-NEXT: mov z6.d, z1.d
; CHECK-NEXT: smlall za.s[w8, 0:3, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
; CHECK-NEXT: smlall za.s[w8, 4:7, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.smla.za32.vg4x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.smla.za32.vg4x2.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1)
  ret void
}

define void @multi_vector_mul_add_multi_long_vg4x2_s16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1) {
; CHECK-LABEL: multi_vector_mul_add_multi_long_vg4x2_s16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z5.d, z4.d
; CHECK-NEXT: mov z7.d, z2.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z3.d
; CHECK-NEXT: mov z6.d, z1.d
; CHECK-NEXT: smlall za.d[w8, 0:3, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
; CHECK-NEXT: smlall za.d[w8, 4:7, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.smla.za64.vg4x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.smla.za64.vg4x2.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
  ret void
}

; Multi x4

define void @multi_vector_mul_add_multi_long_vg4x4_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3) {
; CHECK-LABEL: multi_vector_mul_add_multi_long_vg4x4_s8:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z26.d, z7.d
; CHECK-NEXT: mov z31.d, z4.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: mov z25.d, z6.d
; CHECK-NEXT: mov z30.d, z3.d
; CHECK-NEXT: mov z24.d, z5.d
; CHECK-NEXT: mov z29.d, z2.d
; CHECK-NEXT: ld1b { z27.b }, p0/z, [x1]
; CHECK-NEXT: mov z28.d, z1.d
; CHECK-NEXT: smlall za.s[w8, 0:3, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
; CHECK-NEXT: smlall za.s[w8, 4:7, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.smla.za32.vg4x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.smla.za32.vg4x4.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3)
  ret void
}

define void @multi_vector_mul_add_multi_long_vg4x4_s16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3) {
; CHECK-LABEL: multi_vector_mul_add_multi_long_vg4x4_s16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z26.d, z7.d
; CHECK-NEXT: mov z31.d, z4.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: mov z25.d, z6.d
; CHECK-NEXT: mov z30.d, z3.d
; CHECK-NEXT: mov z24.d, z5.d
; CHECK-NEXT: mov z29.d, z2.d
; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1]
; CHECK-NEXT: mov z28.d, z1.d
; CHECK-NEXT: smlall za.d[w8, 0:3, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
; CHECK-NEXT: smlall za.d[w8, 4:7, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.smla.za64.vg4x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.smla.za64.vg4x4.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
  ret void
}

; Indexed x1

define void @multi_vector_mul_add_lane_long_vg4x1_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: multi_vector_mul_add_lane_long_vg4x1_s8:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: smlall za.s[w8, 0:3], z1.b, z2.b[0]
; CHECK-NEXT: smlall za.s[w8, 12:15], z1.b, z2.b[15]
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.smla.za32.lane.vg4x1.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm, i32 0)
  %slice.12 = add i32 %slice, 12
  call void @llvm.aarch64.sme.smla.za32.lane.vg4x1.nxv16i8(i32 %slice.12, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm, i32 15)
  ret void
}

define void @multi_vector_mul_add_lane_long_vg4x1_s16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_mul_add_lane_long_vg4x1_s16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: smlall za.d[w8, 0:3], z1.h, z2.h[0]
; CHECK-NEXT: smlall za.d[w8, 12:15], z1.h, z2.h[7]
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.smla.za64.lane.vg4x1.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm, i32 0)
  %slice.12 = add i32 %slice, 12
  call void @llvm.aarch64.sme.smla.za64.lane.vg4x1.nxv8i16(i32 %slice.12, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm, i32 7)
  ret void
}

; Indexed x2

define void @multi_vector_mul_add_lane_long_vg4x2_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: multi_vector_mul_add_lane_long_vg4x2_s8:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z5.d, z2.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z1.d
; CHECK-NEXT: smlall za.s[w8, 0:3, vgx2], { z4.b, z5.b }, z3.b[0]
; CHECK-NEXT: smlall za.s[w8, 4:7, vgx2], { z4.b, z5.b }, z3.b[15]
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.smla.za32.lane.vg4x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm, i32 0)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.smla.za32.lane.vg4x2.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm, i32 15)
  ret void
}

define void @multi_vector_mul_add_lane_long_vg4x2_s16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_mul_add_lane_long_vg4x2_s16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z5.d, z2.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z1.d
; CHECK-NEXT: smlall za.d[w8, 0:3, vgx2], { z4.h, z5.h }, z3.h[0]
; CHECK-NEXT: smlall za.d[w8, 4:7, vgx2], { z4.h, z5.h }, z3.h[7]
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.smla.za64.lane.vg4x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm, i32 0)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.smla.za64.lane.vg4x2.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm, i32 7)
  ret void
}

; Indexed x4

define void @multi_vector_mul_add_lane_long_vg4x4_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: multi_vector_mul_add_lane_long_vg4x4_s8:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z27.d, z4.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z26.d, z3.d
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: smlall za.s[w8, 0:3, vgx4], { z24.b - z27.b }, z5.b[0]
; CHECK-NEXT: smlall za.s[w8, 4:7, vgx4], { z24.b - z27.b }, z5.b[15]
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.smla.za32.lane.vg4x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm, i32 0)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.smla.za32.lane.vg4x4.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm, i32 15)
  ret void
}

define void @multi_vector_mul_add_lane_long_vg4x4_s16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_mul_add_lane_long_vg4x4_s16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z27.d, z4.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z26.d, z3.d
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: smlall za.d[w8, 0:3, vgx4], { z24.h - z27.h }, z5.h[0]
; CHECK-NEXT: smlall za.d[w8, 4:7, vgx4], { z24.h - z27.h }, z5.h[7]
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.smla.za64.lane.vg4x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm, i32 0)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.smla.za64.lane.vg4x4.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm, i32 7)
  ret void
}

; UMLALL

; Single x1

define void @multi_vector_mul_add_single_long_vg4x1_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: multi_vector_mul_add_single_long_vg4x1_u8:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: umlall za.s[w8, 0:3], z1.b, z2.b
; CHECK-NEXT: umlall za.s[w8, 12:15], z1.b, z2.b
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.umla.za32.single.vg4x1.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
  %slice.12 = add i32 %slice, 12
  call void @llvm.aarch64.sme.umla.za32.single.vg4x1.nxv16i8(i32 %slice.12, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
  ret void
}

define void @multi_vector_mul_add_single_long_vg4x1_u16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_mul_add_single_long_vg4x1_u16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: umlall za.d[w8, 0:3], z1.h, z2.h
; CHECK-NEXT: umlall za.d[w8, 12:15], z1.h, z2.h
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.umla.za64.single.vg4x1.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
  %slice.12 = add i32 %slice, 12
  call void @llvm.aarch64.sme.umla.za64.single.vg4x1.nxv8i16(i32 %slice.12, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
  ret void
}

; Single x2

define void @multi_vector_mul_add_single_long_vg4x2_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: multi_vector_mul_add_single_long_vg4x2_u8:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
; CHECK-NEXT: umlall za.s[w8, 0:3, vgx2], { z1.b, z2.b }, z3.b
; CHECK-NEXT: umlall za.s[w8, 4:7, vgx2], { z1.b, z2.b }, z3.b
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.umla.za32.single.vg4x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.umla.za32.single.vg4x2.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm)
  ret void
}

define void @multi_vector_mul_add_single_long_vg4x2_u16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_mul_add_single_long_vg4x2_u16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
; CHECK-NEXT: umlall za.d[w8, 0:3, vgx2], { z1.h, z2.h }, z3.h
; CHECK-NEXT: umlall za.d[w8, 4:7, vgx2], { z1.h, z2.h }, z3.h
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.umla.za64.single.vg4x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.umla.za64.single.vg4x2.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm)
  ret void
}

; Single x4

define void @multi_vector_mul_add_single_long_vg4x4_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: multi_vector_mul_add_single_long_vg4x4_u8:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: umlall za.s[w8, 0:3, vgx4], { z1.b - z4.b }, z5.b
; CHECK-NEXT: umlall za.s[w8, 4:7, vgx4], { z1.b - z4.b }, z5.b
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.umla.za32.single.vg4x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.umla.za32.single.vg4x4.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm)
  ret void
}

define void @multi_vector_mul_add_single_long_vg4x4_u16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_mul_add_single_long_vg4x4_u16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: umlall za.d[w8, 0:3, vgx4], { z1.h - z4.h }, z5.h
; CHECK-NEXT: umlall za.d[w8, 4:7, vgx4], { z1.h - z4.h }, z5.h
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.umla.za64.single.vg4x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.umla.za64.single.vg4x4.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm)
  ret void
}

; Multi x2

define void @multi_vector_mul_add_multi_long_vg4x2_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1) {
; CHECK-LABEL: multi_vector_mul_add_multi_long_vg4x2_u8:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z5.d, z4.d
; CHECK-NEXT: mov z7.d, z2.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z3.d
; CHECK-NEXT: mov z6.d, z1.d
; CHECK-NEXT: umlall za.s[w8, 0:3, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
; CHECK-NEXT: umlall za.s[w8, 4:7, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.umla.za32.vg4x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.umla.za32.vg4x2.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1)
  ret void
}

define void @multi_vector_mul_add_multi_long_vg4x2_u16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1) {
; CHECK-LABEL: multi_vector_mul_add_multi_long_vg4x2_u16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z5.d, z4.d
; CHECK-NEXT: mov z7.d, z2.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z3.d
; CHECK-NEXT: mov z6.d, z1.d
; CHECK-NEXT: umlall za.d[w8, 0:3, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
; CHECK-NEXT: umlall za.d[w8, 4:7, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.umla.za64.vg4x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.umla.za64.vg4x2.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
  ret void
}

; Multi x4

define void @multi_vector_mul_add_multi_long_vg4x4_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3) {
; CHECK-LABEL: multi_vector_mul_add_multi_long_vg4x4_u8:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z26.d, z7.d
; CHECK-NEXT: mov z31.d, z4.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: mov z25.d, z6.d
; CHECK-NEXT: mov z30.d, z3.d
; CHECK-NEXT: mov z24.d, z5.d
; CHECK-NEXT: mov z29.d, z2.d
; CHECK-NEXT: ld1b { z27.b }, p0/z, [x1]
; CHECK-NEXT: mov z28.d, z1.d
; CHECK-NEXT: umlall za.s[w8, 0:3, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
; CHECK-NEXT: umlall za.s[w8, 4:7, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.umla.za32.vg4x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.umla.za32.vg4x4.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3)
  ret void
}

define void @multi_vector_mul_add_multi_long_vg4x4_u16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3) {
; CHECK-LABEL: multi_vector_mul_add_multi_long_vg4x4_u16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z26.d, z7.d
; CHECK-NEXT: mov z31.d, z4.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: mov z25.d, z6.d
; CHECK-NEXT: mov z30.d, z3.d
; CHECK-NEXT: mov z24.d, z5.d
; CHECK-NEXT: mov z29.d, z2.d
; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1]
; CHECK-NEXT: mov z28.d, z1.d
; CHECK-NEXT: umlall za.d[w8, 0:3, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
; CHECK-NEXT: umlall za.d[w8, 4:7, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.umla.za64.vg4x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.umla.za64.vg4x4.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
  ret void
}

; Indexed x1

define void @multi_vector_mul_add_lane_long_vg4x1_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: multi_vector_mul_add_lane_long_vg4x1_u8:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: umlall za.s[w8, 0:3], z1.b, z2.b[0]
; CHECK-NEXT: umlall za.s[w8, 12:15], z1.b, z2.b[15]
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.umla.za32.lane.vg4x1.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm, i32 0)
  %slice.12 = add i32 %slice, 12
  call void @llvm.aarch64.sme.umla.za32.lane.vg4x1.nxv16i8(i32 %slice.12, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm, i32 15)
  ret void
}

define void @multi_vector_mul_add_lane_long_vg4x1_u16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_mul_add_lane_long_vg4x1_u16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: umlall za.d[w8, 0:3], z1.h, z2.h[0]
; CHECK-NEXT: umlall za.d[w8, 12:15], z1.h, z2.h[7]
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.umla.za64.lane.vg4x1.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm, i32 0)
  %slice.12 = add i32 %slice, 12
  call void @llvm.aarch64.sme.umla.za64.lane.vg4x1.nxv8i16(i32 %slice.12, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm, i32 7)
  ret void
}

; Indexed x2

define void @multi_vector_mul_add_lane_long_vg4x2_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: multi_vector_mul_add_lane_long_vg4x2_u8:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z5.d, z2.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z1.d
; CHECK-NEXT: umlall za.s[w8, 0:3, vgx2], { z4.b, z5.b }, z3.b[0]
; CHECK-NEXT: umlall za.s[w8, 4:7, vgx2], { z4.b, z5.b }, z3.b[15]
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.umla.za32.lane.vg4x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm, i32 0)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.umla.za32.lane.vg4x2.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm, i32 15)
  ret void
}

define void @multi_vector_mul_add_lane_long_vg4x2_u16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_mul_add_lane_long_vg4x2_u16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z5.d, z2.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z1.d
; CHECK-NEXT: umlall za.d[w8, 0:3, vgx2], { z4.h, z5.h }, z3.h[0]
; CHECK-NEXT: umlall za.d[w8, 4:7, vgx2], { z4.h, z5.h }, z3.h[7]
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.umla.za64.lane.vg4x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm, i32 0)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.umla.za64.lane.vg4x2.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm, i32 7)
  ret void
}

; Indexed x4

define void @multi_vector_mul_add_lane_long_vg4x4_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: multi_vector_mul_add_lane_long_vg4x4_u8:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z27.d, z4.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z26.d, z3.d
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: umlall za.s[w8, 0:3, vgx4], { z24.b - z27.b }, z5.b[0]
; CHECK-NEXT: umlall za.s[w8, 4:7, vgx4], { z24.b - z27.b }, z5.b[15]
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.umla.za32.lane.vg4x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm, i32 0)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.umla.za32.lane.vg4x4.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm, i32 15)
  ret void
}

define void @multi_vector_mul_add_lane_long_vg4x4_u16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_mul_add_lane_long_vg4x4_u16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z27.d, z4.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z26.d, z3.d
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: umlall za.d[w8, 0:3, vgx4], { z24.h - z27.h }, z5.h[0]
; CHECK-NEXT: umlall za.d[w8, 4:7, vgx4], { z24.h - z27.h }, z5.h[7]
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.umla.za64.lane.vg4x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm, i32 0)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.umla.za64.lane.vg4x4.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm, i32 7)
  ret void
}

; SMLSLL

; Single x1

define void @multi_vector_mul_sub_single_long_vg4x1_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: multi_vector_mul_sub_single_long_vg4x1_s8:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: smlsll za.s[w8, 0:3], z1.b, z2.b
; CHECK-NEXT: smlsll za.s[w8, 12:15], z1.b, z2.b
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.smls.za32.single.vg4x1.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
  %slice.12 = add i32 %slice, 12
  call void @llvm.aarch64.sme.smls.za32.single.vg4x1.nxv16i8(i32 %slice.12, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
  ret void
}

define void @multi_vector_mul_sub_single_long_vg4x1_s16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_mul_sub_single_long_vg4x1_s16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: smlsll za.d[w8, 0:3], z1.h, z2.h
; CHECK-NEXT: smlsll za.d[w8, 12:15], z1.h, z2.h
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.smls.za64.single.vg4x1.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
  %slice.12 = add i32 %slice, 12
  call void @llvm.aarch64.sme.smls.za64.single.vg4x1.nxv8i16(i32 %slice.12, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
  ret void
}

; Single x2

define void @multi_vector_mul_sub_single_long_vg4x2_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: multi_vector_mul_sub_single_long_vg4x2_s8:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
; CHECK-NEXT: smlsll za.s[w8, 0:3, vgx2], { z1.b, z2.b }, z3.b
; CHECK-NEXT: smlsll za.s[w8, 4:7, vgx2], { z1.b, z2.b }, z3.b
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.smls.za32.single.vg4x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.smls.za32.single.vg4x2.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm)
  ret void
}

define void @multi_vector_mul_sub_single_long_vg4x2_s16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_mul_sub_single_long_vg4x2_s16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
; CHECK-NEXT: smlsll za.d[w8, 0:3, vgx2], { z1.h, z2.h }, z3.h
; CHECK-NEXT: smlsll za.d[w8, 4:7, vgx2], { z1.h, z2.h }, z3.h
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.smls.za64.single.vg4x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.smls.za64.single.vg4x2.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm)
  ret void
}

; Single x4

define void @multi_vector_mul_sub_single_long_vg4x4_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: multi_vector_mul_sub_single_long_vg4x4_s8:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: smlsll za.s[w8, 0:3, vgx4], { z1.b - z4.b }, z5.b
; CHECK-NEXT: smlsll za.s[w8, 4:7, vgx4], { z1.b - z4.b }, z5.b
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.smls.za32.single.vg4x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.smls.za32.single.vg4x4.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm)
  ret void
}

define void @multi_vector_mul_sub_single_long_vg4x4_s16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_mul_sub_single_long_vg4x4_s16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: smlsll za.d[w8, 0:3, vgx4], { z1.h - z4.h }, z5.h
; CHECK-NEXT: smlsll za.d[w8, 4:7, vgx4], { z1.h - z4.h }, z5.h
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.smls.za64.single.vg4x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.smls.za64.single.vg4x4.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm)
  ret void
}

; Multi x2

define void @multi_vector_mul_sub_multi_long_vg4x2_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1) {
; CHECK-LABEL: multi_vector_mul_sub_multi_long_vg4x2_s8:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z5.d, z4.d
; CHECK-NEXT: mov z7.d, z2.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z3.d
; CHECK-NEXT: mov z6.d, z1.d
; CHECK-NEXT: smlsll za.s[w8, 0:3, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
; CHECK-NEXT: smlsll za.s[w8, 4:7, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.smls.za32.vg4x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.smls.za32.vg4x2.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1)
  ret void
}

define void @multi_vector_mul_sub_multi_long_vg4x2_s16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1) {
; CHECK-LABEL: multi_vector_mul_sub_multi_long_vg4x2_s16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z5.d, z4.d
; CHECK-NEXT: mov z7.d, z2.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z3.d
; CHECK-NEXT: mov z6.d, z1.d
; CHECK-NEXT: smlsll za.d[w8, 0:3, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
; CHECK-NEXT: smlsll za.d[w8, 4:7, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.smls.za64.vg4x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.smls.za64.vg4x2.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
  ret void
}

; Multi x4

define void @multi_vector_mul_sub_multi_long_vg4x4_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3) {
; CHECK-LABEL: multi_vector_mul_sub_multi_long_vg4x4_s8:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z26.d, z7.d
; CHECK-NEXT: mov z31.d, z4.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: mov z25.d, z6.d
; CHECK-NEXT: mov z30.d, z3.d
; CHECK-NEXT: mov z24.d, z5.d
; CHECK-NEXT: mov z29.d, z2.d
; CHECK-NEXT: ld1b { z27.b }, p0/z, [x1]
; CHECK-NEXT: mov z28.d, z1.d
; CHECK-NEXT: smlsll za.s[w8, 0:3, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
; CHECK-NEXT: smlsll za.s[w8, 4:7, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.smls.za32.vg4x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.smls.za32.vg4x4.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3)
  ret void
}

define void @multi_vector_mul_sub_multi_long_vg4x4_s16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3) {
; CHECK-LABEL: multi_vector_mul_sub_multi_long_vg4x4_s16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z26.d, z7.d
; CHECK-NEXT: mov z31.d, z4.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: mov z25.d, z6.d
; CHECK-NEXT: mov z30.d, z3.d
; CHECK-NEXT: mov z24.d, z5.d
; CHECK-NEXT: mov z29.d, z2.d
; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1]
; CHECK-NEXT: mov z28.d, z1.d
; CHECK-NEXT: smlsll za.d[w8, 0:3, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
; CHECK-NEXT: smlsll za.d[w8, 4:7, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.smls.za64.vg4x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.smls.za64.vg4x4.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
  ret void
}

; Indexed x1

define void @multi_vector_mul_sub_lane_long_vg4x1_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: multi_vector_mul_sub_lane_long_vg4x1_s8:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: smlsll za.s[w8, 0:3], z1.b, z2.b[0]
; CHECK-NEXT: smlsll za.s[w8, 12:15], z1.b, z2.b[15]
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.smls.za32.lane.vg4x1.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm, i32 0)
  %slice.12 = add i32 %slice, 12
  call void @llvm.aarch64.sme.smls.za32.lane.vg4x1.nxv16i8(i32 %slice.12, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm, i32 15)
  ret void
}

define void @multi_vector_mul_sub_lane_long_vg4x1_s16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_mul_sub_lane_long_vg4x1_s16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: smlsll za.d[w8, 0:3], z1.h, z2.h[0]
; CHECK-NEXT: smlsll za.d[w8, 12:15], z1.h, z2.h[7]
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.smls.za64.lane.vg4x1.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm, i32 0)
  %slice.12 = add i32 %slice, 12
  call void @llvm.aarch64.sme.smls.za64.lane.vg4x1.nxv8i16(i32 %slice.12, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm, i32 7)
  ret void
}

; Indexed x2

define void @multi_vector_mul_sub_lane_long_vg4x2_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: multi_vector_mul_sub_lane_long_vg4x2_s8:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z5.d, z2.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z1.d
; CHECK-NEXT: smlsll za.s[w8, 0:3, vgx2], { z4.b, z5.b }, z3.b[0]
; CHECK-NEXT: smlsll za.s[w8, 4:7, vgx2], { z4.b, z5.b }, z3.b[15]
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.smls.za32.lane.vg4x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm, i32 0)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.smls.za32.lane.vg4x2.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm, i32 15)
  ret void
}

define void @multi_vector_mul_sub_lane_long_vg4x2_s16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_mul_sub_lane_long_vg4x2_s16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z5.d, z2.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z1.d
; CHECK-NEXT: smlsll za.d[w8, 0:3, vgx2], { z4.h, z5.h }, z3.h[0]
; CHECK-NEXT: smlsll za.d[w8, 4:7, vgx2], { z4.h, z5.h }, z3.h[7]
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.smls.za64.lane.vg4x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm, i32 0)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.smls.za64.lane.vg4x2.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm, i32 7)
  ret void
}

; Indexed x4

define void @multi_vector_mul_sub_lane_long_vg4x4_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: multi_vector_mul_sub_lane_long_vg4x4_s8:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z27.d, z4.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z26.d, z3.d
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: smlsll za.s[w8, 0:3, vgx4], { z24.b - z27.b }, z5.b[0]
; CHECK-NEXT: smlsll za.s[w8, 4:7, vgx4], { z24.b - z27.b }, z5.b[15]
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.smls.za32.lane.vg4x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm, i32 0)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.smls.za32.lane.vg4x4.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm, i32 15)
  ret void
}

define void @multi_vector_mul_sub_lane_long_vg4x4_s16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_mul_sub_lane_long_vg4x4_s16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z27.d, z4.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z26.d, z3.d
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: smlsll za.d[w8, 0:3, vgx4], { z24.h - z27.h }, z5.h[0]
; CHECK-NEXT: smlsll za.d[w8, 4:7, vgx4], { z24.h - z27.h }, z5.h[7]
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.smls.za64.lane.vg4x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm, i32 0)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.smls.za64.lane.vg4x4.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm, i32 7)
  ret void
}

; UMLSLL

; Single x1

define void @multi_vector_mul_sub_single_long_vg4x1_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: multi_vector_mul_sub_single_long_vg4x1_u8:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: umlsll za.s[w8, 0:3], z1.b, z2.b
; CHECK-NEXT: umlsll za.s[w8, 12:15], z1.b, z2.b
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.umls.za32.single.vg4x1.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
  %slice.12 = add i32 %slice, 12
  call void @llvm.aarch64.sme.umls.za32.single.vg4x1.nxv16i8(i32 %slice.12, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
  ret void
}

define void @multi_vector_mul_sub_single_long_vg4x1_u16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_mul_sub_single_long_vg4x1_u16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: umlsll za.d[w8, 0:3], z1.h, z2.h
; CHECK-NEXT: umlsll za.d[w8, 12:15], z1.h, z2.h
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.umls.za64.single.vg4x1.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
  %slice.12 = add i32 %slice, 12
  call void @llvm.aarch64.sme.umls.za64.single.vg4x1.nxv8i16(i32 %slice.12, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
  ret void
}

; Single x2

define void @multi_vector_mul_sub_single_long_vg4x2_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: multi_vector_mul_sub_single_long_vg4x2_u8:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
; CHECK-NEXT: umlsll za.s[w8, 0:3, vgx2], { z1.b, z2.b }, z3.b
; CHECK-NEXT: umlsll za.s[w8, 4:7, vgx2], { z1.b, z2.b }, z3.b
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.umls.za32.single.vg4x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.umls.za32.single.vg4x2.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm)
  ret void
}

define void @multi_vector_mul_sub_single_long_vg4x2_u16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_mul_sub_single_long_vg4x2_u16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
; CHECK-NEXT: umlsll za.d[w8, 0:3, vgx2], { z1.h, z2.h }, z3.h
; CHECK-NEXT: umlsll za.d[w8, 4:7, vgx2], { z1.h, z2.h }, z3.h
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.umls.za64.single.vg4x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.umls.za64.single.vg4x2.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm)
  ret void
}

; Single x4

define void @multi_vector_mul_sub_single_long_vg4x4_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: multi_vector_mul_sub_single_long_vg4x4_u8:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: umlsll za.s[w8, 0:3, vgx4], { z1.b - z4.b }, z5.b
; CHECK-NEXT: umlsll za.s[w8, 4:7, vgx4], { z1.b - z4.b }, z5.b
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.umls.za32.single.vg4x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.umls.za32.single.vg4x4.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm)
  ret void
}

define void @multi_vector_mul_sub_single_long_vg4x4_u16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_mul_sub_single_long_vg4x4_u16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: umlsll za.d[w8, 0:3, vgx4], { z1.h - z4.h }, z5.h
; CHECK-NEXT: umlsll za.d[w8, 4:7, vgx4], { z1.h - z4.h }, z5.h
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.umls.za64.single.vg4x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.umls.za64.single.vg4x4.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm)
  ret void
}

; Multi x2

define void @multi_vector_mul_sub_multi_long_vg4x2_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1) {
; CHECK-LABEL: multi_vector_mul_sub_multi_long_vg4x2_u8:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z5.d, z4.d
; CHECK-NEXT: mov z7.d, z2.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z3.d
; CHECK-NEXT: mov z6.d, z1.d
; CHECK-NEXT: umlsll za.s[w8, 0:3, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
; CHECK-NEXT: umlsll za.s[w8, 4:7, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.umls.za32.vg4x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.umls.za32.vg4x2.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1)
  ret void
}

define void @multi_vector_mul_sub_multi_long_vg4x2_u16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1) {
; CHECK-LABEL: multi_vector_mul_sub_multi_long_vg4x2_u16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z5.d, z4.d
; CHECK-NEXT: mov z7.d, z2.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z3.d
; CHECK-NEXT: mov z6.d, z1.d
; CHECK-NEXT: umlsll za.d[w8, 0:3, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
; CHECK-NEXT: umlsll za.d[w8, 4:7, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.umls.za64.vg4x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.umls.za64.vg4x2.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
  ret void
}

; Multi x4

define void @multi_vector_mul_sub_multi_long_vg4x4_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3) {
; CHECK-LABEL: multi_vector_mul_sub_multi_long_vg4x4_u8:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z26.d, z7.d
; CHECK-NEXT: mov z31.d, z4.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: mov z25.d, z6.d
; CHECK-NEXT: mov z30.d, z3.d
; CHECK-NEXT: mov z24.d, z5.d
; CHECK-NEXT: mov z29.d, z2.d
; CHECK-NEXT: ld1b { z27.b }, p0/z, [x1]
; CHECK-NEXT: mov z28.d, z1.d
; CHECK-NEXT: umlsll za.s[w8, 0:3, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
; CHECK-NEXT: umlsll za.s[w8, 4:7, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.umls.za32.vg4x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.umls.za32.vg4x4.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3)
  ret void
}

define void @multi_vector_mul_sub_multi_long_vg4x4_u16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3) {
; CHECK-LABEL: multi_vector_mul_sub_multi_long_vg4x4_u16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z26.d, z7.d
; CHECK-NEXT: mov z31.d, z4.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: mov z25.d, z6.d
; CHECK-NEXT: mov z30.d, z3.d
; CHECK-NEXT: mov z24.d, z5.d
; CHECK-NEXT: mov z29.d, z2.d
; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1]
; CHECK-NEXT: mov z28.d, z1.d
; CHECK-NEXT: umlsll za.d[w8, 0:3, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
; CHECK-NEXT: umlsll za.d[w8, 4:7, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.umls.za64.vg4x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.umls.za64.vg4x4.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
  ret void
}

; Indexed x1

define void @multi_vector_mul_sub_lane_long_vg4x1_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: multi_vector_mul_sub_lane_long_vg4x1_u8:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: umlsll za.s[w8, 0:3], z1.b, z2.b[0]
; CHECK-NEXT: umlsll za.s[w8, 12:15], z1.b, z2.b[15]
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.umls.za32.lane.vg4x1.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm, i32 0)

define void @multi_vector_mul_sub_lane_long_vg4x1_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: multi_vector_mul_sub_lane_long_vg4x1_u8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    umlsll za.s[w8, 0:3], z1.b, z2.b[0]
; CHECK-NEXT:    umlsll za.s[w8, 12:15], z1.b, z2.b[15]
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.umls.za32.lane.vg4x1.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm, i32 0)
  %slice.12 = add i32 %slice, 12
  call void @llvm.aarch64.sme.umls.za32.lane.vg4x1.nxv16i8(i32 %slice.12, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm, i32 15)
  ret void
}

define void @multi_vector_mul_sub_lane_long_vg4x1_u16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_mul_sub_lane_long_vg4x1_u16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    umlsll za.d[w8, 0:3], z1.h, z2.h[0]
; CHECK-NEXT:    umlsll za.d[w8, 12:15], z1.h, z2.h[7]
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.umls.za64.lane.vg4x1.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm, i32 0)
  %slice.12 = add i32 %slice, 12
  call void @llvm.aarch64.sme.umls.za64.lane.vg4x1.nxv8i16(i32 %slice.12, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm, i32 7)
  ret void
}

; Indexed x2

define void @multi_vector_mul_sub_lane_long_vg4x2_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: multi_vector_mul_sub_lane_long_vg4x2_u8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z5.d, z2.d
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    mov z4.d, z1.d
; CHECK-NEXT:    umlsll za.s[w8, 0:3, vgx2], { z4.b, z5.b }, z3.b[0]
; CHECK-NEXT:    umlsll za.s[w8, 4:7, vgx2], { z4.b, z5.b }, z3.b[15]
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.umls.za32.lane.vg4x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm, i32 0)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.umls.za32.lane.vg4x2.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm, i32 15)
  ret void
}

define void @multi_vector_mul_sub_lane_long_vg4x2_u16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_mul_sub_lane_long_vg4x2_u16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z5.d, z2.d
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    mov z4.d, z1.d
; CHECK-NEXT:    umlsll za.d[w8, 0:3, vgx2], { z4.h, z5.h }, z3.h[0]
; CHECK-NEXT:    umlsll za.d[w8, 4:7, vgx2], { z4.h, z5.h }, z3.h[7]
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.umls.za64.lane.vg4x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm, i32 0)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.umls.za64.lane.vg4x2.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm, i32 7)
  ret void
}

; Indexed x4

define void @multi_vector_mul_sub_lane_long_vg4x4_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: multi_vector_mul_sub_lane_long_vg4x4_u8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z27.d, z4.d
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    mov z26.d, z3.d
; CHECK-NEXT:    mov z25.d, z2.d
; CHECK-NEXT:    mov z24.d, z1.d
; CHECK-NEXT:    umlsll za.s[w8, 0:3, vgx4], { z24.b - z27.b }, z5.b[0]
; CHECK-NEXT:    umlsll za.s[w8, 4:7, vgx4], { z24.b - z27.b }, z5.b[15]
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.umls.za32.lane.vg4x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm, i32 0)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.umls.za32.lane.vg4x4.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm, i32 15)
  ret void
}

define void @multi_vector_mul_sub_lane_long_vg4x4_u16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_mul_sub_lane_long_vg4x4_u16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z27.d, z4.d
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    mov z26.d, z3.d
; CHECK-NEXT:    mov z25.d, z2.d
; CHECK-NEXT:    mov z24.d, z1.d
; CHECK-NEXT:    umlsll za.d[w8, 0:3, vgx4], { z24.h - z27.h }, z5.h[0]
; CHECK-NEXT:    umlsll za.d[w8, 4:7, vgx4], { z24.h - z27.h }, z5.h[7]
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.umls.za64.lane.vg4x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm, i32 0)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.umls.za64.lane.vg4x4.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm, i32 7)
  ret void
}

;
; SUMLALL
;
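; SUMLALL multiplies signed elements of the first source by unsigned elements
; of the second, widening i8 to i32 and accumulating into four consecutive
; za.s slices; only the 8-bit form is covered below.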

; Single x2

define void @multi_vector_mul_add_single_signed_long_vg4x2_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: multi_vector_mul_add_single_signed_long_vg4x2_s8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
; CHECK-NEXT:    sumlall za.s[w8, 0:3, vgx2], { z1.b, z2.b }, z3.b
; CHECK-NEXT:    sumlall za.s[w8, 4:7, vgx2], { z1.b, z2.b }, z3.b
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.sumla.za32.single.vg4x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.sumla.za32.single.vg4x2.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm)
  ret void
}

; Single x4

define void @multi_vector_mul_add_single_signed_long_vg4x4_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: multi_vector_mul_add_single_signed_long_vg4x4_s8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT:    sumlall za.s[w8, 0:3, vgx4], { z1.b - z4.b }, z5.b
; CHECK-NEXT:    sumlall za.s[w8, 4:7, vgx4], { z1.b - z4.b }, z5.b
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.sumla.za32.single.vg4x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.sumla.za32.single.vg4x4.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm)
  ret void
}

; Indexed x1

define void @multi_vector_mul_add_lane_signed_long_vg4x1_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: multi_vector_mul_add_lane_signed_long_vg4x1_s8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    sumlall za.s[w8, 0:3], z1.b, z2.b[0]
; CHECK-NEXT:    sumlall za.s[w8, 12:15], z1.b, z2.b[15]
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.sumla.za32.lane.vg4x1.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm, i32 0)
  %slice.12 = add i32 %slice, 12
  call void @llvm.aarch64.sme.sumla.za32.lane.vg4x1.nxv16i8(i32 %slice.12, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm, i32 15)
  ret void
}

; Indexed x2

define void @multi_vector_mul_add_lane_signed_long_vg4x2_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: multi_vector_mul_add_lane_signed_long_vg4x2_s8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z5.d, z2.d
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    mov z4.d, z1.d
; CHECK-NEXT:    sumlall za.s[w8, 0:3, vgx2], { z4.b, z5.b }, z3.b[0]
; CHECK-NEXT:    sumlall za.s[w8, 4:7, vgx2], { z4.b, z5.b }, z3.b[15]
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.sumla.za32.lane.vg4x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm, i32 0)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.sumla.za32.lane.vg4x2.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm, i32 15)
  ret void
}

; Indexed x4

define void @multi_vector_mul_add_lane_signed_long_vg4x4_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: multi_vector_mul_add_lane_signed_long_vg4x4_s8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z27.d, z4.d
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    mov z26.d, z3.d
; CHECK-NEXT:    mov z25.d, z2.d
; CHECK-NEXT:    mov z24.d, z1.d
; CHECK-NEXT:    sumlall za.s[w8, 0:3, vgx4], { z24.b - z27.b }, z5.b[0]
; CHECK-NEXT:    sumlall za.s[w8, 4:7, vgx4], { z24.b - z27.b }, z5.b[15]
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.sumla.za32.lane.vg4x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm, i32 0)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.sumla.za32.lane.vg4x4.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm, i32 15)
  ret void
}

; USMLALL
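; USMLALL is the unsigned-by-signed counterpart of SUMLALL (unsigned first
; source, signed second source).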

; Single x1

define void @multi_vector_mul_add_single_unsigned_long_vg4x1_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: multi_vector_mul_add_single_unsigned_long_vg4x1_s8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    usmlall za.s[w8, 0:3], z1.b, z2.b
; CHECK-NEXT:    usmlall za.s[w8, 12:15], z1.b, z2.b
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.usmla.za32.single.vg4x1.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
  %slice.12 = add i32 %slice, 12
  call void @llvm.aarch64.sme.usmla.za32.single.vg4x1.nxv16i8(i32 %slice.12, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
  ret void
}

; Single x2

define void @multi_vector_mul_add_single_unsigned_long_vg4x2_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: multi_vector_mul_add_single_unsigned_long_vg4x2_s8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
; CHECK-NEXT:    usmlall za.s[w8, 0:3, vgx2], { z1.b, z2.b }, z3.b
; CHECK-NEXT:    usmlall za.s[w8, 4:7, vgx2], { z1.b, z2.b }, z3.b
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.usmla.za32.single.vg4x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.usmla.za32.single.vg4x2.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm)
  ret void
}

; Single x4

define void @multi_vector_mul_add_single_unsigned_long_vg4x4_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: multi_vector_mul_add_single_unsigned_long_vg4x4_s8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT:    usmlall za.s[w8, 0:3, vgx4], { z1.b - z4.b }, z5.b
; CHECK-NEXT:    usmlall za.s[w8, 4:7, vgx4], { z1.b - z4.b }, z5.b
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.usmla.za32.single.vg4x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.usmla.za32.single.vg4x4.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm)
  ret void
}

; Multi x2

define void @multi_vector_mul_add_multi_unsigned_long_vg4x2_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1) {
; CHECK-LABEL: multi_vector_mul_add_multi_unsigned_long_vg4x2_u8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z5.d, z4.d
; CHECK-NEXT:    mov z7.d, z2.d
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    mov z4.d, z3.d
; CHECK-NEXT:    mov z6.d, z1.d
; CHECK-NEXT:    usmlall za.s[w8, 0:3, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
; CHECK-NEXT:    usmlall za.s[w8, 4:7, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.usmla.za32.vg4x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.usmla.za32.vg4x2.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1)
  ret void
}

; Multi x4

define void @multi_vector_mul_add_multi_unsigned_long_vg4x4_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3) {
; CHECK-LABEL: multi_vector_mul_add_multi_unsigned_long_vg4x4_u8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z26.d, z7.d
; CHECK-NEXT:    mov z31.d, z4.d
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    ptrue p0.b
; CHECK-NEXT:    mov z25.d, z6.d
; CHECK-NEXT:    mov z30.d, z3.d
; CHECK-NEXT:    mov z24.d, z5.d
; CHECK-NEXT:    mov z29.d, z2.d
; CHECK-NEXT:    ld1b { z27.b }, p0/z, [x1]
; CHECK-NEXT:    mov z28.d, z1.d
; CHECK-NEXT:    usmlall za.s[w8, 0:3, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
; CHECK-NEXT:    usmlall za.s[w8, 4:7, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.usmla.za32.vg4x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.usmla.za32.vg4x4.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3)
  ret void
}

; Indexed x1

define void @multi_vector_mul_add_lane_unsigned_long_vg4x1_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: multi_vector_mul_add_lane_unsigned_long_vg4x1_s8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    usmlall za.s[w8, 0:3], z1.b, z2.b[0]
; CHECK-NEXT:    usmlall za.s[w8, 12:15], z1.b, z2.b[15]
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.usmla.za32.lane.vg4x1.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm, i32 0)
  %slice.12 = add i32 %slice, 12
  call void @llvm.aarch64.sme.usmla.za32.lane.vg4x1.nxv16i8(i32 %slice.12, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm, i32 15)
  ret void
}

; Indexed x2

define void @multi_vector_mul_add_lane_unsigned_long_vg4x2_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: multi_vector_mul_add_lane_unsigned_long_vg4x2_s8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z5.d, z2.d
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    mov z4.d, z1.d
; CHECK-NEXT:    usmlall za.s[w8, 0:3, vgx2], { z4.b, z5.b }, z3.b[0]
; CHECK-NEXT:    usmlall za.s[w8, 4:7, vgx2], { z4.b, z5.b }, z3.b[15]
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.usmla.za32.lane.vg4x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm, i32 0)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.usmla.za32.lane.vg4x2.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm, i32 15)
  ret void
}

; Indexed x4

define void @multi_vector_mul_add_lane_unsigned_long_vg4x4_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: multi_vector_mul_add_lane_unsigned_long_vg4x4_s8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z27.d, z4.d
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    mov z26.d, z3.d
; CHECK-NEXT:    mov z25.d, z2.d
; CHECK-NEXT:    mov z24.d, z1.d
; CHECK-NEXT:    usmlall za.s[w8, 0:3, vgx4], { z24.b - z27.b }, z5.b[0]
; CHECK-NEXT:    usmlall za.s[w8, 4:7, vgx4], { z24.b - z27.b }, z5.b[15]
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.usmla.za32.lane.vg4x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm, i32 0)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.usmla.za32.lane.vg4x4.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm, i32 15)
  ret void
}

declare void @llvm.aarch64.sme.smla.za32.single.vg4x1.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.smla.za32.single.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.smla.za32.single.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)

declare void @llvm.aarch64.sme.smla.za64.single.vg4x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.smla.za64.single.vg4x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.smla.za64.single.vg4x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)

declare void @llvm.aarch64.sme.smla.za32.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.smla.za32.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)

declare void @llvm.aarch64.sme.smla.za64.vg4x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.smla.za64.vg4x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)

declare void @llvm.aarch64.sme.smla.za32.lane.vg4x1.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
declare void @llvm.aarch64.sme.smla.za32.lane.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
declare void @llvm.aarch64.sme.smla.za32.lane.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)

declare void @llvm.aarch64.sme.smla.za64.lane.vg4x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
declare void @llvm.aarch64.sme.smla.za64.lane.vg4x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
declare void @llvm.aarch64.sme.smla.za64.lane.vg4x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)

declare void @llvm.aarch64.sme.umla.za32.single.vg4x1.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.umla.za32.single.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.umla.za32.single.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)

declare void @llvm.aarch64.sme.umla.za64.single.vg4x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.umla.za64.single.vg4x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.umla.za64.single.vg4x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)

declare void @llvm.aarch64.sme.umla.za32.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.umla.za32.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)

declare void @llvm.aarch64.sme.umla.za64.vg4x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.umla.za64.vg4x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)

declare void @llvm.aarch64.sme.umla.za32.lane.vg4x1.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
declare void @llvm.aarch64.sme.umla.za32.lane.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
declare void @llvm.aarch64.sme.umla.za32.lane.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)

declare void @llvm.aarch64.sme.umla.za64.lane.vg4x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
declare void @llvm.aarch64.sme.umla.za64.lane.vg4x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
declare void @llvm.aarch64.sme.umla.za64.lane.vg4x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)

declare void @llvm.aarch64.sme.smls.za32.single.vg4x1.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.smls.za32.single.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.smls.za32.single.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)

declare void @llvm.aarch64.sme.smls.za64.single.vg4x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.smls.za64.single.vg4x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.smls.za64.single.vg4x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)

declare void @llvm.aarch64.sme.smls.za32.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.smls.za32.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)

declare void @llvm.aarch64.sme.smls.za64.vg4x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.smls.za64.vg4x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)

declare void @llvm.aarch64.sme.smls.za32.lane.vg4x1.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
declare void @llvm.aarch64.sme.smls.za32.lane.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
declare void @llvm.aarch64.sme.smls.za32.lane.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)

declare void @llvm.aarch64.sme.smls.za64.lane.vg4x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
declare void @llvm.aarch64.sme.smls.za64.lane.vg4x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
declare void @llvm.aarch64.sme.smls.za64.lane.vg4x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)

declare void @llvm.aarch64.sme.umls.za32.single.vg4x1.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.umls.za32.single.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.umls.za32.single.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)

declare void @llvm.aarch64.sme.umls.za64.single.vg4x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.umls.za64.single.vg4x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.umls.za64.single.vg4x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)

declare void @llvm.aarch64.sme.umls.za32.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.umls.za32.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)

declare void @llvm.aarch64.sme.umls.za64.vg4x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.umls.za64.vg4x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)

declare void @llvm.aarch64.sme.umls.za32.lane.vg4x1.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
declare void @llvm.aarch64.sme.umls.za32.lane.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
declare void @llvm.aarch64.sme.umls.za32.lane.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)

declare void @llvm.aarch64.sme.umls.za64.lane.vg4x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
declare void @llvm.aarch64.sme.umls.za64.lane.vg4x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
declare void @llvm.aarch64.sme.umls.za64.lane.vg4x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)

declare void @llvm.aarch64.sme.sumla.za32.single.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.sumla.za32.single.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)

declare void @llvm.aarch64.sme.sumla.za32.lane.vg4x1.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
declare void @llvm.aarch64.sme.sumla.za32.lane.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
declare void @llvm.aarch64.sme.sumla.za32.lane.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)

declare void @llvm.aarch64.sme.sumla.za64.lane.vg4x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
declare void @llvm.aarch64.sme.sumls.za64.lane.vg4x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
declare void @llvm.aarch64.sme.sumls.za64.lane.vg4x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)

declare void @llvm.aarch64.sme.usmla.za32.single.vg4x1.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.usmla.za32.single.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.usmla.za32.single.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)

declare void @llvm.aarch64.sme.usmla.za32.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.usmla.za32.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)

declare void @llvm.aarch64.sme.usmla.za32.lane.vg4x1.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
declare void @llvm.aarch64.sme.usmla.za32.lane.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
declare void @llvm.aarch64.sme.usmla.za32.lane.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)

declare void @llvm.aarch64.sme.usmla.za64.lane.vg4x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
declare void @llvm.aarch64.sme.usmls.za64.lane.vg4x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
declare void @llvm.aarch64.sme.usmls.za64.lane.vg4x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)