; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+sme2 -mattr=+sme-f64f64 -force-streaming -verify-machineinstrs | FileCheck %s

; FMLA (SINGLE)
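; The "single" forms multiply a group of two or four Zn vectors by one shared
; Zm vector and accumulate the products into consecutive ZA slices. Each
; intrinsic is issued at %slice and again at %slice + 7, which should be the
; largest offset that still folds into the instruction's slice immediate.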

define void @multi_vector_add_single_vg1x2_s(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zm) {
; CHECK-LABEL: multi_vector_add_single_vg1x2_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    fmla za.s[w8, 0, vgx2], { z0.s, z1.s }, z2.s
; CHECK-NEXT:    fmla za.s[w8, 7, vgx2], { z0.s, z1.s }, z2.s
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv4f32(i32 %slice,
            <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
            <vscale x 4 x float> %zm)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv4f32(i32 %slice.7,
            <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
            <vscale x 4 x float> %zm)
  ret void
}

define void @multi_vector_add_single_vg1x2_d(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zm) {
; CHECK-LABEL: multi_vector_add_single_vg1x2_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    fmla za.d[w8, 0, vgx2], { z0.d, z1.d }, z2.d
; CHECK-NEXT:    fmla za.d[w8, 7, vgx2], { z0.d, z1.d }, z2.d
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv2f64(i32 %slice,
            <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
            <vscale x 2 x double> %zm)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv2f64(i32 %slice.7,
            <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
            <vscale x 2 x double> %zm)
  ret void
}

define void @multi_vector_add_single_vg1x4_s(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
; CHECK-LABEL: multi_vector_add_single_vg1x4_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    fmla za.s[w8, 0, vgx4], { z0.s - z3.s }, z4.s
; CHECK-NEXT:    fmla za.s[w8, 7, vgx4], { z0.s - z3.s }, z4.s
; CHECK-NEXT:    ret
            <vscale x 4 x float> %zm) {
  call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv4f32(i32 %slice,
            <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
            <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
            <vscale x 4 x float> %zm)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv4f32(i32 %slice.7,
            <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
            <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
            <vscale x 4 x float> %zm)
  ret void
}

define void @multi_vector_add_single_vg1x4_d(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
; CHECK-LABEL: multi_vector_add_single_vg1x4_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    fmla za.d[w8, 0, vgx4], { z0.d - z3.d }, z4.d
; CHECK-NEXT:    fmla za.d[w8, 7, vgx4], { z0.d - z3.d }, z4.d
; CHECK-NEXT:    ret
            <vscale x 2 x double> %zm) {
  call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv2f64(i32 %slice,
            <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
            <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
            <vscale x 2 x double> %zm)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv2f64(i32 %slice.7,
            <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
            <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
            <vscale x 2 x double> %zm)
  ret void
}

; FMLS (SINGLE)
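; Same layout as the FMLA (SINGLE) tests above, using the multiply-subtract
; form of the instruction.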

define void @multi_vector_sub_single_vg1x2_s(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zm) {
; CHECK-LABEL: multi_vector_sub_single_vg1x2_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    fmls za.s[w8, 0, vgx2], { z0.s, z1.s }, z2.s
; CHECK-NEXT:    fmls za.s[w8, 7, vgx2], { z0.s, z1.s }, z2.s
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv4f32(i32 %slice,
            <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
            <vscale x 4 x float> %zm)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv4f32(i32 %slice.7,
            <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
            <vscale x 4 x float> %zm)
  ret void
}

define void @multi_vector_sub_single_vg1x2_d(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zm) {
; CHECK-LABEL: multi_vector_sub_single_vg1x2_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    fmls za.d[w8, 0, vgx2], { z0.d, z1.d }, z2.d
; CHECK-NEXT:    fmls za.d[w8, 7, vgx2], { z0.d, z1.d }, z2.d
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv2f64(i32 %slice,
            <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
            <vscale x 2 x double> %zm)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv2f64(i32 %slice.7,
            <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
            <vscale x 2 x double> %zm)
  ret void
}

define void @multi_vector_sub_single_vg1x4_s(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
; CHECK-LABEL: multi_vector_sub_single_vg1x4_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    fmls za.s[w8, 0, vgx4], { z0.s - z3.s }, z4.s
; CHECK-NEXT:    fmls za.s[w8, 7, vgx4], { z0.s - z3.s }, z4.s
; CHECK-NEXT:    ret
            <vscale x 4 x float> %zm) {
  call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv4f32(i32 %slice,
            <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
            <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
            <vscale x 4 x float> %zm)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv4f32(i32 %slice.7,
            <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
            <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
            <vscale x 4 x float> %zm)
  ret void
}

define void @multi_vector_sub_single_vg1x4_d(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
; CHECK-LABEL: multi_vector_sub_single_vg1x4_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    fmls za.d[w8, 0, vgx4], { z0.d - z3.d }, z4.d
; CHECK-NEXT:    fmls za.d[w8, 7, vgx4], { z0.d - z3.d }, z4.d
; CHECK-NEXT:    ret
            <vscale x 2 x double> %zm) {
  call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv2f64(i32 %slice,
            <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
            <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
            <vscale x 2 x double> %zm)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv2f64(i32 %slice.7,
            <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
            <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
            <vscale x 2 x double> %zm)
  ret void
}

; FMLA (MULTI)
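; In the "multi" forms both operands are vector groups, so each group must be
; allocated to consecutive Z registers: vgx2 groups start at an even register
; and vgx4 groups at a multiple of four.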

define void @multi_vector_add_vg1x2_s(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
; CHECK-LABEL: multi_vector_add_vg1x2_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    fmla za.s[w8, 0, vgx2], { z0.s, z1.s }, { z2.s, z3.s }
; CHECK-NEXT:    fmla za.s[w8, 7, vgx2], { z0.s, z1.s }, { z2.s, z3.s }
; CHECK-NEXT:    ret
            <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2) {
  call void @llvm.aarch64.sme.fmla.vg1x2.nxv4f32(i32 %slice,
            <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
            <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmla.vg1x2.nxv4f32(i32 %slice.7,
            <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
            <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2)
  ret void
}

define void @multi_vector_add_vg1x2_d(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
; CHECK-LABEL: multi_vector_add_vg1x2_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    fmla za.d[w8, 0, vgx2], { z0.d, z1.d }, { z2.d, z3.d }
; CHECK-NEXT:    fmla za.d[w8, 7, vgx2], { z0.d, z1.d }, { z2.d, z3.d }
; CHECK-NEXT:    ret
            <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2) {
  call void @llvm.aarch64.sme.fmla.vg1x2.nxv2f64(i32 %slice,
            <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
            <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmla.vg1x2.nxv2f64(i32 %slice.7,
            <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
            <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2)
  ret void
}

; Test to ensure the correct register class is used (first register in the list should be a multiple of 2)
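; The operands below are passed swapped (%zn1/%zn0 and %zm1/%zm0), so neither
; group arrives in a correctly aligned pair; the mov z*.d copies re-form the
; groups at z6/z7 and z4/z5 to satisfy the register class.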
define void @multi_vector_add_vg1x2_s_regclass(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
; CHECK-LABEL: multi_vector_add_vg1x2_s_regclass:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z4.d, z3.d
; CHECK-NEXT:    mov z6.d, z1.d
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    mov z5.d, z2.d
; CHECK-NEXT:    mov z7.d, z0.d
; CHECK-NEXT:    fmla za.s[w8, 0, vgx2], { z6.s, z7.s }, { z4.s, z5.s }
; CHECK-NEXT:    ret
            <vscale x 4 x float> %zm0, <vscale x 4 x float> %zm1) {
  call void @llvm.aarch64.sme.fmla.vg1x2.nxv4f32(i32 %slice,
            <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn0,
            <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm0)
  ret void
}

define void @multi_vector_add_vg1x4_s(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
; CHECK-LABEL: multi_vector_add_vg1x4_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    fmla za.s[w8, 0, vgx4], { z0.s - z3.s }, { z4.s - z7.s }
; CHECK-NEXT:    fmla za.s[w8, 7, vgx4], { z0.s - z3.s }, { z4.s - z7.s }
; CHECK-NEXT:    ret
            <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2, <vscale x 4 x float> %zm3, <vscale x 4 x float> %zm4) {
  call void @llvm.aarch64.sme.fmla.vg1x4.nxv4f32(i32 %slice,
            <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
            <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2, <vscale x 4 x float> %zm3, <vscale x 4 x float> %zm4)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmla.vg1x4.nxv4f32(i32 %slice.7,
            <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
            <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2, <vscale x 4 x float> %zm3, <vscale x 4 x float> %zm4)
  ret void
}

define void @multi_vector_add_vg1x4_d(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
; CHECK-LABEL: multi_vector_add_vg1x4_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    fmla za.d[w8, 0, vgx4], { z0.d - z3.d }, { z4.d - z7.d }
; CHECK-NEXT:    fmla za.d[w8, 7, vgx4], { z0.d - z3.d }, { z4.d - z7.d }
; CHECK-NEXT:    ret
            <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2, <vscale x 2 x double> %zm3, <vscale x 2 x double> %zm4) {
  call void @llvm.aarch64.sme.fmla.vg1x4.nxv2f64(i32 %slice,
            <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
            <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2, <vscale x 2 x double> %zm3, <vscale x 2 x double> %zm4)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmla.vg1x4.nxv2f64(i32 %slice.7,
            <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
            <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2, <vscale x 2 x double> %zm3, <vscale x 2 x double> %zm4)
  ret void
}

; Test to ensure the correct register class is used (first register in the list should be a multiple of 4)
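; As above, but the vgx4 groups are re-formed at z28-z31 and z24-z27.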
define void @multi_vector_add_vg1x4_s_regclass(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
; CHECK-LABEL: multi_vector_add_vg1x4_s_regclass:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z26.d, z7.d
; CHECK-NEXT:    mov z30.d, z3.d
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    mov z25.d, z6.d
; CHECK-NEXT:    mov z29.d, z2.d
; CHECK-NEXT:    mov z24.d, z5.d
; CHECK-NEXT:    mov z28.d, z1.d
; CHECK-NEXT:    mov z27.d, z4.d
; CHECK-NEXT:    mov z31.d, z0.d
; CHECK-NEXT:    fmla za.s[w8, 0, vgx4], { z28.s - z31.s }, { z24.s - z27.s }
; CHECK-NEXT:    ret
            <vscale x 4 x float> %zm0, <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2, <vscale x 4 x float> %zm3) {
  call void @llvm.aarch64.sme.fmla.vg1x4.nxv4f32(i32 %slice,
            <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3, <vscale x 4 x float> %zn0,
            <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2, <vscale x 4 x float> %zm3, <vscale x 4 x float> %zm0)
  ret void
}

; FMLS (MULTI)
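; Multiply-subtract counterparts of the multi-vector FMLA tests above.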

define void @multi_vector_sub_vg1x2_s(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
; CHECK-LABEL: multi_vector_sub_vg1x2_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    fmls za.s[w8, 0, vgx2], { z0.s, z1.s }, { z2.s, z3.s }
; CHECK-NEXT:    fmls za.s[w8, 7, vgx2], { z0.s, z1.s }, { z2.s, z3.s }
; CHECK-NEXT:    ret
            <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2) {
  call void @llvm.aarch64.sme.fmls.vg1x2.nxv4f32(i32 %slice,
            <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
            <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmls.vg1x2.nxv4f32(i32 %slice.7,
            <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
            <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2)
  ret void
}

define void @multi_vector_sub_vg1x2_d(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
; CHECK-LABEL: multi_vector_sub_vg1x2_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    fmls za.d[w8, 0, vgx2], { z0.d, z1.d }, { z2.d, z3.d }
; CHECK-NEXT:    fmls za.d[w8, 7, vgx2], { z0.d, z1.d }, { z2.d, z3.d }
; CHECK-NEXT:    ret
            <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2) {
  call void @llvm.aarch64.sme.fmls.vg1x2.nxv2f64(i32 %slice,
            <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
            <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmls.vg1x2.nxv2f64(i32 %slice.7,
            <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
            <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2)
  ret void
}

define void @multi_vector_sub_vg1x4_s(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
; CHECK-LABEL: multi_vector_sub_vg1x4_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    fmls za.s[w8, 0, vgx4], { z0.s - z3.s }, { z4.s - z7.s }
; CHECK-NEXT:    fmls za.s[w8, 7, vgx4], { z0.s - z3.s }, { z4.s - z7.s }
; CHECK-NEXT:    ret
            <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2, <vscale x 4 x float> %zm3, <vscale x 4 x float> %zm4) {
  call void @llvm.aarch64.sme.fmls.vg1x4.nxv4f32(i32 %slice,
            <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
            <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2, <vscale x 4 x float> %zm3, <vscale x 4 x float> %zm4)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmls.vg1x4.nxv4f32(i32 %slice.7,
            <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
            <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2, <vscale x 4 x float> %zm3, <vscale x 4 x float> %zm4)
  ret void
}

define void @multi_vector_sub_vg1x4_d(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
; CHECK-LABEL: multi_vector_sub_vg1x4_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    fmls za.d[w8, 0, vgx4], { z0.d - z3.d }, { z4.d - z7.d }
; CHECK-NEXT:    fmls za.d[w8, 7, vgx4], { z0.d - z3.d }, { z4.d - z7.d }
; CHECK-NEXT:    ret
            <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2, <vscale x 2 x double> %zm3, <vscale x 2 x double> %zm4) {
  call void @llvm.aarch64.sme.fmls.vg1x4.nxv2f64(i32 %slice,
            <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
            <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2, <vscale x 2 x double> %zm3, <vscale x 2 x double> %zm4)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmls.vg1x4.nxv2f64(i32 %slice.7,
            <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
            <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2, <vscale x 2 x double> %zm3, <vscale x 2 x double> %zm4)
  ret void
}

; FMLA (INDEXED)
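; The indexed forms multiply each vector in the group by a single lane of Zm.
; Lane indices cover 0-3 for .s elements and 0-1 for .d elements, so the
; tests below use the maximum index (i32 3 and i32 1 respectively).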

define void @multi_vector_add_lane_vg1x2_s(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zm) {
; CHECK-LABEL: multi_vector_add_lane_vg1x2_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    fmla za.s[w8, 0, vgx2], { z0.s, z1.s }, z2.s[3]
; CHECK-NEXT:    fmla za.s[w8, 7, vgx2], { z0.s, z1.s }, z2.s[3]
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv4f32(i32 %slice,
            <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
            <vscale x 4 x float> %zm, i32 3)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv4f32(i32 %slice.7,
            <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
            <vscale x 4 x float> %zm, i32 3)
  ret void
}

define void @multi_vector_add_lane_vg1x2_d(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zm) {
; CHECK-LABEL: multi_vector_add_lane_vg1x2_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    fmla za.d[w8, 0, vgx2], { z0.d, z1.d }, z2.d[1]
; CHECK-NEXT:    fmla za.d[w8, 7, vgx2], { z0.d, z1.d }, z2.d[1]
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv2f64(i32 %slice,
            <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
            <vscale x 2 x double> %zm, i32 1)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv2f64(i32 %slice.7,
            <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
            <vscale x 2 x double> %zm, i32 1)
  ret void
}

; Test to ensure the correct register class is used (first register in the list should be a multiple of 2)
define void @multi_vector_add_lane_vg1x2_s_regclass(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zm) {
; CHECK-LABEL: multi_vector_add_lane_vg1x2_s_regclass:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z4.d, z1.d
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    mov z5.d, z0.d
; CHECK-NEXT:    fmla za.s[w8, 0, vgx2], { z4.s, z5.s }, z2.s[3]
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv4f32(i32 %slice,
            <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn0,
            <vscale x 4 x float> %zm, i32 3)
  ret void
}

define void @multi_vector_add_lane_vg1x4_s(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
; CHECK-LABEL: multi_vector_add_lane_vg1x4_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    fmla za.s[w8, 0, vgx4], { z0.s - z3.s }, z4.s[3]
; CHECK-NEXT:    fmla za.s[w8, 7, vgx4], { z0.s - z3.s }, z4.s[3]
; CHECK-NEXT:    ret
            <vscale x 4 x float> %zm) {
  call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv4f32(i32 %slice,
            <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
            <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
            <vscale x 4 x float> %zm, i32 3)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv4f32(i32 %slice.7,
            <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
            <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
            <vscale x 4 x float> %zm, i32 3)
  ret void
}

define void @multi_vector_add_lane_vg1x4_d(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
; CHECK-LABEL: multi_vector_add_lane_vg1x4_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    fmla za.d[w8, 0, vgx4], { z0.d - z3.d }, z4.d[1]
; CHECK-NEXT:    fmla za.d[w8, 7, vgx4], { z0.d - z3.d }, z4.d[1]
; CHECK-NEXT:    ret
            <vscale x 2 x double> %zm) {
  call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv2f64(i32 %slice,
            <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
            <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
            <vscale x 2 x double> %zm, i32 1)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv2f64(i32 %slice.7,
            <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
            <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
            <vscale x 2 x double> %zm, i32 1)
  ret void
}

; Test to ensure the correct register class is used (first register in the list should be a multiple of 4)
define void @multi_vector_add_lane_vg1x4_s_regclass(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
; CHECK-LABEL: multi_vector_add_lane_vg1x4_s_regclass:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z26.d, z3.d
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    mov z25.d, z2.d
; CHECK-NEXT:    mov z24.d, z1.d
; CHECK-NEXT:    mov z27.d, z0.d
; CHECK-NEXT:    fmla za.s[w8, 0, vgx4], { z24.s - z27.s }, z4.s[3]
; CHECK-NEXT:    ret
            <vscale x 4 x float> %zm) {
  call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv4f32(i32 %slice,
            <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2,
            <vscale x 4 x float> %zn3, <vscale x 4 x float> %zn0,
            <vscale x 4 x float> %zm, i32 3)
  ret void
}

; FMLS (INDEXED)
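; Multiply-subtract counterparts of the indexed FMLA tests above.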

define void @multi_vector_sub_lane_vg1x2_s(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zm) {
; CHECK-LABEL: multi_vector_sub_lane_vg1x2_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    fmls za.s[w8, 0, vgx2], { z0.s, z1.s }, z2.s[3]
; CHECK-NEXT:    fmls za.s[w8, 7, vgx2], { z0.s, z1.s }, z2.s[3]
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv4f32(i32 %slice,
            <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
            <vscale x 4 x float> %zm, i32 3)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv4f32(i32 %slice.7,
            <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
            <vscale x 4 x float> %zm, i32 3)
  ret void
}

define void @multi_vector_sub_lane_vg1x2_d(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zm) {
; CHECK-LABEL: multi_vector_sub_lane_vg1x2_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    fmls za.d[w8, 0, vgx2], { z0.d, z1.d }, z2.d[1]
; CHECK-NEXT:    fmls za.d[w8, 7, vgx2], { z0.d, z1.d }, z2.d[1]
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv2f64(i32 %slice,
            <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
            <vscale x 2 x double> %zm, i32 1)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv2f64(i32 %slice.7,
            <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
            <vscale x 2 x double> %zm, i32 1)
  ret void
}

define void @multi_vector_sub_lane_vg1x4_s(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
; CHECK-LABEL: multi_vector_sub_lane_vg1x4_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    fmls za.s[w8, 0, vgx4], { z0.s - z3.s }, z4.s[3]
; CHECK-NEXT:    fmls za.s[w8, 7, vgx4], { z0.s - z3.s }, z4.s[3]
; CHECK-NEXT:    ret
            <vscale x 4 x float> %zm) {
  call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv4f32(i32 %slice,
            <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
            <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
            <vscale x 4 x float> %zm, i32 3)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv4f32(i32 %slice.7,
            <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
            <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
            <vscale x 4 x float> %zm, i32 3)
  ret void
}

define void @multi_vector_sub_lane_vg1x4_d(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
; CHECK-LABEL: multi_vector_sub_lane_vg1x4_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    fmls za.d[w8, 0, vgx4], { z0.d - z3.d }, z4.d[1]
; CHECK-NEXT:    fmls za.d[w8, 7, vgx4], { z0.d - z3.d }, z4.d[1]
; CHECK-NEXT:    ret
            <vscale x 2 x double> %zm) {
  call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv2f64(i32 %slice,
            <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
            <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
            <vscale x 2 x double> %zm, i32 1)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv2f64(i32 %slice.7,
            <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
            <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
            <vscale x 2 x double> %zm, i32 1)
  ret void
}

declare void @llvm.aarch64.sme.fmla.single.vg1x2.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
declare void @llvm.aarch64.sme.fmla.single.vg1x2.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)
declare void @llvm.aarch64.sme.fmla.single.vg1x4.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
declare void @llvm.aarch64.sme.fmla.single.vg1x4.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)

declare void @llvm.aarch64.sme.fmls.single.vg1x2.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
declare void @llvm.aarch64.sme.fmls.single.vg1x2.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)
declare void @llvm.aarch64.sme.fmls.single.vg1x4.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
declare void @llvm.aarch64.sme.fmls.single.vg1x4.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)

declare void @llvm.aarch64.sme.fmla.vg1x2.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
declare void @llvm.aarch64.sme.fmla.vg1x2.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)
declare void @llvm.aarch64.sme.fmla.vg1x4.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>,
            <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
declare void @llvm.aarch64.sme.fmla.vg1x4.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>,
            <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)

declare void @llvm.aarch64.sme.fmls.vg1x2.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
declare void @llvm.aarch64.sme.fmls.vg1x2.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)
declare void @llvm.aarch64.sme.fmls.vg1x4.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>,
            <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
declare void @llvm.aarch64.sme.fmls.vg1x4.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>,
            <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)

declare void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, i32)
declare void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, i32)
declare void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, i32)
declare void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, i32)

declare void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, i32)
declare void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, i32)
declare void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, i32)
declare void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, i32)