; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -mattr=+sme-i16i64 -mattr=+sme-f64f64 -force-streaming -verify-machineinstrs < %s | FileCheck %s

;
; ADD Multi-Single x2
;

define void @multi_vector_add_write_single_za_vg1x2_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zm) {
; CHECK-LABEL: multi_vector_add_write_single_za_vg1x2_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    add za.s[w8, 0, vgx2], { z0.s, z1.s }, z2.s
; CHECK-NEXT:    add za.s[w8, 7, vgx2], { z0.s, z1.s }, z2.s
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv4i32(i32 %slice,
              <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
              <vscale x 4 x i32> %zm)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv4i32(i32 %slice.7,
              <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
              <vscale x 4 x i32> %zm)
  ret void
}

define void @multi_vector_add_write_single_za_vg1x2_i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zm) {
; CHECK-LABEL: multi_vector_add_write_single_za_vg1x2_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    add za.d[w8, 0, vgx2], { z0.d, z1.d }, z2.d
; CHECK-NEXT:    add za.d[w8, 7, vgx2], { z0.d, z1.d }, z2.d
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv2i64(i32 %slice,
              <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
              <vscale x 2 x i64> %zm)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv2i64(i32 %slice.7,
              <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
              <vscale x 2 x i64> %zm)
  ret void
}

;
; ADD Multi-Single x4
;

define void @multi_vector_add_write_single_za_vg1x4_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
; CHECK-LABEL: multi_vector_add_write_single_za_vg1x4_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    add za.s[w8, 0, vgx4], { z0.s - z3.s }, z4.s
; CHECK-NEXT:    add za.s[w8, 7, vgx4], { z0.s - z3.s }, z4.s
; CHECK-NEXT:    ret
              <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3,
              <vscale x 4 x i32> %zm) {
  call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv4i32(i32 %slice,
              <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
              <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3,
              <vscale x 4 x i32> %zm)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv4i32(i32 %slice.7,
              <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
              <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3,
              <vscale x 4 x i32> %zm)
  ret void
}

define void @multi_vector_add_write_single_za_vg1x4_i64(i32 %slice,
; CHECK-LABEL: multi_vector_add_write_single_za_vg1x4_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    add za.d[w8, 0, vgx4], { z0.d - z3.d }, z4.d
; CHECK-NEXT:    add za.d[w8, 7, vgx4], { z0.d - z3.d }, z4.d
; CHECK-NEXT:    ret
              <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
              <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3,
              <vscale x 2 x i64> %zm) {
  call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv2i64(i32 %slice,
              <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
              <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3,
              <vscale x 2 x i64> %zm)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv2i64(i32 %slice.7,
              <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
              <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3,
              <vscale x 2 x i64> %zm)
  ret void
}

;
; ADD Multi-Multi x2
;

define void @multi_vector_add_write_za_vg1x2_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
; CHECK-LABEL: multi_vector_add_write_za_vg1x2_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    add za.s[w8, 0, vgx2], { z0.s, z1.s }, { z2.s, z3.s }
; CHECK-NEXT:    add za.s[w8, 7, vgx2], { z0.s, z1.s }, { z2.s, z3.s }
; CHECK-NEXT:    ret
              <vscale x 4 x i32> %zm1, <vscale x 4 x i32> %zm2) {
  call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv4i32(i32 %slice,
              <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
              <vscale x 4 x i32> %zm1, <vscale x 4 x i32> %zm2)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv4i32(i32 %slice.7,
              <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
              <vscale x 4 x i32> %zm1, <vscale x 4 x i32> %zm2)
  ret void
}


define void @multi_vector_add_write_za_vg1x2_i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
; CHECK-LABEL: multi_vector_add_write_za_vg1x2_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    add za.d[w8, 0, vgx2], { z0.d, z1.d }, { z2.d, z3.d }
; CHECK-NEXT:    add za.d[w8, 7, vgx2], { z0.d, z1.d }, { z2.d, z3.d }
; CHECK-NEXT:    ret
              <vscale x 2 x i64> %zm1, <vscale x 2 x i64> %zm2) {
  call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv2i64(i32 %slice,
              <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
              <vscale x 2 x i64> %zm1, <vscale x 2 x i64> %zm2)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv2i64(i32 %slice.7,
              <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
              <vscale x 2 x i64> %zm1, <vscale x 2 x i64> %zm2)
  ret void
}


;
; ADD Multi-Multi x4
;

define void @multi_vector_add_write_za_vg1x4_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
; CHECK-LABEL: multi_vector_add_write_za_vg1x4_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    add za.s[w8, 0, vgx4], { z0.s - z3.s }, { z4.s - z7.s }
; CHECK-NEXT:    add za.s[w8, 7, vgx4], { z0.s - z3.s }, { z4.s - z7.s }
; CHECK-NEXT:    ret
              <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3,
              <vscale x 4 x i32> %zm0, <vscale x 4 x i32> %zm1,
              <vscale x 4 x i32> %zm2, <vscale x 4 x i32> %zm3) {
  call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv4i32(i32 %slice,
              <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
              <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3,
              <vscale x 4 x i32> %zm0, <vscale x 4 x i32> %zm1,
              <vscale x 4 x i32> %zm2, <vscale x 4 x i32> %zm3)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv4i32(i32 %slice.7,
              <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
              <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3,
              <vscale x 4 x i32> %zm0, <vscale x 4 x i32> %zm1,
              <vscale x 4 x i32> %zm2, <vscale x 4 x i32> %zm3)
  ret void
}

define void @multi_vector_add_write_za_vg1x4_i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
; CHECK-LABEL: multi_vector_add_write_za_vg1x4_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    add za.d[w8, 0, vgx4], { z0.d - z3.d }, { z4.d - z7.d }
; CHECK-NEXT:    add za.d[w8, 7, vgx4], { z0.d - z3.d }, { z4.d - z7.d }
; CHECK-NEXT:    ret
              <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3,
              <vscale x 2 x i64> %zm0, <vscale x 2 x i64> %zm1,
              <vscale x 2 x i64> %zm2, <vscale x 2 x i64> %zm3) {
  call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv2i64(i32 %slice,
              <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
              <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3,
              <vscale x 2 x i64> %zm0, <vscale x 2 x i64> %zm1,
              <vscale x 2 x i64> %zm2, <vscale x 2 x i64> %zm3)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv2i64(i32 %slice.7,
              <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
              <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3,
              <vscale x 2 x i64> %zm0, <vscale x 2 x i64> %zm1,
              <vscale x 2 x i64> %zm2, <vscale x 2 x i64> %zm3)
  ret void
}

;
; ADD and accumulate into ZA
;
; x2
define void @multi_vector_add_za_vg1x2_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1) {
; CHECK-LABEL: multi_vector_add_za_vg1x2_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    add za.s[w8, 0, vgx2], { z0.s, z1.s }
; CHECK-NEXT:    add za.s[w8, 7, vgx2], { z0.s, z1.s }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4i32(i32 %slice.7, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1)
  ret void
}

define void @multi_vector_add_za_vg1x2_i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1) {
; CHECK-LABEL: multi_vector_add_za_vg1x2_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    add za.d[w8, 0, vgx2], { z0.d, z1.d }
; CHECK-NEXT:    add za.d[w8, 7, vgx2], { z0.d, z1.d }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2i64(i32 %slice.7, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1)
  ret void
}

define void @multi_vector_add_za_vg1x2_f32(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1) {
; CHECK-LABEL: multi_vector_add_za_vg1x2_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    fadd za.s[w8, 0, vgx2], { z0.s, z1.s }
; CHECK-NEXT:    fadd za.s[w8, 7, vgx2], { z0.s, z1.s }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4f32(i32 %slice,
              <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4f32(i32 %slice.7,
              <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1)
  ret void
}

define void @multi_vector_add_za_vg1x2_f64(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1) {
; CHECK-LABEL: multi_vector_add_za_vg1x2_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    fadd za.d[w8, 0, vgx2], { z0.d, z1.d }
; CHECK-NEXT:    fadd za.d[w8, 7, vgx2], { z0.d, z1.d }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2f64(i32 %slice,
              <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2f64(i32 %slice.7,
              <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1)
  ret void
}

; x4

define void @multi_vector_add_za_vg1x4_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3) {
; CHECK-LABEL: multi_vector_add_za_vg1x4_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    add za.s[w8, 0, vgx4], { z0.s - z3.s }
; CHECK-NEXT:    add za.s[w8, 7, vgx4], { z0.s - z3.s }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4i32(i32 %slice,
              <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
              <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4i32(i32 %slice.7,
              <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
              <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3)
  ret void
}

define void @multi_vector_add_za_vg1x4_i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3) {
; CHECK-LABEL: multi_vector_add_za_vg1x4_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    add za.d[w8, 0, vgx4], { z0.d - z3.d }
; CHECK-NEXT:    add za.d[w8, 7, vgx4], { z0.d - z3.d }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2i64(i32 %slice,
              <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
              <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2i64(i32 %slice.7,
              <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
              <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3)
  ret void
}

define void @multi_vector_add_za_vg1x4_f32(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3) {
; CHECK-LABEL: multi_vector_add_za_vg1x4_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    fadd za.s[w8, 0, vgx4], { z0.s - z3.s }
; CHECK-NEXT:    fadd za.s[w8, 7, vgx4], { z0.s - z3.s }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4f32(i32 %slice,
              <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
              <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4f32(i32 %slice.7,
              <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
              <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3)
  ret void
}

define void @multi_vector_add_za_vg1x4_f64(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3) {
; CHECK-LABEL: multi_vector_add_za_vg1x4_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    fadd za.d[w8, 0, vgx4], { z0.d - z3.d }
; CHECK-NEXT:    fadd za.d[w8, 7, vgx4], { z0.d - z3.d }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2f64(i32 %slice,
              <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
              <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2f64(i32 %slice.7,
              <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
              <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3)
  ret void
}

;
; ADD Vectors Multi-Single x2
;

define { <vscale x 16 x i8>, <vscale x 16 x i8> } @multi_vec_add_single_x2_s8(<vscale x 16 x i8> %unused, <vscale x 16 x i8> %zdn1, <vscale x 16 x i8> %zdn2, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: multi_vec_add_single_x2_s8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z5.d, z2.d
; CHECK-NEXT:    mov z4.d, z1.d
; CHECK-NEXT:    add { z4.b, z5.b }, { z4.b, z5.b }, z3.b
; CHECK-NEXT:    mov z0.d, z4.d
; CHECK-NEXT:    mov z1.d, z5.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8> }
              @llvm.aarch64.sve.add.single.x2.nxv16i8(<vscale x 16 x i8> %zdn1, <vscale x 16 x i8> %zdn2,
              <vscale x 16 x i8> %zm)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 8 x i16>, <vscale x 8 x i16> } @multi_vec_add_single_x2_s16(<vscale x 8 x i16> %unused, <vscale x 8 x i16> %zdn1, <vscale x 8 x i16> %zdn2, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vec_add_single_x2_s16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z5.d, z2.d
; CHECK-NEXT:    mov z4.d, z1.d
; CHECK-NEXT:    add { z4.h, z5.h }, { z4.h, z5.h }, z3.h
; CHECK-NEXT:    mov z0.d, z4.d
; CHECK-NEXT:    mov z1.d, z5.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16> }
              @llvm.aarch64.sve.add.single.x2.nxv8i16(<vscale x 8 x i16> %zdn1, <vscale x 8 x i16> %zdn2,
              <vscale x 8 x i16> %zm)
  ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %res
}

define { <vscale x 4 x i32>, <vscale x 4 x i32> } @multi_vec_add_single_x2_s32(<vscale x 4 x i32> %unused, <vscale x 4 x i32> %zdn1, <vscale x 4 x i32> %zdn2, <vscale x 4 x i32> %zm) {
; CHECK-LABEL: multi_vec_add_single_x2_s32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z5.d, z2.d
; CHECK-NEXT:    mov z4.d, z1.d
; CHECK-NEXT:    add { z4.s, z5.s }, { z4.s, z5.s }, z3.s
; CHECK-NEXT:    mov z0.d, z4.d
; CHECK-NEXT:    mov z1.d, z5.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32> }
              @llvm.aarch64.sve.add.single.x2.nxv4i32(<vscale x 4 x i32> %zdn1, <vscale x 4 x i32> %zdn2,
              <vscale x 4 x i32> %zm)
  ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %res
}

define { <vscale x 2 x i64>, <vscale x 2 x i64> } @multi_vec_add_single_x2_s64(<vscale x 2 x i64> %unused, <vscale x 2 x i64> %zdn1, <vscale x 2 x i64> %zdn2, <vscale x 2 x i64> %zm) {
; CHECK-LABEL: multi_vec_add_single_x2_s64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z5.d, z2.d
; CHECK-NEXT:    mov z4.d, z1.d
; CHECK-NEXT:    add { z4.d, z5.d }, { z4.d, z5.d }, z3.d
; CHECK-NEXT:    mov z0.d, z4.d
; CHECK-NEXT:    mov z1.d, z5.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64> }
              @llvm.aarch64.sve.add.single.x2.nxv2i64(<vscale x 2 x i64> %zdn1, <vscale x 2 x i64> %zdn2,
              <vscale x 2 x i64> %zm)
  ret { <vscale x 2 x i64>, <vscale x 2 x i64> } %res
}

;
; ADD Vectors Multi-Single x4
;

define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @multi_vec_add_single_x4_s8(<vscale x 16 x i8> %unused, <vscale x 16 x i8> %zdn1, <vscale x 16 x i8> %zdn2, <vscale x 16 x i8> %zdn3, <vscale x 16 x i8> %zdn4, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: multi_vec_add_single_x4_s8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z27.d, z4.d
; CHECK-NEXT:    mov z26.d, z3.d
; CHECK-NEXT:    mov z25.d, z2.d
; CHECK-NEXT:    mov z24.d, z1.d
; CHECK-NEXT:    add { z24.b - z27.b }, { z24.b - z27.b }, z5.b
; CHECK-NEXT:    mov z0.d, z24.d
; CHECK-NEXT:    mov z1.d, z25.d
; CHECK-NEXT:    mov z2.d, z26.d
; CHECK-NEXT:    mov z3.d, z27.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> }
              @llvm.aarch64.sve.add.single.x4.nxv16i8(<vscale x 16 x i8> %zdn1, <vscale x 16 x i8> %zdn2,
              <vscale x 16 x i8> %zdn3, <vscale x 16 x i8> %zdn4,
              <vscale x 16 x i8> %zm)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @multi_vec_add_x4_single_s16(<vscale x 8 x i16> %unused, <vscale x 8 x i16> %zdn1, <vscale x 8 x i16> %zdn2, <vscale x 8 x i16> %zdn3, <vscale x 8 x i16> %zdn4, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vec_add_x4_single_s16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z27.d, z4.d
; CHECK-NEXT:    mov z26.d, z3.d
; CHECK-NEXT:    mov z25.d, z2.d
; CHECK-NEXT:    mov z24.d, z1.d
; CHECK-NEXT:    add { z24.h - z27.h }, { z24.h - z27.h }, z5.h
; CHECK-NEXT:    mov z0.d, z24.d
; CHECK-NEXT:    mov z1.d, z25.d
; CHECK-NEXT:    mov z2.d, z26.d
; CHECK-NEXT:    mov z3.d, z27.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> }
              @llvm.aarch64.sve.add.single.x4.nxv8i16(<vscale x 8 x i16> %zdn1, <vscale x 8 x i16> %zdn2,
              <vscale x 8 x i16> %zdn3, <vscale x 8 x i16> %zdn4,
              <vscale x 8 x i16> %zm)
  ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res
}

define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @multi_vec_add_x4_single_s32(<vscale x 4 x i32> %unused, <vscale x 4 x i32> %zdn1, <vscale x 4 x i32> %zdn2, <vscale x 4 x i32> %zdn3, <vscale x 4 x i32> %zdn4, <vscale x 4 x i32> %zm) {
; CHECK-LABEL: multi_vec_add_x4_single_s32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z27.d, z4.d
; CHECK-NEXT:    mov z26.d, z3.d
; CHECK-NEXT:    mov z25.d, z2.d
; CHECK-NEXT:    mov z24.d, z1.d
; CHECK-NEXT:    add { z24.s - z27.s }, { z24.s - z27.s }, z5.s
; CHECK-NEXT:    mov z0.d, z24.d
; CHECK-NEXT:    mov z1.d, z25.d
; CHECK-NEXT:    mov z2.d, z26.d
; CHECK-NEXT:    mov z3.d, z27.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> }
              @llvm.aarch64.sve.add.single.x4.nxv4i32(<vscale x 4 x i32> %zdn1, <vscale x 4 x i32> %zdn2,
              <vscale x 4 x i32> %zdn3, <vscale x 4 x i32> %zdn4,
              <vscale x 4 x i32> %zm)
  ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res
}

define { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @multi_vec_add_x4_single_s64(<vscale x 2 x i64> %unused, <vscale x 2 x i64> %zdn1, <vscale x 2 x i64> %zdn2, <vscale x 2 x i64> %zdn3, <vscale x 2 x i64> %zdn4, <vscale x 2 x i64> %zm) {
; CHECK-LABEL: multi_vec_add_x4_single_s64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z27.d, z4.d
; CHECK-NEXT:    mov z26.d, z3.d
; CHECK-NEXT:    mov z25.d, z2.d
; CHECK-NEXT:    mov z24.d, z1.d
; CHECK-NEXT:    add { z24.d - z27.d }, { z24.d - z27.d }, z5.d
; CHECK-NEXT:    mov z0.d, z24.d
; CHECK-NEXT:    mov z1.d, z25.d
; CHECK-NEXT:    mov z2.d, z26.d
; CHECK-NEXT:    mov z3.d, z27.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> }
              @llvm.aarch64.sve.add.single.x4.nxv2i64(<vscale x 2 x i64> %zdn1, <vscale x 2 x i64> %zdn2,
              <vscale x 2 x i64> %zdn3, <vscale x 2 x i64> %zdn4,
              <vscale x 2 x i64> %zm)
  ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res
}
declare void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv4i32(i32, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv2i64(i32, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
declare void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv4i32(i32, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv2i64(i32, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
declare void @llvm.aarch64.sme.add.write.za.vg1x2.nxv4i32(i32, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare void @llvm.aarch64.sme.add.write.za.vg1x2.nxv2i64(i32, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
declare void @llvm.aarch64.sme.add.write.za.vg1x4.nxv4i32(i32, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare void @llvm.aarch64.sme.add.write.za.vg1x4.nxv2i64(i32, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
declare void @llvm.aarch64.sme.add.za32.vg1x2.nxv4i32(i32, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare void @llvm.aarch64.sme.add.za64.vg1x2.nxv2i64(i32, <vscale x 2 x i64>, <vscale x 2 x i64>)
declare void @llvm.aarch64.sme.add.za32.vg1x4.nxv4i32(i32, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare void @llvm.aarch64.sme.add.za64.vg1x4.nxv2i64(i32, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
declare void @llvm.aarch64.sme.add.za32.vg1x2.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>)
declare void @llvm.aarch64.sme.add.za64.vg1x2.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>)
declare void @llvm.aarch64.sme.add.za32.vg1x4.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
declare void @llvm.aarch64.sme.add.za64.vg1x4.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)
declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.add.single.x2.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.add.single.x2.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.add.single.x2.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.add.single.x2.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
declare { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.add.single.x4.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.add.single.x4.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.add.single.x4.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.add.single.x4.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)