1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -force-streaming -enable-subreg-liveness -verify-machineinstrs < %s | FileCheck %s 3 4target triple="aarch64-linux-gnu" 5 6 7; == Multi, multi (unsigned) == 8 9define void @udot_multi_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3) #0 { 10; CHECK-LABEL: udot_multi_za32_u16_vg1x2: 11; CHECK: // %bb.0: 12; CHECK-NEXT: mov z5.d, z4.d 13; CHECK-NEXT: mov z7.d, z2.d 14; CHECK-NEXT: mov w8, w0 15; CHECK-NEXT: mov z4.d, z3.d 16; CHECK-NEXT: mov z6.d, z1.d 17; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z6.h, z7.h }, { z4.h, z5.h } 18; CHECK-NEXT: udot za.s[w8, 7, vgx2], { z6.h, z7.h }, { z4.h, z5.h } 19; CHECK-NEXT: ret 20 call void @llvm.aarch64.sme.udot.za32.vg1x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3) 21 %slice2 = add i32 %slice, 7 22 call void @llvm.aarch64.sme.udot.za32.vg1x2.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3) 23 ret void 24} 25 26define void @udot_multi_za32_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, 27; CHECK-LABEL: udot_multi_za32_u16_vg1x4: 28; CHECK: // %bb.0: 29; CHECK-NEXT: ptrue p0.h 30; CHECK-NEXT: mov z26.d, z7.d 31; CHECK-NEXT: mov z25.d, z6.d 32; CHECK-NEXT: mov z7.d, z4.d 33; CHECK-NEXT: mov w8, w0 34; CHECK-NEXT: mov z24.d, z5.d 35; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1] 36; CHECK-NEXT: mov z6.d, z3.d 37; CHECK-NEXT: mov z5.d, z2.d 38; CHECK-NEXT: mov z4.d, z1.d 39; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z4.h - z7.h }, { z24.h - z27.h } 40; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z4.h - z7.h }, { z24.h - z27.h } 41; CHECK-NEXT: ret 42 <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zn5, <vscale x 8 x i16> %zn6, <vscale x 8 x i16> %zn7) #0 { 43 call void @llvm.aarch64.sme.udot.za32.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, 44 <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zn5, <vscale x 8 x i16> %zn6, <vscale x 8 x i16> %zn7) 45 %slice2 = add i32 %slice, 7 46 call void @llvm.aarch64.sme.udot.za32.vg1x4.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, 47 <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zn5, <vscale x 8 x i16> %zn6, <vscale x 8 x i16> %zn7) 48 ret void 49} 50 51define void @udot_multi_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3) #0 { 52; CHECK-LABEL: udot_multi_za32_u8_vg1x2: 53; CHECK: // %bb.0: 54; CHECK-NEXT: mov z5.d, z4.d 55; CHECK-NEXT: mov z7.d, z2.d 56; CHECK-NEXT: mov w8, w0 57; CHECK-NEXT: mov z4.d, z3.d 58; CHECK-NEXT: mov z6.d, z1.d 59; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z6.b, z7.b }, { z4.b, z5.b } 60; CHECK-NEXT: udot za.s[w8, 7, vgx2], { z6.b, z7.b }, { z4.b, z5.b } 61; CHECK-NEXT: ret 62 call void @llvm.aarch64.sme.udot.za32.vg1x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3) 63 %slice2 = add i32 %slice, 7 64 call void @llvm.aarch64.sme.udot.za32.vg1x2.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> 
%zn3) 65 ret void 66} 67 68define void @udot_multi_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, 69; CHECK-LABEL: udot_multi_za32_u8_vg1x4: 70; CHECK: // %bb.0: 71; CHECK-NEXT: ptrue p0.b 72; CHECK-NEXT: mov z26.d, z7.d 73; CHECK-NEXT: mov z25.d, z6.d 74; CHECK-NEXT: mov z7.d, z4.d 75; CHECK-NEXT: mov w8, w0 76; CHECK-NEXT: mov z24.d, z5.d 77; CHECK-NEXT: ld1b { z27.b }, p0/z, [x1] 78; CHECK-NEXT: mov z6.d, z3.d 79; CHECK-NEXT: mov z5.d, z2.d 80; CHECK-NEXT: mov z4.d, z1.d 81; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z4.b - z7.b }, { z24.b - z27.b } 82; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z4.b - z7.b }, { z24.b - z27.b } 83; CHECK-NEXT: ret 84 <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zn5, <vscale x 16 x i8> %zn6, <vscale x 16 x i8> %zn7) #0 { 85 call void @llvm.aarch64.sme.udot.za32.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, 86 <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zn5, <vscale x 16 x i8> %zn6, <vscale x 16 x i8> %zn7) 87 %slice2 = add i32 %slice, 7 88 call void @llvm.aarch64.sme.udot.za32.vg1x4.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, 89 <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zn5, <vscale x 16 x i8> %zn6, <vscale x 16 x i8> %zn7) 90 ret void 91} 92 93define void @udot_multi_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3) #1 { 94; CHECK-LABEL: udot_multi_za64_u16_vg1x2: 95; CHECK: // %bb.0: 96; CHECK-NEXT: mov z5.d, z4.d 97; CHECK-NEXT: mov z7.d, z2.d 98; CHECK-NEXT: mov w8, w0 99; CHECK-NEXT: mov z4.d, z3.d 100; CHECK-NEXT: mov z6.d, z1.d 101; CHECK-NEXT: udot za.d[w8, 0, vgx2], { z6.h, z7.h }, { z4.h, z5.h } 102; CHECK-NEXT: udot za.d[w8, 7, vgx2], { z6.h, z7.h }, { z4.h, z5.h } 103; CHECK-NEXT: ret 104 call void @llvm.aarch64.sme.udot.za64.vg1x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3) 105 %slice2 = add i32 %slice, 7 106 call void @llvm.aarch64.sme.udot.za64.vg1x2.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3) 107 ret void 108} 109 110define void @udot_multi_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, 111; CHECK-LABEL: udot_multi_za64_u16_vg1x4: 112; CHECK: // %bb.0: 113; CHECK-NEXT: ptrue p0.h 114; CHECK-NEXT: mov z26.d, z7.d 115; CHECK-NEXT: mov z25.d, z6.d 116; CHECK-NEXT: mov z7.d, z4.d 117; CHECK-NEXT: mov w8, w0 118; CHECK-NEXT: mov z24.d, z5.d 119; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1] 120; CHECK-NEXT: mov z6.d, z3.d 121; CHECK-NEXT: mov z5.d, z2.d 122; CHECK-NEXT: mov z4.d, z1.d 123; CHECK-NEXT: udot za.d[w8, 0, vgx4], { z4.h - z7.h }, { z24.h - z27.h } 124; CHECK-NEXT: udot za.d[w8, 7, vgx4], { z4.h - z7.h }, { z24.h - z27.h } 125; CHECK-NEXT: ret 126 <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zn5, <vscale x 8 x i16> %zn6, <vscale x 8 x i16> %zn7) #1 { 127 call void @llvm.aarch64.sme.udot.za64.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, 128 <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zn5, <vscale x 8 x i16> %zn6, <vscale x 8 x i16> %zn7) 129 
%slice2 = add i32 %slice, 7 130 call void @llvm.aarch64.sme.udot.za64.vg1x4.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, 131 <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zn5, <vscale x 8 x i16> %zn6, <vscale x 8 x i16> %zn7) 132 ret void 133} 134 135define void @usdot_multi_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3) #0 { 136; CHECK-LABEL: usdot_multi_za32_u8_vg1x2: 137; CHECK: // %bb.0: 138; CHECK-NEXT: mov z5.d, z4.d 139; CHECK-NEXT: mov z7.d, z2.d 140; CHECK-NEXT: mov w8, w0 141; CHECK-NEXT: mov z4.d, z3.d 142; CHECK-NEXT: mov z6.d, z1.d 143; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z6.b, z7.b }, { z4.b, z5.b } 144; CHECK-NEXT: usdot za.s[w8, 7, vgx2], { z6.b, z7.b }, { z4.b, z5.b } 145; CHECK-NEXT: ret 146 call void @llvm.aarch64.sme.usdot.za32.vg1x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3) 147 %slice2 = add i32 %slice, 7 148 call void @llvm.aarch64.sme.usdot.za32.vg1x2.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3) 149 ret void 150} 151 152define void @usdot_multi_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, 153; CHECK-LABEL: usdot_multi_za32_u8_vg1x4: 154; CHECK: // %bb.0: 155; CHECK-NEXT: ptrue p0.b 156; CHECK-NEXT: mov z26.d, z7.d 157; CHECK-NEXT: mov z25.d, z6.d 158; CHECK-NEXT: mov z7.d, z4.d 159; CHECK-NEXT: mov w8, w0 160; CHECK-NEXT: mov z24.d, z5.d 161; CHECK-NEXT: ld1b { z27.b }, p0/z, [x1] 162; CHECK-NEXT: mov z6.d, z3.d 163; CHECK-NEXT: mov z5.d, z2.d 164; CHECK-NEXT: mov z4.d, z1.d 165; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z4.b - z7.b }, { z24.b - z27.b } 166; CHECK-NEXT: usdot za.s[w8, 7, vgx4], { z4.b - z7.b }, { z24.b - z27.b } 167; CHECK-NEXT: ret 168 <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zn5, <vscale x 16 x i8> %zn6, <vscale x 16 x i8> %zn7) #0 { 169 call void @llvm.aarch64.sme.usdot.za32.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, 170 <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zn5, <vscale x 16 x i8> %zn6, <vscale x 16 x i8> %zn7) 171 %slice2 = add i32 %slice, 7 172 call void @llvm.aarch64.sme.usdot.za32.vg1x4.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, 173 <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zn5, <vscale x 16 x i8> %zn6, <vscale x 16 x i8> %zn7) 174 ret void 175} 176 177 178; == Multi, multi (signed) == 179 180define void @sdot_multi_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3) #0 { 181; CHECK-LABEL: sdot_multi_za32_u16_vg1x2: 182; CHECK: // %bb.0: 183; CHECK-NEXT: mov z5.d, z4.d 184; CHECK-NEXT: mov z7.d, z2.d 185; CHECK-NEXT: mov w8, w0 186; CHECK-NEXT: mov z4.d, z3.d 187; CHECK-NEXT: mov z6.d, z1.d 188; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z6.h, z7.h }, { z4.h, z5.h } 189; CHECK-NEXT: sdot za.s[w8, 7, vgx2], { z6.h, z7.h }, { z4.h, z5.h } 190; CHECK-NEXT: ret 191 call void @llvm.aarch64.sme.sdot.za32.vg1x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3) 192 %slice2 = 
add i32 %slice, 7 193 call void @llvm.aarch64.sme.sdot.za32.vg1x2.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3) 194 ret void 195} 196 197define void @sdot_multi_za32_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, 198; CHECK-LABEL: sdot_multi_za32_u16_vg1x4: 199; CHECK: // %bb.0: 200; CHECK-NEXT: ptrue p0.h 201; CHECK-NEXT: mov z26.d, z7.d 202; CHECK-NEXT: mov z25.d, z6.d 203; CHECK-NEXT: mov z7.d, z4.d 204; CHECK-NEXT: mov w8, w0 205; CHECK-NEXT: mov z24.d, z5.d 206; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1] 207; CHECK-NEXT: mov z6.d, z3.d 208; CHECK-NEXT: mov z5.d, z2.d 209; CHECK-NEXT: mov z4.d, z1.d 210; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z4.h - z7.h }, { z24.h - z27.h } 211; CHECK-NEXT: sdot za.s[w8, 7, vgx4], { z4.h - z7.h }, { z24.h - z27.h } 212; CHECK-NEXT: ret 213 <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zn5, <vscale x 8 x i16> %zn6, <vscale x 8 x i16> %zn7) #0 { 214 call void @llvm.aarch64.sme.sdot.za32.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, 215 <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zn5, <vscale x 8 x i16> %zn6, <vscale x 8 x i16> %zn7) 216 %slice2 = add i32 %slice, 7 217 call void @llvm.aarch64.sme.sdot.za32.vg1x4.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, 218 <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zn5, <vscale x 8 x i16> %zn6, <vscale x 8 x i16> %zn7) 219 ret void 220} 221 222define void @sdot_multi_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3) #0 { 223; CHECK-LABEL: sdot_multi_za32_u8_vg1x2: 224; CHECK: // %bb.0: 225; CHECK-NEXT: mov z5.d, z4.d 226; CHECK-NEXT: mov z7.d, z2.d 227; CHECK-NEXT: mov w8, w0 228; CHECK-NEXT: mov z4.d, z3.d 229; CHECK-NEXT: mov z6.d, z1.d 230; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z6.b, z7.b }, { z4.b, z5.b } 231; CHECK-NEXT: sdot za.s[w8, 7, vgx2], { z6.b, z7.b }, { z4.b, z5.b } 232; CHECK-NEXT: ret 233 call void @llvm.aarch64.sme.sdot.za32.vg1x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3) 234 %slice2 = add i32 %slice, 7 235 call void @llvm.aarch64.sme.sdot.za32.vg1x2.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3) 236 ret void 237} 238 239define void @sdot_multi_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, 240; CHECK-LABEL: sdot_multi_za32_u8_vg1x4: 241; CHECK: // %bb.0: 242; CHECK-NEXT: ptrue p0.b 243; CHECK-NEXT: mov z26.d, z7.d 244; CHECK-NEXT: mov z25.d, z6.d 245; CHECK-NEXT: mov z7.d, z4.d 246; CHECK-NEXT: mov w8, w0 247; CHECK-NEXT: mov z24.d, z5.d 248; CHECK-NEXT: ld1b { z27.b }, p0/z, [x1] 249; CHECK-NEXT: mov z6.d, z3.d 250; CHECK-NEXT: mov z5.d, z2.d 251; CHECK-NEXT: mov z4.d, z1.d 252; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z4.b - z7.b }, { z24.b - z27.b } 253; CHECK-NEXT: sdot za.s[w8, 7, vgx4], { z4.b - z7.b }, { z24.b - z27.b } 254; CHECK-NEXT: ret 255 <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zn5, <vscale x 16 x i8> %zn6, <vscale x 16 x i8> %zn7) #0 { 256 call void @llvm.aarch64.sme.sdot.za32.vg1x4.nxv16i8(i32 %slice, 
<vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, 257 <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zn5, <vscale x 16 x i8> %zn6, <vscale x 16 x i8> %zn7) 258 %slice2 = add i32 %slice, 7 259 call void @llvm.aarch64.sme.sdot.za32.vg1x4.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, 260 <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zn5, <vscale x 16 x i8> %zn6, <vscale x 16 x i8> %zn7) 261 ret void 262} 263 264define void @sdot_multi_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3) #1 { 265; CHECK-LABEL: sdot_multi_za64_u16_vg1x2: 266; CHECK: // %bb.0: 267; CHECK-NEXT: mov z5.d, z4.d 268; CHECK-NEXT: mov z7.d, z2.d 269; CHECK-NEXT: mov w8, w0 270; CHECK-NEXT: mov z4.d, z3.d 271; CHECK-NEXT: mov z6.d, z1.d 272; CHECK-NEXT: sdot za.d[w8, 0, vgx2], { z6.h, z7.h }, { z4.h, z5.h } 273; CHECK-NEXT: sdot za.d[w8, 7, vgx2], { z6.h, z7.h }, { z4.h, z5.h } 274; CHECK-NEXT: ret 275 call void @llvm.aarch64.sme.sdot.za64.vg1x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3) 276 %slice2 = add i32 %slice, 7 277 call void @llvm.aarch64.sme.sdot.za64.vg1x2.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3) 278 ret void 279} 280 281define void @sdot_multi_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, 282; CHECK-LABEL: sdot_multi_za64_u16_vg1x4: 283; CHECK: // %bb.0: 284; CHECK-NEXT: ptrue p0.h 285; CHECK-NEXT: mov z26.d, z7.d 286; CHECK-NEXT: mov z25.d, z6.d 287; CHECK-NEXT: mov z7.d, z4.d 288; CHECK-NEXT: mov w8, w0 289; CHECK-NEXT: mov z24.d, z5.d 290; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1] 291; CHECK-NEXT: mov z6.d, z3.d 292; CHECK-NEXT: mov z5.d, z2.d 293; CHECK-NEXT: mov z4.d, z1.d 294; CHECK-NEXT: sdot za.d[w8, 0, vgx4], { z4.h - z7.h }, { z24.h - z27.h } 295; CHECK-NEXT: sdot za.d[w8, 7, vgx4], { z4.h - z7.h }, { z24.h - z27.h } 296; CHECK-NEXT: ret 297 <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zn5, <vscale x 8 x i16> %zn6, <vscale x 8 x i16> %zn7) #1 { 298 call void @llvm.aarch64.sme.sdot.za64.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, 299 <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zn5, <vscale x 8 x i16> %zn6, <vscale x 8 x i16> %zn7) 300 %slice2 = add i32 %slice, 7 301 call void @llvm.aarch64.sme.sdot.za64.vg1x4.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, 302 <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zn5, <vscale x 8 x i16> %zn6, <vscale x 8 x i16> %zn7) 303 ret void 304} 305 306 307; == Multi, single (unsigned) == 308 309define void @udot_single_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #0 { 310; CHECK-LABEL: udot_single_za32_u16_vg1x2: 311; CHECK: // %bb.0: 312; CHECK-NEXT: mov w8, w0 313; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z1.h, z2.h }, z3.h 314; CHECK-NEXT: udot za.s[w8, 7, vgx2], { z1.h, z2.h }, z3.h 315; CHECK-NEXT: ret 316 call void @llvm.aarch64.sme.udot.single.za32.vg1x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) 
317 %slice2 = add i32 %slice, 7 318 call void @llvm.aarch64.sme.udot.single.za32.vg1x2.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) 319 ret void 320} 321 322define void @udot_single_za32_u16_vg1x2_tuple(ptr %ptr, i64 %stride, <vscale x 8 x i16> %zn) #0 { 323; CHECK-LABEL: udot_single_za32_u16_vg1x2_tuple: 324; CHECK: // %bb.0: // %entry 325; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill 326; CHECK-NEXT: addvl sp, sp, #-3 327; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill 328; CHECK-NEXT: ptrue pn8.b 329; CHECK-NEXT: add x9, x0, x1 330; CHECK-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill 331; CHECK-NEXT: mov w8, wzr 332; CHECK-NEXT: str z9, [sp, #2, mul vl] // 16-byte Folded Spill 333; CHECK-NEXT: ld1h { z1.h, z9.h }, pn8/z, [x0] 334; CHECK-NEXT: ld1h { z2.h, z10.h }, pn8/z, [x9] 335; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z1.h, z2.h }, z0.h 336; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z9.h, z10.h }, z0.h 337; CHECK-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload 338; CHECK-NEXT: ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload 339; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload 340; CHECK-NEXT: addvl sp, sp, #3 341; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 342; CHECK-NEXT: ret 343entry: 344 %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() 345 %1 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %ptr) 346 %2 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 0 347 %3 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 1 348 %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride 349 %4 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx2) 350 %5 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 0 351 %6 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 1 352 call void @llvm.aarch64.sme.udot.single.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %2, <vscale x 8 x i16> %5, <vscale x 8 x i16> %zn) 353 call void @llvm.aarch64.sme.udot.single.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %3, <vscale x 8 x i16> %6, <vscale x 8 x i16> %zn) 354 ret void 355} 356 357define void @udot_single_za32_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) #0 { 358; CHECK-LABEL: udot_single_za32_u16_vg1x4: 359; CHECK: // %bb.0: 360; CHECK-NEXT: mov w8, w0 361; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z1.h - z4.h }, z5.h 362; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z1.h - z4.h }, z5.h 363; CHECK-NEXT: ret 364 call void @llvm.aarch64.sme.udot.single.za32.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) 365 %slice2 = add i32 %slice, 7 366 call void @llvm.aarch64.sme.udot.single.za32.vg1x4.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) 367 ret void 368} 369 370define void @udot_single_za32_u16_vg1x4_tuple(ptr %ptr, i64 %stride, <vscale x 8 x i16> %zn) #0 { 371; CHECK-LABEL: udot_single_za32_u16_vg1x4_tuple: 372; CHECK: // %bb.0: // %entry 373; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill 374; CHECK-NEXT: addvl sp, sp, #-11 375; CHECK-NEXT: add x9, x1, x1, lsl #1 376; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill 377; CHECK-NEXT: ptrue pn8.b 378; CHECK-NEXT: str z20, [sp, #1, mul vl] // 16-byte Folded Spill 379; CHECK-NEXT: add x10, x0, x1 380; CHECK-NEXT: mov w8, wzr 381; CHECK-NEXT: str z16, [sp, #2, mul vl] // 16-byte Folded Spill 382; CHECK-NEXT: add x9, x0, x9 383; CHECK-NEXT: str z15, [sp, #3, mul vl] // 16-byte Folded Spill 384; CHECK-NEXT: str z14, [sp, #4, mul vl] // 16-byte Folded Spill 385; CHECK-NEXT: str z13, [sp, #5, mul vl] // 16-byte Folded Spill 386; CHECK-NEXT: str z12, [sp, #6, mul vl] // 16-byte Folded Spill 387; CHECK-NEXT: str z11, [sp, #7, mul vl] // 16-byte Folded Spill 388; CHECK-NEXT: str z10, [sp, #8, mul vl] // 16-byte Folded Spill 389; CHECK-NEXT: str z9, [sp, #9, mul vl] // 16-byte Folded Spill 390; CHECK-NEXT: str z8, [sp, #10, mul vl] // 16-byte Folded Spill 391; CHECK-NEXT: ld1h { z1.h, z5.h, z9.h, z13.h }, pn8/z, [x0] 392; CHECK-NEXT: ld1h { z2.h, z6.h, z10.h, z14.h }, pn8/z, [x10] 393; CHECK-NEXT: ld1h { z3.h, z7.h, z11.h, z15.h }, pn8/z, [x0, x1, lsl #1] 394; CHECK-NEXT: ld1h { z16.h, z20.h, z24.h, z28.h }, pn8/z, [x9] 395; CHECK-NEXT: mov z4.d, z16.d 396; CHECK-NEXT: mov z8.d, z20.d 397; CHECK-NEXT: mov z12.d, z24.d 398; CHECK-NEXT: mov z16.d, z28.d 399; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z1.h - z4.h }, z0.h 400; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z5.h - z8.h }, z0.h 401; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z9.h - z12.h }, z0.h 402; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z13.h - z16.h }, z0.h 403; CHECK-NEXT: ldr z20, [sp, #1, mul vl] // 16-byte Folded Reload 404; CHECK-NEXT: ldr z16, [sp, #2, mul vl] // 16-byte Folded Reload 405; CHECK-NEXT: ldr z15, [sp, #3, mul vl] // 16-byte Folded Reload 406; CHECK-NEXT: ldr z14, [sp, #4, mul vl] // 16-byte Folded Reload 407; CHECK-NEXT: ldr z13, [sp, #5, mul vl] // 16-byte Folded Reload 408; CHECK-NEXT: ldr z12, [sp, #6, mul vl] // 16-byte Folded Reload 409; CHECK-NEXT: ldr z11, [sp, #7, mul vl] // 16-byte Folded Reload 410; CHECK-NEXT: ldr z10, [sp, #8, mul vl] // 16-byte Folded Reload 411; CHECK-NEXT: ldr z9, [sp, #9, mul vl] // 16-byte Folded Reload 412; CHECK-NEXT: ldr z8, [sp, #10, mul vl] // 16-byte Folded Reload 413; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload 414; CHECK-NEXT: addvl sp, sp, #11 415; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 416; CHECK-NEXT: ret 417entry: 418 %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() 419 %1 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") %0, ptr %ptr) 420 %2 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 0 421 %3 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 1 422 %4 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 2 423 %5 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 3 424 %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride 425 %6 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx2) 426 %7 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %6, 0 427 %8 = 
extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %6, 1 428 %9 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %6, 2 429 %10 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %6, 3 430 %mul3 = shl i64 %stride, 1 431 %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3 432 %11 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx4) 433 %12 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %11, 0 434 %13 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %11, 1 435 %14 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %11, 2 436 %15 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %11, 3 437 %mul5 = mul i64 %stride, 3 438 %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5 439 %16 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx6) 440 %17 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %16, 0 441 %18 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %16, 1 442 %19 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %16, 2 443 %20 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %16, 3 444 call void @llvm.aarch64.sme.udot.single.za32.vg1x4.nxv8i16(i32 0, <vscale x 8 x i16> %2, <vscale x 8 x i16> %7, <vscale x 8 x i16> %12, <vscale x 8 x i16> %17, <vscale x 8 x i16> %zn) 445 call void @llvm.aarch64.sme.udot.single.za32.vg1x4.nxv8i16(i32 0, <vscale x 8 x i16> %3, <vscale x 8 x i16> %8, <vscale x 8 x i16> %13, <vscale x 8 x i16> %18, <vscale x 8 x i16> %zn) 446 call void @llvm.aarch64.sme.udot.single.za32.vg1x4.nxv8i16(i32 0, <vscale x 8 x i16> %4, <vscale x 8 x i16> %9, <vscale x 8 x i16> %14, <vscale x 8 x i16> %19, <vscale x 8 x i16> %zn) 447 call void @llvm.aarch64.sme.udot.single.za32.vg1x4.nxv8i16(i32 0, <vscale x 8 x i16> %5, <vscale x 8 x i16> %10, <vscale x 8 x i16> %15, <vscale x 8 x i16> %20, <vscale x 8 x i16> %zn) 448 ret void 449} 450 451define void @udot_single_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) #0 { 452; CHECK-LABEL: udot_single_za32_u8_vg1x2: 453; CHECK: // %bb.0: 454; CHECK-NEXT: mov w8, w0 455; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z1.b, z2.b }, z3.b 456; CHECK-NEXT: udot za.s[w8, 7, vgx2], { z1.b, z2.b }, z3.b 457; CHECK-NEXT: ret 458 call void @llvm.aarch64.sme.udot.single.za32.vg1x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) 459 %slice2 = add i32 %slice, 7 460 call void @llvm.aarch64.sme.udot.single.za32.vg1x2.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) 461 ret void 462} 463 464define void @udot_single_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) #0 { 465; CHECK-LABEL: 
udot_single_za32_u8_vg1x4: 466; CHECK: // %bb.0: 467; CHECK-NEXT: mov w8, w0 468; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z1.b - z4.b }, z5.b 469; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z1.b - z4.b }, z5.b 470; CHECK-NEXT: ret 471 call void @llvm.aarch64.sme.udot.single.za32.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) 472 %slice2 = add i32 %slice, 7 473 call void @llvm.aarch64.sme.udot.single.za32.vg1x4.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) 474 ret void 475} 476 477define void @udot_single_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #1 { 478; CHECK-LABEL: udot_single_za64_u16_vg1x2: 479; CHECK: // %bb.0: 480; CHECK-NEXT: mov w8, w0 481; CHECK-NEXT: udot za.d[w8, 0, vgx2], { z1.h, z2.h }, z3.h 482; CHECK-NEXT: udot za.d[w8, 7, vgx2], { z1.h, z2.h }, z3.h 483; CHECK-NEXT: ret 484 call void @llvm.aarch64.sme.udot.single.za64.vg1x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) 485 %slice2 = add i32 %slice, 7 486 call void @llvm.aarch64.sme.udot.single.za64.vg1x2.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) 487 ret void 488} 489 490define void @udot_single_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) #1 { 491; CHECK-LABEL: udot_single_za64_u16_vg1x4: 492; CHECK: // %bb.0: 493; CHECK-NEXT: mov w8, w0 494; CHECK-NEXT: udot za.d[w8, 0, vgx4], { z1.h - z4.h }, z5.h 495; CHECK-NEXT: udot za.d[w8, 7, vgx4], { z1.h - z4.h }, z5.h 496; CHECK-NEXT: ret 497 call void @llvm.aarch64.sme.udot.single.za64.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) 498 %slice2 = add i32 %slice, 7 499 call void @llvm.aarch64.sme.udot.single.za64.vg1x4.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) 500 ret void 501} 502 503define void @usdot_single_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) #0 { 504; CHECK-LABEL: usdot_single_za32_u8_vg1x2: 505; CHECK: // %bb.0: 506; CHECK-NEXT: mov w8, w0 507; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z1.b, z2.b }, z3.b 508; CHECK-NEXT: usdot za.s[w8, 7, vgx2], { z1.b, z2.b }, z3.b 509; CHECK-NEXT: ret 510 call void @llvm.aarch64.sme.usdot.single.za32.vg1x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) 511 %slice2 = add i32 %slice, 7 512 call void @llvm.aarch64.sme.usdot.single.za32.vg1x2.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) 513 ret void 514} 515 516define void @usdot_single_za32_u16_vg1x2_tuple(ptr %ptr, i64 %stride, <vscale x 16 x i8> %zn) #0 { 517; CHECK-LABEL: usdot_single_za32_u16_vg1x2_tuple: 518; CHECK: // %bb.0: // %entry 519; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill 520; CHECK-NEXT: addvl sp, sp, #-3 521; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill 522; CHECK-NEXT: ptrue pn8.b 523; CHECK-NEXT: mov w8, wzr 524; CHECK-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill 525; CHECK-NEXT: str z9, [sp, #2, mul vl] // 16-byte Folded Spill 526; CHECK-NEXT: ld1b { z1.b, z9.b }, pn8/z, [x0] 527; CHECK-NEXT: ld1b { z2.b, z10.b }, pn8/z, [x0, x1] 528; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z1.b, z2.b }, z0.b 529; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z9.b, z10.b }, z0.b 530; CHECK-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload 531; CHECK-NEXT: ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload 532; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload 533; CHECK-NEXT: addvl sp, sp, #3 534; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 535; CHECK-NEXT: ret 536entry: 537 %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() 538 %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr) 539 %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0 540 %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1 541 %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride 542 %4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2) 543 %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0 544 %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1 545 call void @llvm.aarch64.sme.usdot.single.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> %zn) 546 call void @llvm.aarch64.sme.usdot.single.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> %zn) 547 ret void 548} 549 550define void @usdot_single_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) #0 { 551; CHECK-LABEL: usdot_single_za32_u8_vg1x4: 552; CHECK: // %bb.0: 553; CHECK-NEXT: mov w8, w0 554; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z1.b - z4.b }, z5.b 555; CHECK-NEXT: usdot za.s[w8, 7, vgx4], { z1.b - z4.b }, z5.b 556; CHECK-NEXT: ret 557 call void @llvm.aarch64.sme.usdot.single.za32.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) 558 %slice2 = add i32 %slice, 7 559 call void @llvm.aarch64.sme.usdot.single.za32.vg1x4.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) 560 ret void 561} 562 563define void @usdot_single_za32_u16_vg1x4_tuple(ptr %ptr, i64 %stride, <vscale x 16 x i8> %zn) #0 { 564; CHECK-LABEL: usdot_single_za32_u16_vg1x4_tuple: 565; CHECK: // %bb.0: // %entry 566; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill 567; CHECK-NEXT: addvl sp, sp, #-11 568; CHECK-NEXT: lsl x9, x1, #1 569; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill 570; CHECK-NEXT: ptrue pn8.b 571; CHECK-NEXT: str z20, [sp, #1, mul vl] // 16-byte Folded Spill 572; CHECK-NEXT: mov w8, wzr 573; CHECK-NEXT: str z16, [sp, #2, mul vl] // 16-byte Folded Spill 574; CHECK-NEXT: add x10, x9, x1 575; CHECK-NEXT: str z15, [sp, #3, mul vl] // 16-byte Folded Spill 576; CHECK-NEXT: str z14, [sp, #4, mul vl] // 16-byte Folded Spill 577; CHECK-NEXT: str z13, [sp, #5, mul vl] // 16-byte Folded Spill 578; CHECK-NEXT: str z12, [sp, #6, mul vl] // 16-byte Folded Spill 579; CHECK-NEXT: str z11, [sp, #7, mul vl] // 16-byte Folded Spill 580; CHECK-NEXT: str z10, [sp, #8, mul vl] // 16-byte Folded Spill 581; CHECK-NEXT: str z9, [sp, #9, mul vl] // 16-byte Folded Spill 582; CHECK-NEXT: str z8, [sp, #10, mul vl] // 16-byte Folded Spill 583; CHECK-NEXT: ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0] 584; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x1] 585; CHECK-NEXT: ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x9] 586; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0, x10] 587; CHECK-NEXT: mov z4.d, z16.d 588; CHECK-NEXT: mov z8.d, z20.d 589; CHECK-NEXT: mov z12.d, z24.d 590; CHECK-NEXT: mov z16.d, z28.d 591; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z1.b - z4.b }, z0.b 592; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z5.b - z8.b }, z0.b 593; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z9.b - z12.b }, z0.b 594; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z13.b - z16.b }, z0.b 595; CHECK-NEXT: ldr z20, [sp, #1, mul vl] // 16-byte Folded Reload 596; CHECK-NEXT: ldr z16, [sp, #2, mul vl] // 16-byte Folded Reload 597; CHECK-NEXT: ldr z15, [sp, #3, mul vl] // 16-byte Folded Reload 598; CHECK-NEXT: ldr z14, [sp, #4, mul vl] // 16-byte Folded Reload 599; CHECK-NEXT: ldr z13, [sp, #5, mul vl] // 16-byte Folded Reload 600; CHECK-NEXT: ldr z12, [sp, #6, mul vl] // 16-byte Folded Reload 601; CHECK-NEXT: ldr z11, [sp, #7, mul vl] // 16-byte Folded Reload 602; CHECK-NEXT: ldr z10, [sp, #8, mul vl] // 16-byte Folded Reload 603; CHECK-NEXT: ldr z9, [sp, #9, mul vl] // 16-byte Folded Reload 604; CHECK-NEXT: ldr z8, [sp, #10, mul vl] // 16-byte Folded Reload 605; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload 606; CHECK-NEXT: addvl sp, sp, #11 607; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 608; CHECK-NEXT: ret 609entry: 610 %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() 611 %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr) 612 %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0 613 %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1 614 %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2 615 %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3 616 %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride 617 %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2) 618 %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0 619 %8 = extractvalue { <vscale x 16 x i8>, 
<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1 620 %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2 621 %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3 622 %mul3 = shl i64 %stride, 1 623 %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3 624 %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4) 625 %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0 626 %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1 627 %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2 628 %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3 629 %mul5 = mul i64 %stride, 3 630 %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5 631 %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6) 632 %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0 633 %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1 634 %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2 635 %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3 636 call void @llvm.aarch64.sme.usdot.single.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> %zn) 637 call void @llvm.aarch64.sme.usdot.single.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> %zn) 638 call void @llvm.aarch64.sme.usdot.single.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> %zn) 639 call void @llvm.aarch64.sme.usdot.single.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> %zn) 640 ret void 641} 642 643; == Multi, single (signed) == 644 645define void @sdot_single_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #0 { 646; CHECK-LABEL: sdot_single_za32_u16_vg1x2: 647; CHECK: // %bb.0: 648; CHECK-NEXT: mov w8, w0 649; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z1.h, z2.h }, z3.h 650; CHECK-NEXT: sdot za.s[w8, 7, vgx2], { z1.h, z2.h }, z3.h 651; CHECK-NEXT: ret 652 call void @llvm.aarch64.sme.sdot.single.za32.vg1x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) 653 %slice2 = add i32 %slice, 7 654 call void @llvm.aarch64.sme.sdot.single.za32.vg1x2.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) 655 ret void 656} 657 658define void @sdot_single_za32_u16_vg1x2_tuple(ptr %ptr, i64 %stride, <vscale x 8 x i16> %zn) #0 { 659; CHECK-LABEL: sdot_single_za32_u16_vg1x2_tuple: 660; CHECK: // %bb.0: // %entry 661; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill 662; CHECK-NEXT: addvl sp, sp, #-3 663; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill 664; CHECK-NEXT: ptrue pn8.b 665; CHECK-NEXT: add x9, x0, x1 666; CHECK-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill 667; CHECK-NEXT: mov w8, wzr 668; CHECK-NEXT: str z9, [sp, #2, mul vl] // 16-byte Folded Spill 669; CHECK-NEXT: ld1h { z1.h, z9.h }, pn8/z, [x0] 670; CHECK-NEXT: ld1h { z2.h, z10.h }, pn8/z, [x9] 671; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z1.h, z2.h }, z0.h 672; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z9.h, z10.h }, z0.h 673; CHECK-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload 674; CHECK-NEXT: ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload 675; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload 676; CHECK-NEXT: addvl sp, sp, #3 677; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 678; CHECK-NEXT: ret 679entry: 680 %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() 681 %1 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %ptr) 682 %2 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 0 683 %3 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 1 684 %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride 685 %4 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx2) 686 %5 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 0 687 %6 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 1 688 call void @llvm.aarch64.sme.sdot.single.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %2, <vscale x 8 x i16> %5, <vscale x 8 x i16> %zn) 689 call void @llvm.aarch64.sme.sdot.single.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %3, <vscale x 8 x i16> %6, <vscale x 8 x i16> %zn) 690 ret void 691} 692 693define void @sdot_single_za32_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) #0 { 694; CHECK-LABEL: sdot_single_za32_u16_vg1x4: 695; CHECK: // %bb.0: 696; CHECK-NEXT: mov w8, w0 697; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z1.h - z4.h }, z5.h 698; CHECK-NEXT: sdot za.s[w8, 7, vgx4], { z1.h - z4.h }, z5.h 699; CHECK-NEXT: ret 700 call void @llvm.aarch64.sme.sdot.single.za32.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) 701 %slice2 = add i32 %slice, 7 702 call void @llvm.aarch64.sme.sdot.single.za32.vg1x4.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) 703 ret void 704} 705 706define void @sdot_single_za32_u16_vg1x4_tuple(ptr %ptr, i64 %stride, <vscale x 8 x i16> %zn) #0 { 707; CHECK-LABEL: sdot_single_za32_u16_vg1x4_tuple: 708; CHECK: // %bb.0: // %entry 709; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill 710; CHECK-NEXT: addvl sp, sp, #-11 711; CHECK-NEXT: add x9, x1, x1, lsl #1 712; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill 713; CHECK-NEXT: ptrue pn8.b 714; CHECK-NEXT: str z20, [sp, #1, mul vl] // 16-byte Folded Spill 715; CHECK-NEXT: add x10, x0, x1 716; CHECK-NEXT: mov w8, wzr 717; CHECK-NEXT: str z16, [sp, #2, mul vl] // 16-byte Folded Spill 718; CHECK-NEXT: add x9, x0, x9 719; CHECK-NEXT: str z15, [sp, #3, mul vl] // 16-byte Folded Spill 720; CHECK-NEXT: str z14, [sp, #4, mul vl] // 16-byte Folded Spill 721; CHECK-NEXT: str z13, [sp, #5, mul vl] // 16-byte Folded Spill 722; CHECK-NEXT: str z12, [sp, #6, mul vl] // 16-byte Folded Spill 723; CHECK-NEXT: str z11, [sp, #7, mul vl] // 16-byte Folded Spill 724; CHECK-NEXT: str z10, [sp, #8, mul vl] // 16-byte Folded Spill 725; CHECK-NEXT: str z9, [sp, #9, mul vl] // 16-byte Folded Spill 726; CHECK-NEXT: str z8, [sp, #10, mul vl] // 16-byte Folded Spill 727; CHECK-NEXT: ld1h { z1.h, z5.h, z9.h, z13.h }, pn8/z, [x0] 728; CHECK-NEXT: ld1h { z2.h, z6.h, z10.h, z14.h }, pn8/z, [x10] 729; CHECK-NEXT: ld1h { z3.h, z7.h, z11.h, z15.h }, pn8/z, [x0, x1, lsl #1] 730; CHECK-NEXT: ld1h { z16.h, z20.h, z24.h, z28.h }, pn8/z, [x9] 731; CHECK-NEXT: mov z4.d, z16.d 732; CHECK-NEXT: mov z8.d, z20.d 733; CHECK-NEXT: mov z12.d, z24.d 734; CHECK-NEXT: mov z16.d, z28.d 735; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z1.h - z4.h }, z0.h 736; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z5.h - z8.h }, z0.h 737; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z9.h - z12.h }, z0.h 738; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z13.h - z16.h }, z0.h 739; CHECK-NEXT: ldr z20, [sp, #1, mul vl] // 16-byte Folded Reload 740; CHECK-NEXT: ldr z16, [sp, #2, mul vl] // 16-byte Folded Reload 741; CHECK-NEXT: ldr z15, [sp, #3, mul vl] // 16-byte Folded Reload 742; CHECK-NEXT: ldr z14, [sp, #4, mul vl] // 16-byte Folded Reload 743; CHECK-NEXT: ldr z13, [sp, #5, mul vl] // 16-byte Folded Reload 744; CHECK-NEXT: ldr z12, [sp, #6, mul vl] // 16-byte Folded Reload 745; CHECK-NEXT: ldr z11, [sp, #7, mul vl] // 16-byte Folded Reload 746; CHECK-NEXT: ldr z10, [sp, #8, mul vl] // 16-byte Folded Reload 747; CHECK-NEXT: ldr z9, [sp, #9, mul vl] // 16-byte Folded Reload 748; CHECK-NEXT: ldr z8, [sp, #10, mul vl] // 16-byte Folded Reload 749; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload 750; CHECK-NEXT: addvl sp, sp, #11 751; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 752; CHECK-NEXT: ret 753entry: 754 %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() 755 %1 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") %0, ptr %ptr) 756 %2 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 0 757 %3 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 1 758 %4 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 2 759 %5 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 3 760 %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride 761 %6 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx2) 762 %7 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %6, 0 763 %8 = 
extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %6, 1 764 %9 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %6, 2 765 %10 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %6, 3 766 %mul3 = shl i64 %stride, 1 767 %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3 768 %11 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx4) 769 %12 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %11, 0 770 %13 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %11, 1 771 %14 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %11, 2 772 %15 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %11, 3 773 %mul5 = mul i64 %stride, 3 774 %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5 775 %16 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx6) 776 %17 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %16, 0 777 %18 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %16, 1 778 %19 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %16, 2 779 %20 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %16, 3 780 call void @llvm.aarch64.sme.sdot.single.za32.vg1x4.nxv8i16(i32 0, <vscale x 8 x i16> %2, <vscale x 8 x i16> %7, <vscale x 8 x i16> %12, <vscale x 8 x i16> %17, <vscale x 8 x i16> %zn) 781 call void @llvm.aarch64.sme.sdot.single.za32.vg1x4.nxv8i16(i32 0, <vscale x 8 x i16> %3, <vscale x 8 x i16> %8, <vscale x 8 x i16> %13, <vscale x 8 x i16> %18, <vscale x 8 x i16> %zn) 782 call void @llvm.aarch64.sme.sdot.single.za32.vg1x4.nxv8i16(i32 0, <vscale x 8 x i16> %4, <vscale x 8 x i16> %9, <vscale x 8 x i16> %14, <vscale x 8 x i16> %19, <vscale x 8 x i16> %zn) 783 call void @llvm.aarch64.sme.sdot.single.za32.vg1x4.nxv8i16(i32 0, <vscale x 8 x i16> %5, <vscale x 8 x i16> %10, <vscale x 8 x i16> %15, <vscale x 8 x i16> %20, <vscale x 8 x i16> %zn) 784 ret void 785} 786 787define void @sdot_single_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) #0 { 788; CHECK-LABEL: sdot_single_za32_u8_vg1x2: 789; CHECK: // %bb.0: 790; CHECK-NEXT: mov w8, w0 791; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z1.b, z2.b }, z3.b 792; CHECK-NEXT: sdot za.s[w8, 7, vgx2], { z1.b, z2.b }, z3.b 793; CHECK-NEXT: ret 794 call void @llvm.aarch64.sme.sdot.single.za32.vg1x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) 795 %slice2 = add i32 %slice, 7 796 call void @llvm.aarch64.sme.sdot.single.za32.vg1x2.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) 797 ret void 798} 799 800define void @sdot_single_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) #0 { 801; CHECK-LABEL: 
sdot_single_za32_u8_vg1x4: 802; CHECK: // %bb.0: 803; CHECK-NEXT: mov w8, w0 804; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z1.b - z4.b }, z5.b 805; CHECK-NEXT: sdot za.s[w8, 7, vgx4], { z1.b - z4.b }, z5.b 806; CHECK-NEXT: ret 807 call void @llvm.aarch64.sme.sdot.single.za32.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) 808 %slice2 = add i32 %slice, 7 809 call void @llvm.aarch64.sme.sdot.single.za32.vg1x4.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) 810 ret void 811} 812 813define void @sdot_single_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #1 { 814; CHECK-LABEL: sdot_single_za64_u16_vg1x2: 815; CHECK: // %bb.0: 816; CHECK-NEXT: mov w8, w0 817; CHECK-NEXT: sdot za.d[w8, 0, vgx2], { z1.h, z2.h }, z3.h 818; CHECK-NEXT: sdot za.d[w8, 7, vgx2], { z1.h, z2.h }, z3.h 819; CHECK-NEXT: ret 820 call void @llvm.aarch64.sme.sdot.single.za64.vg1x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) 821 %slice2 = add i32 %slice, 7 822 call void @llvm.aarch64.sme.sdot.single.za64.vg1x2.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) 823 ret void 824} 825 826define void @sdot_single_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) #1 { 827; CHECK-LABEL: sdot_single_za64_u16_vg1x4: 828; CHECK: // %bb.0: 829; CHECK-NEXT: mov w8, w0 830; CHECK-NEXT: sdot za.d[w8, 0, vgx4], { z1.h - z4.h }, z5.h 831; CHECK-NEXT: sdot za.d[w8, 7, vgx4], { z1.h - z4.h }, z5.h 832; CHECK-NEXT: ret 833 call void @llvm.aarch64.sme.sdot.single.za64.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) 834 %slice2 = add i32 %slice, 7 835 call void @llvm.aarch64.sme.sdot.single.za64.vg1x4.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) 836 ret void 837} 838 839define void @sudot_single_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) #0 { 840; CHECK-LABEL: sudot_single_za32_u8_vg1x2: 841; CHECK: // %bb.0: 842; CHECK-NEXT: mov w8, w0 843; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z1.b, z2.b }, z3.b 844; CHECK-NEXT: sudot za.s[w8, 7, vgx2], { z1.b, z2.b }, z3.b 845; CHECK-NEXT: ret 846 call void @llvm.aarch64.sme.sudot.single.za32.vg1x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) 847 %slice2 = add i32 %slice, 7 848 call void @llvm.aarch64.sme.sudot.single.za32.vg1x2.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) 849 ret void 850} 851 852define void @sudot_single_za32_u16_vg1x2_tuple(ptr %ptr, i64 %stride, <vscale x 16 x i8> %zn) #0 { 853; CHECK-LABEL: sudot_single_za32_u16_vg1x2_tuple: 854; CHECK: // %bb.0: // %entry 855; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill 856; CHECK-NEXT: addvl sp, sp, #-3 857; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill 858; CHECK-NEXT: ptrue pn8.b 859; CHECK-NEXT: mov w8, wzr 860; CHECK-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill 861; CHECK-NEXT: str z9, [sp, #2, mul vl] // 16-byte Folded Spill 862; CHECK-NEXT: ld1b { z1.b, z9.b }, pn8/z, [x0] 863; CHECK-NEXT: ld1b { z2.b, z10.b }, pn8/z, [x0, x1] 864; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z1.b, z2.b }, z0.b 865; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z9.b, z10.b }, z0.b 866; CHECK-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload 867; CHECK-NEXT: ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload 868; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload 869; CHECK-NEXT: addvl sp, sp, #3 870; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 871; CHECK-NEXT: ret 872entry: 873 %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() 874 %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr) 875 %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0 876 %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1 877 %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride 878 %4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2) 879 %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0 880 %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1 881 call void @llvm.aarch64.sme.sudot.single.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> %zn) 882 call void @llvm.aarch64.sme.sudot.single.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> %zn) 883 ret void 884} 885 886define void @sudot_single_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) #0 { 887; CHECK-LABEL: sudot_single_za32_u8_vg1x4: 888; CHECK: // %bb.0: 889; CHECK-NEXT: mov w8, w0 890; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z1.b - z4.b }, z5.b 891; CHECK-NEXT: sudot za.s[w8, 7, vgx4], { z1.b - z4.b }, z5.b 892; CHECK-NEXT: ret 893 call void @llvm.aarch64.sme.sudot.single.za32.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) 894 %slice2 = add i32 %slice, 7 895 call void @llvm.aarch64.sme.sudot.single.za32.vg1x4.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) 896 ret void 897} 898 899define void @sudot_single_za32_u16_vg1x4_tuple(ptr %ptr, i64 %stride, <vscale x 16 x i8> %zn) #0 { 900; CHECK-LABEL: sudot_single_za32_u16_vg1x4_tuple: 901; CHECK: // %bb.0: // %entry 902; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill 903; CHECK-NEXT: addvl sp, sp, #-11 904; CHECK-NEXT: lsl x9, x1, #1 905; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill 906; CHECK-NEXT: ptrue pn8.b 907; CHECK-NEXT: str z20, [sp, #1, mul vl] // 16-byte Folded Spill 908; CHECK-NEXT: mov w8, wzr 909; CHECK-NEXT: str z16, [sp, #2, mul vl] // 16-byte Folded Spill 910; CHECK-NEXT: add x10, x9, x1 911; CHECK-NEXT: str z15, [sp, #3, mul vl] // 16-byte Folded Spill 912; CHECK-NEXT: str z14, [sp, #4, mul vl] // 16-byte Folded Spill 913; CHECK-NEXT: str z13, [sp, #5, mul vl] // 16-byte Folded Spill 914; CHECK-NEXT: str z12, [sp, #6, mul vl] // 16-byte Folded Spill 915; CHECK-NEXT: str z11, [sp, #7, mul vl] // 16-byte Folded Spill 916; CHECK-NEXT: str z10, [sp, #8, mul vl] // 16-byte Folded Spill 917; CHECK-NEXT: str z9, [sp, #9, mul vl] // 16-byte Folded Spill 918; CHECK-NEXT: str z8, [sp, #10, mul vl] // 16-byte Folded Spill 919; CHECK-NEXT: ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0] 920; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x1] 921; CHECK-NEXT: ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x9] 922; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0, x10] 923; CHECK-NEXT: mov z4.d, z16.d 924; CHECK-NEXT: mov z8.d, z20.d 925; CHECK-NEXT: mov z12.d, z24.d 926; CHECK-NEXT: mov z16.d, z28.d 927; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z1.b - z4.b }, z0.b 928; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z5.b - z8.b }, z0.b 929; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z9.b - z12.b }, z0.b 930; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z13.b - z16.b }, z0.b 931; CHECK-NEXT: ldr z20, [sp, #1, mul vl] // 16-byte Folded Reload 932; CHECK-NEXT: ldr z16, [sp, #2, mul vl] // 16-byte Folded Reload 933; CHECK-NEXT: ldr z15, [sp, #3, mul vl] // 16-byte Folded Reload 934; CHECK-NEXT: ldr z14, [sp, #4, mul vl] // 16-byte Folded Reload 935; CHECK-NEXT: ldr z13, [sp, #5, mul vl] // 16-byte Folded Reload 936; CHECK-NEXT: ldr z12, [sp, #6, mul vl] // 16-byte Folded Reload 937; CHECK-NEXT: ldr z11, [sp, #7, mul vl] // 16-byte Folded Reload 938; CHECK-NEXT: ldr z10, [sp, #8, mul vl] // 16-byte Folded Reload 939; CHECK-NEXT: ldr z9, [sp, #9, mul vl] // 16-byte Folded Reload 940; CHECK-NEXT: ldr z8, [sp, #10, mul vl] // 16-byte Folded Reload 941; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload 942; CHECK-NEXT: addvl sp, sp, #11 943; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 944; CHECK-NEXT: ret 945entry: 946 %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() 947 %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr) 948 %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0 949 %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1 950 %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2 951 %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3 952 %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride 953 %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2) 954 %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0 955 %8 = extractvalue { <vscale x 16 x i8>, 
<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1 956 %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2 957 %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3 958 %mul3 = shl i64 %stride, 1 959 %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3 960 %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4) 961 %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0 962 %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1 963 %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2 964 %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3 965 %mul5 = mul i64 %stride, 3 966 %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5 967 %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6) 968 %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0 969 %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1 970 %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2 971 %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3 972 call void @llvm.aarch64.sme.sudot.single.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> %zn) 973 call void @llvm.aarch64.sme.sudot.single.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> %zn) 974 call void @llvm.aarch64.sme.sudot.single.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> %zn) 975 call void @llvm.aarch64.sme.sudot.single.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> %zn) 976 ret void 977} 978 979; == Multi, indexed (unsigned) == 980 981define void @udot_lane_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #0 { 982; CHECK-LABEL: udot_lane_za32_u16_vg1x2: 983; CHECK: // %bb.0: 984; CHECK-NEXT: mov z5.d, z2.d 985; CHECK-NEXT: mov z4.d, z1.d 986; CHECK-NEXT: mov w8, w0 987; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z4.h, z5.h }, z3.h[3] 988; CHECK-NEXT: udot za.s[w8, 7, vgx2], { z4.h, z5.h }, z3.h[3] 989; CHECK-NEXT: ret 990 call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, i32 3) 991 %slice2 = add i32 %slice, 7 992 call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, i32 3) 993 ret void 994} 995 996define void @udot_lane_za32_u16_vg1x4(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x 
i16> %zn3, <vscale x 8 x i16> %zn4) #0 { 997; CHECK-LABEL: udot_lane_za32_u16_vg1x4: 998; CHECK: // %bb.0: 999; CHECK-NEXT: mov w8, w0 1000; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z0.h - z3.h }, z4.h[3] 1001; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z0.h - z3.h }, z4.h[3] 1002; CHECK-NEXT: ret 1003 call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, 1004 <vscale x 8 x i16> %zn4, i32 3) 1005 %slice2 = add i32 %slice, 7 1006 call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, 1007 <vscale x 8 x i16> %zn4, i32 3) 1008 ret void 1009} 1010 1011define void @udot_lane_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) #0 { 1012; CHECK-LABEL: udot_lane_za32_u8_vg1x2: 1013; CHECK: // %bb.0: 1014; CHECK-NEXT: mov z5.d, z2.d 1015; CHECK-NEXT: mov z4.d, z1.d 1016; CHECK-NEXT: mov w8, w0 1017; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z4.b, z5.b }, z3.b[3] 1018; CHECK-NEXT: udot za.s[w8, 7, vgx2], { z4.b, z5.b }, z3.b[3] 1019; CHECK-NEXT: ret 1020 call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, i32 3) 1021 %slice2 = add i32 %slice, 7 1022 call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, i32 3) 1023 ret void 1024} 1025 1026define void @udot_lane_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) #0 { 1027; CHECK-LABEL: udot_lane_za32_u8_vg1x4: 1028; CHECK: // %bb.0: 1029; CHECK-NEXT: mov z27.d, z4.d 1030; CHECK-NEXT: mov z26.d, z3.d 1031; CHECK-NEXT: mov w8, w0 1032; CHECK-NEXT: mov z25.d, z2.d 1033; CHECK-NEXT: mov z24.d, z1.d 1034; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z24.b - z27.b }, z5.b[3] 1035; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z24.b - z27.b }, z5.b[3] 1036; CHECK-NEXT: ret 1037 call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, 1038 <vscale x 16 x i8> %zn4, i32 3) 1039 %slice2 = add i32 %slice, 7 1040 call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, 1041 <vscale x 16 x i8> %zn4, i32 3) 1042 ret void 1043} 1044 1045define void @udot_form_2x_tuple(ptr %ptr, i64 %stride) #0 { 1046; CHECK-LABEL: udot_form_2x_tuple: 1047; CHECK: // %bb.0: // %entry 1048; CHECK-NEXT: ptrue pn8.b 1049; CHECK-NEXT: mov w8, wzr 1050; CHECK-NEXT: ld1b { z16.b, z24.b }, pn8/z, [x0] 1051; CHECK-NEXT: ld1b { z17.b, z25.b }, pn8/z, [x0, x1] 1052; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z16.b, z17.b }, z0.b[0] 1053; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z24.b, z25.b }, z0.b[0] 1054; CHECK-NEXT: ret 1055entry: 1056 %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() 1057 %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr) 1058 %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0 1059 %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1 1060 %arrayidx2 = 
getelementptr inbounds i8, ptr %ptr, i64 %stride 1061 %4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2) 1062 %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0 1063 %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1 1064 tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> poison, i32 0) 1065 tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> poison, i32 0) 1066 ret void 1067} 1068 1069define void @udot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 { 1070; CHECK-LABEL: udot_form_2x_tuple_svecc: 1071; CHECK: // %bb.0: // %entry 1072; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill 1073; CHECK-NEXT: addvl sp, sp, #-3 1074; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill 1075; CHECK-NEXT: ptrue pn8.b 1076; CHECK-NEXT: mov w8, wzr 1077; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill 1078; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill 1079; CHECK-NEXT: ld1b { z0.b, z8.b }, pn8/z, [x0] 1080; CHECK-NEXT: ld1b { z1.b, z9.b }, pn8/z, [x0, x1] 1081; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0] 1082; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z8.b, z9.b }, z0.b[0] 1083; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload 1084; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload 1085; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload 1086; CHECK-NEXT: addvl sp, sp, #3 1087; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 1088; CHECK-NEXT: ret 1089entry: 1090 %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() 1091 %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr) 1092 %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0 1093 %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1 1094 %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride 1095 %4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2) 1096 %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0 1097 %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1 1098 tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> poison, i32 0) 1099 tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> poison, i32 0) 1100 ret void 1101} 1102 1103define void @udot_form_4x_tuple(ptr %ptr, i64 %stride) #0 { 1104; CHECK-LABEL: udot_form_4x_tuple: 1105; CHECK: // %bb.0: // %entry 1106; CHECK-NEXT: lsl x9, x1, #1 1107; CHECK-NEXT: ptrue pn8.b 1108; CHECK-NEXT: mov w8, wzr 1109; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0] 1110; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1] 1111; CHECK-NEXT: add x10, x9, x1 1112; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9] 1113; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10] 1114; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0] 1115; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0] 1116; CHECK-NEXT: udot za.s[w8, 0, vgx4], { 
z24.b - z27.b }, z0.b[0] 1117; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0] 1118; CHECK-NEXT: ret 1119entry: 1120 %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() 1121 %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr) 1122 %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0 1123 %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1 1124 %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2 1125 %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3 1126 %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride 1127 %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2) 1128 %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0 1129 %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1 1130 %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2 1131 %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3 1132 %mul3 = shl i64 %stride, 1 1133 %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3 1134 %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4) 1135 %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0 1136 %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1 1137 %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2 1138 %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3 1139 %mul5 = mul i64 %stride, 3 1140 %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5 1141 %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6) 1142 %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0 1143 %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1 1144 %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2 1145 %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3 1146 tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0) 1147 tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0) 1148 tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> 
%14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0) 1149 tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0) 1150 ret void 1151} 1152 1153define void @udot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 { 1154; CHECK-LABEL: udot_form_4x_tuple_svecc: 1155; CHECK: // %bb.0: // %entry 1156; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill 1157; CHECK-NEXT: addvl sp, sp, #-9 1158; CHECK-NEXT: lsl x9, x1, #1 1159; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill 1160; CHECK-NEXT: ptrue pn8.b 1161; CHECK-NEXT: str z15, [sp, #1, mul vl] // 16-byte Folded Spill 1162; CHECK-NEXT: mov w8, wzr 1163; CHECK-NEXT: str z14, [sp, #2, mul vl] // 16-byte Folded Spill 1164; CHECK-NEXT: add x10, x9, x1 1165; CHECK-NEXT: str z13, [sp, #3, mul vl] // 16-byte Folded Spill 1166; CHECK-NEXT: str z12, [sp, #4, mul vl] // 16-byte Folded Spill 1167; CHECK-NEXT: str z11, [sp, #5, mul vl] // 16-byte Folded Spill 1168; CHECK-NEXT: str z10, [sp, #6, mul vl] // 16-byte Folded Spill 1169; CHECK-NEXT: str z9, [sp, #7, mul vl] // 16-byte Folded Spill 1170; CHECK-NEXT: str z8, [sp, #8, mul vl] // 16-byte Folded Spill 1171; CHECK-NEXT: ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0] 1172; CHECK-NEXT: ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0, x1] 1173; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9] 1174; CHECK-NEXT: ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x10] 1175; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0] 1176; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0] 1177; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] 1178; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0] 1179; CHECK-NEXT: ldr z15, [sp, #1, mul vl] // 16-byte Folded Reload 1180; CHECK-NEXT: ldr z14, [sp, #2, mul vl] // 16-byte Folded Reload 1181; CHECK-NEXT: ldr z13, [sp, #3, mul vl] // 16-byte Folded Reload 1182; CHECK-NEXT: ldr z12, [sp, #4, mul vl] // 16-byte Folded Reload 1183; CHECK-NEXT: ldr z11, [sp, #5, mul vl] // 16-byte Folded Reload 1184; CHECK-NEXT: ldr z10, [sp, #6, mul vl] // 16-byte Folded Reload 1185; CHECK-NEXT: ldr z9, [sp, #7, mul vl] // 16-byte Folded Reload 1186; CHECK-NEXT: ldr z8, [sp, #8, mul vl] // 16-byte Folded Reload 1187; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload 1188; CHECK-NEXT: addvl sp, sp, #9 1189; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 1190; CHECK-NEXT: ret 1191entry: 1192 %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() 1193 %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr) 1194 %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0 1195 %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1 1196 %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2 1197 %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3 1198 %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride 1199 %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2) 1200 
%7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0 1201 %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1 1202 %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2 1203 %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3 1204 %mul3 = shl i64 %stride, 1 1205 %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3 1206 %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4) 1207 %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0 1208 %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1 1209 %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2 1210 %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3 1211 %mul5 = mul i64 %stride, 3 1212 %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5 1213 %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6) 1214 %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0 1215 %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1 1216 %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2 1217 %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3 1218 tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0) 1219 tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0) 1220 tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0) 1221 tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0) 1222 ret void 1223} 1224 1225define void @udot_lane_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #1 { 1226; CHECK-LABEL: udot_lane_za64_u16_vg1x2: 1227; CHECK: // %bb.0: 1228; CHECK-NEXT: mov z5.d, z2.d 1229; CHECK-NEXT: mov z4.d, z1.d 1230; CHECK-NEXT: mov w8, w0 1231; CHECK-NEXT: udot za.d[w8, 0, vgx2], { z4.h, z5.h }, z3.h[1] 1232; CHECK-NEXT: udot za.d[w8, 7, vgx2], { z4.h, z5.h }, z3.h[1] 1233; CHECK-NEXT: ret 1234 call void @llvm.aarch64.sme.udot.lane.za64.vg1x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, i32 1) 1235 %slice2 = add i32 %slice, 7 1236 call void @llvm.aarch64.sme.udot.lane.za64.vg1x2.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, 
<vscale x 8 x i16> %zn2, i32 1) 1237 ret void 1238} 1239 1240define void @udot_lane_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) #1 { 1241; CHECK-LABEL: udot_lane_za64_u16_vg1x4: 1242; CHECK: // %bb.0: 1243; CHECK-NEXT: mov z27.d, z4.d 1244; CHECK-NEXT: mov z26.d, z3.d 1245; CHECK-NEXT: mov w8, w0 1246; CHECK-NEXT: mov z25.d, z2.d 1247; CHECK-NEXT: mov z24.d, z1.d 1248; CHECK-NEXT: udot za.d[w8, 0, vgx4], { z24.h - z27.h }, z5.h[1] 1249; CHECK-NEXT: udot za.d[w8, 7, vgx4], { z24.h - z27.h }, z5.h[1] 1250; CHECK-NEXT: ret 1251 call void @llvm.aarch64.sme.udot.lane.za64.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, 1252 <vscale x 8 x i16> %zn4, i32 1) 1253 %slice2 = add i32 %slice, 7 1254 call void @llvm.aarch64.sme.udot.lane.za64.vg1x4.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, 1255 <vscale x 8 x i16> %zn4, i32 1) 1256 ret void 1257} 1258 1259define void @usdot_lane_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) #0 { 1260; CHECK-LABEL: usdot_lane_za32_u8_vg1x2: 1261; CHECK: // %bb.0: 1262; CHECK-NEXT: mov z5.d, z2.d 1263; CHECK-NEXT: mov z4.d, z1.d 1264; CHECK-NEXT: mov w8, w0 1265; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z4.b, z5.b }, z3.b[3] 1266; CHECK-NEXT: usdot za.s[w8, 7, vgx2], { z4.b, z5.b }, z3.b[3] 1267; CHECK-NEXT: ret 1268 call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, i32 3) 1269 %slice2 = add i32 %slice, 7 1270 call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, i32 3) 1271 ret void 1272} 1273 1274define void @usdot_lane_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) #0 { 1275; CHECK-LABEL: usdot_lane_za32_u8_vg1x4: 1276; CHECK: // %bb.0: 1277; CHECK-NEXT: mov z27.d, z4.d 1278; CHECK-NEXT: mov z26.d, z3.d 1279; CHECK-NEXT: mov w8, w0 1280; CHECK-NEXT: mov z25.d, z2.d 1281; CHECK-NEXT: mov z24.d, z1.d 1282; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z5.b[3] 1283; CHECK-NEXT: usdot za.s[w8, 7, vgx4], { z24.b - z27.b }, z5.b[3] 1284; CHECK-NEXT: ret 1285 call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, 1286 <vscale x 16 x i8> %zn4, i32 3) 1287 %slice2 = add i32 %slice, 7 1288 call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, 1289 <vscale x 16 x i8> %zn4, i32 3) 1290 ret void 1291} 1292 1293define void @usdot_form_2x_tuple(ptr %ptr, i64 %stride) #0 { 1294; CHECK-LABEL: usdot_form_2x_tuple: 1295; CHECK: // %bb.0: // %entry 1296; CHECK-NEXT: ptrue pn8.b 1297; CHECK-NEXT: mov w8, wzr 1298; CHECK-NEXT: ld1b { z16.b, z24.b }, pn8/z, [x0] 1299; CHECK-NEXT: ld1b { z17.b, z25.b }, pn8/z, [x0, x1] 1300; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z16.b, z17.b }, z0.b[0] 1301; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z24.b, z25.b }, z0.b[0] 1302; CHECK-NEXT: ret 
1303entry: 1304 %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() 1305 %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr) 1306 %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0 1307 %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1 1308 %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride 1309 %4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2) 1310 %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0 1311 %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1 1312 tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> poison, i32 0) 1313 tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> poison, i32 0) 1314 ret void 1315} 1316 1317define void @usdot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 { 1318; CHECK-LABEL: usdot_form_2x_tuple_svecc: 1319; CHECK: // %bb.0: // %entry 1320; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill 1321; CHECK-NEXT: addvl sp, sp, #-3 1322; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill 1323; CHECK-NEXT: ptrue pn8.b 1324; CHECK-NEXT: mov w8, wzr 1325; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill 1326; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill 1327; CHECK-NEXT: ld1b { z0.b, z8.b }, pn8/z, [x0] 1328; CHECK-NEXT: ld1b { z1.b, z9.b }, pn8/z, [x0, x1] 1329; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0] 1330; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z8.b, z9.b }, z0.b[0] 1331; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload 1332; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload 1333; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload 1334; CHECK-NEXT: addvl sp, sp, #3 1335; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 1336; CHECK-NEXT: ret 1337entry: 1338 %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() 1339 %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr) 1340 %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0 1341 %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1 1342 %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride 1343 %4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2) 1344 %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0 1345 %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1 1346 tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> poison, i32 0) 1347 tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> poison, i32 0) 1348 ret void 1349} 1350 1351define void @usdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 { 1352; CHECK-LABEL: usdot_form_4x_tuple: 1353; CHECK: // %bb.0: // %entry 1354; CHECK-NEXT: lsl x9, x1, #1 1355; CHECK-NEXT: ptrue pn8.b 1356; CHECK-NEXT: mov w8, wzr 1357; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0] 1358; CHECK-NEXT: ld1b { z17.b, 
z21.b, z25.b, z29.b }, pn8/z, [x0, x1] 1359; CHECK-NEXT: add x10, x9, x1 1360; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9] 1361; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10] 1362; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0] 1363; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0] 1364; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0] 1365; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0] 1366; CHECK-NEXT: ret 1367entry: 1368 %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() 1369 %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr) 1370 %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0 1371 %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1 1372 %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2 1373 %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3 1374 %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride 1375 %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2) 1376 %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0 1377 %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1 1378 %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2 1379 %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3 1380 %mul3 = shl i64 %stride, 1 1381 %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3 1382 %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4) 1383 %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0 1384 %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1 1385 %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2 1386 %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3 1387 %mul5 = mul i64 %stride, 3 1388 %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5 1389 %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6) 1390 %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0 1391 %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1 1392 %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2 1393 %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3 1394 tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, 
<vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0) 1395 tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0) 1396 tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0) 1397 tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0) 1398 ret void 1399} 1400 1401define void @usdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 { 1402; CHECK-LABEL: usdot_form_4x_tuple_svecc: 1403; CHECK: // %bb.0: // %entry 1404; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill 1405; CHECK-NEXT: addvl sp, sp, #-9 1406; CHECK-NEXT: lsl x9, x1, #1 1407; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill 1408; CHECK-NEXT: ptrue pn8.b 1409; CHECK-NEXT: str z15, [sp, #1, mul vl] // 16-byte Folded Spill 1410; CHECK-NEXT: mov w8, wzr 1411; CHECK-NEXT: str z14, [sp, #2, mul vl] // 16-byte Folded Spill 1412; CHECK-NEXT: add x10, x9, x1 1413; CHECK-NEXT: str z13, [sp, #3, mul vl] // 16-byte Folded Spill 1414; CHECK-NEXT: str z12, [sp, #4, mul vl] // 16-byte Folded Spill 1415; CHECK-NEXT: str z11, [sp, #5, mul vl] // 16-byte Folded Spill 1416; CHECK-NEXT: str z10, [sp, #6, mul vl] // 16-byte Folded Spill 1417; CHECK-NEXT: str z9, [sp, #7, mul vl] // 16-byte Folded Spill 1418; CHECK-NEXT: str z8, [sp, #8, mul vl] // 16-byte Folded Spill 1419; CHECK-NEXT: ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0] 1420; CHECK-NEXT: ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0, x1] 1421; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9] 1422; CHECK-NEXT: ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x10] 1423; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0] 1424; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0] 1425; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] 1426; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0] 1427; CHECK-NEXT: ldr z15, [sp, #1, mul vl] // 16-byte Folded Reload 1428; CHECK-NEXT: ldr z14, [sp, #2, mul vl] // 16-byte Folded Reload 1429; CHECK-NEXT: ldr z13, [sp, #3, mul vl] // 16-byte Folded Reload 1430; CHECK-NEXT: ldr z12, [sp, #4, mul vl] // 16-byte Folded Reload 1431; CHECK-NEXT: ldr z11, [sp, #5, mul vl] // 16-byte Folded Reload 1432; CHECK-NEXT: ldr z10, [sp, #6, mul vl] // 16-byte Folded Reload 1433; CHECK-NEXT: ldr z9, [sp, #7, mul vl] // 16-byte Folded Reload 1434; CHECK-NEXT: ldr z8, [sp, #8, mul vl] // 16-byte Folded Reload 1435; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload 1436; CHECK-NEXT: addvl sp, sp, #9 1437; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 1438; CHECK-NEXT: ret 1439entry: 1440 %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() 1441 %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr) 1442 %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0 1443 %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1 1444 %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 
16 x i8>, <vscale x 16 x i8> } %1, 2 1445 %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3 1446 %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride 1447 %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2) 1448 %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0 1449 %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1 1450 %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2 1451 %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3 1452 %mul3 = shl i64 %stride, 1 1453 %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3 1454 %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4) 1455 %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0 1456 %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1 1457 %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2 1458 %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3 1459 %mul5 = mul i64 %stride, 3 1460 %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5 1461 %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6) 1462 %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0 1463 %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1 1464 %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2 1465 %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3 1466 tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0) 1467 tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0) 1468 tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0) 1469 tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0) 1470 ret void 1471} 1472 1473; == Multi, indexed (signed) == 1474 1475define void @sdot_lane_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #0 { 1476; CHECK-LABEL: sdot_lane_za32_u16_vg1x2: 1477; CHECK: // %bb.0: 1478; CHECK-NEXT: mov z5.d, z2.d 1479; CHECK-NEXT: mov z4.d, z1.d 1480; CHECK-NEXT: mov w8, w0 1481; CHECK-NEXT: sdot 
za.s[w8, 0, vgx2], { z4.h, z5.h }, z3.h[3] 1482; CHECK-NEXT: sdot za.s[w8, 7, vgx2], { z4.h, z5.h }, z3.h[3] 1483; CHECK-NEXT: ret 1484 call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, i32 3) 1485 %slice2 = add i32 %slice, 7 1486 call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, i32 3) 1487 ret void 1488} 1489 1490define void @sdot_lane_za32_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) #0 { 1491; CHECK-LABEL: sdot_lane_za32_u16_vg1x4: 1492; CHECK: // %bb.0: 1493; CHECK-NEXT: mov z27.d, z4.d 1494; CHECK-NEXT: mov z26.d, z3.d 1495; CHECK-NEXT: mov w8, w0 1496; CHECK-NEXT: mov z25.d, z2.d 1497; CHECK-NEXT: mov z24.d, z1.d 1498; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z24.h - z27.h }, z5.h[3] 1499; CHECK-NEXT: sdot za.s[w8, 7, vgx4], { z24.h - z27.h }, z5.h[3] 1500; CHECK-NEXT: ret 1501 call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, 1502 <vscale x 8 x i16> %zn4, i32 3) 1503 %slice2 = add i32 %slice, 7 1504 call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, 1505 <vscale x 8 x i16> %zn4, i32 3) 1506 ret void 1507} 1508 1509define void @sdot_lane_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) #0 { 1510; CHECK-LABEL: sdot_lane_za32_u8_vg1x2: 1511; CHECK: // %bb.0: 1512; CHECK-NEXT: mov z5.d, z2.d 1513; CHECK-NEXT: mov z4.d, z1.d 1514; CHECK-NEXT: mov w8, w0 1515; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z4.b, z5.b }, z3.b[3] 1516; CHECK-NEXT: sdot za.s[w8, 7, vgx2], { z4.b, z5.b }, z3.b[3] 1517; CHECK-NEXT: ret 1518 call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, i32 3) 1519 %slice2 = add i32 %slice, 7 1520 call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, i32 3) 1521 ret void 1522} 1523 1524define void @sdot_lane_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) #0 { 1525; CHECK-LABEL: sdot_lane_za32_u8_vg1x4: 1526; CHECK: // %bb.0: 1527; CHECK-NEXT: mov z27.d, z4.d 1528; CHECK-NEXT: mov z26.d, z3.d 1529; CHECK-NEXT: mov w8, w0 1530; CHECK-NEXT: mov z25.d, z2.d 1531; CHECK-NEXT: mov z24.d, z1.d 1532; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z5.b[3] 1533; CHECK-NEXT: sdot za.s[w8, 7, vgx4], { z24.b - z27.b }, z5.b[3] 1534; CHECK-NEXT: ret 1535 call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, 1536 <vscale x 16 x i8> %zn4, i32 3) 1537 %slice2 = add i32 %slice, 7 1538 call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, 1539 <vscale x 16 x i8> %zn4, i32 3) 1540 ret void 1541} 1542 1543define void 
@sdot_form_2x_tuple(ptr %ptr, i64 %stride) #0 { 1544; CHECK-LABEL: sdot_form_2x_tuple: 1545; CHECK: // %bb.0: // %entry 1546; CHECK-NEXT: ptrue pn8.b 1547; CHECK-NEXT: mov w8, wzr 1548; CHECK-NEXT: ld1b { z16.b, z24.b }, pn8/z, [x0] 1549; CHECK-NEXT: ld1b { z17.b, z25.b }, pn8/z, [x0, x1] 1550; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z16.b, z17.b }, z0.b[0] 1551; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z24.b, z25.b }, z0.b[0] 1552; CHECK-NEXT: ret 1553entry: 1554 %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() 1555 %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr) 1556 %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0 1557 %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1 1558 %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride 1559 %4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2) 1560 %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0 1561 %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1 1562 tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> poison, i32 0) 1563 tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> poison, i32 0) 1564 ret void 1565} 1566 1567define void @sdot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 { 1568; CHECK-LABEL: sdot_form_2x_tuple_svecc: 1569; CHECK: // %bb.0: // %entry 1570; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill 1571; CHECK-NEXT: addvl sp, sp, #-3 1572; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill 1573; CHECK-NEXT: ptrue pn8.b 1574; CHECK-NEXT: mov w8, wzr 1575; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill 1576; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill 1577; CHECK-NEXT: ld1b { z0.b, z8.b }, pn8/z, [x0] 1578; CHECK-NEXT: ld1b { z1.b, z9.b }, pn8/z, [x0, x1] 1579; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0] 1580; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z8.b, z9.b }, z0.b[0] 1581; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload 1582; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload 1583; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload 1584; CHECK-NEXT: addvl sp, sp, #3 1585; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 1586; CHECK-NEXT: ret 1587entry: 1588 %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() 1589 %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr) 1590 %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0 1591 %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1 1592 %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride 1593 %4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2) 1594 %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0 1595 %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1 1596 tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> poison, i32 0) 1597 tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 
0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> poison, i32 0) 1598 ret void 1599} 1600 1601define void @sdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 { 1602; CHECK-LABEL: sdot_form_4x_tuple: 1603; CHECK: // %bb.0: // %entry 1604; CHECK-NEXT: lsl x9, x1, #1 1605; CHECK-NEXT: ptrue pn8.b 1606; CHECK-NEXT: mov w8, wzr 1607; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0] 1608; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1] 1609; CHECK-NEXT: add x10, x9, x1 1610; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9] 1611; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10] 1612; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0] 1613; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0] 1614; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0] 1615; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0] 1616; CHECK-NEXT: ret 1617entry: 1618 %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() 1619 %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr) 1620 %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0 1621 %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1 1622 %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2 1623 %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3 1624 %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride 1625 %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2) 1626 %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0 1627 %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1 1628 %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2 1629 %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3 1630 %mul3 = shl i64 %stride, 1 1631 %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3 1632 %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4) 1633 %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0 1634 %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1 1635 %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2 1636 %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3 1637 %mul5 = mul i64 %stride, 3 1638 %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5 1639 %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6) 1640 %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0 1641 %18 = extractvalue { <vscale x 16 x i8>, <vscale 
x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
  %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
  %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
  tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
  tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
  tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
  tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
  ret void
}

define void @sdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 {
; CHECK-LABEL: sdot_form_4x_tuple_svecc:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-9
; CHECK-NEXT: lsl x9, x1, #1
; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: str z15, [sp, #1, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: mov w8, wzr
; CHECK-NEXT: str z14, [sp, #2, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: add x10, x9, x1
; CHECK-NEXT: str z13, [sp, #3, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z12, [sp, #4, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z11, [sp, #5, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z10, [sp, #6, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z9, [sp, #7, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z8, [sp, #8, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0]
; CHECK-NEXT: ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0, x1]
; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9]
; CHECK-NEXT: ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x10]
; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0]
; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0]
; CHECK-NEXT: ldr z15, [sp, #1, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z14, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z13, [sp, #3, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z12, [sp, #4, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z11, [sp, #5, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z10, [sp, #6, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z9, [sp, #7, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z8, [sp, #8, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #9
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
  %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
  %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
  %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
  %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
  %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
  %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
  %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
  %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
  %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
  %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
  %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
  %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
  %mul3 = shl i64 %stride, 1
  %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
  %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
  %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
  %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
  %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
  %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
  %mul5 = mul i64 %stride, 3
  %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
  %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
  %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
  %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
  %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
  %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
  tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
  tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
  tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
  tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
  ret void
}

define void @sdot_lane_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #1 {
; CHECK-LABEL: sdot_lane_za64_u16_vg1x2:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z5.d, z2.d
; CHECK-NEXT: mov z4.d, z1.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: sdot za.d[w8, 0, vgx2], { z4.h, z5.h }, z3.h[1]
; CHECK-NEXT: sdot za.d[w8, 7, vgx2], { z4.h, z5.h }, z3.h[1]
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.sdot.lane.za64.vg1x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, i32 1)
  %slice2 = add i32 %slice, 7
  call void @llvm.aarch64.sme.sdot.lane.za64.vg1x2.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, i32 1)
  ret void
}

define void @sdot_lane_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) #1 {
; CHECK-LABEL: sdot_lane_za64_u16_vg1x4:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z27.d, z4.d
; CHECK-NEXT: mov z26.d, z3.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: sdot za.d[w8, 0, vgx4], { z24.h - z27.h }, z5.h[1]
; CHECK-NEXT: sdot za.d[w8, 7, vgx4], { z24.h - z27.h }, z5.h[1]
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.sdot.lane.za64.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
                                                           <vscale x 8 x i16> %zn4, i32 1)
  %slice2 = add i32 %slice, 7
  call void @llvm.aarch64.sme.sdot.lane.za64.vg1x4.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
                                                           <vscale x 8 x i16> %zn4, i32 1)
  ret void
}



define void @sudot_lane_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) #0 {
; CHECK-LABEL: sudot_lane_za32_u8_vg1x2:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z5.d, z2.d
; CHECK-NEXT: mov z4.d, z1.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z4.b, z5.b }, z3.b[3]
; CHECK-NEXT: sudot za.s[w8, 7, vgx2], { z4.b, z5.b }, z3.b[3]
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, i32 3)
  %slice2 = add i32 %slice, 7
  call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, i32 3)
  ret void
}

define void @sudot_lane_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) #0 {
; CHECK-LABEL: sudot_lane_za32_u8_vg1x4:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z27.d, z4.d
; CHECK-NEXT: mov z26.d, z3.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z24.b - z27.b }, z5.b[3]
; CHECK-NEXT: sudot za.s[w8, 7, vgx4], { z24.b - z27.b }, z5.b[3]
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
                                                            <vscale x 16 x i8> %zn4, i32 3)
  %slice2 = add i32 %slice, 7
  call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
                                                            <vscale x 16 x i8> %zn4, i32 3)
  ret void
}

define void @sudot_form_2x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-LABEL: sudot_form_2x_tuple:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: mov w8, wzr
; CHECK-NEXT: ld1b { z16.b, z24.b }, pn8/z, [x0]
; CHECK-NEXT: ld1b { z17.b, z25.b }, pn8/z, [x0, x1]
; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z16.b, z17.b }, z0.b[0]
; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z24.b, z25.b }, z0.b[0]
; CHECK-NEXT: ret
entry:
  %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
  %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
  %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
  %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
  %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
  %4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
  %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
  %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
  tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> poison, i32 0)
  tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> poison, i32 0)
  ret void
}

define void @sudot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 {
; CHECK-LABEL: sudot_form_2x_tuple_svecc:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-3
; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: mov w8, wzr
; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: ld1b { z0.b, z8.b }, pn8/z, [x0]
; CHECK-NEXT: ld1b { z1.b, z9.b }, pn8/z, [x0, x1]
; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0]
; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z8.b, z9.b }, z0.b[0]
; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #3
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
  %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
  %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
  %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
  %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
  %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
  %4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
  %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
  %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
  tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> poison, i32 0)
  tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> poison, i32 0)
  ret void
}

define void @sudot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-LABEL: sudot_form_4x_tuple:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: lsl x9, x1, #1
; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: mov w8, wzr
; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0]
; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1]
; CHECK-NEXT: add x10, x9, x1
; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
; CHECK-NEXT: ret
entry:
  %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
  %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
  %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
  %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
  %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
  %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
  %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
  %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
  %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
  %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
  %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
  %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
  %mul3 = shl i64 %stride, 1
  %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
  %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
  %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
  %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
  %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
  %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
  %mul5 = mul i64 %stride, 3
  %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
  %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
  %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
  %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
  %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
  %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
  tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
  tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
  tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
  tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
  ret void
}

define void @sudot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 {
; CHECK-LABEL: sudot_form_4x_tuple_svecc:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-9
; CHECK-NEXT: lsl x9, x1, #1
; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: str z15, [sp, #1, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: mov w8, wzr
; CHECK-NEXT: str z14, [sp, #2, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: add x10, x9, x1
; CHECK-NEXT: str z13, [sp, #3, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z12, [sp, #4, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z11, [sp, #5, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z10, [sp, #6, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z9, [sp, #7, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z8, [sp, #8, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0]
; CHECK-NEXT: ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0, x1]
; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9]
; CHECK-NEXT: ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x10]
; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0]
; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0]
; CHECK-NEXT: ldr z15, [sp, #1, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z14, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z13, [sp, #3, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z12, [sp, #4, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z11, [sp, #5, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z10, [sp, #6, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z9, [sp, #7, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z8, [sp, #8, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #9
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
  %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
  %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
  %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
  %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
  %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
  %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
  %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
  %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
  %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
  %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
  %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
  %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
  %mul3 = shl i64 %stride, 1
  %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
  %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
  %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
  %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
  %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
  %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
  %mul5 = mul i64 %stride, 3
  %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
  %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
  %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
  %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
  %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
  %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
  tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
  tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
  tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
  tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
  ret void
}


attributes #0 = { nounwind "target-features"="+sme2" }
attributes #1 = { nounwind "target-features"="+sme2,+sme-i16i64" }

; == Multi, multi (unsigned)

declare void @llvm.aarch64.sme.udot.za32.vg1x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.udot.za32.vg1x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
                                                       <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.udot.za32.vg1x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.udot.za32.vg1x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>,
                                                       <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.udot.za64.vg1x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.udot.za64.vg1x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
                                                       <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.usdot.za32.vg1x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.usdot.za32.vg1x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>,
                                                        <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)

; == Multi, multi (signed)

declare void @llvm.aarch64.sme.sdot.za32.vg1x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.sdot.za32.vg1x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
                                                       <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.sdot.za32.vg1x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.sdot.za32.vg1x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>,
                                                       <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.sdot.za64.vg1x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.sdot.za64.vg1x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
                                                       <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)

; == Multi, single (unsigned)

declare void @llvm.aarch64.sme.udot.single.za32.vg1x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.udot.single.za32.vg1x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.udot.single.za32.vg1x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.udot.single.za32.vg1x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.udot.single.za64.vg1x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.udot.single.za64.vg1x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.usdot.single.za32.vg1x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.usdot.single.za32.vg1x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)

; == Multi, single (signed)

declare void @llvm.aarch64.sme.sdot.single.za32.vg1x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.sdot.single.za32.vg1x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.sdot.single.za32.vg1x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.sdot.single.za32.vg1x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.sdot.single.za64.vg1x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.sdot.single.za64.vg1x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.sudot.single.za32.vg1x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.sudot.single.za32.vg1x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)

; == Multi, indexed (unsigned)

declare void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
declare void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
declare void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
declare void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
declare void @llvm.aarch64.sme.udot.lane.za64.vg1x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
declare void @llvm.aarch64.sme.udot.lane.za64.vg1x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
declare void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
declare void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)

; == Multi, indexed (signed)

declare void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
declare void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
declare void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
declare void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
declare void @llvm.aarch64.sme.sdot.lane.za64.vg1x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
declare void @llvm.aarch64.sme.sdot.lane.za64.vg1x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
declare void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
declare void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)