; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2,+sme-i16i64 -enable-subreg-liveness -force-streaming -verify-machineinstrs < %s | FileCheck %s

; == FVDOT ==

define void @test_fvdot_lane_za32_vg1x2_nxv8f16(i32 %slice, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zm) {
; CHECK-LABEL: test_fvdot_lane_za32_vg1x2_nxv8f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    fvdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z2.h[3]
; CHECK-NEXT:    fvdot za.s[w8, 7, vgx2], { z0.h, z1.h }, z2.h[3]
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fvdot.lane.za32.vg1x2.nxv8f16(i32 %slice, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zm, i32 3)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fvdot.lane.za32.vg1x2.nxv8f16(i32 %slice.7, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zm, i32 3)
  ret void
}

; == BFVDOT ==

define void @test_fvdot_lane_za32_vg1x2_nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zm) {
; CHECK-LABEL: test_fvdot_lane_za32_vg1x2_nxv8bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    bfvdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z2.h[3]
; CHECK-NEXT:    bfvdot za.s[w8, 7, vgx2], { z0.h, z1.h }, z2.h[3]
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fvdot.lane.za32.vg1x2.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zm, i32 3)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fvdot.lane.za32.vg1x2.nxv8bf16(i32 %slice.7, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zm, i32 3)
  ret void
}

; == SVDOT ==

define void @test_svdot_lane_za32_vg1x2_nxv8i16(i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: test_svdot_lane_za32_vg1x2_nxv8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    svdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z2.h[3]
; CHECK-NEXT:    svdot za.s[w8, 7, vgx2], { z0.h, z1.h }, z2.h[3]
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.svdot.lane.za32.vg1x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm, i32 3)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.svdot.lane.za32.vg1x2.nxv8i16(i32 %slice.7, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm, i32 3)
  ret void
}

define void @test_svdot_lane_za32_vg1x4_nxv16i8(i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: test_svdot_lane_za32_vg1x4_nxv16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    svdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z4.b[3]
; CHECK-NEXT:    svdot za.s[w8, 7, vgx4], { z0.b - z3.b }, z4.b[3]
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zm, i32 3)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 %slice.7, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zm, i32 3)
  ret void
}

define void @test_svdot_lane_za64_vg1x4_nxv8i16(i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: test_svdot_lane_za64_vg1x4_nxv8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    svdot za.d[w8, 0, vgx4], { z0.h - z3.h }, z4.h[1]
; CHECK-NEXT:    svdot za.d[w8, 7, vgx4], { z0.h - z3.h }, z4.h[1]
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.svdot.lane.za64.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zm, i32 1)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.svdot.lane.za64.vg1x4.nxv8i16(i32 %slice.7, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zm, i32 1)
  ret void
}
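
; The *_form_*x_tuple tests below load their multi-vector operands with
; strided ld1 (x2/x4) loads and feed them straight into the vgx2/vgx4 dot
; instructions. The expected output checks that the register allocator forms
; the required register tuples (e.g. { z16.h, z17.h } out of the strided
; loads) without inserting extra vector copies.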

define void @svdot_form_2x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-LABEL: svdot_form_2x_tuple:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ptrue pn8.b
; CHECK-NEXT:    add x9, x0, x1
; CHECK-NEXT:    mov w8, wzr
; CHECK-NEXT:    ld1h { z16.h, z24.h }, pn8/z, [x0]
; CHECK-NEXT:    ld1h { z17.h, z25.h }, pn8/z, [x9]
; CHECK-NEXT:    svdot za.s[w8, 0, vgx2], { z16.h, z17.h }, z0.h[0]
; CHECK-NEXT:    svdot za.s[w8, 0, vgx2], { z24.h, z25.h }, z0.h[0]
; CHECK-NEXT:    ret
entry:
  %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
  %1 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %ptr)
  %2 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 0
  %3 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 1
  %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
  %4 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx2)
  %5 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 0
  %6 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 1
  tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %2, <vscale x 8 x i16> %5, <vscale x 8 x i16> poison, i32 0)
  tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %3, <vscale x 8 x i16> %6, <vscale x 8 x i16> poison, i32 0)
  ret void
}

define void @svdot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 {
; CHECK-LABEL: svdot_form_2x_tuple_svecc:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-3
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    ptrue pn8.b
; CHECK-NEXT:    add x9, x0, x1
; CHECK-NEXT:    str z9, [sp, #1, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    mov w8, wzr
; CHECK-NEXT:    str z8, [sp, #2, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    ld1h { z0.h, z8.h }, pn8/z, [x0]
; CHECK-NEXT:    ld1h { z1.h, z9.h }, pn8/z, [x9]
; CHECK-NEXT:    svdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z0.h[0]
; CHECK-NEXT:    svdot za.s[w8, 0, vgx2], { z8.h, z9.h }, z0.h[0]
; CHECK-NEXT:    ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #3
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
entry:
  %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
  %1 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %ptr)
  %2 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 0
  %3 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 1
  %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
  %4 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx2)
  %5 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 0
  %6 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 1
  tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %2, <vscale x 8 x i16> %5, <vscale x 8 x i16> poison, i32 0)
  tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %3, <vscale x 8 x i16> %6, <vscale x 8 x i16> poison, i32 0)
  ret void
}

define void @svdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-LABEL: svdot_form_4x_tuple:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    lsl x9, x1, #1
; CHECK-NEXT:    ptrue pn8.b
; CHECK-NEXT:    mov w8, wzr
; CHECK-NEXT:    ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0]
; CHECK-NEXT:    ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1]
; CHECK-NEXT:    add x10, x9, x1
; CHECK-NEXT:    ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
; CHECK-NEXT:    ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
; CHECK-NEXT:    svdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
; CHECK-NEXT:    svdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
; CHECK-NEXT:    svdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
; CHECK-NEXT:    svdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
; CHECK-NEXT:    ret
entry:
  %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
  %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
  %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
  %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
  %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
  %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
  %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
  %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
  %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
  %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
  %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
  %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
  %mul3 = shl i64 %stride, 1
  %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
  %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
  %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
  %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
  %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
  %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
  %mul5 = mul i64 %stride, 3
  %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
  %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
  %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
  %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
  %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
  %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
  tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
  tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
  tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
  tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
  ret void
}

define void @svdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 {
; CHECK-LABEL: svdot_form_4x_tuple_svecc:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-9
; CHECK-NEXT:    lsl x9, x1, #1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    ptrue pn8.b
; CHECK-NEXT:    str z15, [sp, #1, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    mov w8, wzr
; CHECK-NEXT:    str z14, [sp, #2, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    add x10, x9, x1
; CHECK-NEXT:    str z13, [sp, #3, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z12, [sp, #4, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z11, [sp, #5, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z10, [sp, #6, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z9, [sp, #7, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z8, [sp, #8, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0]
; CHECK-NEXT:    ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0, x1]
; CHECK-NEXT:    ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9]
; CHECK-NEXT:    ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x10]
; CHECK-NEXT:    svdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
; CHECK-NEXT:    svdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0]
; CHECK-NEXT:    svdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
; CHECK-NEXT:    svdot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0]
; CHECK-NEXT:    ldr z15, [sp, #1, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z14, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z13, [sp, #3, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z12, [sp, #4, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z11, [sp, #5, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z10, [sp, #6, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z9, [sp, #7, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z8, [sp, #8, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #9
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
entry:
  %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
  %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
  %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
  %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
  %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
  %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
  %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
  %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
  %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
  %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
  %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
  %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
  %mul3 = shl i64 %stride, 1
  %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
  %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
  %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
  %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
  %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
  %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
  %mul5 = mul i64 %stride, 3
  %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
  %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
  %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
  %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
  %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
  %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
  tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
  tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
  tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
  tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
  ret void
}

; == UVDOT ==

define void @test_uvdot_lane_za32_vg1x2_nxv8i16(i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: test_uvdot_lane_za32_vg1x2_nxv8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    uvdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z2.h[3]
; CHECK-NEXT:    uvdot za.s[w8, 7, vgx2], { z0.h, z1.h }, z2.h[3]
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm, i32 3)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x2.nxv8i16(i32 %slice.7, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm, i32 3)
  ret void
}

define void @test_uvdot_lane_za32_vg1x4_nxv16i8(i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: test_uvdot_lane_za32_vg1x4_nxv16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    uvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z4.b[3]
; CHECK-NEXT:    uvdot za.s[w8, 7, vgx4], { z0.b - z3.b }, z4.b[3]
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zm, i32 3)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 %slice.7, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zm, i32 3)
  ret void
}

define void @test_uvdot_lane_za64_vg1x4_nxv8i16(i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: test_uvdot_lane_za64_vg1x4_nxv8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    uvdot za.d[w8, 0, vgx4], { z0.h - z3.h }, z4.h[1]
; CHECK-NEXT:    uvdot za.d[w8, 7, vgx4], { z0.h - z3.h }, z4.h[1]
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.uvdot.lane.za64.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zm, i32 1)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.uvdot.lane.za64.vg1x4.nxv8i16(i32 %slice.7, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zm, i32 1)
  ret void
}

define void @uvdot_form_2x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-LABEL: uvdot_form_2x_tuple:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ptrue pn8.b
; CHECK-NEXT:    add x9, x0, x1
; CHECK-NEXT:    mov w8, wzr
; CHECK-NEXT:    ld1h { z16.h, z24.h }, pn8/z, [x0]
; CHECK-NEXT:    ld1h { z17.h, z25.h }, pn8/z, [x9]
; CHECK-NEXT:    uvdot za.s[w8, 0, vgx2], { z16.h, z17.h }, z0.h[0]
; CHECK-NEXT:    uvdot za.s[w8, 0, vgx2], { z24.h, z25.h }, z0.h[0]
; CHECK-NEXT:    ret
entry:
  %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
  %1 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %ptr)
  %2 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 0
  %3 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 1
  %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
  %4 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx2)
  %5 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 0
  %6 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 1
  tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %2, <vscale x 8 x i16> %5, <vscale x 8 x i16> poison, i32 0)
  tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %3, <vscale x 8 x i16> %6, <vscale x 8 x i16> poison, i32 0)
  ret void
}

define void @uvdot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 {
; CHECK-LABEL: uvdot_form_2x_tuple_svecc:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-3
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    ptrue pn8.b
; CHECK-NEXT:    add x9, x0, x1
; CHECK-NEXT:    str z9, [sp, #1, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    mov w8, wzr
; CHECK-NEXT:    str z8, [sp, #2, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    ld1h { z0.h, z8.h }, pn8/z, [x0]
; CHECK-NEXT:    ld1h { z1.h, z9.h }, pn8/z, [x9]
; CHECK-NEXT:    uvdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z0.h[0]
; CHECK-NEXT:    uvdot za.s[w8, 0, vgx2], { z8.h, z9.h }, z0.h[0]
; CHECK-NEXT:    ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #3
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
entry:
  %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
  %1 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %ptr)
  %2 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 0
  %3 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 1
  %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
  %4 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx2)
  %5 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 0
  %6 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 1
  tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %2, <vscale x 8 x i16> %5, <vscale x 8 x i16> poison, i32 0)
  tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %3, <vscale x 8 x i16> %6, <vscale x 8 x i16> poison, i32 0)
  ret void
}

define void @uvdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-LABEL: uvdot_form_4x_tuple:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    lsl x9, x1, #1
; CHECK-NEXT:    ptrue pn8.b
; CHECK-NEXT:    mov w8, wzr
; CHECK-NEXT:    ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0]
; CHECK-NEXT:    ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1]
; CHECK-NEXT:    add x10, x9, x1
; CHECK-NEXT:    ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
; CHECK-NEXT:    ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
; CHECK-NEXT:    uvdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
; CHECK-NEXT:    uvdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
; CHECK-NEXT:    uvdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
; CHECK-NEXT:    uvdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
; CHECK-NEXT:    ret
entry:
  %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
  %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
  %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
  %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
  %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
  %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
  %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
  %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
  %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
  %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
  %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
  %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
  %mul3 = shl i64 %stride, 1
  %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
  %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
  %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
  %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
  %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
  %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
  %mul5 = mul i64 %stride, 3
  %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
  %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
  %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
  %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
  %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
  %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
  tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
  tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
  tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
  tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
  ret void
}

define void @uvdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 {
; CHECK-LABEL: uvdot_form_4x_tuple_svecc:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-9
; CHECK-NEXT:    lsl x9, x1, #1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    ptrue pn8.b
; CHECK-NEXT:    str z15, [sp, #1, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    mov w8, wzr
; CHECK-NEXT:    str z14, [sp, #2, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    add x10, x9, x1
; CHECK-NEXT:    str z13, [sp, #3, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z12, [sp, #4, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z11, [sp, #5, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z10, [sp, #6, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z9, [sp, #7, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z8, [sp, #8, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0]
; CHECK-NEXT:    ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0, x1]
; CHECK-NEXT:    ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9]
; CHECK-NEXT:    ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x10]
; CHECK-NEXT:    uvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
; CHECK-NEXT:    uvdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0]
; CHECK-NEXT:    uvdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
; CHECK-NEXT:    uvdot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0]
; CHECK-NEXT:    ldr z15, [sp, #1, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z14, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z13, [sp, #3, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z12, [sp, #4, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z11, [sp, #5, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z10, [sp, #6, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z9, [sp, #7, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z8, [sp, #8, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #9
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
entry:
  %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
  %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
  %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
  %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
  %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
  %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
  %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
  %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
  %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
  %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
  %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
  %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
  %mul3 = shl i64 %stride, 1
  %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
  %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
  %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
  %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
  %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
  %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
  %mul5 = mul i64 %stride, 3
  %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
  %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
  %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
  %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
  %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
  %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
  tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
  tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
  tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
  tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
  ret void
}

; == SUVDOT ==

define void @test_suvdot_lane_za32_vg1x4_nxv16i8(i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: test_suvdot_lane_za32_vg1x4_nxv16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    suvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z4.b[3]
; CHECK-NEXT:    suvdot za.s[w8, 7, vgx4], { z0.b - z3.b }, z4.b[3]
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zm, i32 3)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 %slice.7, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zm, i32 3)
  ret void
}

define void @suvdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-LABEL: suvdot_form_4x_tuple:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    lsl x9, x1, #1
; CHECK-NEXT:    ptrue pn8.b
; CHECK-NEXT:    mov w8, wzr
; CHECK-NEXT:    ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0]
; CHECK-NEXT:    ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1]
; CHECK-NEXT:    add x10, x9, x1
; CHECK-NEXT:    ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
; CHECK-NEXT:    ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
; CHECK-NEXT:    suvdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
; CHECK-NEXT:    suvdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
; CHECK-NEXT:    suvdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
; CHECK-NEXT:    suvdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
; CHECK-NEXT:    ret
entry:
  %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
  %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
  %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
  %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
  %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
  %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
  %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
  %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
  %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
  %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
  %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
  %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
  %mul3 = shl i64 %stride, 1
  %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
  %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
  %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
  %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
  %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
  %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
  %mul5 = mul i64 %stride, 3
  %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
  %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
  %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
  %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
  %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
  %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
  tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
  tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
  tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
  tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
  ret void
}

define void @suvdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 {
; CHECK-LABEL: suvdot_form_4x_tuple_svecc:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-9
; CHECK-NEXT:    lsl x9, x1, #1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    ptrue pn8.b
; CHECK-NEXT:    str z15, [sp, #1, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    mov w8, wzr
; CHECK-NEXT:    str z14, [sp, #2, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    add x10, x9, x1
; CHECK-NEXT:    str z13, [sp, #3, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z12, [sp, #4, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z11, [sp, #5, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z10, [sp, #6, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z9, [sp, #7, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z8, [sp, #8, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0]
; CHECK-NEXT:    ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0, x1]
; CHECK-NEXT:    ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9]
; CHECK-NEXT:    ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x10]
; CHECK-NEXT:    suvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
; CHECK-NEXT:    suvdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0]
; CHECK-NEXT:    suvdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
; CHECK-NEXT:    suvdot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0]
; CHECK-NEXT:    ldr z15, [sp, #1, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z14, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z13, [sp, #3, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z12, [sp, #4, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z11, [sp, #5, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z10, [sp, #6, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z9, [sp, #7, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z8, [sp, #8, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #9
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
entry:
  %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
  %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
  %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
  %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
  %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
  %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
  %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
  %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
  %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
  %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
  %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
  %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
  %mul3 = shl i64 %stride, 1
  %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
  %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
  %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
  %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
  %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
  %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
  %mul5 = mul i64 %stride, 3
  %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
  %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
  %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
  %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
  %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
  %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
  tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
  tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
  tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
  tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
  ret void
}

; == USVDOT ==

define void @test_usvdot_lane_za32_vg1x4_nxv16i8(i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: test_usvdot_lane_za32_vg1x4_nxv16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    usvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z4.b[3]
; CHECK-NEXT:    usvdot za.s[w8, 7, vgx4], { z0.b - z3.b }, z4.b[3]
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zm, i32 3)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 %slice.7, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zm, i32 3)
  ret void
}

define void @usvdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-LABEL: usvdot_form_4x_tuple:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    lsl x9, x1, #1
; CHECK-NEXT:    ptrue pn8.b
; CHECK-NEXT:    mov w8, wzr
; CHECK-NEXT:    ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0]
; CHECK-NEXT:    ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1]
; CHECK-NEXT:    add x10, x9, x1
; CHECK-NEXT:    ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
; CHECK-NEXT:    ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
; CHECK-NEXT:    usvdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
; CHECK-NEXT:    usvdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
; CHECK-NEXT:    usvdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
; CHECK-NEXT:    usvdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
; CHECK-NEXT:    ret
entry:
  %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
  %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
  %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
  %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
  %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
  %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
  %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
  %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
  %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
  %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
  %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
  %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
  %mul3 = shl i64 %stride, 1
  %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
  %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
  %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
  %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
  %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
  %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
  %mul5 = mul i64 %stride, 3
  %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
  %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
  %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
  %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
  %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
  %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
  tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
  tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
  tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
  tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
  ret void
}

define void @usvdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 {
; CHECK-LABEL: usvdot_form_4x_tuple_svecc:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-9
; CHECK-NEXT:    lsl x9, x1, #1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    ptrue pn8.b
; CHECK-NEXT:    str z15, [sp, #1, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    mov w8, wzr
; CHECK-NEXT:    str z14, [sp, #2, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    add x10, x9, x1
; CHECK-NEXT:    str z13, [sp, #3, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z12, [sp, #4, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z11, [sp, #5, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z10, [sp, #6, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z9, [sp, #7, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z8, [sp, #8, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0]
; CHECK-NEXT:    ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0, x1]
; CHECK-NEXT:    ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9]
; CHECK-NEXT:    ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x10]
; CHECK-NEXT:    usvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
; CHECK-NEXT:    usvdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0]
; CHECK-NEXT:    usvdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
; CHECK-NEXT:    usvdot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0]
; CHECK-NEXT:    ldr z15, [sp, #1, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z14, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z13, [sp, #3, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z12, [sp, #4, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z11, [sp, #5, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z10, [sp, #6, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z9, [sp, #7, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z8, [sp, #8, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #9
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
entry:
  %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
  %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
  %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
  %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
  %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
  %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
  %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
  %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
  %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
  %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
  %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
  %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
  %mul3 = shl i64 %stride, 1
  %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
  %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
  %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
  %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
  %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
  %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
  %mul5 = mul i64 %stride, 3
  %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
  %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
  %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
  %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
  %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
  %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
  tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
  tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
  tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
  tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
  ret void
}

attributes #0 = { nounwind "target-features"="+sme2" }
attributes #1 = { nounwind "target-features"="+sme2,+sme-i16i64" }

; == FVDOT ==
declare void @llvm.aarch64.sme.fvdot.lane.za32.vg1x2.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, i32)
declare void @llvm.aarch64.sme.fvdot.lane.za32.vg1x2.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32)

; == SVDOT ==
declare void @llvm.aarch64.sme.svdot.lane.za32.vg1x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
declare void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
declare void @llvm.aarch64.sme.svdot.lane.za64.vg1x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)

; == UVDOT ==
declare void @llvm.aarch64.sme.uvdot.lane.za32.vg1x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
declare void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
declare void @llvm.aarch64.sme.uvdot.lane.za64.vg1x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)

; == SUVDOT ==
declare void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)

; == USVDOT ==
declare void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)