; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-int8,+amx-bf16,+avx512f -verify-machineinstrs | FileCheck %s

define void @test_amx(ptr %pointer, ptr %base, i64 %stride) {
; CHECK-LABEL: test_amx:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $8, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $8, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $8, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $8, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $8, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $8, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $8, %ax
; CHECK-NEXT:    tilezero %tmm0
; CHECK-NEXT:    tileloadd (%rsi,%rdx), %tmm1
; CHECK-NEXT:    tileloadd (%rsi,%rdx), %tmm2
; CHECK-NEXT:    tdpbssd %tmm2, %tmm1, %tmm0
; CHECK-NEXT:    tdpbsud %tmm2, %tmm1, %tmm0
; CHECK-NEXT:    tdpbusd %tmm2, %tmm1, %tmm0
; CHECK-NEXT:    tdpbuud %tmm2, %tmm1, %tmm0
; CHECK-NEXT:    tdpbf16ps %tmm2, %tmm1, %tmm0
; CHECK-NEXT:    tileloaddt1 (%rsi,%rdx), %tmm1
; CHECK-NEXT:    tilestored %tmm0, (%rdi,%rdx)
; CHECK-NEXT:    tilerelease
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %c = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8)
  %a = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, ptr %base, i64 %stride)
  %b = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, ptr %base, i64 %stride)
  %d0 = call x86_amx @llvm.x86.tdpbssd.internal(i16 8, i16 8, i16 8, x86_amx %c, x86_amx %a, x86_amx %b)
  %d1 = call x86_amx @llvm.x86.tdpbsud.internal(i16 8, i16 8, i16 8, x86_amx %d0, x86_amx %a, x86_amx %b)
  %d2 = call x86_amx @llvm.x86.tdpbusd.internal(i16 8, i16 8, i16 8, x86_amx %d1, x86_amx %a, x86_amx %b)
  %d3 = call x86_amx @llvm.x86.tdpbuud.internal(i16 8, i16 8, i16 8, x86_amx %d2, x86_amx %a, x86_amx %b)
  %d4 = call x86_amx @llvm.x86.tdpbf16ps.internal(i16 8, i16 8, i16 8, x86_amx %d3, x86_amx %a, x86_amx %b)
  %e = call x86_amx @llvm.x86.tileloaddt164.internal(i16 8, i16 8, ptr %base, i64 %stride)
  call void @llvm.x86.tilestored64.internal(i16 8, i16 8, ptr %pointer, i64 %stride, x86_amx %d4)

  ret void
}

declare x86_amx @llvm.x86.tilezero.internal(i16, i16)
declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64)
declare x86_amx @llvm.x86.tileloaddt164.internal(i16, i16, ptr, i64)
declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare x86_amx @llvm.x86.tdpbsud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare x86_amx @llvm.x86.tdpbusd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare x86_amx @llvm.x86.tdpbuud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare x86_amx @llvm.x86.tdpbf16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx)

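; Reduced test case for PR90954. The copy of the zeroed tile is lowered to a
; tilestored/tileloadd pair through a stack slot, and the lowering materializes
; the 64-byte stride in %rax (movabsq $64, %rax below), so the live value of
; %rax must be spilled and reloaded around the copy.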
define void @PR90954(ptr %0, ptr %1, i32 %2) nounwind {
; CHECK-LABEL: PR90954:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    movq %rsp, %rbp
; CHECK-NEXT:    pushq %r15
; CHECK-NEXT:    pushq %r14
; CHECK-NEXT:    pushq %r13
; CHECK-NEXT:    pushq %r12
; CHECK-NEXT:    pushq %rbx
; CHECK-NEXT:    andq $-1024, %rsp # imm = 0xFC00
; CHECK-NEXT:    subq $5120, %rsp # imm = 0x1400
; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    vmovups %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $1, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $16, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $64, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $16, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $64, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $16, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $64, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; CHECK-NEXT:    shll $4, %edx
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    movw $64, %cx
; CHECK-NEXT:    movw $16, %di
; CHECK-NEXT:    movb $1, %r8b
; CHECK-NEXT:    movl $64, %r9d
; CHECK-NEXT:    leaq {{[0-9]+}}(%rsp), %r10
; CHECK-NEXT:    leaq {{[0-9]+}}(%rsp), %r11
; CHECK-NEXT:    xorl %ebx, %ebx
; CHECK-NEXT:    xorl %r14d, %r14d
; CHECK-NEXT:    jmp .LBB1_1
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB1_5: # in Loop: Header=BB1_1 Depth=1
; CHECK-NEXT:    incq %r14
; CHECK-NEXT:    addl %edx, %ebx
; CHECK-NEXT:  .LBB1_1: # =>This Loop Header: Depth=1
; CHECK-NEXT:    # Child Loop BB1_2 Depth 2
; CHECK-NEXT:    movslq %ebx, %r15
; CHECK-NEXT:    leaq (%rsi,%r15,4), %r15
; CHECK-NEXT:    xorl %r12d, %r12d
; CHECK-NEXT:    xorl %r13d, %r13d
; CHECK-NEXT:    jmp .LBB1_2
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB1_4: # in Loop: Header=BB1_2 Depth=2
; CHECK-NEXT:    tilestored %tmm1, (%r15,%rax)
; CHECK-NEXT:    incq %r13
; CHECK-NEXT:    addq $64, %r15
; CHECK-NEXT:    decq %r12
; CHECK-NEXT:    je .LBB1_5
; CHECK-NEXT:  .LBB1_2: # Parent Loop BB1_1 Depth=1
; CHECK-NEXT:    # => This Inner Loop Header: Depth=2
; CHECK-NEXT:    tilezero %tmm0
; CHECK-NEXT:    tilezero %tmm1
; CHECK-NEXT:    testb %r8b, %r8b
; CHECK-NEXT:    jne .LBB1_4
; CHECK-NEXT:  # %bb.3: # in Loop: Header=BB1_2 Depth=2
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    tileloadd (%r10,%r9), %tmm1
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    tileloadd (%r11,%r9), %tmm2
; CHECK-NEXT:    tdpbf16ps %tmm2, %tmm1, %tmm0
; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT:    movabsq $64, %rax
; CHECK-NEXT:    tilestored %tmm0, 3072(%rsp,%rax) # 1024-byte Folded Spill
; CHECK-NEXT:    tileloadd 3072(%rsp,%rax), %tmm1 # 1024-byte Folded Reload
; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; CHECK-NEXT:    jmp .LBB1_4
  %4 = shl i32 %2, 4
  %5 = icmp eq i64 0, 0
  br label %6

6:                                                ; preds = %31, %3
  %7 = phi i64 [ 0, %3 ], [ %32, %31 ]
  %8 = trunc nuw nsw i64 %7 to i32
  %9 = mul i32 %4, %8
  %10 = mul i32 0, %8
  %11 = sext i32 %9 to i64
  %12 = getelementptr inbounds i32, ptr %1, i64 %11
  br label %13

13:                                               ; preds = %25, %6
  %14 = phi i64 [ %29, %25 ], [ 0, %6 ]
  %15 = tail call x86_amx @llvm.x86.tilezero.internal(i16 16, i16 64)
  %16 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %15)
  %17 = shl nsw i64 %14, 4
  %18 = getelementptr i32, ptr %0, i64 %17
  br i1 %5, label %25, label %19

19:                                               ; preds = %13
  %20 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %16)
  %21 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> zeroinitializer)
  %22 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> zeroinitializer)
  %23 = tail call x86_amx @llvm.x86.tdpbf16ps.internal(i16 16, i16 64, i16 64, x86_amx %20, x86_amx %21, x86_amx %22)
  %24 = tail call noundef <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %23)
  br label %25

25:                                               ; preds = %19, %13
  %26 = phi <256 x i32> [ undef, %13 ], [ %24, %19 ]
  %27 = getelementptr inbounds i32, ptr %12, i64 %17
  %28 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %26)
  tail call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr %27, i64 0, x86_amx %28)
  %29 = add nuw nsw i64 %14, 1
  %30 = icmp eq i64 %29, 0
  br i1 %30, label %31, label %13

31:                                               ; preds = %25
  %32 = add nuw nsw i64 %7, 1
  br label %6
}

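; The tilezero result has multiple uses. tdpbf16ps accumulates into its
; destination in place, so the tile is first copied through the stack into
; %tmm1, keeping %tmm0 intact as a source for the second dot product.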
define void @multi_use() nounwind {
; CHECK-LABEL: multi_use:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    subq $2928, %rsp # imm = 0xB70
; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    vmovups %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $1, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $16, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $64, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $16, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $64, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $64, %ax
; CHECK-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $16, %cx
; CHECK-NEXT:    tilezero %tmm0
; CHECK-NEXT:    movabsq $64, %rbp
; CHECK-NEXT:    tilestored %tmm0, 896(%rsp,%rbp) # 1024-byte Folded Spill
; CHECK-NEXT:    tileloadd 896(%rsp,%rbp), %tmm1 # 1024-byte Folded Reload
; CHECK-NEXT:    tdpbf16ps %tmm0, %tmm0, %tmm1
; CHECK-NEXT:    tdpbf16ps %tmm0, %tmm0, %tmm0
; CHECK-NEXT:    addq $2928, %rsp # imm = 0xB70
; CHECK-NEXT:    popq %rbp
; CHECK-NEXT:    tilerelease
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = call x86_amx @llvm.x86.tilezero.internal(i16 16, i16 64)
  %2 = call x86_amx @llvm.x86.tdpbf16ps.internal(i16 16, i16 64, i16 64, x86_amx %1, x86_amx %1, x86_amx %1)
  %3 = call x86_amx @llvm.x86.tdpbf16ps.internal(i16 16, i16 64, i16 64, x86_amx %1, x86_amx %1, x86_amx %1)
  ret void
}

declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>)
declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx)