; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx10.2-512, \
; RUN: -mattr=+amx-avx512 -verify-machineinstrs | FileCheck %s

; Check that the AMX-AVX512 row-to-vector intrinsics (tcvtrowd2ps, tcvtrowps2bf16h/l,
; tcvtrowps2phh/l and tilemovrow) are lowered to the corresponding instructions for
; both a register row index and an immediate row index.
define void @test_amx(i8* %pointer, i8* %base, i32 %index, i64 %stride) {
; CHECK-LABEL: test_amx:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $8, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $8, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $8, %ax
; CHECK-NEXT:    tileloadd (%rsi,%rcx), %tmm0
; CHECK-NEXT:    tcvtrowd2ps %edx, %tmm0, %zmm0
; CHECK-NEXT:    tcvtrowd2ps $16, %tmm0, %zmm0
; CHECK-NEXT:    tcvtrowps2bf16h %edx, %tmm0, %zmm0
; CHECK-NEXT:    tcvtrowps2bf16h $16, %tmm0, %zmm0
; CHECK-NEXT:    tcvtrowps2bf16l %edx, %tmm0, %zmm0
; CHECK-NEXT:    tcvtrowps2bf16l $16, %tmm0, %zmm0
; CHECK-NEXT:    tcvtrowps2phh %edx, %tmm0, %zmm0
; CHECK-NEXT:    tcvtrowps2phh $16, %tmm0, %zmm0
; CHECK-NEXT:    tcvtrowps2phl %edx, %tmm0, %zmm0
; CHECK-NEXT:    tcvtrowps2phl $16, %tmm0, %zmm0
; CHECK-NEXT:    tilemovrow %edx, %tmm0, %zmm0
; CHECK-NEXT:    tilemovrow $16, %tmm0, %zmm0
; CHECK-NEXT:    tilestored %tmm0, (%rdi,%rcx)
; CHECK-NEXT:    tilerelease
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq

  %a = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %base, i64 %stride)
  call <16 x float> @llvm.x86.tcvtrowd2ps.internal(i16 8, i16 8, x86_amx %a, i32 %index)
  call <16 x float> @llvm.x86.tcvtrowd2ps.internal(i16 8, i16 8, x86_amx %a, i32 16)
  call <32 x bfloat> @llvm.x86.tcvtrowps2bf16h.internal(i16 8, i16 8, x86_amx %a, i32 %index)
  call <32 x bfloat> @llvm.x86.tcvtrowps2bf16h.internal(i16 8, i16 8, x86_amx %a, i32 16)
  call <32 x bfloat> @llvm.x86.tcvtrowps2bf16l.internal(i16 8, i16 8, x86_amx %a, i32 %index)
  call <32 x bfloat> @llvm.x86.tcvtrowps2bf16l.internal(i16 8, i16 8, x86_amx %a, i32 16)
  call <32 x half> @llvm.x86.tcvtrowps2phh.internal(i16 8, i16 8, x86_amx %a, i32 %index)
  call <32 x half> @llvm.x86.tcvtrowps2phh.internal(i16 8, i16 8, x86_amx %a, i32 16)
  call <32 x half> @llvm.x86.tcvtrowps2phl.internal(i16 8, i16 8, x86_amx %a, i32 %index)
  call <32 x half> @llvm.x86.tcvtrowps2phl.internal(i16 8, i16 8, x86_amx %a, i32 16)
  call <16 x i32> @llvm.x86.tilemovrow.internal(i16 8, i16 8, x86_amx %a, i32 %index)
  call <16 x i32> @llvm.x86.tilemovrow.internal(i16 8, i16 8, x86_amx %a, i32 16)

  call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %a)
  ret void
}

declare x86_amx @llvm.x86.tilezero.internal(i16, i16)
declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
declare x86_amx @llvm.x86.tileloaddt164.internal(i16, i16, i8*, i64)
declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)

declare <16 x float> @llvm.x86.tcvtrowd2ps.internal(i16, i16, x86_amx, i32)
declare <32 x bfloat> @llvm.x86.tcvtrowps2bf16h.internal(i16, i16, x86_amx, i32)
declare <32 x bfloat> @llvm.x86.tcvtrowps2bf16l.internal(i16, i16, x86_amx, i32)
declare <32 x half> @llvm.x86.tcvtrowps2phh.internal(i16, i16, x86_amx, i32)
declare <32 x half> @llvm.x86.tcvtrowps2phl.internal(i16, i16, x86_amx, i32)
declare <16 x i32> @llvm.x86.tilemovrow.internal(i16, i16, x86_amx, i32)