; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+amx-bf16,+amx-fp16,+amx-complex,+amx-transpose | FileCheck %s
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+amx-bf16,+amx-fp16,+amx-complex,+amx-transpose,+egpr --show-mc-encoding | FileCheck %s --check-prefix=EGPR

; Check that each AMX-TRANSPOSE intrinsic with immediate tile numbers lowers
; to the expected instruction and, with +egpr, to the expected encoding.
define void @test_amx(i32 %rv32, i64 %stride, i64 %rvalue, i8* %addr1, <4 x float> %xmm) #0 {
; CHECK-LABEL: test_amx:
; CHECK:       # %bb.0:
; CHECK-NEXT:    t2rpntlvwz0 (%rcx,%rsi), %tmm0
; CHECK-NEXT:    t2rpntlvwz0t1 (%rcx,%rsi), %tmm2
; CHECK-NEXT:    t2rpntlvwz1 (%rcx,%rsi), %tmm0
; CHECK-NEXT:    t2rpntlvwz1t1 (%rcx,%rsi), %tmm2
; CHECK-NEXT:    ttransposed %tmm3, %tmm1
; CHECK-NEXT:    ttdpbf16ps %tmm3, %tmm2, %tmm1
; CHECK-NEXT:    ttdpfp16ps %tmm6, %tmm5, %tmm4
; CHECK-NEXT:    ttcmmimfp16ps %tmm3, %tmm2, %tmm1
; CHECK-NEXT:    ttcmmrlfp16ps %tmm3, %tmm2, %tmm1
; CHECK-NEXT:    tconjtcmmimfp16ps %tmm3, %tmm2, %tmm1
; CHECK-NEXT:    tconjtfp16 %tmm2, %tmm1
; CHECK-NEXT:    retq
;
; EGPR-LABEL: test_amx:
; EGPR:       # %bb.0:
; EGPR-NEXT:    t2rpntlvwz0 (%rcx,%rsi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6e,0x04,0x31]
; EGPR-NEXT:    t2rpntlvwz0t1 (%rcx,%rsi), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6f,0x14,0x31]
; EGPR-NEXT:    t2rpntlvwz1 (%rcx,%rsi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6e,0x04,0x31]
; EGPR-NEXT:    t2rpntlvwz1t1 (%rcx,%rsi), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6f,0x14,0x31]
; EGPR-NEXT:    ttransposed %tmm3, %tmm1 # encoding: [0xc4,0xe2,0x7a,0x5f,0xcb]
; EGPR-NEXT:    ttdpbf16ps %tmm3, %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x62,0x6c,0xca]
; EGPR-NEXT:    ttdpfp16ps %tmm6, %tmm5, %tmm4 # encoding: [0xc4,0xe2,0x4b,0x6c,0xe5]
; EGPR-NEXT:    ttcmmimfp16ps %tmm3, %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x63,0x6b,0xca]
; EGPR-NEXT:    ttcmmrlfp16ps %tmm3, %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x62,0x6b,0xca]
; EGPR-NEXT:    tconjtcmmimfp16ps %tmm3, %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x60,0x6b,0xca]
; EGPR-NEXT:    tconjtfp16 %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x79,0x6b,0xca]
; EGPR-NEXT:    retq # encoding: [0xc3]
  call void @llvm.x86.t2rpntlvwz0(i8 1, i8* %addr1, i64 %stride)
  call void @llvm.x86.t2rpntlvwz0t1(i8 2, i8* %addr1, i64 %stride)
  call void @llvm.x86.t2rpntlvwz1(i8 1, i8* %addr1, i64 %stride)
  call void @llvm.x86.t2rpntlvwz1t1(i8 2, i8* %addr1, i64 %stride)
  call void @llvm.x86.ttransposed(i8 1, i8 3)
  call void @llvm.x86.ttdpbf16ps(i8 1, i8 2, i8 3)
  call void @llvm.x86.ttdpfp16ps(i8 4, i8 5, i8 6)
  call void @llvm.x86.ttcmmimfp16ps(i8 1, i8 2, i8 3)
  call void @llvm.x86.ttcmmrlfp16ps(i8 1, i8 2, i8 3)
  call void @llvm.x86.tconjtcmmimfp16ps(i8 1, i8 2, i8 3)
  call void @llvm.x86.tconjtfp16(i8 1, i8 2)
  ret void
}

declare void @llvm.x86.t2rpntlvwz0(i8 %tile1, i8* %addr1, i64 %stride)
declare void @llvm.x86.t2rpntlvwz0t1(i8 %tile1, i8* %addr1, i64 %stride)
declare void @llvm.x86.t2rpntlvwz1(i8 %tile1, i8* %addr1, i64 %stride)
declare void @llvm.x86.t2rpntlvwz1t1(i8 %tile1, i8* %addr1, i64 %stride)
declare void @llvm.x86.ttransposed(i8 %tile0, i8 %tile1)
declare void @llvm.x86.ttdpbf16ps(i8 %tile0, i8 %tile1, i8 %tile2)
declare void @llvm.x86.ttdpfp16ps(i8 %tile0, i8 %tile1, i8 %tile2)
declare void @llvm.x86.ttcmmimfp16ps(i8 %A, i8 %B, i8 %C)
declare void @llvm.x86.ttcmmrlfp16ps(i8 %A, i8 %B, i8 %C)
declare void @llvm.x86.tconjtcmmimfp16ps(i8 %A, i8 %B, i8 %C)
declare void @llvm.x86.tconjtfp16(i8 %A, i8 %B)

; Check lowering of the .internal intrinsics: tiles are configured via
; ldtilecfg, and one tile is spilled and reloaded around the conjugate ops.
define void @test_amx2(i8* %pointer, i8* %base, i64 %stride) #0 {
; CHECK-LABEL: test_amx2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    subq $2928, %rsp # imm = 0xB70
; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    vmovups %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $1, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $8, %ax
; CHECK-NEXT:    tileloadd (%rsi,%rdx), %tmm0
; CHECK-NEXT:    tilezero %tmm1
; CHECK-NEXT:    tilezero %tmm2
; CHECK-NEXT:    ttdpbf16ps %tmm1, %tmm0, %tmm2
; CHECK-NEXT:    ttdpfp16ps %tmm1, %tmm0, %tmm2
; CHECK-NEXT:    ttcmmimfp16ps %tmm1, %tmm0, %tmm2
; CHECK-NEXT:    ttcmmrlfp16ps %tmm1, %tmm0, %tmm2
; CHECK-NEXT:    movabsq $64, %rbp
; CHECK-NEXT:    tilestored %tmm2, 896(%rsp,%rbp) # 1024-byte Folded Spill
; CHECK-NEXT:    tileloadd 896(%rsp,%rbp), %tmm3 # 1024-byte Folded Reload
; CHECK-NEXT:    tconjtcmmimfp16ps %tmm1, %tmm0, %tmm3
; CHECK-NEXT:    tconjtfp16 %tmm3, %tmm0
; CHECK-NEXT:    tilestored %tmm2, (%rdi,%rdx)
; CHECK-NEXT:    addq $2928, %rsp # imm = 0xB70
; CHECK-NEXT:    popq %rbp
; CHECK-NEXT:    tilerelease
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
;
; EGPR-LABEL: test_amx2:
; EGPR:       # %bb.0:
; EGPR-NEXT:    pushq %rbp # encoding: [0x55]
; EGPR-NEXT:    subq $2928, %rsp # encoding: [0x48,0x81,0xec,0x70,0x0b,0x00,0x00]
; EGPR-NEXT:    # imm = 0xB70
; EGPR-NEXT:    vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0]
; EGPR-NEXT:    vmovups %zmm0, {{[0-9]+}}(%rsp) # encoding: [0x62,0xf1,0x7c,0x48,0x11,0x44,0x24,0x0d]
; EGPR-NEXT:    movb $1, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x40,0x03,0x00,0x00,0x01]
; EGPR-NEXT:    movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x70,0x03,0x00,0x00,0x08]
; EGPR-NEXT:    movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x84,0x24,0x50,0x03,0x00,0x00,0x08,0x00]
; EGPR-NEXT:    movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x71,0x03,0x00,0x00,0x08]
; EGPR-NEXT:    movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x84,0x24,0x52,0x03,0x00,0x00,0x08,0x00]
; EGPR-NEXT:    movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x72,0x03,0x00,0x00,0x08]
; EGPR-NEXT:    movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x84,0x24,0x54,0x03,0x00,0x00,0x08,0x00]
; EGPR-NEXT:    movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x73,0x03,0x00,0x00,0x08]
; EGPR-NEXT:    movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x84,0x24,0x56,0x03,0x00,0x00,0x08,0x00]
; EGPR-NEXT:    ldtilecfg {{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x84,0x24,0x40,0x03,0x00,0x00]
; EGPR-NEXT:    movw $8, %ax # encoding: [0x66,0xb8,0x08,0x00]
; EGPR-NEXT:    tileloadd (%rsi,%rdx), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4b,0x04,0x16]
; EGPR-NEXT:    tilezero %tmm1 # encoding: [0xc4,0xe2,0x7b,0x49,0xc8]
; EGPR-NEXT:    tilezero %tmm2 # encoding: [0xc4,0xe2,0x7b,0x49,0xd0]
; EGPR-NEXT:    ttdpbf16ps %tmm1, %tmm0, %tmm2 # encoding: [0xc4,0xe2,0x72,0x6c,0xd0]
; EGPR-NEXT:    ttdpfp16ps %tmm1, %tmm0, %tmm2 # encoding: [0xc4,0xe2,0x73,0x6c,0xd0]
; EGPR-NEXT:    ttcmmimfp16ps %tmm1, %tmm0, %tmm2 # encoding: [0xc4,0xe2,0x73,0x6b,0xd0]
; EGPR-NEXT:    ttcmmrlfp16ps %tmm1, %tmm0, %tmm2 # encoding: [0xc4,0xe2,0x72,0x6b,0xd0]
; EGPR-NEXT:    movabsq $64, %rbp # encoding: [0x48,0xbd,0x40,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
; EGPR-NEXT:    tilestored %tmm2, 896(%rsp,%rbp) # 1024-byte Folded Spill
; EGPR-NEXT:    # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x94,0x2c,0x80,0x03,0x00,0x00]
; EGPR-NEXT:    tileloadd 896(%rsp,%rbp), %tmm3 # 1024-byte Folded Reload
; EGPR-NEXT:    # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4b,0x9c,0x2c,0x80,0x03,0x00,0x00]
; EGPR-NEXT:    tconjtcmmimfp16ps %tmm1, %tmm0, %tmm3 # encoding: [0xc4,0xe2,0x70,0x6b,0xd8]
; EGPR-NEXT:    tconjtfp16 %tmm3, %tmm0 # encoding: [0xc4,0xe2,0x79,0x6b,0xc3]
; EGPR-NEXT:    tilestored %tmm2, (%rdi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x14,0x17]
; EGPR-NEXT:    addq $2928, %rsp # encoding: [0x48,0x81,0xc4,0x70,0x0b,0x00,0x00]
; EGPR-NEXT:    # imm = 0xB70
; EGPR-NEXT:    popq %rbp # encoding: [0x5d]
; EGPR-NEXT:    tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0]
; EGPR-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; EGPR-NEXT:    retq # encoding: [0xc3]

  %a = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %base, i64 %stride)
  %b = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8)
  %c = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8)
  %c1 = call x86_amx @llvm.x86.ttdpbf16ps.internal(i16 8, i16 8, i16 8, x86_amx %c, x86_amx %a, x86_amx %b)
  %c2 = call x86_amx @llvm.x86.ttdpfp16ps.internal(i16 8, i16 8, i16 8, x86_amx %c1, x86_amx %a, x86_amx %b)
  %c3 = call x86_amx @llvm.x86.ttcmmimfp16ps.internal(i16 8, i16 8, i16 8, x86_amx %c2, x86_amx %a, x86_amx %b)
  %c4 = call x86_amx @llvm.x86.ttcmmrlfp16ps.internal(i16 8, i16 8, i16 8, x86_amx %c3, x86_amx %a, x86_amx %b)
  %c5 = call x86_amx @llvm.x86.tconjtcmmimfp16ps.internal(i16 8, i16 8, i16 8, x86_amx %c4, x86_amx %a, x86_amx %b)
  %c6 = call x86_amx @llvm.x86.tconjtfp16.internal(i16 8, i16 8, x86_amx %c5)

  call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %c4)
  ret void
}

; Check the t2rpntlvwz* pair-load intrinsics together with ttransposed.
define void @test_amx3(i8* %pointer, i8* %base, i64 %stride) #0 {
; CHECK-LABEL: test_amx3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $8, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $8, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $8, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $8, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    movw $8, %cx
; CHECK-NEXT:    t2rpntlvwz0 (%rsi,%rdx), %tmm4
; CHECK-NEXT:    t2rpntlvwz0t1 (%rsi,%rdx), %tmm4
; CHECK-NEXT:    t2rpntlvwz1 (%rsi,%rdx), %tmm4
; CHECK-NEXT:    t2rpntlvwz1t1 (%rsi,%rdx), %tmm4
; CHECK-NEXT:    ttransposed %tmm4, %tmm0
; CHECK-NEXT:    tilestored %tmm0, (%rdi,%rdx)
; CHECK-NEXT:    tilerelease
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
;
; EGPR-LABEL: test_amx3:
; EGPR:       # %bb.0:
; EGPR-NEXT:    vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0]
; EGPR-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp) # encoding: [0x62,0xf1,0x7c,0x48,0x11,0x44,0x24,0xff]
; EGPR-NEXT:    movb $1, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xc0,0x01]
; EGPR-NEXT:    movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf0,0x08]
; EGPR-NEXT:    movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xd0,0x08,0x00]
; EGPR-NEXT:    movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf4,0x08]
; EGPR-NEXT:    movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xd8,0x08,0x00]
; EGPR-NEXT:    movb $0, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf5,0x00]
; EGPR-NEXT:    movw $0, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xda,0x00,0x00]
; EGPR-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x44,0x24,0xc0]
; EGPR-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
; EGPR-NEXT:    movw $8, %cx # encoding: [0x66,0xb9,0x08,0x00]
; EGPR-NEXT:    t2rpntlvwz0 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6e,0x24,0x16]
; EGPR-NEXT:    t2rpntlvwz0t1 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6f,0x24,0x16]
; EGPR-NEXT:    t2rpntlvwz1 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6e,0x24,0x16]
; EGPR-NEXT:    t2rpntlvwz1t1 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6f,0x24,0x16]
; EGPR-NEXT:    ttransposed %tmm4, %tmm0 # encoding: [0xc4,0xe2,0x7a,0x5f,0xc4]
; EGPR-NEXT:    tilestored %tmm0, (%rdi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x04,0x17]
; EGPR-NEXT:    tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0]
; EGPR-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; EGPR-NEXT:    retq # encoding: [0xc3]
  %1 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride)
  %2 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0t1.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride)
  %3 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride)
  %4 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1t1.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride)
  %5 = extractvalue { x86_amx, x86_amx } %4, 0
  %6 = call x86_amx @llvm.x86.ttransposed.internal(i16 8, i16 8, x86_amx %5)
  call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %6)
  ret void
}

; Check that live tile pairs are spilled and reloaded around a clobbering
; pair load.
define void @test_amx_spill(i8* %pointer, i8* %base, i64 %stride) #0 {
; CHECK-LABEL: test_amx_spill:
; CHECK:       # %bb.0:
; CHECK-NEXT:    subq $6088, %rsp # imm = 0x17C8
; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $8, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $8, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $8, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $8, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $8, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $8, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $8, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $8, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $8, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $8, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $8, %ax
; CHECK-NEXT:    tileloadd (%rsi,%rdx), %tmm0
; CHECK-NEXT:    t2rpntlvwz0 (%rsi,%rdx), %tmm4
; CHECK-NEXT:    t2rpntlvwz0t1 (%rsi,%rdx), %tmm6
; CHECK-NEXT:    movabsq $64, %rcx
; CHECK-NEXT:    tilestored %tmm6, 4032(%rsp,%rcx) # 1024-byte Folded Spill
; CHECK-NEXT:    tilestored %tmm7, 5056(%rsp,%rcx) # 1024-byte Folded Spill
; CHECK-NEXT:    t2rpntlvwz1 (%rsi,%rdx), %tmm6
; CHECK-NEXT:    tilestored %tmm6, 1984(%rsp,%rcx) # 1024-byte Folded Spill
; CHECK-NEXT:    tilestored %tmm7, 3008(%rsp,%rcx) # 1024-byte Folded Spill
; CHECK-NEXT:    t2rpntlvwz1t1 (%rsi,%rdx), %tmm6
; CHECK-NEXT:    tilestored %tmm6, -64(%rsp,%rcx) # 1024-byte Folded Spill
; CHECK-NEXT:    tilestored %tmm7, 960(%rsp,%rcx) # 1024-byte Folded Spill
; CHECK-NEXT:    t2rpntlvwz0 (%rsi,%rdx), %tmm6
; CHECK-NEXT:    tilestored %tmm4, (%rsi,%rdx)
; CHECK-NEXT:    tilestored %tmm5, (%rsi,%rdx)
; CHECK-NEXT:    tileloadd 4032(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload
; CHECK-NEXT:    tileloadd 5056(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload
; CHECK-NEXT:    tilestored %tmm4, (%rsi,%rdx)
; CHECK-NEXT:    tilestored %tmm5, (%rsi,%rdx)
; CHECK-NEXT:    tileloadd 1984(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload
; CHECK-NEXT:    tileloadd 3008(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload
; CHECK-NEXT:    tilestored %tmm4, (%rsi,%rdx)
; CHECK-NEXT:    tilestored %tmm5, (%rsi,%rdx)
; CHECK-NEXT:    tileloadd -64(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload
; CHECK-NEXT:    tileloadd 960(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload
; CHECK-NEXT:    tilestored %tmm4, (%rsi,%rdx)
; CHECK-NEXT:    tilestored %tmm5, (%rsi,%rdx)
; CHECK-NEXT:    tilestored %tmm6, (%rsi,%rdx)
; CHECK-NEXT:    tilestored %tmm7, (%rsi,%rdx)
; CHECK-NEXT:    addq $6088, %rsp # imm = 0x17C8
; CHECK-NEXT:    tilerelease
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
;
; EGPR-LABEL: test_amx_spill:
; EGPR:       # %bb.0:
; EGPR-NEXT:    subq $6088, %rsp # encoding: [0x48,0x81,0xec,0xc8,0x17,0x00,0x00]
; EGPR-NEXT:    # imm = 0x17C8
; EGPR-NEXT:    vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0]
; EGPR-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp) # encoding: [0x62,0xf1,0x7c,0x48,0x11,0x44,0x24,0xfe]
; EGPR-NEXT:    movb $1, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0x80,0x01]
; EGPR-NEXT:    movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb0,0x08]
; EGPR-NEXT:    movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x90,0x08,0x00]
; EGPR-NEXT:    movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb4,0x08]
; EGPR-NEXT:    movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x98,0x08,0x00]
; EGPR-NEXT:    movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb5,0x08]
; EGPR-NEXT:    movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x9a,0x08,0x00]
; EGPR-NEXT:    movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb6,0x08]
; EGPR-NEXT:    movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x9c,0x08,0x00]
; EGPR-NEXT:    movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb7,0x08]
; EGPR-NEXT:    movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x9e,0x08,0x00]
; EGPR-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x44,0x24,0x80]
; EGPR-NEXT:    movw $8, %ax # encoding: [0x66,0xb8,0x08,0x00]
; EGPR-NEXT:    tileloadd (%rsi,%rdx), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4b,0x04,0x16]
; EGPR-NEXT:    t2rpntlvwz0 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6e,0x24,0x16]
; EGPR-NEXT:    t2rpntlvwz0t1 (%rsi,%rdx), %tmm6 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6f,0x34,0x16]
; EGPR-NEXT:    movabsq $64, %rcx # encoding: [0x48,0xb9,0x40,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
; EGPR-NEXT:    tilestored %tmm6, 4032(%rsp,%rcx) # 1024-byte Folded Spill
; EGPR-NEXT:    # encoding: [0xc4,0xe2,0x7a,0x4b,0xb4,0x0c,0xc0,0x0f,0x00,0x00]
; EGPR-NEXT:    tilestored %tmm7, 5056(%rsp,%rcx) # 1024-byte Folded Spill
; EGPR-NEXT:    # encoding: [0xc4,0xe2,0x7a,0x4b,0xbc,0x0c,0xc0,0x13,0x00,0x00]
; EGPR-NEXT:    t2rpntlvwz1 (%rsi,%rdx), %tmm6 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6e,0x34,0x16]
; EGPR-NEXT:    tilestored %tmm6, 1984(%rsp,%rcx) # 1024-byte Folded Spill
; EGPR-NEXT:    # encoding: [0xc4,0xe2,0x7a,0x4b,0xb4,0x0c,0xc0,0x07,0x00,0x00]
; EGPR-NEXT:    tilestored %tmm7, 3008(%rsp,%rcx) # 1024-byte Folded Spill
; EGPR-NEXT:    # encoding: [0xc4,0xe2,0x7a,0x4b,0xbc,0x0c,0xc0,0x0b,0x00,0x00]
; EGPR-NEXT:    t2rpntlvwz1t1 (%rsi,%rdx), %tmm6 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6f,0x34,0x16]
; EGPR-NEXT:    tilestored %tmm6, -64(%rsp,%rcx) # 1024-byte Folded Spill
; EGPR-NEXT:    # encoding: [0xc4,0xe2,0x7a,0x4b,0x74,0x0c,0xc0]
; EGPR-NEXT:    tilestored %tmm7, 960(%rsp,%rcx) # 1024-byte Folded Spill
; EGPR-NEXT:    # encoding: [0xc4,0xe2,0x7a,0x4b,0xbc,0x0c,0xc0,0x03,0x00,0x00]
; EGPR-NEXT:    t2rpntlvwz0 (%rsi,%rdx), %tmm6 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6e,0x34,0x16]
; EGPR-NEXT:    tilestored %tmm4, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x24,0x16]
; EGPR-NEXT:    tilestored %tmm5, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x2c,0x16]
; EGPR-NEXT:    tileloadd 4032(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload
; EGPR-NEXT:    # encoding: [0xc4,0xe2,0x7b,0x4b,0xa4,0x0c,0xc0,0x0f,0x00,0x00]
; EGPR-NEXT:    tileloadd 5056(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload
; EGPR-NEXT:    # encoding: [0xc4,0xe2,0x7b,0x4b,0xac,0x0c,0xc0,0x13,0x00,0x00]
; EGPR-NEXT:    tilestored %tmm4, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x24,0x16]
; EGPR-NEXT:    tilestored %tmm5, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x2c,0x16]
; EGPR-NEXT:    tileloadd 1984(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload
; EGPR-NEXT:    # encoding: [0xc4,0xe2,0x7b,0x4b,0xa4,0x0c,0xc0,0x07,0x00,0x00]
; EGPR-NEXT:    tileloadd 3008(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload
; EGPR-NEXT:    # encoding: [0xc4,0xe2,0x7b,0x4b,0xac,0x0c,0xc0,0x0b,0x00,0x00]
; EGPR-NEXT:    tilestored %tmm4, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x24,0x16]
; EGPR-NEXT:    tilestored %tmm5, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x2c,0x16]
; EGPR-NEXT:    tileloadd -64(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload
; EGPR-NEXT:    # encoding: [0xc4,0xe2,0x7b,0x4b,0x64,0x0c,0xc0]
; EGPR-NEXT:    tileloadd 960(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload
; EGPR-NEXT:    # encoding: [0xc4,0xe2,0x7b,0x4b,0xac,0x0c,0xc0,0x03,0x00,0x00]
; EGPR-NEXT:    tilestored %tmm4, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x24,0x16]
; EGPR-NEXT:    tilestored %tmm5, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x2c,0x16]
; EGPR-NEXT:    tilestored %tmm6, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x34,0x16]
; EGPR-NEXT:    tilestored %tmm7, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x3c,0x16]
; EGPR-NEXT:    addq $6088, %rsp # encoding: [0x48,0x81,0xc4,0xc8,0x17,0x00,0x00]
; EGPR-NEXT:    # imm = 0x17C8
; EGPR-NEXT:    tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0]
; EGPR-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; EGPR-NEXT:    retq # encoding: [0xc3]
  %a = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %base, i64 %stride)
  %b1 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride)
  %b2 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0t1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride)
  %b3 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride)
  %b4 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1t1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride)
  %b5 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride)
  %e11 = extractvalue { x86_amx, x86_amx } %b1, 0
  %e12 = extractvalue { x86_amx, x86_amx } %b1, 1
  %e21 = extractvalue { x86_amx, x86_amx } %b2, 0
  %e22 = extractvalue { x86_amx, x86_amx } %b2, 1
  %e31 = extractvalue { x86_amx, x86_amx } %b3, 0
  %e32 = extractvalue { x86_amx, x86_amx } %b3, 1
  %e41 = extractvalue { x86_amx, x86_amx } %b4, 0
  %e42 = extractvalue { x86_amx, x86_amx } %b4, 1
  %e51 = extractvalue { x86_amx, x86_amx } %b5, 0
  %e52 = extractvalue { x86_amx, x86_amx } %b5, 1
  call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e11)
  call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e12)
  call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e21)
  call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e22)
  call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e31)
  call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e32)
  call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e41)
  call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e42)
  call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e51)
  call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e52)
  ret void
}

declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)
declare x86_amx @llvm.x86.tilezero.internal(i16, i16)
declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, i8*, i64)
declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0t1.internal(i16, i16, i16, i8*, i64)
declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1.internal(i16, i16, i16, i8*, i64)
declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1t1.internal(i16, i16, i16, i8*, i64)
declare x86_amx @llvm.x86.ttransposed.internal(i16, i16, x86_amx)
declare x86_amx @llvm.x86.ttdpbf16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare x86_amx @llvm.x86.ttdpfp16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare x86_amx @llvm.x86.ttcmmimfp16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare x86_amx @llvm.x86.ttcmmrlfp16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare x86_amx @llvm.x86.tconjtcmmimfp16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare x86_amx @llvm.x86.tconjtfp16.internal(i16, i16, x86_amx)

attributes #0 = { nounwind }