; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8,+avx512f -verify-machineinstrs | FileCheck %s --check-prefix=AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8,+avx2 -verify-machineinstrs | FileCheck %s --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8,+avx -verify-machineinstrs | FileCheck %s --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -verify-machineinstrs | FileCheck %s --check-prefix=SSE2

; Purpose of this test
; --------------------
; The same function is compiled four times with +amx-int8 plus a different
; vector feature level (+avx512f, +avx2, +avx, or nothing extra). The
; assertions below record how the stack area that is later handed to
; ldtilecfg gets zero-filled in each configuration:
;   * +avx512f       -> one vmovups of zmm1
;   * +avx2          -> two vmovups of ymm1
;   * +avx           -> four vmovups of xmm1
;   * no extra attr  -> xorps + four movups of xmm1
; After the zero fill, every configuration stores the constant 1 and the
; values derived from the i16 arguments (%dx, %si, %al) into that stack area
; and then executes ldtilecfg on it.
; (presumably the constant 1 is the palette id and the other stores are the
; per-tile shape fields -- TODO confirm against the tile-config layout.)
;
; Further things the assertions pin down:
;   * The zero fill uses xmm1/ymm1/zmm1, never register 0, which carries the
;     <4 x i32> argument that the function returns unchanged.
;   * vzeroupper is expected before retq only in the +avx512f and +avx2
;     outputs, i.e. the two configurations that touch ymm/zmm registers.
;   * The branch only selects the base address (buf or buf2); the three
;     tileloadd instructions appear once, after the join at .LBB0_3.
;   * Stack offsets are matched with a regex, so only the instruction
;     sequence, not the frame layout, is checked.
;
; The assertion blocks are machine generated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.

; Two 1024-byte, 64-byte-aligned buffers used as tile load sources; @buf is
; also the destination of the final tile store.
@buf = dso_local global [1024 x i8] zeroinitializer, align 64
@buf2 = dso_local global [1024 x i8] zeroinitializer, align 64

; Function Attrs: nounwind uwtable
; NOTE(review): the definition below carries no attribute group, so the
; "Function Attrs" line above looks like a leftover from the original
; compiler output -- confirm before relying on it.
define <4 x i32> @test_api(i32 %0, i16 signext %1, i16 signext %2, <4 x i32> %xmm0) {
; AVX512-LABEL: test_api:
; AVX512: # %bb.0:
; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
; AVX512-NEXT: movb $1, -{{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %dx, -{{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %dx, -{{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %si, -{{[0-9]+}}(%rsp)
; AVX512-NEXT: testl %edi, %edi
; AVX512-NEXT: movsbl %sil, %eax
; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
; AVX512-NEXT: ldtilecfg -{{[0-9]+}}(%rsp)
; AVX512-NEXT: je .LBB0_2
; AVX512-NEXT: # %bb.1:
; AVX512-NEXT: movl $buf, %ecx
; AVX512-NEXT: jmp .LBB0_3
; AVX512-NEXT: .LBB0_2:
; AVX512-NEXT: movl $buf2, %ecx
; AVX512-NEXT: .LBB0_3:
; AVX512-NEXT: movl $32, %edi
; AVX512-NEXT: tileloadd (%rcx,%rdi), %tmm0
; AVX512-NEXT: tileloadd (%rcx,%rdi), %tmm2
; AVX512-NEXT: tileloadd (%rcx,%rdi), %tmm1
; AVX512-NEXT: tdpbssd %tmm2, %tmm0, %tmm1
; AVX512-NEXT: movl $buf, %ecx
; AVX512-NEXT: movl $32, %esi
; AVX512-NEXT: tilestored %tmm1, (%rcx,%rsi)
; AVX512-NEXT: tilerelease
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX2-LABEL: test_api:
; AVX2: # %bb.0:
; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
; AVX2-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
; AVX2-NEXT: movb $1, -{{[0-9]+}}(%rsp)
; AVX2-NEXT: movw %dx, -{{[0-9]+}}(%rsp)
; AVX2-NEXT: movw %dx, -{{[0-9]+}}(%rsp)
; AVX2-NEXT: movw %si, -{{[0-9]+}}(%rsp)
; AVX2-NEXT: testl %edi, %edi
; AVX2-NEXT: movsbl %sil, %eax
; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
; AVX2-NEXT: ldtilecfg -{{[0-9]+}}(%rsp)
; AVX2-NEXT: je .LBB0_2
; AVX2-NEXT: # %bb.1:
; AVX2-NEXT: movl $buf, %ecx
; AVX2-NEXT: jmp .LBB0_3
; AVX2-NEXT: .LBB0_2:
; AVX2-NEXT: movl $buf2, %ecx
; AVX2-NEXT: .LBB0_3:
; AVX2-NEXT: movl $32, %edi
; AVX2-NEXT: tileloadd (%rcx,%rdi), %tmm0
; AVX2-NEXT: tileloadd (%rcx,%rdi), %tmm2
; AVX2-NEXT: tileloadd (%rcx,%rdi), %tmm1
; AVX2-NEXT: tdpbssd %tmm2, %tmm0, %tmm1
; AVX2-NEXT: movl $buf, %ecx
; AVX2-NEXT: movl $32, %esi
; AVX2-NEXT: tilestored %tmm1, (%rcx,%rsi)
; AVX2-NEXT: tilerelease
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX1-LABEL: test_api:
; AVX1: # %bb.0:
; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vmovups %xmm1, -{{[0-9]+}}(%rsp)
; AVX1-NEXT: vmovups %xmm1, -{{[0-9]+}}(%rsp)
; AVX1-NEXT: vmovups %xmm1, -{{[0-9]+}}(%rsp)
; AVX1-NEXT: vmovups %xmm1, -{{[0-9]+}}(%rsp)
; AVX1-NEXT: movb $1, -{{[0-9]+}}(%rsp)
; AVX1-NEXT: movw %dx, -{{[0-9]+}}(%rsp)
; AVX1-NEXT: movw %dx, -{{[0-9]+}}(%rsp)
; AVX1-NEXT: movw %si, -{{[0-9]+}}(%rsp)
; AVX1-NEXT: testl %edi, %edi
; AVX1-NEXT: movsbl %sil, %eax
; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
; AVX1-NEXT: ldtilecfg -{{[0-9]+}}(%rsp)
; AVX1-NEXT: je .LBB0_2
; AVX1-NEXT: # %bb.1:
; AVX1-NEXT: movl $buf, %ecx
; AVX1-NEXT: jmp .LBB0_3
; AVX1-NEXT: .LBB0_2:
; AVX1-NEXT: movl $buf2, %ecx
; AVX1-NEXT: .LBB0_3:
; AVX1-NEXT: movl $32, %edi
; AVX1-NEXT: tileloadd (%rcx,%rdi), %tmm0
; AVX1-NEXT: tileloadd (%rcx,%rdi), %tmm2
; AVX1-NEXT: tileloadd (%rcx,%rdi), %tmm1
; AVX1-NEXT: tdpbssd %tmm2, %tmm0, %tmm1
; AVX1-NEXT: movl $buf, %ecx
; AVX1-NEXT: movl $32, %esi
; AVX1-NEXT: tilestored %tmm1, (%rcx,%rsi)
; AVX1-NEXT: tilerelease
; AVX1-NEXT: retq
;
; SSE2-LABEL: test_api:
; SSE2: # %bb.0:
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: movups %xmm1, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movups %xmm1, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movups %xmm1, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movups %xmm1, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movb $1, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movw %dx, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movw %dx, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movw %si, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: testl %edi, %edi
; SSE2-NEXT: movsbl %sil, %eax
; SSE2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: ldtilecfg -{{[0-9]+}}(%rsp)
; SSE2-NEXT: je .LBB0_2
; SSE2-NEXT: # %bb.1:
; SSE2-NEXT: movl $buf, %ecx
; SSE2-NEXT: jmp .LBB0_3
; SSE2-NEXT: .LBB0_2:
; SSE2-NEXT: movl $buf2, %ecx
; SSE2-NEXT: .LBB0_3:
; SSE2-NEXT: movl $32, %edi
; SSE2-NEXT: tileloadd (%rcx,%rdi), %tmm0
; SSE2-NEXT: tileloadd (%rcx,%rdi), %tmm2
; SSE2-NEXT: tileloadd (%rcx,%rdi), %tmm1
; SSE2-NEXT: tdpbssd %tmm2, %tmm0, %tmm1
; SSE2-NEXT: movl $buf, %ecx
; SSE2-NEXT: movl $32, %esi
; SSE2-NEXT: tilestored %tmm1, (%rcx,%rsi)
; SSE2-NEXT: tilerelease
; SSE2-NEXT: retq
  ; %4 is true when the first argument is zero; that case takes the @buf2
  ; path (%11), any other value takes the @buf path (%7).
  %4 = icmp eq i32 %0, 0
  ; shl 8 followed by ashr 8 sign-extends the low 8 bits of %1 to i16. %6 is
  ; the first i16 operand of every AMX intrinsic call below
  ; (presumably the tile row count -- TODO confirm).
  %5 = shl i16 %1, 8
  %6 = ashr exact i16 %5, 8
  br i1 %4, label %11, label %7

7:                                                ; preds = %3
  ; Three tile loads from @buf with a trailing i64 operand of 32. The second
  ; i16 operand is %1 for the first load and %2 for the other two
  ; (presumably the column width in bytes and the row stride -- TODO confirm).
  %8 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %6, i16 %1, ptr @buf, i64 32)
  %9 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %6, i16 %2, ptr @buf, i64 32)
  %10 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %6, i16 %2, ptr @buf, i64 32)
  br label %15

11:                                               ; preds = %3
  ; Same three loads as in block %7, but reading from @buf2.
  %12 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %6, i16 %1, ptr @buf2, i64 32)
  %13 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %6, i16 %2, ptr @buf2, i64 32)
  %14 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %6, i16 %2, ptr @buf2, i64 32)
  br label %15

15:                                               ; preds = %11, %7
  ; Merge the three tiles coming from whichever predecessor executed.
  %16 = phi x86_amx [ %12, %11 ], [ %8, %7 ]
  %17 = phi x86_amx [ %13, %11 ], [ %9, %7 ]
  %18 = phi x86_amx [ %14, %11 ], [ %10, %7 ]
  ; Dot-product intrinsic: %18 is the first tile operand, followed by %16 and
  ; %17 (presumably accumulator, then the two multiplicands -- TODO confirm).
  %19 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %6, i16 %2, i16 %1, x86_amx %18, x86_amx %16, x86_amx %17)
  ; The result tile is always written to @buf, regardless of which buffer the
  ; inputs were loaded from.
  tail call void @llvm.x86.tilestored64.internal(i16 %6, i16 %2, ptr @buf, i64 32, x86_amx %19)
  ; The vector argument is returned untouched.
  ret <4 x i32> %xmm0
}

; AMX intrinsics used above.
declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64)
declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx)