; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f | FileCheck %s --check-prefix=AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 | FileCheck %s --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -O0 | FileCheck %s --check-prefix=AVX512-O0
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx2 -O0 | FileCheck %s --check-prefix=AVX2-O0
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -O0 | FileCheck %s --check-prefix=SSE2-O0

; Codegen test for @foo, which creates a tile with @llvm.x86.tilezero.internal
; and stores it to %buf with @llvm.x86.tilestored64.internal.
;
; Each llc invocation above pairs +amx-int8 with a different vector feature
; level (+avx512f, +avx2, or no vector attribute), once at the default
; optimization level and once at -O0. The expected output differs mainly in
; how a stack area is zero-filled ahead of ldtilecfg: one zmm store, two ymm
; stores, or four xmm stores. The two AVX-enabled configurations also expect a
; vzeroupper before the return. The -O0 variants additionally expect the tile
; to be written to a stack slot and read back (tilestored then tileloadd),
; with a second ldtilecfg emitted before the reload.
;
; The assertion lines below are generated; rerun
; utils/update_llc_test_checks.py instead of editing them by hand.

define void @foo(ptr %buf) nounwind {
; AVX512-LABEL: foo:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
; AVX512-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
; AVX512-NEXT:    movb $8, -{{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw $32, -{{[0-9]+}}(%rsp)
; AVX512-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw $32, %ax
; AVX512-NEXT:    movw $8, %cx
; AVX512-NEXT:    tilezero %tmm0
; AVX512-NEXT:    movl $1024, %edx # imm = 0x400
; AVX512-NEXT:    tilestored %tmm0, (%rdi,%rdx)
; AVX512-NEXT:    tilerelease
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; AVX2-LABEL: foo:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
; AVX2-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
; AVX2-NEXT:    movb $8, -{{[0-9]+}}(%rsp)
; AVX2-NEXT:    movw $32, -{{[0-9]+}}(%rsp)
; AVX2-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
; AVX2-NEXT:    movw $32, %ax
; AVX2-NEXT:    movw $8, %cx
; AVX2-NEXT:    tilezero %tmm0
; AVX2-NEXT:    movl $1024, %edx # imm = 0x400
; AVX2-NEXT:    tilestored %tmm0, (%rdi,%rdx)
; AVX2-NEXT:    tilerelease
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; SSE2-LABEL: foo:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    xorps %xmm0, %xmm0
; SSE2-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movb $8, -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movw $32, -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movw $32, %ax
; SSE2-NEXT:    movw $8, %cx
; SSE2-NEXT:    tilezero %tmm0
; SSE2-NEXT:    movl $1024, %edx # imm = 0x400
; SSE2-NEXT:    tilestored %tmm0, (%rdi,%rdx)
; SSE2-NEXT:    tilerelease
; SSE2-NEXT:    retq
;
; AVX512-O0-LABEL: foo:
; AVX512-O0:       # %bb.0: # %entry
; AVX512-O0-NEXT:    pushq %rbp
; AVX512-O0-NEXT:    movq %rsp, %rbp
; AVX512-O0-NEXT:    andq $-1024, %rsp # imm = 0xFC00
; AVX512-O0-NEXT:    subq $3072, %rsp # imm = 0xC00
; AVX512-O0-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-O0-NEXT:    vmovups %zmm0, {{[0-9]+}}(%rsp)
; AVX512-O0-NEXT:    movb $1, {{[0-9]+}}(%rsp)
; AVX512-O0-NEXT:    movw $32, %cx
; AVX512-O0-NEXT:    movw $8, %ax
; AVX512-O0-NEXT:    # implicit-def: $al
; AVX512-O0-NEXT:    movb %al, {{[0-9]+}}(%rsp)
; AVX512-O0-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
; AVX512-O0-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; AVX512-O0-NEXT:    tilezero %tmm0
; AVX512-O0-NEXT:    movl $64, %esi
; AVX512-O0-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
; AVX512-O0-NEXT:    movw $32, %cx
; AVX512-O0-NEXT:    movw $8, %ax
; AVX512-O0-NEXT:    tilestored %tmm0, (%rdx,%rsi)
; AVX512-O0-NEXT:    movl $64, %esi
; AVX512-O0-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
; AVX512-O0-NEXT:    movw $32, %cx
; AVX512-O0-NEXT:    movw $8, %ax
; AVX512-O0-NEXT:    # implicit-def: $al
; AVX512-O0-NEXT:    movb %al, {{[0-9]+}}(%rsp)
; AVX512-O0-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
; AVX512-O0-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; AVX512-O0-NEXT:    tileloadd (%rdx,%rsi), %tmm0
; AVX512-O0-NEXT:    movl $1024, %edx # imm = 0x400
; AVX512-O0-NEXT:    movw $32, %cx
; AVX512-O0-NEXT:    movw $8, %ax
; AVX512-O0-NEXT:    tilestored %tmm0, (%rdi,%rdx)
; AVX512-O0-NEXT:    movq %rbp, %rsp
; AVX512-O0-NEXT:    popq %rbp
; AVX512-O0-NEXT:    tilerelease
; AVX512-O0-NEXT:    vzeroupper
; AVX512-O0-NEXT:    retq
;
; AVX2-O0-LABEL: foo:
; AVX2-O0:       # %bb.0: # %entry
; AVX2-O0-NEXT:    pushq %rbp
; AVX2-O0-NEXT:    movq %rsp, %rbp
; AVX2-O0-NEXT:    andq $-1024, %rsp # imm = 0xFC00
; AVX2-O0-NEXT:    subq $3072, %rsp # imm = 0xC00
; AVX2-O0-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX2-O0-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
; AVX2-O0-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
; AVX2-O0-NEXT:    movb $1, {{[0-9]+}}(%rsp)
; AVX2-O0-NEXT:    movw $32, %cx
; AVX2-O0-NEXT:    movw $8, %ax
; AVX2-O0-NEXT:    # implicit-def: $al
; AVX2-O0-NEXT:    movb %al, {{[0-9]+}}(%rsp)
; AVX2-O0-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
; AVX2-O0-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; AVX2-O0-NEXT:    tilezero %tmm0
; AVX2-O0-NEXT:    movl $64, %esi
; AVX2-O0-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
; AVX2-O0-NEXT:    movw $32, %cx
; AVX2-O0-NEXT:    movw $8, %ax
; AVX2-O0-NEXT:    tilestored %tmm0, (%rdx,%rsi)
; AVX2-O0-NEXT:    movl $64, %esi
; AVX2-O0-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
; AVX2-O0-NEXT:    movw $32, %cx
; AVX2-O0-NEXT:    movw $8, %ax
; AVX2-O0-NEXT:    # implicit-def: $al
; AVX2-O0-NEXT:    movb %al, {{[0-9]+}}(%rsp)
; AVX2-O0-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
; AVX2-O0-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; AVX2-O0-NEXT:    tileloadd (%rdx,%rsi), %tmm0
; AVX2-O0-NEXT:    movl $1024, %edx # imm = 0x400
; AVX2-O0-NEXT:    movw $32, %cx
; AVX2-O0-NEXT:    movw $8, %ax
; AVX2-O0-NEXT:    tilestored %tmm0, (%rdi,%rdx)
; AVX2-O0-NEXT:    movq %rbp, %rsp
; AVX2-O0-NEXT:    popq %rbp
; AVX2-O0-NEXT:    tilerelease
; AVX2-O0-NEXT:    vzeroupper
; AVX2-O0-NEXT:    retq
;
; SSE2-O0-LABEL: foo:
; SSE2-O0:       # %bb.0: # %entry
; SSE2-O0-NEXT:    pushq %rbp
; SSE2-O0-NEXT:    movq %rsp, %rbp
; SSE2-O0-NEXT:    andq $-1024, %rsp # imm = 0xFC00
; SSE2-O0-NEXT:    subq $3072, %rsp # imm = 0xC00
; SSE2-O0-NEXT:    xorps %xmm0, %xmm0
; SSE2-O0-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
; SSE2-O0-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
; SSE2-O0-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
; SSE2-O0-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
; SSE2-O0-NEXT:    movb $1, {{[0-9]+}}(%rsp)
; SSE2-O0-NEXT:    movw $32, %cx
; SSE2-O0-NEXT:    movw $8, %ax
; SSE2-O0-NEXT:    # implicit-def: $al
; SSE2-O0-NEXT:    movb %al, {{[0-9]+}}(%rsp)
; SSE2-O0-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
; SSE2-O0-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; SSE2-O0-NEXT:    tilezero %tmm0
; SSE2-O0-NEXT:    movl $64, %esi
; SSE2-O0-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
; SSE2-O0-NEXT:    movw $32, %cx
; SSE2-O0-NEXT:    movw $8, %ax
; SSE2-O0-NEXT:    tilestored %tmm0, (%rdx,%rsi)
; SSE2-O0-NEXT:    movl $64, %esi
; SSE2-O0-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
; SSE2-O0-NEXT:    movw $32, %cx
; SSE2-O0-NEXT:    movw $8, %ax
; SSE2-O0-NEXT:    # implicit-def: $al
; SSE2-O0-NEXT:    movb %al, {{[0-9]+}}(%rsp)
; SSE2-O0-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
; SSE2-O0-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; SSE2-O0-NEXT:    tileloadd (%rdx,%rsi), %tmm0
; SSE2-O0-NEXT:    movl $1024, %edx # imm = 0x400
; SSE2-O0-NEXT:    movw $32, %cx
; SSE2-O0-NEXT:    movw $8, %ax
; SSE2-O0-NEXT:    tilestored %tmm0, (%rdi,%rdx)
; SSE2-O0-NEXT:    movq %rbp, %rsp
; SSE2-O0-NEXT:    popq %rbp
; SSE2-O0-NEXT:    tilerelease
; SSE2-O0-NEXT:    retq
entry:
  ; The constants 8 and 32 appear in the expected output as a byte store and
  ; a word store ahead of ldtilecfg (presumably the tile's row count and
  ; bytes per row; confirm against the intrinsic definition).
  %t = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32)
  ; The i64 1024 operand appears in the expected output as the index register
  ; of the final tilestored memory operand, with %buf (in %rdi) as the base.
  call void @llvm.x86.tilestored64.internal(i16 8, i16 32, ptr %buf, i64 1024, x86_amx %t)
  ret void
}

; Intrinsics used by @foo.
declare x86_amx @llvm.x86.tilezero.internal(i16, i16)
declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx)