; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs | FileCheck %s
;
; Every function below calls the AMX ".internal" tile intrinsics (tilezero,
; tileloadd64, tdpbssd, tilestored64) whose leading i16 operands (presumably
; the tile rows/columns - confirm against the intrinsic definitions) are a mix
; of constants, arguments and values derived from arguments. The expected
; assembly pins where the configuration bytes are written to the stack and
; where ldtilecfg / tilerelease are emitted relative to the tile instructions.
@buf = dso_local global [3072 x i8] zeroinitializer, align 16

; Straight-line case: three tile loads feed tdpbssd, the result is stored and
; @foo is then tail-called. The IR also attaches an llvm.dbg.value to the
; tdpbssd result. Expected assembly: a single ldtilecfg ahead of the first
; tileloadd, and tilerelease ahead of the tail jump to foo.
define dso_local void @test1(i16 signext %0, i16 signext %1) nounwind {
; CHECK-LABEL: test1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $8, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $8, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw %si, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw %si, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movl $buf, %eax
; CHECK-NEXT:    movl $32, %ecx
; CHECK-NEXT:    movw $8, %dx
; CHECK-NEXT:    tileloadd (%rax,%rcx), %tmm0
; CHECK-NEXT:    movl $buf+1024, %eax
; CHECK-NEXT:    tileloadd (%rax,%rcx), %tmm1
; CHECK-NEXT:    movl $buf+2048, %eax
; CHECK-NEXT:    tileloadd (%rax,%rcx), %tmm2
; CHECK-NEXT:    tdpbssd %tmm1, %tmm0, %tmm2
; CHECK-NEXT:    tilestored %tmm2, (%rax,%rcx)
; CHECK-NEXT:    tilerelease
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    jmp foo # TAILCALL
  %3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 8, ptr @buf, i64 32)
  %4 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 1024), i64 32)
  %5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 2048), i64 32)
  %6 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %5, x86_amx %3, x86_amx %4)
  call void @llvm.dbg.value(metadata x86_amx %6, metadata !DILocalVariable(name: "1", scope: !2), metadata !DIExpression()), !dbg !3
  tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 2048), i64 32, x86_amx %6)
  tail call void @foo()
  ret void
}

; @foo is called before an undef-conditioned branch, and both successors do
; tile work. Expected assembly: the configuration bytes are stored before the
; call, and ldtilecfg is issued immediately after `callq foo`.
define dso_local void @test2(i16 signext %0, i16 signext %1) nounwind {
; CHECK-LABEL: test2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    pushq %rbx
; CHECK-NEXT:    subq $72, %rsp
; CHECK-NEXT:    movl %esi, %ebx
; CHECK-NEXT:    movl %edi, %ebp
; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    vmovups %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $1, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw %bx, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb %bpl, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw %bx, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb %bpl, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw %bx, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb %bpl, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    callq foo
; CHECK-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    testb %al, %al
; CHECK-NEXT:    jne .LBB1_3
; CHECK-NEXT:  # %bb.1: # %if.true
; CHECK-NEXT:    movw $8, %ax
; CHECK-NEXT:    tilezero %tmm0
; CHECK-NEXT:    movl $32, %ecx
; CHECK-NEXT:    movl $buf+1024, %edx
; CHECK-NEXT:    tileloadd (%rdx,%rcx), %tmm1
; CHECK-NEXT:    movl $buf+2048, %edx
; CHECK-NEXT:    tileloadd (%rdx,%rcx), %tmm2
; CHECK-NEXT:    tdpbssd %tmm2, %tmm1, %tmm0
; CHECK-NEXT:    tilestored %tmm0, (%rdx,%rcx)
; CHECK-NEXT:    jmp .LBB1_2
; CHECK-NEXT:  .LBB1_3: # %if.false
; CHECK-NEXT:    movl $buf, %eax
; CHECK-NEXT:    movl $32, %ecx
; CHECK-NEXT:    movw $8, %dx
; CHECK-NEXT:    tileloadd (%rax,%rcx), %tmm3
; CHECK-NEXT:    movl $buf+1024, %eax
; CHECK-NEXT:    tileloadd (%rax,%rcx), %tmm4
; CHECK-NEXT:    movl $buf+2048, %eax
; CHECK-NEXT:    tileloadd (%rax,%rcx), %tmm2
; CHECK-NEXT:    tdpbssd %tmm2, %tmm4, %tmm3
; CHECK-NEXT:    tilestored %tmm3, (%rax,%rcx)
; CHECK-NEXT:  .LBB1_2: # %if.true
; CHECK-NEXT:    addq $72, %rsp
; CHECK-NEXT:    popq %rbx
; CHECK-NEXT:    popq %rbp
; CHECK-NEXT:    tilerelease
; CHECK-NEXT:    retq
  call void @foo()
  br i1 undef, label %if.true, label %if.false

if.true:
  %t1 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %0, i16 8)
  %t2 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 1024), i64 32)
  %t3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 2048), i64 32)
  %t4 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %t1, x86_amx %t2, x86_amx %t3)
  tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 2048), i64 32, x86_amx %t4)
  br label %exit

if.false:
  %t5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 8, ptr @buf, i64 32)
  %t6 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 1024), i64 32)
  %t7 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 2048), i64 32)
  %t8 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %t5, x86_amx %t6, x86_amx %t7)
  tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 2048), i64 32, x86_amx %t8)
  br label %exit

exit:
  ret void
}

; The first i16 operand of the tile intrinsics is a phi of %0+1 / %0-1 that
; joins in %exit. Expected assembly: the byte derived from %edi is stored in
; %exit, directly before the ldtilecfg.
define dso_local void @test3(i16 signext %0, i16 signext %1) nounwind {
; CHECK-LABEL: test3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw %si, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    testb %al, %al
; CHECK-NEXT:    jne .LBB2_2
; CHECK-NEXT:  # %bb.1: # %if.true
; CHECK-NEXT:    incl %edi
; CHECK-NEXT:    jmp .LBB2_3
; CHECK-NEXT:  .LBB2_2: # %if.false
; CHECK-NEXT:    decl %edi
; CHECK-NEXT:  .LBB2_3: # %exit
; CHECK-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    tilezero %tmm0
; CHECK-NEXT:    movl $buf, %eax
; CHECK-NEXT:    movl $32, %ecx
; CHECK-NEXT:    tilestored %tmm0, (%rax,%rcx)
; CHECK-NEXT:    tilerelease
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  br i1 undef, label %if.true, label %if.false

if.true:
  %3 = add i16 %0, 1
  br label %exit

if.false:
  %4 = sub i16 %0, 1
  br label %exit

exit:
  %5 = phi i16 [ %3, %if.true ], [ %4, %if.false ]
  %6 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %5, i16 %1)
  tail call void @llvm.x86.tilestored64.internal(i16 %5, i16 %1, ptr @buf, i64 32, x86_amx %6)
  ret void
}

; Two tile-using blocks (%amx1, %amx2) each take their first i16 operand from
; a phi over %if.true / %if.false. Expected assembly: each of the two
; predecessors stores the updated byte and issues its own ldtilecfg.
define dso_local void @test4(i16 signext %0, i16 signext %1) nounwind {
; CHECK-LABEL: test4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw %si, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    testb %al, %al
; CHECK-NEXT:    jne .LBB3_3
; CHECK-NEXT:  # %bb.1: # %if.true
; CHECK-NEXT:    incl %edi
; CHECK-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    testb %al, %al
; CHECK-NEXT:    jne .LBB3_4
; CHECK-NEXT:  .LBB3_2: # %amx2
; CHECK-NEXT:    movl $32, %eax
; CHECK-NEXT:    movl $buf+1024, %ecx
; CHECK-NEXT:    tileloadd (%rcx,%rax), %tmm0
; CHECK-NEXT:    movl $buf, %ecx
; CHECK-NEXT:    tilestored %tmm0, (%rcx,%rax)
; CHECK-NEXT:    tilerelease
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
; CHECK-NEXT:  .LBB3_3: # %if.false
; CHECK-NEXT:    decl %edi
; CHECK-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    testb %al, %al
; CHECK-NEXT:    jne .LBB3_2
; CHECK-NEXT:  .LBB3_4: # %amx1
; CHECK-NEXT:    tilezero %tmm0
; CHECK-NEXT:    movl $buf, %eax
; CHECK-NEXT:    movl $32, %ecx
; CHECK-NEXT:    tilestored %tmm0, (%rax,%rcx)
; CHECK-NEXT:    tilerelease
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  br i1 undef, label %if.true, label %if.false

if.true:
  %3 = add i16 %0, 1
  br i1 undef, label %amx1, label %amx2

if.false:
  %4 = sub i16 %0, 1
  br i1 undef, label %amx2, label %amx1

amx1:
  %5 = phi i16 [ %3, %if.true ], [ %4, %if.false ]
  %6 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %5, i16 %1)
  tail call void @llvm.x86.tilestored64.internal(i16 %5, i16 %1, ptr @buf, i64 32, x86_amx %6)
  br label %exit

amx2:
  %7 = phi i16 [ %3, %if.true ], [ %4, %if.false ]
  %8 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %7, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 1024), i64 32)
  tail call void @llvm.x86.tilestored64.internal(i16 %7, i16 %1, ptr @buf, i64 32, x86_amx %8)
  br label %exit

exit:
  ret void
}

; Loop in which the second i16 operand (%2) is a loop-carried phi. Expected
; assembly: ldtilecfg sits at the top of %loop.bb1, and the %if.false path
; rewrites the 16-bit configuration field before looping back.
define dso_local void @test5(i16 signext %0, i16 signext %1) nounwind {
; CHECK-LABEL: test5:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    # kill: def $esi killed $esi def $rsi
; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw %si, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    movl $buf, %ecx
; CHECK-NEXT:    movl $32, %edx
; CHECK-NEXT:    leal -1(%rsi), %r8d
; CHECK-NEXT:    jmp .LBB4_1
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB4_3: # %if.false
; CHECK-NEXT:    # in Loop: Header=BB4_1 Depth=1
; CHECK-NEXT:    movl %r8d, %esi
; CHECK-NEXT:    movw %r8w, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    cmpw $7, %si
; CHECK-NEXT:    jne .LBB4_5
; CHECK-NEXT:  .LBB4_1: # %loop.bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    testb %al, %al
; CHECK-NEXT:    jne .LBB4_3
; CHECK-NEXT:  # %bb.2: # %if.true
; CHECK-NEXT:    # in Loop: Header=BB4_1 Depth=1
; CHECK-NEXT:    tilezero %tmm0
; CHECK-NEXT:    tilestored %tmm0, (%rcx,%rdx)
; CHECK-NEXT:    cmpw $7, %si
; CHECK-NEXT:    je .LBB4_1
; CHECK-NEXT:  .LBB4_5: # %exit
; CHECK-NEXT:    tilerelease
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
entry:
  br label %loop.bb1

loop.bb1:
  %2 = phi i16 [ %1, %entry ], [ %5, %loop.bb2 ]
  br i1 undef, label %if.true, label %if.false

if.true:
  %3 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %0, i16 %2)
  tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %2, ptr @buf, i64 32, x86_amx %3)
  br label %loop.bb2

if.false:
  %4 = sub i16 %1, 1
  br label %loop.bb2

loop.bb2:
  %5 = phi i16 [ %2, %if.true ], [ %4, %if.false ]
  %6 = icmp eq i16 %5, 7
  br i1 %6, label %loop.bb1, label %exit

exit:
  ret void
}

; Loop in which the second i16 operand (%6 = %0 + %4) is recomputed in
; %loop.bb2 on every iteration. Expected assembly: the 16-bit value is stored
; and ldtilecfg is issued inside %loop.bb2, ahead of the tilezero.
define dso_local void @test6(i16 signext %0) nounwind {
; CHECK-LABEL: test6:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    movl $buf, %ecx
; CHECK-NEXT:    movl $32, %edx
; CHECK-NEXT:    xorl %esi, %esi
; CHECK-NEXT:    jmp .LBB5_1
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB5_3: # %if.false
; CHECK-NEXT:    # in Loop: Header=BB5_1 Depth=1
; CHECK-NEXT:    decl %esi
; CHECK-NEXT:  .LBB5_4: # %loop.bb2
; CHECK-NEXT:    # in Loop: Header=BB5_1 Depth=1
; CHECK-NEXT:    leal (%rdi,%rsi), %r8d
; CHECK-NEXT:    movw %r8w, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    cmpw $7, %si
; CHECK-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    tilezero %tmm0
; CHECK-NEXT:    tilestored %tmm0, (%rcx,%rdx)
; CHECK-NEXT:    jne .LBB5_5
; CHECK-NEXT:  .LBB5_1: # %loop.bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    testb %al, %al
; CHECK-NEXT:    jne .LBB5_3
; CHECK-NEXT:  # %bb.2: # %if.true
; CHECK-NEXT:    # in Loop: Header=BB5_1 Depth=1
; CHECK-NEXT:    incl %esi
; CHECK-NEXT:    jmp .LBB5_4
; CHECK-NEXT:  .LBB5_5: # %exit
; CHECK-NEXT:    tilerelease
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
entry:
  br label %loop.bb1

loop.bb1:
  %1 = phi i16 [ 0, %entry ], [ %4, %loop.bb2 ]
  br i1 undef, label %if.true, label %if.false

if.true:
  %2 = add i16 %1, 1
  br label %loop.bb2

if.false:
  %3 = sub i16 %1, 1
  br label %loop.bb2

loop.bb2:
  %4 = phi i16 [ %2, %if.true ], [ %3, %if.false ]
  %5 = icmp eq i16 %4, 7
  %6 = add i16 %0, %4
  %7 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %0, i16 %6)
  tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %6, ptr @buf, i64 32, x86_amx %7)
  br i1 %5, label %loop.bb1, label %exit

exit:
  ret void
}


; External callee and intrinsic declarations used by the functions above.
declare dso_local void @foo() nounwind
declare void @llvm.dbg.value(metadata, metadata, metadata)
declare x86_amx @llvm.x86.tilezero.internal(i16, i16)
declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64)
declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx)

; Debug-info metadata referenced by the llvm.dbg.value call in @test1.
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!1}

!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !DIFile(filename: "1", directory: "1"))
!1 = !{i32 2, !"Debug Info Version", i32 3}
!2 = distinct !DISubprogram(unit: !0)
!3 = !DILocation(line: 1, column: 1, scope: !2)