; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -mtriple=x86_64 -lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s

; Exercises -lower-amx-intrinsics on a function that uses x86_amx values
; directly, with no bitcast instructions in the input IR. The input loads
; three tiles (from %C_mem, %A_mem and %B_mem), feeds them to tdpbssd, and
; stores the result back to %C_mem. The expected output replaces each
; intrinsic call with a scalar loop nest operating on <256 x i32> vectors.
;
; The address computation of the final store must reference the same pointer
; that the first tile load bound as C_MEM, so that pattern is written as a use
; of the variable rather than a fresh definition.

define dso_local void @test_no_bitcast(ptr %A_mem, ptr %B_mem, ptr %C_mem) local_unnamed_addr #0 {
; CHECK-LABEL: @test_no_bitcast(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br label [[TILELOAD_SCALARIZE_ROWS_HEADER:%.*]]
; CHECK:       tileload.scalarize.rows.header:
; CHECK-NEXT:    [[TILELOAD_SCALARIZE_ROWS_IV:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TILELOAD_SCALARIZE_ROWS_STEP:%.*]], [[TILELOAD_SCALARIZE_ROWS_LATCH:%.*]] ]
; CHECK-NEXT:    [[VEC_PHI_ROW:%.*]] = phi <256 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[TMP10:%.*]], [[TILELOAD_SCALARIZE_ROWS_LATCH]] ]
; CHECK-NEXT:    br label [[TILELOAD_SCALARIZE_ROWS_BODY:%.*]]
; CHECK:       tileload.scalarize.rows.body:
; CHECK-NEXT:    br label [[TILELOAD_SCALARIZE_COLS_HEADER:%.*]]
; CHECK:       tileload.scalarize.cols.header:
; CHECK-NEXT:    [[TILELOAD_SCALARIZE_COLS_IV:%.*]] = phi i16 [ 0, [[TILELOAD_SCALARIZE_ROWS_BODY]] ], [ [[TILELOAD_SCALARIZE_COLS_STEP:%.*]], [[TILELOAD_SCALARIZE_COLS_LATCH:%.*]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <256 x i32> [ [[VEC_PHI_ROW]], [[TILELOAD_SCALARIZE_ROWS_BODY]] ], [ [[TMP10]], [[TILELOAD_SCALARIZE_COLS_LATCH]] ]
; CHECK-NEXT:    br label [[TILELOAD_SCALARIZE_COLS_BODY:%.*]]
; CHECK:       tileload.scalarize.cols.body:
; CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[TILELOAD_SCALARIZE_ROWS_IV]] to i64
; CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[TILELOAD_SCALARIZE_COLS_IV]] to i64
; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP1]], 4
; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[TMP3]], [[TMP2]]
; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i32, ptr [[C_MEM:%.*]], i64 [[TMP4]]
; CHECK-NEXT:    [[TMP7:%.*]] = mul i16 [[TILELOAD_SCALARIZE_ROWS_IV]], 16
; CHECK-NEXT:    [[TMP8:%.*]] = add i16 [[TMP7]], [[TILELOAD_SCALARIZE_COLS_IV]]
; CHECK-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP6]], align 4
; CHECK-NEXT:    [[TMP10]] = insertelement <256 x i32> [[VEC_PHI]], i32 [[TMP9]], i16 [[TMP8]]
; CHECK-NEXT:    br label [[TILELOAD_SCALARIZE_COLS_LATCH]]
; CHECK:       tileload.scalarize.cols.latch:
; CHECK-NEXT:    [[TILELOAD_SCALARIZE_COLS_STEP]] = add i16 [[TILELOAD_SCALARIZE_COLS_IV]], 1
; CHECK-NEXT:    [[TILELOAD_SCALARIZE_COLS_COND:%.*]] = icmp ne i16 [[TILELOAD_SCALARIZE_COLS_STEP]], 4
; CHECK-NEXT:    br i1 [[TILELOAD_SCALARIZE_COLS_COND]], label [[TILELOAD_SCALARIZE_COLS_HEADER]], label [[TILELOAD_SCALARIZE_ROWS_LATCH]]
; CHECK:       tileload.scalarize.rows.latch:
; CHECK-NEXT:    [[TILELOAD_SCALARIZE_ROWS_STEP]] = add i16 [[TILELOAD_SCALARIZE_ROWS_IV]], 1
; CHECK-NEXT:    [[TILELOAD_SCALARIZE_ROWS_COND:%.*]] = icmp ne i16 [[TILELOAD_SCALARIZE_ROWS_STEP]], 4
; CHECK-NEXT:    br i1 [[TILELOAD_SCALARIZE_ROWS_COND]], label [[TILELOAD_SCALARIZE_ROWS_HEADER]], label [[CONTINUE:%.*]]
; CHECK:       continue:
; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <256 x i32> [[TMP10]] to x86_amx
; CHECK-NEXT:    br label [[TILELOAD_SCALARIZE_ROWS_HEADER2:%.*]]
; CHECK:       tileload.scalarize.rows.header2:
; CHECK-NEXT:    [[TILELOAD_SCALARIZE_ROWS_IV5:%.*]] = phi i16 [ 0, [[CONTINUE]] ], [ [[TILELOAD_SCALARIZE_ROWS_STEP6:%.*]], [[TILELOAD_SCALARIZE_ROWS_LATCH4:%.*]] ]
; CHECK-NEXT:    [[VEC_PHI_ROW14:%.*]] = phi <256 x i32> [ zeroinitializer, [[CONTINUE]] ], [ [[TMP22:%.*]], [[TILELOAD_SCALARIZE_ROWS_LATCH4]] ]
; CHECK-NEXT:    br label [[TILELOAD_SCALARIZE_ROWS_BODY3:%.*]]
; CHECK:       tileload.scalarize.rows.body3:
; CHECK-NEXT:    br label [[TILELOAD_SCALARIZE_COLS_HEADER8:%.*]]
; CHECK:       tileload.scalarize.cols.header8:
; CHECK-NEXT:    [[TILELOAD_SCALARIZE_COLS_IV11:%.*]] = phi i16 [ 0, [[TILELOAD_SCALARIZE_ROWS_BODY3]] ], [ [[TILELOAD_SCALARIZE_COLS_STEP12:%.*]], [[TILELOAD_SCALARIZE_COLS_LATCH10:%.*]] ]
; CHECK-NEXT:    [[VEC_PHI15:%.*]] = phi <256 x i32> [ [[VEC_PHI_ROW14]], [[TILELOAD_SCALARIZE_ROWS_BODY3]] ], [ [[TMP22]], [[TILELOAD_SCALARIZE_COLS_LATCH10]] ]
; CHECK-NEXT:    br label [[TILELOAD_SCALARIZE_COLS_BODY9:%.*]]
; CHECK:       tileload.scalarize.cols.body9:
; CHECK-NEXT:    [[TMP13:%.*]] = zext i16 [[TILELOAD_SCALARIZE_ROWS_IV5]] to i64
; CHECK-NEXT:    [[TMP14:%.*]] = zext i16 [[TILELOAD_SCALARIZE_COLS_IV11]] to i64
; CHECK-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP13]], 4
; CHECK-NEXT:    [[TMP16:%.*]] = add i64 [[TMP15]], [[TMP14]]
; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr i32, ptr [[A_MEM:%.*]], i64 [[TMP16]]
; CHECK-NEXT:    [[TMP19:%.*]] = mul i16 [[TILELOAD_SCALARIZE_ROWS_IV5]], 16
; CHECK-NEXT:    [[TMP20:%.*]] = add i16 [[TMP19]], [[TILELOAD_SCALARIZE_COLS_IV11]]
; CHECK-NEXT:    [[TMP21:%.*]] = load i32, ptr [[TMP18]], align 4
; CHECK-NEXT:    [[TMP22]] = insertelement <256 x i32> [[VEC_PHI15]], i32 [[TMP21]], i16 [[TMP20]]
; CHECK-NEXT:    br label [[TILELOAD_SCALARIZE_COLS_LATCH10]]
; CHECK:       tileload.scalarize.cols.latch10:
; CHECK-NEXT:    [[TILELOAD_SCALARIZE_COLS_STEP12]] = add i16 [[TILELOAD_SCALARIZE_COLS_IV11]], 1
; CHECK-NEXT:    [[TILELOAD_SCALARIZE_COLS_COND13:%.*]] = icmp ne i16 [[TILELOAD_SCALARIZE_COLS_STEP12]], 4
; CHECK-NEXT:    br i1 [[TILELOAD_SCALARIZE_COLS_COND13]], label [[TILELOAD_SCALARIZE_COLS_HEADER8]], label [[TILELOAD_SCALARIZE_ROWS_LATCH4]]
; CHECK:       tileload.scalarize.rows.latch4:
; CHECK-NEXT:    [[TILELOAD_SCALARIZE_ROWS_STEP6]] = add i16 [[TILELOAD_SCALARIZE_ROWS_IV5]], 1
; CHECK-NEXT:    [[TILELOAD_SCALARIZE_ROWS_COND7:%.*]] = icmp ne i16 [[TILELOAD_SCALARIZE_ROWS_STEP6]], 4
; CHECK-NEXT:    br i1 [[TILELOAD_SCALARIZE_ROWS_COND7]], label [[TILELOAD_SCALARIZE_ROWS_HEADER2]], label [[CONTINUE1:%.*]]
; CHECK:       continue1:
; CHECK-NEXT:    [[TMP23:%.*]] = bitcast <256 x i32> [[TMP22]] to x86_amx
; CHECK-NEXT:    br label [[TILELOAD_SCALARIZE_ROWS_HEADER17:%.*]]
; CHECK:       tileload.scalarize.rows.header17:
; CHECK-NEXT:    [[TILELOAD_SCALARIZE_ROWS_IV20:%.*]] = phi i16 [ 0, [[CONTINUE1]] ], [ [[TILELOAD_SCALARIZE_ROWS_STEP21:%.*]], [[TILELOAD_SCALARIZE_ROWS_LATCH19:%.*]] ]
; CHECK-NEXT:    [[VEC_PHI_ROW29:%.*]] = phi <256 x i32> [ zeroinitializer, [[CONTINUE1]] ], [ [[TMP34:%.*]], [[TILELOAD_SCALARIZE_ROWS_LATCH19]] ]
; CHECK-NEXT:    br label [[TILELOAD_SCALARIZE_ROWS_BODY18:%.*]]
; CHECK:       tileload.scalarize.rows.body18:
; CHECK-NEXT:    br label [[TILELOAD_SCALARIZE_COLS_HEADER23:%.*]]
; CHECK:       tileload.scalarize.cols.header23:
; CHECK-NEXT:    [[TILELOAD_SCALARIZE_COLS_IV26:%.*]] = phi i16 [ 0, [[TILELOAD_SCALARIZE_ROWS_BODY18]] ], [ [[TILELOAD_SCALARIZE_COLS_STEP27:%.*]], [[TILELOAD_SCALARIZE_COLS_LATCH25:%.*]] ]
; CHECK-NEXT:    [[VEC_PHI30:%.*]] = phi <256 x i32> [ [[VEC_PHI_ROW29]], [[TILELOAD_SCALARIZE_ROWS_BODY18]] ], [ [[TMP34]], [[TILELOAD_SCALARIZE_COLS_LATCH25]] ]
; CHECK-NEXT:    br label [[TILELOAD_SCALARIZE_COLS_BODY24:%.*]]
; CHECK:       tileload.scalarize.cols.body24:
; CHECK-NEXT:    [[TMP25:%.*]] = zext i16 [[TILELOAD_SCALARIZE_ROWS_IV20]] to i64
; CHECK-NEXT:    [[TMP26:%.*]] = zext i16 [[TILELOAD_SCALARIZE_COLS_IV26]] to i64
; CHECK-NEXT:    [[TMP27:%.*]] = mul i64 [[TMP25]], 4
; CHECK-NEXT:    [[TMP28:%.*]] = add i64 [[TMP27]], [[TMP26]]
; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr i32, ptr [[B_MEM:%.*]], i64 [[TMP28]]
; CHECK-NEXT:    [[TMP31:%.*]] = mul i16 [[TILELOAD_SCALARIZE_ROWS_IV20]], 16
; CHECK-NEXT:    [[TMP32:%.*]] = add i16 [[TMP31]], [[TILELOAD_SCALARIZE_COLS_IV26]]
; CHECK-NEXT:    [[TMP33:%.*]] = load i32, ptr [[TMP30]], align 4
; CHECK-NEXT:    [[TMP34]] = insertelement <256 x i32> [[VEC_PHI30]], i32 [[TMP33]], i16 [[TMP32]]
; CHECK-NEXT:    br label [[TILELOAD_SCALARIZE_COLS_LATCH25]]
; CHECK:       tileload.scalarize.cols.latch25:
; CHECK-NEXT:    [[TILELOAD_SCALARIZE_COLS_STEP27]] = add i16 [[TILELOAD_SCALARIZE_COLS_IV26]], 1
; CHECK-NEXT:    [[TILELOAD_SCALARIZE_COLS_COND28:%.*]] = icmp ne i16 [[TILELOAD_SCALARIZE_COLS_STEP27]], 4
; CHECK-NEXT:    br i1 [[TILELOAD_SCALARIZE_COLS_COND28]], label [[TILELOAD_SCALARIZE_COLS_HEADER23]], label [[TILELOAD_SCALARIZE_ROWS_LATCH19]]
; CHECK:       tileload.scalarize.rows.latch19:
; CHECK-NEXT:    [[TILELOAD_SCALARIZE_ROWS_STEP21]] = add i16 [[TILELOAD_SCALARIZE_ROWS_IV20]], 1
; CHECK-NEXT:    [[TILELOAD_SCALARIZE_ROWS_COND22:%.*]] = icmp ne i16 [[TILELOAD_SCALARIZE_ROWS_STEP21]], 4
; CHECK-NEXT:    br i1 [[TILELOAD_SCALARIZE_ROWS_COND22]], label [[TILELOAD_SCALARIZE_ROWS_HEADER17]], label [[CONTINUE16:%.*]]
; CHECK:       continue16:
; CHECK-NEXT:    [[TMP35:%.*]] = bitcast <256 x i32> [[TMP34]] to x86_amx
; CHECK-NEXT:    br label [[TILEDPBSSD_SCALARIZE_ROWS_HEADER:%.*]]
; CHECK:       tiledpbssd.scalarize.rows.header:
; CHECK-NEXT:    [[TILEDPBSSD_SCALARIZE_ROWS_IV:%.*]] = phi i16 [ 0, [[CONTINUE16]] ], [ [[TILEDPBSSD_SCALARIZE_ROWS_STEP:%.*]], [[TILEDPBSSD_SCALARIZE_ROWS_LATCH:%.*]] ]
; CHECK-NEXT:    [[VEC_C_PHI_ROW:%.*]] = phi <256 x i32> [ [[TMP10]], [[CONTINUE16]] ], [ [[TMP52:%.*]], [[TILEDPBSSD_SCALARIZE_ROWS_LATCH]] ]
; CHECK-NEXT:    [[VEC_D_PHI_ROW:%.*]] = phi <256 x i32> [ zeroinitializer, [[CONTINUE16]] ], [ [[TMP54:%.*]], [[TILEDPBSSD_SCALARIZE_ROWS_LATCH]] ]
; CHECK-NEXT:    br label [[TILEDPBSSD_SCALARIZE_ROWS_BODY:%.*]]
; CHECK:       tiledpbssd.scalarize.rows.body:
; CHECK-NEXT:    br label [[TILEDPBSSD_SCALARIZE_COLS_HEADER:%.*]]
; CHECK:       tiledpbssd.scalarize.cols.header:
; CHECK-NEXT:    [[TILEDPBSSD_SCALARIZE_COLS_IV:%.*]] = phi i16 [ 0, [[TILEDPBSSD_SCALARIZE_ROWS_BODY]] ], [ [[TILEDPBSSD_SCALARIZE_COLS_STEP:%.*]], [[TILEDPBSSD_SCALARIZE_COLS_LATCH:%.*]] ]
; CHECK-NEXT:    [[VEC_C_PHI_COL:%.*]] = phi <256 x i32> [ [[VEC_C_PHI_ROW]], [[TILEDPBSSD_SCALARIZE_ROWS_BODY]] ], [ [[TMP52]], [[TILEDPBSSD_SCALARIZE_COLS_LATCH]] ]
; CHECK-NEXT:    [[VEC_D_PHI_COL:%.*]] = phi <256 x i32> [ [[VEC_D_PHI_ROW]], [[TILEDPBSSD_SCALARIZE_ROWS_BODY]] ], [ [[TMP54]], [[TILEDPBSSD_SCALARIZE_COLS_LATCH]] ]
; CHECK-NEXT:    [[TMP36:%.*]] = mul i16 [[TILEDPBSSD_SCALARIZE_ROWS_IV]], 16
; CHECK-NEXT:    [[TMP37:%.*]] = add i16 [[TMP36]], [[TILEDPBSSD_SCALARIZE_COLS_IV]]
; CHECK-NEXT:    br label [[TILEDPBSSD_SCALARIZE_COLS_BODY:%.*]]
; CHECK:       tiledpbssd.scalarize.cols.body:
; CHECK-NEXT:    br label [[TILEDPBSSD_SCALARIZE_INNER_HEADER:%.*]]
; CHECK:       tiledpbssd.scalarize.inner.header:
; CHECK-NEXT:    [[TILEDPBSSD_SCALARIZE_INNER_IV:%.*]] = phi i16 [ 0, [[TILEDPBSSD_SCALARIZE_COLS_BODY]] ], [ [[TILEDPBSSD_SCALARIZE_INNER_STEP:%.*]], [[TILEDPBSSD_SCALARIZE_INNER_LATCH:%.*]] ]
; CHECK-NEXT:    [[VEC_C_INNER_PHI:%.*]] = phi <256 x i32> [ [[VEC_C_PHI_COL]], [[TILEDPBSSD_SCALARIZE_COLS_BODY]] ], [ [[TMP52]], [[TILEDPBSSD_SCALARIZE_INNER_LATCH]] ]
; CHECK-NEXT:    br label [[TILEDPBSSD_SCALARIZE_INNER_BODY:%.*]]
; CHECK:       tiledpbssd.scalarize.inner.body:
; CHECK-NEXT:    [[TMP38:%.*]] = mul i16 [[TILEDPBSSD_SCALARIZE_ROWS_IV]], 16
; CHECK-NEXT:    [[TMP39:%.*]] = add i16 [[TMP38]], [[TILEDPBSSD_SCALARIZE_INNER_IV]]
; CHECK-NEXT:    [[TMP40:%.*]] = mul i16 [[TILEDPBSSD_SCALARIZE_INNER_IV]], 16
; CHECK-NEXT:    [[TMP41:%.*]] = add i16 [[TMP40]], [[TILEDPBSSD_SCALARIZE_COLS_IV]]
; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <256 x i32> [[VEC_C_INNER_PHI]], i16 [[TMP37]]
; CHECK-NEXT:    [[TMP43:%.*]] = extractelement <256 x i32> [[TMP22]], i16 [[TMP39]]
; CHECK-NEXT:    [[TMP44:%.*]] = bitcast i32 [[TMP43]] to <4 x i8>
; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <256 x i32> [[TMP34]], i16 [[TMP41]]
; CHECK-NEXT:    [[TMP46:%.*]] = bitcast i32 [[TMP45]] to <4 x i8>
; CHECK-NEXT:    [[TMP47:%.*]] = sext <4 x i8> [[TMP46]] to <4 x i32>
; CHECK-NEXT:    [[TMP48:%.*]] = sext <4 x i8> [[TMP44]] to <4 x i32>
; CHECK-NEXT:    [[TMP49:%.*]] = mul <4 x i32> [[TMP48]], [[TMP47]]
; CHECK-NEXT:    [[TMP50:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP49]])
; CHECK-NEXT:    [[TMP51:%.*]] = add i32 [[TMP42]], [[TMP50]]
; CHECK-NEXT:    [[TMP52]] = insertelement <256 x i32> [[VEC_C_INNER_PHI]], i32 [[TMP51]], i16 [[TMP37]]
; CHECK-NEXT:    br label [[TILEDPBSSD_SCALARIZE_INNER_LATCH]]
; CHECK:       tiledpbssd.scalarize.inner.latch:
; CHECK-NEXT:    [[TILEDPBSSD_SCALARIZE_INNER_STEP]] = add i16 [[TILEDPBSSD_SCALARIZE_INNER_IV]], 1
; CHECK-NEXT:    [[TILEDPBSSD_SCALARIZE_INNER_COND:%.*]] = icmp ne i16 [[TILEDPBSSD_SCALARIZE_INNER_STEP]], 4
; CHECK-NEXT:    br i1 [[TILEDPBSSD_SCALARIZE_INNER_COND]], label [[TILEDPBSSD_SCALARIZE_INNER_HEADER]], label [[TILEDPBSSD_SCALARIZE_COLS_LATCH]]
; CHECK:       tiledpbssd.scalarize.cols.latch:
; CHECK-NEXT:    [[TILEDPBSSD_SCALARIZE_COLS_STEP]] = add i16 [[TILEDPBSSD_SCALARIZE_COLS_IV]], 1
; CHECK-NEXT:    [[TILEDPBSSD_SCALARIZE_COLS_COND:%.*]] = icmp ne i16 [[TILEDPBSSD_SCALARIZE_COLS_STEP]], 4
; CHECK-NEXT:    [[TMP53:%.*]] = extractelement <256 x i32> [[TMP52]], i16 [[TMP37]]
; CHECK-NEXT:    [[TMP54]] = insertelement <256 x i32> [[VEC_D_PHI_COL]], i32 [[TMP53]], i16 [[TMP37]]
; CHECK-NEXT:    br i1 [[TILEDPBSSD_SCALARIZE_COLS_COND]], label [[TILEDPBSSD_SCALARIZE_COLS_HEADER]], label [[TILEDPBSSD_SCALARIZE_ROWS_LATCH]]
; CHECK:       tiledpbssd.scalarize.rows.latch:
; CHECK-NEXT:    [[TILEDPBSSD_SCALARIZE_ROWS_STEP]] = add i16 [[TILEDPBSSD_SCALARIZE_ROWS_IV]], 1
; CHECK-NEXT:    [[TILEDPBSSD_SCALARIZE_ROWS_COND:%.*]] = icmp ne i16 [[TILEDPBSSD_SCALARIZE_ROWS_STEP]], 4
; CHECK-NEXT:    br i1 [[TILEDPBSSD_SCALARIZE_ROWS_COND]], label [[TILEDPBSSD_SCALARIZE_ROWS_HEADER]], label [[CONTINUE31:%.*]]
; CHECK:       continue31:
; CHECK-NEXT:    [[TMP55:%.*]] = bitcast <256 x i32> [[TMP54]] to x86_amx
; CHECK-NEXT:    br label [[TILESTORE_SCALARIZE_ROWS_HEADER:%.*]]
; CHECK:       tilestore.scalarize.rows.header:
; CHECK-NEXT:    [[TILESTORE_SCALARIZE_ROWS_IV:%.*]] = phi i16 [ 0, [[CONTINUE31]] ], [ [[TILESTORE_SCALARIZE_ROWS_STEP:%.*]], [[TILESTORE_SCALARIZE_ROWS_LATCH:%.*]] ]
; CHECK-NEXT:    br label [[TILESTORE_SCALARIZE_ROWS_BODY:%.*]]
; CHECK:       tilestore.scalarize.rows.body:
; CHECK-NEXT:    br label [[TILESTORE_SCALARIZE_COLS_HEADER:%.*]]
; CHECK:       tilestore.scalarize.cols.header:
; CHECK-NEXT:    [[TILESTORE_SCALARIZE_COLS_IV:%.*]] = phi i16 [ 0, [[TILESTORE_SCALARIZE_ROWS_BODY]] ], [ [[TILESTORE_SCALARIZE_COLS_STEP:%.*]], [[TILESTORE_SCALARIZE_COLS_LATCH:%.*]] ]
; CHECK-NEXT:    br label [[TILESTORE_SCALARIZE_COLS_BODY:%.*]]
; CHECK:       tilestore.scalarize.cols.body:
; CHECK-NEXT:    [[TMP56:%.*]] = zext i16 [[TILESTORE_SCALARIZE_ROWS_IV]] to i64
; CHECK-NEXT:    [[TMP57:%.*]] = zext i16 [[TILESTORE_SCALARIZE_COLS_IV]] to i64
; CHECK-NEXT:    [[TMP58:%.*]] = mul i64 [[TMP56]], 4
; CHECK-NEXT:    [[TMP59:%.*]] = add i64 [[TMP58]], [[TMP57]]
; CHECK-NEXT:    [[TMP61:%.*]] = getelementptr i32, ptr [[C_MEM]], i64 [[TMP59]]
; CHECK-NEXT:    [[TMP62:%.*]] = mul i16 [[TILESTORE_SCALARIZE_ROWS_IV]], 16
; CHECK-NEXT:    [[TMP63:%.*]] = add i16 [[TMP62]], [[TILESTORE_SCALARIZE_COLS_IV]]
; CHECK-NEXT:    [[TMP64:%.*]] = extractelement <256 x i32> [[TMP54]], i16 [[TMP63]]
; CHECK-NEXT:    store i32 [[TMP64]], ptr [[TMP61]], align 4
; CHECK-NEXT:    br label [[TILESTORE_SCALARIZE_COLS_LATCH]]
; CHECK:       tilestore.scalarize.cols.latch:
; CHECK-NEXT:    [[TILESTORE_SCALARIZE_COLS_STEP]] = add i16 [[TILESTORE_SCALARIZE_COLS_IV]], 1
; CHECK-NEXT:    [[TILESTORE_SCALARIZE_COLS_COND:%.*]] = icmp ne i16 [[TILESTORE_SCALARIZE_COLS_STEP]], 4
; CHECK-NEXT:    br i1 [[TILESTORE_SCALARIZE_COLS_COND]], label [[TILESTORE_SCALARIZE_COLS_HEADER]], label [[TILESTORE_SCALARIZE_ROWS_LATCH]]
; CHECK:       tilestore.scalarize.rows.latch:
; CHECK-NEXT:    [[TILESTORE_SCALARIZE_ROWS_STEP]] = add i16 [[TILESTORE_SCALARIZE_ROWS_IV]], 1
; CHECK-NEXT:    [[TILESTORE_SCALARIZE_ROWS_COND:%.*]] = icmp ne i16 [[TILESTORE_SCALARIZE_ROWS_STEP]], 4
; CHECK-NEXT:    br i1 [[TILESTORE_SCALARIZE_ROWS_COND]], label [[TILESTORE_SCALARIZE_ROWS_HEADER]], label [[CONTINUE32:%.*]]
; CHECK:       continue32:
; CHECK-NEXT:    ret void
;
entry:
  ; Tile loads: 4 rows, 16 bytes per row, row stride 16 bytes.
  %0 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 4, i16 16, ptr %C_mem, i64 16)
  %1 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 4, i16 16, ptr %A_mem, i64 16)
  %2 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 4, i16 16, ptr %B_mem, i64 16)
  ; %0 is the accumulator tile; %1 and %2 are the two multiplicand tiles.
  %3 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 4, i16 16, i16 16, x86_amx %0, x86_amx %1, x86_amx %2)
  ; The result is written back to the same buffer the accumulator came from.
  tail call void @llvm.x86.tilestored64.internal(i16 4, i16 16, ptr %C_mem, i64 16, x86_amx %3)
  ret void
}

declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64)
declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx)

attributes #0 = { noinline nounwind optnone }