xref: /llvm-project/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused.ll (revision 90ba33099cbb17e7c159e9ebc5a512037db99d6d)
1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2; RUN: opt -passes=lower-matrix-intrinsics,instcombine -fuse-matrix-use-loops=false -fuse-matrix-tile-size=2 -matrix-allow-contract -force-fuse-matrix -verify-dom-info %s -S | FileCheck %s
3
4; REQUIRES: aarch64-registered-target
5
6target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
7target triple = "aarch64-apple-ios"
8
9; Test tiling without generating explicit loops.
10
; @multiply computes C = A * B for 4x4 matrices of double passed by pointer.
; None of the pointers is marked noalias, and the expected output begins with
; runtime overlap checks of C against A and of C against B; an overlapping
; operand is copied into an alloca before the tiled multiply reads it.
11define void @multiply(ptr %A, ptr %B, ptr %C) {
12; CHECK-LABEL: @multiply(
13; CHECK-NEXT:  entry:
14; CHECK-NEXT:    [[STORE_BEGIN:%.*]] = ptrtoint ptr [[C:%.*]] to i64
15; CHECK-NEXT:    [[STORE_END:%.*]] = add nuw nsw i64 [[STORE_BEGIN]], 128
16; CHECK-NEXT:    [[LOAD_BEGIN:%.*]] = ptrtoint ptr [[A:%.*]] to i64
17; CHECK-NEXT:    [[TMP0:%.*]] = icmp ugt i64 [[STORE_END]], [[LOAD_BEGIN]]
18; CHECK-NEXT:    br i1 [[TMP0]], label [[ALIAS_CONT:%.*]], label [[NO_ALIAS:%.*]]
19; CHECK:       alias_cont:
20; CHECK-NEXT:    [[LOAD_END:%.*]] = add nuw nsw i64 [[LOAD_BEGIN]], 128
21; CHECK-NEXT:    [[TMP1:%.*]] = icmp ugt i64 [[LOAD_END]], [[STORE_BEGIN]]
22; CHECK-NEXT:    br i1 [[TMP1]], label [[COPY:%.*]], label [[NO_ALIAS]]
23; CHECK:       copy:
24; CHECK-NEXT:    [[TMP2:%.*]] = alloca [16 x double], align 8
25; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(128) [[TMP2]], ptr noundef nonnull align 8 dereferenceable(128) [[A]], i64 128, i1 false)
26; CHECK-NEXT:    br label [[NO_ALIAS]]
27; CHECK:       no_alias:
28; CHECK-NEXT:    [[TMP3:%.*]] = phi ptr [ [[A]], [[ENTRY:%.*]] ], [ [[A]], [[ALIAS_CONT]] ], [ [[TMP2]], [[COPY]] ]
29; CHECK-NEXT:    [[STORE_BEGIN4:%.*]] = ptrtoint ptr [[C]] to i64
30; CHECK-NEXT:    [[STORE_END5:%.*]] = add nuw nsw i64 [[STORE_BEGIN4]], 128
31; CHECK-NEXT:    [[LOAD_BEGIN6:%.*]] = ptrtoint ptr [[B:%.*]] to i64
32; CHECK-NEXT:    [[TMP4:%.*]] = icmp ugt i64 [[STORE_END5]], [[LOAD_BEGIN6]]
33; CHECK-NEXT:    br i1 [[TMP4]], label [[ALIAS_CONT1:%.*]], label [[NO_ALIAS3:%.*]]
34; CHECK:       alias_cont1:
35; CHECK-NEXT:    [[LOAD_END7:%.*]] = add nuw nsw i64 [[LOAD_BEGIN6]], 128
36; CHECK-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[LOAD_END7]], [[STORE_BEGIN4]]
37; CHECK-NEXT:    br i1 [[TMP5]], label [[COPY2:%.*]], label [[NO_ALIAS3]]
38; CHECK:       copy2:
39; CHECK-NEXT:    [[TMP6:%.*]] = alloca [16 x double], align 8
40; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(128) [[TMP6]], ptr noundef nonnull align 8 dereferenceable(128) [[B]], i64 128, i1 false)
41; CHECK-NEXT:    br label [[NO_ALIAS3]]
42; CHECK:       no_alias3:
43; CHECK-NEXT:    [[TMP7:%.*]] = phi ptr [ [[B]], [[NO_ALIAS]] ], [ [[B]], [[ALIAS_CONT1]] ], [ [[TMP6]], [[COPY2]] ]
44; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP3]], align 8
45; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr i8, ptr [[TMP3]], i64 32
46; CHECK-NEXT:    [[COL_LOAD8:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8
47; CHECK-NEXT:    [[COL_LOAD9:%.*]] = load <2 x double>, ptr [[TMP7]], align 8
48; CHECK-NEXT:    [[VEC_GEP10:%.*]] = getelementptr i8, ptr [[TMP7]], i64 32
49; CHECK-NEXT:    [[COL_LOAD11:%.*]] = load <2 x double>, ptr [[VEC_GEP10]], align 8
50; CHECK-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[COL_LOAD9]], <2 x double> poison, <2 x i32> zeroinitializer
51; CHECK-NEXT:    [[TMP8:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT]]
52; CHECK-NEXT:    [[SPLAT_SPLAT14:%.*]] = shufflevector <2 x double> [[COL_LOAD9]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
53; CHECK-NEXT:    [[TMP9:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD8]], <2 x double> [[SPLAT_SPLAT14]], <2 x double> [[TMP8]])
54; CHECK-NEXT:    [[SPLAT_SPLAT17:%.*]] = shufflevector <2 x double> [[COL_LOAD11]], <2 x double> poison, <2 x i32> zeroinitializer
55; CHECK-NEXT:    [[TMP10:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT17]]
56; CHECK-NEXT:    [[SPLAT_SPLAT20:%.*]] = shufflevector <2 x double> [[COL_LOAD11]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
57; CHECK-NEXT:    [[TMP11:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD8]], <2 x double> [[SPLAT_SPLAT20]], <2 x double> [[TMP10]])
58; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[TMP3]], i64 64
59; CHECK-NEXT:    [[COL_LOAD21:%.*]] = load <2 x double>, ptr [[TMP12]], align 8
60; CHECK-NEXT:    [[VEC_GEP22:%.*]] = getelementptr i8, ptr [[TMP3]], i64 96
61; CHECK-NEXT:    [[COL_LOAD23:%.*]] = load <2 x double>, ptr [[VEC_GEP22]], align 8
62; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[TMP7]], i64 16
63; CHECK-NEXT:    [[COL_LOAD24:%.*]] = load <2 x double>, ptr [[TMP13]], align 8
64; CHECK-NEXT:    [[VEC_GEP25:%.*]] = getelementptr i8, ptr [[TMP7]], i64 48
65; CHECK-NEXT:    [[COL_LOAD26:%.*]] = load <2 x double>, ptr [[VEC_GEP25]], align 8
66; CHECK-NEXT:    [[SPLAT_SPLAT30:%.*]] = shufflevector <2 x double> [[COL_LOAD24]], <2 x double> poison, <2 x i32> zeroinitializer
67; CHECK-NEXT:    [[TMP14:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD21]], <2 x double> [[SPLAT_SPLAT30]], <2 x double> [[TMP9]])
68; CHECK-NEXT:    [[SPLAT_SPLAT33:%.*]] = shufflevector <2 x double> [[COL_LOAD24]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
69; CHECK-NEXT:    [[TMP15:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD23]], <2 x double> [[SPLAT_SPLAT33]], <2 x double> [[TMP14]])
70; CHECK-NEXT:    [[SPLAT_SPLAT37:%.*]] = shufflevector <2 x double> [[COL_LOAD26]], <2 x double> poison, <2 x i32> zeroinitializer
71; CHECK-NEXT:    [[TMP16:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD21]], <2 x double> [[SPLAT_SPLAT37]], <2 x double> [[TMP11]])
72; CHECK-NEXT:    [[SPLAT_SPLAT40:%.*]] = shufflevector <2 x double> [[COL_LOAD26]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
73; CHECK-NEXT:    [[TMP17:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD23]], <2 x double> [[SPLAT_SPLAT40]], <2 x double> [[TMP16]])
74; CHECK-NEXT:    store <2 x double> [[TMP15]], ptr [[C]], align 8
75; CHECK-NEXT:    [[VEC_GEP41:%.*]] = getelementptr i8, ptr [[C]], i64 32
76; CHECK-NEXT:    store <2 x double> [[TMP17]], ptr [[VEC_GEP41]], align 8
77; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr [[TMP3]], i64 16
78; CHECK-NEXT:    [[COL_LOAD42:%.*]] = load <2 x double>, ptr [[TMP18]], align 8
79; CHECK-NEXT:    [[VEC_GEP43:%.*]] = getelementptr i8, ptr [[TMP3]], i64 48
80; CHECK-NEXT:    [[COL_LOAD44:%.*]] = load <2 x double>, ptr [[VEC_GEP43]], align 8
81; CHECK-NEXT:    [[COL_LOAD45:%.*]] = load <2 x double>, ptr [[TMP7]], align 8
82; CHECK-NEXT:    [[VEC_GEP46:%.*]] = getelementptr i8, ptr [[TMP7]], i64 32
83; CHECK-NEXT:    [[COL_LOAD47:%.*]] = load <2 x double>, ptr [[VEC_GEP46]], align 8
84; CHECK-NEXT:    [[SPLAT_SPLAT50:%.*]] = shufflevector <2 x double> [[COL_LOAD45]], <2 x double> poison, <2 x i32> zeroinitializer
85; CHECK-NEXT:    [[TMP19:%.*]] = fmul contract <2 x double> [[COL_LOAD42]], [[SPLAT_SPLAT50]]
86; CHECK-NEXT:    [[SPLAT_SPLAT53:%.*]] = shufflevector <2 x double> [[COL_LOAD45]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
87; CHECK-NEXT:    [[TMP20:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD44]], <2 x double> [[SPLAT_SPLAT53]], <2 x double> [[TMP19]])
88; CHECK-NEXT:    [[SPLAT_SPLAT56:%.*]] = shufflevector <2 x double> [[COL_LOAD47]], <2 x double> poison, <2 x i32> zeroinitializer
89; CHECK-NEXT:    [[TMP21:%.*]] = fmul contract <2 x double> [[COL_LOAD42]], [[SPLAT_SPLAT56]]
90; CHECK-NEXT:    [[SPLAT_SPLAT59:%.*]] = shufflevector <2 x double> [[COL_LOAD47]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
91; CHECK-NEXT:    [[TMP22:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD44]], <2 x double> [[SPLAT_SPLAT59]], <2 x double> [[TMP21]])
92; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[TMP3]], i64 80
93; CHECK-NEXT:    [[COL_LOAD60:%.*]] = load <2 x double>, ptr [[TMP23]], align 8
94; CHECK-NEXT:    [[VEC_GEP61:%.*]] = getelementptr i8, ptr [[TMP3]], i64 112
95; CHECK-NEXT:    [[COL_LOAD62:%.*]] = load <2 x double>, ptr [[VEC_GEP61]], align 8
96; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr i8, ptr [[TMP7]], i64 16
97; CHECK-NEXT:    [[COL_LOAD63:%.*]] = load <2 x double>, ptr [[TMP24]], align 8
98; CHECK-NEXT:    [[VEC_GEP64:%.*]] = getelementptr i8, ptr [[TMP7]], i64 48
99; CHECK-NEXT:    [[COL_LOAD65:%.*]] = load <2 x double>, ptr [[VEC_GEP64]], align 8
100; CHECK-NEXT:    [[SPLAT_SPLAT69:%.*]] = shufflevector <2 x double> [[COL_LOAD63]], <2 x double> poison, <2 x i32> zeroinitializer
101; CHECK-NEXT:    [[TMP25:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD60]], <2 x double> [[SPLAT_SPLAT69]], <2 x double> [[TMP20]])
102; CHECK-NEXT:    [[SPLAT_SPLAT72:%.*]] = shufflevector <2 x double> [[COL_LOAD63]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
103; CHECK-NEXT:    [[TMP26:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD62]], <2 x double> [[SPLAT_SPLAT72]], <2 x double> [[TMP25]])
104; CHECK-NEXT:    [[SPLAT_SPLAT76:%.*]] = shufflevector <2 x double> [[COL_LOAD65]], <2 x double> poison, <2 x i32> zeroinitializer
105; CHECK-NEXT:    [[TMP27:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD60]], <2 x double> [[SPLAT_SPLAT76]], <2 x double> [[TMP22]])
106; CHECK-NEXT:    [[SPLAT_SPLAT79:%.*]] = shufflevector <2 x double> [[COL_LOAD65]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
107; CHECK-NEXT:    [[TMP28:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD62]], <2 x double> [[SPLAT_SPLAT79]], <2 x double> [[TMP27]])
108; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr i8, ptr [[C]], i64 16
109; CHECK-NEXT:    store <2 x double> [[TMP26]], ptr [[TMP29]], align 8
110; CHECK-NEXT:    [[VEC_GEP80:%.*]] = getelementptr i8, ptr [[C]], i64 48
111; CHECK-NEXT:    store <2 x double> [[TMP28]], ptr [[VEC_GEP80]], align 8
112; CHECK-NEXT:    [[COL_LOAD81:%.*]] = load <2 x double>, ptr [[TMP3]], align 8
113; CHECK-NEXT:    [[VEC_GEP82:%.*]] = getelementptr i8, ptr [[TMP3]], i64 32
114; CHECK-NEXT:    [[COL_LOAD83:%.*]] = load <2 x double>, ptr [[VEC_GEP82]], align 8
115; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr i8, ptr [[TMP7]], i64 64
116; CHECK-NEXT:    [[COL_LOAD84:%.*]] = load <2 x double>, ptr [[TMP30]], align 8
117; CHECK-NEXT:    [[VEC_GEP85:%.*]] = getelementptr i8, ptr [[TMP7]], i64 96
118; CHECK-NEXT:    [[COL_LOAD86:%.*]] = load <2 x double>, ptr [[VEC_GEP85]], align 8
119; CHECK-NEXT:    [[SPLAT_SPLAT89:%.*]] = shufflevector <2 x double> [[COL_LOAD84]], <2 x double> poison, <2 x i32> zeroinitializer
120; CHECK-NEXT:    [[TMP31:%.*]] = fmul contract <2 x double> [[COL_LOAD81]], [[SPLAT_SPLAT89]]
121; CHECK-NEXT:    [[SPLAT_SPLAT92:%.*]] = shufflevector <2 x double> [[COL_LOAD84]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
122; CHECK-NEXT:    [[TMP32:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD83]], <2 x double> [[SPLAT_SPLAT92]], <2 x double> [[TMP31]])
123; CHECK-NEXT:    [[SPLAT_SPLAT95:%.*]] = shufflevector <2 x double> [[COL_LOAD86]], <2 x double> poison, <2 x i32> zeroinitializer
124; CHECK-NEXT:    [[TMP33:%.*]] = fmul contract <2 x double> [[COL_LOAD81]], [[SPLAT_SPLAT95]]
125; CHECK-NEXT:    [[SPLAT_SPLAT98:%.*]] = shufflevector <2 x double> [[COL_LOAD86]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
126; CHECK-NEXT:    [[TMP34:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD83]], <2 x double> [[SPLAT_SPLAT98]], <2 x double> [[TMP33]])
127; CHECK-NEXT:    [[TMP35:%.*]] = getelementptr i8, ptr [[TMP3]], i64 64
128; CHECK-NEXT:    [[COL_LOAD99:%.*]] = load <2 x double>, ptr [[TMP35]], align 8
129; CHECK-NEXT:    [[VEC_GEP100:%.*]] = getelementptr i8, ptr [[TMP3]], i64 96
130; CHECK-NEXT:    [[COL_LOAD101:%.*]] = load <2 x double>, ptr [[VEC_GEP100]], align 8
131; CHECK-NEXT:    [[TMP36:%.*]] = getelementptr i8, ptr [[TMP7]], i64 80
132; CHECK-NEXT:    [[COL_LOAD102:%.*]] = load <2 x double>, ptr [[TMP36]], align 8
133; CHECK-NEXT:    [[VEC_GEP103:%.*]] = getelementptr i8, ptr [[TMP7]], i64 112
134; CHECK-NEXT:    [[COL_LOAD104:%.*]] = load <2 x double>, ptr [[VEC_GEP103]], align 8
135; CHECK-NEXT:    [[SPLAT_SPLAT108:%.*]] = shufflevector <2 x double> [[COL_LOAD102]], <2 x double> poison, <2 x i32> zeroinitializer
136; CHECK-NEXT:    [[TMP37:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD99]], <2 x double> [[SPLAT_SPLAT108]], <2 x double> [[TMP32]])
137; CHECK-NEXT:    [[SPLAT_SPLAT111:%.*]] = shufflevector <2 x double> [[COL_LOAD102]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
138; CHECK-NEXT:    [[TMP38:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD101]], <2 x double> [[SPLAT_SPLAT111]], <2 x double> [[TMP37]])
139; CHECK-NEXT:    [[SPLAT_SPLAT115:%.*]] = shufflevector <2 x double> [[COL_LOAD104]], <2 x double> poison, <2 x i32> zeroinitializer
140; CHECK-NEXT:    [[TMP39:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD99]], <2 x double> [[SPLAT_SPLAT115]], <2 x double> [[TMP34]])
141; CHECK-NEXT:    [[SPLAT_SPLAT118:%.*]] = shufflevector <2 x double> [[COL_LOAD104]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
142; CHECK-NEXT:    [[TMP40:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD101]], <2 x double> [[SPLAT_SPLAT118]], <2 x double> [[TMP39]])
143; CHECK-NEXT:    [[TMP41:%.*]] = getelementptr i8, ptr [[C]], i64 64
144; CHECK-NEXT:    store <2 x double> [[TMP38]], ptr [[TMP41]], align 8
145; CHECK-NEXT:    [[VEC_GEP119:%.*]] = getelementptr i8, ptr [[C]], i64 96
146; CHECK-NEXT:    store <2 x double> [[TMP40]], ptr [[VEC_GEP119]], align 8
147; CHECK-NEXT:    [[TMP42:%.*]] = getelementptr i8, ptr [[TMP3]], i64 16
148; CHECK-NEXT:    [[COL_LOAD120:%.*]] = load <2 x double>, ptr [[TMP42]], align 8
149; CHECK-NEXT:    [[VEC_GEP121:%.*]] = getelementptr i8, ptr [[TMP3]], i64 48
150; CHECK-NEXT:    [[COL_LOAD122:%.*]] = load <2 x double>, ptr [[VEC_GEP121]], align 8
151; CHECK-NEXT:    [[TMP43:%.*]] = getelementptr i8, ptr [[TMP7]], i64 64
152; CHECK-NEXT:    [[COL_LOAD123:%.*]] = load <2 x double>, ptr [[TMP43]], align 8
153; CHECK-NEXT:    [[VEC_GEP124:%.*]] = getelementptr i8, ptr [[TMP7]], i64 96
154; CHECK-NEXT:    [[COL_LOAD125:%.*]] = load <2 x double>, ptr [[VEC_GEP124]], align 8
155; CHECK-NEXT:    [[SPLAT_SPLAT128:%.*]] = shufflevector <2 x double> [[COL_LOAD123]], <2 x double> poison, <2 x i32> zeroinitializer
156; CHECK-NEXT:    [[TMP44:%.*]] = fmul contract <2 x double> [[COL_LOAD120]], [[SPLAT_SPLAT128]]
157; CHECK-NEXT:    [[SPLAT_SPLAT131:%.*]] = shufflevector <2 x double> [[COL_LOAD123]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
158; CHECK-NEXT:    [[TMP45:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD122]], <2 x double> [[SPLAT_SPLAT131]], <2 x double> [[TMP44]])
159; CHECK-NEXT:    [[SPLAT_SPLAT134:%.*]] = shufflevector <2 x double> [[COL_LOAD125]], <2 x double> poison, <2 x i32> zeroinitializer
160; CHECK-NEXT:    [[TMP46:%.*]] = fmul contract <2 x double> [[COL_LOAD120]], [[SPLAT_SPLAT134]]
161; CHECK-NEXT:    [[SPLAT_SPLAT137:%.*]] = shufflevector <2 x double> [[COL_LOAD125]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
162; CHECK-NEXT:    [[TMP47:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD122]], <2 x double> [[SPLAT_SPLAT137]], <2 x double> [[TMP46]])
163; CHECK-NEXT:    [[TMP48:%.*]] = getelementptr i8, ptr [[TMP3]], i64 80
164; CHECK-NEXT:    [[COL_LOAD138:%.*]] = load <2 x double>, ptr [[TMP48]], align 8
165; CHECK-NEXT:    [[VEC_GEP139:%.*]] = getelementptr i8, ptr [[TMP3]], i64 112
166; CHECK-NEXT:    [[COL_LOAD140:%.*]] = load <2 x double>, ptr [[VEC_GEP139]], align 8
167; CHECK-NEXT:    [[TMP49:%.*]] = getelementptr i8, ptr [[TMP7]], i64 80
168; CHECK-NEXT:    [[COL_LOAD141:%.*]] = load <2 x double>, ptr [[TMP49]], align 8
169; CHECK-NEXT:    [[VEC_GEP142:%.*]] = getelementptr i8, ptr [[TMP7]], i64 112
170; CHECK-NEXT:    [[COL_LOAD143:%.*]] = load <2 x double>, ptr [[VEC_GEP142]], align 8
171; CHECK-NEXT:    [[SPLAT_SPLAT147:%.*]] = shufflevector <2 x double> [[COL_LOAD141]], <2 x double> poison, <2 x i32> zeroinitializer
172; CHECK-NEXT:    [[TMP50:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD138]], <2 x double> [[SPLAT_SPLAT147]], <2 x double> [[TMP45]])
173; CHECK-NEXT:    [[SPLAT_SPLAT150:%.*]] = shufflevector <2 x double> [[COL_LOAD141]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
174; CHECK-NEXT:    [[TMP51:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD140]], <2 x double> [[SPLAT_SPLAT150]], <2 x double> [[TMP50]])
175; CHECK-NEXT:    [[SPLAT_SPLAT154:%.*]] = shufflevector <2 x double> [[COL_LOAD143]], <2 x double> poison, <2 x i32> zeroinitializer
176; CHECK-NEXT:    [[TMP52:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD138]], <2 x double> [[SPLAT_SPLAT154]], <2 x double> [[TMP47]])
177; CHECK-NEXT:    [[SPLAT_SPLAT157:%.*]] = shufflevector <2 x double> [[COL_LOAD143]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
178; CHECK-NEXT:    [[TMP53:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD140]], <2 x double> [[SPLAT_SPLAT157]], <2 x double> [[TMP52]])
179; CHECK-NEXT:    [[TMP54:%.*]] = getelementptr i8, ptr [[C]], i64 80
180; CHECK-NEXT:    store <2 x double> [[TMP51]], ptr [[TMP54]], align 8
181; CHECK-NEXT:    [[VEC_GEP158:%.*]] = getelementptr i8, ptr [[C]], i64 112
182; CHECK-NEXT:    store <2 x double> [[TMP53]], ptr [[VEC_GEP158]], align 8
183; CHECK-NEXT:    ret void
184;
185
186
;; Expected tiling with tile size 2, written in NumPy slice notation
;; (m[rows, cols]). Each 2x2 tile of c is the sum of two 2x2 products; the
;; groups of loads, fmul/fmuladd and stores in the CHECK lines above appear in
;; the order listed here.
187;; np.dot(a[0:2, 0:2], b[0:2, 0:2])
188
189
190;; + np.dot(a[0:2, 2:4], b[2:4, 0:2])
191
192
193;; -> c[0:2, 0:2]
194
195
196;; np.dot(a[2:4, 0:2], b[0:2, 0:2])
197
198
199;; + np.dot(a[2:4, 2:4], b[2:4, 0:2])
200
201
202;; -> c[2:4, 0:2]
203
204
205;; np.dot(a[0:2, 0:2], b[0:2, 2:4])
206
207
208;; + np.dot(a[0:2, 2:4], b[2:4, 2:4])
209
210
211;; -> c[0:2, 2:4]
212
213
214;; np.dot(a[2:4, 0:2], b[0:2, 2:4])
215
216
217;; + np.dot(a[2:4, 2:4], b[2:4, 2:4])
218
219
220;; -> c[2:4, 2:4]
221
222entry:
223  %a = load <16 x double>, ptr %A, align 8
224  %b = load <16 x double>, ptr %B, align 8
225
226  %c = call <16 x double> @llvm.matrix.multiply(<16 x double> %a, <16 x double> %b, i32 4, i32 4, i32 4)
227
228  store <16 x double> %c, ptr %C, align 8
229  ret void
230}
231
232; The same load is used for both operands of the multiply.
; @multiply_reuse_load computes C = A * A: a single <16 x double> load of %A
; feeds both operands of the 4x4 multiply. All three pointers are noalias, and
; the expected output contains no runtime overlap checks or alloca copies.
233define void @multiply_reuse_load(ptr noalias %A, ptr noalias %B, ptr noalias %C) {
234; CHECK-LABEL: @multiply_reuse_load(
235; CHECK-NEXT:  entry:
236; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x double>, ptr [[A:%.*]], align 8
237; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 32
238; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8
239; CHECK-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <2 x i32> zeroinitializer
240; CHECK-NEXT:    [[TMP0:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT]]
241; CHECK-NEXT:    [[SPLAT_SPLAT7:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
242; CHECK-NEXT:    [[TMP1:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD1]], <2 x double> [[SPLAT_SPLAT7]], <2 x double> [[TMP0]])
243; CHECK-NEXT:    [[SPLAT_SPLAT10:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <2 x i32> zeroinitializer
244; CHECK-NEXT:    [[TMP2:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT10]]
245; CHECK-NEXT:    [[SPLAT_SPLAT13:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
246; CHECK-NEXT:    [[TMP3:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD1]], <2 x double> [[SPLAT_SPLAT13]], <2 x double> [[TMP2]])
247; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 64
248; CHECK-NEXT:    [[COL_LOAD14:%.*]] = load <2 x double>, ptr [[TMP4]], align 8
249; CHECK-NEXT:    [[VEC_GEP15:%.*]] = getelementptr i8, ptr [[A]], i64 96
250; CHECK-NEXT:    [[COL_LOAD16:%.*]] = load <2 x double>, ptr [[VEC_GEP15]], align 8
251; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[A]], i64 16
252; CHECK-NEXT:    [[COL_LOAD17:%.*]] = load <2 x double>, ptr [[TMP5]], align 8
253; CHECK-NEXT:    [[VEC_GEP18:%.*]] = getelementptr i8, ptr [[A]], i64 48
254; CHECK-NEXT:    [[COL_LOAD19:%.*]] = load <2 x double>, ptr [[VEC_GEP18]], align 8
255; CHECK-NEXT:    [[SPLAT_SPLAT23:%.*]] = shufflevector <2 x double> [[COL_LOAD17]], <2 x double> poison, <2 x i32> zeroinitializer
256; CHECK-NEXT:    [[TMP6:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD14]], <2 x double> [[SPLAT_SPLAT23]], <2 x double> [[TMP1]])
257; CHECK-NEXT:    [[SPLAT_SPLAT26:%.*]] = shufflevector <2 x double> [[COL_LOAD17]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
258; CHECK-NEXT:    [[TMP7:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD16]], <2 x double> [[SPLAT_SPLAT26]], <2 x double> [[TMP6]])
259; CHECK-NEXT:    [[SPLAT_SPLAT30:%.*]] = shufflevector <2 x double> [[COL_LOAD19]], <2 x double> poison, <2 x i32> zeroinitializer
260; CHECK-NEXT:    [[TMP8:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD14]], <2 x double> [[SPLAT_SPLAT30]], <2 x double> [[TMP3]])
261; CHECK-NEXT:    [[SPLAT_SPLAT33:%.*]] = shufflevector <2 x double> [[COL_LOAD19]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
262; CHECK-NEXT:    [[TMP9:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD16]], <2 x double> [[SPLAT_SPLAT33]], <2 x double> [[TMP8]])
263; CHECK-NEXT:    store <2 x double> [[TMP7]], ptr [[C:%.*]], align 8
264; CHECK-NEXT:    [[VEC_GEP34:%.*]] = getelementptr i8, ptr [[C]], i64 32
265; CHECK-NEXT:    store <2 x double> [[TMP9]], ptr [[VEC_GEP34]], align 8
266; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 16
267; CHECK-NEXT:    [[COL_LOAD35:%.*]] = load <2 x double>, ptr [[TMP10]], align 8
268; CHECK-NEXT:    [[VEC_GEP36:%.*]] = getelementptr i8, ptr [[A]], i64 48
269; CHECK-NEXT:    [[COL_LOAD37:%.*]] = load <2 x double>, ptr [[VEC_GEP36]], align 8
270; CHECK-NEXT:    [[COL_LOAD38:%.*]] = load <2 x double>, ptr [[A]], align 8
271; CHECK-NEXT:    [[VEC_GEP39:%.*]] = getelementptr i8, ptr [[A]], i64 32
272; CHECK-NEXT:    [[COL_LOAD40:%.*]] = load <2 x double>, ptr [[VEC_GEP39]], align 8
273; CHECK-NEXT:    [[SPLAT_SPLAT43:%.*]] = shufflevector <2 x double> [[COL_LOAD38]], <2 x double> poison, <2 x i32> zeroinitializer
274; CHECK-NEXT:    [[TMP11:%.*]] = fmul contract <2 x double> [[COL_LOAD35]], [[SPLAT_SPLAT43]]
275; CHECK-NEXT:    [[SPLAT_SPLAT46:%.*]] = shufflevector <2 x double> [[COL_LOAD38]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
276; CHECK-NEXT:    [[TMP12:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD37]], <2 x double> [[SPLAT_SPLAT46]], <2 x double> [[TMP11]])
277; CHECK-NEXT:    [[SPLAT_SPLAT49:%.*]] = shufflevector <2 x double> [[COL_LOAD40]], <2 x double> poison, <2 x i32> zeroinitializer
278; CHECK-NEXT:    [[TMP13:%.*]] = fmul contract <2 x double> [[COL_LOAD35]], [[SPLAT_SPLAT49]]
279; CHECK-NEXT:    [[SPLAT_SPLAT52:%.*]] = shufflevector <2 x double> [[COL_LOAD40]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
280; CHECK-NEXT:    [[TMP14:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD37]], <2 x double> [[SPLAT_SPLAT52]], <2 x double> [[TMP13]])
281; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[A]], i64 80
282; CHECK-NEXT:    [[COL_LOAD53:%.*]] = load <2 x double>, ptr [[TMP15]], align 8
283; CHECK-NEXT:    [[VEC_GEP54:%.*]] = getelementptr i8, ptr [[A]], i64 112
284; CHECK-NEXT:    [[COL_LOAD55:%.*]] = load <2 x double>, ptr [[VEC_GEP54]], align 8
285; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[A]], i64 16
286; CHECK-NEXT:    [[COL_LOAD56:%.*]] = load <2 x double>, ptr [[TMP16]], align 8
287; CHECK-NEXT:    [[VEC_GEP57:%.*]] = getelementptr i8, ptr [[A]], i64 48
288; CHECK-NEXT:    [[COL_LOAD58:%.*]] = load <2 x double>, ptr [[VEC_GEP57]], align 8
289; CHECK-NEXT:    [[SPLAT_SPLAT62:%.*]] = shufflevector <2 x double> [[COL_LOAD56]], <2 x double> poison, <2 x i32> zeroinitializer
290; CHECK-NEXT:    [[TMP17:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD53]], <2 x double> [[SPLAT_SPLAT62]], <2 x double> [[TMP12]])
291; CHECK-NEXT:    [[SPLAT_SPLAT65:%.*]] = shufflevector <2 x double> [[COL_LOAD56]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
292; CHECK-NEXT:    [[TMP18:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD55]], <2 x double> [[SPLAT_SPLAT65]], <2 x double> [[TMP17]])
293; CHECK-NEXT:    [[SPLAT_SPLAT69:%.*]] = shufflevector <2 x double> [[COL_LOAD58]], <2 x double> poison, <2 x i32> zeroinitializer
294; CHECK-NEXT:    [[TMP19:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD53]], <2 x double> [[SPLAT_SPLAT69]], <2 x double> [[TMP14]])
295; CHECK-NEXT:    [[SPLAT_SPLAT72:%.*]] = shufflevector <2 x double> [[COL_LOAD58]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
296; CHECK-NEXT:    [[TMP20:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD55]], <2 x double> [[SPLAT_SPLAT72]], <2 x double> [[TMP19]])
297; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr i8, ptr [[C]], i64 16
298; CHECK-NEXT:    store <2 x double> [[TMP18]], ptr [[TMP21]], align 8
299; CHECK-NEXT:    [[VEC_GEP73:%.*]] = getelementptr i8, ptr [[C]], i64 48
300; CHECK-NEXT:    store <2 x double> [[TMP20]], ptr [[VEC_GEP73]], align 8
301; CHECK-NEXT:    [[COL_LOAD74:%.*]] = load <2 x double>, ptr [[A]], align 8
302; CHECK-NEXT:    [[VEC_GEP75:%.*]] = getelementptr i8, ptr [[A]], i64 32
303; CHECK-NEXT:    [[COL_LOAD76:%.*]] = load <2 x double>, ptr [[VEC_GEP75]], align 8
304; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[A]], i64 64
305; CHECK-NEXT:    [[COL_LOAD77:%.*]] = load <2 x double>, ptr [[TMP22]], align 8
306; CHECK-NEXT:    [[VEC_GEP78:%.*]] = getelementptr i8, ptr [[A]], i64 96
307; CHECK-NEXT:    [[COL_LOAD79:%.*]] = load <2 x double>, ptr [[VEC_GEP78]], align 8
308; CHECK-NEXT:    [[SPLAT_SPLAT82:%.*]] = shufflevector <2 x double> [[COL_LOAD77]], <2 x double> poison, <2 x i32> zeroinitializer
309; CHECK-NEXT:    [[TMP23:%.*]] = fmul contract <2 x double> [[COL_LOAD74]], [[SPLAT_SPLAT82]]
310; CHECK-NEXT:    [[SPLAT_SPLAT85:%.*]] = shufflevector <2 x double> [[COL_LOAD77]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
311; CHECK-NEXT:    [[TMP24:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD76]], <2 x double> [[SPLAT_SPLAT85]], <2 x double> [[TMP23]])
312; CHECK-NEXT:    [[SPLAT_SPLAT88:%.*]] = shufflevector <2 x double> [[COL_LOAD79]], <2 x double> poison, <2 x i32> zeroinitializer
313; CHECK-NEXT:    [[TMP25:%.*]] = fmul contract <2 x double> [[COL_LOAD74]], [[SPLAT_SPLAT88]]
314; CHECK-NEXT:    [[SPLAT_SPLAT91:%.*]] = shufflevector <2 x double> [[COL_LOAD79]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
315; CHECK-NEXT:    [[TMP26:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD76]], <2 x double> [[SPLAT_SPLAT91]], <2 x double> [[TMP25]])
316; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr i8, ptr [[A]], i64 64
317; CHECK-NEXT:    [[COL_LOAD92:%.*]] = load <2 x double>, ptr [[TMP27]], align 8
318; CHECK-NEXT:    [[VEC_GEP93:%.*]] = getelementptr i8, ptr [[A]], i64 96
319; CHECK-NEXT:    [[COL_LOAD94:%.*]] = load <2 x double>, ptr [[VEC_GEP93]], align 8
320; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr i8, ptr [[A]], i64 80
321; CHECK-NEXT:    [[COL_LOAD95:%.*]] = load <2 x double>, ptr [[TMP28]], align 8
322; CHECK-NEXT:    [[VEC_GEP96:%.*]] = getelementptr i8, ptr [[A]], i64 112
323; CHECK-NEXT:    [[COL_LOAD97:%.*]] = load <2 x double>, ptr [[VEC_GEP96]], align 8
324; CHECK-NEXT:    [[SPLAT_SPLAT101:%.*]] = shufflevector <2 x double> [[COL_LOAD95]], <2 x double> poison, <2 x i32> zeroinitializer
325; CHECK-NEXT:    [[TMP29:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD92]], <2 x double> [[SPLAT_SPLAT101]], <2 x double> [[TMP24]])
326; CHECK-NEXT:    [[SPLAT_SPLAT104:%.*]] = shufflevector <2 x double> [[COL_LOAD95]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
327; CHECK-NEXT:    [[TMP30:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD94]], <2 x double> [[SPLAT_SPLAT104]], <2 x double> [[TMP29]])
328; CHECK-NEXT:    [[SPLAT_SPLAT108:%.*]] = shufflevector <2 x double> [[COL_LOAD97]], <2 x double> poison, <2 x i32> zeroinitializer
329; CHECK-NEXT:    [[TMP31:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD92]], <2 x double> [[SPLAT_SPLAT108]], <2 x double> [[TMP26]])
330; CHECK-NEXT:    [[SPLAT_SPLAT111:%.*]] = shufflevector <2 x double> [[COL_LOAD97]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
331; CHECK-NEXT:    [[TMP32:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD94]], <2 x double> [[SPLAT_SPLAT111]], <2 x double> [[TMP31]])
332; CHECK-NEXT:    [[TMP33:%.*]] = getelementptr i8, ptr [[C]], i64 64
333; CHECK-NEXT:    store <2 x double> [[TMP30]], ptr [[TMP33]], align 8
334; CHECK-NEXT:    [[VEC_GEP112:%.*]] = getelementptr i8, ptr [[C]], i64 96
335; CHECK-NEXT:    store <2 x double> [[TMP32]], ptr [[VEC_GEP112]], align 8
336; CHECK-NEXT:    [[TMP34:%.*]] = getelementptr i8, ptr [[A]], i64 16
337; CHECK-NEXT:    [[COL_LOAD113:%.*]] = load <2 x double>, ptr [[TMP34]], align 8
338; CHECK-NEXT:    [[VEC_GEP114:%.*]] = getelementptr i8, ptr [[A]], i64 48
339; CHECK-NEXT:    [[COL_LOAD115:%.*]] = load <2 x double>, ptr [[VEC_GEP114]], align 8
340; CHECK-NEXT:    [[TMP35:%.*]] = getelementptr i8, ptr [[A]], i64 64
341; CHECK-NEXT:    [[COL_LOAD116:%.*]] = load <2 x double>, ptr [[TMP35]], align 8
342; CHECK-NEXT:    [[VEC_GEP117:%.*]] = getelementptr i8, ptr [[A]], i64 96
343; CHECK-NEXT:    [[COL_LOAD118:%.*]] = load <2 x double>, ptr [[VEC_GEP117]], align 8
344; CHECK-NEXT:    [[SPLAT_SPLAT121:%.*]] = shufflevector <2 x double> [[COL_LOAD116]], <2 x double> poison, <2 x i32> zeroinitializer
345; CHECK-NEXT:    [[TMP36:%.*]] = fmul contract <2 x double> [[COL_LOAD113]], [[SPLAT_SPLAT121]]
346; CHECK-NEXT:    [[SPLAT_SPLAT124:%.*]] = shufflevector <2 x double> [[COL_LOAD116]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
347; CHECK-NEXT:    [[TMP37:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD115]], <2 x double> [[SPLAT_SPLAT124]], <2 x double> [[TMP36]])
348; CHECK-NEXT:    [[SPLAT_SPLAT127:%.*]] = shufflevector <2 x double> [[COL_LOAD118]], <2 x double> poison, <2 x i32> zeroinitializer
349; CHECK-NEXT:    [[TMP38:%.*]] = fmul contract <2 x double> [[COL_LOAD113]], [[SPLAT_SPLAT127]]
350; CHECK-NEXT:    [[SPLAT_SPLAT130:%.*]] = shufflevector <2 x double> [[COL_LOAD118]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
351; CHECK-NEXT:    [[TMP39:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD115]], <2 x double> [[SPLAT_SPLAT130]], <2 x double> [[TMP38]])
352; CHECK-NEXT:    [[TMP40:%.*]] = getelementptr i8, ptr [[A]], i64 80
353; CHECK-NEXT:    [[COL_LOAD131:%.*]] = load <2 x double>, ptr [[TMP40]], align 8
354; CHECK-NEXT:    [[VEC_GEP132:%.*]] = getelementptr i8, ptr [[A]], i64 112
355; CHECK-NEXT:    [[COL_LOAD133:%.*]] = load <2 x double>, ptr [[VEC_GEP132]], align 8
356; CHECK-NEXT:    [[SPLAT_SPLAT140:%.*]] = shufflevector <2 x double> [[COL_LOAD131]], <2 x double> poison, <2 x i32> zeroinitializer
357; CHECK-NEXT:    [[TMP41:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD131]], <2 x double> [[SPLAT_SPLAT140]], <2 x double> [[TMP37]])
358; CHECK-NEXT:    [[SPLAT_SPLAT143:%.*]] = shufflevector <2 x double> [[COL_LOAD131]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
359; CHECK-NEXT:    [[TMP42:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD133]], <2 x double> [[SPLAT_SPLAT143]], <2 x double> [[TMP41]])
360; CHECK-NEXT:    [[SPLAT_SPLAT147:%.*]] = shufflevector <2 x double> [[COL_LOAD133]], <2 x double> poison, <2 x i32> zeroinitializer
361; CHECK-NEXT:    [[TMP43:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD131]], <2 x double> [[SPLAT_SPLAT147]], <2 x double> [[TMP39]])
362; CHECK-NEXT:    [[SPLAT_SPLAT150:%.*]] = shufflevector <2 x double> [[COL_LOAD133]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
363; CHECK-NEXT:    [[TMP44:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD133]], <2 x double> [[SPLAT_SPLAT150]], <2 x double> [[TMP43]])
364; CHECK-NEXT:    [[TMP45:%.*]] = getelementptr i8, ptr [[C]], i64 80
365; CHECK-NEXT:    store <2 x double> [[TMP42]], ptr [[TMP45]], align 8
366; CHECK-NEXT:    [[VEC_GEP151:%.*]] = getelementptr i8, ptr [[C]], i64 112
367; CHECK-NEXT:    store <2 x double> [[TMP44]], ptr [[VEC_GEP151]], align 8
368; CHECK-NEXT:    ret void
369;
;; In the CHECK lines above, the first product of the first tile
;; (a[0:2, 0:2] * a[0:2, 0:2]) and the second product of the last tile
;; (a[2:4, 2:4] * a[2:4, 2:4]) read the same addresses of %A for both operands.
;; The expected output has a single pair of loads for each of them
;; (COL_LOAD/COL_LOAD1 and COL_LOAD131/COL_LOAD133) feeding both the vector
;; operand and the splat operand; every other product has two pairs of loads.
370entry:
371  %a = load <16 x double>, ptr %A, align 8
372  %c = call <16 x double> @llvm.matrix.multiply(<16 x double> %a, <16 x double> %a, i32 4, i32 4, i32 4)
373  store <16 x double> %c, ptr %C, align 8
374  ret void
375}
376
; Matrix multiply intrinsic called by both test functions in this file: two
; <16 x double> operands followed by three i32 dimension arguments (4, 4, 4 at
; every call site here), returning a <16 x double> result.
377declare <16 x double> @llvm.matrix.multiply(<16 x double>, <16 x double>, i32, i32, i32)
378