xref: /llvm-project/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-dominance.ll (revision 90ba33099cbb17e7c159e9ebc5a512037db99d6d)
1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2; RUN: opt -passes=lower-matrix-intrinsics,instcombine -fuse-matrix-use-loops=false -fuse-matrix-tile-size=1 -matrix-allow-contract -force-fuse-matrix -verify-dom-info %s -S | FileCheck %s
3
4; REQUIRES: aarch64-registered-target
5
6target datalayout = "e-m:o-i64:64-f80:128-n8:4:32:64-S128"
7target triple = "aarch64-apple-ios"
8
9define void @multiply_can_hoist_cast(ptr noalias %A, ptr %B, ptr %C) {
10; CHECK-LABEL: @multiply_can_hoist_cast(
11; CHECK-NEXT:  entry:
12; CHECK-NEXT:    [[STORE_BEGIN:%.*]] = ptrtoint ptr [[C:%.*]] to i64
13; CHECK-NEXT:    [[STORE_END:%.*]] = add nuw nsw i64 [[STORE_BEGIN]], 32
14; CHECK-NEXT:    [[LOAD_BEGIN:%.*]] = ptrtoint ptr [[B:%.*]] to i64
15; CHECK-NEXT:    [[TMP0:%.*]] = icmp ugt i64 [[STORE_END]], [[LOAD_BEGIN]]
16; CHECK-NEXT:    br i1 [[TMP0]], label [[ALIAS_CONT:%.*]], label [[NO_ALIAS:%.*]]
17; CHECK:       alias_cont:
18; CHECK-NEXT:    [[LOAD_END:%.*]] = add nuw nsw i64 [[LOAD_BEGIN]], 32
19; CHECK-NEXT:    [[TMP1:%.*]] = icmp ugt i64 [[LOAD_END]], [[STORE_BEGIN]]
20; CHECK-NEXT:    br i1 [[TMP1]], label [[COPY:%.*]], label [[NO_ALIAS]]
21; CHECK:       copy:
22; CHECK-NEXT:    [[TMP2:%.*]] = alloca [4 x double], align 8
23; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[TMP2]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
24; CHECK-NEXT:    br label [[NO_ALIAS]]
25; CHECK:       no_alias:
26; CHECK-NEXT:    [[TMP3:%.*]] = phi ptr [ [[B]], [[ENTRY:%.*]] ], [ [[B]], [[ALIAS_CONT]] ], [ [[TMP2]], [[COPY]] ]
27; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <1 x double>, ptr [[A:%.*]], align 8
28; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <1 x double>, ptr [[TMP3]], align 8
29; CHECK-NEXT:    [[TMP4:%.*]] = fmul contract <1 x double> [[COL_LOAD]], [[COL_LOAD1]]
30; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[A]], i64 16
31; CHECK-NEXT:    [[COL_LOAD2:%.*]] = load <1 x double>, ptr [[TMP5]], align 8
32; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[TMP3]], i64 8
33; CHECK-NEXT:    [[COL_LOAD3:%.*]] = load <1 x double>, ptr [[TMP6]], align 8
34; CHECK-NEXT:    [[TMP7:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD2]], <1 x double> [[COL_LOAD3]], <1 x double> [[TMP4]])
35; CHECK-NEXT:    store <1 x double> [[TMP7]], ptr [[C]], align 8
36; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[A]], i64 8
37; CHECK-NEXT:    [[COL_LOAD8:%.*]] = load <1 x double>, ptr [[TMP8]], align 8
38; CHECK-NEXT:    [[COL_LOAD9:%.*]] = load <1 x double>, ptr [[TMP3]], align 8
39; CHECK-NEXT:    [[TMP9:%.*]] = fmul contract <1 x double> [[COL_LOAD8]], [[COL_LOAD9]]
40; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 24
41; CHECK-NEXT:    [[COL_LOAD13:%.*]] = load <1 x double>, ptr [[TMP10]], align 8
42; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[TMP3]], i64 8
43; CHECK-NEXT:    [[COL_LOAD14:%.*]] = load <1 x double>, ptr [[TMP11]], align 8
44; CHECK-NEXT:    [[TMP12:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD13]], <1 x double> [[COL_LOAD14]], <1 x double> [[TMP9]])
45; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[C]], i64 8
46; CHECK-NEXT:    store <1 x double> [[TMP12]], ptr [[TMP13]], align 8
47; CHECK-NEXT:    [[COL_LOAD19:%.*]] = load <1 x double>, ptr [[A]], align 8
48; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[TMP3]], i64 16
49; CHECK-NEXT:    [[COL_LOAD20:%.*]] = load <1 x double>, ptr [[TMP14]], align 8
50; CHECK-NEXT:    [[TMP15:%.*]] = fmul contract <1 x double> [[COL_LOAD19]], [[COL_LOAD20]]
51; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[A]], i64 16
52; CHECK-NEXT:    [[COL_LOAD24:%.*]] = load <1 x double>, ptr [[TMP16]], align 8
53; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr i8, ptr [[TMP3]], i64 24
54; CHECK-NEXT:    [[COL_LOAD25:%.*]] = load <1 x double>, ptr [[TMP17]], align 8
55; CHECK-NEXT:    [[TMP18:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD24]], <1 x double> [[COL_LOAD25]], <1 x double> [[TMP15]])
56; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr [[C]], i64 16
57; CHECK-NEXT:    store <1 x double> [[TMP18]], ptr [[TMP19]], align 8
58; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[A]], i64 8
59; CHECK-NEXT:    [[COL_LOAD30:%.*]] = load <1 x double>, ptr [[TMP20]], align 8
60; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr i8, ptr [[TMP3]], i64 16
61; CHECK-NEXT:    [[COL_LOAD31:%.*]] = load <1 x double>, ptr [[TMP21]], align 8
62; CHECK-NEXT:    [[TMP22:%.*]] = fmul contract <1 x double> [[COL_LOAD30]], [[COL_LOAD31]]
63; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[A]], i64 24
64; CHECK-NEXT:    [[COL_LOAD35:%.*]] = load <1 x double>, ptr [[TMP23]], align 8
65; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr i8, ptr [[TMP3]], i64 24
66; CHECK-NEXT:    [[COL_LOAD36:%.*]] = load <1 x double>, ptr [[TMP24]], align 8
67; CHECK-NEXT:    [[TMP25:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD35]], <1 x double> [[COL_LOAD36]], <1 x double> [[TMP22]])
68; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr i8, ptr [[C]], i64 24
69; CHECK-NEXT:    store <1 x double> [[TMP25]], ptr [[TMP26]], align 8
70; CHECK-NEXT:    ret void
71;
; The multiply's store goes straight to %C, so the fused lowering can emit
; runtime alias checks between the result store (%C) and the %B load (the
; alias_cont/copy/no_alias blocks above; %A needs no check because it is
; noalias) and then lower the 2x2 multiply into <1 x double> tiles, as
; requested by -fuse-matrix-tile-size=1 in the RUN line.
72entry:
73  %a = load <4 x double>, ptr %A, align 8
74  %b = load <4 x double>, ptr %B, align 8
75  %c = call <4 x double> @llvm.matrix.multiply(<4 x double> %a, <4 x double> %b, i32 2, i32 2, i32 2)
76  store <4 x double> %c, ptr %C, align 8
77  ret void
78}
79
80define void @multiply_can_hoist_multiple_insts(ptr noalias %A, ptr %B, ptr %C) {
81; CHECK-LABEL: @multiply_can_hoist_multiple_insts(
82; CHECK-NEXT:  entry:
83; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 64
84; CHECK-NEXT:    [[STORE_BEGIN:%.*]] = ptrtoint ptr [[GEP]] to i64
85; CHECK-NEXT:    [[STORE_END:%.*]] = add nuw nsw i64 [[STORE_BEGIN]], 32
86; CHECK-NEXT:    [[LOAD_BEGIN:%.*]] = ptrtoint ptr [[B:%.*]] to i64
87; CHECK-NEXT:    [[TMP0:%.*]] = icmp ugt i64 [[STORE_END]], [[LOAD_BEGIN]]
88; CHECK-NEXT:    br i1 [[TMP0]], label [[ALIAS_CONT:%.*]], label [[NO_ALIAS:%.*]]
89; CHECK:       alias_cont:
90; CHECK-NEXT:    [[LOAD_END:%.*]] = add nuw nsw i64 [[LOAD_BEGIN]], 32
91; CHECK-NEXT:    [[TMP1:%.*]] = icmp ugt i64 [[LOAD_END]], [[STORE_BEGIN]]
92; CHECK-NEXT:    br i1 [[TMP1]], label [[COPY:%.*]], label [[NO_ALIAS]]
93; CHECK:       copy:
94; CHECK-NEXT:    [[TMP2:%.*]] = alloca [4 x double], align 8
95; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[TMP2]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
96; CHECK-NEXT:    br label [[NO_ALIAS]]
97; CHECK:       no_alias:
98; CHECK-NEXT:    [[TMP3:%.*]] = phi ptr [ [[B]], [[ENTRY:%.*]] ], [ [[B]], [[ALIAS_CONT]] ], [ [[TMP2]], [[COPY]] ]
99; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <1 x double>, ptr [[A:%.*]], align 8
100; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <1 x double>, ptr [[TMP3]], align 8
101; CHECK-NEXT:    [[TMP4:%.*]] = fmul contract <1 x double> [[COL_LOAD]], [[COL_LOAD1]]
102; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[A]], i64 16
103; CHECK-NEXT:    [[COL_LOAD2:%.*]] = load <1 x double>, ptr [[TMP5]], align 8
104; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[TMP3]], i64 8
105; CHECK-NEXT:    [[COL_LOAD3:%.*]] = load <1 x double>, ptr [[TMP6]], align 8
106; CHECK-NEXT:    [[TMP7:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD2]], <1 x double> [[COL_LOAD3]], <1 x double> [[TMP4]])
107; CHECK-NEXT:    store <1 x double> [[TMP7]], ptr [[GEP]], align 8
108; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[A]], i64 8
109; CHECK-NEXT:    [[COL_LOAD8:%.*]] = load <1 x double>, ptr [[TMP8]], align 8
110; CHECK-NEXT:    [[COL_LOAD9:%.*]] = load <1 x double>, ptr [[TMP3]], align 8
111; CHECK-NEXT:    [[TMP9:%.*]] = fmul contract <1 x double> [[COL_LOAD8]], [[COL_LOAD9]]
112; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 24
113; CHECK-NEXT:    [[COL_LOAD13:%.*]] = load <1 x double>, ptr [[TMP10]], align 8
114; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[TMP3]], i64 8
115; CHECK-NEXT:    [[COL_LOAD14:%.*]] = load <1 x double>, ptr [[TMP11]], align 8
116; CHECK-NEXT:    [[TMP12:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD13]], <1 x double> [[COL_LOAD14]], <1 x double> [[TMP9]])
117; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[C]], i64 72
118; CHECK-NEXT:    store <1 x double> [[TMP12]], ptr [[TMP13]], align 8
119; CHECK-NEXT:    [[COL_LOAD19:%.*]] = load <1 x double>, ptr [[A]], align 8
120; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[TMP3]], i64 16
121; CHECK-NEXT:    [[COL_LOAD20:%.*]] = load <1 x double>, ptr [[TMP14]], align 8
122; CHECK-NEXT:    [[TMP15:%.*]] = fmul contract <1 x double> [[COL_LOAD19]], [[COL_LOAD20]]
123; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[A]], i64 16
124; CHECK-NEXT:    [[COL_LOAD24:%.*]] = load <1 x double>, ptr [[TMP16]], align 8
125; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr i8, ptr [[TMP3]], i64 24
126; CHECK-NEXT:    [[COL_LOAD25:%.*]] = load <1 x double>, ptr [[TMP17]], align 8
127; CHECK-NEXT:    [[TMP18:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD24]], <1 x double> [[COL_LOAD25]], <1 x double> [[TMP15]])
128; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr [[C]], i64 80
129; CHECK-NEXT:    store <1 x double> [[TMP18]], ptr [[TMP19]], align 8
130; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[A]], i64 8
131; CHECK-NEXT:    [[COL_LOAD30:%.*]] = load <1 x double>, ptr [[TMP20]], align 8
132; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr i8, ptr [[TMP3]], i64 16
133; CHECK-NEXT:    [[COL_LOAD31:%.*]] = load <1 x double>, ptr [[TMP21]], align 8
134; CHECK-NEXT:    [[TMP22:%.*]] = fmul contract <1 x double> [[COL_LOAD30]], [[COL_LOAD31]]
135; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[A]], i64 24
136; CHECK-NEXT:    [[COL_LOAD35:%.*]] = load <1 x double>, ptr [[TMP23]], align 8
137; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr i8, ptr [[TMP3]], i64 24
138; CHECK-NEXT:    [[COL_LOAD36:%.*]] = load <1 x double>, ptr [[TMP24]], align 8
139; CHECK-NEXT:    [[TMP25:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD35]], <1 x double> [[COL_LOAD36]], <1 x double> [[TMP22]])
140; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr i8, ptr [[C]], i64 88
141; CHECK-NEXT:    store <1 x double> [[TMP25]], ptr [[TMP26]], align 8
142; CHECK-NEXT:    ret void
143;
; Unlike @multiply_can_hoist_cast, the store address here is computed by a
; GEP placed after the multiply. The lowering hoists that address
; computation into the entry block (GEP of 64 bytes = 2 * sizeof([4 x
; double]) above) so the runtime alias checks against %B can still be
; emitted ahead of the tiled multiply.
144entry:
145  %a = load <4 x double>, ptr %A, align 8
146  %b = load <4 x double>, ptr %B, align 8
147  %c = call <4 x double> @llvm.matrix.multiply(<4 x double> %a, <4 x double> %b, i32 2, i32 2, i32 2)
148  %gep = getelementptr [4 x double], ptr %C, i32 2
149  store <4 x double> %c, ptr %gep, align 8
150  ret void
151}
152
153; Make sure the correct instruction order is preserved when hoisting.
154define void @multiply_can_hoist_multiple_insts2(ptr noalias %A, ptr %B, ptr %C) {
155; CHECK-LABEL: @multiply_can_hoist_multiple_insts2(
156; CHECK-NEXT:  entry:
157; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 1344
158; CHECK-NEXT:    [[STORE_BEGIN:%.*]] = ptrtoint ptr [[GEP_1]] to i64
159; CHECK-NEXT:    [[STORE_END:%.*]] = add nuw nsw i64 [[STORE_BEGIN]], 32
160; CHECK-NEXT:    [[LOAD_BEGIN:%.*]] = ptrtoint ptr [[B:%.*]] to i64
161; CHECK-NEXT:    [[TMP0:%.*]] = icmp ugt i64 [[STORE_END]], [[LOAD_BEGIN]]
162; CHECK-NEXT:    br i1 [[TMP0]], label [[ALIAS_CONT:%.*]], label [[NO_ALIAS:%.*]]
163; CHECK:       alias_cont:
164; CHECK-NEXT:    [[LOAD_END:%.*]] = add nuw nsw i64 [[LOAD_BEGIN]], 32
165; CHECK-NEXT:    [[TMP1:%.*]] = icmp ugt i64 [[LOAD_END]], [[STORE_BEGIN]]
166; CHECK-NEXT:    br i1 [[TMP1]], label [[COPY:%.*]], label [[NO_ALIAS]]
167; CHECK:       copy:
168; CHECK-NEXT:    [[TMP2:%.*]] = alloca [4 x double], align 8
169; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[TMP2]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
170; CHECK-NEXT:    br label [[NO_ALIAS]]
171; CHECK:       no_alias:
172; CHECK-NEXT:    [[TMP3:%.*]] = phi ptr [ [[B]], [[ENTRY:%.*]] ], [ [[B]], [[ALIAS_CONT]] ], [ [[TMP2]], [[COPY]] ]
173; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <1 x double>, ptr [[A:%.*]], align 8
174; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <1 x double>, ptr [[TMP3]], align 8
175; CHECK-NEXT:    [[TMP4:%.*]] = fmul contract <1 x double> [[COL_LOAD]], [[COL_LOAD1]]
176; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[A]], i64 16
177; CHECK-NEXT:    [[COL_LOAD2:%.*]] = load <1 x double>, ptr [[TMP5]], align 8
178; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[TMP3]], i64 8
179; CHECK-NEXT:    [[COL_LOAD3:%.*]] = load <1 x double>, ptr [[TMP6]], align 8
180; CHECK-NEXT:    [[TMP7:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD2]], <1 x double> [[COL_LOAD3]], <1 x double> [[TMP4]])
181; CHECK-NEXT:    store <1 x double> [[TMP7]], ptr [[GEP_1]], align 8
182; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[A]], i64 8
183; CHECK-NEXT:    [[COL_LOAD8:%.*]] = load <1 x double>, ptr [[TMP8]], align 8
184; CHECK-NEXT:    [[COL_LOAD9:%.*]] = load <1 x double>, ptr [[TMP3]], align 8
185; CHECK-NEXT:    [[TMP9:%.*]] = fmul contract <1 x double> [[COL_LOAD8]], [[COL_LOAD9]]
186; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 24
187; CHECK-NEXT:    [[COL_LOAD13:%.*]] = load <1 x double>, ptr [[TMP10]], align 8
188; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[TMP3]], i64 8
189; CHECK-NEXT:    [[COL_LOAD14:%.*]] = load <1 x double>, ptr [[TMP11]], align 8
190; CHECK-NEXT:    [[TMP12:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD13]], <1 x double> [[COL_LOAD14]], <1 x double> [[TMP9]])
191; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[C]], i64 1352
192; CHECK-NEXT:    store <1 x double> [[TMP12]], ptr [[TMP13]], align 8
193; CHECK-NEXT:    [[COL_LOAD19:%.*]] = load <1 x double>, ptr [[A]], align 8
194; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[TMP3]], i64 16
195; CHECK-NEXT:    [[COL_LOAD20:%.*]] = load <1 x double>, ptr [[TMP14]], align 8
196; CHECK-NEXT:    [[TMP15:%.*]] = fmul contract <1 x double> [[COL_LOAD19]], [[COL_LOAD20]]
197; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[A]], i64 16
198; CHECK-NEXT:    [[COL_LOAD24:%.*]] = load <1 x double>, ptr [[TMP16]], align 8
199; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr i8, ptr [[TMP3]], i64 24
200; CHECK-NEXT:    [[COL_LOAD25:%.*]] = load <1 x double>, ptr [[TMP17]], align 8
201; CHECK-NEXT:    [[TMP18:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD24]], <1 x double> [[COL_LOAD25]], <1 x double> [[TMP15]])
202; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr [[C]], i64 1360
203; CHECK-NEXT:    store <1 x double> [[TMP18]], ptr [[TMP19]], align 8
204; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[A]], i64 8
205; CHECK-NEXT:    [[COL_LOAD30:%.*]] = load <1 x double>, ptr [[TMP20]], align 8
206; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr i8, ptr [[TMP3]], i64 16
207; CHECK-NEXT:    [[COL_LOAD31:%.*]] = load <1 x double>, ptr [[TMP21]], align 8
208; CHECK-NEXT:    [[TMP22:%.*]] = fmul contract <1 x double> [[COL_LOAD30]], [[COL_LOAD31]]
209; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[A]], i64 24
210; CHECK-NEXT:    [[COL_LOAD35:%.*]] = load <1 x double>, ptr [[TMP23]], align 8
211; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr i8, ptr [[TMP3]], i64 24
212; CHECK-NEXT:    [[COL_LOAD36:%.*]] = load <1 x double>, ptr [[TMP24]], align 8
213; CHECK-NEXT:    [[TMP25:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD35]], <1 x double> [[COL_LOAD36]], <1 x double> [[TMP22]])
214; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr i8, ptr [[C]], i64 1368
215; CHECK-NEXT:    store <1 x double> [[TMP25]], ptr [[TMP26]], align 8
216; CHECK-NEXT:    ret void
217;
; Here the store address depends on a whole chain of constant adds
; (%off.0 = 20, %off.1 = 22, %off.2 = 42). All of the chain is hoisted in
; the correct order and folded into a single entry-block GEP of
; 1344 bytes (42 * 32, i.e. 42 <4 x double> elements) before the alias
; checks and the tiled multiply.
218entry:
219  %a = load <4 x double>, ptr %A, align 8
220  %b = load <4 x double>, ptr %B, align 8
221  %c = call <4 x double> @llvm.matrix.multiply(<4 x double> %a, <4 x double> %b, i32 2, i32 2, i32 2)
222  %off.0 = add i32 10, 10
223  %off.1 = add i32 %off.0, 2
224  %off.2 = add i32 %off.0, %off.1
225  %gep.1 = getelementptr <4 x double>, ptr %C, i32 %off.2
226  store <4 x double> %c, ptr %gep.1, align 8
227  ret void
228}
229
230define void @multiply_dont_hoist_phi(ptr noalias %A, ptr %B, ptr %C) {
231; CHECK-LABEL: @multiply_dont_hoist_phi(
232; CHECK-NEXT:  entry:
233; CHECK-NEXT:    br label [[NEXT:%.*]]
234; CHECK:       next:
235; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 16
236; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8
237; CHECK-NEXT:    [[VEC_GEP3:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 16
238; CHECK-NEXT:    [[COL_LOAD4:%.*]] = load <2 x double>, ptr [[VEC_GEP3]], align 8
239; CHECK-NEXT:    [[SPLAT_SPLAT13:%.*]] = shufflevector <2 x double> [[COL_LOAD4]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
240; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x double>, ptr [[A]], align 8
241; CHECK-NEXT:    [[SPLAT_SPLAT10:%.*]] = shufflevector <2 x double> [[COL_LOAD4]], <2 x double> poison, <2 x i32> zeroinitializer
242; CHECK-NEXT:    [[TMP0:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT10]]
243; CHECK-NEXT:    [[TMP1:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD1]], <2 x double> [[SPLAT_SPLAT13]], <2 x double> [[TMP0]])
244; CHECK-NEXT:    [[COL_LOAD2:%.*]] = load <2 x double>, ptr [[B]], align 8
245; CHECK-NEXT:    [[SPLAT_SPLAT7:%.*]] = shufflevector <2 x double> [[COL_LOAD2]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
246; CHECK-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[COL_LOAD2]], <2 x double> poison, <2 x i32> zeroinitializer
247; CHECK-NEXT:    [[TMP2:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT]]
248; CHECK-NEXT:    [[TMP3:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD1]], <2 x double> [[SPLAT_SPLAT7]], <2 x double> [[TMP2]])
249; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 832
250; CHECK-NEXT:    store <2 x double> [[TMP3]], ptr [[GEP_1]], align 8
251; CHECK-NEXT:    [[VEC_GEP14:%.*]] = getelementptr i8, ptr [[C]], i64 848
252; CHECK-NEXT:    store <2 x double> [[TMP1]], ptr [[VEC_GEP14]], align 8
253; CHECK-NEXT:    ret void
254;
; The store address depends on a phi in %next, so the address computation
; cannot be hoisted into the entry block. Consequently no runtime alias
; checks or <1 x double> tiling are generated; the multiply is lowered with
; plain <2 x double> column ops in %next instead.
255entry:
256  %a = load <4 x double>, ptr %A, align 8
257  %b = load <4 x double>, ptr %B, align 8
258  %c = call <4 x double> @llvm.matrix.multiply(<4 x double> %a, <4 x double> %b, i32 2, i32 2, i32 2)
259  br label %next
260
261next:
262  %p = phi i32 [ 2, %entry ]
263  %off.0 = add i32 10, %p
264  %off.1 = add i32 %off.0, 2
265  %off.2 = add i32 %off.0, %off.1
266  %gep.1 = getelementptr <4 x double>, ptr %C, i32 %off.2
267  store <4 x double> %c, ptr %gep.1, align 8
268  ret void
269}
270
271; The address load may alias, so avoid moving it for now.
272define void @multiply_dont_hoist_cast_due_to_operand(ptr noalias %A, ptr %B, ptr %C.ptr) {
273; CHECK-LABEL: @multiply_dont_hoist_cast_due_to_operand(
274; CHECK-NEXT:  entry:
275; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x double>, ptr [[A:%.*]], align 8
276; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 16
277; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8
278; CHECK-NEXT:    [[COL_LOAD2:%.*]] = load <2 x double>, ptr [[B:%.*]], align 8
279; CHECK-NEXT:    [[VEC_GEP3:%.*]] = getelementptr i8, ptr [[B]], i64 16
280; CHECK-NEXT:    [[COL_LOAD4:%.*]] = load <2 x double>, ptr [[VEC_GEP3]], align 8
281; CHECK-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[COL_LOAD2]], <2 x double> poison, <2 x i32> zeroinitializer
282; CHECK-NEXT:    [[TMP0:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT]]
283; CHECK-NEXT:    [[SPLAT_SPLAT7:%.*]] = shufflevector <2 x double> [[COL_LOAD2]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
284; CHECK-NEXT:    [[TMP1:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD1]], <2 x double> [[SPLAT_SPLAT7]], <2 x double> [[TMP0]])
285; CHECK-NEXT:    [[SPLAT_SPLAT10:%.*]] = shufflevector <2 x double> [[COL_LOAD4]], <2 x double> poison, <2 x i32> zeroinitializer
286; CHECK-NEXT:    [[TMP2:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT10]]
287; CHECK-NEXT:    [[SPLAT_SPLAT13:%.*]] = shufflevector <2 x double> [[COL_LOAD4]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
288; CHECK-NEXT:    [[TMP3:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD1]], <2 x double> [[SPLAT_SPLAT13]], <2 x double> [[TMP2]])
289; CHECK-NEXT:    [[C:%.*]] = load ptr, ptr [[C_PTR:%.*]], align 8
290; CHECK-NEXT:    store <2 x double> [[TMP1]], ptr [[C]], align 8
291; CHECK-NEXT:    [[VEC_GEP14:%.*]] = getelementptr i8, ptr [[C]], i64 16
292; CHECK-NEXT:    store <2 x double> [[TMP3]], ptr [[VEC_GEP14]], align 8
293; CHECK-NEXT:    ret void
294;
; The destination pointer is itself loaded from %C.ptr, and that load may
; alias the matrix memory, so it is not hoisted above the multiply: the
; load of %C stays after the computation and no runtime alias checks or
; 1x1 tiling are emitted.
295entry:
296  %a = load <4 x double>, ptr %A, align 8
297  %b = load <4 x double>, ptr %B, align 8
298  %c = call <4 x double> @llvm.matrix.multiply(<4 x double> %a, <4 x double> %b, i32 2, i32 2, i32 2)
299  %C = load ptr, ptr %C.ptr
300  store <4 x double> %c, ptr %C, align 8
301  ret void
302}
303
304; The address load may alias, so avoid moving it for now.
305define void @multiply_dont_hoist_load(ptr noalias %A, ptr %B, ptr %C.ptr) {
306; CHECK-LABEL: @multiply_dont_hoist_load(
307; CHECK-NEXT:  entry:
308; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x double>, ptr [[A:%.*]], align 8
309; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 16
310; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8
311; CHECK-NEXT:    [[COL_LOAD2:%.*]] = load <2 x double>, ptr [[B:%.*]], align 8
312; CHECK-NEXT:    [[VEC_GEP3:%.*]] = getelementptr i8, ptr [[B]], i64 16
313; CHECK-NEXT:    [[COL_LOAD4:%.*]] = load <2 x double>, ptr [[VEC_GEP3]], align 8
314; CHECK-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[COL_LOAD2]], <2 x double> poison, <2 x i32> zeroinitializer
315; CHECK-NEXT:    [[TMP0:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT]]
316; CHECK-NEXT:    [[SPLAT_SPLAT7:%.*]] = shufflevector <2 x double> [[COL_LOAD2]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
317; CHECK-NEXT:    [[TMP1:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD1]], <2 x double> [[SPLAT_SPLAT7]], <2 x double> [[TMP0]])
318; CHECK-NEXT:    [[SPLAT_SPLAT10:%.*]] = shufflevector <2 x double> [[COL_LOAD4]], <2 x double> poison, <2 x i32> zeroinitializer
319; CHECK-NEXT:    [[TMP2:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT10]]
320; CHECK-NEXT:    [[SPLAT_SPLAT13:%.*]] = shufflevector <2 x double> [[COL_LOAD4]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
321; CHECK-NEXT:    [[TMP3:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD1]], <2 x double> [[SPLAT_SPLAT13]], <2 x double> [[TMP2]])
322; CHECK-NEXT:    [[C:%.*]] = load ptr, ptr [[C_PTR:%.*]], align 8
323; CHECK-NEXT:    store <2 x double> [[TMP1]], ptr [[C]], align 8
324; CHECK-NEXT:    [[VEC_GEP14:%.*]] = getelementptr i8, ptr [[C]], i64 16
325; CHECK-NEXT:    store <2 x double> [[TMP3]], ptr [[VEC_GEP14]], align 8
326; CHECK-NEXT:    ret void
327;
; Same shape as @multiply_dont_hoist_cast_due_to_operand: the destination
; pointer is loaded from %C.ptr after the multiply, and that load may
; alias the matrix memory, so it is kept in place and the multiply is
; lowered without runtime alias checks or tiling.
328entry:
329  %a = load <4 x double>, ptr %A, align 8
330  %b = load <4 x double>, ptr %B, align 8
331  %c = call <4 x double> @llvm.matrix.multiply(<4 x double> %a, <4 x double> %b, i32 2, i32 2, i32 2)
332  %C = load ptr, ptr %C.ptr
333  store <4 x double> %c, ptr %C, align 8
334  ret void
335}
336
337; The call to @get_address may clobber memory, avoid moving it for now.
338define void @multiply_dont_hoist_call(ptr noalias %A, ptr %B) {
339; CHECK-LABEL: @multiply_dont_hoist_call(
340; CHECK-NEXT:  entry:
341; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x double>, ptr [[A:%.*]], align 8
342; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 16
343; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8
344; CHECK-NEXT:    [[COL_LOAD2:%.*]] = load <2 x double>, ptr [[B:%.*]], align 8
345; CHECK-NEXT:    [[VEC_GEP3:%.*]] = getelementptr i8, ptr [[B]], i64 16
346; CHECK-NEXT:    [[COL_LOAD4:%.*]] = load <2 x double>, ptr [[VEC_GEP3]], align 8
347; CHECK-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[COL_LOAD2]], <2 x double> poison, <2 x i32> zeroinitializer
348; CHECK-NEXT:    [[TMP0:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT]]
349; CHECK-NEXT:    [[SPLAT_SPLAT7:%.*]] = shufflevector <2 x double> [[COL_LOAD2]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
350; CHECK-NEXT:    [[TMP1:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD1]], <2 x double> [[SPLAT_SPLAT7]], <2 x double> [[TMP0]])
351; CHECK-NEXT:    [[SPLAT_SPLAT10:%.*]] = shufflevector <2 x double> [[COL_LOAD4]], <2 x double> poison, <2 x i32> zeroinitializer
352; CHECK-NEXT:    [[TMP2:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT10]]
353; CHECK-NEXT:    [[SPLAT_SPLAT13:%.*]] = shufflevector <2 x double> [[COL_LOAD4]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
354; CHECK-NEXT:    [[TMP3:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD1]], <2 x double> [[SPLAT_SPLAT13]], <2 x double> [[TMP2]])
355; CHECK-NEXT:    [[C:%.*]] = call ptr @get_address()
356; CHECK-NEXT:    store <2 x double> [[TMP1]], ptr [[C]], align 8
357; CHECK-NEXT:    [[VEC_GEP14:%.*]] = getelementptr i8, ptr [[C]], i64 16
358; CHECK-NEXT:    store <2 x double> [[TMP3]], ptr [[VEC_GEP14]], align 8
359; CHECK-NEXT:    ret void
360;
; The destination pointer is produced by a call to @get_address(), which
; may clobber memory, so the call is not moved above the multiply: it stays
; right before the result stores and no runtime alias checks or tiling are
; generated.
361entry:
362  %a = load <4 x double>, ptr %A, align 8
363  %b = load <4 x double>, ptr %B, align 8
364  %c = call <4 x double> @llvm.matrix.multiply(<4 x double> %a, <4 x double> %b, i32 2, i32 2, i32 2)
365  %C = call ptr @get_address()
366  store <4 x double> %c, ptr %C, align 8
367  ret void
368}
369
370declare ptr @get_address()
371
372
373declare <4 x double> @llvm.matrix.multiply(<4 x double>, <4 x double>, i32, i32, i32)
374