xref: /llvm-project/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-volatile.ll (revision eae26b6640afff715172d75fdee02e7df7530a9b)
1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2; RUN: opt -passes=lower-matrix-intrinsics -fuse-matrix-use-loops -fuse-matrix-tile-size=2 -matrix-allow-contract -force-fuse-matrix -verify-dom-info %s -S | FileCheck %s
3
4; REQUIRES: aarch64-registered-target
5
6target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
7target triple = "aarch64-apple-ios"
8
; Test case: both input loads and the result store are volatile.
; Input: volatile <4 x double> loads from %A and %B feed @llvm.matrix.multiply
; (trailing arguments i32 2, i32 2, i32 2); the product is written to %C with
; a volatile store.
; Expected output: a cols/rows/inner loop nest. inner.body loads <2 x double>
; columns from %A and %B and accumulates them with contract
; @llvm.fmuladd.v2f64 calls; rows.latch writes the two result columns to %C
; using volatile stores.
; NOTE(review): the expected <2 x double> loads carry no volatile qualifier even
; though both input loads are volatile -- confirm this is the intended lowering.
9define void @multiply_all_volatile(ptr noalias %A, ptr noalias %B, ptr noalias %C) {
10; CHECK-LABEL: @multiply_all_volatile(
11; CHECK-NEXT:  entry:
12; CHECK-NEXT:    br label [[COLS_HEADER:%.*]]
13; CHECK:       cols.header:
14; CHECK-NEXT:    [[COLS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[COLS_STEP:%.*]], [[COLS_LATCH:%.*]] ]
15; CHECK-NEXT:    br label [[COLS_BODY:%.*]]
16; CHECK:       cols.body:
17; CHECK-NEXT:    br label [[ROWS_HEADER:%.*]]
18; CHECK:       rows.header:
19; CHECK-NEXT:    [[ROWS_IV:%.*]] = phi i64 [ 0, [[COLS_BODY]] ], [ [[ROWS_STEP:%.*]], [[ROWS_LATCH:%.*]] ]
20; CHECK-NEXT:    br label [[ROWS_BODY:%.*]]
21; CHECK:       rows.body:
22; CHECK-NEXT:    br label [[INNER_HEADER:%.*]]
23; CHECK:       inner.header:
24; CHECK-NEXT:    [[INNER_IV:%.*]] = phi i64 [ 0, [[ROWS_BODY]] ], [ [[INNER_STEP:%.*]], [[INNER_LATCH:%.*]] ]
25; CHECK-NEXT:    [[RESULT_VEC_0:%.*]] = phi <2 x double> [ zeroinitializer, [[ROWS_BODY]] ], [ [[TMP11:%.*]], [[INNER_LATCH]] ]
26; CHECK-NEXT:    [[RESULT_VEC_1:%.*]] = phi <2 x double> [ zeroinitializer, [[ROWS_BODY]] ], [ [[TMP17:%.*]], [[INNER_LATCH]] ]
27; CHECK-NEXT:    br label [[INNER_BODY:%.*]]
28; CHECK:       inner.body:
29; CHECK-NEXT:    [[TMP0:%.*]] = mul i64 [[INNER_IV]], 2
30; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[TMP0]], [[ROWS_IV]]
31; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr double, ptr [[A:%.*]], i64 [[TMP1]]
32; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP2]], align 8
33; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[TMP2]], i64 2
34; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8
35; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[COLS_IV]], 2
36; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[TMP3]], [[INNER_IV]]
37; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr double, ptr [[B:%.*]], i64 [[TMP4]]
38; CHECK-NEXT:    [[COL_LOAD2:%.*]] = load <2 x double>, ptr [[TMP5]], align 8
39; CHECK-NEXT:    [[VEC_GEP3:%.*]] = getelementptr double, ptr [[TMP5]], i64 2
40; CHECK-NEXT:    [[COL_LOAD4:%.*]] = load <2 x double>, ptr [[VEC_GEP3]], align 8
41; CHECK-NEXT:    [[BLOCK:%.*]] = shufflevector <2 x double> [[RESULT_VEC_0]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
42; CHECK-NEXT:    [[BLOCK5:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
43; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 0
44; CHECK-NEXT:    [[SPLAT_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[TMP6]], i64 0
45; CHECK-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer
46; CHECK-NEXT:    [[TMP7:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK5]], <2 x double> [[SPLAT_SPLAT]], <2 x double> [[BLOCK]])
47; CHECK-NEXT:    [[BLOCK6:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
48; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 1
49; CHECK-NEXT:    [[SPLAT_SPLATINSERT7:%.*]] = insertelement <2 x double> poison, double [[TMP8]], i64 0
50; CHECK-NEXT:    [[SPLAT_SPLAT8:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT7]], <2 x double> poison, <2 x i32> zeroinitializer
51; CHECK-NEXT:    [[TMP9:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK6]], <2 x double> [[SPLAT_SPLAT8]], <2 x double> [[TMP7]])
52; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x double> [[TMP9]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
53; CHECK-NEXT:    [[TMP11]] = shufflevector <2 x double> [[RESULT_VEC_0]], <2 x double> [[TMP10]], <2 x i32> <i32 2, i32 3>
54; CHECK-NEXT:    [[BLOCK9:%.*]] = shufflevector <2 x double> [[RESULT_VEC_1]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
55; CHECK-NEXT:    [[BLOCK10:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
56; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 0
57; CHECK-NEXT:    [[SPLAT_SPLATINSERT11:%.*]] = insertelement <2 x double> poison, double [[TMP12]], i64 0
58; CHECK-NEXT:    [[SPLAT_SPLAT12:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT11]], <2 x double> poison, <2 x i32> zeroinitializer
59; CHECK-NEXT:    [[TMP13:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK10]], <2 x double> [[SPLAT_SPLAT12]], <2 x double> [[BLOCK9]])
60; CHECK-NEXT:    [[BLOCK13:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
61; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 1
62; CHECK-NEXT:    [[SPLAT_SPLATINSERT14:%.*]] = insertelement <2 x double> poison, double [[TMP14]], i64 0
63; CHECK-NEXT:    [[SPLAT_SPLAT15:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT14]], <2 x double> poison, <2 x i32> zeroinitializer
64; CHECK-NEXT:    [[TMP15:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK13]], <2 x double> [[SPLAT_SPLAT15]], <2 x double> [[TMP13]])
65; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <2 x double> [[TMP15]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
66; CHECK-NEXT:    [[TMP17]] = shufflevector <2 x double> [[RESULT_VEC_1]], <2 x double> [[TMP16]], <2 x i32> <i32 2, i32 3>
67; CHECK-NEXT:    br label [[INNER_LATCH]]
68; CHECK:       inner.latch:
69; CHECK-NEXT:    [[INNER_STEP]] = add i64 [[INNER_IV]], 2
70; CHECK-NEXT:    [[INNER_COND:%.*]] = icmp ne i64 [[INNER_STEP]], 2
71; CHECK-NEXT:    br i1 [[INNER_COND]], label [[INNER_HEADER]], label [[ROWS_LATCH]], !llvm.loop [[LOOP0:![0-9]+]]
72; CHECK:       rows.latch:
73; CHECK-NEXT:    [[ROWS_STEP]] = add i64 [[ROWS_IV]], 2
74; CHECK-NEXT:    [[ROWS_COND:%.*]] = icmp ne i64 [[ROWS_STEP]], 2
75; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[COLS_IV]], 2
76; CHECK-NEXT:    [[TMP19:%.*]] = add i64 [[TMP18]], [[ROWS_IV]]
77; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr double, ptr [[C:%.*]], i64 [[TMP19]]
78; CHECK-NEXT:    store volatile <2 x double> [[TMP11]], ptr [[TMP20]], align 8
79; CHECK-NEXT:    [[VEC_GEP16:%.*]] = getelementptr double, ptr [[TMP20]], i64 2
80; CHECK-NEXT:    store volatile <2 x double> [[TMP17]], ptr [[VEC_GEP16]], align 8
81; CHECK-NEXT:    br i1 [[ROWS_COND]], label [[ROWS_HEADER]], label [[COLS_LATCH]]
82; CHECK:       cols.latch:
83; CHECK-NEXT:    [[COLS_STEP]] = add i64 [[COLS_IV]], 2
84; CHECK-NEXT:    [[COLS_COND:%.*]] = icmp ne i64 [[COLS_STEP]], 2
85; CHECK-NEXT:    br i1 [[COLS_COND]], label [[COLS_HEADER]], label [[CONTINUE:%.*]]
86; CHECK:       continue:
87; CHECK-NEXT:    ret void
88;
89
90
; Input IR under test: volatile load of %A, volatile load of %B, volatile store to %C.
91entry:
92  %a = load volatile <4 x double>, ptr %A, align 8
93  %b = load volatile <4 x double>, ptr %B, align 8
94
95  %c = call <4 x double> @llvm.matrix.multiply(<4 x double> %a, <4 x double> %b, i32 2, i32 2, i32 2)
96
97  store volatile <4 x double> %c, ptr %C, align 8
98  ret void
99}
100
101
; Test case: only the first operand load (from %A) is volatile.
; Input: a volatile <4 x double> load from %A and a plain load from %B feed
; @llvm.matrix.multiply (trailing arguments i32 2, i32 2, i32 2); the product
; is written to %C with a plain store.
; Expected output: a cols/rows/inner loop nest. inner.body loads <2 x double>
; columns from %A and %B and accumulates them with contract
; @llvm.fmuladd.v2f64 calls; rows.latch writes the two result columns to %C
; using plain (non-volatile) stores.
; NOTE(review): the expected <2 x double> loads from %A carry no volatile
; qualifier even though the input load is volatile -- confirm this is intended.
102define void @multiply_load0_volatile(ptr noalias %A, ptr noalias %B, ptr noalias %C) {
103; CHECK-LABEL: @multiply_load0_volatile(
104; CHECK-NEXT:  entry:
105; CHECK-NEXT:    br label [[COLS_HEADER:%.*]]
106; CHECK:       cols.header:
107; CHECK-NEXT:    [[COLS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[COLS_STEP:%.*]], [[COLS_LATCH:%.*]] ]
108; CHECK-NEXT:    br label [[COLS_BODY:%.*]]
109; CHECK:       cols.body:
110; CHECK-NEXT:    br label [[ROWS_HEADER:%.*]]
111; CHECK:       rows.header:
112; CHECK-NEXT:    [[ROWS_IV:%.*]] = phi i64 [ 0, [[COLS_BODY]] ], [ [[ROWS_STEP:%.*]], [[ROWS_LATCH:%.*]] ]
113; CHECK-NEXT:    br label [[ROWS_BODY:%.*]]
114; CHECK:       rows.body:
115; CHECK-NEXT:    br label [[INNER_HEADER:%.*]]
116; CHECK:       inner.header:
117; CHECK-NEXT:    [[INNER_IV:%.*]] = phi i64 [ 0, [[ROWS_BODY]] ], [ [[INNER_STEP:%.*]], [[INNER_LATCH:%.*]] ]
118; CHECK-NEXT:    [[RESULT_VEC_0:%.*]] = phi <2 x double> [ zeroinitializer, [[ROWS_BODY]] ], [ [[TMP11:%.*]], [[INNER_LATCH]] ]
119; CHECK-NEXT:    [[RESULT_VEC_1:%.*]] = phi <2 x double> [ zeroinitializer, [[ROWS_BODY]] ], [ [[TMP17:%.*]], [[INNER_LATCH]] ]
120; CHECK-NEXT:    br label [[INNER_BODY:%.*]]
121; CHECK:       inner.body:
122; CHECK-NEXT:    [[TMP0:%.*]] = mul i64 [[INNER_IV]], 2
123; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[TMP0]], [[ROWS_IV]]
124; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr double, ptr [[A:%.*]], i64 [[TMP1]]
125; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP2]], align 8
126; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[TMP2]], i64 2
127; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8
128; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[COLS_IV]], 2
129; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[TMP3]], [[INNER_IV]]
130; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr double, ptr [[B:%.*]], i64 [[TMP4]]
131; CHECK-NEXT:    [[COL_LOAD2:%.*]] = load <2 x double>, ptr [[TMP5]], align 8
132; CHECK-NEXT:    [[VEC_GEP3:%.*]] = getelementptr double, ptr [[TMP5]], i64 2
133; CHECK-NEXT:    [[COL_LOAD4:%.*]] = load <2 x double>, ptr [[VEC_GEP3]], align 8
134; CHECK-NEXT:    [[BLOCK:%.*]] = shufflevector <2 x double> [[RESULT_VEC_0]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
135; CHECK-NEXT:    [[BLOCK5:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
136; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 0
137; CHECK-NEXT:    [[SPLAT_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[TMP6]], i64 0
138; CHECK-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer
139; CHECK-NEXT:    [[TMP7:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK5]], <2 x double> [[SPLAT_SPLAT]], <2 x double> [[BLOCK]])
140; CHECK-NEXT:    [[BLOCK6:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
141; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 1
142; CHECK-NEXT:    [[SPLAT_SPLATINSERT7:%.*]] = insertelement <2 x double> poison, double [[TMP8]], i64 0
143; CHECK-NEXT:    [[SPLAT_SPLAT8:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT7]], <2 x double> poison, <2 x i32> zeroinitializer
144; CHECK-NEXT:    [[TMP9:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK6]], <2 x double> [[SPLAT_SPLAT8]], <2 x double> [[TMP7]])
145; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x double> [[TMP9]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
146; CHECK-NEXT:    [[TMP11]] = shufflevector <2 x double> [[RESULT_VEC_0]], <2 x double> [[TMP10]], <2 x i32> <i32 2, i32 3>
147; CHECK-NEXT:    [[BLOCK9:%.*]] = shufflevector <2 x double> [[RESULT_VEC_1]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
148; CHECK-NEXT:    [[BLOCK10:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
149; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 0
150; CHECK-NEXT:    [[SPLAT_SPLATINSERT11:%.*]] = insertelement <2 x double> poison, double [[TMP12]], i64 0
151; CHECK-NEXT:    [[SPLAT_SPLAT12:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT11]], <2 x double> poison, <2 x i32> zeroinitializer
152; CHECK-NEXT:    [[TMP13:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK10]], <2 x double> [[SPLAT_SPLAT12]], <2 x double> [[BLOCK9]])
153; CHECK-NEXT:    [[BLOCK13:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
154; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 1
155; CHECK-NEXT:    [[SPLAT_SPLATINSERT14:%.*]] = insertelement <2 x double> poison, double [[TMP14]], i64 0
156; CHECK-NEXT:    [[SPLAT_SPLAT15:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT14]], <2 x double> poison, <2 x i32> zeroinitializer
157; CHECK-NEXT:    [[TMP15:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK13]], <2 x double> [[SPLAT_SPLAT15]], <2 x double> [[TMP13]])
158; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <2 x double> [[TMP15]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
159; CHECK-NEXT:    [[TMP17]] = shufflevector <2 x double> [[RESULT_VEC_1]], <2 x double> [[TMP16]], <2 x i32> <i32 2, i32 3>
160; CHECK-NEXT:    br label [[INNER_LATCH]]
161; CHECK:       inner.latch:
162; CHECK-NEXT:    [[INNER_STEP]] = add i64 [[INNER_IV]], 2
163; CHECK-NEXT:    [[INNER_COND:%.*]] = icmp ne i64 [[INNER_STEP]], 2
164; CHECK-NEXT:    br i1 [[INNER_COND]], label [[INNER_HEADER]], label [[ROWS_LATCH]], !llvm.loop [[LOOP2:![0-9]+]]
165; CHECK:       rows.latch:
166; CHECK-NEXT:    [[ROWS_STEP]] = add i64 [[ROWS_IV]], 2
167; CHECK-NEXT:    [[ROWS_COND:%.*]] = icmp ne i64 [[ROWS_STEP]], 2
168; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[COLS_IV]], 2
169; CHECK-NEXT:    [[TMP19:%.*]] = add i64 [[TMP18]], [[ROWS_IV]]
170; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr double, ptr [[C:%.*]], i64 [[TMP19]]
171; CHECK-NEXT:    store <2 x double> [[TMP11]], ptr [[TMP20]], align 8
172; CHECK-NEXT:    [[VEC_GEP16:%.*]] = getelementptr double, ptr [[TMP20]], i64 2
173; CHECK-NEXT:    store <2 x double> [[TMP17]], ptr [[VEC_GEP16]], align 8
174; CHECK-NEXT:    br i1 [[ROWS_COND]], label [[ROWS_HEADER]], label [[COLS_LATCH]]
175; CHECK:       cols.latch:
176; CHECK-NEXT:    [[COLS_STEP]] = add i64 [[COLS_IV]], 2
177; CHECK-NEXT:    [[COLS_COND:%.*]] = icmp ne i64 [[COLS_STEP]], 2
178; CHECK-NEXT:    br i1 [[COLS_COND]], label [[COLS_HEADER]], label [[CONTINUE:%.*]]
179; CHECK:       continue:
180; CHECK-NEXT:    ret void
181;
182
183
; Input IR under test: volatile load of %A, plain load of %B, plain store to %C.
184entry:
185  %a = load volatile <4 x double>, ptr %A, align 8
186  %b = load <4 x double>, ptr %B, align 8
187
188  %c = call <4 x double> @llvm.matrix.multiply(<4 x double> %a, <4 x double> %b, i32 2, i32 2, i32 2)
189
190  store <4 x double> %c, ptr %C, align 8
191  ret void
192}
193
; Test case: only the second operand load (from %B) is volatile.
; Input: a plain <4 x double> load from %A and a volatile load from %B feed
; @llvm.matrix.multiply (trailing arguments i32 2, i32 2, i32 2); the product
; is written to %C with a plain store.
; Expected output: a cols/rows/inner loop nest. inner.body loads <2 x double>
; columns from %A and %B and accumulates them with contract
; @llvm.fmuladd.v2f64 calls; rows.latch writes the two result columns to %C
; using plain (non-volatile) stores.
; NOTE(review): the expected <2 x double> loads from %B carry no volatile
; qualifier even though the input load is volatile -- confirm this is intended.
194define void @multiply_load1_volatile(ptr noalias %A, ptr noalias %B, ptr noalias %C) {
195; CHECK-LABEL: @multiply_load1_volatile(
196; CHECK-NEXT:  entry:
197; CHECK-NEXT:    br label [[COLS_HEADER:%.*]]
198; CHECK:       cols.header:
199; CHECK-NEXT:    [[COLS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[COLS_STEP:%.*]], [[COLS_LATCH:%.*]] ]
200; CHECK-NEXT:    br label [[COLS_BODY:%.*]]
201; CHECK:       cols.body:
202; CHECK-NEXT:    br label [[ROWS_HEADER:%.*]]
203; CHECK:       rows.header:
204; CHECK-NEXT:    [[ROWS_IV:%.*]] = phi i64 [ 0, [[COLS_BODY]] ], [ [[ROWS_STEP:%.*]], [[ROWS_LATCH:%.*]] ]
205; CHECK-NEXT:    br label [[ROWS_BODY:%.*]]
206; CHECK:       rows.body:
207; CHECK-NEXT:    br label [[INNER_HEADER:%.*]]
208; CHECK:       inner.header:
209; CHECK-NEXT:    [[INNER_IV:%.*]] = phi i64 [ 0, [[ROWS_BODY]] ], [ [[INNER_STEP:%.*]], [[INNER_LATCH:%.*]] ]
210; CHECK-NEXT:    [[RESULT_VEC_0:%.*]] = phi <2 x double> [ zeroinitializer, [[ROWS_BODY]] ], [ [[TMP11:%.*]], [[INNER_LATCH]] ]
211; CHECK-NEXT:    [[RESULT_VEC_1:%.*]] = phi <2 x double> [ zeroinitializer, [[ROWS_BODY]] ], [ [[TMP17:%.*]], [[INNER_LATCH]] ]
212; CHECK-NEXT:    br label [[INNER_BODY:%.*]]
213; CHECK:       inner.body:
214; CHECK-NEXT:    [[TMP0:%.*]] = mul i64 [[INNER_IV]], 2
215; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[TMP0]], [[ROWS_IV]]
216; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr double, ptr [[A:%.*]], i64 [[TMP1]]
217; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP2]], align 8
218; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[TMP2]], i64 2
219; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8
220; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[COLS_IV]], 2
221; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[TMP3]], [[INNER_IV]]
222; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr double, ptr [[B:%.*]], i64 [[TMP4]]
223; CHECK-NEXT:    [[COL_LOAD2:%.*]] = load <2 x double>, ptr [[TMP5]], align 8
224; CHECK-NEXT:    [[VEC_GEP3:%.*]] = getelementptr double, ptr [[TMP5]], i64 2
225; CHECK-NEXT:    [[COL_LOAD4:%.*]] = load <2 x double>, ptr [[VEC_GEP3]], align 8
226; CHECK-NEXT:    [[BLOCK:%.*]] = shufflevector <2 x double> [[RESULT_VEC_0]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
227; CHECK-NEXT:    [[BLOCK5:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
228; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 0
229; CHECK-NEXT:    [[SPLAT_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[TMP6]], i64 0
230; CHECK-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer
231; CHECK-NEXT:    [[TMP7:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK5]], <2 x double> [[SPLAT_SPLAT]], <2 x double> [[BLOCK]])
232; CHECK-NEXT:    [[BLOCK6:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
233; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 1
234; CHECK-NEXT:    [[SPLAT_SPLATINSERT7:%.*]] = insertelement <2 x double> poison, double [[TMP8]], i64 0
235; CHECK-NEXT:    [[SPLAT_SPLAT8:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT7]], <2 x double> poison, <2 x i32> zeroinitializer
236; CHECK-NEXT:    [[TMP9:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK6]], <2 x double> [[SPLAT_SPLAT8]], <2 x double> [[TMP7]])
237; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x double> [[TMP9]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
238; CHECK-NEXT:    [[TMP11]] = shufflevector <2 x double> [[RESULT_VEC_0]], <2 x double> [[TMP10]], <2 x i32> <i32 2, i32 3>
239; CHECK-NEXT:    [[BLOCK9:%.*]] = shufflevector <2 x double> [[RESULT_VEC_1]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
240; CHECK-NEXT:    [[BLOCK10:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
241; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 0
242; CHECK-NEXT:    [[SPLAT_SPLATINSERT11:%.*]] = insertelement <2 x double> poison, double [[TMP12]], i64 0
243; CHECK-NEXT:    [[SPLAT_SPLAT12:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT11]], <2 x double> poison, <2 x i32> zeroinitializer
244; CHECK-NEXT:    [[TMP13:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK10]], <2 x double> [[SPLAT_SPLAT12]], <2 x double> [[BLOCK9]])
245; CHECK-NEXT:    [[BLOCK13:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
246; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 1
247; CHECK-NEXT:    [[SPLAT_SPLATINSERT14:%.*]] = insertelement <2 x double> poison, double [[TMP14]], i64 0
248; CHECK-NEXT:    [[SPLAT_SPLAT15:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT14]], <2 x double> poison, <2 x i32> zeroinitializer
249; CHECK-NEXT:    [[TMP15:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK13]], <2 x double> [[SPLAT_SPLAT15]], <2 x double> [[TMP13]])
250; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <2 x double> [[TMP15]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
251; CHECK-NEXT:    [[TMP17]] = shufflevector <2 x double> [[RESULT_VEC_1]], <2 x double> [[TMP16]], <2 x i32> <i32 2, i32 3>
252; CHECK-NEXT:    br label [[INNER_LATCH]]
253; CHECK:       inner.latch:
254; CHECK-NEXT:    [[INNER_STEP]] = add i64 [[INNER_IV]], 2
255; CHECK-NEXT:    [[INNER_COND:%.*]] = icmp ne i64 [[INNER_STEP]], 2
256; CHECK-NEXT:    br i1 [[INNER_COND]], label [[INNER_HEADER]], label [[ROWS_LATCH]], !llvm.loop [[LOOP3:![0-9]+]]
257; CHECK:       rows.latch:
258; CHECK-NEXT:    [[ROWS_STEP]] = add i64 [[ROWS_IV]], 2
259; CHECK-NEXT:    [[ROWS_COND:%.*]] = icmp ne i64 [[ROWS_STEP]], 2
260; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[COLS_IV]], 2
261; CHECK-NEXT:    [[TMP19:%.*]] = add i64 [[TMP18]], [[ROWS_IV]]
262; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr double, ptr [[C:%.*]], i64 [[TMP19]]
263; CHECK-NEXT:    store <2 x double> [[TMP11]], ptr [[TMP20]], align 8
264; CHECK-NEXT:    [[VEC_GEP16:%.*]] = getelementptr double, ptr [[TMP20]], i64 2
265; CHECK-NEXT:    store <2 x double> [[TMP17]], ptr [[VEC_GEP16]], align 8
266; CHECK-NEXT:    br i1 [[ROWS_COND]], label [[ROWS_HEADER]], label [[COLS_LATCH]]
267; CHECK:       cols.latch:
268; CHECK-NEXT:    [[COLS_STEP]] = add i64 [[COLS_IV]], 2
269; CHECK-NEXT:    [[COLS_COND:%.*]] = icmp ne i64 [[COLS_STEP]], 2
270; CHECK-NEXT:    br i1 [[COLS_COND]], label [[COLS_HEADER]], label [[CONTINUE:%.*]]
271; CHECK:       continue:
272; CHECK-NEXT:    ret void
273;
274
275
; Input IR under test: plain load of %A, volatile load of %B, plain store to %C.
276entry:
277  %a = load <4 x double>, ptr %A, align 8
278  %b = load volatile <4 x double>, ptr %B, align 8
279
280  %c = call <4 x double> @llvm.matrix.multiply(<4 x double> %a, <4 x double> %b, i32 2, i32 2, i32 2)
281
282  store <4 x double> %c, ptr %C, align 8
283  ret void
284}
285
; Test case: only the result store is volatile.
; Input: plain <4 x double> loads from %A and %B feed @llvm.matrix.multiply
; (trailing arguments i32 2, i32 2, i32 2); the product is written to %C with
; a volatile store.
; Expected output: a cols/rows/inner loop nest. inner.body loads <2 x double>
; columns from %A and %B with plain loads and accumulates them with contract
; @llvm.fmuladd.v2f64 calls; rows.latch writes the two result columns to %C
; using volatile stores, so the volatile qualifier of the input store is kept
; on both generated stores.
286define void @multiply_store_volatile(ptr noalias %A, ptr noalias %B, ptr noalias %C) {
287; CHECK-LABEL: @multiply_store_volatile(
288; CHECK-NEXT:  entry:
289; CHECK-NEXT:    br label [[COLS_HEADER:%.*]]
290; CHECK:       cols.header:
291; CHECK-NEXT:    [[COLS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[COLS_STEP:%.*]], [[COLS_LATCH:%.*]] ]
292; CHECK-NEXT:    br label [[COLS_BODY:%.*]]
293; CHECK:       cols.body:
294; CHECK-NEXT:    br label [[ROWS_HEADER:%.*]]
295; CHECK:       rows.header:
296; CHECK-NEXT:    [[ROWS_IV:%.*]] = phi i64 [ 0, [[COLS_BODY]] ], [ [[ROWS_STEP:%.*]], [[ROWS_LATCH:%.*]] ]
297; CHECK-NEXT:    br label [[ROWS_BODY:%.*]]
298; CHECK:       rows.body:
299; CHECK-NEXT:    br label [[INNER_HEADER:%.*]]
300; CHECK:       inner.header:
301; CHECK-NEXT:    [[INNER_IV:%.*]] = phi i64 [ 0, [[ROWS_BODY]] ], [ [[INNER_STEP:%.*]], [[INNER_LATCH:%.*]] ]
302; CHECK-NEXT:    [[RESULT_VEC_0:%.*]] = phi <2 x double> [ zeroinitializer, [[ROWS_BODY]] ], [ [[TMP11:%.*]], [[INNER_LATCH]] ]
303; CHECK-NEXT:    [[RESULT_VEC_1:%.*]] = phi <2 x double> [ zeroinitializer, [[ROWS_BODY]] ], [ [[TMP17:%.*]], [[INNER_LATCH]] ]
304; CHECK-NEXT:    br label [[INNER_BODY:%.*]]
305; CHECK:       inner.body:
306; CHECK-NEXT:    [[TMP0:%.*]] = mul i64 [[INNER_IV]], 2
307; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[TMP0]], [[ROWS_IV]]
308; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr double, ptr [[A:%.*]], i64 [[TMP1]]
309; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP2]], align 8
310; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[TMP2]], i64 2
311; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8
312; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[COLS_IV]], 2
313; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[TMP3]], [[INNER_IV]]
314; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr double, ptr [[B:%.*]], i64 [[TMP4]]
315; CHECK-NEXT:    [[COL_LOAD2:%.*]] = load <2 x double>, ptr [[TMP5]], align 8
316; CHECK-NEXT:    [[VEC_GEP3:%.*]] = getelementptr double, ptr [[TMP5]], i64 2
317; CHECK-NEXT:    [[COL_LOAD4:%.*]] = load <2 x double>, ptr [[VEC_GEP3]], align 8
318; CHECK-NEXT:    [[BLOCK:%.*]] = shufflevector <2 x double> [[RESULT_VEC_0]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
319; CHECK-NEXT:    [[BLOCK5:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
320; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 0
321; CHECK-NEXT:    [[SPLAT_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[TMP6]], i64 0
322; CHECK-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer
323; CHECK-NEXT:    [[TMP7:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK5]], <2 x double> [[SPLAT_SPLAT]], <2 x double> [[BLOCK]])
324; CHECK-NEXT:    [[BLOCK6:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
325; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 1
326; CHECK-NEXT:    [[SPLAT_SPLATINSERT7:%.*]] = insertelement <2 x double> poison, double [[TMP8]], i64 0
327; CHECK-NEXT:    [[SPLAT_SPLAT8:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT7]], <2 x double> poison, <2 x i32> zeroinitializer
328; CHECK-NEXT:    [[TMP9:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK6]], <2 x double> [[SPLAT_SPLAT8]], <2 x double> [[TMP7]])
329; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x double> [[TMP9]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
330; CHECK-NEXT:    [[TMP11]] = shufflevector <2 x double> [[RESULT_VEC_0]], <2 x double> [[TMP10]], <2 x i32> <i32 2, i32 3>
331; CHECK-NEXT:    [[BLOCK9:%.*]] = shufflevector <2 x double> [[RESULT_VEC_1]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
332; CHECK-NEXT:    [[BLOCK10:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
333; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 0
334; CHECK-NEXT:    [[SPLAT_SPLATINSERT11:%.*]] = insertelement <2 x double> poison, double [[TMP12]], i64 0
335; CHECK-NEXT:    [[SPLAT_SPLAT12:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT11]], <2 x double> poison, <2 x i32> zeroinitializer
336; CHECK-NEXT:    [[TMP13:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK10]], <2 x double> [[SPLAT_SPLAT12]], <2 x double> [[BLOCK9]])
337; CHECK-NEXT:    [[BLOCK13:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
338; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 1
339; CHECK-NEXT:    [[SPLAT_SPLATINSERT14:%.*]] = insertelement <2 x double> poison, double [[TMP14]], i64 0
340; CHECK-NEXT:    [[SPLAT_SPLAT15:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT14]], <2 x double> poison, <2 x i32> zeroinitializer
341; CHECK-NEXT:    [[TMP15:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK13]], <2 x double> [[SPLAT_SPLAT15]], <2 x double> [[TMP13]])
342; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <2 x double> [[TMP15]], <2 x double> poison, <2 x i32> <i32 0, i32 1>
343; CHECK-NEXT:    [[TMP17]] = shufflevector <2 x double> [[RESULT_VEC_1]], <2 x double> [[TMP16]], <2 x i32> <i32 2, i32 3>
344; CHECK-NEXT:    br label [[INNER_LATCH]]
345; CHECK:       inner.latch:
346; CHECK-NEXT:    [[INNER_STEP]] = add i64 [[INNER_IV]], 2
347; CHECK-NEXT:    [[INNER_COND:%.*]] = icmp ne i64 [[INNER_STEP]], 2
348; CHECK-NEXT:    br i1 [[INNER_COND]], label [[INNER_HEADER]], label [[ROWS_LATCH]], !llvm.loop [[LOOP4:![0-9]+]]
349; CHECK:       rows.latch:
350; CHECK-NEXT:    [[ROWS_STEP]] = add i64 [[ROWS_IV]], 2
351; CHECK-NEXT:    [[ROWS_COND:%.*]] = icmp ne i64 [[ROWS_STEP]], 2
352; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[COLS_IV]], 2
353; CHECK-NEXT:    [[TMP19:%.*]] = add i64 [[TMP18]], [[ROWS_IV]]
354; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr double, ptr [[C:%.*]], i64 [[TMP19]]
355; CHECK-NEXT:    store volatile <2 x double> [[TMP11]], ptr [[TMP20]], align 8
356; CHECK-NEXT:    [[VEC_GEP16:%.*]] = getelementptr double, ptr [[TMP20]], i64 2
357; CHECK-NEXT:    store volatile <2 x double> [[TMP17]], ptr [[VEC_GEP16]], align 8
358; CHECK-NEXT:    br i1 [[ROWS_COND]], label [[ROWS_HEADER]], label [[COLS_LATCH]]
359; CHECK:       cols.latch:
360; CHECK-NEXT:    [[COLS_STEP]] = add i64 [[COLS_IV]], 2
361; CHECK-NEXT:    [[COLS_COND:%.*]] = icmp ne i64 [[COLS_STEP]], 2
362; CHECK-NEXT:    br i1 [[COLS_COND]], label [[COLS_HEADER]], label [[CONTINUE:%.*]]
363; CHECK:       continue:
364; CHECK-NEXT:    ret void
365;
366
; Input IR under test: plain load of %A, plain load of %B, volatile store to %C.
367entry:
368  %a = load <4 x double>, ptr %A, align 8
369  %b = load <4 x double>, ptr %B, align 8
370
371  %c = call <4 x double> @llvm.matrix.multiply(<4 x double> %a, <4 x double> %b, i32 2, i32 2, i32 2)
372
373  store volatile <4 x double> %c, ptr %C, align 8
374  ret void
375}
376
; Declaration of the matrix-multiply intrinsic called by every test in this
; file: two <4 x double> operands followed by three i32 arguments, returning
; a <4 x double>. Each call site here passes i32 2 for all three i32 arguments.
377declare <4 x double> @llvm.matrix.multiply(<4 x double>, <4 x double>, i32, i32, i32)
378