xref: /llvm-project/llvm/test/CodeGen/X86/misched-matrix.ll (revision da71203e6fc6b8e08c9979204506d385e9cb07b8)
1; RUN: llc < %s -mtriple=x86_64-- -mcpu=generic -pre-RA-sched=source -enable-misched \
2; RUN:          -misched-prera-direction=topdown -verify-machineinstrs \
3; RUN:     | FileCheck %s -check-prefix=TOPDOWN
4; RUN: llc < %s -mtriple=x86_64-- -mcpu=generic -pre-RA-sched=source -enable-misched \
5; RUN:          -misched=ilpmin -verify-machineinstrs \
6; RUN:     | FileCheck %s -check-prefix=ILPMIN
7; RUN: llc < %s -mtriple=x86_64-- -mcpu=generic -pre-RA-sched=source -enable-misched \
8; RUN:          -misched=ilpmax -verify-machineinstrs \
9; RUN:     | FileCheck %s -check-prefix=ILPMAX
10;
11; Verify that the MI scheduler minimizes register pressure for a
12; uniform set of bottom-up subtrees (unrolled matrix multiply).
13;
14; For current top-down heuristics, ensure that some folded imulls have
15; been reordered with the stores. This tests the scheduler's cheap
16; alias analysis ability (that doesn't require any AliasAnalysis pass).
17;
18; TOPDOWN-LABEL: %for.body
19; TOPDOWN: movl %{{.*}}, (
20; TOPDOWN: imull {{[0-9]*}}(
21; TOPDOWN: movl %{{.*}}, 4(
22; TOPDOWN: imull {{[0-9]*}}(
23; TOPDOWN: movl %{{.*}}, 8(
24; TOPDOWN: movl %{{.*}}, 12(
25; TOPDOWN-LABEL: %for.end
26;
27; For -misched=ilpmin, verify that each expression subtree is
28; scheduled independently, and that the imull/adds are interleaved.
29;
30; ILPMIN-LABEL: %for.body
31; ILPMIN: movl %{{.*}}, (
32; ILPMIN: imull
33; ILPMIN: imull
34; ILPMIN: addl
35; ILPMIN: imull
36; ILPMIN: imull
37; ILPMIN: addl
38; ILPMIN: addl
39; ILPMIN: movl %{{.*}}, 4(
40; ILPMIN: imull
41; ILPMIN: imull
42; ILPMIN: addl
43; ILPMIN: imull
44; ILPMIN: imull
45; ILPMIN: addl
46; ILPMIN: addl
47; ILPMIN: movl %{{.*}}, 8(
48; ILPMIN: imull
49; ILPMIN: imull
50; ILPMIN: addl
51; ILPMIN: imull
52; ILPMIN: imull
53; ILPMIN: addl
54; ILPMIN: addl
55; ILPMIN: movl %{{.*}}, 12(
56; ILPMIN-LABEL: %for.end
57;
58; For -misched=ilpmax, verify that each expression subtree is
59; scheduled independently, and that the imull/adds are clustered.
60;
61; ILPMAX-LABEL: %for.body
62; ILPMAX: movl %{{.*}}, (
63; ILPMAX: imull
64; ILPMAX: imull
65; ILPMAX: imull
66; ILPMAX: imull
67; ILPMAX: addl
68; ILPMAX: addl
69; ILPMAX: addl
70; ILPMAX: movl %{{.*}}, 4(
71; ILPMAX: imull
72; ILPMAX: imull
73; ILPMAX: imull
74; ILPMAX: imull
75; ILPMAX: addl
76; ILPMAX: addl
77; ILPMAX: addl
78; ILPMAX: movl %{{.*}}, 8(
79; ILPMAX: imull
80; ILPMAX: imull
81; ILPMAX: imull
82; ILPMAX: imull
83; ILPMAX: addl
84; ILPMAX: addl
85; ILPMAX: addl
86; ILPMAX: movl %{{.*}}, 12(
87; ILPMAX-LABEL: %for.end
88
; @mmult: one fully unrolled row of an integer matrix product per loop
; iteration. For i in 0..3 it computes
;   m3[i][j] = m1[i][0]*m2[0][j] + m1[i][1]*m2[1][j]
;            + m1[i][2]*m2[2][j] + m1[i][3]*m2[3][j]      for j in 0..3
; where each pointer is indexed as an array of [4 x i32] rows.
;
; Parameters (all marked noalias nocapture):
;   %m1 - left operand; row %indvars.iv is read each iteration.
;   %m2 - right operand; elements [0..3][0..3] are read each iteration.
;   %m3 - result; row %indvars.iv is written each iteration.
; Returns void.
;
; NOTE(review): the RUN lines of this file pass -pre-RA-sched=source and the
; CHECK lines match a specific imull/addl/movl sequence, so the instruction
; order below (all loads, then all muls, then all adds, then the stores) is
; presumably significant to the test -- re-validate every CHECK prefix before
; reordering anything. The block labels for.body / for.end are matched by the
; *-LABEL directives and must keep their names.
89define void @mmult(ptr noalias nocapture %m1, ptr noalias nocapture %m2,
90ptr noalias nocapture %m3) nounwind uwtable ssp {
91entry:
92  br label %for.body
93
94for.body:                              ; preds = %for.body, %entry
  ; %indvars.iv is the row index i: 0 on entry, then the incremented value
  ; carried around the back edge.
95  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  ; --- Loads for output column 0: m1[i][k] paired with m2[k][0] ---
  ; %tmp1 reads %m2 directly, i.e. element m2[0][0].
96  %arrayidx8 = getelementptr inbounds [4 x i32], ptr %m1, i64 %indvars.iv, i64 0
97  %tmp = load i32, ptr %arrayidx8, align 4
98  %tmp1 = load i32, ptr %m2, align 4
99  %arrayidx8.1 = getelementptr inbounds [4 x i32], ptr %m1, i64 %indvars.iv, i64 1
100  %tmp2 = load i32, ptr %arrayidx8.1, align 4
101  %arrayidx12.1 = getelementptr inbounds [4 x i32], ptr %m2, i64 1, i64 0
102  %tmp3 = load i32, ptr %arrayidx12.1, align 4
103  %arrayidx8.2 = getelementptr inbounds [4 x i32], ptr %m1, i64 %indvars.iv, i64 2
104  %tmp4 = load i32, ptr %arrayidx8.2, align 4
105  %arrayidx12.2 = getelementptr inbounds [4 x i32], ptr %m2, i64 2, i64 0
106  %tmp5 = load i32, ptr %arrayidx12.2, align 4
107  %arrayidx8.3 = getelementptr inbounds [4 x i32], ptr %m1, i64 %indvars.iv, i64 3
108  %tmp6 = load i32, ptr %arrayidx8.3, align 4
  ; Address of m2[3][0]; the load through it (%tmp7) appears only after all
  ; the other loads, just before the multiplies.
109  %arrayidx12.3 = getelementptr inbounds [4 x i32], ptr %m2, i64 3, i64 0
  ; --- Loads for output column 1: m1[i][k] (re-loaded) with m2[k][1] ---
110  %tmp8 = load i32, ptr %arrayidx8, align 4
111  %arrayidx12.137 = getelementptr inbounds [4 x i32], ptr %m2, i64 0, i64 1
112  %tmp9 = load i32, ptr %arrayidx12.137, align 4
113  %tmp10 = load i32, ptr %arrayidx8.1, align 4
114  %arrayidx12.1.1 = getelementptr inbounds [4 x i32], ptr %m2, i64 1, i64 1
115  %tmp11 = load i32, ptr %arrayidx12.1.1, align 4
116  %tmp12 = load i32, ptr %arrayidx8.2, align 4
117  %arrayidx12.2.1 = getelementptr inbounds [4 x i32], ptr %m2, i64 2, i64 1
118  %tmp13 = load i32, ptr %arrayidx12.2.1, align 4
119  %tmp14 = load i32, ptr %arrayidx8.3, align 4
120  %arrayidx12.3.1 = getelementptr inbounds [4 x i32], ptr %m2, i64 3, i64 1
121  %tmp15 = load i32, ptr %arrayidx12.3.1, align 4
  ; --- Loads for output column 2: m1[i][k] (re-loaded) with m2[k][2] ---
122  %tmp16 = load i32, ptr %arrayidx8, align 4
123  %arrayidx12.239 = getelementptr inbounds [4 x i32], ptr %m2, i64 0, i64 2
124  %tmp17 = load i32, ptr %arrayidx12.239, align 4
125  %tmp18 = load i32, ptr %arrayidx8.1, align 4
126  %arrayidx12.1.2 = getelementptr inbounds [4 x i32], ptr %m2, i64 1, i64 2
127  %tmp19 = load i32, ptr %arrayidx12.1.2, align 4
128  %tmp20 = load i32, ptr %arrayidx8.2, align 4
129  %arrayidx12.2.2 = getelementptr inbounds [4 x i32], ptr %m2, i64 2, i64 2
130  %tmp21 = load i32, ptr %arrayidx12.2.2, align 4
131  %tmp22 = load i32, ptr %arrayidx8.3, align 4
132  %arrayidx12.3.2 = getelementptr inbounds [4 x i32], ptr %m2, i64 3, i64 2
133  %tmp23 = load i32, ptr %arrayidx12.3.2, align 4
  ; --- Loads for output column 3: m1[i][k] (re-loaded) with m2[k][3] ---
134  %tmp24 = load i32, ptr %arrayidx8, align 4
135  %arrayidx12.341 = getelementptr inbounds [4 x i32], ptr %m2, i64 0, i64 3
136  %tmp25 = load i32, ptr %arrayidx12.341, align 4
137  %tmp26 = load i32, ptr %arrayidx8.1, align 4
138  %arrayidx12.1.3 = getelementptr inbounds [4 x i32], ptr %m2, i64 1, i64 3
139  %tmp27 = load i32, ptr %arrayidx12.1.3, align 4
140  %tmp28 = load i32, ptr %arrayidx8.2, align 4
141  %arrayidx12.2.3 = getelementptr inbounds [4 x i32], ptr %m2, i64 2, i64 3
142  %tmp29 = load i32, ptr %arrayidx12.2.3, align 4
143  %tmp30 = load i32, ptr %arrayidx8.3, align 4
144  %arrayidx12.3.3 = getelementptr inbounds [4 x i32], ptr %m2, i64 3, i64 3
145  %tmp31 = load i32, ptr %arrayidx12.3.3, align 4
  ; Deferred load of m2[3][0] (address computed above as %arrayidx12.3).
  ; NOTE(review): this placement looks deliberate; keep it here unless the
  ; CHECK lines are re-validated.
146  %tmp7 = load i32, ptr %arrayidx12.3, align 4
  ; --- 16 products, four per output column: m2[k][j] * m1[i][k] ---
  ; Column 0:
147  %mul = mul nsw i32 %tmp1, %tmp
148  %mul.1 = mul nsw i32 %tmp3, %tmp2
149  %mul.2 = mul nsw i32 %tmp5, %tmp4
150  %mul.3 = mul nsw i32 %tmp7, %tmp6
  ; Column 1:
151  %mul.138 = mul nsw i32 %tmp9, %tmp8
152  %mul.1.1 = mul nsw i32 %tmp11, %tmp10
153  %mul.2.1 = mul nsw i32 %tmp13, %tmp12
154  %mul.3.1 = mul nsw i32 %tmp15, %tmp14
  ; Column 2:
155  %mul.240 = mul nsw i32 %tmp17, %tmp16
156  %mul.1.2 = mul nsw i32 %tmp19, %tmp18
157  %mul.2.2 = mul nsw i32 %tmp21, %tmp20
158  %mul.3.2 = mul nsw i32 %tmp23, %tmp22
  ; Column 3:
159  %mul.342 = mul nsw i32 %tmp25, %tmp24
160  %mul.1.3 = mul nsw i32 %tmp27, %tmp26
161  %mul.2.3 = mul nsw i32 %tmp29, %tmp28
162  %mul.3.3 = mul nsw i32 %tmp31, %tmp30
  ; --- Reduce each column's four products with a linear chain of three adds;
  ; the four chains share no values with one another. ---
  ; Column 0 sum -> %add.3
163  %add.1 = add nsw i32 %mul.1, %mul
164  %add.2 = add nsw i32 %mul.2, %add.1
165  %add.3 = add nsw i32 %mul.3, %add.2
  ; Column 1 sum -> %add.3.1
166  %add.1.1 = add nsw i32 %mul.1.1, %mul.138
167  %add.2.1 = add nsw i32 %mul.2.1, %add.1.1
168  %add.3.1 = add nsw i32 %mul.3.1, %add.2.1
  ; Column 2 sum -> %add.3.2
169  %add.1.2 = add nsw i32 %mul.1.2, %mul.240
170  %add.2.2 = add nsw i32 %mul.2.2, %add.1.2
171  %add.3.2 = add nsw i32 %mul.3.2, %add.2.2
  ; Column 3 sum -> %add.3.3
172  %add.1.3 = add nsw i32 %mul.1.3, %mul.342
173  %add.2.3 = add nsw i32 %mul.2.3, %add.1.3
174  %add.3.3 = add nsw i32 %mul.3.3, %add.2.3
  ; --- Store the four sums to m3[i][0..3] ---
175  %arrayidx16 = getelementptr inbounds [4 x i32], ptr %m3, i64 %indvars.iv, i64 0
176  store i32 %add.3, ptr %arrayidx16, align 4
177  %arrayidx16.1 = getelementptr inbounds [4 x i32], ptr %m3, i64 %indvars.iv, i64 1
178  store i32 %add.3.1, ptr %arrayidx16.1, align 4
179  %arrayidx16.2 = getelementptr inbounds [4 x i32], ptr %m3, i64 %indvars.iv, i64 2
180  store i32 %add.3.2, ptr %arrayidx16.2, align 4
181  %arrayidx16.3 = getelementptr inbounds [4 x i32], ptr %m3, i64 %indvars.iv, i64 3
182  store i32 %add.3.3, ptr %arrayidx16.3, align 4
  ; --- Loop control: advance i and exit once the truncated next index equals
  ; 4, so the body runs for i = 0, 1, 2, 3. ---
183  %indvars.iv.next = add i64 %indvars.iv, 1
184  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
185  %exitcond = icmp eq i32 %lftr.wideiv, 4
186  br i1 %exitcond, label %for.end, label %for.body
187
188for.end:                                        ; preds = %for.body
189  ret void
190}
191