xref: /llvm-project/polly/test/ScheduleOptimizer/pattern-matching-based-opts_6.ll (revision e1f056f692d869708c1898d9d65a69ac5584a0ed)
1; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true \
2; RUN: -polly-target-throughput-vector-fma=1 \
3; RUN: -polly-target-latency-vector-fma=8 \
4; RUN: -polly-target-1st-cache-level-associativity=8 \
5; RUN: -polly-target-2nd-cache-level-associativity=8 \
6; RUN: -polly-target-1st-cache-level-size=32768 \
7; RUN: -polly-target-vector-register-bitwidth=256 \
8; RUN: -polly-target-2nd-cache-level-size=262144 \
9; RUN: '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
10;
11;  opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
12;  -polly-target-throughput-vector-fma=1 \
13;  -polly-target-latency-vector-fma=8 \
14;  -passes=polly-codegen -polly-target-1st-cache-level-associativity=8 \
15;  -polly-target-2nd-cache-level-associativity=8 \
16;  -polly-target-1st-cache-level-size=32768 \
17;  -polly-target-vector-register-bitwidth=256 \
18;  -polly-target-2nd-cache-level-size=262144 -gvn -licm -slp-vectorizer \
19;  -mcpu=corei7 -stats -S < %s 2>&1 | FileCheck %s \
20; --check-prefix=AUTO-VECTORIZATION
21;
22;
23;    /* We isolate a set of partial tile prefixes, which contains only partial
24;       tile prefixes that have exactly Mr x Nr iterations of the two innermost
25;       loops produced by the optimization of the matrix multiplication. Mr and
26;       Nr are parameters of the micro-kernel (see getMicroKernelParams and
27;       getMacroKernelParams from lib/Transform/ScheduleOptimizer.cpp for
28;       details). This test checks that in case it cannot be proved that
29;       the number of loop iterations can be evenly divided by tile sizes
30;       and we tile and unroll the point loops, it helps to get rid of
31;       the conditional expressions of the unrolled innermost loops, which
32;       prevent stores and loads of the unrolled loops from being sunk
33;       and hoisted. Otherwise, it causes a run-time regression in comparison
34;       to the vectorized code with sunk and hoisted memory accesses. */
35;    /* C := A * B + C */
36;    for (i = 0; i < 1020; i++)
37;      for (j = 0; j < 1020; j++)
38;	 for (k = 0; k < 1020; ++k)
39;	   C[i][j] += A[i][k] * B[k][j];
40;
41; CHECK:    // 1st level tiling - Tiles
42; CHECK-NEXT:    for (int c1 = 0; c1 <= 3; c1 += 1) {
43; CHECK-NEXT:      for (int c3 = 0; c3 <= 1019; c3 += 1)
44; CHECK-NEXT:        for (int c4 = 256 * c1; c4 <= min(1019, 256 * c1 + 255); c4 += 1)
45; CHECK-NEXT:          CopyStmt_0(0, c3, c4);
46; CHECK-NEXT:      for (int c2 = 0; c2 <= 10; c2 += 1) {
47; CHECK-NEXT:        for (int c6 = 96 * c2; c6 <= min(1019, 96 * c2 + 95); c6 += 1)
48; CHECK-NEXT:          for (int c7 = 256 * c1; c7 <= min(1019, 256 * c1 + 255); c7 += 1)
49; CHECK-NEXT:            CopyStmt_1(0, c1, c2, c6, c7);
50; CHECK-NEXT:        // 1st level tiling - Points
51; CHECK-NEXT:        // Register tiling - Tiles
52; CHECK-NEXT:        {
53; CHECK-NEXT:          for (int c3 = 0; c3 <= 126; c3 += 1)
54; CHECK-NEXT:            for (int c4 = 0; c4 <= min(23, -24 * c2 + 254); c4 += 1)
55; CHECK-NEXT:              for (int c5 = 0; c5 <= min(255, -256 * c1 + 1019); c5 += 1) {
56; CHECK-NEXT:                // Loop Vectorizer Disabled
57; CHECK-NEXT:                // Register tiling - Points
58; CHECK-NEXT:                {
59; CHECK-NEXT:                  Stmt_for_body6(96 * c2 + 4 * c4, 8 * c3, 256 * c1 + c5);
60; CHECK-NEXT:                  Stmt_for_body6(96 * c2 + 4 * c4, 8 * c3 + 1, 256 * c1 + c5);
61; CHECK-NEXT:                  Stmt_for_body6(96 * c2 + 4 * c4, 8 * c3 + 2, 256 * c1 + c5);
62; CHECK-NEXT:                  Stmt_for_body6(96 * c2 + 4 * c4, 8 * c3 + 3, 256 * c1 + c5);
63; CHECK-NEXT:                  Stmt_for_body6(96 * c2 + 4 * c4, 8 * c3 + 4, 256 * c1 + c5);
64; CHECK-NEXT:                  Stmt_for_body6(96 * c2 + 4 * c4, 8 * c3 + 5, 256 * c1 + c5);
65; CHECK-NEXT:                  Stmt_for_body6(96 * c2 + 4 * c4, 8 * c3 + 6, 256 * c1 + c5);
66; CHECK-NEXT:                  Stmt_for_body6(96 * c2 + 4 * c4, 8 * c3 + 7, 256 * c1 + c5);
67; CHECK-NEXT:                  Stmt_for_body6(96 * c2 + 4 * c4 + 1, 8 * c3, 256 * c1 + c5);
68; CHECK-NEXT:                  Stmt_for_body6(96 * c2 + 4 * c4 + 1, 8 * c3 + 1, 256 * c1 + c5);
69; CHECK-NEXT:                  Stmt_for_body6(96 * c2 + 4 * c4 + 1, 8 * c3 + 2, 256 * c1 + c5);
70; CHECK-NEXT:                  Stmt_for_body6(96 * c2 + 4 * c4 + 1, 8 * c3 + 3, 256 * c1 + c5);
71; CHECK-NEXT:                  Stmt_for_body6(96 * c2 + 4 * c4 + 1, 8 * c3 + 4, 256 * c1 + c5);
72; CHECK-NEXT:                  Stmt_for_body6(96 * c2 + 4 * c4 + 1, 8 * c3 + 5, 256 * c1 + c5);
73; CHECK-NEXT:                  Stmt_for_body6(96 * c2 + 4 * c4 + 1, 8 * c3 + 6, 256 * c1 + c5);
74; CHECK-NEXT:                  Stmt_for_body6(96 * c2 + 4 * c4 + 1, 8 * c3 + 7, 256 * c1 + c5);
75; CHECK-NEXT:                  Stmt_for_body6(96 * c2 + 4 * c4 + 2, 8 * c3, 256 * c1 + c5);
76; CHECK-NEXT:                  Stmt_for_body6(96 * c2 + 4 * c4 + 2, 8 * c3 + 1, 256 * c1 + c5);
77; CHECK-NEXT:                  Stmt_for_body6(96 * c2 + 4 * c4 + 2, 8 * c3 + 2, 256 * c1 + c5);
78; CHECK-NEXT:                  Stmt_for_body6(96 * c2 + 4 * c4 + 2, 8 * c3 + 3, 256 * c1 + c5);
79; CHECK-NEXT:                  Stmt_for_body6(96 * c2 + 4 * c4 + 2, 8 * c3 + 4, 256 * c1 + c5);
80; CHECK-NEXT:                  Stmt_for_body6(96 * c2 + 4 * c4 + 2, 8 * c3 + 5, 256 * c1 + c5);
81; CHECK-NEXT:                  Stmt_for_body6(96 * c2 + 4 * c4 + 2, 8 * c3 + 6, 256 * c1 + c5);
82; CHECK-NEXT:                  Stmt_for_body6(96 * c2 + 4 * c4 + 2, 8 * c3 + 7, 256 * c1 + c5);
83; CHECK-NEXT:                  Stmt_for_body6(96 * c2 + 4 * c4 + 3, 8 * c3, 256 * c1 + c5);
84; CHECK-NEXT:                  Stmt_for_body6(96 * c2 + 4 * c4 + 3, 8 * c3 + 1, 256 * c1 + c5);
85; CHECK-NEXT:                  Stmt_for_body6(96 * c2 + 4 * c4 + 3, 8 * c3 + 2, 256 * c1 + c5);
86; CHECK-NEXT:                  Stmt_for_body6(96 * c2 + 4 * c4 + 3, 8 * c3 + 3, 256 * c1 + c5);
87; CHECK-NEXT:                  Stmt_for_body6(96 * c2 + 4 * c4 + 3, 8 * c3 + 4, 256 * c1 + c5);
88; CHECK-NEXT:                  Stmt_for_body6(96 * c2 + 4 * c4 + 3, 8 * c3 + 5, 256 * c1 + c5);
89; CHECK-NEXT:                  Stmt_for_body6(96 * c2 + 4 * c4 + 3, 8 * c3 + 6, 256 * c1 + c5);
90; CHECK-NEXT:                  Stmt_for_body6(96 * c2 + 4 * c4 + 3, 8 * c3 + 7, 256 * c1 + c5);
91; CHECK-NEXT:                }
92; CHECK-NEXT:              }
93; CHECK-NEXT:              for (int c4 = 0; c4 <= min(23, -24 * c2 + 254); c4 += 1)
94; CHECK-NEXT:                for (int c5 = 0; c5 <= min(255, -256 * c1 + 1019); c5 += 1) {
95; CHECK-NEXT:                  // Loop Vectorizer Disabled
96; CHECK-NEXT:                  // Register tiling - Points
97; CHECK-NEXT:                  {
98; CHECK-NEXT:                    Stmt_for_body6(96 * c2 + 4 * c4, 1016, 256 * c1 + c5);
99; CHECK-NEXT:                    Stmt_for_body6(96 * c2 + 4 * c4, 1017, 256 * c1 + c5);
100; CHECK-NEXT:                    Stmt_for_body6(96 * c2 + 4 * c4, 1018, 256 * c1 + c5);
101; CHECK-NEXT:                    Stmt_for_body6(96 * c2 + 4 * c4, 1019, 256 * c1 + c5);
102; CHECK-NEXT:                    Stmt_for_body6(96 * c2 + 4 * c4 + 1, 1016, 256 * c1 + c5);
103; CHECK-NEXT:                    Stmt_for_body6(96 * c2 + 4 * c4 + 1, 1017, 256 * c1 + c5);
104; CHECK-NEXT:                    Stmt_for_body6(96 * c2 + 4 * c4 + 1, 1018, 256 * c1 + c5);
105; CHECK-NEXT:                    Stmt_for_body6(96 * c2 + 4 * c4 + 1, 1019, 256 * c1 + c5);
106; CHECK-NEXT:                    Stmt_for_body6(96 * c2 + 4 * c4 + 2, 1016, 256 * c1 + c5);
107; CHECK-NEXT:                    Stmt_for_body6(96 * c2 + 4 * c4 + 2, 1017, 256 * c1 + c5);
108; CHECK-NEXT:                    Stmt_for_body6(96 * c2 + 4 * c4 + 2, 1018, 256 * c1 + c5);
109; CHECK-NEXT:                    Stmt_for_body6(96 * c2 + 4 * c4 + 2, 1019, 256 * c1 + c5);
110; CHECK-NEXT:                    Stmt_for_body6(96 * c2 + 4 * c4 + 3, 1016, 256 * c1 + c5);
111; CHECK-NEXT:                    Stmt_for_body6(96 * c2 + 4 * c4 + 3, 1017, 256 * c1 + c5);
112; CHECK-NEXT:                    Stmt_for_body6(96 * c2 + 4 * c4 + 3, 1018, 256 * c1 + c5);
113; CHECK-NEXT:                    Stmt_for_body6(96 * c2 + 4 * c4 + 3, 1019, 256 * c1 + c5);
114; CHECK-NEXT:                  }
115; CHECK-NEXT:                }
116; CHECK-NEXT:            }
117; CHECK-NEXT:          }
118; CHECK-NEXT:        }
119;
120; AUTO-VECTORIZATION:  fmul <4 x double>
121; AUTO-VECTORIZATION:  fadd <4 x double>
122
123; AUTO-VECTORIZATION: 36 SLP              - Number of vector instructions generated
124; AUTO-VECTORIZATION: 146 licm             - Number of instructions hoisted out of loop
125; AUTO-VECTORIZATION: 1 licm             - Number of load insts hoisted or sunk
126; AUTO-VECTORIZATION: 32 licm             - Number of memory locations promoted to registers
127;
; Module-level target description: the triple names the x86_64 architecture
; with "unknown" vendor and OS components.
128target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
129target triple = "x86_64-unknown-unknown"
130
; Test kernel: for i, j, k in [0, 1020) compute C[i][j] += A[i][k] * B[k][j].
; %A, %B and %C are addressed as rows of [1020 x double]. The parameters %ni,
; %nj, %nk, %alpha and %beta are not referenced anywhere in the body; all loop
; bounds are the constant 1020.
131define internal void @kernel_gemm(i32 %ni, i32 %nj, i32 %nk, double %alpha, double %beta, ptr %C, ptr %A, ptr %B) #0 {
132entry:
133  br label %entry.split
134
135entry.split:                                      ; preds = %entry
136  br label %for.cond1.preheader
137
; Outermost loop header (i).
138for.cond1.preheader:                              ; preds = %for.inc20, %entry.split
139  %indvars.iv41 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next42, %for.inc20 ] ; i: first index into %A and %C
140  br label %for.cond4.preheader
141
; Middle loop header (j).
142for.cond4.preheader:                              ; preds = %for.inc17, %for.cond1.preheader
143  %indvars.iv38 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next39, %for.inc17 ] ; j: second index into %B and %C
144  br label %for.body6
145
; Innermost loop (k): header, body and latch in a single block.
; NOTE(review): the CHECK lines of this test name the statement Stmt_for_body6,
; presumably derived from this label, so the label should not be renamed.
146for.body6:                                        ; preds = %for.body6, %for.cond4.preheader
147  %indvars.iv = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next, %for.body6 ] ; k: reduction index
148  %arrayidx8 = getelementptr inbounds [1020 x double], ptr %A, i64 %indvars.iv41, i64 %indvars.iv ; &A[i][k]
149  %tmp = load double, ptr %arrayidx8, align 8
150  %arrayidx12 = getelementptr inbounds [1020 x double], ptr %B, i64 %indvars.iv, i64 %indvars.iv38 ; &B[k][j]
151  %tmp1 = load double, ptr %arrayidx12, align 8
152  %mul = fmul double %tmp, %tmp1 ; A[i][k] * B[k][j]
153  %arrayidx16 = getelementptr inbounds [1020 x double], ptr %C, i64 %indvars.iv41, i64 %indvars.iv38 ; &C[i][j]
154  %tmp2 = load double, ptr %arrayidx16, align 8
155  %add = fadd double %tmp2, %mul ; C[i][j] + A[i][k] * B[k][j]
156  store double %add, ptr %arrayidx16, align 8 ; written back to C[i][j] on every k iteration
157  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
158  %exitcond = icmp ne i64 %indvars.iv.next, 1020 ; true while k + 1 != 1020, i.e. keep looping
159  br i1 %exitcond, label %for.body6, label %for.inc17
160
; Latch of the j loop: increment j and repeat until it reaches 1020.
161for.inc17:                                        ; preds = %for.body6
162  %indvars.iv.next39 = add nuw nsw i64 %indvars.iv38, 1
163  %exitcond40 = icmp ne i64 %indvars.iv.next39, 1020
164  br i1 %exitcond40, label %for.cond4.preheader, label %for.inc20
165
; Latch of the i loop: increment i and repeat until it reaches 1020.
166for.inc20:                                        ; preds = %for.inc17
167  %indvars.iv.next42 = add nuw nsw i64 %indvars.iv41, 1
168  %exitcond43 = icmp ne i64 %indvars.iv.next42, 1020
169  br i1 %exitcond43, label %for.cond1.preheader, label %for.end22
170
171for.end22:                                        ; preds = %for.inc20
172  ret void
173}
174
; Attribute group #0, referenced by @kernel_gemm: target CPU "x86-64" with AVX
; and SSE up to 4.2 enabled; the feature list contains neither AVX2 nor FMA.
175attributes #0 = { nounwind uwtable "target-cpu"="x86-64" "target-features"="+aes,+avx,+cmov,+cx16,+fxsr,+mmx,+pclmul,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" }
176