; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true \
; RUN: -polly-target-throughput-vector-fma=2 \
; RUN: -polly-target-latency-vector-fma=8 \
; RUN: -polly-target-1st-cache-level-associativity=8 \
; RUN: -polly-target-2nd-cache-level-associativity=8 \
; RUN: -polly-target-1st-cache-level-size=32768 \
; RUN: -polly-target-vector-register-bitwidth=128 \
; RUN: -polly-target-2nd-cache-level-size=262144 \
; RUN: '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
;
; Test whether isolation works as expected.
;
; The kernel below is a 2000 x 2000 x 2000 matrix-multiplication loop nest.
; The expected AST unrolls a 5 x 6 register tile of Stmt_for_body6. The second
; statement index advances in steps of 6, and 2000 = 6 * 333 + 2, so the full
; tiles stop at 1997 and the two left-over iterations (1998 and 1999) must be
; emitted as a separate, partially unrolled loop nest guarded by
; "if (c0 == 1)". The expectations below pin both the full-tile part and that
; isolated remainder.
;
; CHECK:         // 1st level tiling - Tiles
; CHECK-NEXT:    for (int c0 = 0; c0 <= 1; c0 += 1)
; CHECK-NEXT:      for (int c1 = 0; c1 <= 6; c1 += 1) {
; CHECK-NEXT:        for (int c3 = 1536 * c0; c3 <= min(1999, 1536 * c0 + 1535); c3 += 1)
; CHECK-NEXT:          for (int c4 = 307 * c1; c4 <= min(1999, 307 * c1 + 306); c4 += 1)
; CHECK-NEXT:            CopyStmt_0(0, c3, c4);
; CHECK-NEXT:        for (int c2 = 0; c2 <= 24; c2 += 1) {
; CHECK-NEXT:          for (int c6 = 80 * c2; c6 <= 80 * c2 + 79; c6 += 1)
; CHECK-NEXT:            for (int c7 = 307 * c1; c7 <= min(1999, 307 * c1 + 306); c7 += 1)
; CHECK-NEXT:              CopyStmt_1(c0, c1, c2, c6, c7);
; CHECK-NEXT:          // 1st level tiling - Points
; CHECK-NEXT:          // Register tiling - Tiles
; CHECK-NEXT:          {
; CHECK-NEXT:            for (int c3 = 0; c3 <= min(255, -256 * c0 + 332); c3 += 1)
; CHECK-NEXT:              for (int c4 = 0; c4 <= 15; c4 += 1)
; CHECK-NEXT:                for (int c5 = 0; c5 <= min(306, -307 * c1 + 1999); c5 += 1) {
; CHECK-NEXT:                  // Loop Vectorizer Disabled
; CHECK-NEXT:                  // Register tiling - Points
; CHECK-NEXT:                  {
; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4, 1536 * c0 + 6 * c3, 307 * c1 + c5);
; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4, 1536 * c0 + 6 * c3 + 1, 307 * c1 + c5);
; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4, 1536 * c0 + 6 * c3 + 2, 307 * c1 + c5);
; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4, 1536 * c0 + 6 * c3 + 3, 307 * c1 + c5);
; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4, 1536 * c0 + 6 * c3 + 4, 307 * c1 + c5);
; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4, 1536 * c0 + 6 * c3 + 5, 307 * c1 + c5);
; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4 + 1, 1536 * c0 + 6 * c3, 307 * c1 + c5);
; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4 + 1, 1536 * c0 + 6 * c3 + 1, 307 * c1 + c5);
; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4 + 1, 1536 * c0 + 6 * c3 + 2, 307 * c1 + c5);
; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4 + 1, 1536 * c0 + 6 * c3 + 3, 307 * c1 + c5);
; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4 + 1, 1536 * c0 + 6 * c3 + 4, 307 * c1 + c5);
; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4 + 1, 1536 * c0 + 6 * c3 + 5, 307 * c1 + c5);
; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4 + 2, 1536 * c0 + 6 * c3, 307 * c1 + c5);
; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4 + 2, 1536 * c0 + 6 * c3 + 1, 307 * c1 + c5);
; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4 + 2, 1536 * c0 + 6 * c3 + 2, 307 * c1 + c5);
; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4 + 2, 1536 * c0 + 6 * c3 + 3, 307 * c1 + c5);
; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4 + 2, 1536 * c0 + 6 * c3 + 4, 307 * c1 + c5);
; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4 + 2, 1536 * c0 + 6 * c3 + 5, 307 * c1 + c5);
; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4 + 3, 1536 * c0 + 6 * c3, 307 * c1 + c5);
; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4 + 3, 1536 * c0 + 6 * c3 + 1, 307 * c1 + c5);
; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4 + 3, 1536 * c0 + 6 * c3 + 2, 307 * c1 + c5);
; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4 + 3, 1536 * c0 + 6 * c3 + 3, 307 * c1 + c5);
; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4 + 3, 1536 * c0 + 6 * c3 + 4, 307 * c1 + c5);
; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4 + 3, 1536 * c0 + 6 * c3 + 5, 307 * c1 + c5);
; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4 + 4, 1536 * c0 + 6 * c3, 307 * c1 + c5);
; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4 + 4, 1536 * c0 + 6 * c3 + 1, 307 * c1 + c5);
; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4 + 4, 1536 * c0 + 6 * c3 + 2, 307 * c1 + c5);
; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4 + 4, 1536 * c0 + 6 * c3 + 3, 307 * c1 + c5);
; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4 + 4, 1536 * c0 + 6 * c3 + 4, 307 * c1 + c5);
; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4 + 4, 1536 * c0 + 6 * c3 + 5, 307 * c1 + c5);
; CHECK-NEXT:                  }
; CHECK-NEXT:                }
; CHECK-NEXT:            if (c0 == 1)
; CHECK-NEXT:              for (int c4 = 0; c4 <= 15; c4 += 1)
; CHECK-NEXT:                for (int c5 = 0; c5 <= min(306, -307 * c1 + 1999); c5 += 1) {
; CHECK-NEXT:                  // Loop Vectorizer Disabled
; CHECK-NEXT:                  // Register tiling - Points
; CHECK-NEXT:                  {
; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4, 1998, 307 * c1 + c5);
; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4, 1999, 307 * c1 + c5);
; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4 + 1, 1998, 307 * c1 + c5);
; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4 + 1, 1999, 307 * c1 + c5);
; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4 + 2, 1998, 307 * c1 + c5);
; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4 + 2, 1999, 307 * c1 + c5);
; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4 + 3, 1998, 307 * c1 + c5);
; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4 + 3, 1999, 307 * c1 + c5);
; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4 + 4, 1998, 307 * c1 + c5);
; CHECK-NEXT:                    Stmt_for_body6(80 * c2 + 5 * c4 + 4, 1999, 307 * c1 + c5);
; CHECK-NEXT:                  }
; CHECK-NEXT:                }
; CHECK-NEXT:          }
; CHECK-NEXT:        }
; CHECK-NEXT:      }
;
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

; Triple loop nest computing C[i][j] += A[i][k] * B[k][j] on arrays whose rows
; hold 2000 doubles. Every loop runs from 0 to 1999. The arguments %ni, %nj,
; %nk, %alpha and %beta are not used by the body.
;
; The basic block name for.body6 is what the expected statement name
; Stmt_for_body6 refers to, so it must not be renamed.
define internal void @kernel_gemm(i32 %ni, i32 %nj, i32 %nk, double %alpha, double %beta, ptr %C, ptr %A, ptr %B) {
entry:
  br label %entry.split

entry.split:                                      ; preds = %entry
  br label %for.body

; Outer loop: %indvars.iv41 is the row index used for A and C.
for.body:                                         ; preds = %for.inc20, %entry.split
  %indvars.iv41 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next42, %for.inc20 ]
  br label %for.body3

; Middle loop: %indvars.iv38 is the column index used for B and C.
for.body3:                                        ; preds = %for.inc17, %for.body
  %indvars.iv38 = phi i64 [ 0, %for.body ], [ %indvars.iv.next39, %for.inc17 ]
  br label %for.body6

; Innermost loop: %indvars.iv is the reduction index (column of A, row of B).
; Each iteration loads one element of A, B and C, then stores C + A * B back.
for.body6:                                        ; preds = %for.body6, %for.body3
  %indvars.iv = phi i64 [ 0, %for.body3 ], [ %indvars.iv.next, %for.body6 ]
  %arrayidx8 = getelementptr inbounds [2000 x double], ptr %A, i64 %indvars.iv41, i64 %indvars.iv
  %tmp = load double, ptr %arrayidx8, align 8
  %arrayidx12 = getelementptr inbounds [2000 x double], ptr %B, i64 %indvars.iv, i64 %indvars.iv38
  %tmp1 = load double, ptr %arrayidx12, align 8
  %mul = fmul double %tmp, %tmp1
  %arrayidx16 = getelementptr inbounds [2000 x double], ptr %C, i64 %indvars.iv41, i64 %indvars.iv38
  %tmp2 = load double, ptr %arrayidx16, align 8
  %add = fadd double %tmp2, %mul
  store double %add, ptr %arrayidx16, align 8
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp ne i64 %indvars.iv.next, 2000
  br i1 %exitcond, label %for.body6, label %for.inc17

; Latch of the middle loop.
for.inc17:                                        ; preds = %for.body6
  %indvars.iv.next39 = add nuw nsw i64 %indvars.iv38, 1
  %exitcond40 = icmp ne i64 %indvars.iv.next39, 2000
  br i1 %exitcond40, label %for.body3, label %for.inc20

; Latch of the outer loop.
for.inc20:                                        ; preds = %for.inc17
  %indvars.iv.next42 = add nuw nsw i64 %indvars.iv41, 1
  %exitcond43 = icmp ne i64 %indvars.iv.next42, 2000
  br i1 %exitcond43, label %for.body, label %for.end22

for.end22:                                        ; preds = %for.inc20
  ret void
}