1; 2; Here we have 5-way unswitchable switch with each successor also having an unswitchable 3; exiting branch in it. If we start unswitching those branches we start duplicating the 4; whole switch. This can easily lead to exponential behavior w/o proper control. 5; On a real-life testcase there was 16-way switch and that took forever to compile w/o 6; a cost control. 7; 8; 9; When we use the stricted multiplier candidates formula (unscaled candidates == 0) 10; we should be getting just a single loop. 11; 12; RUN: opt < %s -enable-unswitch-cost-multiplier=true \ 13; RUN: -unswitch-num-initial-unscaled-candidates=0 -unswitch-siblings-toplevel-div=1 \ 14; RUN: -passes='loop(simple-loop-unswitch<nontrivial>),print<loops>' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP1 15; 16; RUN: opt < %s -enable-unswitch-cost-multiplier=true \ 17; RUN: -unswitch-num-initial-unscaled-candidates=0 -unswitch-siblings-toplevel-div=16 \ 18; RUN: -passes='loop(simple-loop-unswitch<nontrivial>),print<loops>' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP1 19; 20; RUN: opt < %s -enable-unswitch-cost-multiplier=true \ 21; RUN: -unswitch-num-initial-unscaled-candidates=0 -unswitch-siblings-toplevel-div=1 \ 22; RUN: -passes='loop-mssa(simple-loop-unswitch<nontrivial>),print<loops>' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP1 23; 24; RUN: opt < %s -enable-unswitch-cost-multiplier=true \ 25; RUN: -unswitch-num-initial-unscaled-candidates=0 -unswitch-siblings-toplevel-div=16 \ 26; RUN: -passes='loop-mssa(simple-loop-unswitch<nontrivial>),print<loops>' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP1 27; 28; With relaxed candidates multiplier (unscaled candidates == 8) we should allow 29; some unswitches to happen until siblings multiplier starts kicking in: 30; 31; The tests below also run licm, because it is needed to hoist out 32; loop-invariant freeze instructions, which otherwise may block further 33; unswitching. 34; 35; RUN: opt < %s -enable-unswitch-cost-multiplier=true \ 36; RUN: -unswitch-num-initial-unscaled-candidates=8 -unswitch-siblings-toplevel-div=1 \ 37; RUN: -passes='loop-mssa(licm,simple-loop-unswitch<nontrivial>),print<loops>' -disable-output 2>&1 | \ 38; RUN: sort -b -k 1 | FileCheck %s --check-prefixes=LOOP-RELAX 39; 40; With relaxed candidates multiplier (unscaled candidates == 8) and with relaxed 41; siblings multiplier for top-level loops (toplevel-div == 8) we should get 42; considerably more copies of the loop (especially top-level ones). 43; 44; RUN: opt < %s -enable-unswitch-cost-multiplier=true \ 45; RUN: -unswitch-num-initial-unscaled-candidates=8 -unswitch-siblings-toplevel-div=8 \ 46; RUN: -passes='loop-mssa(licm,simple-loop-unswitch<nontrivial>),print<loops>' -disable-output 2>&1 | \ 47; RUN: sort -b -k 1 | FileCheck %s --check-prefixes=LOOP-RELAX2 48; 49; We get hundreds of copies of the loop when cost multiplier is disabled: 50; 51; RUN: opt < %s -enable-unswitch-cost-multiplier=false \ 52; RUN: -passes='loop-mssa(licm,simple-loop-unswitch<nontrivial>),print<loops>' -disable-output 2>&1 | \ 53; RUN: sort -b -k 1 | FileCheck %s --check-prefixes=LOOP-MAX 54 55; Single loop nest, not unswitched 56; LOOP1: Loop at depth 1 containing: 57; LOOP1-NOT: Loop at depth 1 containing: 58; LOOP1: Loop at depth 2 containing: 59; LOOP1-NOT: Loop at depth 2 containing: 60; 61; Somewhat relaxed restrictions on candidates: 62; LOOP-RELAX-COUNT-5: Loop at depth 1 containing: 63; LOOP-RELAX-NOT: Loop at depth 1 containing: 64; LOOP-RELAX-COUNT-32: Loop at depth 2 containing: 65; LOOP-RELAX-NOT: Loop at depth 2 containing: 66; 67; Even more relaxed restrictions on candidates and siblings. 68; LOOP-RELAX2-COUNT-11: Loop at depth 1 containing: 69; LOOP-RELAX2-NOT: Loop at depth 1 containing: 70; LOOP-RELAX2-COUNT-40: Loop at depth 2 containing: 71; LOOP-RELAX-NOT: Loop at depth 2 containing: 72; 73; Unswitched as much as it could (with multiplier disabled). 74; LOOP-MAX-COUNT-56: Loop at depth 1 containing: 75; LOOP-MAX-NOT: Loop at depth 1 containing: 76; LOOP-MAX-COUNT-111: Loop at depth 2 containing: 77; LOOP-MAX-NOT: Loop at depth 2 containing: 78 79define i32 @loop_switch(ptr %addr, i32 %c1, i32 %c2) { 80entry: 81 %addr2 = getelementptr i32, ptr %addr, i64 1 82 %check0 = icmp eq i32 %c2, 0 83 %check1 = icmp eq i32 %c2, 31 84 %check2 = icmp eq i32 %c2, 32 85 %check3 = icmp eq i32 %c2, 33 86 %check4 = icmp eq i32 %c2, 34 87 br label %outer_loop 88 89outer_loop: 90 %iv1 = phi i32 [0, %entry], [%iv1.next, %outer_latch] 91 %iv1.next = add i32 %iv1, 1 92 br label %inner_loop 93inner_loop: 94 %iv2 = phi i32 [0, %outer_loop], [%iv2.next, %inner_latch] 95 %iv2.next = add i32 %iv2, 1 96 switch i32 %c1, label %inner_latch [ 97 i32 0, label %case0 98 i32 1, label %case1 99 i32 2, label %case2 100 i32 3, label %case3 101 i32 4, label %case4 102 ] 103 104case4: 105 br i1 %check4, label %exit, label %inner_latch 106case3: 107 br i1 %check3, label %exit, label %inner_latch 108case2: 109 br i1 %check2, label %exit, label %inner_latch 110case1: 111 br i1 %check1, label %exit, label %inner_latch 112case0: 113 br i1 %check0, label %exit, label %inner_latch 114 115inner_latch: 116 store volatile i32 0, ptr %addr 117 %test_inner = icmp slt i32 %iv2, 50 118 br i1 %test_inner, label %inner_loop, label %outer_latch 119 120outer_latch: 121 store volatile i32 0, ptr %addr2 122 %test_outer = icmp slt i32 %iv1, 50 123 br i1 %test_outer, label %outer_loop, label %exit 124 125exit: ; preds = %bci_0 126 ret i32 1 127} 128