; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx | FileCheck %s

; Make sure that we rotate the graph to help avoid the shuffle to
; the external vectorizable stores.
;
; SLP starts vectorizing from the operands of the `fcmp` in bb2, then crosses
; into bb1, vectorizing all the way to the broadcast load at the top.
; The stores in bb1 are external to this tree, but they are vectorizable and are
; in reverse order.
; Per the CHECK lines, the vector constants appear lane-reversed
; (<2.2, 1.1>, <4.4, 3.3>) and the <2 x double> store consumes the fmul result
; directly, with no shufflevector in front of it.
define void @rotate_with_external_users(ptr %A, ptr %ptr) {
; CHECK-LABEL: @rotate_with_external_users(
; CHECK-NEXT:  bb1:
; CHECK-NEXT:    [[LD:%.*]] = load double, ptr undef, align 8
; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> poison, double [[LD]], i32 0
; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP1:%.*]] = fadd <2 x double> [[SHUFFLE]], <double 2.200000e+00, double 1.100000e+00>
; CHECK-NEXT:    [[TMP2:%.*]] = fmul <2 x double> [[TMP1]], <double 2.200000e+00, double 1.100000e+00>
; CHECK-NEXT:    store <2 x double> [[TMP2]], ptr [[A:%.*]], align 8
; CHECK-NEXT:    br label [[BB2:%.*]]
; CHECK:       bb2:
; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[TMP2]], <double 4.400000e+00, double 3.300000e+00>
; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP4]], i32 0
; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[TMP4]], i32 1
; CHECK-NEXT:    [[SEED:%.*]] = fcmp ogt double [[TMP6]], [[TMP5]]
; CHECK-NEXT:    ret void
;
bb1:
  %ld = load double, ptr undef

  %add1 = fadd double %ld, 1.1
  %add2 = fadd double %ld, 2.2

  %mul1 = fmul double %add1, 1.1
  %mul2 = fmul double %add2, 2.2

  ; These are external vectorizable stores with operands in reverse order.
  %ptrA2 = getelementptr inbounds double, ptr %A, i64 1
  store double %mul2, ptr %A
  store double %mul1, ptr %ptrA2
  br label %bb2

bb2:
  %add3 = fadd double %mul1, 3.3
  %add4 = fadd double %mul2, 4.4
  %seed = fcmp ogt double %add3, %add4
  ret void
}

; This checks that non-consecutive external users are skipped.
; The four stores below only target two distinct addresses (%A and %A + 3), so
; they do not form a consecutive run. Per the CHECK lines, they stay scalar
; (each fed by an extractelement) and the tree keeps its original lane order
; (constants appear as <1.1, 2.2, 3.3, 4.4>).
define void @non_consecutive_external_users(ptr %A, ptr %ptr) {
; CHECK-LABEL: @non_consecutive_external_users(
; CHECK-NEXT:  bb1:
; CHECK-NEXT:    [[LD:%.*]] = load double, ptr undef, align 8
; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x double> poison, double [[LD]], i32 0
; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> poison, <4 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x double> [[SHUFFLE]], <double 1.100000e+00, double 2.200000e+00, double 3.300000e+00, double 4.400000e+00>
; CHECK-NEXT:    [[TMP2:%.*]] = fadd <4 x double> [[TMP1]], <double 1.100000e+00, double 2.200000e+00, double 3.300000e+00, double 4.400000e+00>
; CHECK-NEXT:    [[TMP3:%.*]] = fmul <4 x double> [[TMP2]], <double 1.100000e+00, double 2.200000e+00, double 3.300000e+00, double 4.400000e+00>
; CHECK-NEXT:    [[PTRA4:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i64 3
; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x double> [[TMP3]], i32 3
; CHECK-NEXT:    store double [[TMP4]], ptr [[A]], align 8
; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x double> [[TMP3]], i32 2
; CHECK-NEXT:    store double [[TMP5]], ptr [[A]], align 8
; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x double> [[TMP3]], i32 1
; CHECK-NEXT:    store double [[TMP6]], ptr [[PTRA4]], align 8
; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x double> [[TMP3]], i32 0
; CHECK-NEXT:    store double [[TMP7]], ptr [[PTRA4]], align 8
; CHECK-NEXT:    br label [[SEED_LOOP:%.*]]
; CHECK:       seed_loop:
; CHECK-NEXT:    [[TMP8:%.*]] = phi <4 x double> [ [[TMP3]], [[BB1:%.*]] ], [ zeroinitializer, [[SEED_LOOP]] ]
; CHECK-NEXT:    br label [[SEED_LOOP]]
;
bb1:
  %ld = load double, ptr undef

  %add5 = fadd double %ld, 1.1
  %add6 = fadd double %ld, 2.2
  %add7 = fadd double %ld, 3.3
  %add8 = fadd double %ld, 4.4

  %add1 = fadd double %add5, 1.1
  %add2 = fadd double %add6, 2.2
  %add3 = fadd double %add7, 3.3
  %add4 = fadd double %add8, 4.4

  %mul1 = fmul double %add1, 1.1
  %mul2 = fmul double %add2, 2.2
  %mul3 = fmul double %add3, 3.3
  %mul4 = fmul double %add4, 4.4

  ; External non-consecutive stores.
  %ptrA4 = getelementptr inbounds double, ptr %A, i64 3
  store double %mul4, ptr %A
  store double %mul3, ptr %A
  store double %mul2, ptr %ptrA4
  store double %mul1, ptr %ptrA4
  br label %seed_loop

seed_loop:
  %phi1 = phi double [ %mul1, %bb1 ], [ 0.0, %seed_loop ]
  %phi2 = phi double [ %mul2, %bb1 ], [ 0.0, %seed_loop ]
  %phi3 = phi double [ %mul3, %bb1 ], [ 0.0, %seed_loop ]
  %phi4 = phi double [ %mul4, %bb1 ], [ 0.0, %seed_loop ]
  br label %seed_loop
}

; We have to be careful when the tree contains add/sub patterns that could be
; combined into a single addsub instruction. Reordering can block the pattern.
; Per the CHECK lines, the tree keeps its original lane order here and the
; reversed store operand is produced by an explicit shufflevector <1, 0>.
define void @addsub_and_external_users(ptr %A, ptr %ptr) {
; CHECK-LABEL: @addsub_and_external_users(
; CHECK-NEXT:  bb1:
; CHECK-NEXT:    [[LD:%.*]] = load double, ptr undef, align 8
; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> poison, double [[LD]], i32 0
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP2:%.*]] = fsub <2 x double> [[TMP1]], <double 1.100000e+00, double 1.200000e+00>
; CHECK-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[TMP1]], <double 1.100000e+00, double 1.200000e+00>
; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP6]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT:    [[TMP4:%.*]] = fdiv <2 x double> [[TMP3]], <double 2.100000e+00, double 2.200000e+00>
; CHECK-NEXT:    [[TMP5:%.*]] = fmul <2 x double> [[TMP4]], <double 3.100000e+00, double 3.200000e+00>
; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
; CHECK-NEXT:    store <2 x double> [[SHUFFLE1]], ptr [[A:%.*]], align 8
; CHECK-NEXT:    br label [[BB2:%.*]]
; CHECK:       bb2:
; CHECK-NEXT:    [[TMP7:%.*]] = fadd <2 x double> [[TMP5]], <double 4.100000e+00, double 4.200000e+00>
; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0
; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 1
; CHECK-NEXT:    [[SEED:%.*]] = fcmp ogt double [[TMP8]], [[TMP9]]
; CHECK-NEXT:    ret void
;
bb1:
  %ld = load double, ptr undef

  %sub1 = fsub double %ld, 1.1
  %add2 = fadd double %ld, 1.2

  %div1 = fdiv double %sub1, 2.1
  %div2 = fdiv double %add2, 2.2

  %mul1 = fmul double %div1, 3.1
  %mul2 = fmul double %div2, 3.2

  ; These are external vectorizable stores with operands in reverse order.
  %ptrA1 = getelementptr inbounds double, ptr %A, i64 1
  store double %mul2, ptr %A
  store double %mul1, ptr %ptrA1
  br label %bb2

bb2:
  %addS1 = fadd double %mul1, 4.1
  %addS2 = fadd double %mul2, 4.2
  %seed = fcmp ogt double %addS1, %addS2
  ret void
}

; This contains a sub/add bundle, reordering it will make it better.
; Per the CHECK lines, the lanes are reversed (constants appear as <1.2, 1.1>),
; which puts the fsub result in lane 0 and the fadd result in lane 1, and the
; <2 x double> store then consumes the fmul result without an extra shuffle.
define void @subadd_and_external_users(ptr %A, ptr %ptr) {
; CHECK-LABEL: @subadd_and_external_users(
; CHECK-NEXT:  bb1:
; CHECK-NEXT:    [[LD:%.*]] = load double, ptr undef, align 8
; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> poison, double [[LD]], i32 0
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP2:%.*]] = fsub <2 x double> [[TMP1]], <double 1.200000e+00, double 1.100000e+00>
; CHECK-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[TMP1]], <double 1.200000e+00, double 1.100000e+00>
; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP6]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT:    [[TMP4:%.*]] = fdiv <2 x double> [[TMP3]], <double 2.200000e+00, double 2.100000e+00>
; CHECK-NEXT:    [[TMP5:%.*]] = fmul <2 x double> [[TMP4]], <double 3.200000e+00, double 3.100000e+00>
; CHECK-NEXT:    store <2 x double> [[TMP5]], ptr [[A:%.*]], align 8
; CHECK-NEXT:    br label [[BB2:%.*]]
; CHECK:       bb2:
; CHECK-NEXT:    [[TMP7:%.*]] = fadd <2 x double> [[TMP5]], <double 4.200000e+00, double 4.100000e+00>
; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0
; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 1
; CHECK-NEXT:    [[SEED:%.*]] = fcmp ogt double [[TMP9]], [[TMP8]]
; CHECK-NEXT:    ret void
;
bb1:
  %ld = load double, ptr undef

  %add1 = fadd double %ld, 1.1
  %sub2 = fsub double %ld, 1.2

  %div1 = fdiv double %add1, 2.1
  %div2 = fdiv double %sub2, 2.2

  %mul1 = fmul double %div1, 3.1
  %mul2 = fmul double %div2, 3.2

  ; These are external vectorizable stores with operands in reverse order.
  %ptrA1 = getelementptr inbounds double, ptr %A, i64 1
  store double %mul2, ptr %A
  store double %mul1, ptr %ptrA1
  br label %bb2

bb2:
  %addS1 = fadd double %mul1, 4.1
  %addS2 = fadd double %mul2, 4.2
  %seed = fcmp ogt double %addS1, %addS2
  ret void
}

; This tree contains an alternate-opcode bundle (fsub, fadd, fadd, fsub) that,
; as the function name indicates, is not an addsub pattern. Per the CHECK
; lines, the lanes are reversed (constants appear as <1.4, 1.3, 1.2, 1.1>) so
; the <4 x double> store to the reverse-ordered external users consumes the
; fmul result directly, with no shufflevector in front of it.
define void @alt_but_not_addsub_and_external_users(ptr %A, ptr %ptr) {
; CHECK-LABEL: @alt_but_not_addsub_and_external_users(
; CHECK-NEXT:  bb1:
; CHECK-NEXT:    [[LD:%.*]] = load double, ptr undef, align 8
; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x double> poison, double [[LD]], i32 0
; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> poison, <4 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP1:%.*]] = fsub <4 x double> [[SHUFFLE]], <double 1.400000e+00, double 1.300000e+00, double 1.200000e+00, double 1.100000e+00>
; CHECK-NEXT:    [[TMP2:%.*]] = fadd <4 x double> [[SHUFFLE]], <double 1.400000e+00, double 1.300000e+00, double 1.200000e+00, double 1.100000e+00>
; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
; CHECK-NEXT:    [[TMP4:%.*]] = fdiv <4 x double> [[TMP3]], <double 2.400000e+00, double 2.300000e+00, double 2.200000e+00, double 2.100000e+00>
; CHECK-NEXT:    [[TMP5:%.*]] = fmul <4 x double> [[TMP4]], <double 3.400000e+00, double 3.300000e+00, double 3.200000e+00, double 3.100000e+00>
; CHECK-NEXT:    store <4 x double> [[TMP5]], ptr [[A:%.*]], align 8
; CHECK-NEXT:    br label [[BB2:%.*]]
; CHECK:       bb2:
; CHECK-NEXT:    [[TMP7:%.*]] = phi <4 x double> [ [[TMP5]], [[BB1:%.*]] ], [ <double 4.400000e+00, double 4.300000e+00, double 4.200000e+00, double 4.100000e+00>, [[BB2]] ]
; CHECK-NEXT:    br label [[BB2]]
;
bb1:
  %ld = load double, ptr undef

  %sub1 = fsub double %ld, 1.1
  %add2 = fadd double %ld, 1.2
  %add3 = fadd double %ld, 1.3
  %sub4 = fsub double %ld, 1.4

  %div1 = fdiv double %sub1, 2.1
  %div2 = fdiv double %add2, 2.2
  %div3 = fdiv double %add3, 2.3
  %div4 = fdiv double %sub4, 2.4

  %mul1 = fmul double %div1, 3.1
  %mul2 = fmul double %div2, 3.2
  %mul3 = fmul double %div3, 3.3
  %mul4 = fmul double %div4, 3.4

  ; These are external vectorizable stores with operands in reverse order.
  %ptrA3 = getelementptr inbounds double, ptr %A, i64 3
  %ptrA2 = getelementptr inbounds double, ptr %A, i64 2
  %ptrA1 = getelementptr inbounds double, ptr %A, i64 1
  store double %mul4, ptr %A
  store double %mul3, ptr %ptrA1
  store double %mul2, ptr %ptrA2
  store double %mul1, ptr %ptrA3
  br label %bb2

bb2:
  %phi1 = phi double [ %mul1, %bb1 ], [ 4.1, %bb2 ]
  %phi2 = phi double [ %mul2, %bb1 ], [ 4.2, %bb2 ]
  %phi3 = phi double [ %mul3, %bb1 ], [ 4.3, %bb2 ]
  %phi4 = phi double [ %mul4, %bb1 ], [ 4.4, %bb2 ]
  br label %bb2
}