; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes=slp-vectorizer,verify -S < %s -mtriple=x86_64-unknown-linux -mcpu=corei7-avx | FileCheck %s -check-prefix=ENABLED
;
; Without supernode operand reordering, this does not get fully vectorized.
; S[0] = (A[0] + B[0]) + C[0]
; S[1] = (B[1] + C[1]) + A[1]
;
; The two lanes consume A, B and C in different positions of the add chain.
; In the expected output B stays a plain vector operand, while A and C are
; blended with shufflevector: {A[0], C[1]} feeds the first vector fadd and
; {C[0], A[1]} feeds the second one.
define void @test_supernode_add(ptr %Aarray, ptr %Barray, ptr %Carray, ptr %Sarray) {
; ENABLED-LABEL: @test_supernode_add(
; ENABLED-NEXT:  entry:
; ENABLED-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[AARRAY:%.*]], align 8
; ENABLED-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[BARRAY:%.*]], align 8
; ENABLED-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[CARRAY:%.*]], align 8
; ENABLED-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> [[TMP2]], <2 x i32> <i32 0, i32 3>
; ENABLED-NEXT:    [[TMP4:%.*]] = fadd fast <2 x double> [[TMP1]], [[TMP3]]
; ENABLED-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP0]], <2 x i32> <i32 0, i32 3>
; ENABLED-NEXT:    [[TMP6:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP5]]
; ENABLED-NEXT:    store <2 x double> [[TMP6]], ptr [[SARRAY:%.*]], align 8
; ENABLED-NEXT:    ret void
;
entry:
  ; Pointers to element 1 of each array; element 0 is addressed by the base pointer.
  %idxA1 = getelementptr inbounds double, ptr %Aarray, i64 1
  %idxB1 = getelementptr inbounds double, ptr %Barray, i64 1
  %idxC1 = getelementptr inbounds double, ptr %Carray, i64 1
  %idxS1 = getelementptr inbounds double, ptr %Sarray, i64 1

  ; Scalar loads of both lanes of A, B and C.
  %A0 = load double, ptr %Aarray, align 8
  %A1 = load double, ptr %idxA1, align 8

  %B0 = load double, ptr %Barray, align 8
  %B1 = load double, ptr %idxB1, align 8

  %C0 = load double, ptr %Carray, align 8
  %C1 = load double, ptr %idxC1, align 8

  ; Lane 0: (A0 + B0) + C0.  Lane 1: (B1 + C1) + A1.
  %addA0B0 = fadd fast double %A0, %B0
  %addB1C1 = fadd fast double %B1, %C1
  %add0 = fadd fast double %addA0B0, %C0
  %add1 = fadd fast double %addB1C1, %A1
  ; Adjacent stores to S[0] and S[1].
  store double %add0, ptr %Sarray, align 8
  store double %add1, ptr %idxS1, align 8
  ret void
}


; Without supernode operand reordering, this does not get fully vectorized.
; S[0] = (A[0] - B[0]) + C[0]
; S[1] = (C[1] - B[1]) + A[1]
;
; Same shape as the add-only test, but the inner operation is an fsub with B
; as the subtrahend in both lanes. In the expected output the blended vector
; {A[0], C[1]} is the minuend of a single vector fsub and {C[0], A[1]} is
; added afterwards.
define void @test_supernode_addsub(ptr %Aarray, ptr %Barray, ptr %Carray, ptr %Sarray) {
; ENABLED-LABEL: @test_supernode_addsub(
; ENABLED-NEXT:  entry:
; ENABLED-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[AARRAY:%.*]], align 8
; ENABLED-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[BARRAY:%.*]], align 8
; ENABLED-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[CARRAY:%.*]], align 8
; ENABLED-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> [[TMP2]], <2 x i32> <i32 0, i32 3>
; ENABLED-NEXT:    [[TMP4:%.*]] = fsub fast <2 x double> [[TMP3]], [[TMP1]]
; ENABLED-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP0]], <2 x i32> <i32 0, i32 3>
; ENABLED-NEXT:    [[TMP6:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP5]]
; ENABLED-NEXT:    store <2 x double> [[TMP6]], ptr [[SARRAY:%.*]], align 8
; ENABLED-NEXT:    ret void
;
entry:
  ; Pointers to element 1 of each array; element 0 is addressed by the base pointer.
  %idxA1 = getelementptr inbounds double, ptr %Aarray, i64 1
  %idxB1 = getelementptr inbounds double, ptr %Barray, i64 1
  %idxC1 = getelementptr inbounds double, ptr %Carray, i64 1
  %idxS1 = getelementptr inbounds double, ptr %Sarray, i64 1

  ; Scalar loads of both lanes of A, B and C.
  %A0 = load double, ptr %Aarray, align 8
  %A1 = load double, ptr %idxA1, align 8

  %B0 = load double, ptr %Barray, align 8
  %B1 = load double, ptr %idxB1, align 8

  %C0 = load double, ptr %Carray, align 8
  %C1 = load double, ptr %idxC1, align 8

  ; Lane 0: (A0 - B0) + C0.  Lane 1: (C1 - B1) + A1.
  %subA0B0 = fsub fast double %A0, %B0
  %subC1B1 = fsub fast double %C1, %B1
  %add0 = fadd fast double %subA0B0, %C0
  %add1 = fadd fast double %subC1B1, %A1
  ; Adjacent stores to S[0] and S[1].
  store double %add0, ptr %Sarray, align 8
  store double %add1, ptr %idxS1, align 8
  ret void
}

; Without supernode operand reordering, this does not get fully vectorized.
; This checks that the super-node works with alternate sequences.
;
; S[0] = (A[0] - B[0]) - C[0]
; S[1] = (B[1] + C[1]) + A[1]
;
; Lane 0 is a chain of two fsubs and lane 1 a chain of two fadds. In the
; expected output each level is computed as both a vector fsub and a vector
; fadd of the same operands, and a shufflevector then takes lane 0 from the
; fsub result and lane 1 from the fadd result.
define void @test_supernode_addsub_alt(ptr %Aarray, ptr %Barray, ptr %Carray, ptr %Sarray) {
; ENABLED-LABEL: @test_supernode_addsub_alt(
; ENABLED-NEXT:  entry:
; ENABLED-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[AARRAY:%.*]], align 8
; ENABLED-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[BARRAY:%.*]], align 8
; ENABLED-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[CARRAY:%.*]], align 8
; ENABLED-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> [[TMP2]], <2 x i32> <i32 0, i32 3>
; ENABLED-NEXT:    [[TMP4:%.*]] = fsub fast <2 x double> [[TMP3]], [[TMP1]]
; ENABLED-NEXT:    [[TMP5:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP1]]
; ENABLED-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> [[TMP5]], <2 x i32> <i32 0, i32 3>
; ENABLED-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP0]], <2 x i32> <i32 0, i32 3>
; ENABLED-NEXT:    [[TMP8:%.*]] = fsub fast <2 x double> [[TMP6]], [[TMP7]]
; ENABLED-NEXT:    [[TMP9:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP7]]
; ENABLED-NEXT:    [[TMP10:%.*]] = shufflevector <2 x double> [[TMP8]], <2 x double> [[TMP9]], <2 x i32> <i32 0, i32 3>
; ENABLED-NEXT:    store <2 x double> [[TMP10]], ptr [[SARRAY:%.*]], align 8
; ENABLED-NEXT:    ret void
;
entry:
  ; Pointers to element 1 of each array; element 0 is addressed by the base pointer.
  %idxA1 = getelementptr inbounds double, ptr %Aarray, i64 1
  %idxB1 = getelementptr inbounds double, ptr %Barray, i64 1
  %idxC1 = getelementptr inbounds double, ptr %Carray, i64 1
  %idxS1 = getelementptr inbounds double, ptr %Sarray, i64 1

  ; Scalar loads of both lanes of A, B and C.
  %A0 = load double, ptr %Aarray, align 8
  %A1 = load double, ptr %idxA1, align 8

  %B0 = load double, ptr %Barray, align 8
  %B1 = load double, ptr %idxB1, align 8

  %C0 = load double, ptr %Carray, align 8
  %C1 = load double, ptr %idxC1, align 8

  ; Lane 0: (A0 - B0) - C0.  Lane 1: (B1 + C1) + A1.
  %subA0B0 = fsub fast double %A0, %B0
  %addB1C1 = fadd fast double %B1, %C1
  %sub0 = fsub fast double %subA0B0, %C0
  %add1 = fadd fast double %addB1C1, %A1
  ; Adjacent stores to S[0] and S[1].
  store double %sub0, ptr %Sarray, align 8
  store double %add1, ptr %idxS1, align 8
  ret void
}

; This checks that vectorizeTree() works correctly with the supernode
; and does not generate uses before defs.
; If all of the operands of the supernode are vectorizable, then the scheduler
; will fix their position in the program. If not, then the scheduler may not
; touch them, leading to uses before defs.
;
; A0 = ...
; C = ...
; t1 = A0 + C
; B0 = ...
; t2 = t1 + B0
; A1 = ...
; B1 = ...
; t3 = A1 + B1
; D = ...
; t4 = t3 + D
;
;
;  A0  C    A1  B1              A0  C    A1  D                 A0:1  C,D
;   \ /      \ /    Reorder      \ /      \ /    Bundles          \   /
; t1 + B0  t3 + D   ------->   t1 + B0  t3 + B1  ------>       t1:3 + B0:1
;    |/       |/                  |/       |/                      |/
; t2 +     t4 +                t2 +     t4 +                   t2:4 +
;
; After reordering, 'D' conceptually becomes an operand of t3:
; t3 = A1 + D
; But D is defined *after* its use.
;
; In the expected output A and B are loaded as <2 x double> vectors, while C
; and D remain scalar loads placed ahead of every vector instruction. Each
; scalar is inserted into a copy of the B vector: {C, B[1]} is the second
; operand of the first vector fadd and {B[0], D} of the second one.
define void @supernode_scheduling(ptr %Aarray, ptr %Barray, ptr %Carray, ptr %Darray, ptr %Sarray) {
; ENABLED-LABEL: @supernode_scheduling(
; ENABLED-NEXT:  entry:
; ENABLED-NEXT:    [[C:%.*]] = load double, ptr [[CARRAY:%.*]], align 8
; ENABLED-NEXT:    [[D:%.*]] = load double, ptr [[DARRAY:%.*]], align 8
; ENABLED-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[AARRAY:%.*]], align 8
; ENABLED-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[BARRAY:%.*]], align 8
; ENABLED-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[C]], i32 0
; ENABLED-NEXT:    [[TMP3:%.*]] = fadd fast <2 x double> [[TMP0]], [[TMP2]]
; ENABLED-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> [[TMP1]], double [[D]], i32 1
; ENABLED-NEXT:    [[TMP5:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP4]]
; ENABLED-NEXT:    store <2 x double> [[TMP5]], ptr [[SARRAY:%.*]], align 8
; ENABLED-NEXT:    ret void
;
entry:
  ; Pointers to element 1 of A, B and S; C and D are only read at element 0.
  %idxA1 = getelementptr inbounds double, ptr %Aarray, i64 1
  %idxB1 = getelementptr inbounds double, ptr %Barray, i64 1
  %idxS1 = getelementptr inbounds double, ptr %Sarray, i64 1


  ; Loads are deliberately interleaved with their users: %D is loaded only
  ; after %t3 has been computed.
  %A0 = load double, ptr %Aarray, align 8
  %C = load double, ptr %Carray, align 8
  %t1 = fadd fast double %A0, %C
  %B0 = load double, ptr %Barray, align 8
  %t2 = fadd fast double %t1, %B0
  %A1 = load double, ptr %idxA1, align 8
  %B1 = load double, ptr %idxB1, align 8
  %t3 = fadd fast double %A1, %B1
  %D = load double, ptr %Darray, align 8
  %t4 = fadd fast double %t3, %D

  ; Adjacent stores to S[0] and S[1].
  store double %t2, ptr %Sarray, align 8
  store double %t4, ptr %idxS1, align 8
  ret void
}


; The SLP scheduler has trouble moving instructions across blocks.
; Even though we can build a SuperNode for this example, we should not because the scheduler
; cannot handle the cross-block instruction motion that is required once the operands of the
; SuperNode are reordered.
;
; bb1:
;  A0 = ...
;  B1 = ...
;  Tmp0 = A0 + 2.0
;  Tmp1 = B1 + 2.0
;
; bb2:
;  A1 = ...
;  B0 = ...
;  S[0] = Tmp0 + B0
;  S[1] = Tmp1 + A1
;
; NOTE: in the IR below, bb1 of the sketch is the `entry` block and bb2 is the
; `bb` block.
; In the expected output every load stays scalar and stays in its original
; block. Each block gets one <2 x double> fadd whose loaded operand is
; assembled with insertelement: {A0, B1} in `entry` and {B0, A1} in `bb`.
define void @supernode_scheduling_cross_block(ptr %Aarray, ptr %Barray, ptr %Sarray) {
; ENABLED-LABEL: @supernode_scheduling_cross_block(
; ENABLED-NEXT:  entry:
; ENABLED-NEXT:    [[IDXA1:%.*]] = getelementptr inbounds double, ptr [[AARRAY:%.*]], i64 1
; ENABLED-NEXT:    [[IDXB1:%.*]] = getelementptr inbounds double, ptr [[BARRAY:%.*]], i64 1
; ENABLED-NEXT:    [[A0:%.*]] = load double, ptr [[AARRAY]], align 8
; ENABLED-NEXT:    [[B1:%.*]] = load double, ptr [[IDXB1]], align 8
; ENABLED-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0
; ENABLED-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[B1]], i32 1
; ENABLED-NEXT:    [[TMP2:%.*]] = fadd fast <2 x double> [[TMP1]], splat (double 2.000000e+00)
; ENABLED-NEXT:    br label [[BB:%.*]]
; ENABLED:       bb:
; ENABLED-NEXT:    [[A1:%.*]] = load double, ptr [[IDXA1]], align 8
; ENABLED-NEXT:    [[B0:%.*]] = load double, ptr [[BARRAY]], align 8
; ENABLED-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> poison, double [[B0]], i32 0
; ENABLED-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[A1]], i32 1
; ENABLED-NEXT:    [[TMP5:%.*]] = fadd fast <2 x double> [[TMP2]], [[TMP4]]
; ENABLED-NEXT:    store <2 x double> [[TMP5]], ptr [[SARRAY:%.*]], align 8
; ENABLED-NEXT:    ret void
;
entry:
  ; Pointers to element 1 of A, B and S.
  %idxA1 = getelementptr inbounds double, ptr %Aarray, i64 1
  %idxB1 = getelementptr inbounds double, ptr %Barray, i64 1
  %idxS1 = getelementptr inbounds double, ptr %Sarray, i64 1

  ; First half of each lane: A[0] for lane 0 and B[1] for lane 1, each plus 2.0.
  %A0 = load double, ptr %Aarray, align 8
  %B1 = load double, ptr %idxB1, align 8
  %Tmp0 = fadd fast double %A0, 2.0
  %Tmp1 = fadd fast double %B1, 2.0
br label %bb

bb:
  ; Second half, in a different block: the remaining elements A[1] and B[0].
  %A1 = load double, ptr %idxA1, align 8
  %B0 = load double, ptr %Barray, align 8

  ; Lane 0 pairs the A-based partial sum with B0; lane 1 pairs the B-based
  ; partial sum with A1.
  %Sum0 = fadd fast double %Tmp0, %B0
  %Sum1 = fadd fast double %Tmp1, %A1

  ; Adjacent stores to S[0] and S[1].
  store double %Sum0, ptr %Sarray, align 8
  store double %Sum1, ptr %idxS1, align 8
  ret void
}