; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE
; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux -mcpu=corei7-avx | FileCheck %s --check-prefixes=CHECK,AVX
;
; This file tests the look-ahead operand reordering heuristic.
;
;
; This checks that the operand reordering of the adds takes into
; consideration instructions beyond the immediate predecessors.
;
;  A[0] B[0] C[0] D[0] C[1] D[1] A[1] B[1]
;    \   /    \   /     \   /     \   /
;      -        -         -         -
;       \      /           \       /
;          +                   +
;          |                   |
;         S[0]                S[1]
;
define void @lookahead_basic(ptr %array) {
; CHECK-LABEL: @lookahead_basic(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[IDX2:%.*]] = getelementptr inbounds double, ptr [[ARRAY:%.*]], i64 2
; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds double, ptr [[ARRAY]], i64 4
; CHECK-NEXT:    [[IDX6:%.*]] = getelementptr inbounds double, ptr [[ARRAY]], i64 6
; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[ARRAY]], align 8
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[IDX2]], align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[IDX4]], align 8
; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[IDX6]], align 8
; CHECK-NEXT:    [[TMP4:%.*]] = fsub fast <2 x double> [[TMP0]], [[TMP1]]
; CHECK-NEXT:    [[TMP5:%.*]] = fsub fast <2 x double> [[TMP2]], [[TMP3]]
; CHECK-NEXT:    [[TMP6:%.*]] = fadd fast <2 x double> [[TMP5]], [[TMP4]]
; CHECK-NEXT:    store <2 x double> [[TMP6]], ptr [[ARRAY]], align 8
; CHECK-NEXT:    ret void
;
entry:
  %idx1 = getelementptr inbounds double, ptr %array, i64 1
  %idx2 = getelementptr inbounds double, ptr %array, i64 2
  %idx3 = getelementptr inbounds double, ptr %array, i64 3
  %idx4 = getelementptr inbounds double, ptr %array, i64 4
  %idx5 = getelementptr inbounds double, ptr %array, i64 5
  %idx6 = getelementptr inbounds double, ptr %array, i64 6
  %idx7 = getelementptr inbounds double, ptr %array, i64 7

  %A_0 = load double, ptr %array, align 8
  %A_1 = load double, ptr %idx1, align 8
  %B_0 = load double, ptr %idx2, align 8
  %B_1 = load double, ptr %idx3, align 8
  %C_0 = load double, ptr %idx4, align 8
  %C_1 = load double, ptr %idx5, align 8
  %D_0 = load double, ptr %idx6, align 8
  %D_1 = load double, ptr %idx7, align 8

  %subAB_0 = fsub fast double %A_0, %B_0
  %subCD_0 = fsub fast double %C_0, %D_0

  %subAB_1 = fsub fast double %A_1, %B_1
  %subCD_1 = fsub fast double %C_1, %D_1

  %addABCD_0 = fadd fast double %subAB_0, %subCD_0
  %addCDAB_1 = fadd fast double %subCD_1, %subAB_1

  store double %addABCD_0, ptr %array, align 8
  store double %addCDAB_1, ptr %idx1, align 8
  ret void
}

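; Judging only the immediate fsub operands, both operand orders of the second
; scalar add score the same. Bundling the add operands as-is would group
; %subAB_0 with %subCD_1, whose loads are not consecutive, forcing gathers
; such as the following sketch (the names %loadA and %loadC are hypothetical,
; not from the test):
;
;   %loadA = load <2 x double>, ptr %array, align 8   ; <A[0], A[1]>
;   %loadC = load <2 x double>, ptr %idx4, align 8    ; <C[0], C[1]>
;   %lhs = shufflevector <2 x double> %loadA, <2 x double> %loadC, <2 x i32> <i32 0, i32 3>   ; <A[0], C[1]>
;
; Scoring the loads behind each operand instead pairs (A,B) with (A,B) and
; (C,D) with (C,D), which is why the checks above contain no shufflevector.
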
; Check whether the look-ahead operand reordering heuristic will avoid
; bundling the alt opcodes. The vectorized code should have no shuffles.
;
;  A[0] B[0] A[0] B[0] A[1] B[1] A[1] B[1]
;    \   /    \   /     \   /     \   /
;      +        -         -         +
;       \      /           \       /
;          +                   +
;          |                   |
;         S[0]                S[1]
;
define void @lookahead_alt1(ptr %array) {
; CHECK-LABEL: @lookahead_alt1(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[IDX2:%.*]] = getelementptr inbounds double, ptr [[ARRAY:%.*]], i64 2
; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds double, ptr [[ARRAY]], i64 4
; CHECK-NEXT:    [[IDX5:%.*]] = getelementptr inbounds double, ptr [[ARRAY]], i64 5
; CHECK-NEXT:    [[IDX6:%.*]] = getelementptr inbounds double, ptr [[ARRAY]], i64 6
; CHECK-NEXT:    [[IDX7:%.*]] = getelementptr inbounds double, ptr [[ARRAY]], i64 7
; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[ARRAY]], align 8
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[IDX2]], align 8
; CHECK-NEXT:    [[TMP2:%.*]] = fsub fast <2 x double> [[TMP0]], [[TMP1]]
; CHECK-NEXT:    [[TMP3:%.*]] = fadd fast <2 x double> [[TMP0]], [[TMP1]]
; CHECK-NEXT:    [[TMP4:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP2]]
; CHECK-NEXT:    store <2 x double> [[TMP4]], ptr [[ARRAY]], align 8
; CHECK-NEXT:    ret void
;
entry:
  %idx1 = getelementptr inbounds double, ptr %array, i64 1
  %idx2 = getelementptr inbounds double, ptr %array, i64 2
  %idx3 = getelementptr inbounds double, ptr %array, i64 3
  %idx4 = getelementptr inbounds double, ptr %array, i64 4
  %idx5 = getelementptr inbounds double, ptr %array, i64 5
  %idx6 = getelementptr inbounds double, ptr %array, i64 6
  %idx7 = getelementptr inbounds double, ptr %array, i64 7

  %A_0 = load double, ptr %array, align 8
  %A_1 = load double, ptr %idx1, align 8
  %B_0 = load double, ptr %idx2, align 8
  %B_1 = load double, ptr %idx3, align 8

  %addAB_0_L = fadd fast double %A_0, %B_0
  %subAB_0_R = fsub fast double %A_0, %B_0

  %subAB_1_L = fsub fast double %A_1, %B_1
  %addAB_1_R = fadd fast double %A_1, %B_1

  %addABCD_0 = fadd fast double %addAB_0_L, %subAB_0_R
  %addCDAB_1 = fadd fast double %subAB_1_L, %addAB_1_R

  store double %addABCD_0, ptr %array, align 8
  store double %addCDAB_1, ptr %idx1, align 8
  ret void
}

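; Note that both fadds end up in one bundle ([[TMP3]]) and both fsubs in the
; other ([[TMP2]]): since fadd is commutative, swapping the operands of the
; second scalar add is enough to avoid the alternate add/sub form, so no
; shufflevector is needed (contrast with @lookahead_alt2 below).
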
; This code should get vectorized all the way to the loads with shuffles for
; the alt opcodes.
;
;  A[0] B[0] C[0] D[0] C[1] D[1] A[1] B[1]
;    \   /    \   /     \   /     \   /
;      +        -         +         -
;       \      /           \       /
;          +                   +
;          |                   |
;         S[0]                S[1]
;
define void @lookahead_alt2(ptr %array) {
; CHECK-LABEL: @lookahead_alt2(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[IDX2:%.*]] = getelementptr inbounds double, ptr [[ARRAY:%.*]], i64 2
; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds double, ptr [[ARRAY]], i64 4
; CHECK-NEXT:    [[IDX6:%.*]] = getelementptr inbounds double, ptr [[ARRAY]], i64 6
; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[ARRAY]], align 8
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[IDX2]], align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[IDX4]], align 8
; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[IDX6]], align 8
; CHECK-NEXT:    [[TMP4:%.*]] = fsub fast <2 x double> [[TMP2]], [[TMP3]]
; CHECK-NEXT:    [[TMP5:%.*]] = fadd fast <2 x double> [[TMP2]], [[TMP3]]
; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> [[TMP5]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT:    [[TMP7:%.*]] = fadd fast <2 x double> [[TMP0]], [[TMP1]]
; CHECK-NEXT:    [[TMP8:%.*]] = fsub fast <2 x double> [[TMP0]], [[TMP1]]
; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> [[TMP8]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT:    [[TMP10:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP9]]
; CHECK-NEXT:    store <2 x double> [[TMP10]], ptr [[ARRAY]], align 8
; CHECK-NEXT:    ret void
;
entry:
  %idx1 = getelementptr inbounds double, ptr %array, i64 1
  %idx2 = getelementptr inbounds double, ptr %array, i64 2
  %idx3 = getelementptr inbounds double, ptr %array, i64 3
  %idx4 = getelementptr inbounds double, ptr %array, i64 4
  %idx5 = getelementptr inbounds double, ptr %array, i64 5
  %idx6 = getelementptr inbounds double, ptr %array, i64 6
  %idx7 = getelementptr inbounds double, ptr %array, i64 7

  %A_0 = load double, ptr %array, align 8
  %A_1 = load double, ptr %idx1, align 8
  %B_0 = load double, ptr %idx2, align 8
  %B_1 = load double, ptr %idx3, align 8
  %C_0 = load double, ptr %idx4, align 8
  %C_1 = load double, ptr %idx5, align 8
  %D_0 = load double, ptr %idx6, align 8
  %D_1 = load double, ptr %idx7, align 8

  %addAB_0 = fadd fast double %A_0, %B_0
  %subCD_0 = fsub fast double %C_0, %D_0

  %addCD_1 = fadd fast double %C_1, %D_1
  %subAB_1 = fsub fast double %A_1, %B_1

  %addABCD_0 = fadd fast double %addAB_0, %subCD_0
  %addCDAB_1 = fadd fast double %addCD_1, %subAB_1

  store double %addABCD_0, ptr %array, align 8
  store double %addCDAB_1, ptr %idx1, align 8
  ret void
}

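; Here no operand order can pair add with add in both lanes, so each operand
; bundle is emitted as both opcodes plus a blend. Sketch of the blend idiom
; used by [[TMP6]] and [[TMP9]] above (generic IR, hypothetical names):
;
;   %blend = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 0, i32 3>   ; <x[0], y[1]>
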
;
;  A[0] B[0] C[0] D[0]  A[1] B[2] A[2] B[1]
;    \   /    \   /    /  \   /     \   /
;      -        -     U     -         -
;       \      /             \       /
;          +                     +
;          |                     |
;         S[0]                  S[1]
;
; SLP should reorder the operands of the RHS add by taking into consideration
; the cost of external uses. It is more profitable to reorder the operands of
; the RHS add because A[1] has an external use.

define void @lookahead_external_uses(ptr %A, ptr %B, ptr %C, ptr %D, ptr %S, ptr %Ext1, ptr %Ext2) {
; CHECK-LABEL: @lookahead_external_uses(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[IDXA1:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i64 1
; CHECK-NEXT:    [[IDXB2:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i64 2
; CHECK-NEXT:    [[IDXA2:%.*]] = getelementptr inbounds double, ptr [[A]], i64 2
; CHECK-NEXT:    [[C0:%.*]] = load double, ptr [[C:%.*]], align 8
; CHECK-NEXT:    [[D0:%.*]] = load double, ptr [[D:%.*]], align 8
; CHECK-NEXT:    [[B2:%.*]] = load double, ptr [[IDXB2]], align 8
; CHECK-NEXT:    [[A2:%.*]] = load double, ptr [[IDXA2]], align 8
; CHECK-NEXT:    [[A1:%.*]] = load double, ptr [[IDXA1]], align 8
; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A]], align 8
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B]], align 8
; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B2]], i32 1
; CHECK-NEXT:    [[TMP3:%.*]] = fsub fast <2 x double> [[TMP0]], [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0
; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A2]], i32 1
; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> [[TMP1]], double [[D0]], i32 0
; CHECK-NEXT:    [[TMP7:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP6]]
; CHECK-NEXT:    [[TMP8:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP7]]
; CHECK-NEXT:    store <2 x double> [[TMP8]], ptr [[S:%.*]], align 8
; CHECK-NEXT:    store double [[A1]], ptr [[EXT1:%.*]], align 8
; CHECK-NEXT:    ret void
;
entry:

  %IdxA1 = getelementptr inbounds double, ptr %A, i64 1
  %IdxB2 = getelementptr inbounds double, ptr %B, i64 2
  %IdxA2 = getelementptr inbounds double, ptr %A, i64 2
  %IdxB1 = getelementptr inbounds double, ptr %B, i64 1

  %A0 = load double, ptr %A, align 8
  %B0 = load double, ptr %B, align 8
  %C0 = load double, ptr %C, align 8
  %D0 = load double, ptr %D, align 8

  %A1 = load double, ptr %IdxA1, align 8
  %B2 = load double, ptr %IdxB2, align 8
  %A2 = load double, ptr %IdxA2, align 8
  %B1 = load double, ptr %IdxB1, align 8

  %subA0B0 = fsub fast double %A0, %B0
  %subC0D0 = fsub fast double %C0, %D0

  %subA1B2 = fsub fast double %A1, %B2
  %subA2B1 = fsub fast double %A2, %B1

  %add0 = fadd fast double %subA0B0, %subC0D0
  %add1 = fadd fast double %subA1B2, %subA2B1

  %IdxS1 = getelementptr inbounds double, ptr %S, i64 1

  store double %add0, ptr %S, align 8
  store double %add1, ptr %IdxS1, align 8

  ; External use
  store double %A1, ptr %Ext1, align 8
  ret void
}

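; "External use" means a user outside the vectorized tree. In the test above,
; %A1 both feeds the vectorizable fsub and escapes to a scalar store:
;
;   %A1 = load double, ptr %IdxA1, align 8
;   %subA1B2 = fsub fast double %A1, %B2
;   store double %A1, ptr %Ext1, align 8   ; user outside the tree
;
; If %A1 lived only in a vector lane, the escaping store would need an extra
; extractelement, so the look-ahead score accounts for that extract cost when
; picking an operand order.
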
;  A[0] B[0] C[0] D[0]      A[1] B[2] A[2] B[1]
;    \   /    \   /        /  \   /    \   /  \
;      -        -   U1,U2,U3    -        -   U4,U5
;       \      /                 \      /
;          +                        +
;          |                        |
;         S[0]                     S[1]
;
;
; If we limit the users budget for the look-ahead heuristic to 2, then the
; look-ahead heuristic has no way of choosing B[1] (with 2 external users)
; over A[1] (with 3 external users).
; The result is that the operands of the add are not reordered and the loads
; from A get vectorized instead of the loads from B.
;
define void @lookahead_limit_users_budget(ptr %A, ptr %B, ptr %C, ptr %D, ptr %S, ptr %Ext1, ptr %Ext2, ptr %Ext3, ptr %Ext4, ptr %Ext5) {
; CHECK-LABEL: @lookahead_limit_users_budget(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[IDXA1:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i64 1
; CHECK-NEXT:    [[IDXB2:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i64 2
; CHECK-NEXT:    [[IDXA2:%.*]] = getelementptr inbounds double, ptr [[A]], i64 2
; CHECK-NEXT:    [[IDXB1:%.*]] = getelementptr inbounds double, ptr [[B]], i64 1
; CHECK-NEXT:    [[C0:%.*]] = load double, ptr [[C:%.*]], align 8
; CHECK-NEXT:    [[D0:%.*]] = load double, ptr [[D:%.*]], align 8
; CHECK-NEXT:    [[B2:%.*]] = load double, ptr [[IDXB2]], align 8
; CHECK-NEXT:    [[A2:%.*]] = load double, ptr [[IDXA2]], align 8
; CHECK-NEXT:    [[A1:%.*]] = load double, ptr [[IDXA1]], align 8
; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A]], align 8
; CHECK-NEXT:    [[B1:%.*]] = load double, ptr [[IDXB1]], align 8
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B]], align 8
; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B2]], i32 1
; CHECK-NEXT:    [[TMP3:%.*]] = fsub fast <2 x double> [[TMP0]], [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0
; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A2]], i32 1
; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> [[TMP1]], double [[D0]], i32 0
; CHECK-NEXT:    [[TMP7:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP6]]
; CHECK-NEXT:    [[TMP8:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP7]]
; CHECK-NEXT:    store <2 x double> [[TMP8]], ptr [[S:%.*]], align 8
; CHECK-NEXT:    store double [[A1]], ptr [[EXT1:%.*]], align 8
; CHECK-NEXT:    store double [[A1]], ptr [[EXT2:%.*]], align 8
; CHECK-NEXT:    store double [[A1]], ptr [[EXT3:%.*]], align 8
; CHECK-NEXT:    store double [[B1]], ptr [[EXT4:%.*]], align 8
; CHECK-NEXT:    store double [[B1]], ptr [[EXT5:%.*]], align 8
; CHECK-NEXT:    ret void
;
entry:

  %IdxA1 = getelementptr inbounds double, ptr %A, i64 1
  %IdxB2 = getelementptr inbounds double, ptr %B, i64 2
  %IdxA2 = getelementptr inbounds double, ptr %A, i64 2
  %IdxB1 = getelementptr inbounds double, ptr %B, i64 1

  %A0 = load double, ptr %A, align 8
  %B0 = load double, ptr %B, align 8
  %C0 = load double, ptr %C, align 8
  %D0 = load double, ptr %D, align 8

  %A1 = load double, ptr %IdxA1, align 8
  %B2 = load double, ptr %IdxB2, align 8
  %A2 = load double, ptr %IdxA2, align 8
  %B1 = load double, ptr %IdxB1, align 8

  %subA0B0 = fsub fast double %A0, %B0
  %subC0D0 = fsub fast double %C0, %D0

  %subA1B2 = fsub fast double %A1, %B2
  %subA2B1 = fsub fast double %A2, %B1

  %add0 = fadd fast double %subA0B0, %subC0D0
  %add1 = fadd fast double %subA1B2, %subA2B1

  %IdxS1 = getelementptr inbounds double, ptr %S, i64 1

  store double %add0, ptr %S, align 8
  store double %add1, ptr %IdxS1, align 8

  ; External uses of A1
  store double %A1, ptr %Ext1, align 8
  store double %A1, ptr %Ext2, align 8
  store double %A1, ptr %Ext3, align 8

  ; External uses of B1
  store double %B1, ptr %Ext4, align 8
  store double %B1, ptr %Ext5, align 8

  ret void
}

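; With a budget of 2, the user count saturates: both A[1] (3 external users)
; and B[1] (2 external users) report "at least 2", so their look-ahead scores
; tie and the default (unreordered) operand order wins.
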
; This checks that the look-ahead code does not crash when instructions with
; the same opcode have different numbers of operands (in this case the calls).

%Class = type { i8 }
declare double @_ZN1i2ayEv(ptr)
declare double @_ZN1i2axEv()

define void @lookahead_crash(ptr %A, ptr %S, ptr %Arg0) {
; CHECK-LABEL: @lookahead_crash(
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[A:%.*]], align 8
; CHECK-NEXT:    [[C0:%.*]] = call double @_ZN1i2ayEv(ptr [[ARG0:%.*]])
; CHECK-NEXT:    [[C1:%.*]] = call double @_ZN1i2axEv()
; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0
; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[C1]], i32 1
; CHECK-NEXT:    [[TMP4:%.*]] = fadd fast <2 x double> [[TMP1]], [[TMP3]]
; CHECK-NEXT:    store <2 x double> [[TMP4]], ptr [[S:%.*]], align 8
; CHECK-NEXT:    ret void
;
  %IdxA1 = getelementptr inbounds double, ptr %A, i64 1

  %A0 = load double, ptr %A, align 8
  %A1 = load double, ptr %IdxA1, align 8

  %C0 = call double @_ZN1i2ayEv(ptr %Arg0)
  %C1 = call double @_ZN1i2axEv()

  %add0 = fadd fast double %A0, %C0
  %add1 = fadd fast double %A1, %C1

  %IdxS1 = getelementptr inbounds double, ptr %S, i64 1
  store double %add0, ptr %S, align 8
  store double %add1, ptr %IdxS1, align 8
  ret void
}

; This checks that we choose to group consecutive extracts from the same vectors.
define void @ChecksExtractScores(ptr %storeArray, ptr %array, ptr %vecPtr1, ptr %vecPtr2) {
; CHECK-LABEL: @ChecksExtractScores(
; CHECK-NEXT:    [[IDX1:%.*]] = getelementptr inbounds double, ptr [[ARRAY:%.*]], i64 1
; CHECK-NEXT:    [[LOADA0:%.*]] = load double, ptr [[ARRAY]], align 4
; CHECK-NEXT:    [[LOADA1:%.*]] = load double, ptr [[IDX1]], align 4
; CHECK-NEXT:    [[LOADVEC:%.*]] = load <2 x double>, ptr [[VECPTR1:%.*]], align 4
; CHECK-NEXT:    [[LOADVEC2:%.*]] = load <2 x double>, ptr [[VECPTR2:%.*]], align 4
; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> poison, double [[LOADA0]], i32 0
; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[LOADVEC]], [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[LOADA1]], i32 0
; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <2 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP6:%.*]] = fmul <2 x double> [[LOADVEC2]], [[TMP5]]
; CHECK-NEXT:    [[TMP7:%.*]] = fadd <2 x double> [[TMP3]], [[TMP6]]
; CHECK-NEXT:    store <2 x double> [[TMP7]], ptr [[STOREARRAY:%.*]], align 8
; CHECK-NEXT:    ret void
;
  %idx1 = getelementptr inbounds double, ptr %array, i64 1
  %loadA0 = load double, ptr %array, align 4
  %loadA1 = load double, ptr %idx1, align 4

  %loadVec = load <2 x double>, ptr %vecPtr1, align 4
  %extrA0 = extractelement <2 x double> %loadVec, i32 0
  %extrA1 = extractelement <2 x double> %loadVec, i32 1
  %loadVec2 = load <2 x double>, ptr %vecPtr2, align 4
  %extrB0 = extractelement <2 x double> %loadVec2, i32 0
  %extrB1 = extractelement <2 x double> %loadVec2, i32 1

  %mul0 = fmul double %extrA0, %loadA0
  %mul1 = fmul double %extrA1, %loadA0
  %mul3 = fmul double %extrB0, %loadA1
  %mul4 = fmul double %extrB1, %loadA1
  %add0 = fadd double %mul0, %mul3
  %add1 = fadd double %mul1, %mul4

  %sidx1 = getelementptr inbounds double, ptr %storeArray, i64 1
  store double %add0, ptr %storeArray, align 8
  store double %add1, ptr %sidx1, align 8
  ret void
}

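; Consecutive extracts out of a single source vector, e.g. (generic sketch,
; hypothetical names):
;
;   %e0 = extractelement <2 x double> %v, i32 0
;   %e1 = extractelement <2 x double> %v, i32 1
;
; can reuse %v directly once vectorized, so the extract score steers operand
; reordering toward keeping such pairs in the same bundle; this is why the
; fmuls above consume [[LOADVEC]] and [[LOADVEC2]] whole instead of mixing
; lanes from both.
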
; The extract indices here are not usable constants (undef and %idx2), so the
; extracts cannot be grouped; SSE still vectorizes the surrounding float
; arithmetic 2-wide, while AVX keeps it scalar.
define i1 @ExtractIdxNotConstantInt1(float %a, float %b, float %c, <4 x float> %vec, i64 %idx2) {
; SSE-LABEL: @ExtractIdxNotConstantInt1(
; SSE-NEXT:    [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 undef
; SSE-NEXT:    [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]]
; SSE-NEXT:    [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 [[IDX2:%.*]]
; SSE-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A:%.*]], i32 0
; SSE-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[C:%.*]], i32 1
; SSE-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> poison, float [[SUB14_I167]], i32 0
; SSE-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_I276_I169]], i32 1
; SSE-NEXT:    [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]]
; SSE-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> <float poison, float 3.000000e+01>, float [[B:%.*]], i32 0
; SSE-NEXT:    [[TMP7:%.*]] = fsub <2 x float> [[TMP5]], [[TMP6]]
; SSE-NEXT:    [[TMP8:%.*]] = fadd <2 x float> [[TMP7]], <float 1.000000e+01, float 2.000000e+00>
; SSE-NEXT:    [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0
; SSE-NEXT:    [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1
; SSE-NEXT:    [[MUL123_I184:%.*]] = fmul float [[TMP9]], [[TMP10]]
; SSE-NEXT:    [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00
; SSE-NEXT:    ret i1 [[CMP_I185]]
;
; AVX-LABEL: @ExtractIdxNotConstantInt1(
; AVX-NEXT:    [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 undef
; AVX-NEXT:    [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]]
; AVX-NEXT:    [[FM:%.*]] = fmul float [[A:%.*]], [[SUB14_I167]]
; AVX-NEXT:    [[SUB25_I168:%.*]] = fsub float [[FM]], [[B:%.*]]
; AVX-NEXT:    [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 [[IDX2:%.*]]
; AVX-NEXT:    [[ADD36_I173:%.*]] = fadd float [[SUB25_I168]], 1.000000e+01
; AVX-NEXT:    [[MUL72_I179:%.*]] = fmul float [[C:%.*]], [[VECEXT_I276_I169]]
; AVX-NEXT:    [[ADD78_I180:%.*]] = fsub float [[MUL72_I179]], 3.000000e+01
; AVX-NEXT:    [[ADD79_I181:%.*]] = fadd float 2.000000e+00, [[ADD78_I180]]
; AVX-NEXT:    [[MUL123_I184:%.*]] = fmul float [[ADD36_I173]], [[ADD79_I181]]
; AVX-NEXT:    [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00
; AVX-NEXT:    ret i1 [[CMP_I185]]
;
  %vecext.i291.i166 = extractelement <4 x float> %vec, i64 undef
  %sub14.i167 = fsub float undef, %vecext.i291.i166
  %fm = fmul float %a, %sub14.i167
  %sub25.i168 = fsub float %fm, %b
  %vecext.i276.i169 = extractelement <4 x float> %vec, i64 %idx2
  %add36.i173 = fadd float %sub25.i168, 10.0
  %mul72.i179 = fmul float %c, %vecext.i276.i169
  %add78.i180 = fsub float %mul72.i179, 30.0
  %add79.i181 = fadd float 2.0, %add78.i180
  %mul123.i184 = fmul float %add36.i173, %add79.i181
  %cmp.i185 = fcmp ogt float %mul123.i184, 0.000000e+00
  ret i1 %cmp.i185
}

; Same as above, but the first extract uses the constant index 1 rather than
; undef; the output is unchanged.
define i1 @ExtractIdxNotConstantInt2(float %a, float %b, float %c, <4 x float> %vec, i64 %idx2) {
; SSE-LABEL: @ExtractIdxNotConstantInt2(
; SSE-NEXT:    [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 1
; SSE-NEXT:    [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]]
; SSE-NEXT:    [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 [[IDX2:%.*]]
; SSE-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A:%.*]], i32 0
; SSE-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[C:%.*]], i32 1
; SSE-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> poison, float [[SUB14_I167]], i32 0
; SSE-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_I276_I169]], i32 1
; SSE-NEXT:    [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]]
; SSE-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> <float poison, float 3.000000e+01>, float [[B:%.*]], i32 0
; SSE-NEXT:    [[TMP7:%.*]] = fsub <2 x float> [[TMP5]], [[TMP6]]
; SSE-NEXT:    [[TMP8:%.*]] = fadd <2 x float> [[TMP7]], <float 1.000000e+01, float 2.000000e+00>
; SSE-NEXT:    [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0
; SSE-NEXT:    [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1
; SSE-NEXT:    [[MUL123_I184:%.*]] = fmul float [[TMP9]], [[TMP10]]
; SSE-NEXT:    [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00
; SSE-NEXT:    ret i1 [[CMP_I185]]
;
; AVX-LABEL: @ExtractIdxNotConstantInt2(
; AVX-NEXT:    [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 1
; AVX-NEXT:    [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]]
; AVX-NEXT:    [[FM:%.*]] = fmul float [[A:%.*]], [[SUB14_I167]]
; AVX-NEXT:    [[SUB25_I168:%.*]] = fsub float [[FM]], [[B:%.*]]
; AVX-NEXT:    [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 [[IDX2:%.*]]
; AVX-NEXT:    [[ADD36_I173:%.*]] = fadd float [[SUB25_I168]], 1.000000e+01
; AVX-NEXT:    [[MUL72_I179:%.*]] = fmul float [[C:%.*]], [[VECEXT_I276_I169]]
; AVX-NEXT:    [[ADD78_I180:%.*]] = fsub float [[MUL72_I179]], 3.000000e+01
; AVX-NEXT:    [[ADD79_I181:%.*]] = fadd float 2.000000e+00, [[ADD78_I180]]
; AVX-NEXT:    [[MUL123_I184:%.*]] = fmul float [[ADD36_I173]], [[ADD79_I181]]
; AVX-NEXT:    [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00
; AVX-NEXT:    ret i1 [[CMP_I185]]
;
  %vecext.i291.i166 = extractelement <4 x float> %vec, i64 1
  %sub14.i167 = fsub float undef, %vecext.i291.i166
  %fm = fmul float %a, %sub14.i167
  %sub25.i168 = fsub float %fm, %b
  %vecext.i276.i169 = extractelement <4 x float> %vec, i64 %idx2
  %add36.i173 = fadd float %sub25.i168, 10.0
  %mul72.i179 = fmul float %c, %vecext.i276.i169
  %add78.i180 = fsub float %mul72.i179, 30.0
  %add79.i181 = fadd float 2.0, %add78.i180
  %mul123.i184 = fmul float %add36.i173, %add79.i181
  %cmp.i185 = fcmp ogt float %mul123.i184, 0.000000e+00
  ret i1 %cmp.i185
}

; Unlike the two tests above, both extract indices are constant here (0 and 1),
; so lane 1 of [[VEC]] can be used through a shufflevector instead of a scalar
; extract.
define i1 @foo(float %a, float %b, float %c, <4 x float> %vec, i64 %idx2) {
; CHECK-LABEL: @foo(
; CHECK-NEXT:    [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 0
; CHECK-NEXT:    [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]]
; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A:%.*]], i32 0
; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[C:%.*]], i32 1
; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[VEC]], <4 x float> poison, <2 x i32> <i32 poison, i32 1>
; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[SUB14_I167]], i32 0
; CHECK-NEXT:    [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]]
; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> <float poison, float 3.000000e+01>, float [[B:%.*]], i32 0
; CHECK-NEXT:    [[TMP7:%.*]] = fsub <2 x float> [[TMP5]], [[TMP6]]
; CHECK-NEXT:    [[TMP8:%.*]] = fadd <2 x float> [[TMP7]], <float 1.000000e+01, float 2.000000e+00>
; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0
; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1
; CHECK-NEXT:    [[MUL123_I184:%.*]] = fmul float [[TMP9]], [[TMP10]]
; CHECK-NEXT:    [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00
; CHECK-NEXT:    ret i1 [[CMP_I185]]
;
  %vecext.i291.i166 = extractelement <4 x float> %vec, i64 0
  %sub14.i167 = fsub float undef, %vecext.i291.i166
  %fm = fmul float %a, %sub14.i167
  %sub25.i168 = fsub float %fm, %b
  %vecext.i276.i169 = extractelement <4 x float> %vec, i64 1
  %add36.i173 = fadd float %sub25.i168, 10.0
  %mul72.i179 = fmul float %c, %vecext.i276.i169
  %add78.i180 = fsub float %mul72.i179, 30.0
  %add79.i181 = fadd float 2.0, %add78.i180
  %mul123.i184 = fmul float %add36.i173, %add79.i181
  %cmp.i185 = fcmp ogt float %mul123.i184, 0.000000e+00
  ret i1 %cmp.i185
}

; Same as @ChecksExtractScores, but the extractelement vector operands do not
; match.
define void @ChecksExtractScores_different_vectors(ptr %storeArray, ptr %array, ptr %vecPtr1, ptr %vecPtr2, ptr %vecPtr3, ptr %vecPtr4) {
;
; SSE-LABEL: @ChecksExtractScores_different_vectors(
; SSE-NEXT:    [[LOADVEC:%.*]] = load <2 x double>, ptr [[VECPTR1:%.*]], align 4
; SSE-NEXT:    [[LOADVEC2:%.*]] = load <2 x double>, ptr [[VECPTR2:%.*]], align 4
; SSE-NEXT:    [[LOADVEC3:%.*]] = load <2 x double>, ptr [[VECPTR3:%.*]], align 4
; SSE-NEXT:    [[LOADVEC4:%.*]] = load <2 x double>, ptr [[VECPTR4:%.*]], align 4
; SSE-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[ARRAY:%.*]], align 4
; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[LOADVEC2]], <2 x double> [[LOADVEC3]], <2 x i32> <i32 1, i32 2>
; SSE-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[TMP2]], [[TMP1]]
; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[LOADVEC]], <2 x double> [[LOADVEC4]], <2 x i32> <i32 0, i32 3>
; SSE-NEXT:    [[TMP6:%.*]] = fmul <2 x double> [[TMP5]], [[TMP1]]
; SSE-NEXT:    [[TMP7:%.*]] = fadd <2 x double> [[TMP4]], [[TMP6]]
; SSE-NEXT:    store <2 x double> [[TMP7]], ptr [[STOREARRAY:%.*]], align 8
; SSE-NEXT:    ret void
;
; AVX-LABEL: @ChecksExtractScores_different_vectors(
; AVX-NEXT:    [[IDX1:%.*]] = getelementptr inbounds double, ptr [[ARRAY:%.*]], i64 1
; AVX-NEXT:    [[LOADA0:%.*]] = load double, ptr [[ARRAY]], align 4
; AVX-NEXT:    [[LOADA1:%.*]] = load double, ptr [[IDX1]], align 4
; AVX-NEXT:    [[LOADVEC:%.*]] = load <2 x double>, ptr [[VECPTR1:%.*]], align 4
; AVX-NEXT:    [[LOADVEC2:%.*]] = load <2 x double>, ptr [[VECPTR2:%.*]], align 4
; AVX-NEXT:    [[LOADVEC3:%.*]] = load <2 x double>, ptr [[VECPTR3:%.*]], align 4
; AVX-NEXT:    [[LOADVEC4:%.*]] = load <2 x double>, ptr [[VECPTR4:%.*]], align 4
; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[LOADVEC]], <2 x double> [[LOADVEC2]], <2 x i32> <i32 0, i32 3>
; AVX-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[LOADA0]], i32 0
; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> zeroinitializer
; AVX-NEXT:    [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]]
; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[LOADVEC3]], <2 x double> [[LOADVEC4]], <2 x i32> <i32 0, i32 3>
; AVX-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> poison, double [[LOADA1]], i32 0
; AVX-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <2 x i32> zeroinitializer
; AVX-NEXT:    [[TMP8:%.*]] = fmul <2 x double> [[TMP5]], [[TMP7]]
; AVX-NEXT:    [[TMP9:%.*]] = fadd <2 x double> [[TMP4]], [[TMP8]]
; AVX-NEXT:    store <2 x double> [[TMP9]], ptr [[STOREARRAY:%.*]], align 8
; AVX-NEXT:    ret void
;
  %idx1 = getelementptr inbounds double, ptr %array, i64 1
  %loadA0 = load double, ptr %array, align 4
  %loadA1 = load double, ptr %idx1, align 4

  %loadVec = load <2 x double>, ptr %vecPtr1, align 4
  %loadVec2 = load <2 x double>, ptr %vecPtr2, align 4
  %extrA0 = extractelement <2 x double> %loadVec, i32 0
  %extrA1 = extractelement <2 x double> %loadVec2, i32 1
  %loadVec3 = load <2 x double>, ptr %vecPtr3, align 4
  %loadVec4 = load <2 x double>, ptr %vecPtr4, align 4
  %extrB0 = extractelement <2 x double> %loadVec3, i32 0
  %extrB1 = extractelement <2 x double> %loadVec4, i32 1

  %mul0 = fmul double %extrA0, %loadA0
  %mul1 = fmul double %extrA1, %loadA0
  %mul3 = fmul double %extrB0, %loadA1
  %mul4 = fmul double %extrB1, %loadA1
  %add0 = fadd double %mul0, %mul3
  %add1 = fadd double %mul1, %mul4

  %sidx1 = getelementptr inbounds double, ptr %storeArray, i64 1
  store double %add0, ptr %storeArray, align 8
  store double %add1, ptr %sidx1, align 8
  ret void
}

; This checks that we prefer splats over reverse load vectors + shuffles.
; 2-wide splat loads on x86 take a single instruction, so they are quite cheap.
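; (For example, on x86 with SSE3 or later a 2-wide double splat load can lower
; to a single movddup from memory; that lowering detail is not checked here.)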
define double @splat_loads(ptr %array1, ptr %array2, ptr %ptrA, ptr %ptrB) {
; SSE-LABEL: @splat_loads(
; SSE-NEXT:  entry:
; SSE-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[ARRAY1:%.*]], align 8
; SSE-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[ARRAY2:%.*]], align 8
; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
; SSE-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[TMP0]], [[TMP2]]
; SSE-NEXT:    [[TMP4:%.*]] = fmul <2 x double> [[TMP0]], [[TMP1]]
; SSE-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP3]], [[TMP4]]
; SSE-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
; SSE-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i32 1
; SSE-NEXT:    [[ADD3:%.*]] = fadd double [[TMP6]], [[TMP7]]
; SSE-NEXT:    ret double [[ADD3]]
;
; AVX-LABEL: @splat_loads(
; AVX-NEXT:  entry:
; AVX-NEXT:    [[GEP_2_1:%.*]] = getelementptr inbounds double, ptr [[ARRAY2:%.*]], i64 1
; AVX-NEXT:    [[LD_2_0:%.*]] = load double, ptr [[ARRAY2]], align 8
; AVX-NEXT:    [[LD_2_1:%.*]] = load double, ptr [[GEP_2_1]], align 8
; AVX-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[ARRAY1:%.*]], align 8
; AVX-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> poison, double [[LD_2_0]], i32 0
; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> zeroinitializer
; AVX-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[TMP0]], [[TMP2]]
; AVX-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[LD_2_1]], i32 0
; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <2 x i32> zeroinitializer
; AVX-NEXT:    [[TMP6:%.*]] = fmul <2 x double> [[TMP0]], [[TMP5]]
; AVX-NEXT:    [[TMP7:%.*]] = fadd <2 x double> [[TMP3]], [[TMP6]]
; AVX-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0
; AVX-NEXT:    [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 1
; AVX-NEXT:    [[ADD3:%.*]] = fadd double [[TMP8]], [[TMP9]]
; AVX-NEXT:    ret double [[ADD3]]
;
entry:
  %gep_1_1 = getelementptr inbounds double, ptr %array1, i64 1
  %ld_1_0 = load double, ptr %array1, align 8
  %ld_1_1 = load double, ptr %gep_1_1, align 8

  %gep_2_1 = getelementptr inbounds double, ptr %array2, i64 1
  %ld_2_0 = load double, ptr %array2, align 8
  %ld_2_1 = load double, ptr %gep_2_1, align 8

  %mul1 = fmul double %ld_1_0, %ld_2_0
  %mul2 = fmul double %ld_1_1, %ld_2_0

  %mul3 = fmul double %ld_1_0, %ld_2_1
  %mul4 = fmul double %ld_1_1, %ld_2_1

  %add1 = fadd double %mul1, %mul3
  %add2 = fadd double %mul2, %mul4

  %add3 = fadd double %add1, %add2
  ret double %add3
}


; Same as splat_loads() but the splat load has internal uses in the SLP graph.
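; Here the broadcast of %ld_2_0 gets a second user inside the tree (the fsub
; bundle), so the splat form also lets the vectorizer reuse one shuffle for
; both the fmul and the fsub (see [[TMP2]] in the AVX checks below).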
define double @splat_loads_with_internal_uses(ptr %array1, ptr %array2, ptr %ptrA, ptr %ptrB) {
; SSE-LABEL: @splat_loads_with_internal_uses(
; SSE-NEXT:  entry:
; SSE-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[ARRAY1:%.*]], align 8
; SSE-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[ARRAY2:%.*]], align 8
; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
; SSE-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[TMP0]], [[TMP2]]
; SSE-NEXT:    [[TMP4:%.*]] = fmul <2 x double> [[TMP0]], [[TMP1]]
; SSE-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP3]], [[TMP4]]
; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> zeroinitializer
; SSE-NEXT:    [[TMP7:%.*]] = fsub <2 x double> [[TMP5]], [[TMP6]]
; SSE-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0
; SSE-NEXT:    [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 1
; SSE-NEXT:    [[RES:%.*]] = fadd double [[TMP8]], [[TMP9]]
; SSE-NEXT:    ret double [[RES]]
;
; AVX-LABEL: @splat_loads_with_internal_uses(
; AVX-NEXT:  entry:
; AVX-NEXT:    [[GEP_2_1:%.*]] = getelementptr inbounds double, ptr [[ARRAY2:%.*]], i64 1
; AVX-NEXT:    [[LD_2_0:%.*]] = load double, ptr [[ARRAY2]], align 8
; AVX-NEXT:    [[LD_2_1:%.*]] = load double, ptr [[GEP_2_1]], align 8
; AVX-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[ARRAY1:%.*]], align 8
; AVX-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> poison, double [[LD_2_0]], i32 0
; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> zeroinitializer
; AVX-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[TMP0]], [[TMP2]]
; AVX-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[LD_2_1]], i32 0
; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <2 x i32> zeroinitializer
; AVX-NEXT:    [[TMP6:%.*]] = fmul <2 x double> [[TMP0]], [[TMP5]]
; AVX-NEXT:    [[TMP7:%.*]] = fadd <2 x double> [[TMP3]], [[TMP6]]
; AVX-NEXT:    [[TMP8:%.*]] = fsub <2 x double> [[TMP7]], [[TMP2]]
; AVX-NEXT:    [[TMP9:%.*]] = extractelement <2 x double> [[TMP8]], i32 0
; AVX-NEXT:    [[TMP10:%.*]] = extractelement <2 x double> [[TMP8]], i32 1
; AVX-NEXT:    [[RES:%.*]] = fadd double [[TMP9]], [[TMP10]]
; AVX-NEXT:    ret double [[RES]]
;
entry:
  %gep_1_1 = getelementptr inbounds double, ptr %array1, i64 1
  %ld_1_0 = load double, ptr %array1, align 8
  %ld_1_1 = load double, ptr %gep_1_1, align 8

  %gep_2_1 = getelementptr inbounds double, ptr %array2, i64 1
  %ld_2_0 = load double, ptr %array2, align 8
  %ld_2_1 = load double, ptr %gep_2_1, align 8

  %mul1 = fmul double %ld_1_0, %ld_2_0
  %mul2 = fmul double %ld_1_1, %ld_2_0

  %mul3 = fmul double %ld_1_0, %ld_2_1
  %mul4 = fmul double %ld_1_1, %ld_2_1

  %add1 = fadd double %mul1, %mul3
  %add2 = fadd double %mul2, %mul4

  ; One more user for the broadcast of %ld_2_0
  %sub1 = fsub double %add1, %ld_2_0
  %sub2 = fsub double %add2, %ld_2_0

  %res = fadd double %sub1, %sub2

  ret double %res
}