; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=slp-vectorizer,instcombine,dce -slp-threshold=-100 -S -mtriple=i386-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
; RUN: opt < %s -passes=slp-vectorizer,instcombine,dce -slp-threshold=-100 -S -mtriple=i386-apple-macosx10.8.0 -mattr=+sse2 | FileCheck %s --check-prefix=SSE2

target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"

; Make sure we order the operands of commutative operations so that we get
; bigger vectorizable trees.
;
; The two opt invocations at the top of the file run the same pipeline
; (slp-vectorizer, instcombine, dce) on the same i386 triple. The
; -mcpu=corei7-avx run is verified with the default CHECK prefix and the
; -mattr=+sse2 run with the SSE2 prefix. The assertion blocks at the top of each
; function are produced by utils/update_test_checks.py; regenerate them with
; that script rather than editing them by hand.
; NOTE(review): -slp-threshold=-100 presumably keeps the cost model from
; rejecting these small trees - confirm against the pass options.

; Two-lane fadd feeding two adjacent stores. Lane 0 has the loaded value on the
; left and the scalar argument on the right; lane 1 has them the other way
; round. Expected output for both prefixes: one <2 x double> load of %from as
; one fadd operand, and {%v1, %v2} gathered with insertelement as the other.
define void @shuffle_operands1(ptr noalias %from, ptr noalias %to, double %v1, double %v2) {
; CHECK-LABEL: @shuffle_operands1(
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V1:%.*]], i64 0
; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[V2:%.*]], i64 1
; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
; CHECK-NEXT:    store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4
; CHECK-NEXT:    ret void
;
; SSE2-LABEL: @shuffle_operands1(
; SSE2-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V1:%.*]], i64 0
; SSE2-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[V2:%.*]], i64 1
; SSE2-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
; SSE2-NEXT:    store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4
; SSE2-NEXT:    ret void
;
  %from_1 = getelementptr double, ptr %from, i64 1
  %v0_1 = load double , ptr %from
  %v0_2 = load double , ptr %from_1
  %v1_1 = fadd double %v0_1, %v1 ; lane 0: load on the left
  %v1_2 = fadd double %v2, %v0_2 ; lane 1: load on the right
  %to_2 = getelementptr double, ptr %to, i64 1
  store double %v1_1, ptr %to
  store double %v1_2, ptr %to_2
  ret void
}

; %p is a loop-carried phi (0.0 from %entry, 1.0 from the back edge); %c only
; controls the back edge. Lane 0 is %v0_1 + %p and lane 1 is %v0_1 + %v0_2, so
; %v0_1 occurs in both lanes. Expected output for both prefixes: the adjacent
; loads become one <2 x double> load used as the left fadd operand, and the
; right operand {%p, %v0_1} is built from a shuffle of that load plus an
; insertelement of %p.
define void @vecload_vs_broadcast(ptr noalias %from, ptr noalias %to, double %v1, double %v2, i1 %c) {
; CHECK-LABEL: @vecload_vs_broadcast(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br label [[LP:%.*]]
; CHECK:       lp:
; CHECK-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0
; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP0]], [[TMP2]]
; CHECK-NEXT:    store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
; CHECK-NEXT:    br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
; CHECK:       ext:
; CHECK-NEXT:    ret void
;
; SSE2-LABEL: @vecload_vs_broadcast(
; SSE2-NEXT:  entry:
; SSE2-NEXT:    br label [[LP:%.*]]
; SSE2:       lp:
; SSE2-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
; SSE2-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0
; SSE2-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP0]], [[TMP2]]
; SSE2-NEXT:    store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
; SSE2-NEXT:    br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
; SSE2:       ext:
; SSE2-NEXT:    ret void
;
entry:
  br label %lp

lp:
  %p = phi double [ 1.000000e+00, %lp ], [ 0.000000e+00, %entry ]
  %from_1 = getelementptr double, ptr %from, i64 1
  %v0_1 = load double , ptr %from
  %v0_2 = load double , ptr %from_1
  %v1_1 = fadd double %v0_1, %p ; lane 0
  %v1_2 = fadd double %v0_1, %v0_2 ; lane 1: reuses %v0_1 on the left
  %to_2 = getelementptr double, ptr %to, i64 1
  store double %v1_1, ptr %to
  store double %v1_2, ptr %to_2
  br i1 %c, label %lp, label %ext

ext:
  ret void
}

; Same data flow as @vecload_vs_broadcast with both scalar fadds written with
; their operands swapped: lane 0 is %p + %v0_1 and lane 1 is %v0_2 + %v0_1.
; Expected output for both prefixes: {%p, %v0_1} on the left of the vector fadd
; and the <2 x double> load on the right.
define void @vecload_vs_broadcast2(ptr noalias %from, ptr noalias %to, double %v1, double %v2, i1 %c) {
; CHECK-LABEL: @vecload_vs_broadcast2(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br label [[LP:%.*]]
; CHECK:       lp:
; CHECK-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0
; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP0]]
; CHECK-NEXT:    store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
; CHECK-NEXT:    br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
; CHECK:       ext:
; CHECK-NEXT:    ret void
;
; SSE2-LABEL: @vecload_vs_broadcast2(
; SSE2-NEXT:  entry:
; SSE2-NEXT:    br label [[LP:%.*]]
; SSE2:       lp:
; SSE2-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
; SSE2-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0
; SSE2-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP0]]
; SSE2-NEXT:    store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
; SSE2-NEXT:    br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
; SSE2:       ext:
; SSE2-NEXT:    ret void
;
entry:
  br label %lp

lp:
  %p = phi double [ 1.000000e+00, %lp ], [ 0.000000e+00, %entry ]
  %from_1 = getelementptr double, ptr %from, i64 1
  %v0_1 = load double , ptr %from
  %v0_2 = load double , ptr %from_1
  %v1_1 = fadd double %p, %v0_1 ; lane 0
  %v1_2 = fadd double %v0_2, %v0_1 ; lane 1: reuses %v0_1 on the right
  %to_2 = getelementptr double, ptr %to, i64 1
  store double %v1_1, ptr %to
  store double %v1_2, ptr %to_2
  br i1 %c, label %lp, label %ext

ext:
  ret void
}

; Mixed variant: lane 0 is %p + %v0_1 (load on the right) and lane 1 is
; %v0_1 + %v0_2 (shared load on the left). Expected output for both prefixes is
; the same as for @vecload_vs_broadcast2: {%p, %v0_1} on the left of the vector
; fadd and the <2 x double> load on the right.
define void @vecload_vs_broadcast3(ptr noalias %from, ptr noalias %to, double %v1, double %v2, i1 %c) {
; CHECK-LABEL: @vecload_vs_broadcast3(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br label [[LP:%.*]]
; CHECK:       lp:
; CHECK-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0
; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP0]]
; CHECK-NEXT:    store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
; CHECK-NEXT:    br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
; CHECK:       ext:
; CHECK-NEXT:    ret void
;
; SSE2-LABEL: @vecload_vs_broadcast3(
; SSE2-NEXT:  entry:
; SSE2-NEXT:    br label [[LP:%.*]]
; SSE2:       lp:
; SSE2-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
; SSE2-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0
; SSE2-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP0]]
; SSE2-NEXT:    store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
; SSE2-NEXT:    br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
; SSE2:       ext:
; SSE2-NEXT:    ret void
;
entry:
  br label %lp

lp:
  %p = phi double [ 1.000000e+00, %lp ], [ 0.000000e+00, %entry ]
  %from_1 = getelementptr double, ptr %from, i64 1
  %v0_1 = load double , ptr %from
  %v0_2 = load double , ptr %from_1
  %v1_1 = fadd double %p, %v0_1 ; lane 0
  %v1_2 = fadd double %v0_1, %v0_2 ; lane 1: reuses %v0_1 on the left
  %to_2 = getelementptr double, ptr %to, i64 1
  store double %v1_1, ptr %to
  store double %v1_2, ptr %to_2
  br i1 %c, label %lp, label %ext

ext:
  ret void
}

; Here the phi is in lane 1 rather than lane 0: lane 0 is %v0_2 + %v0_1 and
; lane 1 is %p + %v0_1. The two prefixes expect different code:
;  * default prefix (corei7-avx): the two scalar loads are kept, the left
;    operand is {%v0_2, %p} built with insertelement and the right operand is a
;    splat of %v0_1.
;  * SSE2 prefix: one <2 x double> load; the left operand is that load with %p
;    inserted into lane 1, the right operand is the load with its lanes
;    reversed.
define void @shuffle_nodes_match1(ptr noalias %from, ptr noalias %to, double %v1, double %v2, i1 %c) {
; CHECK-LABEL: @shuffle_nodes_match1(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br label [[LP:%.*]]
; CHECK:       lp:
; CHECK-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
; CHECK-NEXT:    [[FROM_1:%.*]] = getelementptr i8, ptr [[FROM:%.*]], i32 8
; CHECK-NEXT:    [[V0_1:%.*]] = load double, ptr [[FROM]], align 4
; CHECK-NEXT:    [[V0_2:%.*]] = load double, ptr [[FROM_1]], align 4
; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V0_2]], i64 0
; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1
; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V0_1]], i64 0
; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
; CHECK-NEXT:    store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4
; CHECK-NEXT:    br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
; CHECK:       ext:
; CHECK-NEXT:    ret void
;
; SSE2-LABEL: @shuffle_nodes_match1(
; SSE2-NEXT:  entry:
; SSE2-NEXT:    br label [[LP:%.*]]
; SSE2:       lp:
; SSE2-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
; SSE2-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1
; SSE2-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP1]]
; SSE2-NEXT:    store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
; SSE2-NEXT:    br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
; SSE2:       ext:
; SSE2-NEXT:    ret void
;
entry:
  br label %lp

lp:
  %p = phi double [ 1.000000e+00, %lp ], [ 0.000000e+00, %entry ]
  %from_1 = getelementptr double, ptr %from, i64 1
  %v0_1 = load double , ptr %from
  %v0_2 = load double , ptr %from_1
  %v1_1 = fadd double %v0_2, %v0_1 ; lane 0: both operands are loads
  %v1_2 = fadd double %p, %v0_1 ; lane 1: phi on the left
  %to_2 = getelementptr double, ptr %to, i64 1
  store double %v1_1, ptr %to
  store double %v1_2, ptr %to_2
  br i1 %c, label %lp, label %ext

ext:
  ret void
}

; Like @shuffle_nodes_match1 but lane 0 is written %v0_1 + %v0_2; lane 1 is
; still %p + %v0_1. Expected output is identical for both prefixes: one
; <2 x double> load, left operand = the load with %p inserted into lane 1, right
; operand = the load with its lanes reversed.
define void @vecload_vs_broadcast4(ptr noalias %from, ptr noalias %to, double %v1, double %v2, i1 %c) {
; CHECK-LABEL: @vecload_vs_broadcast4(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br label [[LP:%.*]]
; CHECK:       lp:
; CHECK-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1
; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP1]]
; CHECK-NEXT:    store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
; CHECK-NEXT:    br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
; CHECK:       ext:
; CHECK-NEXT:    ret void
;
; SSE2-LABEL: @vecload_vs_broadcast4(
; SSE2-NEXT:  entry:
; SSE2-NEXT:    br label [[LP:%.*]]
; SSE2:       lp:
; SSE2-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
; SSE2-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1
; SSE2-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP1]]
; SSE2-NEXT:    store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
; SSE2-NEXT:    br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
; SSE2:       ext:
; SSE2-NEXT:    ret void
;
entry:
  br label %lp

lp:
  %p = phi double [ 1.000000e+00, %lp ], [ 0.000000e+00, %entry ]
  %from_1 = getelementptr double, ptr %from, i64 1
  %v0_1 = load double , ptr %from
  %v0_2 = load double , ptr %from_1
  %v1_1 = fadd double %v0_1, %v0_2 ; lane 0: both operands are loads
  %v1_2 = fadd double %p, %v0_1 ; lane 1: phi on the left
  %to_2 = getelementptr double, ptr %to, i64 1
  store double %v1_1, ptr %to
  store double %v1_2, ptr %to_2
  br i1 %c, label %lp, label %ext

ext:
  ret void
}


; %v0_1 is the left operand of both scalar fadds: lane 0 is %v0_1 + %v0_2 and
; lane 1 is %v0_1 + %p. The two prefixes expect different code:
;  * default prefix (corei7-avx): the two scalar loads are kept, the left
;    operand is a splat of %v0_1 and the right operand is {%v0_2, %p} built with
;    insertelement.
;  * SSE2 prefix: one <2 x double> load; the left operand is the load with its
;    lanes reversed, the right operand is the load with %p inserted into lane 1.
define void @shuffle_nodes_match2(ptr noalias %from, ptr noalias %to, double %v1, double %v2, i1 %c) {
; CHECK-LABEL: @shuffle_nodes_match2(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br label [[LP:%.*]]
; CHECK:       lp:
; CHECK-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
; CHECK-NEXT:    [[FROM_1:%.*]] = getelementptr i8, ptr [[FROM:%.*]], i32 8
; CHECK-NEXT:    [[V0_1:%.*]] = load double, ptr [[FROM]], align 4
; CHECK-NEXT:    [[V0_2:%.*]] = load double, ptr [[FROM_1]], align 4
; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V0_1]], i64 0
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V0_2]], i64 0
; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[P]], i64 1
; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
; CHECK-NEXT:    store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4
; CHECK-NEXT:    br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
; CHECK:       ext:
; CHECK-NEXT:    ret void
;
; SSE2-LABEL: @shuffle_nodes_match2(
; SSE2-NEXT:  entry:
; SSE2-NEXT:    br label [[LP:%.*]]
; SSE2:       lp:
; SSE2-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
; SSE2-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1
; SSE2-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
; SSE2-NEXT:    store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
; SSE2-NEXT:    br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
; SSE2:       ext:
; SSE2-NEXT:    ret void
;
entry:
  br label %lp

lp:
  %p = phi double [ 1.000000e+00, %lp ], [ 0.000000e+00, %entry ]
  %from_1 = getelementptr double, ptr %from, i64 1
  %v0_1 = load double , ptr %from
  %v0_2 = load double , ptr %from_1
  %v1_1 = fadd double %v0_1, %v0_2 ; lane 0: both operands are loads
  %v1_2 = fadd double %v0_1, %p ; lane 1: phi on the right
  %to_2 = getelementptr double, ptr %to, i64 1
  store double %v1_1, ptr %to
  store double %v1_2, ptr %to_2
  br i1 %c, label %lp, label %ext

ext:
  ret void
}

; Make sure we don't scramble operands when we reorder them and destroy
; 'good' source order.

; Array that @good_load_order reads and updates in place.
@a = common global [32000 x float] zeroinitializer, align 16

; Loop that advances %indvars.iv by 5 per iteration and performs five
; multiply/store steps a[i+k] = a[i+k+1] * (value previously loaded for a[i+k]),
; k = 0..4. The value for k = 0 arrives through the phi %1 (a[0] on entry, the
; a[i+5] load of the previous iteration afterwards). Every scalar fmul already
; has the higher-index load on the left and the lower-index one on the right.
; Expected output for both prefixes: the first four steps become one
; <4 x float> load starting at a[i+1], a right operand formed by shifting that
; vector up one lane and inserting the phi value into lane 0, a single
; <4 x float> fmul and one vector store at a[i]; the fifth step stays scalar.
define void @good_load_order() {
; CHECK-LABEL: @good_load_order(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br label [[FOR_COND1_PREHEADER:%.*]]
; CHECK:       for.cond1.preheader:
; CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr @a, align 16
; CHECK-NEXT:    br label [[FOR_BODY3:%.*]]
; CHECK:       for.body3:
; CHECK-NEXT:    [[TMP1:%.*]] = phi float [ [[TMP0]], [[FOR_COND1_PREHEADER]] ], [ [[TMP13:%.*]], [[FOR_BODY3]] ]
; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY3]] ]
; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[INDVARS_IV]] to i32
; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[TMP2]], 1
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP3]]
; CHECK-NEXT:    [[TMP4:%.*]] = trunc i64 [[INDVARS_IV]] to i32
; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP4]]
; CHECK-NEXT:    [[TMP5:%.*]] = trunc i64 [[INDVARS_IV]] to i32
; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP5]], 4
; CHECK-NEXT:    [[ARRAYIDX31:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP6]]
; CHECK-NEXT:    [[TMP7:%.*]] = load float, ptr [[ARRAYIDX31]], align 4
; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <4 x i32> <i32 poison, i32 0, i32 1, i32 2>
; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP1]], i64 0
; CHECK-NEXT:    [[TMP11:%.*]] = fmul <4 x float> [[TMP8]], [[TMP10]]
; CHECK-NEXT:    store <4 x float> [[TMP11]], ptr [[ARRAYIDX5]], align 4
; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 5
; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
; CHECK-NEXT:    [[ARRAYIDX41:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP12]]
; CHECK-NEXT:    [[TMP13]] = load float, ptr [[ARRAYIDX41]], align 4
; CHECK-NEXT:    [[MUL45:%.*]] = fmul float [[TMP13]], [[TMP7]]
; CHECK-NEXT:    store float [[MUL45]], ptr [[ARRAYIDX31]], align 4
; CHECK-NEXT:    [[TMP14:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
; CHECK-NEXT:    [[CMP2:%.*]] = icmp slt i32 [[TMP14]], 31995
; CHECK-NEXT:    br i1 [[CMP2]], label [[FOR_BODY3]], label [[FOR_END:%.*]]
; CHECK:       for.end:
; CHECK-NEXT:    ret void
;
; SSE2-LABEL: @good_load_order(
; SSE2-NEXT:  entry:
; SSE2-NEXT:    br label [[FOR_COND1_PREHEADER:%.*]]
; SSE2:       for.cond1.preheader:
; SSE2-NEXT:    [[TMP0:%.*]] = load float, ptr @a, align 16
; SSE2-NEXT:    br label [[FOR_BODY3:%.*]]
; SSE2:       for.body3:
; SSE2-NEXT:    [[TMP1:%.*]] = phi float [ [[TMP0]], [[FOR_COND1_PREHEADER]] ], [ [[TMP13:%.*]], [[FOR_BODY3]] ]
; SSE2-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY3]] ]
; SSE2-NEXT:    [[TMP2:%.*]] = trunc i64 [[INDVARS_IV]] to i32
; SSE2-NEXT:    [[TMP3:%.*]] = add i32 [[TMP2]], 1
; SSE2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP3]]
; SSE2-NEXT:    [[TMP4:%.*]] = trunc i64 [[INDVARS_IV]] to i32
; SSE2-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP4]]
; SSE2-NEXT:    [[TMP5:%.*]] = trunc i64 [[INDVARS_IV]] to i32
; SSE2-NEXT:    [[TMP6:%.*]] = add i32 [[TMP5]], 4
; SSE2-NEXT:    [[ARRAYIDX31:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP6]]
; SSE2-NEXT:    [[TMP7:%.*]] = load float, ptr [[ARRAYIDX31]], align 4
; SSE2-NEXT:    [[TMP8:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4
; SSE2-NEXT:    [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <4 x i32> <i32 poison, i32 0, i32 1, i32 2>
; SSE2-NEXT:    [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP1]], i64 0
; SSE2-NEXT:    [[TMP11:%.*]] = fmul <4 x float> [[TMP8]], [[TMP10]]
; SSE2-NEXT:    store <4 x float> [[TMP11]], ptr [[ARRAYIDX5]], align 4
; SSE2-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 5
; SSE2-NEXT:    [[TMP12:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
; SSE2-NEXT:    [[ARRAYIDX41:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP12]]
; SSE2-NEXT:    [[TMP13]] = load float, ptr [[ARRAYIDX41]], align 4
; SSE2-NEXT:    [[MUL45:%.*]] = fmul float [[TMP13]], [[TMP7]]
; SSE2-NEXT:    store float [[MUL45]], ptr [[ARRAYIDX31]], align 4
; SSE2-NEXT:    [[TMP14:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
; SSE2-NEXT:    [[CMP2:%.*]] = icmp slt i32 [[TMP14]], 31995
; SSE2-NEXT:    br i1 [[CMP2]], label [[FOR_BODY3]], label [[FOR_END:%.*]]
; SSE2:       for.end:
; SSE2-NEXT:    ret void
;
entry:
  br label %for.cond1.preheader

for.cond1.preheader:
  %0 = load float, ptr @a, align 16
  br label %for.body3

for.body3:
  ; %1 carries the previously loaded element into the first multiply.
  %1 = phi float [ %0, %for.cond1.preheader ], [ %10, %for.body3 ]
  %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ]
  ; Step 0: a[i] = a[i+1] * carried value.
  %2 = add nsw i64 %indvars.iv, 1
  %arrayidx = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %2
  %3 = load float, ptr %arrayidx, align 4
  %arrayidx5 = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %indvars.iv
  %mul6 = fmul float %3, %1
  store float %mul6, ptr %arrayidx5, align 4
  ; Step 1: a[i+1] = a[i+2] * a[i+1].
  %4 = add nsw i64 %indvars.iv, 2
  %arrayidx11 = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %4
  %5 = load float, ptr %arrayidx11, align 4
  %mul15 = fmul float %5, %3
  store float %mul15, ptr %arrayidx, align 4
  ; Step 2: a[i+2] = a[i+3] * a[i+2].
  %6 = add nsw i64 %indvars.iv, 3
  %arrayidx21 = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %6
  %7 = load float, ptr %arrayidx21, align 4
  %mul25 = fmul float %7, %5
  store float %mul25, ptr %arrayidx11, align 4
  ; Step 3: a[i+3] = a[i+4] * a[i+3].
  %8 = add nsw i64 %indvars.iv, 4
  %arrayidx31 = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %8
  %9 = load float, ptr %arrayidx31, align 4
  %mul35 = fmul float %9, %7
  store float %mul35, ptr %arrayidx21, align 4
  ; Step 4: a[i+4] = a[i+5] * a[i+4]; %10 also feeds the phi %1 on the back edge.
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 5
  %arrayidx41 = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %indvars.iv.next
  %10 = load float, ptr %arrayidx41, align 4
  %mul45 = fmul float %10, %9
  store float %mul45, ptr %arrayidx31, align 4
  %11 = trunc i64 %indvars.iv.next to i32
  %cmp2 = icmp slt i32 %11, 31995
  br i1 %cmp2, label %for.body3, label %for.end

for.end:
  ret void
}

; Check vectorization of following code for double data type-
; c[0] = a[0]+b[0];
; c[1] = b[1]+a[1]; // swapped b[1] and a[1]
;
; Expected output for both prefixes: a <2 x double> load from %b, a
; <2 x double> load from %a, one vector fadd of the two and one vector store
; to %c.

define void @load_reorder_double(ptr nocapture %c, ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b){
; CHECK-LABEL: @load_reorder_double(
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B:%.*]], align 4
; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[A:%.*]], align 4
; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
; CHECK-NEXT:    store <2 x double> [[TMP3]], ptr [[C:%.*]], align 4
; CHECK-NEXT:    ret void
;
; SSE2-LABEL: @load_reorder_double(
; SSE2-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B:%.*]], align 4
; SSE2-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[A:%.*]], align 4
; SSE2-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
; SSE2-NEXT:    store <2 x double> [[TMP3]], ptr [[C:%.*]], align 4
; SSE2-NEXT:    ret void
;
  ; Lane 0: a[0] + b[0].
  %1 = load double, ptr %a
  %2 = load double, ptr %b
  %3 = fadd double %1, %2
  store double %3, ptr %c
  ; Lane 1: b[1] + a[1] (operands in the opposite order to lane 0).
  %4 = getelementptr inbounds double, ptr %b, i64 1
  %5 = load double, ptr %4
  %6 = getelementptr inbounds double, ptr %a, i64 1
  %7 = load double, ptr %6
  %8 = fadd double %5, %7
  %9 = getelementptr inbounds double, ptr %c, i64 1
  store double %8, ptr %9
  ret void
}

; Check vectorization of following
; code for float data type-
; c[0] = a[0]+b[0];
; c[1] = b[1]+a[1]; // swapped b[1] and a[1]
; c[2] = a[2]+b[2];
; c[3] = a[3]+b[3];
;
; Expected output for both prefixes: a <4 x float> load from %a, a <4 x float>
; load from %b, one vector fadd with the %a vector on the left, and one vector
; store to %c.

define void @load_reorder_float(ptr nocapture %c, ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b){
; CHECK-LABEL: @load_reorder_float(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[A:%.*]], align 4
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4
; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
; CHECK-NEXT:    store <4 x float> [[TMP3]], ptr [[C:%.*]], align 4
; CHECK-NEXT:    ret void
;
; SSE2-LABEL: @load_reorder_float(
; SSE2-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[A:%.*]], align 4
; SSE2-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4
; SSE2-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
; SSE2-NEXT:    store <4 x float> [[TMP3]], ptr [[C:%.*]], align 4
; SSE2-NEXT:    ret void
;
  ; Lane 0: a[0] + b[0].
  %1 = load float, ptr %a
  %2 = load float, ptr %b
  %3 = fadd float %1, %2
  store float %3, ptr %c
  ; Lane 1: b[1] + a[1] (the only lane with b on the left).
  %4 = getelementptr inbounds float, ptr %b, i64 1
  %5 = load float, ptr %4
  %6 = getelementptr inbounds float, ptr %a, i64 1
  %7 = load float, ptr %6
  %8 = fadd float %5, %7
  %9 = getelementptr inbounds float, ptr %c, i64 1
  store float %8, ptr %9
  ; Lane 2: a[2] + b[2].
  %10 = getelementptr inbounds float, ptr %a, i64 2
  %11 = load float, ptr %10
  %12 = getelementptr inbounds float, ptr %b, i64 2
  %13 = load float, ptr %12
  %14 = fadd float %11, %13
  %15 = getelementptr inbounds float, ptr %c, i64 2
  store float %14, ptr %15
  ; Lane 3: a[3] + b[3].
  %16 = getelementptr inbounds float, ptr %a, i64 3
  %17 = load float, ptr %16
  %18 = getelementptr inbounds float, ptr %b, i64 3
  %19 = load float, ptr %18
  %20 = fadd float %17, %19
  %21 = getelementptr inbounds float, ptr %c, i64 3
  store float %20, ptr %21
  ret void
}

; Check we properly reorder the below code so that it gets
; vectorized optimally-
; a[0] = (b[0]+c[0])+d[0];
; a[1] = d[1]+(b[1]+c[1]);
; a[2] = (b[2]+c[2])+d[2];
; a[3] = (b[3]+c[3])+d[3];
;
; Lane 1 is the only lane whose outer add has the d element on the left and the
; inner (b+c) sum on the right. Expected output for both prefixes: <4 x float>
; loads of %b and %c added together, a <4 x float> load of %d, one vector fadd
; with the %d vector on the left and the (b+c) vector on the right, and one
; vector store to %a.

define void @opcode_reorder(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, ptr noalias nocapture readonly %c,ptr noalias nocapture readonly %d) {
; CHECK-LABEL: @opcode_reorder(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[C:%.*]], align 4
; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x float>, ptr [[D:%.*]], align 4
; CHECK-NEXT:    [[TMP5:%.*]] = fadd <4 x float> [[TMP4]], [[TMP3]]
; CHECK-NEXT:    store <4 x float> [[TMP5]], ptr [[A:%.*]], align 4
; CHECK-NEXT:    ret void
;
; SSE2-LABEL: @opcode_reorder(
; SSE2-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4
; SSE2-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[C:%.*]], align 4
; SSE2-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
; SSE2-NEXT:    [[TMP4:%.*]] = load <4 x float>, ptr [[D:%.*]], align 4
; SSE2-NEXT:    [[TMP5:%.*]] = fadd <4 x float> [[TMP4]], [[TMP3]]
; SSE2-NEXT:    store <4 x float> [[TMP5]], ptr [[A:%.*]], align 4
; SSE2-NEXT:    ret void
;
  ; Lane 0: (b[0] + c[0]) + d[0].
  %1 = load float, ptr %b
  %2 = load float, ptr %c
  %3 = fadd float %1, %2
  %4 = load float, ptr %d
  %5 = fadd float %3, %4
  store float %5, ptr %a
  ; Lane 1: d[1] + (b[1] + c[1]) - outer operands swapped relative to the rest.
  %6 = getelementptr inbounds float, ptr %d, i64 1
  %7 = load float, ptr %6
  %8 = getelementptr inbounds float, ptr %b, i64 1
  %9 = load float, ptr %8
  %10 = getelementptr inbounds float, ptr %c, i64 1
  %11 = load float, ptr %10
  %12 = fadd float %9, %11
  %13 = fadd float %7, %12
  %14 = getelementptr inbounds float, ptr %a, i64 1
  store float %13, ptr %14
  ; Lane 2: (b[2] + c[2]) + d[2].
  %15 = getelementptr inbounds float, ptr %b, i64 2
  %16 = load float, ptr %15
  %17 = getelementptr inbounds float, ptr %c, i64 2
  %18 = load float, ptr %17
  %19 = fadd float %16, %18
  %20 = getelementptr inbounds float, ptr %d, i64 2
  %21 = load float, ptr %20
  %22 = fadd float %19, %21
  %23 = getelementptr inbounds float, ptr %a, i64 2
  store float %22, ptr %23
  ; Lane 3: (b[3] + c[3]) + d[3].
  %24 = getelementptr inbounds float, ptr %b, i64 3
  %25 = load float, ptr %24
  %26 = getelementptr inbounds float, ptr %c, i64 3
  %27 = load float, ptr %26
  %28 = fadd float %25, %27
  %29 = getelementptr inbounds float, ptr %d, i64 3
  %30 = load float, ptr %29
  %31 = fadd float %28, %30
  %32 = getelementptr inbounds float, ptr %a, i64 3
  store float %31, ptr %32
  ret void
}