; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -passes=loop-vectorize -mtriple=x86_64-apple-darwin %s | FileCheck %s --check-prefixes=CHECK,SSE
; RUN: opt -S -passes=loop-vectorize -mtriple=x86_64-apple-darwin -mattr=+avx %s | FileCheck %s --check-prefixes=CHECK,AVX

; Two mostly identical functions. The only difference is the presence of
; fast-math flags on the second. The loop is a pretty simple reduction:

;   for (int i = 0; i < 32; ++i)
;     if (arr[i] != 42)
;       tot += arr[i];

; No fast-math flags on the fcmp/fadd: the conditional fadd reduction must
; not be vectorized, and the CHECK lines below (shared by both RUN lines)
; show the loop emitted unchanged in scalar form.
define double @sumIfScalar(ptr nocapture readonly %arr) {
; CHECK-LABEL: @sumIfScalar(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br label [[LOOP:%.*]]
; CHECK:       loop:
; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[I_NEXT:%.*]], [[NEXT_ITER:%.*]] ]
; CHECK-NEXT:    [[TOT:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TOT_NEXT:%.*]], [[NEXT_ITER]] ]
; CHECK-NEXT:    [[ADDR:%.*]] = getelementptr double, ptr [[ARR:%.*]], i32 [[I]]
; CHECK-NEXT:    [[NEXTVAL:%.*]] = load double, ptr [[ADDR]], align 8
; CHECK-NEXT:    [[TST:%.*]] = fcmp une double [[NEXTVAL]], 4.200000e+01
; CHECK-NEXT:    br i1 [[TST]], label [[DO_ADD:%.*]], label [[NO_ADD:%.*]]
; CHECK:       do.add:
; CHECK-NEXT:    [[TOT_NEW:%.*]] = fadd double [[TOT]], [[NEXTVAL]]
; CHECK-NEXT:    br label [[NEXT_ITER]]
; CHECK:       no.add:
; CHECK-NEXT:    br label [[NEXT_ITER]]
; CHECK:       next.iter:
; CHECK-NEXT:    [[TOT_NEXT]] = phi double [ [[TOT]], [[NO_ADD]] ], [ [[TOT_NEW]], [[DO_ADD]] ]
; CHECK-NEXT:    [[I_NEXT]] = add i32 [[I]], 1
; CHECK-NEXT:    [[AGAIN:%.*]] = icmp ult i32 [[I_NEXT]], 32
; CHECK-NEXT:    br i1 [[AGAIN]], label [[LOOP]], label [[DONE:%.*]]
; CHECK:       done:
; CHECK-NEXT:    [[TOT_NEXT_LCSSA:%.*]] = phi double [ [[TOT_NEXT]], [[NEXT_ITER]] ]
; CHECK-NEXT:    ret double [[TOT_NEXT_LCSSA]]
;
entry:
  br label %loop

loop:
  %i = phi i32 [0, %entry], [%i.next, %next.iter]
  %tot = phi double [0.0, %entry], [%tot.next, %next.iter]

  %addr = getelementptr double, ptr %arr, i32 %i
  %nextval = load double, ptr %addr

  %tst = fcmp une double %nextval, 42.0
  br i1 %tst, label %do.add, label %no.add

do.add:
  %tot.new = fadd double %tot, %nextval
  br label %next.iter

no.add:
  br label %next.iter

next.iter:
  %tot.next = phi double [%tot, %no.add], [%tot.new, %do.add]
  %i.next = add i32 %i, 1
  %again = icmp ult i32 %i.next, 32
  br i1 %again, label %loop, label %done

done:
  ret double %tot.next
}

; Same loop body, but the fcmp/fadd carry the `fast` flag, so the reduction
; is vectorized: the SSE checks show <2 x double> with two interleaved lanes
; (induction step 4); the AVX checks show <4 x double> with four interleaved
; lanes (induction step 16). The predicated add becomes a select between the
; updated and the previous partial sum, reduced in middle.block via
; llvm.vector.reduce.fadd.
define double @sumIfVector(ptr nocapture readonly %arr) {
; SSE-LABEL: @sumIfVector(
; SSE-NEXT:  entry:
; SSE-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; SSE:       vector.ph:
; SSE-NEXT:    br label [[VECTOR_BODY:%.*]]
; SSE:       vector.body:
; SSE-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; SSE-NEXT:    [[VEC_PHI:%.*]] = phi <2 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[VECTOR_BODY]] ]
; SSE-NEXT:    [[VEC_PHI1:%.*]] = phi <2 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI3:%.*]], [[VECTOR_BODY]] ]
; SSE-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
; SSE-NEXT:    [[TMP2:%.*]] = getelementptr double, ptr [[ARR:%.*]], i32 [[TMP0]]
; SSE-NEXT:    [[TMP4:%.*]] = getelementptr double, ptr [[TMP2]], i32 0
; SSE-NEXT:    [[TMP5:%.*]] = getelementptr double, ptr [[TMP2]], i32 2
; SSE-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP4]], align 8
; SSE-NEXT:    [[WIDE_LOAD2:%.*]] = load <2 x double>, ptr [[TMP5]], align 8
; SSE-NEXT:    [[TMP6:%.*]] = fcmp fast une <2 x double> [[WIDE_LOAD]], splat (double 4.200000e+01)
; SSE-NEXT:    [[TMP7:%.*]] = fcmp fast une <2 x double> [[WIDE_LOAD2]], splat (double 4.200000e+01)
; SSE-NEXT:    [[TMP8:%.*]] = fadd fast <2 x double> [[VEC_PHI]], [[WIDE_LOAD]]
; SSE-NEXT:    [[TMP9:%.*]] = fadd fast <2 x double> [[VEC_PHI1]], [[WIDE_LOAD2]]
; SSE-NEXT:    [[PREDPHI]] = select <2 x i1> [[TMP6]], <2 x double> [[TMP8]], <2 x double> [[VEC_PHI]]
; SSE-NEXT:    [[PREDPHI3]] = select <2 x i1> [[TMP7]], <2 x double> [[TMP9]], <2 x double> [[VEC_PHI1]]
; SSE-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; SSE-NEXT:    [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32
; SSE-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; SSE:       middle.block:
; SSE-NEXT:    [[BIN_RDX:%.*]] = fadd fast <2 x double> [[PREDPHI3]], [[PREDPHI]]
; SSE-NEXT:    [[TMP11:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> [[BIN_RDX]])
; SSE-NEXT:    br i1 true, label [[DONE:%.*]], label [[SCALAR_PH]]
; SSE:       scalar.ph:
; SSE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 32, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; SSE-NEXT:    [[BC_MERGE_RDX:%.*]] = phi double [ [[TMP11]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ]
; SSE-NEXT:    br label [[LOOP:%.*]]
; SSE:       loop:
; SSE-NEXT:    [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[NEXT_ITER:%.*]] ]
; SSE-NEXT:    [[TOT:%.*]] = phi double [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TOT_NEXT:%.*]], [[NEXT_ITER]] ]
; SSE-NEXT:    [[ADDR:%.*]] = getelementptr double, ptr [[ARR]], i32 [[I]]
; SSE-NEXT:    [[NEXTVAL:%.*]] = load double, ptr [[ADDR]], align 8
; SSE-NEXT:    [[TST:%.*]] = fcmp fast une double [[NEXTVAL]], 4.200000e+01
; SSE-NEXT:    br i1 [[TST]], label [[DO_ADD:%.*]], label [[NO_ADD:%.*]]
; SSE:       do.add:
; SSE-NEXT:    [[TOT_NEW:%.*]] = fadd fast double [[TOT]], [[NEXTVAL]]
; SSE-NEXT:    br label [[NEXT_ITER]]
; SSE:       no.add:
; SSE-NEXT:    br label [[NEXT_ITER]]
; SSE:       next.iter:
; SSE-NEXT:    [[TOT_NEXT]] = phi double [ [[TOT]], [[NO_ADD]] ], [ [[TOT_NEW]], [[DO_ADD]] ]
; SSE-NEXT:    [[I_NEXT]] = add i32 [[I]], 1
; SSE-NEXT:    [[AGAIN:%.*]] = icmp ult i32 [[I_NEXT]], 32
; SSE-NEXT:    br i1 [[AGAIN]], label [[LOOP]], label [[DONE]], !llvm.loop [[LOOP3:![0-9]+]]
; SSE:       done:
; SSE-NEXT:    [[TOT_NEXT_LCSSA:%.*]] = phi double [ [[TOT_NEXT]], [[NEXT_ITER]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
; SSE-NEXT:    ret double [[TOT_NEXT_LCSSA]]
;
; AVX-LABEL: @sumIfVector(
; AVX-NEXT:  entry:
; AVX-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; AVX:       vector.ph:
; AVX-NEXT:    br label [[VECTOR_BODY:%.*]]
; AVX:       vector.body:
; AVX-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX-NEXT:    [[VEC_PHI:%.*]] = phi <4 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[VECTOR_BODY]] ]
; AVX-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI7:%.*]], [[VECTOR_BODY]] ]
; AVX-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI8:%.*]], [[VECTOR_BODY]] ]
; AVX-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI9:%.*]], [[VECTOR_BODY]] ]
; AVX-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
; AVX-NEXT:    [[TMP4:%.*]] = getelementptr double, ptr [[ARR:%.*]], i32 [[TMP0]]
; AVX-NEXT:    [[TMP8:%.*]] = getelementptr double, ptr [[TMP4]], i32 0
; AVX-NEXT:    [[TMP9:%.*]] = getelementptr double, ptr [[TMP4]], i32 4
; AVX-NEXT:    [[TMP10:%.*]] = getelementptr double, ptr [[TMP4]], i32 8
; AVX-NEXT:    [[TMP11:%.*]] = getelementptr double, ptr [[TMP4]], i32 12
; AVX-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x double>, ptr [[TMP8]], align 8
; AVX-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x double>, ptr [[TMP9]], align 8
; AVX-NEXT:    [[WIDE_LOAD5:%.*]] = load <4 x double>, ptr [[TMP10]], align 8
; AVX-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x double>, ptr [[TMP11]], align 8
; AVX-NEXT:    [[TMP12:%.*]] = fcmp fast une <4 x double> [[WIDE_LOAD]], splat (double 4.200000e+01)
; AVX-NEXT:    [[TMP13:%.*]] = fcmp fast une <4 x double> [[WIDE_LOAD4]], splat (double 4.200000e+01)
; AVX-NEXT:    [[TMP14:%.*]] = fcmp fast une <4 x double> [[WIDE_LOAD5]], splat (double 4.200000e+01)
; AVX-NEXT:    [[TMP15:%.*]] = fcmp fast une <4 x double> [[WIDE_LOAD6]], splat (double 4.200000e+01)
; AVX-NEXT:    [[TMP16:%.*]] = fadd fast <4 x double> [[VEC_PHI]], [[WIDE_LOAD]]
; AVX-NEXT:    [[TMP17:%.*]] = fadd fast <4 x double> [[VEC_PHI1]], [[WIDE_LOAD4]]
; AVX-NEXT:    [[TMP18:%.*]] = fadd fast <4 x double> [[VEC_PHI2]], [[WIDE_LOAD5]]
; AVX-NEXT:    [[TMP19:%.*]] = fadd fast <4 x double> [[VEC_PHI3]], [[WIDE_LOAD6]]
; AVX-NEXT:    [[PREDPHI]] = select <4 x i1> [[TMP12]], <4 x double> [[TMP16]], <4 x double> [[VEC_PHI]]
; AVX-NEXT:    [[PREDPHI7]] = select <4 x i1> [[TMP13]], <4 x double> [[TMP17]], <4 x double> [[VEC_PHI1]]
; AVX-NEXT:    [[PREDPHI8]] = select <4 x i1> [[TMP14]], <4 x double> [[TMP18]], <4 x double> [[VEC_PHI2]]
; AVX-NEXT:    [[PREDPHI9]] = select <4 x i1> [[TMP15]], <4 x double> [[TMP19]], <4 x double> [[VEC_PHI3]]
; AVX-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
; AVX-NEXT:    [[TMP20:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32
; AVX-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; AVX:       middle.block:
; AVX-NEXT:    [[BIN_RDX:%.*]] = fadd fast <4 x double> [[PREDPHI7]], [[PREDPHI]]
; AVX-NEXT:    [[BIN_RDX10:%.*]] = fadd fast <4 x double> [[PREDPHI8]], [[BIN_RDX]]
; AVX-NEXT:    [[BIN_RDX11:%.*]] = fadd fast <4 x double> [[PREDPHI9]], [[BIN_RDX10]]
; AVX-NEXT:    [[TMP21:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[BIN_RDX11]])
; AVX-NEXT:    br i1 true, label [[DONE:%.*]], label [[SCALAR_PH]]
; AVX:       scalar.ph:
; AVX-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 32, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; AVX-NEXT:    [[BC_MERGE_RDX:%.*]] = phi double [ [[TMP21]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ]
; AVX-NEXT:    br label [[LOOP:%.*]]
; AVX:       loop:
; AVX-NEXT:    [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[NEXT_ITER:%.*]] ]
; AVX-NEXT:    [[TOT:%.*]] = phi double [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TOT_NEXT:%.*]], [[NEXT_ITER]] ]
; AVX-NEXT:    [[ADDR:%.*]] = getelementptr double, ptr [[ARR]], i32 [[I]]
; AVX-NEXT:    [[NEXTVAL:%.*]] = load double, ptr [[ADDR]], align 8
; AVX-NEXT:    [[TST:%.*]] = fcmp fast une double [[NEXTVAL]], 4.200000e+01
; AVX-NEXT:    br i1 [[TST]], label [[DO_ADD:%.*]], label [[NO_ADD:%.*]]
; AVX:       do.add:
; AVX-NEXT:    [[TOT_NEW:%.*]] = fadd fast double [[TOT]], [[NEXTVAL]]
; AVX-NEXT:    br label [[NEXT_ITER]]
; AVX:       no.add:
; AVX-NEXT:    br label [[NEXT_ITER]]
; AVX:       next.iter:
; AVX-NEXT:    [[TOT_NEXT]] = phi double [ [[TOT]], [[NO_ADD]] ], [ [[TOT_NEW]], [[DO_ADD]] ]
; AVX-NEXT:    [[I_NEXT]] = add i32 [[I]], 1
; AVX-NEXT:    [[AGAIN:%.*]] = icmp ult i32 [[I_NEXT]], 32
; AVX-NEXT:    br i1 [[AGAIN]], label [[LOOP]], label [[DONE]], !llvm.loop [[LOOP3:![0-9]+]]
; AVX:       done:
; AVX-NEXT:    [[TOT_NEXT_LCSSA:%.*]] = phi double [ [[TOT_NEXT]], [[NEXT_ITER]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ]
; AVX-NEXT:    ret double [[TOT_NEXT_LCSSA]]
;
entry:
  br label %loop

loop:
  %i = phi i32 [0, %entry], [%i.next, %next.iter]
  %tot = phi double [0.0, %entry], [%tot.next, %next.iter]

  %addr = getelementptr double, ptr %arr, i32 %i
  %nextval = load double, ptr %addr

  %tst = fcmp fast une double %nextval, 42.0
  br i1 %tst, label %do.add, label %no.add

do.add:
  %tot.new = fadd fast double %tot, %nextval
  br label %next.iter

no.add:
  br label %next.iter

next.iter:
  %tot.next = phi double [%tot, %no.add], [%tot.new, %do.add]
  %i.next = add i32 %i, 1
  %again = icmp ult i32 %i.next, 32
  br i1 %again, label %loop, label %done

done:
  ret double %tot.next
}