; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: %if x86-registered-target %{ opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-apple-macosx10.9.0 | FileCheck %s --check-prefixes=CHECK,CHECK-X86 %}
; RUN: %if aarch64-registered-target %{ opt < %s -passes=slp-vectorizer -S -mtriple=aarch64-unknown-linux-gnu | FileCheck %s --check-prefixes=CHECK-AARCH64 %}

; The un-suffixed CHECK assertions in this file are enabled on the x86_64 run.
; Without the common prefix in that run's prefix list no run selected them, so
; every function except @bar went unverified.
; NOTE(review): the AArch64 run still verifies only the CHECK-AARCH64
; expectations. Its output for the other functions is not recorded in this
; file; regenerate with utils/update_test_checks.py before sharing the common
; prefix with that run.

; Arrays operated on by the tests below: @A/@B hold doubles, @C/@D hold floats.
@A = common global [2000 x double] zeroinitializer, align 16
@B = common global [2000 x double] zeroinitializer, align 16
@C = common global [2000 x float] zeroinitializer, align 16
@D = common global [2000 x float] zeroinitializer, align 16

; Computes A[3*u + k] += B[3*u + k] for k = 0, 1, 2. The expectations require
; lanes 0 and 1 to be combined into <2 x double> load/fadd/store operations
; while lane 2 stays scalar.
; Function Attrs: nounwind ssp uwtable
define void @foo_3double(i32 %u) #0 {
; CHECK-LABEL: @foo_3double(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[U_ADDR:%.*]] = alloca i32, align 4
; CHECK-NEXT:    store i32 [[U:%.*]], ptr [[U_ADDR]], align 4
; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[U]], 3
; CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[MUL]] to i64
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2000 x double], ptr @A, i32 0, i64 [[IDXPROM]]
; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [2000 x double], ptr @B, i32 0, i64 [[IDXPROM]]
; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 8
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[ARRAYIDX4]], align 8
; CHECK-NEXT:    [[TMP2:%.*]] = fadd <2 x double> [[TMP0]], [[TMP1]]
; CHECK-NEXT:    store <2 x double> [[TMP2]], ptr [[ARRAYIDX]], align 8
; CHECK-NEXT:    [[ADD24:%.*]] = add nsw i32 [[MUL]], 2
; CHECK-NEXT:    [[IDXPROM25:%.*]] = sext i32 [[ADD24]] to i64
; CHECK-NEXT:    [[ARRAYIDX26:%.*]] = getelementptr inbounds [2000 x double], ptr @A, i32 0, i64 [[IDXPROM25]]
; CHECK-NEXT:    [[TMP3:%.*]] = load double, ptr [[ARRAYIDX26]], align 8
; CHECK-NEXT:    [[ARRAYIDX30:%.*]] = getelementptr inbounds [2000 x double], ptr @B, i32 0, i64 [[IDXPROM25]]
; CHECK-NEXT:    [[TMP4:%.*]] = load double, ptr [[ARRAYIDX30]], align 8
; CHECK-NEXT:    [[ADD31:%.*]] = fadd double [[TMP3]], [[TMP4]]
; CHECK-NEXT:    store double [[ADD31]], ptr [[ARRAYIDX26]], align 8
; CHECK-NEXT:    ret void
;
entry:
  %u.addr = alloca i32, align 4
  store i32 %u, ptr %u.addr, align 4
  %mul = mul nsw i32 %u, 3
  %idxprom = sext i32 %mul to i64
  %arrayidx = getelementptr inbounds [2000 x double], ptr @A, i32 0, i64 %idxprom
  %0 = load double, ptr %arrayidx, align 8
  %arrayidx4 = getelementptr inbounds [2000 x double], ptr @B, i32 0, i64 %idxprom
  %1 = load double, ptr %arrayidx4, align 8
  %add5 = fadd double %0, %1
  store double %add5, ptr %arrayidx, align 8
  %add11 = add nsw i32 %mul, 1
  %idxprom12 = sext i32 %add11 to i64
  %arrayidx13 = getelementptr inbounds [2000 x double], ptr @A, i32 0, i64 %idxprom12
  %2 = load double, ptr %arrayidx13, align 8
  %arrayidx17 = getelementptr inbounds [2000 x double], ptr @B, i32 0, i64 %idxprom12
  %3 = load double, ptr %arrayidx17, align 8
  %add18 = fadd double %2, %3
  store double %add18, ptr %arrayidx13, align 8
  %add24 = add nsw i32 %mul, 2
  %idxprom25 = sext i32 %add24 to i64
  %arrayidx26 = getelementptr inbounds [2000 x double], ptr @A, i32 0, i64 %idxprom25
  %4 = load double, ptr %arrayidx26, align 8
  %arrayidx30 = getelementptr inbounds [2000 x double], ptr @B, i32 0, i64 %idxprom25
  %5 = load double, ptr %arrayidx30, align 8
  %add31 = fadd double %4, %5
  store double %add31, ptr %arrayidx26, align 8
  ret void
}

; SCEV should be able to tell that accesses A[C2*i + C1], A[C2*i + C1 + 1], ...
; (same C2*i term, constant offsets one apart) are consecutive, if C2 is a
; power of 2, and C2 > C1 > 0.
; Thus, the following code should be vectorized.
; Computes A[2*u + k] += B[2*u + k] for k = 0, 1. The expectations require the
; two scalar lanes to become a single <2 x double> load/fadd/store sequence.
; Function Attrs: nounwind ssp uwtable
define void @foo_2double(i32 %u) #0 {
; CHECK-LABEL: @foo_2double(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[U_ADDR:%.*]] = alloca i32, align 4
; CHECK-NEXT:    store i32 [[U:%.*]], ptr [[U_ADDR]], align 4
; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[U]], 2
; CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[MUL]] to i64
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2000 x double], ptr @A, i32 0, i64 [[IDXPROM]]
; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [2000 x double], ptr @B, i32 0, i64 [[IDXPROM]]
; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 8
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[ARRAYIDX4]], align 8
; CHECK-NEXT:    [[TMP2:%.*]] = fadd <2 x double> [[TMP0]], [[TMP1]]
; CHECK-NEXT:    store <2 x double> [[TMP2]], ptr [[ARRAYIDX]], align 8
; CHECK-NEXT:    ret void
;
entry:
  %u.addr = alloca i32, align 4
  store i32 %u, ptr %u.addr, align 4
  %mul = mul nsw i32 %u, 2
  %idxprom = sext i32 %mul to i64
  %arrayidx = getelementptr inbounds [2000 x double], ptr @A, i32 0, i64 %idxprom
  %0 = load double, ptr %arrayidx, align 8
  %arrayidx4 = getelementptr inbounds [2000 x double], ptr @B, i32 0, i64 %idxprom
  %1 = load double, ptr %arrayidx4, align 8
  %add5 = fadd double %0, %1
  store double %add5, ptr %arrayidx, align 8
  %add11 = add nsw i32 %mul, 1
  %idxprom12 = sext i32 %add11 to i64
  %arrayidx13 = getelementptr inbounds [2000 x double], ptr @A, i32 0, i64 %idxprom12
  %2 = load double, ptr %arrayidx13, align 8
  %arrayidx17 = getelementptr inbounds [2000 x double], ptr @B, i32 0, i64 %idxprom12
  %3 = load double, ptr %arrayidx17, align 8
  %add18 = fadd double %2, %3
  store double %add18, ptr %arrayidx13, align 8
  ret void
}

; Similar to the previous test, but with different datatype.
; Computes C[4*u + k] += D[4*u + k] for k = 0..3 on float elements. The
; expectations require one <4 x float> load/fadd/store sequence.
; Function Attrs: nounwind ssp uwtable
define void @foo_4float(i32 %u) #0 {
; CHECK-LABEL: @foo_4float(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[U_ADDR:%.*]] = alloca i32, align 4
; CHECK-NEXT:    store i32 [[U:%.*]], ptr [[U_ADDR]], align 4
; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[U]], 4
; CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[MUL]] to i64
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2000 x float], ptr @C, i32 0, i64 [[IDXPROM]]
; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [2000 x float], ptr @D, i32 0, i64 [[IDXPROM]]
; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 4
; CHECK-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[TMP0]], [[TMP1]]
; CHECK-NEXT:    store <4 x float> [[TMP2]], ptr [[ARRAYIDX]], align 4
; CHECK-NEXT:    ret void
;
entry:
  %u.addr = alloca i32, align 4
  store i32 %u, ptr %u.addr, align 4
  %mul = mul nsw i32 %u, 4
  %idxprom = sext i32 %mul to i64
  %arrayidx = getelementptr inbounds [2000 x float], ptr @C, i32 0, i64 %idxprom
  %0 = load float, ptr %arrayidx, align 4
  %arrayidx4 = getelementptr inbounds [2000 x float], ptr @D, i32 0, i64 %idxprom
  %1 = load float, ptr %arrayidx4, align 4
  %add5 = fadd float %0, %1
  store float %add5, ptr %arrayidx, align 4
  %add11 = add nsw i32 %mul, 1
  %idxprom12 = sext i32 %add11 to i64
  %arrayidx13 = getelementptr inbounds [2000 x float], ptr @C, i32 0, i64 %idxprom12
  %2 = load float, ptr %arrayidx13, align 4
  %arrayidx17 = getelementptr inbounds [2000 x float], ptr @D, i32 0, i64 %idxprom12
  %3 = load float, ptr %arrayidx17, align 4
  %add18 = fadd float %2, %3
  store float %add18, ptr %arrayidx13, align 4
  %add24 = add nsw i32 %mul, 2
  %idxprom25 = sext i32 %add24 to i64
  %arrayidx26 = getelementptr inbounds [2000 x float], ptr @C, i32 0, i64 %idxprom25
  %4 = load float, ptr %arrayidx26, align 4
  %arrayidx30 = getelementptr inbounds [2000 x float], ptr @D, i32 0, i64 %idxprom25
  %5 = load float, ptr %arrayidx30, align 4
  %add31 = fadd float %4, %5
  store float %add31, ptr %arrayidx26, align 4
  %add37 = add nsw i32 %mul, 3
  %idxprom38 = sext i32 %add37 to i64
  %arrayidx39 = getelementptr inbounds [2000 x float], ptr @C, i32 0, i64 %idxprom38
  %6 = load float, ptr %arrayidx39, align 4
  %arrayidx43 = getelementptr inbounds [2000 x float], ptr @D, i32 0, i64 %idxprom38
  %7 = load float, ptr %arrayidx43, align 4
  %add44 = fadd float %6, %7
  store float %add44, ptr %arrayidx39, align 4
  ret void
}

; Similar to the previous tests, but now we are dealing with AddRec SCEV.
; Each iteration reads A[2*i] and A[2*i + 1], multiplies both by 7.0 and adds
; their sum to the running total that is stored to %sum. The expectations
; require the two loads and multiplies to become <2 x double> operations whose
; lanes are extracted again for the scalar adds.
; Function Attrs: nounwind ssp uwtable
define i32 @foo_loop(ptr %A, i32 %n) #0 {
; CHECK-LABEL: @foo_loop(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
; CHECK-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
; CHECK-NEXT:    [[SUM:%.*]] = alloca double, align 8
; CHECK-NEXT:    [[I:%.*]] = alloca i32, align 4
; CHECK-NEXT:    store ptr [[A:%.*]], ptr [[A_ADDR]], align 8
; CHECK-NEXT:    store i32 [[N:%.*]], ptr [[N_ADDR]], align 4
; CHECK-NEXT:    store double 0.000000e+00, ptr [[SUM]], align 8
; CHECK-NEXT:    store i32 0, ptr [[I]], align 4
; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i32 0, [[N]]
; CHECK-NEXT:    br i1 [[CMP1]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
; CHECK:       for.body.lr.ph:
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[TMP1:%.*]] = phi double [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[ADD7:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP0]], 2
; CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[MUL]] to i64
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[IDXPROM]]
; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 8
; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x double> <double 7.000000e+00, double 7.000000e+00>, [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
; CHECK-NEXT:    [[ADD6:%.*]] = fadd double [[TMP4]], [[TMP5]]
; CHECK-NEXT:    [[ADD7]] = fadd double [[TMP1]], [[ADD6]]
; CHECK-NEXT:    store double [[ADD7]], ptr [[SUM]], align 8
; CHECK-NEXT:    [[INC]] = add nsw i32 [[TMP0]], 1
; CHECK-NEXT:    store i32 [[INC]], ptr [[I]], align 4
; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[INC]], [[N]]
; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]]
; CHECK:       for.cond.for.end_crit_edge:
; CHECK-NEXT:    [[SPLIT:%.*]] = phi double [ [[ADD7]], [[FOR_BODY]] ]
; CHECK-NEXT:    br label [[FOR_END]]
; CHECK:       for.end:
; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi double [ [[SPLIT]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
; CHECK-NEXT:    [[CONV:%.*]] = fptosi double [[DOTLCSSA]] to i32
; CHECK-NEXT:    ret i32 [[CONV]]
;
entry:
  %A.addr = alloca ptr, align 8
  %n.addr = alloca i32, align 4
  %sum = alloca double, align 8
  %i = alloca i32, align 4
  store ptr %A, ptr %A.addr, align 8
  store i32 %n, ptr %n.addr, align 4
  store double 0.000000e+00, ptr %sum, align 8
  store i32 0, ptr %i, align 4
  %cmp1 = icmp slt i32 0, %n
  br i1 %cmp1, label %for.body.lr.ph, label %for.end

for.body.lr.ph:                                   ; preds = %entry
  br label %for.body

for.body:                                         ; preds = %for.body.lr.ph, %for.body
  %0 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  %1 = phi double [ 0.000000e+00, %for.body.lr.ph ], [ %add7, %for.body ]
  %mul = mul nsw i32 %0, 2
  %idxprom = sext i32 %mul to i64
  %arrayidx = getelementptr inbounds double, ptr %A, i64 %idxprom
  %2 = load double, ptr %arrayidx, align 8
  %mul1 = fmul double 7.000000e+00, %2
  %add = add nsw i32 %mul, 1
  %idxprom3 = sext i32 %add to i64
  %arrayidx4 = getelementptr inbounds double, ptr %A, i64 %idxprom3
  %3 = load double, ptr %arrayidx4, align 8
  %mul5 = fmul double 7.000000e+00, %3
  %add6 = fadd double %mul1, %mul5
  %add7 = fadd double %1, %add6
  store double %add7, ptr %sum, align 8
  %inc = add nsw i32 %0, 1
  store i32 %inc, ptr %i, align 4
  %cmp = icmp slt i32 %inc, %n
  br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge

for.cond.for.end_crit_edge:                       ; preds = %for.body
  %split = phi double [ %add7, %for.body ]
  br label %for.end

for.end:                                          ; preds = %for.cond.for.end_crit_edge, %entry
  %.lcssa = phi double [ %split, %for.cond.for.end_crit_edge ], [ 0.000000e+00, %entry ]
  %conv = fptosi double %.lcssa to i32
  ret i32 %conv
}

; Similar to foo_2double but with a non-power-of-2 factor and potential
; wrapping (both indices wrap or both don't at the same time).
; The indices are 6*u + 6 and 6*u + 7, computed without nsw and then
; sign-extended to i64.
; Function Attrs: nounwind ssp uwtable
define void @foo_2double_non_power_of_2(i32 %u) #0 {
; CHECK-LABEL: @foo_2double_non_power_of_2(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[U_ADDR:%.*]] = alloca i32, align 4
; CHECK-NEXT:    store i32 [[U:%.*]], ptr [[U_ADDR]], align 4
; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[U]], 6
; CHECK-NEXT:    [[ADD6:%.*]] = add i32 [[MUL]], 6
; CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[ADD6]] to i64
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2000 x double], ptr @A, i32 0, i64 [[IDXPROM]]
; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [2000 x double], ptr @B, i32 0, i64 [[IDXPROM]]
; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 8
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[ARRAYIDX4]], align 8
; CHECK-NEXT:    [[TMP2:%.*]] = fadd <2 x double> [[TMP0]], [[TMP1]]
; CHECK-NEXT:    store <2 x double> [[TMP2]], ptr [[ARRAYIDX]], align 8
; CHECK-NEXT:    ret void
;
entry:
  %u.addr = alloca i32, align 4
  store i32 %u, ptr %u.addr, align 4
  %mul = mul i32 %u, 6
  %add6 = add i32 %mul, 6
  %idxprom = sext i32 %add6 to i64
  %arrayidx = getelementptr inbounds [2000 x double], ptr @A, i32 0, i64 %idxprom
  %0 = load double, ptr %arrayidx, align 8
  %arrayidx4 = getelementptr inbounds [2000 x double], ptr @B, i32 0, i64 %idxprom
  %1 = load double, ptr %arrayidx4, align 8
  %add5 = fadd double %0, %1
  store double %add5, ptr %arrayidx, align 8
  %add7 = add i32 %mul, 7
  %idxprom12 = sext i32 %add7 to i64
  %arrayidx13 = getelementptr inbounds [2000 x double], ptr @A, i32 0, i64 %idxprom12
  %2 = load double, ptr %arrayidx13, align 8
  %arrayidx17 = getelementptr inbounds [2000 x double], ptr @B, i32 0, i64 %idxprom12
  %3 = load double, ptr %arrayidx17, align 8
  %add18 = fadd double %2, %3
  store double %add18, ptr %arrayidx13, align 8
  ret void
}

; Similar to foo_2double_non_power_of_2 but with zext's instead of sext's
; Function Attrs: nounwind ssp uwtable
define void @foo_2double_non_power_of_2_zext(i32 %u) #0 {
; CHECK-LABEL: @foo_2double_non_power_of_2_zext(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[U_ADDR:%.*]] = alloca i32, align 4
; CHECK-NEXT:    store i32 [[U:%.*]], ptr [[U_ADDR]], align 4
; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[U]], 6
; CHECK-NEXT:    [[ADD6:%.*]] = add i32 [[MUL]], 6
; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[ADD6]] to i64
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2000 x double], ptr @A, i32 0, i64 [[IDXPROM]]
; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [2000 x double], ptr @B, i32 0, i64 [[IDXPROM]]
; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 8
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[ARRAYIDX4]], align 8
; CHECK-NEXT:    [[TMP2:%.*]] = fadd <2 x double> [[TMP0]], [[TMP1]]
; CHECK-NEXT:    store <2 x double> [[TMP2]], ptr [[ARRAYIDX]], align 8
; CHECK-NEXT:    ret void
;
entry:
  %u.addr = alloca i32, align 4
  store i32 %u, ptr %u.addr, align 4
  %mul = mul i32 %u, 6
  %add6 = add i32 %mul, 6
  %idxprom = zext i32 %add6 to i64
  %arrayidx = getelementptr inbounds [2000 x double], ptr @A, i32 0, i64 %idxprom
  %0 = load double, ptr %arrayidx, align 8
  %arrayidx4 = getelementptr inbounds [2000 x double], ptr @B, i32 0, i64 %idxprom
  %1 = load double, ptr %arrayidx4, align 8
  %add5 = fadd double %0, %1
  store double %add5, ptr %arrayidx, align 8
  %add7 = add i32 %mul, 7
  %idxprom12 = zext i32 %add7 to i64
  %arrayidx13 = getelementptr inbounds [2000 x double], ptr @A, i32 0, i64 %idxprom12
  %2 = load double, ptr %arrayidx13, align 8
  %arrayidx17 = getelementptr inbounds [2000 x double], ptr @B, i32 0, i64 %idxprom12
  %3 = load double, ptr %arrayidx17, align 8
  %add18 = fadd double %2, %3
  store double %add18, ptr %arrayidx13, align 8
  ret void
}

; Similar to foo_2double_non_power_of_2, but now we are dealing with AddRec SCEV.
; Alternatively, this is like foo_loop, but with a non-power-of-2 factor and
; potential wrapping (both indices wrap or both don't at the same time).
; Each iteration reads A[12*i + 5] and A[12*i + 6]; the index arithmetic and
; the induction increment carry no nsw flag.
; Function Attrs: nounwind ssp uwtable
define i32 @foo_loop_non_power_of_2(ptr %A, i32 %n) #0 {
; CHECK-LABEL: @foo_loop_non_power_of_2(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
; CHECK-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
; CHECK-NEXT:    [[SUM:%.*]] = alloca double, align 8
; CHECK-NEXT:    [[I:%.*]] = alloca i32, align 4
; CHECK-NEXT:    store ptr [[A:%.*]], ptr [[A_ADDR]], align 8
; CHECK-NEXT:    store i32 [[N:%.*]], ptr [[N_ADDR]], align 4
; CHECK-NEXT:    store double 0.000000e+00, ptr [[SUM]], align 8
; CHECK-NEXT:    store i32 0, ptr [[I]], align 4
; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i32 0, [[N]]
; CHECK-NEXT:    br i1 [[CMP1]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
; CHECK:       for.body.lr.ph:
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[TMP1:%.*]] = phi double [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[ADD7:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[TMP0]], 12
; CHECK-NEXT:    [[ADD_5:%.*]] = add i32 [[MUL]], 5
; CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[ADD_5]] to i64
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[IDXPROM]]
; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 8
; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x double> <double 7.000000e+00, double 7.000000e+00>, [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
; CHECK-NEXT:    [[ADD6:%.*]] = fadd double [[TMP4]], [[TMP5]]
; CHECK-NEXT:    [[ADD7]] = fadd double [[TMP1]], [[ADD6]]
; CHECK-NEXT:    store double [[ADD7]], ptr [[SUM]], align 8
; CHECK-NEXT:    [[INC]] = add i32 [[TMP0]], 1
; CHECK-NEXT:    store i32 [[INC]], ptr [[I]], align 4
; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[INC]], [[N]]
; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]]
; CHECK:       for.cond.for.end_crit_edge:
; CHECK-NEXT:    [[SPLIT:%.*]] = phi double [ [[ADD7]], [[FOR_BODY]] ]
; CHECK-NEXT:    br label [[FOR_END]]
; CHECK:       for.end:
; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi double [ [[SPLIT]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
; CHECK-NEXT:    [[CONV:%.*]] = fptosi double [[DOTLCSSA]] to i32
; CHECK-NEXT:    ret i32 [[CONV]]
;
entry:
  %A.addr = alloca ptr, align 8
  %n.addr = alloca i32, align 4
  %sum = alloca double, align 8
  %i = alloca i32, align 4
  store ptr %A, ptr %A.addr, align 8
  store i32 %n, ptr %n.addr, align 4
  store double 0.000000e+00, ptr %sum, align 8
  store i32 0, ptr %i, align 4
  %cmp1 = icmp slt i32 0, %n
  br i1 %cmp1, label %for.body.lr.ph, label %for.end

for.body.lr.ph:                                   ; preds = %entry
  br label %for.body

for.body:                                         ; preds = %for.body.lr.ph, %for.body
  %0 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  %1 = phi double [ 0.000000e+00, %for.body.lr.ph ], [ %add7, %for.body ]
  %mul = mul i32 %0, 12
  %add.5 = add i32 %mul, 5
  %idxprom = sext i32 %add.5 to i64
  %arrayidx = getelementptr inbounds double, ptr %A, i64 %idxprom
  %2 = load double, ptr %arrayidx, align 8
  %mul1 = fmul double 7.000000e+00, %2
  %add.6 = add i32 %mul, 6
  %idxprom3 = sext i32 %add.6 to i64
  %arrayidx4 = getelementptr inbounds double, ptr %A, i64 %idxprom3
  %3 = load double, ptr %arrayidx4, align 8
  %mul5 = fmul double 7.000000e+00, %3
  %add6 = fadd double %mul1, %mul5
  %add7 = fadd double %1, %add6
  store double %add7, ptr %sum, align 8
  %inc = add i32 %0, 1
  store i32 %inc, ptr %i, align 4
  %cmp = icmp slt i32 %inc, %n
  br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge

for.cond.for.end_crit_edge:                       ; preds = %for.body
  %split = phi double [ %add7, %for.body ]
  br label %for.end

for.end:                                          ; preds = %for.cond.for.end_crit_edge, %entry
  %.lcssa = phi double [ %split, %for.cond.for.end_crit_edge ], [ 0.000000e+00, %entry ]
  %conv = fptosi double %.lcssa to i32
  ret i32 %conv
}

; This is generated by `clang -std=c11 -Wpedantic -Wall -O3 main.c -S -o - -emit-llvm`
; with !{!"clang version 7.0.0 (trunk 337339) (llvm/trunk 337344)"} and stripping off
; the !tbaa metadata nodes to fit the rest of the test file, where `cat main.c` is:
;
;     double bar(double *a, unsigned n) {
;       double x = 0.0;
;       double y = 0.0;
;       for (unsigned i = 0; i < n; i += 2) {
;         x += a[i];
;         y += a[i + 1];
;       }
;       return x * y;
;     }
;
; The resulting IR is similar to @foo_loop, but with zext's instead of sext's.
;
; Make sure we are able to vectorize this from now on:
;
; Only the CHECK-X86 expectations below show the vectorized form (a
; <2 x double> accumulator phi fed by a <2 x double> load). The CHECK-AARCH64
; expectations keep the two scalar accumulators and scalar loads.
define double @bar(ptr nocapture readonly %a, i32 %n) local_unnamed_addr #0 {
; CHECK-X86-LABEL: @bar(
; CHECK-X86-NEXT:  entry:
; CHECK-X86-NEXT:    [[CMP15:%.*]] = icmp eq i32 [[N:%.*]], 0
; CHECK-X86-NEXT:    br i1 [[CMP15]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY:%.*]]
; CHECK-X86:       for.cond.cleanup:
; CHECK-X86-NEXT:    [[TMP0:%.*]] = phi <2 x double> [ zeroinitializer, [[ENTRY:%.*]] ], [ [[TMP5:%.*]], [[FOR_BODY]] ]
; CHECK-X86-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[TMP0]], i32 0
; CHECK-X86-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[TMP0]], i32 1
; CHECK-X86-NEXT:    [[MUL:%.*]] = fmul double [[TMP1]], [[TMP2]]
; CHECK-X86-NEXT:    ret double [[MUL]]
; CHECK-X86:       for.body:
; CHECK-X86-NEXT:    [[I_018:%.*]] = phi i32 [ [[ADD5:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ]
; CHECK-X86-NEXT:    [[TMP3:%.*]] = phi <2 x double> [ [[TMP5]], [[FOR_BODY]] ], [ zeroinitializer, [[ENTRY]] ]
; CHECK-X86-NEXT:    [[IDXPROM:%.*]] = zext i32 [[I_018]] to i64
; CHECK-X86-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i64 [[IDXPROM]]
; CHECK-X86-NEXT:    [[TMP4:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 8
; CHECK-X86-NEXT:    [[TMP5]] = fadd <2 x double> [[TMP3]], [[TMP4]]
; CHECK-X86-NEXT:    [[ADD5]] = add i32 [[I_018]], 2
; CHECK-X86-NEXT:    [[CMP:%.*]] = icmp ult i32 [[ADD5]], [[N]]
; CHECK-X86-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]]
;

; CHECK-AARCH64-LABEL: @bar(
; CHECK-AARCH64-NEXT:  entry:
; CHECK-AARCH64-NEXT:    [[CMP15:%.*]] = icmp eq i32 [[N:%.*]], 0
; CHECK-AARCH64-NEXT:    br i1 [[CMP15]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY:%.*]]
; CHECK-AARCH64:       for.cond.cleanup:
; CHECK-AARCH64-NEXT:    [[X_0_LCSSA:%.*]] = phi double [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
; CHECK-AARCH64-NEXT:    [[Y_0_LCSSA:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[ADD4:%.*]], [[FOR_BODY]] ]
; CHECK-AARCH64-NEXT:    [[MUL:%.*]] = fmul double [[X_0_LCSSA]], [[Y_0_LCSSA]]
; CHECK-AARCH64-NEXT:    ret double [[MUL]]
; CHECK-AARCH64:       for.body:
; CHECK-AARCH64-NEXT:    [[I_018:%.*]] = phi i32 [ [[ADD5:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ]
; CHECK-AARCH64-NEXT:    [[Y_017:%.*]] = phi double [ [[ADD4]], [[FOR_BODY]] ], [ 0.000000e+00, [[ENTRY]] ]
; CHECK-AARCH64-NEXT:    [[X_016:%.*]] = phi double [ [[ADD]], [[FOR_BODY]] ], [ 0.000000e+00, [[ENTRY]] ]
; CHECK-AARCH64-NEXT:    [[IDXPROM:%.*]] = zext i32 [[I_018]] to i64
; CHECK-AARCH64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i64 [[IDXPROM]]
; CHECK-AARCH64-NEXT:    [[TMP0:%.*]] = load double, ptr [[ARRAYIDX]], align 8
; CHECK-AARCH64-NEXT:    [[ADD]] = fadd double [[X_016]], [[TMP0]]
; CHECK-AARCH64-NEXT:    [[ADD1:%.*]] = or disjoint i32 [[I_018]], 1
; CHECK-AARCH64-NEXT:    [[IDXPROM2:%.*]] = zext i32 [[ADD1]] to i64
; CHECK-AARCH64-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[IDXPROM2]]
; CHECK-AARCH64-NEXT:    [[TMP1:%.*]] = load double, ptr [[ARRAYIDX3]], align 8
; CHECK-AARCH64-NEXT:    [[ADD4]] = fadd double [[Y_017]], [[TMP1]]
; CHECK-AARCH64-NEXT:    [[ADD5]] = add i32 [[I_018]], 2
; CHECK-AARCH64-NEXT:    [[CMP:%.*]] = icmp ult i32 [[ADD5]], [[N]]
; CHECK-AARCH64-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]]
;

entry:
  %cmp15 = icmp eq i32 %n, 0
  br i1 %cmp15, label %for.cond.cleanup, label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %entry
  %x.0.lcssa = phi double [ 0.000000e+00, %entry ], [ %add, %for.body ]
  %y.0.lcssa = phi double [ 0.000000e+00, %entry ], [ %add4, %for.body ]
  %mul = fmul double %x.0.lcssa, %y.0.lcssa
  ret double %mul

for.body:                                         ; preds = %entry, %for.body
  %i.018 = phi i32 [ %add5, %for.body ], [ 0, %entry ]
  %y.017 = phi double [ %add4, %for.body ], [ 0.000000e+00, %entry ]
  %x.016 = phi double [ %add, %for.body ], [ 0.000000e+00, %entry ]
  %idxprom = zext i32 %i.018 to i64
  %arrayidx = getelementptr inbounds double, ptr %a, i64 %idxprom
  %0 = load double, ptr %arrayidx, align 8
  %add = fadd double %x.016, %0
  %add1 = or disjoint i32 %i.018, 1
  %idxprom2 = zext i32 %add1 to i64
  %arrayidx3 = getelementptr inbounds double, ptr %a, i64 %idxprom2
  %1 = load double, ptr %arrayidx3, align 8
  %add4 = fadd double %y.017, %1
  %add5 = add i32 %i.018, 2
  %cmp = icmp ult i32 %add5, %n
  br i1 %cmp, label %for.body, label %for.cond.cleanup
}

; Globals/constant expressions are not normal constants.
; They should not be treated as the usual vectorization candidates.

; External globals whose addresses are the stored values in the two tests below.
@g1 = external global i32, align 4
@g2 = external global i32, align 4

; Stores the addresses of @g1 and @g2 to p[0] and p[1]. The expectations keep
; both scalar stores unchanged.
define void @PR33958(ptr nocapture %p) {
; CHECK-LABEL: @PR33958(
; CHECK-NEXT:    store ptr @g1, ptr [[P:%.*]], align 8
; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds ptr, ptr [[P]], i64 1
; CHECK-NEXT:    store ptr @g2, ptr [[ARRAYIDX1]], align 8
; CHECK-NEXT:    ret void
;
  store ptr @g1, ptr %p, align 8
  %arrayidx1 = getelementptr inbounds ptr, ptr %p, i64 1
  store ptr @g2, ptr %arrayidx1, align 8
  ret void
}

; Same store pattern, but the stored values are ptrtoint constant expressions
; of @g1 and @g2. The expectations again keep both scalar stores.
define void @store_constant_expression(ptr %p) {
; CHECK-LABEL: @store_constant_expression(
; CHECK-NEXT:    store i64 ptrtoint (ptr @g1 to i64), ptr [[P:%.*]], align 8
; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 1
; CHECK-NEXT:    store i64 ptrtoint (ptr @g2 to i64), ptr [[ARRAYIDX1]], align 8
; CHECK-NEXT:    ret void
;
  store i64 ptrtoint (ptr @g1 to i64), ptr %p, align 8
  %arrayidx1 = getelementptr inbounds i64, ptr %p, i64 1
  store i64 ptrtoint (ptr @g2 to i64), ptr %arrayidx1, align 8
  ret void
}

; Attribute group referenced as #0 by the functions in this file.
attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }

!llvm.ident = !{!0}

!0 = !{!"clang version 3.5.0 "}