; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s
; RUN: opt -passes=slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s

; #include <stdint.h>
;
; int foo(float *A, int n) {
;   float sum = 0;
;   for (intptr_t i=0; i < n; ++i) {
;     sum += 7*A[i*4  ] +
;            7*A[i*4+1] +
;            7*A[i*4+2] +
;            7*A[i*4+3];
;   }
;   return sum;
; }

define i32 @add_red(ptr %A, i32 %n) {
; CHECK-LABEL: @add_red(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[CMP31:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT:    br i1 [[CMP31]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
; CHECK:       for.body.lr.ph:
; CHECK-NEXT:    [[TMP0:%.*]] = sext i32 [[N]] to i64
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[I_033:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[SUM_032:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[ADD17:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[MUL:%.*]] = shl nsw i64 [[I_033]], 2
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[MUL]]
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT:    [[TMP2:%.*]] = fmul <4 x float> [[TMP1]], splat (float 7.000000e+00)
; CHECK-NEXT:    [[TMP3:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP2]])
; CHECK-NEXT:    [[ADD17]] = fadd fast float [[SUM_032]], [[TMP3]]
; CHECK-NEXT:    [[INC]] = add nsw i64 [[I_033]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP0]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[FOR_BODY]]
; CHECK:       for.cond.for.end_crit_edge:
; CHECK-NEXT:    [[PHITMP:%.*]] = fptosi float [[ADD17]] to i32
; CHECK-NEXT:    br label [[FOR_END]]
; CHECK:       for.end:
; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    ret i32 [[SUM_0_LCSSA]]
;
entry:
  %cmp31 = icmp sgt i32 %n, 0
  br i1 %cmp31, label %for.body.lr.ph, label %for.end

for.body.lr.ph:
  %0 = sext i32 %n to i64
  br label %for.body

for.body:
  %i.033 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  %sum.032 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add17, %for.body ]
  %mul = shl nsw i64 %i.033, 2
  %arrayidx = getelementptr inbounds float, ptr %A, i64 %mul
  %1 = load float, ptr %arrayidx, align 4
  %mul2 = fmul float %1, 7.000000e+00
  %add28 = or disjoint i64 %mul, 1
  %arrayidx4 = getelementptr inbounds float, ptr %A, i64 %add28
  %2 = load float, ptr %arrayidx4, align 4
  %mul5 = fmul float %2, 7.000000e+00
  %add6 = fadd fast float %mul2, %mul5
  %add829 = or disjoint i64 %mul, 2
  %arrayidx9 = getelementptr inbounds float, ptr %A, i64 %add829
  %3 = load float, ptr %arrayidx9, align 4
  %mul10 = fmul float %3, 7.000000e+00
  %add11 = fadd fast float %add6, %mul10
  %add1330 = or disjoint i64 %mul, 3
  %arrayidx14 = getelementptr inbounds float, ptr %A, i64 %add1330
  %4 = load float, ptr %arrayidx14, align 4
  %mul15 = fmul float %4, 7.000000e+00
  %add16 = fadd fast float %add11, %mul15
  %add17 = fadd fast float %sum.032, %add16
  %inc = add nsw i64 %i.033, 1
  %exitcond = icmp eq i64 %inc, %0
  br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body
for.cond.for.end_crit_edge:
  %phitmp = fptosi float %add17 to i32
  br label %for.end

for.end:
  %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
  ret i32 %sum.0.lcssa
}

; int foo(float * restrict A, float * restrict B, int n) {
;   float sum = 0;
;   for (intptr_t i=0; i < n; ++i) {
;     sum *= B[0]*A[i*4  ] +
;            B[1]*A[i*4+1] +
;            B[2]*A[i*4+2] +
;            B[3]*A[i*4+3];
;   }
;   return sum;
; }

define i32 @mul_red(ptr noalias %A, ptr noalias %B, i32 %n) {
; CHECK-LABEL: @mul_red(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[CMP38:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT:    br i1 [[CMP38]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
; CHECK:       for.body.lr.ph:
; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4
; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[N]] to i64
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[I_040:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[SUM_039:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[MUL21:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[MUL:%.*]] = shl nsw i64 [[I_040]], 2
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[MUL]]
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT:    [[TMP3:%.*]] = fmul <4 x float> [[TMP0]], [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]])
; CHECK-NEXT:    [[MUL21]] = fmul float [[SUM_039]], [[TMP4]]
; CHECK-NEXT:    [[INC]] = add nsw i64 [[I_040]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP1]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[FOR_BODY]]
; CHECK:       for.cond.for.end_crit_edge:
; CHECK-NEXT:    [[PHITMP:%.*]] = fptosi float [[MUL21]] to i32
; CHECK-NEXT:    br label [[FOR_END]]
; CHECK:       for.end:
; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    ret i32 [[SUM_0_LCSSA]]
;
entry:
  %cmp38 = icmp sgt i32 %n, 0
  br i1 %cmp38, label %for.body.lr.ph, label %for.end

for.body.lr.ph:
  %0 = load float, ptr %B, align 4
  %arrayidx4 = getelementptr inbounds float, ptr %B, i64 1
  %1 = load float, ptr %arrayidx4, align 4
  %arrayidx9 = getelementptr inbounds float, ptr %B, i64 2
  %2 = load float, ptr %arrayidx9, align 4
  %arrayidx15 = getelementptr inbounds float, ptr %B, i64 3
  %3 = load float, ptr %arrayidx15, align 4
  %4 = sext i32 %n to i64
  br label %for.body

for.body:
  %i.040 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  %sum.039 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %mul21, %for.body ]
  %mul = shl nsw i64 %i.040, 2
  %arrayidx2 = getelementptr inbounds float, ptr %A, i64 %mul
  %5 = load float, ptr %arrayidx2, align 4
  %mul3 = fmul float %0, %5
  %add35 = or disjoint i64 %mul, 1
  %arrayidx6 = getelementptr inbounds float, ptr %A, i64 %add35
  %6 = load float, ptr %arrayidx6, align 4
  %mul7 = fmul float %1, %6
  %add8 = fadd fast float %mul3, %mul7
  %add1136 = or disjoint i64 %mul, 2
  %arrayidx12 = getelementptr inbounds float, ptr %A, i64 %add1136
  %7 = load float, ptr %arrayidx12, align 4
  %mul13 = fmul float %2, %7
  %add14 = fadd fast float %add8, %mul13
  %add1737 = or disjoint i64 %mul, 3
  %arrayidx18 = getelementptr inbounds float, ptr %A, i64 %add1737
  %8 = load float, ptr %arrayidx18, align 4
  %mul19 = fmul float %3, %8
  %add20 = fadd fast float %add14, %mul19
  %mul21 = fmul float %sum.039, %add20
  %inc = add nsw i64 %i.040, 1
  %exitcond = icmp eq i64 %inc, %4
  br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body

for.cond.for.end_crit_edge:
  %phitmp = fptosi float %mul21 to i32
  br label %for.end

for.end:
  %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
  ret i32 %sum.0.lcssa
}

; int foo(float * restrict A, float * restrict B, int n) {
;   float sum = 0;
;   for (intptr_t i=0; i < n; ++i) {
;     sum += B[0]*A[i*6  ] +
;            B[1]*A[i*6+1] +
;            B[2]*A[i*6+2] +
;            B[3]*A[i*6+3] +
;            B[4]*A[i*6+4] +
;            B[5]*A[i*6+5] +
;            B[6]*A[i*6+6] +
;            B[7]*A[i*6+7] +
;            B[8]*A[i*6+8];
;   }
;   return sum;
; }

define i32 @long_red(ptr noalias %A, ptr noalias %B, i32 %n) {
; CHECK-LABEL: @long_red(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[CMP81:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT:    br i1 [[CMP81]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
; CHECK:       for.body.lr.ph:
; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[B:%.*]], align 4
; CHECK-NEXT:    [[ARRAYIDX45:%.*]] = getelementptr inbounds float, ptr [[B]], i64 8
; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[ARRAYIDX45]], align 4
; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[N]] to i64
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[I_083:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[SUM_082:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[ADD51:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i64 [[I_083]], 6
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[MUL]]
; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x float>, ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT:    [[TMP4:%.*]] = fmul fast <8 x float> [[TMP0]], [[TMP3]]
; CHECK-NEXT:    [[ADD47:%.*]] = add nsw i64 [[MUL]], 8
; CHECK-NEXT:    [[ARRAYIDX48:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[ADD47]]
; CHECK-NEXT:    [[TMP5:%.*]] = load float, ptr [[ARRAYIDX48]], align 4
; CHECK-NEXT:    [[MUL49:%.*]] = fmul fast float [[TMP1]], [[TMP5]]
; CHECK-NEXT:    [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP4]])
; CHECK-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP6]], [[MUL49]]
; CHECK-NEXT:    [[ADD51]] = fadd fast float [[SUM_082]], [[OP_RDX]]
; CHECK-NEXT:    [[INC]] = add nsw i64 [[I_083]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP2]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[FOR_BODY]]
; CHECK:       for.cond.for.end_crit_edge:
; CHECK-NEXT:    [[PHITMP:%.*]] = fptosi float [[ADD51]] to i32
; CHECK-NEXT:    br label [[FOR_END]]
; CHECK:       for.end:
; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    ret i32 [[SUM_0_LCSSA]]
;
entry:
  %cmp81 = icmp sgt i32 %n, 0
  br i1 %cmp81, label %for.body.lr.ph, label %for.end

for.body.lr.ph:
  %0 = load float, ptr %B, align 4
  %arrayidx4 = getelementptr inbounds float, ptr %B, i64 1
  %1 = load float, ptr %arrayidx4, align 4
  %arrayidx9 = getelementptr inbounds float, ptr %B, i64 2
  %2 = load float, ptr %arrayidx9, align 4
  %arrayidx15 = getelementptr inbounds float, ptr %B, i64 3
  %3 = load float, ptr %arrayidx15, align 4
  %arrayidx21 = getelementptr inbounds float, ptr %B, i64 4
  %4 = load float, ptr %arrayidx21, align 4
  %arrayidx27 = getelementptr inbounds float, ptr %B, i64 5
  %5 = load float, ptr %arrayidx27, align 4
  %arrayidx33 = getelementptr inbounds float, ptr %B, i64 6
  %6 = load float, ptr %arrayidx33, align 4
  %arrayidx39 = getelementptr inbounds float, ptr %B, i64 7
  %7 = load float, ptr %arrayidx39, align 4
  %arrayidx45 = getelementptr inbounds float, ptr %B, i64 8
  %8 = load float, ptr %arrayidx45, align 4
  %9 = sext i32 %n to i64
  br label %for.body

for.body:
  %i.083 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  %sum.082 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add51, %for.body ]
  %mul = mul nsw i64 %i.083, 6
  %arrayidx2 = getelementptr inbounds float, ptr %A, i64 %mul
  %10 = load float, ptr %arrayidx2, align 4
  %mul3 = fmul fast float %0, %10
  %add80 = or disjoint i64 %mul, 1
  %arrayidx6 = getelementptr inbounds float, ptr %A, i64 %add80
  %11 = load float, ptr %arrayidx6, align 4
  %mul7 = fmul fast float %1, %11
  %add8 = fadd fast float %mul3, %mul7
  %add11 = add nsw i64 %mul, 2
  %arrayidx12 = getelementptr inbounds float, ptr %A, i64 %add11
  %12 = load float, ptr %arrayidx12, align 4
  %mul13 = fmul fast float %2, %12
  %add14 = fadd fast float %add8, %mul13
  %add17 = add nsw i64 %mul, 3
  %arrayidx18 = getelementptr inbounds float, ptr %A, i64 %add17
  %13 = load float, ptr %arrayidx18, align 4
  %mul19 = fmul fast float %3, %13
  %add20 = fadd fast float %add14, %mul19
  %add23 = add nsw i64 %mul, 4
  %arrayidx24 = getelementptr inbounds float, ptr %A, i64 %add23
  %14 = load float, ptr %arrayidx24, align 4
  %mul25 = fmul fast float %4, %14
  %add26 = fadd fast float %add20, %mul25
  %add29 = add nsw i64 %mul, 5
  %arrayidx30 = getelementptr inbounds float, ptr %A, i64 %add29
  %15 = load float, ptr %arrayidx30, align 4
  %mul31 = fmul fast float %5, %15
  %add32 = fadd fast float %add26, %mul31
  %add35 = add nsw i64 %mul, 6
  %arrayidx36 = getelementptr inbounds float, ptr %A, i64 %add35
  %16 = load float, ptr %arrayidx36, align 4
  %mul37 = fmul fast float %6, %16
  %add38 = fadd fast float %add32, %mul37
  %add41 = add nsw i64 %mul, 7
  %arrayidx42 = getelementptr inbounds float, ptr %A, i64 %add41
  %17 = load float, ptr %arrayidx42, align 4
  %mul43 = fmul fast float %7, %17
  %add44 = fadd fast float %add38, %mul43
  %add47 = add nsw i64 %mul, 8
  %arrayidx48 = getelementptr inbounds float, ptr %A, i64 %add47
  %18 = load float, ptr %arrayidx48, align 4
  %mul49 = fmul fast float %8, %18
  %add50 = fadd fast float %add44, %mul49
  %add51 = fadd fast float %sum.082, %add50
  %inc = add nsw i64 %i.083, 1
  %exitcond = icmp eq i64 %inc, %9
  br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body

for.cond.for.end_crit_edge:
  %phitmp = fptosi float %add51 to i32
  br label %for.end

for.end:
  %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
  ret i32 %sum.0.lcssa
}

; int foo(float * restrict A, float * restrict B, int n) {
;   float sum = 0;
;   for (intptr_t i=0; i < n; ++i) {
;     sum += B[0]*A[i*4  ];
;     sum += B[1]*A[i*4+1];
;     sum += B[2]*A[i*4+2];
;     sum += B[3]*A[i*4+3];
;   }
;   return sum;
; }

define i32 @chain_red(ptr noalias %A, ptr noalias %B, i32 %n) {
; CHECK-LABEL: @chain_red(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[CMP41:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT:    br i1 [[CMP41]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
; CHECK:       for.body.lr.ph:
; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4
; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[N]] to i64
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[I_043:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[SUM_042:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[OP_RDX:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[MUL:%.*]] = shl nsw i64 [[I_043]], 2
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[MUL]]
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast <4 x float> [[TMP0]], [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]])
; CHECK-NEXT:    [[OP_RDX]] = fadd fast float [[TMP4]], [[SUM_042]]
; CHECK-NEXT:    [[INC]] = add nsw i64 [[I_043]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP1]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[FOR_BODY]]
; CHECK:       for.cond.for.end_crit_edge:
; CHECK-NEXT:    [[PHITMP:%.*]] = fptosi float [[OP_RDX]] to i32
; CHECK-NEXT:    br label [[FOR_END]]
; CHECK:       for.end:
; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    ret i32 [[SUM_0_LCSSA]]
;
entry:
  %cmp41 = icmp sgt i32 %n, 0
  br i1 %cmp41, label %for.body.lr.ph, label %for.end

for.body.lr.ph:
  %0 = load float, ptr %B, align 4
  %arrayidx4 = getelementptr inbounds float, ptr %B, i64 1
  %1 = load float, ptr %arrayidx4, align 4
  %arrayidx10 = getelementptr inbounds float, ptr %B, i64 2
  %2 = load float, ptr %arrayidx10, align 4
  %arrayidx16 = getelementptr inbounds float, ptr %B, i64 3
  %3 = load float, ptr %arrayidx16, align 4
  %4 = sext i32 %n to i64
  br label %for.body

for.body:
  %i.043 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  %sum.042 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add21, %for.body ]
  %mul = shl nsw i64 %i.043, 2
  %arrayidx2 = getelementptr inbounds float, ptr %A, i64 %mul
  %5 = load float, ptr %arrayidx2, align 4
  %mul3 = fmul fast float %0, %5
  %add = fadd fast float %sum.042, %mul3
  %add638 = or disjoint i64 %mul, 1
  %arrayidx7 = getelementptr inbounds float, ptr %A, i64 %add638
  %6 = load float, ptr %arrayidx7, align 4
  %mul8 = fmul fast float %1, %6
  %add9 = fadd fast float %add, %mul8
  %add1239 = or disjoint i64 %mul, 2
  %arrayidx13 = getelementptr inbounds float, ptr %A, i64 %add1239
  %7 = load float, ptr %arrayidx13, align 4
  %mul14 = fmul fast float %2, %7
  %add15 = fadd fast float %add9, %mul14
  %add1840 = or disjoint i64 %mul, 3
  %arrayidx19 = getelementptr inbounds float, ptr %A, i64 %add1840
  %8 = load float, ptr %arrayidx19, align 4
  %mul20 = fmul fast float %3, %8
  %add21 = fadd fast float %add15, %mul20
  %inc = add nsw i64 %i.043, 1
  %exitcond = icmp eq i64 %inc, %4
  br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body

for.cond.for.end_crit_edge:
  %phitmp = fptosi float %add21 to i32
  br label %for.end
for.end:
  %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
  ret i32 %sum.0.lcssa
}

; void foo(const float *arg_A, unsigned arg_B, float *array) {
;   for (uint32_t i = 0; i < 6; ++i) {
;     const float *ptr = arg_A + i;
;     float w0 = array[i * 4 + 0];
;     float w1 = array[i * 4 + 1];
;     float w2 = array[i * 4 + 2];
;     float w3 = array[i * 4 + 3];
;
;     for (unsigned j = 0; j < arg_B; ++j) {
;       const float x1 = *ptr - (-1.1f * w0) - (1.2f * w1);
;       const float x2 = (2.1f * x1) + (-2.2f * w0) + (2.3f * w1);
;       const float x3 = x2 - (-3.1f * w2) - (3.2f * w3);
;       const float x4 = x3 + (-4.0f * w2) + w3;
;       w1 = w0;
;       w0 = x1;
;       w3 = w2;
;       w2 = x3;
;     }
;
;     array[i * 4 + 0] = w0;
;     array[i * 4 + 1] = w1;
;     array[i * 4 + 2] = w2;
;     array[i * 4 + 3] = w3;
;   }
; }

define void @foo(ptr nocapture readonly %arg_A, i32 %arg_B, ptr nocapture %array) {
; CHECK-LABEL: @foo(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[CMP1495:%.*]] = icmp eq i32 [[ARG_B:%.*]], 0
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.cond.cleanup:
; CHECK-NEXT:    ret void
; CHECK:       for.body:
; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_COND_CLEANUP15:%.*]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = shl i64 [[INDVARS_IV]], 2
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[ARRAY:%.*]], i64 [[TMP0]]
; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT:    [[TMP2:%.*]] = or disjoint i64 [[TMP0]], 1
; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[ARRAY]], i64 [[TMP2]]
; CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[ARRAYIDX4]], align 4
; CHECK-NEXT:    [[TMP4:%.*]] = or disjoint i64 [[TMP0]], 2
; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[ARRAY]], i64 [[TMP4]]
; CHECK-NEXT:    [[TMP5:%.*]] = load float, ptr [[ARRAYIDX8]], align 4
; CHECK-NEXT:    [[TMP6:%.*]] = or disjoint i64 [[TMP0]], 3
; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds float, ptr [[ARRAY]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP7:%.*]] = load float, ptr [[ARRAYIDX12]], align 4
; CHECK-NEXT:    br i1 [[CMP1495]], label [[FOR_COND_CLEANUP15]], label [[FOR_BODY16_LR_PH:%.*]]
; CHECK:       for.body16.lr.ph:
; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds float, ptr [[ARG_A:%.*]], i64 [[INDVARS_IV]]
; CHECK-NEXT:    [[TMP8:%.*]] = load float, ptr [[ADD_PTR]], align 4
; CHECK-NEXT:    br label [[FOR_BODY16:%.*]]
; CHECK:       for.cond.cleanup15:
; CHECK-NEXT:    [[W2_0_LCSSA:%.*]] = phi float [ [[TMP5]], [[FOR_BODY]] ], [ [[SUB28:%.*]], [[FOR_BODY16]] ]
; CHECK-NEXT:    [[W3_0_LCSSA:%.*]] = phi float [ [[TMP7]], [[FOR_BODY]] ], [ [[W2_096:%.*]], [[FOR_BODY16]] ]
; CHECK-NEXT:    [[W1_0_LCSSA:%.*]] = phi float [ [[TMP3]], [[FOR_BODY]] ], [ [[W0_0100:%.*]], [[FOR_BODY16]] ]
; CHECK-NEXT:    [[W0_0_LCSSA:%.*]] = phi float [ [[TMP1]], [[FOR_BODY]] ], [ [[SUB19:%.*]], [[FOR_BODY16]] ]
; CHECK-NEXT:    store float [[W0_0_LCSSA]], ptr [[ARRAYIDX]], align 4
; CHECK-NEXT:    store float [[W1_0_LCSSA]], ptr [[ARRAYIDX4]], align 4
; CHECK-NEXT:    store float [[W2_0_LCSSA]], ptr [[ARRAYIDX8]], align 4
; CHECK-NEXT:    store float [[W3_0_LCSSA]], ptr [[ARRAYIDX12]], align 4
; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT:    [[EXITCOND109:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 6
; CHECK-NEXT:    br i1 [[EXITCOND109]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
; CHECK:       for.body16:
; CHECK-NEXT:    [[W0_0100]] = phi float [ [[TMP1]], [[FOR_BODY16_LR_PH]] ], [ [[SUB19]], [[FOR_BODY16]] ]
; CHECK-NEXT:    [[W1_099:%.*]] = phi float [ [[TMP3]], [[FOR_BODY16_LR_PH]] ], [ [[W0_0100]], [[FOR_BODY16]] ]
; CHECK-NEXT:    [[J_098:%.*]] = phi i32 [ 0, [[FOR_BODY16_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY16]] ]
; CHECK-NEXT:    [[W3_097:%.*]] = phi float [ [[TMP7]], [[FOR_BODY16_LR_PH]] ], [ [[W2_096]], [[FOR_BODY16]] ]
; CHECK-NEXT:    [[W2_096]] = phi float [ [[TMP5]], [[FOR_BODY16_LR_PH]] ], [ [[SUB28]], [[FOR_BODY16]] ]
; CHECK-NEXT:    [[MUL17:%.*]] = fmul fast float [[W0_0100]], 0x3FF19999A0000000
; CHECK-NEXT:    [[MUL18_NEG:%.*]] = fmul fast float [[W1_099]], 0xBFF3333340000000
; CHECK-NEXT:    [[SUB92:%.*]] = fadd fast float [[MUL17]], [[MUL18_NEG]]
; CHECK-NEXT:    [[SUB19]] = fadd fast float [[SUB92]], [[TMP8]]
; CHECK-NEXT:    [[MUL20:%.*]] = fmul fast float [[SUB19]], 0x4000CCCCC0000000
; CHECK-NEXT:    [[MUL21_NEG:%.*]] = fmul fast float [[W0_0100]], 0xC0019999A0000000
; CHECK-NEXT:    [[MUL23:%.*]] = fmul fast float [[W1_099]], 0x4002666660000000
; CHECK-NEXT:    [[MUL25:%.*]] = fmul fast float [[W2_096]], 0x4008CCCCC0000000
; CHECK-NEXT:    [[MUL27_NEG:%.*]] = fmul fast float [[W3_097]], 0xC0099999A0000000
; CHECK-NEXT:    [[ADD2293:%.*]] = fadd fast float [[MUL27_NEG]], [[MUL25]]
; CHECK-NEXT:    [[ADD24:%.*]] = fadd fast float [[ADD2293]], [[MUL23]]
; CHECK-NEXT:    [[SUB2694:%.*]] = fadd fast float [[ADD24]], [[MUL21_NEG]]
; CHECK-NEXT:    [[SUB28]] = fadd fast float [[SUB2694]], [[MUL20]]
; CHECK-NEXT:    [[INC]] = add nuw i32 [[J_098]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[ARG_B]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP15]], label [[FOR_BODY16]]
;
entry:
  %cmp1495 = icmp eq i32 %arg_B, 0
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.cond.cleanup15
  ret void

for.body:                                         ; preds = %for.cond.cleanup15, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.cond.cleanup15 ]
  %0 = shl i64 %indvars.iv, 2
  %arrayidx = getelementptr inbounds float, ptr %array, i64 %0
  %1 = load float, ptr %arrayidx, align 4
  %2 = or disjoint i64 %0, 1
  %arrayidx4 = getelementptr inbounds float, ptr %array, i64 %2
  %3 = load float, ptr %arrayidx4, align 4
  %4 = or disjoint i64 %0, 2
  %arrayidx8 = getelementptr inbounds float, ptr %array, i64 %4
  %5 = load float, ptr %arrayidx8, align 4
  %6 = or disjoint i64 %0, 3
  %arrayidx12 = getelementptr inbounds float, ptr %array, i64 %6
  %7 = load float, ptr %arrayidx12, align 4
  br i1 %cmp1495, label %for.cond.cleanup15, label %for.body16.lr.ph

for.body16.lr.ph:                                 ; preds = %for.body
  %add.ptr = getelementptr inbounds float, ptr %arg_A, i64 %indvars.iv
  %8 = load float, ptr %add.ptr, align 4
  br label %for.body16

for.cond.cleanup15:                               ; preds = %for.body16, %for.body
  %w2.0.lcssa = phi float [ %5, %for.body ], [ %sub28, %for.body16 ]
  %w3.0.lcssa = phi float [ %7, %for.body ], [ %w2.096, %for.body16 ]
  %w1.0.lcssa = phi float [ %3, %for.body ], [ %w0.0100, %for.body16 ]
  %w0.0.lcssa = phi float [ %1, %for.body ], [ %sub19, %for.body16 ]
  store float %w0.0.lcssa, ptr %arrayidx, align 4
  store float %w1.0.lcssa, ptr %arrayidx4, align 4
  store float %w2.0.lcssa, ptr %arrayidx8, align 4
  store float %w3.0.lcssa, ptr %arrayidx12, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond109 = icmp eq i64 %indvars.iv.next, 6
  br i1 %exitcond109, label %for.cond.cleanup, label %for.body

for.body16:                                       ; preds = %for.body16, %for.body16.lr.ph
  %w0.0100 = phi float [ %1, %for.body16.lr.ph ], [ %sub19, %for.body16 ]
  %w1.099 = phi float [ %3, %for.body16.lr.ph ], [ %w0.0100, %for.body16 ]
  %j.098 = phi i32 [ 0, %for.body16.lr.ph ], [ %inc, %for.body16 ]
  %w3.097 = phi float [ %7, %for.body16.lr.ph ], [ %w2.096, %for.body16 ]
  %w2.096 = phi float [ %5, %for.body16.lr.ph ], [ %sub28, %for.body16 ]
  %mul17 = fmul fast float %w0.0100, 0x3FF19999A0000000
  %mul18.neg = fmul fast float %w1.099, 0xBFF3333340000000
  %sub92 = fadd fast float %mul17, %mul18.neg
  %sub19 = fadd fast float %sub92, %8
  %mul20 = fmul fast float %sub19, 0x4000CCCCC0000000
  %mul21.neg = fmul fast float %w0.0100, 0xC0019999A0000000
  %mul23 = fmul fast float %w1.099, 0x4002666660000000
  %mul25 = fmul fast float %w2.096, 0x4008CCCCC0000000
  %mul27.neg = fmul fast float %w3.097, 0xC0099999A0000000
  %add2293 = fadd fast float %mul27.neg, %mul25
  %add24 = fadd fast float %add2293, %mul23
  %sub2694 = fadd fast float %add24, %mul21.neg
  %sub28 = fadd fast float %sub2694, %mul20
  %inc = add nuw i32 %j.098, 1
  %exitcond = icmp eq i32 %inc, %arg_B
  br i1 %exitcond, label %for.cond.cleanup15, label %for.body16
}


; void foo(double * restrict A, double * restrict B, double * restrict C,
;          int n) {
;   for (intptr_t i=0; i < n; ++i) {
;     C[i] = B[0] *A[i*4  ] + B[1] *A[i*4+1];
;   }
; }

define void @store_red_double(ptr noalias %A, ptr noalias %B, ptr noalias %C, i32 %n) {
; CHECK-LABEL: @store_red_double(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[CMP17:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT:    br i1 [[CMP17]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
; CHECK:       for.body.lr.ph:
; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[B:%.*]], align 8
; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[N]] to i64
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[I_018:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[MUL:%.*]] = shl nsw i64 [[I_018]], 2
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i64 [[MUL]]
; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 8
; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast <2 x double> [[TMP0]], [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
; CHECK-NEXT:    [[ADD8:%.*]] = fadd fast double [[TMP4]], [[TMP5]]
; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds double, ptr [[C:%.*]], i64 [[I_018]]
; CHECK-NEXT:    store double [[ADD8]], ptr [[ARRAYIDX9]], align 8
; CHECK-NEXT:    [[INC]] = add nsw i64 [[I_018]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP1]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]]
; CHECK:       for.end:
; CHECK-NEXT:    ret void
;
entry:
  %cmp17 = icmp sgt i32 %n, 0
  br i1 %cmp17, label %for.body.lr.ph, label %for.end

for.body.lr.ph:
  %0 = load double, ptr %B, align 8
  %arrayidx4 = getelementptr inbounds double, ptr %B, i64 1
  %1 = load double, ptr %arrayidx4, align 8
  %2 = sext i32 %n to i64
  br label %for.body

for.body:
  %i.018 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  %mul = shl nsw i64 %i.018, 2
  %arrayidx2 = getelementptr inbounds double, ptr %A, i64 %mul
  %3 = load double, ptr %arrayidx2, align 8
  %mul3 = fmul fast double %0, %3
  %add16 = or disjoint i64 %mul, 1
  %arrayidx6 = getelementptr inbounds double, ptr %A, i64 %add16
  %4 = load double, ptr %arrayidx6, align 8
  %mul7 = fmul fast double %1, %4
  %add8 = fadd fast double %mul3, %mul7
  %arrayidx9 = getelementptr inbounds double, ptr %C, i64 %i.018
  store double %add8, ptr %arrayidx9, align 8
  %inc = add nsw i64 %i.018, 1
  %exitcond = icmp eq i64 %inc, %2
  br i1 %exitcond, label %for.end, label %for.body

for.end:
  ret void
}

; int foo(float * restrict A, float * restrict B, float * restrict C, int n) {
;   float sum = 0;
;   for (intptr_t i=0; i < n; ++i) {
;     C[i] = B[0] *A[i*4  ] +
;            B[1] *A[i*4+1] +
;            B[2] *A[i*4+2] +
;            B[3] *A[i*4+3];
;   }
;   return sum;
; }

define i32 @store_red(ptr noalias %A, ptr noalias %B, ptr noalias %C, i32 %n) {
; CHECK-LABEL: @store_red(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[CMP37:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT:    br i1 [[CMP37]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
; CHECK:       for.body.lr.ph:
; CHECK-NEXT:    [[TMP0:%.*]] = sext i32 [[N]] to i64
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[I_039:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[C_ADDR_038:%.*]] = phi ptr [ [[C:%.*]], [[FOR_BODY_LR_PH]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[MUL:%.*]] = shl nsw i64 [[I_039]], 2
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[MUL]]
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast <4 x float> [[TMP1]], [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]])
; CHECK-NEXT:    store float [[TMP4]], ptr [[C_ADDR_038]], align 4
; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds float, ptr [[C_ADDR_038]], i64 1
; CHECK-NEXT:    [[INC]] = add nsw i64 [[I_039]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP0]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]]
; CHECK:       for.end:
; CHECK-NEXT:    ret i32 0
;
entry:
  %cmp37 = icmp sgt i32 %n, 0
  br i1 %cmp37, label %for.body.lr.ph, label %for.end

for.body.lr.ph:
  %arrayidx4 = getelementptr inbounds float, ptr %B, i64 1
  %arrayidx9 = getelementptr inbounds float, ptr %B, i64 2
  %arrayidx15 = getelementptr inbounds float, ptr %B, i64 3
  %0 = sext i32 %n to i64
  br label %for.body

for.body:
  %i.039 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  %C.addr.038 = phi ptr [ %C, %for.body.lr.ph ], [ %incdec.ptr, %for.body ]
  %1 = load float, ptr %B, align 4
  %mul = shl nsw i64 %i.039, 2
  %arrayidx2 = getelementptr inbounds float, ptr %A, i64 %mul
  %2 = load float, ptr %arrayidx2, align 4
  %mul3 = fmul fast float %1, %2
  %3 = load float, ptr %arrayidx4, align 4
  %add34 = or disjoint i64 %mul, 1
  %arrayidx6 = getelementptr inbounds float, ptr %A, i64 %add34
  %4 = load float, ptr %arrayidx6, align 4
  %mul7 = fmul fast float %3, %4
  %add8 = fadd fast float %mul3, %mul7
  %5 = load float, ptr %arrayidx9, align 4
  %add1135 = or disjoint i64 %mul, 2
  %arrayidx12 = getelementptr inbounds float, ptr %A, i64 %add1135
  %6 = load float, ptr %arrayidx12, align 4
  %mul13 = fmul fast float %5, %6
  %add14 = fadd fast float %add8, %mul13
  %7 = load float, ptr %arrayidx15, align 4
  %add1736 = or disjoint i64 %mul, 3
  %arrayidx18 = getelementptr inbounds float, ptr %A, i64 %add1736
  %8 = load float, ptr %arrayidx18, align 4
  %mul19 = fmul fast float %7, %8
  %add20 = fadd fast float %add14, %mul19
  store float %add20, ptr %C.addr.038, align 4
  %incdec.ptr = getelementptr inbounds float, ptr %C.addr.038, i64 1
  %inc = add nsw i64 %i.039, 1
  %exitcond = icmp eq i64 %inc, %0
  br i1 %exitcond, label %for.end, label %for.body

for.end:
  ret i32 0
}

@arr_i32 = global [32 x i32] zeroinitializer, align 16
@arr_float = global [32 x float] zeroinitializer, align 16

define void @float_red_example4(ptr %res) {
; CHECK-LABEL: @float_red_example4(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr @arr_float, align 16
; CHECK-NEXT:    [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP0]])
; CHECK-NEXT:    store float [[TMP1]], ptr [[RES:%.*]], align 16
; CHECK-NEXT:    ret void
;
entry:
  %0 = load float, ptr @arr_float, align 16
  %1 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 1), align 4
  %add = fadd fast float %1, %0
  %2 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 2), align 8
  %add.1 = fadd fast float %2, %add
  %3 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 3), align 4
  %add.2 = fadd fast float %3, %add.1
  store float %add.2, ptr %res, align 16
  ret void
}

define void @float_red_example8(ptr %res) {
; CHECK-LABEL: @float_red_example8(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr @arr_float, align 16
; CHECK-NEXT:    [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP0]])
; CHECK-NEXT:    store float [[TMP1]], ptr [[RES:%.*]], align 16
; CHECK-NEXT:    ret void
;
entry:
  %0 = load float, ptr @arr_float, align 16
  %1 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 1), align 4
  %add = fadd fast float %1, %0
  %2 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 2), align 8
  %add.1 = fadd fast float %2, %add
  %3 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 3), align 4
  %add.2 = fadd fast float %3, %add.1
  %4 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 4), align 16
  %add.3 = fadd fast float %4, %add.2
  %5 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 5), align 4
  %add.4 = fadd fast float %5, %add.3
  %6 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 6), align 8
  %add.5 = fadd fast float %6, %add.4
  %7 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 7), align 4
  %add.6 = fadd fast float %7, %add.5
  store float %add.6, ptr %res, align 16
  ret void
}

define void @float_red_example16(ptr %res) {
; CHECK-LABEL: @float_red_example16(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load <16 x float>, ptr @arr_float, align 16
; CHECK-NEXT:    [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP0]])
; CHECK-NEXT:    store float [[TMP1]], ptr [[RES:%.*]], align 16
; CHECK-NEXT:    ret void
;
entry:
  %0 = load float, ptr @arr_float, align 16
  %1 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 1), align 4
  %add = fadd fast float %1, %0
  %2 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 2), align 8
  %add.1 = fadd fast float %2, %add
  %3 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 3), align 4
  %add.2 = fadd fast float %3, %add.1
  %4 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 4), align 16
  %add.3 = fadd fast float %4, %add.2
  %5 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 5), align 4
  %add.4 = fadd fast float %5, %add.3
  %6 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 6), align 8
  %add.5 = fadd fast float %6, %add.4
  %7 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 7), align 4
  %add.6 = fadd fast float %7, %add.5
  %8 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 8), align 16
  %add.7 = fadd fast float %8, %add.6
  %9 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 9), align 4
  %add.8 = fadd fast float %9, %add.7
  %10 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 10), align 8
  %add.9 = fadd fast float %10, %add.8
  %11 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 11), align 4
  %add.10 = fadd fast float %11, %add.9
  %12 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 12), align 16
  %add.11 = fadd fast float %12, %add.10
  %13 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 13), align 4
  %add.12 = fadd fast float %13, %add.11
  %14 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 14), align 8
  %add.13 = fadd fast float %14, %add.12
  %15 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 15), align 4
  %add.14 = fadd fast float %15, %add.13
  store float %add.14, ptr %res, align 16
  ret void
}

define void @i32_red_example4(ptr %res) {
; CHECK-LABEL: @i32_red_example4(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr @arr_i32, align 16
; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP0]])
; CHECK-NEXT:    store i32 [[TMP1]], ptr [[RES:%.*]], align 16
; CHECK-NEXT:    ret void
;
entry:
  %0 = load i32, ptr @arr_i32, align 16
  %1 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 1), align 4
  %add = add nsw i32 %1, %0
  %2 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 2), align 8
  %add.1 = add nsw i32 %2, %add
  %3 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 3), align 4
  %add.2 = add nsw i32 %3, %add.1
  store i32 %add.2, ptr %res, align 16
  ret void
}

define void @i32_red_example8(ptr %res) {
; CHECK-LABEL: @i32_red_example8(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i32>, ptr @arr_i32, align 16
; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP0]])
; CHECK-NEXT:    store i32 [[TMP1]], ptr [[RES:%.*]], align 16
; CHECK-NEXT:    ret void
;
entry:
  %0 = load i32, ptr @arr_i32, align 16
  %1 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 1), align 4
  %add = add nsw i32 %1, %0
  %2 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 2), align 8
  %add.1 = add nsw i32 %2, %add
  %3 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 3), align 4
  %add.2 = add nsw i32 %3, %add.1
  %4 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 4), align 16
  %add.3 = add nsw i32 %4, %add.2
  %5 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 5), align 4
  %add.4 = add nsw i32 %5, %add.3
  %6 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 6), align 8
  %add.5 = add nsw i32 %6, %add.4
  %7 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 7), align 4
  %add.6 = add nsw i32 %7, %add.5
  store i32 %add.6, ptr %res, align 16
  ret void
}

define void @i32_red_example16(ptr %res) {
; CHECK-LABEL: @i32_red_example16(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i32>, ptr @arr_i32, align 16
; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP0]])
; CHECK-NEXT:    store i32 [[TMP1]], ptr [[RES:%.*]], align 16
; CHECK-NEXT:    ret void
;
entry:
  %0 = load i32, ptr @arr_i32, align 16
  %1 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 1), align 4
  %add = add nsw i32 %1, %0
  %2 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 2), align 8
  %add.1 = add nsw i32 %2, %add
  %3 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 3), align 4
  %add.2 = add nsw i32 %3, %add.1
  %4 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 4), align 16
  %add.3 = add nsw i32 %4, %add.2
  %5 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 5), align 4
  %add.4 = add nsw i32 %5, %add.3
  %6 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 6), align 8
  %add.5 = add nsw i32 %6, %add.4
  %7 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 7), align 4
  %add.6 = add nsw i32 %7, %add.5
  %8 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 8), align 16
  %add.7 = add nsw i32 %8, %add.6
  %9 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 9), align 4
  %add.8 = add nsw i32 %9, %add.7
  %10 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 10), align 8
  %add.9 = add nsw i32 %10, %add.8
  %11 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 11), align 4
  %add.10 = add nsw i32 %11, %add.9
  %12 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 12), align 16
  %add.11 = add nsw i32 %12, %add.10
  %13 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 13), align 4
  %add.12 = add nsw i32 %13, %add.11
  %14 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 14), align 8
  %add.13 = add nsw i32 %14, %add.12
  %15 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 15), align 4
  %add.14 = add nsw i32 %15, %add.13
  store i32 %add.14, ptr %res, align 16
  ret void
}

define void @i32_red_example32(ptr %res) {
; CHECK-LABEL: @i32_red_example32(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load <32 x i32>, ptr @arr_i32, align 16
; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> [[TMP0]])
; CHECK-NEXT:    store i32 [[TMP1]], ptr [[RES:%.*]], align 16
; CHECK-NEXT:    ret void
;
entry:
  %0 = load i32, ptr @arr_i32, align 16
  %1 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 1), align 4
  %add = add nsw i32 %1, %0
  %2 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 2), align 8
  %add.1 = add nsw i32 %2, %add
  %3 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 3), align 4
  %add.2 = add nsw i32 %3, %add.1
  %4 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 4), align 16
  %add.3 = add nsw i32 %4, %add.2
  %5 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 5), align 4
  %add.4 = add nsw i32 %5, %add.3
  %6 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 6), align 8
  %add.5 = add nsw i32 %6, %add.4
  %7 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 7), align 4
  %add.6 = add nsw i32 %7, %add.5
  %8 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 8), align 16
  %add.7 = add nsw i32 %8, %add.6
  %9 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 9), align 4
  %add.8 = add nsw i32 %9, %add.7
  %10 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 10), align 8
  %add.9 = add nsw i32 %10, %add.8
  %11 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 11), align 4
  %add.10 = add nsw i32 %11, %add.9
  %12 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 12), align 16
  %add.11 = add nsw i32 %12, %add.10
  %13 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 13), align 4
  %add.12 = add nsw i32 %13, %add.11
  %14 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 14), align 8
  %add.13 = add nsw i32 %14, %add.12
  %15 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 15), align 4
  %add.14 = add nsw i32 %15, %add.13
  %16 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 16), align 16
  %add.15 = add nsw i32 %16, %add.14
  %17 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 17), align 4
  %add.16 = add nsw i32 %17, %add.15
  %18 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 18), align 8
  %add.17 = add nsw i32 %18, %add.16
  %19 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 19), align 4
  %add.18 = add nsw i32 %19, %add.17
  %20 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 20), align 16
  %add.19 = add nsw i32 %20, %add.18
  %21 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 21), align 4
  %add.20 = add nsw i32 %21, %add.19
  %22 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 22), align 8
  %add.21 = add nsw i32 %22, %add.20
  %23 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 23), align 4
  %add.22 = add nsw i32 %23, %add.21
  %24 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 24), align 16
  %add.23 = add nsw i32 %24, %add.22
  %25 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 25), align 4
  %add.24 = add nsw i32 %25, %add.23
  %26 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 26), align 8
  %add.25 = add nsw i32 %26, %add.24
  %27 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 27), align 4
  %add.26 = add nsw i32 %27, %add.25
  %28 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 28), align 16
  %add.27 = add nsw i32 %28, %add.26
  %29 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 29), align 4
  %add.28 = add nsw i32 %29, %add.27
  %30 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 30), align 8
  %add.29 = add nsw i32 %30, %add.28
  %31 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 31), align 4
  %add.30 = add nsw i32 %31, %add.29
  store i32 %add.30, ptr %res, align 16
  ret void
}

declare i32 @foobar(i32)

define void @i32_red_call(i32 %val) {
; CHECK-LABEL: @i32_red_call(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i32>, ptr @arr_i32, align 16
; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP0]])
; CHECK-NEXT:    [[RES:%.*]] = call i32 @foobar(i32 [[TMP1]])
; CHECK-NEXT:    ret void
;
entry:
  %0 = load i32, ptr @arr_i32, align 16
  %1 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 1), align 4
  %add = add nsw i32 %1, %0
  %2 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 2), align 8
  %add.1 = add nsw i32 %2, %add
  %3 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 3), align 4
  %add.2 = add nsw i32 %3, %add.1
  %4 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 4), align 16
  %add.3 = add nsw i32 %4, %add.2
  %5 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 5), align 4
  %add.4 = add nsw i32 %5, %add.3
  %6 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 6), align 8
  %add.5 = add nsw i32 %6, %add.4
  %7 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 7), align 4
  %add.6 = add nsw i32 %7, %add.5
  %res = call i32 @foobar(i32 %add.6)
  ret void
}

define void @i32_red_invoke(i32 %val) personality ptr @__gxx_personality_v0 {
; CHECK-LABEL: @i32_red_invoke(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i32>, ptr @arr_i32, align 16
; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP0]])
; CHECK-NEXT:    [[RES:%.*]] = invoke i32 @foobar(i32 [[TMP1]])
; CHECK-NEXT:    to label [[NORMAL:%.*]] unwind label [[EXCEPTION:%.*]]
; CHECK:       exception:
; CHECK-NEXT:    [[CLEANUP:%.*]] = landingpad i8
; CHECK-NEXT:    cleanup
; CHECK-NEXT:    br label [[NORMAL]]
; CHECK:       normal:
; CHECK-NEXT:    ret void
;
entry:
  %0 = load i32, ptr @arr_i32, align 16
  %1 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 1), align 4
  %add = add nsw i32 %1, %0
  %2 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 2), align 8
  %add.1 = add nsw i32 %2, %add
  %3 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 3), align 4
  %add.2 = add nsw i32 %3, %add.1
  %4 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 4), align 16
  %add.3 = add nsw i32 %4, %add.2
  %5 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 5), align 4
  %add.4 = add nsw i32 %5, %add.3
  %6 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 6), align 8
  %add.5 = add nsw i32 %6, %add.4
  %7 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 7), align 4
  %add.6 = add nsw i32 %7, %add.5
  %res = invoke i32 @foobar(i32 %add.6) to label %normal unwind label %exception
exception:
  %cleanup = landingpad i8 cleanup
  br label %normal
normal:
  ret void
}

; Test case from PR47670. Reduction result is used as incoming value in phi.
define i32 @reduction_result_used_in_phi(ptr nocapture readonly %data, i1 zeroext %b) {
; CHECK-LABEL: @reduction_result_used_in_phi(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 [[B:%.*]], label [[BB:%.*]], label [[EXIT:%.*]]
; CHECK:       bb:
; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[DATA:%.*]], align 4
; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP0]])
; CHECK-NEXT:    br label [[EXIT]]
; CHECK:       exit:
; CHECK-NEXT:    [[SUM_1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP1]], [[BB]] ]
; CHECK-NEXT:    ret i32 [[SUM_1]]
;
entry:
  br i1 %b, label %bb, label %exit

bb:
  %l.0 = load i32, ptr %data, align 4
  %idx.1 = getelementptr inbounds i32, ptr %data, i64 1
  %l.1 = load i32, ptr %idx.1, align 4
  %add.1 = add i32 %l.1, %l.0
  %idx.2 = getelementptr inbounds i32, ptr %data, i64 2
  %l.2 = load i32, ptr %idx.2, align 4
  %add.2 = add i32 %l.2, %add.1
  %idx.3 = getelementptr inbounds i32, ptr %data, i64 3
  %l.3 = load i32, ptr %idx.3, align 4
  %add.3 = add i32 %l.3, %add.2
  br label %exit

exit:
  %sum.1 = phi i32 [ 0, %entry ], [ %add.3, %bb ]
  ret i32 %sum.1
}

define i32 @reduction_result_used_in_phi_loop(ptr nocapture readonly %data, i1 zeroext %b) {
; CHECK-LABEL: @reduction_result_used_in_phi_loop(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 [[B:%.*]], label [[BB:%.*]], label [[EXIT:%.*]]
; CHECK:       bb:
; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[DATA:%.*]], align 4
; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP0]])
; CHECK-NEXT:    br label [[EXIT]]
; CHECK:       exit:
; CHECK-NEXT:    [[SUM_1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP1]], [[BB]] ]
; CHECK-NEXT:    ret i32 [[SUM_1]]
;
entry:
  br i1 %b, label %bb, label %exit

bb:
  %l.0 = load i32, ptr %data, align 4
  %idx.1 = getelementptr inbounds i32, ptr %data, i64 1
  %l.1 = load i32, ptr %idx.1, align 4
  %add.1 = add i32 %l.1, %l.0
  %idx.2 = getelementptr inbounds i32, ptr %data, i64 2
  %l.2 = load i32, ptr %idx.2, align 4
  %add.2 = add i32 %l.2, %add.1
  %idx.3 = getelementptr inbounds i32, ptr %data, i64 3
  %l.3 = load i32, ptr %idx.3, align 4
  %add.3 = add i32 %l.3, %add.2
  br label %exit

exit:
  %sum.1 = phi i32 [ 0, %entry ], [ %add.3, %bb ]
  ret i32 %sum.1
}

; Make sure we do not crash or infinite loop on ill-formed IR.
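; In @unreachable_block below, the 'dead' block is unreachable, so its add is
; allowed to use its own result; SLP has to tolerate such self-referential IR
; without crashing or looping.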

define void @unreachable_block() {
; CHECK-LABEL: @unreachable_block(
; CHECK-NEXT:  bb.0:
; CHECK-NEXT:    br label [[BB_1:%.*]]
; CHECK:       dead:
; CHECK-NEXT:    [[T0:%.*]] = add i16 [[T0]], undef
; CHECK-NEXT:    br label [[BB_1]]
; CHECK:       bb.1:
; CHECK-NEXT:    [[T1:%.*]] = phi i16 [ undef, [[BB_0:%.*]] ], [ [[T0]], [[DEAD:%.*]] ]
; CHECK-NEXT:    ret void
;
bb.0:
  br label %bb.1

dead:
  %t0 = add i16 %t0, undef ; unreachable IR may depend on itself
  br label %bb.1

bb.1:
  %t1 = phi i16 [ undef, %bb.0 ], [ %t0, %dead ]
  ret void
}

; The FMF on the reduction should match the incoming insts.

define float @fadd_v4f32_fmf(ptr %p) {
; CHECK-LABEL: @fadd_v4f32_fmf(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 4
; CHECK-NEXT:    [[TMP2:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP1]])
; CHECK-NEXT:    ret float [[TMP2]]
;
  %p1 = getelementptr inbounds float, ptr %p, i64 1
  %p2 = getelementptr inbounds float, ptr %p, i64 2
  %p3 = getelementptr inbounds float, ptr %p, i64 3
  %t0 = load float, ptr %p, align 4
  %t1 = load float, ptr %p1, align 4
  %t2 = load float, ptr %p2, align 4
  %t3 = load float, ptr %p3, align 4
  %add1 = fadd reassoc nsz float %t1, %t0
  %add2 = fadd reassoc nsz float %t2, %add1
  %add3 = fadd reassoc nsz float %t3, %add2
  ret float %add3
}

; The minimal FMF for fadd reduction are "reassoc nsz".
; Only the common FMF of all operations in the reduction propagate to the result.
; In this example, "contract nnan arcp" are dropped, but "ninf" transfers with the required flags.

define float @fadd_v4f32_fmf_intersect(ptr %p) {
; CHECK-LABEL: @fadd_v4f32_fmf_intersect(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 4
; CHECK-NEXT:    [[TMP2:%.*]] = call reassoc ninf nsz float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP1]])
; CHECK-NEXT:    ret float [[TMP2]]
;
  %p1 = getelementptr inbounds float, ptr %p, i64 1
  %p2 = getelementptr inbounds float, ptr %p, i64 2
  %p3 = getelementptr inbounds float, ptr %p, i64 3
  %t0 = load float, ptr %p, align 4
  %t1 = load float, ptr %p1, align 4
  %t2 = load float, ptr %p2, align 4
  %t3 = load float, ptr %p3, align 4
  %add1 = fadd ninf reassoc nsz nnan float %t1, %t0
  %add2 = fadd ninf reassoc nsz nnan arcp float %t2, %add1
  %add3 = fadd ninf reassoc nsz contract float %t3, %add2
  ret float %add3
}

; This must not propagate 'nsw' to a new add instruction.
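;
; A hedged C sketch of the scalar pattern this test exercises (the identifier
; names are illustrative, not taken from the original source): a signed sum
; over the first four elements of arr_i32 is seeded with 'start', and the
; vectorized reduction below must use plain wrapping adds instead.
;
; extern int arr_i32[32];
;
; void nsw_propagation_v4i32(int *res, int start) {
;   int s = start;
;   s += arr_i32[0];
;   s += arr_i32[1];
;   s += arr_i32[2];
;   s += arr_i32[3];
;   *res = s;
; }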

define void @nsw_propagation_v4i32(ptr %res, i32 %start) {
; CHECK-LABEL: @nsw_propagation_v4i32(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @arr_i32, align 16
; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
; CHECK-NEXT:    [[OP_RDX:%.*]] = add i32 [[TMP2]], [[START:%.*]]
; CHECK-NEXT:    store i32 [[OP_RDX]], ptr [[RES:%.*]], align 16
; CHECK-NEXT:    ret void
;

; STORE-LABEL: @nsw_propagation_v4i32(
; STORE-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @arr_i32, align 16
; STORE-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
; STORE-NEXT:    [[OP_RDX:%.*]] = add i32 [[START:%.*]], [[TMP2]]
; STORE-NEXT:    store i32 [[OP_RDX]], ptr [[RES:%.*]], align 16
; STORE-NEXT:    ret void
  %t0 = load i32, ptr @arr_i32, align 16
  %t1 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 1), align 4
  %t2 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 2), align 8
  %t3 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 3), align 4
  %s = add nsw i32 %start, %t0
  %add = add nsw i32 %t1, %s
  %add.1 = add nsw i32 %t2, %add
  %add.2 = add nsw i32 %t3, %add.1
  store i32 %add.2, ptr %res, align 16
  ret void
}

declare i32 @__gxx_personality_v0(...)