; RUN: opt < %s -passes=loop-vectorize -scalable-vectorization=on \
; RUN:   -riscv-v-vector-bits-min=128 -riscv-v-vector-bits-max=128 \
; RUN:   -pass-remarks=loop-vectorize -pass-remarks-analysis=loop-vectorize \
; RUN:   -pass-remarks-missed=loop-vectorize -mtriple riscv64-linux-gnu \
; RUN:   -force-target-max-vector-interleave=2 -mattr=+v,+f -S 2>%t \
; RUN:   | FileCheck %s -check-prefix=CHECK
; RUN: cat %t | FileCheck %s -check-prefix=CHECK-REMARK

; Reduction can be vectorized
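;
; Each vectorized reduction below is interleaved by two: the vector loop
; carries two partial accumulators, the middle block combines them with the
; scalar reduction operation, and a single llvm.vector.reduce.* call produces
; the final scalar result.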

; ADD

; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
define i32 @add(ptr nocapture %a, ptr nocapture readonly %b, i64 %n) {
; CHECK-LABEL: @add
; CHECK: vector.body:
; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x i32>
; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x i32>
; CHECK: %[[ADD1:.*]] = add <vscale x 8 x i32> %[[LOAD1]]
; CHECK: %[[ADD2:.*]] = add <vscale x 8 x i32> %[[LOAD2]]
; CHECK: middle.block:
; CHECK: %[[ADD:.*]] = add <vscale x 8 x i32> %[[ADD2]], %[[ADD1]]
; CHECK-NEXT: call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> %[[ADD]])
entry:
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %sum.07 = phi i32 [ 2, %entry ], [ %add, %for.body ]
  %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
  %0 = load i32, ptr %arrayidx, align 4
  %add = add nsw i32 %0, %sum.07
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %n
  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0

for.end:                                          ; preds = %for.body, %entry
  ret i32 %add
}

; OR

; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
define i32 @or(ptr nocapture %a, ptr nocapture readonly %b, i64 %n) {
; CHECK-LABEL: @or
; CHECK: vector.body:
; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x i32>
; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x i32>
; CHECK: %[[OR1:.*]] = or <vscale x 8 x i32> %[[LOAD1]]
; CHECK: %[[OR2:.*]] = or <vscale x 8 x i32> %[[LOAD2]]
; CHECK: middle.block:
; CHECK: %[[OR:.*]] = or <vscale x 8 x i32> %[[OR2]], %[[OR1]]
; CHECK-NEXT: call i32 @llvm.vector.reduce.or.nxv8i32(<vscale x 8 x i32> %[[OR]])
entry:
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %sum.07 = phi i32 [ 2, %entry ], [ %or, %for.body ]
  %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
  %0 = load i32, ptr %arrayidx, align 4
  %or = or i32 %0, %sum.07
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %n
  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0

for.end:                                          ; preds = %for.body, %entry
  ret i32 %or
}

; AND

; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
define i32 @and(ptr nocapture %a, ptr nocapture readonly %b, i64 %n) {
; CHECK-LABEL: @and
; CHECK: vector.body:
; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x i32>
; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x i32>
; CHECK: %[[AND1:.*]] = and <vscale x 8 x i32> %[[LOAD1]]
; CHECK: %[[AND2:.*]] = and <vscale x 8 x i32> %[[LOAD2]]
; CHECK: middle.block:
; CHECK: %[[AND:.*]] = and <vscale x 8 x i32> %[[AND2]], %[[AND1]]
; CHECK-NEXT: call i32 @llvm.vector.reduce.and.nxv8i32(<vscale x 8 x i32> %[[AND]])
entry:
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %sum.07 = phi i32 [ 2, %entry ], [ %and, %for.body ]
  %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
  %0 = load i32, ptr %arrayidx, align 4
  %and = and i32 %0, %sum.07
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %n
  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0

for.end:                                          ; preds = %for.body, %entry
  ret i32 %and
}

; XOR

; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
define i32 @xor(ptr nocapture %a, ptr nocapture readonly %b, i64 %n) {
; CHECK-LABEL: @xor
; CHECK: vector.body:
; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x i32>
; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x i32>
; CHECK: %[[XOR1:.*]] = xor <vscale x 8 x i32> %[[LOAD1]]
; CHECK: %[[XOR2:.*]] = xor <vscale x 8 x i32> %[[LOAD2]]
; CHECK: middle.block:
; CHECK: %[[XOR:.*]] = xor <vscale x 8 x i32> %[[XOR2]], %[[XOR1]]
; CHECK-NEXT: call i32 @llvm.vector.reduce.xor.nxv8i32(<vscale x 8 x i32> %[[XOR]])
entry:
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %sum.07 = phi i32 [ 2, %entry ], [ %xor, %for.body ]
  %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
  %0 = load i32, ptr %arrayidx, align 4
  %xor = xor i32 %0, %sum.07
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %n
  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0

for.end:                                          ; preds = %for.body, %entry
  ret i32 %xor
}
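
; Integer min/max reductions appear in the scalar loop as an icmp feeding a
; select; the vectorizer recognizes the idiom, combines the interleaved parts
; with @llvm.smin/@llvm.umax in the middle block, and reduces with
; llvm.vector.reduce.smin/umax.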

; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
; SMIN

define i32 @smin(ptr nocapture %a, ptr nocapture readonly %b, i64 %n) {
; CHECK-LABEL: @smin
; CHECK: vector.body:
; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x i32>
; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x i32>
; CHECK: %[[ICMP1:.*]] = icmp slt <vscale x 8 x i32> %[[LOAD1]]
; CHECK: %[[ICMP2:.*]] = icmp slt <vscale x 8 x i32> %[[LOAD2]]
; CHECK: %[[SEL1:.*]] = select <vscale x 8 x i1> %[[ICMP1]], <vscale x 8 x i32> %[[LOAD1]]
; CHECK: %[[SEL2:.*]] = select <vscale x 8 x i1> %[[ICMP2]], <vscale x 8 x i32> %[[LOAD2]]
; CHECK: middle.block:
; CHECK: %[[RDX:.*]] = call <vscale x 8 x i32> @llvm.smin.nxv8i32(<vscale x 8 x i32> %[[SEL1]], <vscale x 8 x i32> %[[SEL2]])
; CHECK-NEXT: call i32 @llvm.vector.reduce.smin.nxv8i32(<vscale x 8 x i32> %[[RDX]])
entry:
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %sum.010 = phi i32 [ 2, %entry ], [ %.sroa.speculated, %for.body ]
  %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
  %0 = load i32, ptr %arrayidx, align 4
  %cmp.i = icmp slt i32 %0, %sum.010
  %.sroa.speculated = select i1 %cmp.i, i32 %0, i32 %sum.010
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %n
  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0

for.end:
  ret i32 %.sroa.speculated
}

; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
; UMAX

define i32 @umax(ptr nocapture %a, ptr nocapture readonly %b, i64 %n) {
; CHECK-LABEL: @umax
; CHECK: vector.body:
; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x i32>
; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x i32>
; CHECK: %[[ICMP1:.*]] = icmp ugt <vscale x 8 x i32> %[[LOAD1]]
; CHECK: %[[ICMP2:.*]] = icmp ugt <vscale x 8 x i32> %[[LOAD2]]
; CHECK: %[[SEL1:.*]] = select <vscale x 8 x i1> %[[ICMP1]], <vscale x 8 x i32> %[[LOAD1]]
; CHECK: %[[SEL2:.*]] = select <vscale x 8 x i1> %[[ICMP2]], <vscale x 8 x i32> %[[LOAD2]]
; CHECK: middle.block:
; CHECK: %[[RDX:.*]] = call <vscale x 8 x i32> @llvm.umax.nxv8i32(<vscale x 8 x i32> %[[SEL1]], <vscale x 8 x i32> %[[SEL2]])
; CHECK-NEXT: call i32 @llvm.vector.reduce.umax.nxv8i32(<vscale x 8 x i32> %[[RDX]])
entry:
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %sum.010 = phi i32 [ 2, %entry ], [ %.sroa.speculated, %for.body ]
  %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
  %0 = load i32, ptr %arrayidx, align 4
  %cmp.i = icmp ugt i32 %0, %sum.010
  %.sroa.speculated = select i1 %cmp.i, i32 %0, i32 %sum.010
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %n
  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0

for.end:
  ret i32 %.sroa.speculated
}
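
; Floating-point reductions are only vectorized when reassociation is allowed
; (the fast/reassoc flags on the instructions below), since vectorization
; changes the order in which the elements are combined.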

; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
; FADD (FAST)

define float @fadd_fast(ptr noalias nocapture readonly %a, i64 %n) {
; CHECK-LABEL: @fadd_fast
; CHECK: vector.body:
; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x float>
; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x float>
; CHECK: %[[ADD1:.*]] = fadd fast <vscale x 8 x float> %[[LOAD1]]
; CHECK: %[[ADD2:.*]] = fadd fast <vscale x 8 x float> %[[LOAD2]]
; CHECK: middle.block:
; CHECK: %[[ADD:.*]] = fadd fast <vscale x 8 x float> %[[ADD2]], %[[ADD1]]
; CHECK-NEXT: call fast float @llvm.vector.reduce.fadd.nxv8f32(float 0.000000e+00, <vscale x 8 x float> %[[ADD]])
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %sum.07 = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
  %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv
  %0 = load float, ptr %arrayidx, align 4
  %add = fadd fast float %0, %sum.07
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %n
  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0

for.end:
  ret float %add
}

; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
define half @fadd_fast_half_zvfh(ptr noalias nocapture readonly %a, i64 %n) "target-features"="+zvfh" {
; CHECK-LABEL: @fadd_fast_half_zvfh
; CHECK: vector.body:
; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x half>
; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x half>
; CHECK: %[[FADD1:.*]] = fadd fast <vscale x 8 x half> %[[LOAD1]]
; CHECK: %[[FADD2:.*]] = fadd fast <vscale x 8 x half> %[[LOAD2]]
; CHECK: middle.block:
; CHECK: %[[RDX:.*]] = fadd fast <vscale x 8 x half> %[[FADD2]], %[[FADD1]]
; CHECK: call fast half @llvm.vector.reduce.fadd.nxv8f16(half 0xH0000, <vscale x 8 x half> %[[RDX]])
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %sum.07 = phi half [ 0.000000e+00, %entry ], [ %add, %for.body ]
  %arrayidx = getelementptr inbounds half, ptr %a, i64 %iv
  %0 = load half, ptr %arrayidx, align 4
  %add = fadd fast half %0, %sum.07
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %n
  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0

for.end:
  ret half %add
}
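
; As with the fmuladd tests further down, f16 reductions with only zvfhmin
; (and bf16 with only zvfbfmin) cannot be vectorized scalably, so the remarks
; below report a fallback to fixed-length <16 x half> / <16 x bfloat> vectors.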

; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop.
; CHECK-REMARK: vectorized loop (vectorization width: 16, interleaved count: 2)
define half @fadd_fast_half_zvfhmin(ptr noalias nocapture readonly %a, i64 %n) "target-features"="+zvfhmin" {
; CHECK-LABEL: @fadd_fast_half_zvfhmin
; CHECK: vector.body:
; CHECK: %[[LOAD1:.*]] = load <16 x half>
; CHECK: %[[LOAD2:.*]] = load <16 x half>
; CHECK: %[[FADD1:.*]] = fadd fast <16 x half> %[[LOAD1]]
; CHECK: %[[FADD2:.*]] = fadd fast <16 x half> %[[LOAD2]]
; CHECK: middle.block:
; CHECK: %[[RDX:.*]] = fadd fast <16 x half> %[[FADD2]], %[[FADD1]]
; CHECK: call fast half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> %[[RDX]])
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %sum.07 = phi half [ 0.000000e+00, %entry ], [ %add, %for.body ]
  %arrayidx = getelementptr inbounds half, ptr %a, i64 %iv
  %0 = load half, ptr %arrayidx, align 4
  %add = fadd fast half %0, %sum.07
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %n
  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0

for.end:
  ret half %add
}

; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop.
; CHECK-REMARK: vectorized loop (vectorization width: 16, interleaved count: 2)
define bfloat @fadd_fast_bfloat(ptr noalias nocapture readonly %a, i64 %n) "target-features"="+zvfbfmin" {
; CHECK-LABEL: @fadd_fast_bfloat
; CHECK: vector.body:
; CHECK: %[[LOAD1:.*]] = load <16 x bfloat>
; CHECK: %[[LOAD2:.*]] = load <16 x bfloat>
; CHECK: %[[FADD1:.*]] = fadd fast <16 x bfloat> %[[LOAD1]]
; CHECK: %[[FADD2:.*]] = fadd fast <16 x bfloat> %[[LOAD2]]
; CHECK: middle.block:
; CHECK: %[[RDX:.*]] = fadd fast <16 x bfloat> %[[FADD2]], %[[FADD1]]
; CHECK: call fast bfloat @llvm.vector.reduce.fadd.v16bf16(bfloat 0xR0000, <16 x bfloat> %[[RDX]])
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %sum.07 = phi bfloat [ 0.000000e+00, %entry ], [ %add, %for.body ]
  %arrayidx = getelementptr inbounds bfloat, ptr %a, i64 %iv
  %0 = load bfloat, ptr %arrayidx, align 4
  %add = fadd fast bfloat %0, %sum.07
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %n
  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0

for.end:
  ret bfloat %add
}
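
; The fmin/fmax tests below use attributes #0 (no-nans-fp-math and
; no-signed-zeros-fp-math, defined at the end of the file) so that the
; fcmp+select idiom can legally be treated as an fmin/fmax reduction.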

; FMIN (FAST)

; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
define float @fmin_fast(ptr noalias nocapture readonly %a, i64 %n) #0 {
; CHECK-LABEL: @fmin_fast
; CHECK: vector.body:
; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x float>
; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x float>
; CHECK: %[[FCMP1:.*]] = fcmp olt <vscale x 8 x float> %[[LOAD1]]
; CHECK: %[[FCMP2:.*]] = fcmp olt <vscale x 8 x float> %[[LOAD2]]
; CHECK: %[[SEL1:.*]] = select <vscale x 8 x i1> %[[FCMP1]], <vscale x 8 x float> %[[LOAD1]]
; CHECK: %[[SEL2:.*]] = select <vscale x 8 x i1> %[[FCMP2]], <vscale x 8 x float> %[[LOAD2]]
; CHECK: middle.block:
; CHECK: %[[FCMP:.*]] = fcmp olt <vscale x 8 x float> %[[SEL1]], %[[SEL2]]
; CHECK-NEXT: %[[SEL:.*]] = select <vscale x 8 x i1> %[[FCMP]], <vscale x 8 x float> %[[SEL1]], <vscale x 8 x float> %[[SEL2]]
; CHECK-NEXT: call float @llvm.vector.reduce.fmin.nxv8f32(<vscale x 8 x float> %[[SEL]])
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %sum.07 = phi float [ 0.000000e+00, %entry ], [ %.sroa.speculated, %for.body ]
  %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv
  %0 = load float, ptr %arrayidx, align 4
  %cmp.i = fcmp olt float %0, %sum.07
  %.sroa.speculated = select i1 %cmp.i, float %0, float %sum.07
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %n
  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0

for.end:
  ret float %.sroa.speculated
}

; FMAX (FAST)

; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
define float @fmax_fast(ptr noalias nocapture readonly %a, i64 %n) #0 {
; CHECK-LABEL: @fmax_fast
; CHECK: vector.body:
; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x float>
; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x float>
; CHECK: %[[FCMP1:.*]] = fcmp fast ogt <vscale x 8 x float> %[[LOAD1]]
; CHECK: %[[FCMP2:.*]] = fcmp fast ogt <vscale x 8 x float> %[[LOAD2]]
; CHECK: %[[SEL1:.*]] = select <vscale x 8 x i1> %[[FCMP1]], <vscale x 8 x float> %[[LOAD1]]
; CHECK: %[[SEL2:.*]] = select <vscale x 8 x i1> %[[FCMP2]], <vscale x 8 x float> %[[LOAD2]]
; CHECK: middle.block:
; CHECK: %[[FCMP:.*]] = fcmp fast ogt <vscale x 8 x float> %[[SEL1]], %[[SEL2]]
; CHECK-NEXT: %[[SEL:.*]] = select fast <vscale x 8 x i1> %[[FCMP]], <vscale x 8 x float> %[[SEL1]], <vscale x 8 x float> %[[SEL2]]
; CHECK-NEXT: call fast float @llvm.vector.reduce.fmax.nxv8f32(<vscale x 8 x float> %[[SEL]])
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %sum.07 = phi float [ 0.000000e+00, %entry ], [ %.sroa.speculated, %for.body ]
  %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv
  %0 = load float, ptr %arrayidx, align 4
  %cmp.i = fcmp fast ogt float %0, %sum.07
  %.sroa.speculated = select i1 %cmp.i, float %0, float %sum.07
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %n
  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0

for.end:
  ret float %.sroa.speculated
}
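
; RVV provides no vector multiply-reduction instruction, so a mul reduction
; cannot be vectorized with scalable vectors; the tests below fall back to
; fixed-length <8 x i32>.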

; Reduction cannot be vectorized

; MUL

; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop.
; CHECK-REMARK: vectorized loop (vectorization width: 8, interleaved count: 2)
define i32 @mul(ptr nocapture %a, ptr nocapture readonly %b, i64 %n) {
; CHECK-LABEL: @mul
; CHECK: vector.body:
; CHECK: %[[LOAD1:.*]] = load <8 x i32>
; CHECK: %[[LOAD2:.*]] = load <8 x i32>
; CHECK: %[[MUL1:.*]] = mul <8 x i32> %[[LOAD1]]
; CHECK: %[[MUL2:.*]] = mul <8 x i32> %[[LOAD2]]
; CHECK: middle.block:
; CHECK: %[[RDX:.*]] = mul <8 x i32> %[[MUL2]], %[[MUL1]]
; CHECK: call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> %[[RDX]])
entry:
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %sum.07 = phi i32 [ 2, %entry ], [ %mul, %for.body ]
  %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
  %0 = load i32, ptr %arrayidx, align 4
  %mul = mul nsw i32 %0, %sum.07
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %n
  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0

for.end:                                          ; preds = %for.body, %entry
  ret i32 %mul
}

; Note: This test was added to ensure we always check the legality of
; reductions (and emit a warning if necessary) before checking for memory
; dependencies.
; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop.
; CHECK-REMARK: vectorized loop (vectorization width: 8, interleaved count: 2)
define i32 @memory_dependence(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, i64 %n) {
; CHECK-LABEL: @memory_dependence
; CHECK: vector.body:
; CHECK: %[[LOAD1:.*]] = load <8 x i32>
; CHECK: %[[LOAD2:.*]] = load <8 x i32>
; CHECK: %[[LOAD3:.*]] = load <8 x i32>
; CHECK: %[[LOAD4:.*]] = load <8 x i32>
; CHECK: %[[ADD1:.*]] = add nsw <8 x i32> %[[LOAD3]], %[[LOAD1]]
; CHECK: %[[ADD2:.*]] = add nsw <8 x i32> %[[LOAD4]], %[[LOAD2]]
; CHECK: %[[MUL1:.*]] = mul <8 x i32> %[[LOAD3]]
; CHECK: %[[MUL2:.*]] = mul <8 x i32> %[[LOAD4]]
; CHECK: middle.block:
; CHECK: %[[RDX:.*]] = mul <8 x i32> %[[MUL2]], %[[MUL1]]
; CHECK: call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> %[[RDX]])
entry:
  br label %for.body

for.body:
  %i = phi i64 [ %inc, %for.body ], [ 0, %entry ]
  %sum = phi i32 [ %mul, %for.body ], [ 2, %entry ]
  %arrayidx = getelementptr inbounds i32, ptr %a, i64 %i
  %0 = load i32, ptr %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds i32, ptr %b, i64 %i
  %1 = load i32, ptr %arrayidx1, align 4
  %add = add nsw i32 %1, %0
  %add2 = add nuw nsw i64 %i, 32
  %arrayidx3 = getelementptr inbounds i32, ptr %a, i64 %add2
  store i32 %add, ptr %arrayidx3, align 4
  %mul = mul nsw i32 %1, %sum
  %inc = add nuw nsw i64 %i, 1
  %exitcond.not = icmp eq i64 %inc, %n
  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0

for.end:
  ret i32 %mul
}
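
; A reassoc call to llvm.fmuladd is recognized as a multiply-add reduction:
; the two interleaved fmuladd results are summed in the middle block and then
; reduced with llvm.vector.reduce.fadd.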

; CHECK-REMARK: vectorized loop (vectorization width: vscale x 4, interleaved count: 2)
define float @fmuladd(ptr %a, ptr %b, i64 %n) {
; CHECK-LABEL: @fmuladd(
; CHECK: vector.body:
; CHECK: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>
; CHECK: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x float>
; CHECK: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x float>
; CHECK: [[WIDE_LOAD4:%.*]] = load <vscale x 4 x float>
; CHECK: [[MULADD1:%.*]] = call reassoc <vscale x 4 x float> @llvm.fmuladd.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x float> [[WIDE_LOAD3]],
; CHECK: [[MULADD2:%.*]] = call reassoc <vscale x 4 x float> @llvm.fmuladd.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD2]], <vscale x 4 x float> [[WIDE_LOAD4]],
; CHECK: middle.block:
; CHECK: [[BIN_RDX:%.*]] = fadd reassoc <vscale x 4 x float> [[MULADD2]], [[MULADD1]]
; CHECK: call reassoc float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> [[BIN_RDX]])
;
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %sum.07 = phi float [ 0.000000e+00, %entry ], [ %muladd, %for.body ]
  %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv
  %0 = load float, ptr %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds float, ptr %b, i64 %iv
  %1 = load float, ptr %arrayidx2, align 4
  %muladd = tail call reassoc float @llvm.fmuladd.f32(float %0, float %1, float %sum.07)
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %n
  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !1

for.end:
  ret float %muladd
}

; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
define half @fmuladd_f16_zvfh(ptr %a, ptr %b, i64 %n) "target-features"="+zvfh" {
; CHECK-LABEL: @fmuladd_f16_zvfh(
; CHECK: vector.body:
; CHECK: [[WIDE_LOAD:%.*]] = load <vscale x 8 x half>
; CHECK: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x half>
; CHECK: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x half>
; CHECK: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x half>
; CHECK: [[MULADD1:%.*]] = call reassoc <vscale x 8 x half> @llvm.fmuladd.nxv8f16(<vscale x 8 x half> [[WIDE_LOAD]], <vscale x 8 x half> [[WIDE_LOAD3]],
; CHECK: [[MULADD2:%.*]] = call reassoc <vscale x 8 x half> @llvm.fmuladd.nxv8f16(<vscale x 8 x half> [[WIDE_LOAD2]], <vscale x 8 x half> [[WIDE_LOAD4]],
; CHECK: middle.block:
; CHECK: [[BIN_RDX:%.*]] = fadd reassoc <vscale x 8 x half> [[MULADD2]], [[MULADD1]]
; CHECK: call reassoc half @llvm.vector.reduce.fadd.nxv8f16(half 0xH8000, <vscale x 8 x half> [[BIN_RDX]])
;
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %sum.07 = phi half [ 0.000000e+00, %entry ], [ %muladd, %for.body ]
  %arrayidx = getelementptr inbounds half, ptr %a, i64 %iv
  %0 = load half, ptr %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds half, ptr %b, i64 %iv
  %1 = load half, ptr %arrayidx2, align 4
  %muladd = tail call reassoc half @llvm.fmuladd.f16(half %0, half %1, half %sum.07)
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %n
  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !1

for.end:
  ret half %muladd
}

; We can't scalably vectorize reductions of f16 with zvfhmin or bf16 with
; zvfbfmin, so make sure we use fixed-length vectors instead.

; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop.
; CHECK-REMARK: vectorized loop (vectorization width: 16, interleaved count: 2)
define half @fmuladd_f16_zvfhmin(ptr %a, ptr %b, i64 %n) "target-features"="+zvfhmin" {
; CHECK-LABEL: @fmuladd_f16_zvfhmin(
; CHECK: vector.body:
; CHECK: [[WIDE_LOAD:%.*]] = load <16 x half>
; CHECK: [[WIDE_LOAD2:%.*]] = load <16 x half>
; CHECK: [[WIDE_LOAD3:%.*]] = load <16 x half>
; CHECK: [[WIDE_LOAD4:%.*]] = load <16 x half>
; CHECK: [[MULADD1:%.*]] = call reassoc <16 x half> @llvm.fmuladd.v16f16(<16 x half> [[WIDE_LOAD]], <16 x half> [[WIDE_LOAD3]],
; CHECK: [[MULADD2:%.*]] = call reassoc <16 x half> @llvm.fmuladd.v16f16(<16 x half> [[WIDE_LOAD2]], <16 x half> [[WIDE_LOAD4]],
; CHECK: middle.block:
; CHECK: [[BIN_RDX:%.*]] = fadd reassoc <16 x half> [[MULADD2]], [[MULADD1]]
; CHECK: call reassoc half @llvm.vector.reduce.fadd.v16f16(half 0xH8000, <16 x half> [[BIN_RDX]])
;
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %sum.07 = phi half [ 0.000000e+00, %entry ], [ %muladd, %for.body ]
  %arrayidx = getelementptr inbounds half, ptr %a, i64 %iv
  %0 = load half, ptr %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds half, ptr %b, i64 %iv
  %1 = load half, ptr %arrayidx2, align 4
  %muladd = tail call reassoc half @llvm.fmuladd.f16(half %0, half %1, half %sum.07)
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %n
  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !1

for.end:
  ret half %muladd
}

; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop.
; CHECK-REMARK: vectorized loop (vectorization width: 16, interleaved count: 2)
define bfloat @fmuladd_bf16(ptr %a, ptr %b, i64 %n) "target-features"="+zvfbfmin" {
; CHECK-LABEL: @fmuladd_bf16(
; CHECK: vector.body:
; CHECK: [[WIDE_LOAD:%.*]] = load <16 x bfloat>
; CHECK: [[WIDE_LOAD2:%.*]] = load <16 x bfloat>
; CHECK: [[WIDE_LOAD3:%.*]] = load <16 x bfloat>
; CHECK: [[WIDE_LOAD4:%.*]] = load <16 x bfloat>
; CHECK: [[MULADD1:%.*]] = call reassoc <16 x bfloat> @llvm.fmuladd.v16bf16(<16 x bfloat> [[WIDE_LOAD]], <16 x bfloat> [[WIDE_LOAD3]],
; CHECK: [[MULADD2:%.*]] = call reassoc <16 x bfloat> @llvm.fmuladd.v16bf16(<16 x bfloat> [[WIDE_LOAD2]], <16 x bfloat> [[WIDE_LOAD4]],
; CHECK: middle.block:
; CHECK: [[BIN_RDX:%.*]] = fadd reassoc <16 x bfloat> [[MULADD2]], [[MULADD1]]
; CHECK: call reassoc bfloat @llvm.vector.reduce.fadd.v16bf16(bfloat 0xR8000, <16 x bfloat> [[BIN_RDX]])
;
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %sum.07 = phi bfloat [ 0.000000e+00, %entry ], [ %muladd, %for.body ]
  %arrayidx = getelementptr inbounds bfloat, ptr %a, i64 %iv
  %0 = load bfloat, ptr %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds bfloat, ptr %b, i64 %iv
  %1 = load bfloat, ptr %arrayidx2, align 4
  %muladd = tail call reassoc bfloat @llvm.fmuladd.bf16(bfloat %0, bfloat %1, bfloat %sum.07)
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %n
  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !1

for.end:
  ret bfloat %muladd
}

declare float @llvm.fmuladd.f32(float, float, float)

attributes #0 = { "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" }
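
; !0 forces the vectorizer's hand: vectorization enabled (!4) at width 8 (!1)
; with scalable vectors (!2) and an interleave count of 2 (!3).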
!0 = distinct !{!0, !1, !2, !3, !4}
!1 = !{!"llvm.loop.vectorize.width", i32 8}
!2 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
!3 = !{!"llvm.loop.interleave.count", i32 2}
!4 = !{!"llvm.loop.vectorize.enable", i1 true}