; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2 -mtriple=riscv64 -mattr=+v -S %s | FileCheck --check-prefixes=CHECK,NON-POW2 %s
; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -mtriple=riscv64 -mattr=+v -S %s | FileCheck --check-prefixes=CHECK,POW2-ONLY %s

; Three consecutive i32 loads, each multiplied by the constant 10 and stored
; to three consecutive i32 slots.
define void @v3_load_i32_mul_by_constant_store(ptr %src, ptr %dst) {
; NON-POW2-LABEL: @v3_load_i32_mul_by_constant_store(
; NON-POW2-NEXT:  entry:
; NON-POW2-NEXT:    [[GEP_SRC_0:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 0
; NON-POW2-NEXT:    [[TMP0:%.*]] = load <3 x i32>, ptr [[GEP_SRC_0]], align 4
; NON-POW2-NEXT:    [[TMP1:%.*]] = mul nsw <3 x i32> [[TMP0]], splat (i32 10)
; NON-POW2-NEXT:    store <3 x i32> [[TMP1]], ptr [[DST:%.*]], align 4
; NON-POW2-NEXT:    ret void
;
; POW2-ONLY-LABEL: @v3_load_i32_mul_by_constant_store(
; POW2-ONLY-NEXT:  entry:
; POW2-ONLY-NEXT:    [[GEP_SRC_0:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 0
; POW2-ONLY-NEXT:    [[GEP_SRC_2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 2
; POW2-ONLY-NEXT:    [[L_SRC_2:%.*]] = load i32, ptr [[GEP_SRC_2]], align 4
; POW2-ONLY-NEXT:    [[MUL_2:%.*]] = mul nsw i32 [[L_SRC_2]], 10
; POW2-ONLY-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[GEP_SRC_0]], align 4
; POW2-ONLY-NEXT:    [[TMP1:%.*]] = mul nsw <2 x i32> [[TMP0]], splat (i32 10)
; POW2-ONLY-NEXT:    store <2 x i32> [[TMP1]], ptr [[DST:%.*]], align 4
; POW2-ONLY-NEXT:    [[DST_2:%.*]] = getelementptr i32, ptr [[DST]], i32 2
; POW2-ONLY-NEXT:    store i32 [[MUL_2]], ptr [[DST_2]], align 4
; POW2-ONLY-NEXT:    ret void
;
entry:
  %gep.src.0 = getelementptr inbounds i32, ptr %src, i32 0
  %l.src.0 = load i32, ptr %gep.src.0, align 4
  %mul.0 = mul nsw i32 %l.src.0, 10

  %gep.src.1 = getelementptr inbounds i32, ptr %src, i32 1
  %l.src.1 = load i32, ptr %gep.src.1, align 4
  %mul.1 = mul nsw i32 %l.src.1, 10

  %gep.src.2 = getelementptr inbounds i32, ptr %src, i32 2
  %l.src.2 = load i32, ptr %gep.src.2, align 4
  %mul.2 = mul nsw i32 %l.src.2, 10

  store i32 %mul.0, ptr %dst

  %dst.1 = getelementptr i32, ptr %dst, i32 1
  store i32 %mul.1, ptr %dst.1

  %dst.2 = getelementptr i32, ptr %dst, i32 2
  store i32 %mul.2, ptr %dst.2

  ret void
}

; Should not be vectorized with an undef/poison element as padding, as
; division by undef/poison may cause UB.  Must use VL predication or
; masking instead, where RISCV wins.
define void @v3_load_i32_udiv_by_constant_store(ptr %src, ptr %dst) {
; NON-POW2-LABEL: @v3_load_i32_udiv_by_constant_store(
; NON-POW2-NEXT:  entry:
; NON-POW2-NEXT:    [[GEP_SRC_0:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 0
; NON-POW2-NEXT:    [[TMP0:%.*]] = load <3 x i32>, ptr [[GEP_SRC_0]], align 4
; NON-POW2-NEXT:    [[TMP1:%.*]] = udiv <3 x i32> splat (i32 10), [[TMP0]]
; NON-POW2-NEXT:    store <3 x i32> [[TMP1]], ptr [[DST:%.*]], align 4
; NON-POW2-NEXT:    ret void
;
; POW2-ONLY-LABEL: @v3_load_i32_udiv_by_constant_store(
; POW2-ONLY-NEXT:  entry:
; POW2-ONLY-NEXT:    [[GEP_SRC_0:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 0
; POW2-ONLY-NEXT:    [[L_SRC_0:%.*]] = load i32, ptr [[GEP_SRC_0]], align 4
; POW2-ONLY-NEXT:    [[MUL_0:%.*]] = udiv i32 10, [[L_SRC_0]]
; POW2-ONLY-NEXT:    [[GEP_SRC_1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 1
; POW2-ONLY-NEXT:    [[L_SRC_1:%.*]] = load i32, ptr [[GEP_SRC_1]], align 4
; POW2-ONLY-NEXT:    [[MUL_1:%.*]] = udiv i32 10, [[L_SRC_1]]
; POW2-ONLY-NEXT:    [[GEP_SRC_2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 2
; POW2-ONLY-NEXT:    [[L_SRC_2:%.*]] = load i32, ptr [[GEP_SRC_2]], align 4
; POW2-ONLY-NEXT:    [[MUL_2:%.*]] = udiv i32 10, [[L_SRC_2]]
; POW2-ONLY-NEXT:    store i32 [[MUL_0]], ptr [[DST:%.*]], align 4
; POW2-ONLY-NEXT:    [[DST_1:%.*]] = getelementptr i32, ptr [[DST]], i32 1
; POW2-ONLY-NEXT:    store i32 [[MUL_1]], ptr [[DST_1]], align 4
; POW2-ONLY-NEXT:    [[DST_2:%.*]] = getelementptr i32, ptr [[DST]], i32 2
; POW2-ONLY-NEXT:    store i32 [[MUL_2]], ptr [[DST_2]], align 4
; POW2-ONLY-NEXT:    ret void
;
entry:
  %gep.src.0 = getelementptr inbounds i32, ptr %src, i32 0
  %l.src.0 = load i32, ptr %gep.src.0, align 4
  %mul.0 = udiv i32 10, %l.src.0

  %gep.src.1 = getelementptr inbounds i32, ptr %src, i32 1
  %l.src.1 = load i32, ptr %gep.src.1, align 4
  %mul.1 = udiv i32 10, %l.src.1

  %gep.src.2 = getelementptr inbounds i32, ptr %src, i32 2
  %l.src.2 = load i32, ptr %gep.src.2, align 4
  %mul.2 = udiv i32 10, %l.src.2

  store i32 %mul.0, ptr %dst

  %dst.1 = getelementptr i32, ptr %dst, i32 1
  store i32 %mul.1, ptr %dst.1

  %dst.2 = getelementptr i32, ptr %dst, i32 2
  store i32 %mul.2, ptr %dst.2

  ret void
}

; Element-wise product of three consecutive i32 values from each of two
; sources, stored to three consecutive i32 slots.
define void @v3_load_i32_mul_store(ptr %src.1, ptr %src.2, ptr %dst) {
; NON-POW2-LABEL: @v3_load_i32_mul_store(
; NON-POW2-NEXT:  entry:
; NON-POW2-NEXT:    [[GEP_SRC_1_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_1:%.*]], i32 0
; NON-POW2-NEXT:    [[GEP_SRC_2_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_2:%.*]], i32 0
; NON-POW2-NEXT:    [[TMP0:%.*]] = load <3 x i32>, ptr [[GEP_SRC_1_0]], align 4
; NON-POW2-NEXT:    [[TMP1:%.*]] = load <3 x i32>, ptr [[GEP_SRC_2_0]], align 4
; NON-POW2-NEXT:    [[TMP2:%.*]] = mul nsw <3 x i32> [[TMP0]], [[TMP1]]
; NON-POW2-NEXT:    store <3 x i32> [[TMP2]], ptr [[DST:%.*]], align 4
; NON-POW2-NEXT:    ret void
;
; POW2-ONLY-LABEL: @v3_load_i32_mul_store(
; POW2-ONLY-NEXT:  entry:
; POW2-ONLY-NEXT:    [[GEP_SRC_1_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_1:%.*]], i32 0
; POW2-ONLY-NEXT:    [[GEP_SRC_2_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_2:%.*]], i32 0
; POW2-ONLY-NEXT:    [[GEP_SRC_1_2:%.*]] = getelementptr inbounds i32, ptr [[SRC_1]], i32 2
; POW2-ONLY-NEXT:    [[L_SRC_1_2:%.*]] = load i32, ptr [[GEP_SRC_1_2]], align 4
; POW2-ONLY-NEXT:    [[GEP_SRC_2_2:%.*]] = getelementptr inbounds i32, ptr [[SRC_2]], i32 2
; POW2-ONLY-NEXT:    [[L_SRC_2_2:%.*]] = load i32, ptr [[GEP_SRC_2_2]], align 4
; POW2-ONLY-NEXT:    [[MUL_2:%.*]] = mul nsw i32 [[L_SRC_1_2]], [[L_SRC_2_2]]
; POW2-ONLY-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[GEP_SRC_1_0]], align 4
; POW2-ONLY-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[GEP_SRC_2_0]], align 4
; POW2-ONLY-NEXT:    [[TMP2:%.*]] = mul nsw <2 x i32> [[TMP0]], [[TMP1]]
; POW2-ONLY-NEXT:    store <2 x i32> [[TMP2]], ptr [[DST:%.*]], align 4
; POW2-ONLY-NEXT:    [[DST_2:%.*]] = getelementptr i32, ptr [[DST]], i32 2
; POW2-ONLY-NEXT:    store i32 [[MUL_2]], ptr [[DST_2]], align 4
; POW2-ONLY-NEXT:    ret void
;
entry:
  %gep.src.1.0 = getelementptr inbounds i32, ptr %src.1, i32 0
  %l.src.1.0 = load i32, ptr %gep.src.1.0, align 4
  %gep.src.2.0 = getelementptr inbounds i32, ptr %src.2, i32 0
  %l.src.2.0 = load i32, ptr %gep.src.2.0, align 4
  %mul.0 = mul nsw i32 %l.src.1.0, %l.src.2.0

  %gep.src.1.1 = getelementptr inbounds i32, ptr %src.1, i32 1
  %l.src.1.1 = load i32, ptr %gep.src.1.1, align 4
  %gep.src.2.1 = getelementptr inbounds i32, ptr %src.2, i32 1
  %l.src.2.1 = load i32, ptr %gep.src.2.1, align 4
  %mul.1 = mul nsw i32 %l.src.1.1, %l.src.2.1

  %gep.src.1.2 = getelementptr inbounds i32, ptr %src.1, i32 2
  %l.src.1.2 = load i32, ptr %gep.src.1.2, align 4
  %gep.src.2.2 = getelementptr inbounds i32, ptr %src.2, i32 2
  %l.src.2.2 = load i32, ptr %gep.src.2.2, align 4
  %mul.2 = mul nsw i32 %l.src.1.2, %l.src.2.2

  store i32 %mul.0, ptr %dst

  %dst.1 = getelementptr i32, ptr %dst, i32 1
  store i32 %mul.1, ptr %dst.1

  %dst.2 = getelementptr i32, ptr %dst, i32 2
  store i32 %mul.2, ptr %dst.2

  ret void
}

; Same element-wise product as @v3_load_i32_mul_store, with the constant 9
; added to each product before it is stored.
define void @v3_load_i32_mul_add_const_store(ptr %src.1, ptr %src.2, ptr %dst) {
; NON-POW2-LABEL: @v3_load_i32_mul_add_const_store(
; NON-POW2-NEXT:  entry:
; NON-POW2-NEXT:    [[GEP_SRC_1_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_1:%.*]], i32 0
; NON-POW2-NEXT:    [[GEP_SRC_2_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_2:%.*]], i32 0
; NON-POW2-NEXT:    [[TMP0:%.*]] = load <3 x i32>, ptr [[GEP_SRC_1_0]], align 4
; NON-POW2-NEXT:    [[TMP1:%.*]] = load <3 x i32>, ptr [[GEP_SRC_2_0]], align 4
; NON-POW2-NEXT:    [[TMP2:%.*]] = mul nsw <3 x i32> [[TMP0]], [[TMP1]]
; NON-POW2-NEXT:    [[TMP3:%.*]] = add <3 x i32> [[TMP2]], splat (i32 9)
; NON-POW2-NEXT:    store <3 x i32> [[TMP3]], ptr [[DST:%.*]], align 4
; NON-POW2-NEXT:    ret void
;
; POW2-ONLY-LABEL: @v3_load_i32_mul_add_const_store(
; POW2-ONLY-NEXT:  entry:
; POW2-ONLY-NEXT:    [[GEP_SRC_1_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_1:%.*]], i32 0
; POW2-ONLY-NEXT:    [[GEP_SRC_2_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_2:%.*]], i32 0
; POW2-ONLY-NEXT:    [[GEP_SRC_1_2:%.*]] = getelementptr inbounds i32, ptr [[SRC_1]], i32 2
; POW2-ONLY-NEXT:    [[L_SRC_1_2:%.*]] = load i32, ptr [[GEP_SRC_1_2]], align 4
; POW2-ONLY-NEXT:    [[GEP_SRC_2_2:%.*]] = getelementptr inbounds i32, ptr [[SRC_2]], i32 2
; POW2-ONLY-NEXT:    [[L_SRC_2_2:%.*]] = load i32, ptr [[GEP_SRC_2_2]], align 4
; POW2-ONLY-NEXT:    [[MUL_2:%.*]] = mul nsw i32 [[L_SRC_1_2]], [[L_SRC_2_2]]
; POW2-ONLY-NEXT:    [[ADD_2:%.*]] = add i32 [[MUL_2]], 9
; POW2-ONLY-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[GEP_SRC_1_0]], align 4
; POW2-ONLY-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[GEP_SRC_2_0]], align 4
; POW2-ONLY-NEXT:    [[TMP2:%.*]] = mul nsw <2 x i32> [[TMP0]], [[TMP1]]
; POW2-ONLY-NEXT:    [[TMP3:%.*]] = add <2 x i32> [[TMP2]], splat (i32 9)
; POW2-ONLY-NEXT:    store <2 x i32> [[TMP3]], ptr [[DST:%.*]], align 4
; POW2-ONLY-NEXT:    [[DST_2:%.*]] = getelementptr i32, ptr [[DST]], i32 2
; POW2-ONLY-NEXT:    store i32 [[ADD_2]], ptr [[DST_2]], align 4
; POW2-ONLY-NEXT:    ret void
;
entry:
  %gep.src.1.0 = getelementptr inbounds i32, ptr %src.1, i32 0
  %l.src.1.0 = load i32, ptr %gep.src.1.0, align 4
  %gep.src.2.0 = getelementptr inbounds i32, ptr %src.2, i32 0
  %l.src.2.0 = load i32, ptr %gep.src.2.0, align 4
  %mul.0 = mul nsw i32 %l.src.1.0, %l.src.2.0
  %add.0 = add i32 %mul.0, 9

  %gep.src.1.1 = getelementptr inbounds i32, ptr %src.1, i32 1
  %l.src.1.1 = load i32, ptr %gep.src.1.1, align 4
  %gep.src.2.1 = getelementptr inbounds i32, ptr %src.2, i32 1
  %l.src.2.1 = load i32, ptr %gep.src.2.1, align 4
  %mul.1 = mul nsw i32 %l.src.1.1, %l.src.2.1
  %add.1 = add i32 %mul.1, 9

  %gep.src.1.2 = getelementptr inbounds i32, ptr %src.1, i32 2
  %l.src.1.2 = load i32, ptr %gep.src.1.2, align 4
  %gep.src.2.2 = getelementptr inbounds i32, ptr %src.2, i32 2
  %l.src.2.2 = load i32, ptr %gep.src.2.2, align 4
  %mul.2 = mul nsw i32 %l.src.1.2, %l.src.2.2
  %add.2 = add i32 %mul.2, 9

  store i32 %add.0, ptr %dst

  %dst.1 = getelementptr i32, ptr %dst, i32 1
  store i32 %add.1, ptr %dst.1

  %dst.2 = getelementptr i32, ptr %dst, i32 2
  store i32 %add.2, ptr %dst.2

  ret void
}

; Three consecutive float loads, each with the constant 10.0 added, stored to
; three consecutive float slots.
define void @v3_load_f32_fadd_fadd_by_constant_store(ptr %src, ptr %dst) {
; NON-POW2-LABEL: @v3_load_f32_fadd_fadd_by_constant_store(
; NON-POW2-NEXT:  entry:
; NON-POW2-NEXT:    [[GEP_SRC_0:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i32 0
; NON-POW2-NEXT:    [[TMP0:%.*]] = load <3 x float>, ptr [[GEP_SRC_0]], align 4
; NON-POW2-NEXT:    [[TMP1:%.*]] = fadd <3 x float> [[TMP0]], splat (float 1.000000e+01)
; NON-POW2-NEXT:    store <3 x float> [[TMP1]], ptr [[DST:%.*]], align 4
; NON-POW2-NEXT:    ret void
;
; POW2-ONLY-LABEL: @v3_load_f32_fadd_fadd_by_constant_store(
; POW2-ONLY-NEXT:  entry:
; POW2-ONLY-NEXT:    [[GEP_SRC_0:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i32 0
; POW2-ONLY-NEXT:    [[GEP_SRC_2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i32 2
; POW2-ONLY-NEXT:    [[L_SRC_2:%.*]] = load float, ptr [[GEP_SRC_2]], align 4
; POW2-ONLY-NEXT:    [[FADD_2:%.*]] = fadd float [[L_SRC_2]], 1.000000e+01
; POW2-ONLY-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[GEP_SRC_0]], align 4
; POW2-ONLY-NEXT:    [[TMP1:%.*]] = fadd <2 x float> [[TMP0]], splat (float 1.000000e+01)
; POW2-ONLY-NEXT:    store <2 x float> [[TMP1]], ptr [[DST:%.*]], align 4
; POW2-ONLY-NEXT:    [[DST_2:%.*]] = getelementptr float, ptr [[DST]], i32 2
; POW2-ONLY-NEXT:    store float [[FADD_2]], ptr [[DST_2]], align 4
; POW2-ONLY-NEXT:    ret void
;
entry:
  %gep.src.0 = getelementptr inbounds float, ptr %src, i32 0
  %l.src.0 = load float, ptr %gep.src.0, align 4
  %fadd.0 = fadd float %l.src.0, 10.0

  %gep.src.1 = getelementptr inbounds float, ptr %src, i32 1
  %l.src.1 = load float, ptr %gep.src.1, align 4
  %fadd.1 = fadd float %l.src.1, 10.0

  %gep.src.2 = getelementptr inbounds float, ptr %src, i32 2
  %l.src.2 = load float, ptr %gep.src.2, align 4
  %fadd.2 = fadd float %l.src.2, 10.0

  store float %fadd.0, ptr %dst

  %dst.1 = getelementptr float, ptr %dst, i32 1
  store float %fadd.1, ptr %dst.1

  %dst.2 = getelementptr float, ptr %dst, i32 2
  store float %fadd.2, ptr %dst.2

  ret void
}

; Three scalar phis (entry values 1, 2, 3) feeding three consecutive i32
; stores. The second incoming block has no predecessors.
define void @phi_store3(ptr %dst) {
; NON-POW2-LABEL: @phi_store3(
; NON-POW2-NEXT:  entry:
; NON-POW2-NEXT:    br label [[EXIT:%.*]]
; NON-POW2:       invoke.cont8.loopexit:
; NON-POW2-NEXT:    br label [[EXIT]]
; NON-POW2:       exit:
; NON-POW2-NEXT:    [[TMP0:%.*]] = phi <3 x i32> [ <i32 1, i32 2, i32 3>, [[ENTRY:%.*]] ], [ poison, [[INVOKE_CONT8_LOOPEXIT:%.*]] ]
; NON-POW2-NEXT:    store <3 x i32> [[TMP0]], ptr [[DST:%.*]], align 4
; NON-POW2-NEXT:    ret void
;
; POW2-ONLY-LABEL: @phi_store3(
; POW2-ONLY-NEXT:  entry:
; POW2-ONLY-NEXT:    br label [[EXIT:%.*]]
; POW2-ONLY:       invoke.cont8.loopexit:
; POW2-ONLY-NEXT:    br label [[EXIT]]
; POW2-ONLY:       exit:
; POW2-ONLY-NEXT:    [[P_2:%.*]] = phi i32 [ 3, [[ENTRY:%.*]] ], [ 0, [[INVOKE_CONT8_LOOPEXIT:%.*]] ]
; POW2-ONLY-NEXT:    [[TMP0:%.*]] = phi <2 x i32> [ <i32 1, i32 2>, [[ENTRY]] ], [ poison, [[INVOKE_CONT8_LOOPEXIT]] ]
; POW2-ONLY-NEXT:    [[DST_2:%.*]] = getelementptr i32, ptr [[DST:%.*]], i32 2
; POW2-ONLY-NEXT:    store <2 x i32> [[TMP0]], ptr [[DST]], align 4
; POW2-ONLY-NEXT:    store i32 [[P_2]], ptr [[DST_2]], align 4
; POW2-ONLY-NEXT:    ret void
;
entry:
  br label %exit

invoke.cont8.loopexit:                            ; No predecessors!
  br label %exit

exit:
  %p.0 = phi i32 [ 1, %entry ], [ 0, %invoke.cont8.loopexit ]
  %p.1 = phi i32 [ 2, %entry ], [ 0, %invoke.cont8.loopexit ]
  %p.2 = phi i32 [ 3, %entry ], [ 0, %invoke.cont8.loopexit ]

  %dst.1 = getelementptr i32, ptr %dst, i32 1
  %dst.2 = getelementptr i32, ptr %dst, i32 2

  store i32 %p.0, ptr %dst, align 4
  store i32 %p.1, ptr %dst.1, align 4
  store i32 %p.2, ptr %dst.2, align 4
  ret void
}

; Three consecutive i32 stores whose values come from one add and two subs
; of constant zeros.
define void @store_try_reorder(ptr %dst) {
; NON-POW2-LABEL: @store_try_reorder(
; NON-POW2-NEXT:  entry:
; NON-POW2-NEXT:    store <3 x i32> zeroinitializer, ptr [[DST:%.*]], align 4
; NON-POW2-NEXT:    ret void
;
; POW2-ONLY-LABEL: @store_try_reorder(
; POW2-ONLY-NEXT:  entry:
; POW2-ONLY-NEXT:    [[ADD:%.*]] = add i32 0, 0
; POW2-ONLY-NEXT:    store i32 [[ADD]], ptr [[DST:%.*]], align 4
; POW2-ONLY-NEXT:    [[ARRAYIDX_I1887:%.*]] = getelementptr i32, ptr [[DST]], i64 1
; POW2-ONLY-NEXT:    store <2 x i32> zeroinitializer, ptr [[ARRAYIDX_I1887]], align 4
; POW2-ONLY-NEXT:    ret void
;
entry:
  %add = add i32 0, 0
  store i32 %add, ptr %dst, align 4
  %add207 = sub i32 0, 0
  %arrayidx.i1887 = getelementptr i32, ptr %dst, i64 1
  store i32 %add207, ptr %arrayidx.i1887, align 4
  %add216 = sub i32 0, 0
  %arrayidx.i1891 = getelementptr i32, ptr %dst, i64 2
  store i32 %add216, ptr %arrayidx.i1891, align 4
  ret void
}

; The same float argument goes through fpext -> fmuladd -> fptrunc three
; times, and the results are stored to three consecutive float slots.
define void @vec3_fpext_cost(ptr %Colour, float %0) {
; NON-POW2-LABEL: @vec3_fpext_cost(
; NON-POW2-NEXT:  entry:
; NON-POW2-NEXT:    [[TMP1:%.*]] = insertelement <3 x float> poison, float [[TMP0:%.*]], i32 0
; NON-POW2-NEXT:    [[TMP2:%.*]] = shufflevector <3 x float> [[TMP1]], <3 x float> poison, <3 x i32> zeroinitializer
; NON-POW2-NEXT:    [[TMP3:%.*]] = fpext <3 x float> [[TMP2]] to <3 x double>
; NON-POW2-NEXT:    [[TMP4:%.*]] = call <3 x double> @llvm.fmuladd.v3f64(<3 x double> [[TMP3]], <3 x double> zeroinitializer, <3 x double> zeroinitializer)
; NON-POW2-NEXT:    [[TMP5:%.*]] = fptrunc <3 x double> [[TMP4]] to <3 x float>
; NON-POW2-NEXT:    store <3 x float> [[TMP5]], ptr [[COLOUR:%.*]], align 4
; NON-POW2-NEXT:    ret void
;
; POW2-ONLY-LABEL: @vec3_fpext_cost(
; POW2-ONLY-NEXT:  entry:
; POW2-ONLY-NEXT:    [[ARRAYIDX80:%.*]] = getelementptr float, ptr [[COLOUR:%.*]], i64 2
; POW2-ONLY-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> poison, float [[TMP0:%.*]], i32 0
; POW2-ONLY-NEXT:    [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <2 x i32> zeroinitializer
; POW2-ONLY-NEXT:    [[TMP3:%.*]] = fpext <2 x float> [[TMP2]] to <2 x double>
; POW2-ONLY-NEXT:    [[TMP4:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP3]], <2 x double> zeroinitializer, <2 x double> zeroinitializer)
; POW2-ONLY-NEXT:    [[TMP5:%.*]] = fptrunc <2 x double> [[TMP4]] to <2 x float>
; POW2-ONLY-NEXT:    store <2 x float> [[TMP5]], ptr [[COLOUR]], align 4
; POW2-ONLY-NEXT:    [[CONV78:%.*]] = fpext float [[TMP0]] to double
; POW2-ONLY-NEXT:    [[TMP6:%.*]] = call double @llvm.fmuladd.f64(double [[CONV78]], double 0.000000e+00, double 0.000000e+00)
; POW2-ONLY-NEXT:    [[CONV82:%.*]] = fptrunc double [[TMP6]] to float
; POW2-ONLY-NEXT:    store float [[CONV82]], ptr [[ARRAYIDX80]], align 4
; POW2-ONLY-NEXT:    ret void
;
entry:
  %arrayidx72 = getelementptr float, ptr %Colour, i64 1
  %arrayidx80 = getelementptr float, ptr %Colour, i64 2
  %conv62 = fpext float %0 to double
  %1 = call double @llvm.fmuladd.f64(double %conv62, double 0.000000e+00, double 0.000000e+00)
  %conv66 = fptrunc double %1 to float
  store float %conv66, ptr %Colour, align 4
  %conv70 = fpext float %0 to double
  %2 = call double @llvm.fmuladd.f64(double %conv70, double 0.000000e+00, double 0.000000e+00)
  %conv74 = fptrunc double %2 to float
  store float %conv74, ptr %arrayidx72, align 4
  %conv78 = fpext float %0 to double
  %3 = call double @llvm.fmuladd.f64(double %conv78, double 0.000000e+00, double 0.000000e+00)
  %conv82 = fptrunc double %3 to float
  store float %conv82, ptr %arrayidx80, align 4
  ret void
}

; A single fptrunc result stored to three consecutive float slots. The
; expected output is the same scalar code under both RUN configurations.
define void @fpext_scatter(ptr %dst, double %conv) {
; CHECK-LABEL: @fpext_scatter(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[CONV25:%.*]] = fptrunc double [[CONV:%.*]] to float
; CHECK-NEXT:    [[LENGTHS:%.*]] = getelementptr float, ptr [[DST:%.*]], i64 0
; CHECK-NEXT:    store float [[CONV25]], ptr [[LENGTHS]], align 4
; CHECK-NEXT:    [[ARRAYIDX32:%.*]] = getelementptr float, ptr [[DST]], i64 1
; CHECK-NEXT:    store float [[CONV25]], ptr [[ARRAYIDX32]], align 4
; CHECK-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr float, ptr [[DST]], i64 2
; CHECK-NEXT:    store float [[CONV25]], ptr [[ARRAYIDX37]], align 4
; CHECK-NEXT:    ret void
;
entry:
  %conv25 = fptrunc double %conv to float
  %Lengths = getelementptr float, ptr %dst, i64 0
  store float %conv25, ptr %Lengths, align 4
  %arrayidx32 = getelementptr float, ptr %dst, i64 1
  store float %conv25, ptr %arrayidx32, align 4
  %arrayidx37 = getelementptr float, ptr %dst, i64 2
  store float %conv25, ptr %arrayidx37, align 4
  ret void
}

; Sum of three consecutive i32 loads. The expected output is the same scalar
; code under both RUN configurations.
define i32 @reduce_add(ptr %src) {
; CHECK-LABEL: @reduce_add(
; CHECK-NEXT:    [[GEP_SRC_0:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 0
; CHECK-NEXT:    [[L_SRC_0:%.*]] = load i32, ptr [[GEP_SRC_0]], align 4
; CHECK-NEXT:    [[GEP_SRC_1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 1
; CHECK-NEXT:    [[L_SRC_1:%.*]] = load i32, ptr [[GEP_SRC_1]], align 4
; CHECK-NEXT:    [[GEP_SRC_2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 2
; CHECK-NEXT:    [[L_SRC_2:%.*]] = load i32, ptr [[GEP_SRC_2]], align 4
; CHECK-NEXT:    [[ADD_0:%.*]] = add i32 [[L_SRC_0]], [[L_SRC_1]]
; CHECK-NEXT:    [[ADD_1:%.*]] = add i32 [[ADD_0]], [[L_SRC_2]]
; CHECK-NEXT:    ret i32 [[ADD_1]]
;
  %gep.src.0 = getelementptr inbounds i32, ptr %src, i32 0
  %l.src.0 = load i32, ptr %gep.src.0, align 4
  %gep.src.1 = getelementptr inbounds i32, ptr %src, i32 1
  %l.src.1 = load i32, ptr %gep.src.1, align 4
  %gep.src.2 = getelementptr inbounds i32, ptr %src, i32 2
  %l.src.2 = load i32, ptr %gep.src.2, align 4

  %add.0 = add i32 %l.src.0, %l.src.1
  %add.1 = add i32 %add.0, %l.src.2
  ret i32 %add.1
}

; Fast-math sum of three consecutive float loads.
define float @reduce_fadd(ptr %src) {
; NON-POW2-LABEL: @reduce_fadd(
; NON-POW2-NEXT:    [[GEP_SRC_0:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i32 0
; NON-POW2-NEXT:    [[TMP1:%.*]] = load <3 x float>, ptr [[GEP_SRC_0]], align 4
; NON-POW2-NEXT:    [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP1]])
; NON-POW2-NEXT:    ret float [[TMP2]]
;
; POW2-ONLY-LABEL: @reduce_fadd(
; POW2-ONLY-NEXT:    [[GEP_SRC_0:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i32 0
; POW2-ONLY-NEXT:    [[L_SRC_0:%.*]] = load float, ptr [[GEP_SRC_0]], align 4
; POW2-ONLY-NEXT:    [[GEP_SRC_1:%.*]] = getelementptr inbounds float, ptr [[SRC]], i32 1
; POW2-ONLY-NEXT:    [[L_SRC_1:%.*]] = load float, ptr [[GEP_SRC_1]], align 4
; POW2-ONLY-NEXT:    [[GEP_SRC_2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i32 2
; POW2-ONLY-NEXT:    [[L_SRC_2:%.*]] = load float, ptr [[GEP_SRC_2]], align 4
; POW2-ONLY-NEXT:    [[ADD_0:%.*]] = fadd fast float [[L_SRC_0]], [[L_SRC_1]]
; POW2-ONLY-NEXT:    [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[L_SRC_2]]
; POW2-ONLY-NEXT:    ret float [[ADD_1]]
;
  %gep.src.0 = getelementptr inbounds float, ptr %src, i32 0
  %l.src.0 = load float, ptr %gep.src.0, align 4
  %gep.src.1 = getelementptr inbounds float, ptr %src, i32 1
  %l.src.1 = load float, ptr %gep.src.1, align 4
  %gep.src.2 = getelementptr inbounds float, ptr %src, i32 2
  %l.src.2 = load float, ptr %gep.src.2, align 4

  %add.0 = fadd fast float %l.src.0, %l.src.1
  %add.1 = fadd fast float %add.0, %l.src.2
  ret float %add.1
}

; Sum of three consecutive i32 loads, each multiplied by the constant 10
; before the additions.
define i32 @reduce_add_after_mul(ptr %src) {
; NON-POW2-LABEL: @reduce_add_after_mul(
; NON-POW2-NEXT:    [[GEP_SRC_0:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 0
; NON-POW2-NEXT:    [[TMP1:%.*]] = load <3 x i32>, ptr [[GEP_SRC_0]], align 4
; NON-POW2-NEXT:    [[TMP2:%.*]] = mul nsw <3 x i32> [[TMP1]], splat (i32 10)
; NON-POW2-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v3i32(<3 x i32> [[TMP2]])
; NON-POW2-NEXT:    ret i32 [[TMP3]]
;
; POW2-ONLY-LABEL: @reduce_add_after_mul(
; POW2-ONLY-NEXT:    [[GEP_SRC_0:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 0
; POW2-ONLY-NEXT:    [[L_SRC_0:%.*]] = load i32, ptr [[GEP_SRC_0]], align 4
; POW2-ONLY-NEXT:    [[GEP_SRC_1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 1
; POW2-ONLY-NEXT:    [[L_SRC_1:%.*]] = load i32, ptr [[GEP_SRC_1]], align 4
; POW2-ONLY-NEXT:    [[GEP_SRC_2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 2
; POW2-ONLY-NEXT:    [[L_SRC_2:%.*]] = load i32, ptr [[GEP_SRC_2]], align 4
; POW2-ONLY-NEXT:    [[MUL_0:%.*]] = mul nsw i32 [[L_SRC_0]], 10
; POW2-ONLY-NEXT:    [[MUL_1:%.*]] = mul nsw i32 [[L_SRC_1]], 10
; POW2-ONLY-NEXT:    [[MUL_2:%.*]] = mul nsw i32 [[L_SRC_2]], 10
; POW2-ONLY-NEXT:    [[ADD_0:%.*]] = add i32 [[MUL_0]], [[MUL_1]]
; POW2-ONLY-NEXT:    [[ADD_1:%.*]] = add i32 [[ADD_0]], [[MUL_2]]
; POW2-ONLY-NEXT:    ret i32 [[ADD_1]]
;
  %gep.src.0 = getelementptr inbounds i32, ptr %src, i32 0
  %l.src.0 = load i32, ptr %gep.src.0, align 4
  %gep.src.1 = getelementptr inbounds i32, ptr %src, i32 1
  %l.src.1 = load i32, ptr %gep.src.1, align 4
  %gep.src.2 = getelementptr inbounds i32, ptr %src, i32 2
  %l.src.2 = load i32, ptr %gep.src.2, align 4

  %mul.0 = mul nsw i32 %l.src.0, 10
  %mul.1 = mul nsw i32 %l.src.1, 10
  %mul.2 = mul nsw i32 %l.src.2, 10

  %add.0 = add i32 %mul.0, %mul.1
  %add.1 = add i32 %add.0, %mul.2
  ret i32 %add.1
}

; Three-element i32 dot product: element-wise mul nsw of two sources followed
; by a chain of adds.
define i32 @dot_product_i32(ptr %a, ptr %b) {
; NON-POW2-LABEL: @dot_product_i32(
; NON-POW2-NEXT:    [[GEP_A_0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 0
; NON-POW2-NEXT:    [[GEP_B_0:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 0
; NON-POW2-NEXT:    [[TMP1:%.*]] = load <3 x i32>, ptr [[GEP_A_0]], align 4
; NON-POW2-NEXT:    [[TMP2:%.*]] = load <3 x i32>, ptr [[GEP_B_0]], align 4
; NON-POW2-NEXT:    [[TMP3:%.*]] = mul nsw <3 x i32> [[TMP1]], [[TMP2]]
; NON-POW2-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v3i32(<3 x i32> [[TMP3]])
; NON-POW2-NEXT:    ret i32 [[TMP4]]
;
; POW2-ONLY-LABEL: @dot_product_i32(
; POW2-ONLY-NEXT:    [[GEP_A_0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 0
; POW2-ONLY-NEXT:    [[L_A_0:%.*]] = load i32, ptr [[GEP_A_0]], align 4
; POW2-ONLY-NEXT:    [[GEP_A_1:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 1
; POW2-ONLY-NEXT:    [[L_A_1:%.*]] = load i32, ptr [[GEP_A_1]], align 4
; POW2-ONLY-NEXT:    [[GEP_A_2:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 2
; POW2-ONLY-NEXT:    [[L_A_2:%.*]] = load i32, ptr [[GEP_A_2]], align 4
; POW2-ONLY-NEXT:    [[GEP_B_0:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 0
; POW2-ONLY-NEXT:    [[L_B_0:%.*]] = load i32, ptr [[GEP_B_0]], align 4
; POW2-ONLY-NEXT:    [[GEP_B_1:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 1
; POW2-ONLY-NEXT:    [[L_B_1:%.*]] = load i32, ptr [[GEP_B_1]], align 4
; POW2-ONLY-NEXT:    [[GEP_B_2:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 2
; POW2-ONLY-NEXT:    [[L_B_2:%.*]] = load i32, ptr [[GEP_B_2]], align 4
; POW2-ONLY-NEXT:    [[MUL_0:%.*]] = mul nsw i32 [[L_A_0]], [[L_B_0]]
; POW2-ONLY-NEXT:    [[MUL_1:%.*]] = mul nsw i32 [[L_A_1]], [[L_B_1]]
; POW2-ONLY-NEXT:    [[MUL_2:%.*]] = mul nsw i32 [[L_A_2]], [[L_B_2]]
; POW2-ONLY-NEXT:    [[ADD_0:%.*]] = add i32 [[MUL_0]], [[MUL_1]]
; POW2-ONLY-NEXT:    [[ADD_1:%.*]] = add i32 [[ADD_0]], [[MUL_2]]
; POW2-ONLY-NEXT:    ret i32 [[ADD_1]]
;
  %gep.a.0 = getelementptr inbounds i32, ptr %a, i32 0
  %l.a.0 = load i32, ptr %gep.a.0, align 4
  %gep.a.1 = getelementptr inbounds i32, ptr %a, i32 1
  %l.a.1 = load i32, ptr %gep.a.1, align 4
  %gep.a.2 = getelementptr inbounds i32, ptr %a, i32 2
  %l.a.2 = load i32, ptr %gep.a.2, align 4

  %gep.b.0 = getelementptr inbounds i32, ptr %b, i32 0
  %l.b.0 = load i32, ptr %gep.b.0, align 4
  %gep.b.1 = getelementptr inbounds i32, ptr %b, i32 1
  %l.b.1 = load i32, ptr %gep.b.1, align 4
  %gep.b.2 = getelementptr inbounds i32, ptr %b, i32 2
  %l.b.2 = load i32, ptr %gep.b.2, align 4

  %mul.0 = mul nsw i32 %l.a.0, %l.b.0
  %mul.1 = mul nsw i32 %l.a.1, %l.b.1
  %mul.2 = mul nsw i32 %l.a.2, %l.b.2

  %add.0 = add i32 %mul.0, %mul.1
  %add.1 = add i32 %add.0, %mul.2
  ret i32 %add.1
}

; Same as above, except the reduction order has been perturbed.  This
; is checking for our ability to reorder.
define i32 @dot_product_i32_reorder(ptr %a, ptr %b) {
; NON-POW2-LABEL: @dot_product_i32_reorder(
; NON-POW2-NEXT:    [[GEP_A_0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 0
; NON-POW2-NEXT:    [[GEP_B_0:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 0
; NON-POW2-NEXT:    [[TMP1:%.*]] = load <3 x i32>, ptr [[GEP_A_0]], align 4
; NON-POW2-NEXT:    [[TMP2:%.*]] = load <3 x i32>, ptr [[GEP_B_0]], align 4
; NON-POW2-NEXT:    [[TMP3:%.*]] = mul nsw <3 x i32> [[TMP1]], [[TMP2]]
; NON-POW2-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v3i32(<3 x i32> [[TMP3]])
; NON-POW2-NEXT:    ret i32 [[TMP4]]
;
; POW2-ONLY-LABEL: @dot_product_i32_reorder(
; POW2-ONLY-NEXT:    [[GEP_A_0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 0
; POW2-ONLY-NEXT:    [[L_A_0:%.*]] = load i32, ptr [[GEP_A_0]], align 4
; POW2-ONLY-NEXT:    [[GEP_A_1:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 1
; POW2-ONLY-NEXT:    [[L_A_1:%.*]] = load i32, ptr [[GEP_A_1]], align 4
; POW2-ONLY-NEXT:    [[GEP_A_2:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 2
; POW2-ONLY-NEXT:    [[L_A_2:%.*]] = load i32, ptr [[GEP_A_2]], align 4
; POW2-ONLY-NEXT:    [[GEP_B_0:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 0
; POW2-ONLY-NEXT:    [[L_B_0:%.*]] = load i32, ptr [[GEP_B_0]], align 4
; POW2-ONLY-NEXT:    [[GEP_B_1:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 1
; POW2-ONLY-NEXT:    [[L_B_1:%.*]] = load i32, ptr [[GEP_B_1]], align 4
; POW2-ONLY-NEXT:    [[GEP_B_2:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 2
; POW2-ONLY-NEXT:    [[L_B_2:%.*]] = load i32, ptr [[GEP_B_2]], align 4
; POW2-ONLY-NEXT:    [[MUL_0:%.*]] = mul nsw i32 [[L_A_0]], [[L_B_0]]
; POW2-ONLY-NEXT:    [[MUL_1:%.*]] = mul nsw i32 [[L_A_1]], [[L_B_1]]
; POW2-ONLY-NEXT:    [[MUL_2:%.*]] = mul nsw i32 [[L_A_2]], [[L_B_2]]
; POW2-ONLY-NEXT:    [[ADD_0:%.*]] = add i32 [[MUL_1]], [[MUL_0]]
; POW2-ONLY-NEXT:    [[ADD_1:%.*]] = add i32 [[ADD_0]], [[MUL_2]]
; POW2-ONLY-NEXT:    ret i32 [[ADD_1]]
;
  %gep.a.0 = getelementptr inbounds i32, ptr %a, i32 0
  %l.a.0 = load i32, ptr %gep.a.0, align 4
  %gep.a.1 = getelementptr inbounds i32, ptr %a, i32 1
  %l.a.1 = load i32, ptr %gep.a.1, align 4
  %gep.a.2 = getelementptr inbounds i32, ptr %a, i32 2
  %l.a.2 = load i32, ptr %gep.a.2, align 4

  %gep.b.0 = getelementptr inbounds i32, ptr %b, i32 0
  %l.b.0 = load i32, ptr %gep.b.0, align 4
  %gep.b.1 = getelementptr inbounds i32, ptr %b, i32 1
  %l.b.1 = load i32, ptr %gep.b.1, align 4
  %gep.b.2 = getelementptr inbounds i32, ptr %b, i32 2
  %l.b.2 = load i32, ptr %gep.b.2, align 4

  %mul.0 = mul nsw i32 %l.a.0, %l.b.0
  %mul.1 = mul nsw i32 %l.a.1, %l.b.1
  %mul.2 = mul nsw i32 %l.a.2, %l.b.2

  %add.0 = add i32 %mul.1, %mul.0
  %add.1 = add i32 %add.0, %mul.2
  ret i32 %add.1
}

; Three-element float dot product using fast-math fmul and fadd.
define float @dot_product_fp32(ptr %a, ptr %b) {
; NON-POW2-LABEL: @dot_product_fp32(
; NON-POW2-NEXT:    [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0
; NON-POW2-NEXT:    [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0
; NON-POW2-NEXT:    [[TMP1:%.*]] = load <3 x float>, ptr [[GEP_A_0]], align 4
; NON-POW2-NEXT:    [[TMP2:%.*]] = load <3 x float>, ptr [[GEP_B_0]], align 4
; NON-POW2-NEXT:    [[TMP3:%.*]] = fmul fast <3 x float> [[TMP1]], [[TMP2]]
; NON-POW2-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP3]])
; NON-POW2-NEXT:    ret float [[TMP4]]
;
; POW2-ONLY-LABEL: @dot_product_fp32(
; POW2-ONLY-NEXT:    [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0
; POW2-ONLY-NEXT:    [[GEP_A_2:%.*]] = getelementptr inbounds float, ptr [[A]], i32 2
; POW2-ONLY-NEXT:    [[L_A_2:%.*]] = load float, ptr [[GEP_A_2]], align 4
; POW2-ONLY-NEXT:    [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0
; POW2-ONLY-NEXT:    [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i32 2
; POW2-ONLY-NEXT:    [[L_B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4
; POW2-ONLY-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_A_0]], align 4
; POW2-ONLY-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[GEP_B_0]], align 4
; POW2-ONLY-NEXT:    [[TMP3:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP2]]
; POW2-ONLY-NEXT:    [[MUL_2:%.*]] = fmul fast float [[L_A_2]], [[L_B_2]]
; POW2-ONLY-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
; POW2-ONLY-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
; POW2-ONLY-NEXT:    [[ADD_0:%.*]] = fadd fast float [[TMP4]], [[TMP5]]
; POW2-ONLY-NEXT:    [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]]
; POW2-ONLY-NEXT:    ret float [[ADD_1]]
;
  %gep.a.0 = getelementptr inbounds float, ptr %a, i32 0
  %l.a.0 = load float, ptr %gep.a.0, align 4
  %gep.a.1 = getelementptr inbounds float, ptr %a, i32 1
  %l.a.1 = load float, ptr %gep.a.1, align 4
  %gep.a.2 = getelementptr inbounds float, ptr %a, i32 2
  %l.a.2 = load float, ptr %gep.a.2, align 4

  %gep.b.0 = getelementptr inbounds float, ptr %b, i32 0
  %l.b.0 = load float, ptr %gep.b.0, align 4
  %gep.b.1 = getelementptr inbounds float, ptr %b, i32 1
  %l.b.1 = load float, ptr %gep.b.1, align 4
  %gep.b.2 = getelementptr inbounds float, ptr %b, i32 2
  %l.b.2 = load float, ptr %gep.b.2, align 4

  %mul.0 = fmul fast float %l.a.0, %l.b.0
  %mul.1 = fmul fast float %l.a.1, %l.b.1
  %mul.2 = fmul fast float %l.a.2, %l.b.2

  %add.0 = fadd fast float %mul.0, %mul.1
  %add.1 = fadd fast float %add.0, %mul.2
  ret float %add.1
}

; Same as above, except the reduction order has been perturbed.  This
; is checking for our ability to reorder.
; Three-element float dot product in which %add.0 is formed as
; %mul.1 + %mul.0 (operands swapped). Expected results: with
; non-power-of-2 vectorization enabled, the loads, the fmul and the sum
; become <3 x float> operations ending in a vector.reduce.fadd call; in the
; power-of-2-only run, elements 0 and 1 use <2 x float> operations while
; element 2 stays scalar and the sum is done with scalar fadds.
define float @dot_product_fp32_reorder(ptr %a, ptr %b) {
; NON-POW2-LABEL: @dot_product_fp32_reorder(
; NON-POW2-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0
; NON-POW2-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0
; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x float>, ptr [[GEP_A_0]], align 4
; NON-POW2-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[GEP_B_0]], align 4
; NON-POW2-NEXT: [[TMP3:%.*]] = fmul fast <3 x float> [[TMP1]], [[TMP2]]
; NON-POW2-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP3]])
; NON-POW2-NEXT: ret float [[TMP4]]
;
; POW2-ONLY-LABEL: @dot_product_fp32_reorder(
; POW2-ONLY-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0
; POW2-ONLY-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds float, ptr [[A]], i32 2
; POW2-ONLY-NEXT: [[L_A_2:%.*]] = load float, ptr [[GEP_A_2]], align 4
; POW2-ONLY-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0
; POW2-ONLY-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i32 2
; POW2-ONLY-NEXT: [[L_B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4
; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_A_0]], align 4
; POW2-ONLY-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[GEP_B_0]], align 4
; POW2-ONLY-NEXT: [[TMP3:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP2]]
; POW2-ONLY-NEXT: [[MUL_2:%.*]] = fmul fast float [[L_A_2]], [[L_B_2]]
; POW2-ONLY-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
; POW2-ONLY-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
; POW2-ONLY-NEXT: [[ADD_0:%.*]] = fadd fast float [[TMP5]], [[TMP4]]
; POW2-ONLY-NEXT: [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]]
; POW2-ONLY-NEXT: ret float [[ADD_1]]
;
  %gep.a.0 = getelementptr inbounds float, ptr %a, i32 0
  %l.a.0 = load float, ptr %gep.a.0, align 4
  %gep.a.1 = getelementptr inbounds float, ptr %a, i32 1
  %l.a.1 = load float, ptr %gep.a.1, align 4
  %gep.a.2 = getelementptr inbounds float, ptr %a, i32 2
  %l.a.2 = load float, ptr %gep.a.2, align 4
  %gep.b.0 = getelementptr inbounds float, ptr %b, i32 0
  %l.b.0 = load float, ptr %gep.b.0, align 4
  %gep.b.1 = getelementptr inbounds float, ptr %b, i32 1
  %l.b.1 = load float, ptr %gep.b.1, align 4
  %gep.b.2 = getelementptr inbounds float, ptr %b, i32 2
  %l.b.2 = load float, ptr %gep.b.2, align 4
  %mul.0 = fmul fast float %l.a.0, %l.b.0
  %mul.1 = fmul fast float %l.a.1, %l.b.1
  %mul.2 = fmul fast float %l.a.2, %l.b.2
  %add.0 = fadd fast float %mul.1, %mul.0
  %add.1 = fadd fast float %add.0, %mul.2
  ret float %add.1
}

; Three-element dot product over double: loads a[0..2] and b[0..2],
; multiplies them pairwise with fast-math fmul and sums the products as
; (%mul.0 + %mul.1) + %mul.2. Expected results: <3 x double> loads, fmul and
; a vector.reduce.fadd call when non-power-of-2 vectorization is enabled;
; otherwise <2 x double> operations for elements 0 and 1 plus scalar code
; for element 2.
; NOTE(review): the double loads are declared with align 4 rather than 8;
; the expected output keeps align 4 as well. Presumably carried over from a
; float variant of this test; confirm whether that is intentional.
define double @dot_product_fp64(ptr %a, ptr %b) {
; NON-POW2-LABEL: @dot_product_fp64(
; NON-POW2-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i32 0
; NON-POW2-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i32 0
; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x double>, ptr [[GEP_A_0]], align 4
; NON-POW2-NEXT: [[TMP2:%.*]] = load <3 x double>, ptr [[GEP_B_0]], align 4
; NON-POW2-NEXT: [[TMP3:%.*]] = fmul fast <3 x double> [[TMP1]], [[TMP2]]
; NON-POW2-NEXT: [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v3f64(double 0.000000e+00, <3 x double> [[TMP3]])
; NON-POW2-NEXT: ret double [[TMP4]]
;
; POW2-ONLY-LABEL: @dot_product_fp64(
; POW2-ONLY-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i32 0
; POW2-ONLY-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds double, ptr [[A]], i32 2
; POW2-ONLY-NEXT: [[L_A_2:%.*]] = load double, ptr [[GEP_A_2]], align 4
; POW2-ONLY-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i32 0
; POW2-ONLY-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds double, ptr [[B]], i32 2
; POW2-ONLY-NEXT: [[L_B_2:%.*]] = load double, ptr [[GEP_B_2]], align 4
; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[GEP_A_0]], align 4
; POW2-ONLY-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[GEP_B_0]], align 4
; POW2-ONLY-NEXT: [[TMP3:%.*]] = fmul fast <2 x double> [[TMP1]], [[TMP2]]
; POW2-ONLY-NEXT: [[MUL_2:%.*]] = fmul fast double [[L_A_2]], [[L_B_2]]
; POW2-ONLY-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
; POW2-ONLY-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
; POW2-ONLY-NEXT: [[ADD_0:%.*]] = fadd fast double [[TMP4]], [[TMP5]]
; POW2-ONLY-NEXT: [[ADD_1:%.*]] = fadd fast double [[ADD_0]], [[MUL_2]]
; POW2-ONLY-NEXT: ret double [[ADD_1]]
;
  %gep.a.0 = getelementptr inbounds double, ptr %a, i32 0
  %l.a.0 = load double, ptr %gep.a.0, align 4
  %gep.a.1 = getelementptr inbounds double, ptr %a, i32 1
  %l.a.1 = load double, ptr %gep.a.1, align 4
  %gep.a.2 = getelementptr inbounds double, ptr %a, i32 2
  %l.a.2 = load double, ptr %gep.a.2, align 4
  %gep.b.0 = getelementptr inbounds double, ptr %b, i32 0
  %l.b.0 = load double, ptr %gep.b.0, align 4
  %gep.b.1 = getelementptr inbounds double, ptr %b, i32 1
  %l.b.1 = load double, ptr %gep.b.1, align 4
  %gep.b.2 = getelementptr inbounds double, ptr %b, i32 2
  %l.b.2 = load double, ptr %gep.b.2, align 4
  %mul.0 = fmul fast double %l.a.0, %l.b.0
  %mul.1 = fmul fast double %l.a.1, %l.b.1
  %mul.2 = fmul fast double %l.a.2, %l.b.2
  %add.0 = fadd fast double %mul.0, %mul.1
  %add.1 = fadd fast double %add.0, %mul.2
  ret double %add.1
}

;; Covers a case where SLP would previously crash due to a
;; missing bailout in TryToFindDuplicates for the case
;; where a VL=3 list was vectorized directly (without
;; a root instruction such as a store or reduce).
;; The expected output is shared by both test configurations and mirrors
;; the scalar input instruction for instruction, i.e. nothing is vectorized.
define double @no_root_reshuffle(ptr %ptr) {
; CHECK-LABEL: @no_root_reshuffle(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[PTR:%.*]], align 8
; CHECK-NEXT: [[MUL:%.*]] = fmul fast double [[TMP0]], [[TMP0]]
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 8
; CHECK-NEXT: [[TMP1:%.*]] = load double, ptr [[ARRAYIDX2]], align 8
; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 16
; CHECK-NEXT: [[TMP2:%.*]] = load double, ptr [[ARRAYIDX3]], align 8
; CHECK-NEXT: [[TMP3:%.*]] = fmul fast double [[TMP2]], [[TMP2]]
; CHECK-NEXT: [[MUL6:%.*]] = fmul fast double [[TMP3]], [[TMP1]]
; CHECK-NEXT: [[ADD:%.*]] = fadd fast double [[MUL6]], [[MUL]]
; CHECK-NEXT: ret double [[ADD]]
;
entry:
  %0 = load double, ptr %ptr, align 8
  %mul = fmul fast double %0, %0
  %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 8
  %1 = load double, ptr %arrayidx2, align 8
  %arrayidx3 = getelementptr inbounds i8, ptr %ptr, i64 16
  %2 = load double, ptr %arrayidx3, align 8
  %3 = fmul fast double %2, %2
  %mul6 = fmul fast double %3, %1
  %add = fadd fast double %mul6, %mul
  ret double %add
}

; Sum of three scalar float arguments, each first multiplied by 10.0 with
; fast-math flags; the operands come from function arguments rather than
; loads. Expected results: with non-power-of-2 vectorization enabled, the
; arguments are gathered into a <3 x float> via insertelement, multiplied
; by a splat of 10.0 and summed with a vector.reduce.fadd call; in the
; power-of-2-only run the scalar code is left unchanged.
define float @reduce_fadd_after_fmul_of_buildvec(float %a, float %b, float %c) {
; NON-POW2-LABEL: @reduce_fadd_after_fmul_of_buildvec(
; NON-POW2-NEXT: [[TMP1:%.*]] = insertelement <3 x float> poison, float [[A:%.*]], i32 0
; NON-POW2-NEXT: [[TMP2:%.*]] = insertelement <3 x float> [[TMP1]], float [[B:%.*]], i32 1
; NON-POW2-NEXT: [[TMP3:%.*]] = insertelement <3 x float> [[TMP2]], float [[C:%.*]], i32 2
; NON-POW2-NEXT: [[TMP4:%.*]] = fmul fast <3 x float> [[TMP3]], splat (float 1.000000e+01)
; NON-POW2-NEXT: [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP4]])
; NON-POW2-NEXT: ret float [[TMP5]]
;
; POW2-ONLY-LABEL: @reduce_fadd_after_fmul_of_buildvec(
; POW2-ONLY-NEXT: [[MUL_0:%.*]] = fmul fast float [[A:%.*]], 1.000000e+01
; POW2-ONLY-NEXT: [[MUL_1:%.*]] = fmul fast float [[B:%.*]], 1.000000e+01
; POW2-ONLY-NEXT: [[MUL_2:%.*]] = fmul fast float [[C:%.*]], 1.000000e+01
; POW2-ONLY-NEXT: [[ADD_0:%.*]] = fadd fast float [[MUL_0]], [[MUL_1]]
; POW2-ONLY-NEXT: [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]]
; POW2-ONLY-NEXT: ret float [[ADD_1]]
;
  %mul.0 = fmul fast float %a, 10.0
  %mul.1 = fmul fast float %b, 10.0
  %mul.2 = fmul fast float %c, 10.0
  %add.0 = fadd fast float %mul.0, %mul.1
  %add.1 = fadd fast float %add.0, %mul.2
  ret float %add.1
}

; Declarations of the fmuladd intrinsics (presumably used by tests elsewhere
; in this file; the functions directly above do not call them).
declare float @llvm.fmuladd.f32(float, float, float)
declare double @llvm.fmuladd.f64(double, double, double)