; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -mtriple=x86_64 -passes=slp-vectorizer -S -mcpu=skylake-avx512 | FileCheck %s

; The test represents the case with multiple vectorization possibilities
; but the most effective way to vectorize it is to match both 8-way reductions
; feeding the insertelement vector build sequence.

declare void @llvm.masked.scatter.v2f64.v2p0(<2 x double>, <2 x ptr>, i32 immarg, <2 x i1>)

define void @test(ptr nocapture readonly %arg, ptr nocapture readonly %arg1, ptr nocapture %arg2) {
; CHECK-LABEL: @test(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <8 x ptr> poison, ptr [[ARG:%.*]], i32 0
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x ptr> [[TMP0]], <8 x ptr> poison, <8 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr double, <8 x ptr> [[TMP1]], <8 x i64> <i64 1, i64 3, i64 5, i64 7, i64 9, i64 11, i64 13, i64 15>
; CHECK-NEXT:    [[GEP2_0:%.*]] = getelementptr inbounds double, ptr [[ARG1:%.*]], i64 16
; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> [[TMP2]], i32 8, <8 x i1> splat (i1 true), <8 x double> poison)
; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x double>, ptr [[GEP2_0]], align 8
; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast <8 x double> [[TMP4]], [[TMP3]]
; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x double>, ptr [[ARG1]], align 8
; CHECK-NEXT:    [[TMP7:%.*]] = fmul fast <8 x double> [[TMP6]], [[TMP3]]
; CHECK-NEXT:    [[TMP8:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> [[TMP7]])
; CHECK-NEXT:    [[TMP9:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> [[TMP5]])
; CHECK-NEXT:    [[I142:%.*]] = insertelement <2 x double> poison, double [[TMP8]], i64 0
; CHECK-NEXT:    [[I143:%.*]] = insertelement <2 x double> [[I142]], double [[TMP9]], i64 1
; CHECK-NEXT:    [[P:%.*]] = getelementptr inbounds double, ptr [[ARG2:%.*]], <2 x i64> <i64 0, i64 16>
; CHECK-NEXT:    call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> [[I143]], <2 x ptr> [[P]], i32 8, <2 x i1> splat (i1 true))
; CHECK-NEXT:    ret void
;
entry:
  ; Two fully-unrolled 8-lane fadd reductions are interleaved below.
  ; Both share the %ld1.N loads (strided gather from %arg at odd indices
  ; 1,3,5,...,15); reduction 1 multiplies them by contiguous loads from
  ; %arg1[0..7], reduction 2 by contiguous loads from %arg1[16..23].
  ; Lane 0
  %gep1.0 = getelementptr inbounds double, ptr %arg, i64 1
  %ld1.0 = load double, ptr %gep1.0, align 8
  %ld0.0 = load double, ptr %arg1, align 8
  %mul1.0 = fmul fast double %ld0.0, %ld1.0
  %gep2.0 = getelementptr inbounds double, ptr %arg1, i64 16
  %ld2.0 = load double, ptr %gep2.0, align 8
  %mul2.0 = fmul fast double %ld2.0, %ld1.0
  ; Lane 1
  %gep1.1 = getelementptr inbounds double, ptr %arg, i64 3
  %ld1.1 = load double, ptr %gep1.1, align 8
  %gep0.1 = getelementptr inbounds double, ptr %arg1, i64 1
  %ld0.1 = load double, ptr %gep0.1, align 8
  %mul1.1 = fmul fast double %ld0.1, %ld1.1
  %rdx1.0 = fadd fast double %mul1.0, %mul1.1
  %gep2.1 = getelementptr inbounds double, ptr %arg1, i64 17
  %ld2.1 = load double, ptr %gep2.1, align 8
  %mul2.1 = fmul fast double %ld2.1, %ld1.1
  %rdx2.0 = fadd fast double %mul2.0, %mul2.1
  ; Lane 2
  %gep1.2 = getelementptr inbounds double, ptr %arg, i64 5
  %ld1.2 = load double, ptr %gep1.2, align 8
  %gep0.2 = getelementptr inbounds double, ptr %arg1, i64 2
  %ld0.2 = load double, ptr %gep0.2, align 8
  %mul1.2 = fmul fast double %ld0.2, %ld1.2
  %rdx1.1 = fadd fast double %rdx1.0, %mul1.2
  %gep2.2 = getelementptr inbounds double, ptr %arg1, i64 18
  %ld2.2 = load double, ptr %gep2.2, align 8
  %mul2.2 = fmul fast double %ld2.2, %ld1.2
  %rdx2.1 = fadd fast double %rdx2.0, %mul2.2
  ; Lane 3
  %gep1.3 = getelementptr inbounds double, ptr %arg, i64 7
  %ld1.3 = load double, ptr %gep1.3, align 8
  %gep0.3 = getelementptr inbounds double, ptr %arg1, i64 3
  %ld0.3 = load double, ptr %gep0.3, align 8
  %mul1.3 = fmul fast double %ld0.3, %ld1.3
  %rdx1.2 = fadd fast double %rdx1.1, %mul1.3
  %gep2.3 = getelementptr inbounds double, ptr %arg1, i64 19
  %ld2.3 = load double, ptr %gep2.3, align 8
  %mul2.3 = fmul fast double %ld2.3, %ld1.3
  %rdx2.2 = fadd fast double %rdx2.1, %mul2.3
  ; Lane 4
  %gep1.4 = getelementptr inbounds double, ptr %arg, i64 9
  %ld1.4 = load double, ptr %gep1.4, align 8
  %gep0.4 = getelementptr inbounds double, ptr %arg1, i64 4
  %ld0.4 = load double, ptr %gep0.4, align 8
  %mul1.4 = fmul fast double %ld0.4, %ld1.4
  %rdx1.3 = fadd fast double %rdx1.2, %mul1.4
  %gep2.4 = getelementptr inbounds double, ptr %arg1, i64 20
  %ld2.4 = load double, ptr %gep2.4, align 8
  %mul2.4 = fmul fast double %ld2.4, %ld1.4
  %rdx2.3 = fadd fast double %rdx2.2, %mul2.4
  ; Lane 5
  %gep1.5 = getelementptr inbounds double, ptr %arg, i64 11
  %ld1.5 = load double, ptr %gep1.5, align 8
  %gep0.5 = getelementptr inbounds double, ptr %arg1, i64 5
  %ld0.5 = load double, ptr %gep0.5, align 8
  %mul1.5 = fmul fast double %ld0.5, %ld1.5
  %rdx1.4 = fadd fast double %rdx1.3, %mul1.5
  %gep2.5 = getelementptr inbounds double, ptr %arg1, i64 21
  %ld2.5 = load double, ptr %gep2.5, align 8
  %mul2.5 = fmul fast double %ld2.5, %ld1.5
  %rdx2.4 = fadd fast double %rdx2.3, %mul2.5
  ; Lane 6
  %gep1.6 = getelementptr inbounds double, ptr %arg, i64 13
  %ld1.6 = load double, ptr %gep1.6, align 8
  %gep0.6 = getelementptr inbounds double, ptr %arg1, i64 6
  %ld0.6 = load double, ptr %gep0.6, align 8
  %mul1.6 = fmul fast double %ld0.6, %ld1.6
  %rdx1.5 = fadd fast double %rdx1.4, %mul1.6
  %gep2.6 = getelementptr inbounds double, ptr %arg1, i64 22
  %ld2.6 = load double, ptr %gep2.6, align 8
  %mul2.6 = fmul fast double %ld2.6, %ld1.6
  %rdx2.5 = fadd fast double %rdx2.4, %mul2.6
  ; Lane 7
  %gep1.7 = getelementptr inbounds double, ptr %arg, i64 15
  %ld1.7 = load double, ptr %gep1.7, align 8
  %gep0.7 = getelementptr inbounds double, ptr %arg1, i64 7
  %ld0.7 = load double, ptr %gep0.7, align 8
  %mul1.7 = fmul fast double %ld0.7, %ld1.7
  %rdx1 = fadd fast double %rdx1.5, %mul1.7
  %gep2.7 = getelementptr inbounds double, ptr %arg1, i64 23
  %ld2.7 = load double, ptr %gep2.7, align 8
  %mul2.7 = fmul fast double %ld2.7, %ld1.7
  %rdx2 = fadd fast double %rdx2.5, %mul2.7
  ; Both reduction results feed a 2-element vector build, which is then
  ; scattered to %arg2[0] and %arg2[16].
  %i142 = insertelement <2 x double> poison, double %rdx1, i64 0
  %i143 = insertelement <2 x double> %i142, double %rdx2, i64 1
  %p = getelementptr inbounds double, ptr %arg2, <2 x i64> <i64 0, i64 16>
  call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> %i143, <2 x ptr> %p, i32 8, <2 x i1> <i1 true, i1 true>)
  ret void
}