; RUN: llc -march=x86-64 -mcpu=core-avx2 < %s | FileCheck %s
;
; Test multiple peephole-time folds in a single basic block.
; <rdar://problem/16478629>
;
; The loop body contains two independent vector loads, each of which feeds its
; own FMA intrinsic call. The patterns below require both vfmadd231ps
; instructions to take a memory operand addressed directly through the
; register holding the corresponding pointer argument, i.e. both loads must be
; folded, not just the first one.

define <8 x float> @test_peephole_multi_fold(<8 x float>* %p1, <8 x float>* %p2) {
entry:
  br label %loopbody

loopbody:
; Anchor on the function label so the instruction patterns can only match
; inside this function's output.
; CHECK-LABEL: test_peephole_multi_fold:
; The first fold reads through %p1, the second through %p2; each regex accepts
; two possible argument registers for that pointer.
; CHECK: vfmadd231ps ({{%rdi|%rcx}}),
; CHECK: vfmadd231ps ({{%rsi|%rdx}}),
  ; Two loop-carried accumulators, both starting at zero on entry.
  %vsum1 = phi <8 x float> [ %vsum1.next, %loopbody ], [ zeroinitializer, %entry ]
  %vsum2 = phi <8 x float> [ %vsum2.next, %loopbody ], [ zeroinitializer, %entry ]
  ; Loads with alignment 1: these are the two fold candidates.
  %m1 = load <8 x float>* %p1, align 1
  %m2 = load <8 x float>* %p2, align 1
  ; Each load is used exactly once, as the first operand of an FMA whose
  ; addend is the matching accumulator.
  %vsum1.next = tail call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %m1, <8 x float> zeroinitializer, <8 x float> %vsum1)
  %vsum2.next = tail call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %m2, <8 x float> zeroinitializer, <8 x float> %vsum2)
  ; Loop back while lane 0 of the first accumulator compares equal to 0.0.
  %vsum1.next.1 = extractelement <8 x float> %vsum1.next, i32 0
  %c = fcmp oeq float %vsum1.next.1, 0.0
  br i1 %c, label %loopbody, label %loopexit

loopexit:
  ; Both accumulators are live out of the loop and combined into the result.
  %r = fadd <8 x float> %vsum1.next, %vsum2.next
  ret <8 x float> %r
}

declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>)