; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512f | FileCheck %s

define void @eggs(ptr %arg, ptr %arg1, ptr %arg2, ptr %arg3, ptr %arg4, ptr %arg5, i64 %arg6, i64 %arg7, i64 %arg8, i64 %arg9, i64 %arg10, i64 %arg11, i64 %arg12, ptr %arg13, ptr %arg14) nounwind {
; CHECK-LABEL: eggs:
; CHECK:       ## %bb.0: ## %bb
; CHECK-NEXT:    pushq %r15
; CHECK-NEXT:    pushq %r14
; CHECK-NEXT:    pushq %r12
; CHECK-NEXT:    pushq %rbx
; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %r14
; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %r15
; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %r12
; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %r11
; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %r10
; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
; CHECK-NEXT:    leaq (%rbx,%r10,8), %r10
; CHECK-NEXT:    leaq (%rbx,%r11,8), %r11
; CHECK-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    xorl %ebx, %ebx
; CHECK-NEXT:    vmovupd (%r14,%r15,8), %zmm1
; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %r15
; CHECK-NEXT:    addq %r12, %r15
; CHECK-NEXT:    vmovupd (%r14,%r15,8), %zmm2
; CHECK-NEXT:    addq {{[0-9]+}}(%rsp), %r12
; CHECK-NEXT:    vmovupd (%r14,%r12,8), %zmm8
; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
; CHECK-NEXT:    vxorpd %xmm5, %xmm5, %xmm5
; CHECK-NEXT:    vxorpd %xmm6, %xmm6, %xmm6
; CHECK-NEXT:    vxorpd %xmm7, %xmm7, %xmm7
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  LBB0_1: ## %bb15
; CHECK-NEXT:    ## =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vbroadcastsd (%r11,%rbx,8), %zmm9
; CHECK-NEXT:    vfmadd231pd {{.*#+}} zmm0 = (zmm1 * zmm9) + zmm0
; CHECK-NEXT:    vfmadd231pd {{.*#+}} zmm3 = (zmm2 * zmm9) + zmm3
; CHECK-NEXT:    vfmadd231pd {{.*#+}} zmm4 = (zmm8 * zmm9) + zmm4
; CHECK-NEXT:    vbroadcastsd (%r10,%rbx,8), %zmm9
; CHECK-NEXT:    vfmadd231pd {{.*#+}} zmm5 = (zmm1 * zmm9) + zmm5
; CHECK-NEXT:    vfmadd231pd {{.*#+}} zmm6 = (zmm2 * zmm9) + zmm6
; CHECK-NEXT:    vfmadd231pd {{.*#+}} zmm7 = (zmm8 * zmm9) + zmm7
; CHECK-NEXT:    incq %rbx
; CHECK-NEXT:    cmpq %rbx, %rax
; CHECK-NEXT:    jne LBB0_1
; CHECK-NEXT:  ## %bb.2: ## %bb51
; CHECK-NEXT:    vmovapd %zmm0, (%rdi)
; CHECK-NEXT:    vmovapd %zmm3, (%rsi)
; CHECK-NEXT:    vmovapd %zmm4, (%rdx)
; CHECK-NEXT:    vmovapd %zmm5, (%rcx)
; CHECK-NEXT:    vmovapd %zmm6, (%r8)
; CHECK-NEXT:    vmovapd %zmm7, (%r9)
; CHECK-NEXT:    popq %rbx
; CHECK-NEXT:    popq %r12
; CHECK-NEXT:    popq %r14
; CHECK-NEXT:    popq %r15
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb15

bb15:                                             ; preds = %bb15, %bb
  %tmp = phi <8 x double> [ zeroinitializer, %bb ], [ %tmp38, %bb15 ]
  %tmp16 = phi <8 x double> [ zeroinitializer, %bb ], [ %tmp39, %bb15 ]
  %tmp17 = phi <8 x double> [ zeroinitializer, %bb ], [ %tmp40, %bb15 ]
  %tmp18 = phi <8 x double> [ zeroinitializer, %bb ], [ %tmp46, %bb15 ]
  %tmp19 = phi <8 x double> [ zeroinitializer, %bb ], [ %tmp47, %bb15 ]
  %tmp20 = phi <8 x double> [ zeroinitializer, %bb ], [ %tmp48, %bb15 ]
  %tmp21 = phi i64 [ 0, %bb ], [ %tmp49, %bb15 ]
  %tmp22 = getelementptr inbounds double, ptr %arg14, i64 %arg11
  %tmp24 = load <8 x double>, ptr %tmp22, align 8
  %tmp25 = add i64 %arg10, %arg6
  %tmp26 = getelementptr inbounds double, ptr %arg14, i64 %tmp25
  %tmp28 = load <8 x double>, ptr %tmp26, align 8
  %tmp29 = add i64 %arg10, %arg7
  %tmp30 = getelementptr inbounds double, ptr %arg14, i64 %tmp29
  %tmp32 = load <8 x double>, ptr %tmp30, align 8
  %tmp33 = add i64 %tmp21, %arg8
  %tmp34 = getelementptr inbounds double, ptr %arg13, i64 %tmp33
  %tmp35 = load double, ptr %tmp34, align 8
  %tmp36 = insertelement <8 x double> undef, double %tmp35, i32 0
  %tmp37 = shufflevector <8 x double> %tmp36, <8 x double> undef, <8 x i32> zeroinitializer
  %tmp38 = call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %tmp24, <8 x double> %tmp37, <8 x double> %tmp)
  %tmp39 = call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %tmp28, <8 x double> %tmp37, <8 x double> %tmp16)
  %tmp40 = call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %tmp32, <8 x double> %tmp37, <8 x double> %tmp17)
  %tmp41 = add i64 %tmp21, %arg9
  %tmp42 = getelementptr inbounds double, ptr %arg13, i64 %tmp41
  %tmp43 = load double, ptr %tmp42, align 8
  %tmp44 = insertelement <8 x double> undef, double %tmp43, i32 0
  %tmp45 = shufflevector <8 x double> %tmp44, <8 x double> undef, <8 x i32> zeroinitializer
  %tmp46 = call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %tmp24, <8 x double> %tmp45, <8 x double> %tmp18)
  %tmp47 = call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %tmp28, <8 x double> %tmp45, <8 x double> %tmp19)
  %tmp48 = call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %tmp32, <8 x double> %tmp45, <8 x double> %tmp20)
  %tmp49 = add nuw nsw i64 %tmp21, 1
  %tmp50 = icmp eq i64 %tmp49, %arg12
  br i1 %tmp50, label %bb51, label %bb15

bb51:                                             ; preds = %bb15
  store <8 x double> %tmp38, ptr %arg
  store <8 x double> %tmp39, ptr %arg1
  store <8 x double> %tmp40, ptr %arg2
  store <8 x double> %tmp46, ptr %arg3
  store <8 x double> %tmp47, ptr %arg4
  store <8 x double> %tmp48, ptr %arg5
  ret void
}

declare <8 x double> @llvm.fmuladd.v8f64(<8 x double>, <8 x double>, <8 x double>)