; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=aarch64-unknown-unknown | FileCheck %s

; TODO: ldp doesn't support r+r (register + register) addressing, so we need to
; keep LSR from optimizing this pattern.
;
; @convolution is a do-while style loop: the body executes before the counter
; derived from %w is tested. Each iteration:
;   - loads four <4 x float> vectors relative to the current %src0 pointer: at
;     byte offsets %stride_xm and %stride_xp, at the pointer itself, and at the
;     next <4 x float> element;
;   - loads the same four positions relative to the current %src1 pointer;
;   - adds all eight vectors together and stores the sum through the current
;     %dst pointer;
;   - advances both source pointers by two <4 x float> elements and the
;     destination pointer by one, then decrements the counter and leaves the
;     loop once it reaches zero.
; The autogenerated assertions below expect two ldp instructions and four
; register-offset ldr instructions per loop iteration.
define void @convolution(ptr %src0, ptr %src1, i64 %stride_xm, i64 %stride_xp, ptr %dst, i32 %w) {
; CHECK-LABEL: convolution:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add x9, x1, x3
; CHECK-NEXT: add x10, x1, x2
; CHECK-NEXT: add x11, x0, x2
; CHECK-NEXT: add x12, x0, x3
; CHECK-NEXT: .LBB0_1: // %do.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: add x13, x1, x8
; CHECK-NEXT: add x14, x0, x8
; CHECK-NEXT: ldr q0, [x11, x8]
; CHECK-NEXT: ldp q2, q3, [x14]
; CHECK-NEXT: ldr q1, [x12, x8]
; CHECK-NEXT: ldp q6, q7, [x13]
; CHECK-NEXT: subs w5, w5, #1
; CHECK-NEXT: ldr q4, [x10, x8]
; CHECK-NEXT: ldr q5, [x9, x8]
; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s
; CHECK-NEXT: fadd v1.4s, v2.4s, v3.4s
; CHECK-NEXT: add x8, x8, #32
; CHECK-NEXT: fadd v2.4s, v4.4s, v5.4s
; CHECK-NEXT: fadd v3.4s, v6.4s, v7.4s
; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s
; CHECK-NEXT: fadd v1.4s, v2.4s, v3.4s
; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s
; CHECK-NEXT: str q0, [x4], #16
; CHECK-NEXT: b.ne .LBB0_1
; CHECK-NEXT: // %bb.2: // %do.end
; CHECK-NEXT: ret
entry:
  br label %do.body

do.body:
  ; Loop-carried state: destination pointer, both source pointers, and the
  ; remaining iteration count.
  %dst.addr.0 = phi ptr [ %dst, %entry ], [ %incdec.ptr, %do.body ]
  %src1.addr.0 = phi ptr [ %src1, %entry ], [ %incdec.ptr2.i7, %do.body ]
  %src0.addr.0 = phi ptr [ %src0, %entry ], [ %incdec.ptr2.i, %do.body ]
  %w.addr.0 = phi i32 [ %w, %entry ], [ %dec, %do.body ]
  ; src0: two loads at byte offsets %stride_xm / %stride_xp from the pointer.
  %add.ptr.i = getelementptr inbounds i8, ptr %src0.addr.0, i64 %stride_xm
  %0 = load <4 x float>, ptr %add.ptr.i, align 16
  %add.ptr1.i = getelementptr inbounds i8, ptr %src0.addr.0, i64 %stride_xp
  %1 = load <4 x float>, ptr %add.ptr1.i, align 16
  ; src0: two adjacent vectors (element 0 and element 1); %incdec.ptr2.i is the
  ; pointer advanced by two elements, fed back into the phi for the next trip.
  %incdec.ptr.i = getelementptr inbounds <4 x float>, ptr %src0.addr.0, i64 1
  %2 = load <4 x float>, ptr %src0.addr.0, align 16
  %incdec.ptr2.i = getelementptr inbounds <4 x float>, ptr %src0.addr.0, i64 2
  %3 = load <4 x float>, ptr %incdec.ptr.i, align 16
  ; Sum of the four src0 vectors.
  %add.i = fadd <4 x float> %0, %1
  %add3.i = fadd <4 x float> %2, %3
  %add4.i = fadd <4 x float> %add.i, %add3.i
  ; src1: same four loads as for src0 (two strided, two adjacent).
  %add.ptr.i4 = getelementptr inbounds i8, ptr %src1.addr.0, i64 %stride_xm
  %4 = load <4 x float>, ptr %add.ptr.i4, align 16
  %add.ptr1.i5 = getelementptr inbounds i8, ptr %src1.addr.0, i64 %stride_xp
  %5 = load <4 x float>, ptr %add.ptr1.i5, align 16
  %incdec.ptr.i6 = getelementptr inbounds <4 x float>, ptr %src1.addr.0, i64 1
  %6 = load <4 x float>, ptr %src1.addr.0, align 16
  %incdec.ptr2.i7 = getelementptr inbounds <4 x float>, ptr %src1.addr.0, i64 2
  %7 = load <4 x float>, ptr %incdec.ptr.i6, align 16
  ; Sum of the four src1 vectors.
  %add.i8 = fadd <4 x float> %4, %5
  %add3.i9 = fadd <4 x float> %6, %7
  %add4.i10 = fadd <4 x float> %add.i8, %add3.i9
  ; Combine both partial sums and store one vector to the destination, whose
  ; pointer advances by a single <4 x float> element per iteration.
  %add = fadd <4 x float> %add4.i, %add4.i10
  %incdec.ptr = getelementptr inbounds <4 x float>, ptr %dst.addr.0, i64 1
  store <4 x float> %add, ptr %dst.addr.0, align 16
  ; Count down and exit once the counter reaches zero.
  %dec = add nsw i32 %w.addr.0, -1
  %tobool.not = icmp eq i32 %dec, 0
  br i1 %tobool.not, label %do.end, label %do.body

do.end:
  ret void
}