; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -o - < %s | FileCheck %s

target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
target triple = "armv8-unknown-linux-gnueabihf"

; Consecutive 16-byte loads are selected as post-incremented vld1 instructions.
define <4 x float> @test(ptr %A) {
; CHECK-LABEL: test:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.32 {d16, d17}, [r0]!
; CHECK-NEXT:    vld1.32 {d18, d19}, [r0]!
; CHECK-NEXT:    vadd.f32 q8, q8, q9
; CHECK-NEXT:    vld1.32 {d18, d19}, [r0]
; CHECK-NEXT:    vadd.f32 q0, q8, q9
; CHECK-NEXT:    bx lr
  %X = load <4 x float>, ptr %A, align 4
  %Y.ptr.elt = getelementptr inbounds float, ptr %A, i32 4
  %Y = load <4 x float>, ptr %Y.ptr.elt, align 4
  %Z.ptr.elt = getelementptr inbounds float, ptr %A, i32 8
  %Z = load <4 x float>, ptr %Z.ptr.elt, align 4
  %tmp.sum = fadd <4 x float> %X, %Y
  %sum = fadd <4 x float> %tmp.sum, %Z
  ret <4 x float> %sum
}

; Loads spaced 24 bytes apart: expect post-indexed vld1s with a register stride.
define <4 x float> @test_stride(ptr %A) {
; CHECK-LABEL: test_stride:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    mov r1, #24
; CHECK-NEXT:    vld1.32 {d16, d17}, [r0], r1
; CHECK-NEXT:    vld1.32 {d18, d19}, [r0], r1
; CHECK-NEXT:    vadd.f32 q8, q8, q9
; CHECK-NEXT:    vld1.32 {d18, d19}, [r0]
; CHECK-NEXT:    vadd.f32 q0, q8, q9
; CHECK-NEXT:    bx lr
  %X = load <4 x float>, ptr %A, align 4
  %Y.ptr.elt = getelementptr inbounds float, ptr %A, i32 6
  %Y = load <4 x float>, ptr %Y.ptr.elt, align 4
  %Z.ptr.elt = getelementptr inbounds float, ptr %A, i32 12
  %Z = load <4 x float>, ptr %Z.ptr.elt, align 4
  %tmp.sum = fadd <4 x float> %X, %Y
  %sum = fadd <4 x float> %tmp.sum, %Z
  ret <4 x float> %sum
}

; A 24-byte register stride followed by a regular 16-byte post-increment.
define <4 x float> @test_stride_mixed(ptr %A) {
; CHECK-LABEL: test_stride_mixed:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    mov r1, #24
; CHECK-NEXT:    vld1.32 {d16, d17}, [r0], r1
; CHECK-NEXT:    vld1.32 {d18, d19}, [r0]!
; CHECK-NEXT:    vadd.f32 q8, q8, q9
; CHECK-NEXT:    vld1.32 {d18, d19}, [r0]
; CHECK-NEXT:    vadd.f32 q0, q8, q9
; CHECK-NEXT:    bx lr
  %X = load <4 x float>, ptr %A, align 4
  %Y.ptr.elt = getelementptr inbounds float, ptr %A, i32 6
  %Y = load <4 x float>, ptr %Y.ptr.elt, align 4
  %Z.ptr.elt = getelementptr inbounds float, ptr %A, i32 10
  %Z = load <4 x float>, ptr %Z.ptr.elt, align 4
  %tmp.sum = fadd <4 x float> %X, %Y
  %sum = fadd <4 x float> %tmp.sum, %Z
  ret <4 x float> %sum
}

; Refrain from using multiple stride registers; r1 is reused for both strides.
define <4 x float> @test_stride_noop(ptr %A) {
; CHECK-LABEL: test_stride_noop:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    mov r1, #24
; CHECK-NEXT:    vld1.32 {d16, d17}, [r0], r1
; CHECK-NEXT:    mov r1, #32
; CHECK-NEXT:    vld1.32 {d18, d19}, [r0], r1
; CHECK-NEXT:    vadd.f32 q8, q8, q9
; CHECK-NEXT:    vld1.32 {d18, d19}, [r0]
; CHECK-NEXT:    vadd.f32 q0, q8, q9
; CHECK-NEXT:    bx lr
  %X = load <4 x float>, ptr %A, align 4
  %Y.ptr.elt = getelementptr inbounds float, ptr %A, i32 6
  %Y = load <4 x float>, ptr %Y.ptr.elt, align 4
  %Z.ptr.elt = getelementptr inbounds float, ptr %A, i32 14
  %Z = load <4 x float>, ptr %Z.ptr.elt, align 4
  %tmp.sum = fadd <4 x float> %X, %Y
  %sum = fadd <4 x float> %tmp.sum, %Z
  ret <4 x float> %sum
}

; A constant initial offset (8 floats = 32 bytes) is folded into a single add.
define <4 x float> @test_positive_initial_offset(ptr %A) {
; CHECK-LABEL: test_positive_initial_offset:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    add r0, r0, #32
; CHECK-NEXT:    vld1.32 {d16, d17}, [r0]!
; CHECK-NEXT:    vld1.32 {d18, d19}, [r0]!
; CHECK-NEXT:    vadd.f32 q8, q8, q9
; CHECK-NEXT:    vld1.32 {d18, d19}, [r0]
; CHECK-NEXT:    vadd.f32 q0, q8, q9
; CHECK-NEXT:    bx lr
  %X.ptr.elt = getelementptr inbounds float, ptr %A, i32 8
  %X = load <4 x float>, ptr %X.ptr.elt, align 4
  %Y.ptr.elt = getelementptr inbounds float, ptr %A, i32 12
  %Y = load <4 x float>, ptr %Y.ptr.elt, align 4
  %Z.ptr.elt = getelementptr inbounds float, ptr %A, i32 16
  %Z = load <4 x float>, ptr %Z.ptr.elt, align 4
  %tmp.sum = fadd <4 x float> %X, %Y
  %sum = fadd <4 x float> %tmp.sum, %Z
  ret <4 x float> %sum
}

; A negative initial offset (-16 floats = -64 bytes) becomes a single sub.
define <4 x float> @test_negative_initial_offset(ptr %A) {
; CHECK-LABEL: test_negative_initial_offset:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    sub r0, r0, #64
; CHECK-NEXT:    vld1.32 {d16, d17}, [r0]!
; CHECK-NEXT:    vld1.32 {d18, d19}, [r0]!
; CHECK-NEXT:    vadd.f32 q8, q8, q9
; CHECK-NEXT:    vld1.32 {d18, d19}, [r0]
; CHECK-NEXT:    vadd.f32 q0, q8, q9
; CHECK-NEXT:    bx lr
  %X.ptr.elt = getelementptr inbounds float, ptr %A, i32 -16
  %X = load <4 x float>, ptr %X.ptr.elt, align 4
  %Y.ptr.elt = getelementptr inbounds float, ptr %A, i32 -12
  %Y = load <4 x float>, ptr %Y.ptr.elt, align 4
  %Z.ptr.elt = getelementptr inbounds float, ptr %A, i32 -8
  %Z = load <4 x float>, ptr %Z.ptr.elt, align 4
  %tmp.sum = fadd <4 x float> %X, %Y
  %sum = fadd <4 x float> %tmp.sum, %Z
  ret <4 x float> %sum
}

; The same offset folding applies when the base is a global address.
@global_float_array = external global [128 x float], align 4
define <4 x float> @test_global() {
; CHECK-LABEL: test_global:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    movw r0, :lower16:global_float_array
; CHECK-NEXT:    movt r0, :upper16:global_float_array
; CHECK-NEXT:    add r0, r0, #32
; CHECK-NEXT:    vld1.32 {d16, d17}, [r0]!
; CHECK-NEXT:    vld1.32 {d18, d19}, [r0]!
; CHECK-NEXT:    vadd.f32 q8, q8, q9
; CHECK-NEXT:    vld1.32 {d18, d19}, [r0]
; CHECK-NEXT:    vadd.f32 q0, q8, q9
; CHECK-NEXT:    bx lr
  %X = load <4 x float>, ptr getelementptr inbounds ([128 x float], ptr @global_float_array, i32 0, i32 8), align 4
  %Y = load <4 x float>, ptr getelementptr inbounds ([128 x float], ptr @global_float_array, i32 0, i32 12), align 4
  %Z = load <4 x float>, ptr getelementptr inbounds ([128 x float], ptr @global_float_array, i32 0, i32 16), align 4
  %tmp.sum = fadd <4 x float> %X, %Y
  %sum = fadd <4 x float> %tmp.sum, %Z
  ret <4 x float> %sum
}

define <4 x float> @test_stack() {
; Use huge alignment to test that ADD would not be converted to OR
; CHECK-LABEL: test_stack:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    .save {r4, r10, r11, lr}
; CHECK-NEXT:    push {r4, r10, r11, lr}
; CHECK-NEXT:    .setfp r11, sp, #8
; CHECK-NEXT:    add r11, sp, #8
; CHECK-NEXT:    .pad #240
; CHECK-NEXT:    sub sp, sp, #240
; CHECK-NEXT:    bfc sp, #0, #7
; CHECK-NEXT:    mov r4, sp
; CHECK-NEXT:    mov r0, r4
; CHECK-NEXT:    bl external_function
; CHECK-NEXT:    vld1.32 {d16, d17}, [r4:128]!
; CHECK-NEXT:    vld1.32 {d18, d19}, [r4:128]!
; CHECK-NEXT:    vadd.f32 q8, q8, q9
; CHECK-NEXT:    vld1.64 {d18, d19}, [r4:128]
; CHECK-NEXT:    vadd.f32 q0, q8, q9
; CHECK-NEXT:    sub sp, r11, #8
; CHECK-NEXT:    pop {r4, r10, r11, pc}
  %array = alloca [32 x float], align 128
  call void @external_function(ptr %array)
  %X = load <4 x float>, ptr %array, align 4
  %Y.ptr.elt = getelementptr inbounds [32 x float], ptr %array, i32 0, i32 4
  %Y = load <4 x float>, ptr %Y.ptr.elt, align 4
  %Z.ptr.elt = getelementptr inbounds [32 x float], ptr %array, i32 0, i32 8
  %Z = load <4 x float>, ptr %Z.ptr.elt, align 4
  %tmp.sum = fadd <4 x float> %X, %Y
  %sum = fadd <4 x float> %tmp.sum, %Z
  ret <4 x float> %sum
}

; The same pattern with <2 x double> elements (8 doubles = a 64-byte offset).
define <2 x double> @test_double(ptr %A) {
; CHECK-LABEL: test_double:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    add r0, r0, #64
; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]!
; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]!
; CHECK-NEXT:    vadd.f64 d20, d17, d19
; CHECK-NEXT:    vadd.f64 d16, d16, d18
; CHECK-NEXT:    vld1.64 {d22, d23}, [r0]
; CHECK-NEXT:    vadd.f64 d1, d20, d23
; CHECK-NEXT:    vadd.f64 d0, d16, d22
; CHECK-NEXT:    bx lr
  %X.ptr.elt = getelementptr inbounds double, ptr %A, i32 8
  %X = load <2 x double>, ptr %X.ptr.elt, align 8
  %Y.ptr.elt = getelementptr inbounds double, ptr %A, i32 10
  %Y = load <2 x double>, ptr %Y.ptr.elt, align 8
  %Z.ptr.elt = getelementptr inbounds double, ptr %A, i32 12
  %Z = load <2 x double>, ptr %Z.ptr.elt, align 8
  %tmp.sum = fadd <2 x double> %X, %Y
  %sum = fadd <2 x double> %tmp.sum, %Z
  ret <2 x double> %sum
}

; Post-indexing is also formed for NEON vld1/vst1 intrinsics mixed with regular loads.
define void @test_various_instructions(ptr %A) {
; CHECK-LABEL: test_various_instructions:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.32 {d16, d17}, [r0]!
; CHECK-NEXT:    vld1.32 {d18, d19}, [r0]!
; CHECK-NEXT:    vadd.f32 q8, q8, q9
; CHECK-NEXT:    vst1.32 {d16, d17}, [r0]
; CHECK-NEXT:    bx lr
  %X = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0(ptr %A, i32 1)
  %Y.ptr.elt = getelementptr inbounds float, ptr %A, i32 4
  %Y = load <4 x float>, ptr %Y.ptr.elt, align 4
  %Z.ptr.elt = getelementptr inbounds float, ptr %A, i32 8
  %Z = fadd <4 x float> %X, %Y
  tail call void @llvm.arm.neon.vst1.p0.v4f32(ptr nonnull %Z.ptr.elt, <4 x float> %Z, i32 4)
  ret void
}

; Post-indexed addressing should also be formed for LSR-produced GEPs in a loop.
define void @test_lsr_geps(ptr %a, ptr %b, i32 %n) {
; CHECK-LABEL: test_lsr_geps:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    bxlt lr
; CHECK-NEXT:  .LBB10_1: @ %for.body.preheader
; CHECK-NEXT:    mov r12, #0
; CHECK-NEXT:  .LBB10_2: @ %for.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    add r3, r0, r12
; CHECK-NEXT:    subs r2, r2, #1
; CHECK-NEXT:    vld1.32 {d16, d17}, [r3]!
; CHECK-NEXT:    vld1.32 {d18, d19}, [r3]!
; CHECK-NEXT:    vld1.32 {d20, d21}, [r3]!
; CHECK-NEXT:    vld1.32 {d22, d23}, [r3]
; CHECK-NEXT:    add r3, r1, r12
; CHECK-NEXT:    add r12, r12, #64
; CHECK-NEXT:    vst1.32 {d16, d17}, [r3]!
; CHECK-NEXT:    vst1.32 {d18, d19}, [r3]!
; CHECK-NEXT:    vst1.32 {d20, d21}, [r3]!
; CHECK-NEXT:    vst1.32 {d22, d23}, [r3]
; CHECK-NEXT:    bne .LBB10_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    bx lr
entry:
  %cmp61 = icmp sgt i32 %n, 0
  br i1 %cmp61, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %lsr.iv1 = phi i32 [ 0, %for.body.preheader ], [ %lsr.iv.next2, %for.body ]
  %lsr.iv = phi i32 [ %n, %for.body.preheader ], [ %lsr.iv.next, %for.body ]
  %uglygep19 = getelementptr i8, ptr %a, i32 %lsr.iv1
  %0 = load <4 x float>, ptr %uglygep19, align 4
  %uglygep16 = getelementptr i8, ptr %a, i32 %lsr.iv1
  %scevgep18 = getelementptr <4 x float>, ptr %uglygep16, i32 1
  %1 = load <4 x float>, ptr %scevgep18, align 4
  %uglygep13 = getelementptr i8, ptr %a, i32 %lsr.iv1
  %scevgep15 = getelementptr <4 x float>, ptr %uglygep13, i32 2
  %2 = load <4 x float>, ptr %scevgep15, align 4
  %uglygep10 = getelementptr i8, ptr %a, i32 %lsr.iv1
  %scevgep12 = getelementptr <4 x float>, ptr %uglygep10, i32 3
  %3 = load <4 x float>, ptr %scevgep12, align 4
  %uglygep8 = getelementptr i8, ptr %b, i32 %lsr.iv1
  tail call void @llvm.arm.neon.vst1.p0.v4f32(ptr %uglygep8, <4 x float> %0, i32 4)
  %uglygep6 = getelementptr i8, ptr %b, i32 %lsr.iv1
  %scevgep7 = getelementptr i8, ptr %uglygep6, i32 16
  tail call void @llvm.arm.neon.vst1.p0.v4f32(ptr nonnull %scevgep7, <4 x float> %1, i32 4)
  %uglygep4 = getelementptr i8, ptr %b, i32 %lsr.iv1
  %scevgep5 = getelementptr i8, ptr %uglygep4, i32 32
  tail call void @llvm.arm.neon.vst1.p0.v4f32(ptr nonnull %scevgep5, <4 x float> %2, i32 4)
  %uglygep = getelementptr i8, ptr %b, i32 %lsr.iv1
  %scevgep = getelementptr i8, ptr %uglygep, i32 48
  tail call void @llvm.arm.neon.vst1.p0.v4f32(ptr nonnull %scevgep, <4 x float> %3, i32 4)
  %lsr.iv.next = add i32 %lsr.iv, -1
  %lsr.iv.next2 = add nuw i32 %lsr.iv1, 64
  %exitcond.not = icmp eq i32 %lsr.iv.next, 0
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}

declare void @external_function(ptr)
declare <4 x float> @llvm.arm.neon.vld1.v4f32.p0(ptr, i32) nounwind readonly
declare void @llvm.arm.neon.vst1.p0.v4f32(ptr, <4 x float>, i32) nounwind argmemonly