; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp,+fp64 -mve-max-interleave-factor=4 -verify-machineinstrs %s -o - | FileCheck %s

; Each function below loads an interleave-4 vector from %src, deinterleaves it
; into four lanes with shufflevector, sums the lanes, and stores the result to
; %dst.  With -mve-max-interleave-factor=4 the 128-bit-per-lane cases should
; select the MVE vld40-vld43 instruction group; the smaller/unaligned cases
; fall back to scalarised or shuffle-based sequences.
; The CHECK lines are autogenerated - regenerate with the script, do not edit
; them by hand.

; i32

define void @vld4_v2i32(ptr %src, ptr %dst) {
; CHECK-LABEL: vld4_v2i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmov.f32 s10, s7
; CHECK-NEXT:    vmov r2, s6
; CHECK-NEXT:    vmov.f32 s6, s5
; CHECK-NEXT:    vmov r3, s4
; CHECK-NEXT:    vmov.f32 s8, s3
; CHECK-NEXT:    vmov.f32 s12, s1
; CHECK-NEXT:    vmov r0, s10
; CHECK-NEXT:    add r0, r2
; CHECK-NEXT:    vmov r2, s6
; CHECK-NEXT:    add r2, r3
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    add.w r12, r2, r0
; CHECK-NEXT:    vmov r2, s8
; CHECK-NEXT:    vmov r0, s0
; CHECK-NEXT:    add r2, r3
; CHECK-NEXT:    vmov r3, s12
; CHECK-NEXT:    add r0, r3
; CHECK-NEXT:    add r0, r2
; CHECK-NEXT:    strd r0, r12, [r1]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <8 x i32>, ptr %src, align 4
  %s1 = shufflevector <8 x i32> %l1, <8 x i32> undef, <2 x i32> <i32 0, i32 4>
  %s2 = shufflevector <8 x i32> %l1, <8 x i32> undef, <2 x i32> <i32 1, i32 5>
  %s3 = shufflevector <8 x i32> %l1, <8 x i32> undef, <2 x i32> <i32 2, i32 6>
  %s4 = shufflevector <8 x i32> %l1, <8 x i32> undef, <2 x i32> <i32 3, i32 7>
  %a1 = add <2 x i32> %s1, %s2
  %a2 = add <2 x i32> %s3, %s4
  %a3 = add <2 x i32> %a1, %a2
  store <2 x i32> %a3, ptr %dst
  ret void
}

define void @vld4_v4i32(ptr %src, ptr %dst) {
; CHECK-LABEL: vld4_v4i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld40.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld41.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld42.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld43.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vadd.i32 q2, q2, q3
; CHECK-NEXT:    vadd.i32 q0, q0, q1
; CHECK-NEXT:    vadd.i32 q0, q0, q2
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <16 x i32>, ptr %src, align 4
  %s1 = shufflevector <16 x i32> %l1, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  %s2 = shufflevector <16 x i32> %l1, <16 x i32> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
  %s3 = shufflevector <16 x i32> %l1, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
  %s4 = shufflevector <16 x i32> %l1, <16 x i32> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
  %a1 = add <4 x i32> %s1, %s2
  %a2 = add <4 x i32> %s3, %s4
  %a3 = add <4 x i32> %a1, %a2
  store <4 x i32> %a3, ptr %dst
  ret void
}

define void @vld4_v8i32(ptr %src, ptr %dst) {
; CHECK-LABEL: vld4_v8i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
; CHECK-NEXT:    vpush {d8, d9, d10, d11}
; CHECK-NEXT:    vld40.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld41.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld42.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld43.32 {q0, q1, q2, q3}, [r0]!
; CHECK-NEXT:    vadd.i32 q4, q2, q3
; CHECK-NEXT:    vadd.i32 q5, q0, q1
; CHECK-NEXT:    vld40.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vadd.i32 q4, q5, q4
; CHECK-NEXT:    vld41.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld42.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld43.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vstrw.32 q4, [r1]
; CHECK-NEXT:    vadd.i32 q2, q2, q3
; CHECK-NEXT:    vadd.i32 q0, q0, q1
; CHECK-NEXT:    vadd.i32 q0, q0, q2
; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
; CHECK-NEXT:    vpop {d8, d9, d10, d11}
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <32 x i32>, ptr %src, align 4
  %s1 = shufflevector <32 x i32> %l1, <32 x i32> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
  %s2 = shufflevector <32 x i32> %l1, <32 x i32> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
  %s3 = shufflevector <32 x i32> %l1, <32 x i32> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
  %s4 = shufflevector <32 x i32> %l1, <32 x i32> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
  %a1 = add <8 x i32> %s1, %s2
  %a2 = add <8 x i32> %s3, %s4
  %a3 = add <8 x i32> %a1, %a2
  store <8 x i32> %a3, ptr %dst
  ret void
}

define void @vld4_v16i32(ptr %src, ptr %dst) {
; CHECK-LABEL: vld4_v16i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    .pad #32
; CHECK-NEXT:    sub sp, #32
; CHECK-NEXT:    vld40.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    mov r2, r0
; CHECK-NEXT:    add.w r3, r0, #192
; CHECK-NEXT:    vld41.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld42.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    adds r0, #128
; CHECK-NEXT:    vld43.32 {q0, q1, q2, q3}, [r2]!
; CHECK-NEXT:    vadd.i32 q2, q2, q3
; CHECK-NEXT:    vld40.32 {q3, q4, q5, q6}, [r3]
; CHECK-NEXT:    vadd.i32 q0, q0, q1
; CHECK-NEXT:    vld41.32 {q3, q4, q5, q6}, [r3]
; CHECK-NEXT:    vadd.i32 q0, q0, q2
; CHECK-NEXT:    vstrw.32 q0, [sp, #16] @ 16-byte Spill
; CHECK-NEXT:    vld42.32 {q3, q4, q5, q6}, [r3]
; CHECK-NEXT:    vld43.32 {q3, q4, q5, q6}, [r3]
; CHECK-NEXT:    vadd.i32 q1, q5, q6
; CHECK-NEXT:    vadd.i32 q2, q3, q4
; CHECK-NEXT:    vadd.i32 q0, q2, q1
; CHECK-NEXT:    vld40.32 {q4, q5, q6, q7}, [r2]
; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
; CHECK-NEXT:    vld40.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld41.32 {q4, q5, q6, q7}, [r2]
; CHECK-NEXT:    vld41.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld42.32 {q4, q5, q6, q7}, [r2]
; CHECK-NEXT:    vld42.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld43.32 {q4, q5, q6, q7}, [r2]
; CHECK-NEXT:    vld43.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vadd.i32 q0, q0, q1
; CHECK-NEXT:    vadd.i32 q2, q2, q3
; CHECK-NEXT:    vadd.i32 q0, q0, q2
; CHECK-NEXT:    vadd.i32 q1, q6, q7
; CHECK-NEXT:    vadd.i32 q2, q4, q5
; CHECK-NEXT:    vstrw.32 q0, [r1, #32]
; CHECK-NEXT:    vadd.i32 q1, q2, q1
; CHECK-NEXT:    vldrw.u32 q2, [sp] @ 16-byte Reload
; CHECK-NEXT:    vldrw.u32 q0, [sp, #16] @ 16-byte Reload
; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
; CHECK-NEXT:    vstrw.32 q2, [r1, #48]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    add sp, #32
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <64 x i32>, ptr %src, align 4
  %s1 = shufflevector <64 x i32> %l1, <64 x i32> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
  %s2 = shufflevector <64 x i32> %l1, <64 x i32> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
  %s3 = shufflevector <64 x i32> %l1, <64 x i32> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
  %s4 = shufflevector <64 x i32> %l1, <64 x i32> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
  %a1 = add <16 x i32> %s1, %s2
  %a2 = add <16 x i32> %s3, %s4
  %a3 = add <16 x i32> %a1, %a2
  store <16 x i32> %a3, ptr %dst
  ret void
}

; align-1 load cannot use vld4x (which requires element alignment), so it is
; lowered to byte loads plus shuffles.
define void @vld4_v4i32_align1(ptr %src, ptr %dst) {
; CHECK-LABEL: vld4_v4i32_align1:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
; CHECK-NEXT:    vpush {d8, d9, d10, d11}
; CHECK-NEXT:    vldrb.u8 q0, [r0, #48]
; CHECK-NEXT:    vldrb.u8 q1, [r0, #32]
; CHECK-NEXT:    vldrb.u8 q3, [r0, #16]
; CHECK-NEXT:    vldrb.u8 q2, [r0]
; CHECK-NEXT:    vmov.f32 s18, s7
; CHECK-NEXT:    vmov.f32 s16, s11
; CHECK-NEXT:    vmov.f32 s20, s10
; CHECK-NEXT:    vmov.f32 s17, s15
; CHECK-NEXT:    vmov.f32 s19, s3
; CHECK-NEXT:    vmov.f32 s21, s14
; CHECK-NEXT:    vmov.f32 s22, s6
; CHECK-NEXT:    vmov.f32 s23, s2
; CHECK-NEXT:    vadd.i32 q4, q5, q4
; CHECK-NEXT:    vmov.f32 s20, s9
; CHECK-NEXT:    vmov.f32 s21, s13
; CHECK-NEXT:    vmov.f32 s22, s5
; CHECK-NEXT:    vmov.f32 s23, s1
; CHECK-NEXT:    vmov.f32 s9, s12
; CHECK-NEXT:    vmov.f32 s10, s4
; CHECK-NEXT:    vmov.f32 s11, s0
; CHECK-NEXT:    vadd.i32 q0, q2, q5
; CHECK-NEXT:    vadd.i32 q0, q0, q4
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    vpop {d8, d9, d10, d11}
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <16 x i32>, ptr %src, align 1
  %s1 = shufflevector <16 x i32> %l1, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  %s2 = shufflevector <16 x i32> %l1, <16 x i32> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
  %s3 = shufflevector <16 x i32> %l1, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
  %s4 = shufflevector <16 x i32> %l1, <16 x i32> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
  %a1 = add <4 x i32> %s1, %s2
  %a2 = add <4 x i32> %s3, %s4
  %a3 = add <4 x i32> %a1, %a2
  store <4 x i32> %a3, ptr %dst
  ret void
}

; i16

define void @vld4_v2i16(ptr %src, ptr %dst) {
; CHECK-LABEL: vld4_v2i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r0]
; CHECK-NEXT:    vmov.u16 r0, q0[7]
; CHECK-NEXT:    vmov.u16 r2, q0[6]
; CHECK-NEXT:    add r0, r2
; CHECK-NEXT:    vmov.u16 r2, q0[5]
; CHECK-NEXT:    vmov.u16 r3, q0[4]
; CHECK-NEXT:    add r2, r3
; CHECK-NEXT:    vmov.u16 r3, q0[0]
; CHECK-NEXT:    add r0, r2
; CHECK-NEXT:    strh r0, [r1, #2]
; CHECK-NEXT:    vmov.u16 r0, q0[3]
; CHECK-NEXT:    vmov.u16 r2, q0[2]
; CHECK-NEXT:    add r0, r2
; CHECK-NEXT:    vmov.u16 r2, q0[1]
; CHECK-NEXT:    add r2, r3
; CHECK-NEXT:    add r0, r2
; CHECK-NEXT:    strh r0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <8 x i16>, ptr %src, align 2
  %s1 = shufflevector <8 x i16> %l1, <8 x i16> undef, <2 x i32> <i32 0, i32 4>
  %s2 = shufflevector <8 x i16> %l1, <8 x i16> undef, <2 x i32> <i32 1, i32 5>
  %s3 = shufflevector <8 x i16> %l1, <8 x i16> undef, <2 x i32> <i32 2, i32 6>
  %s4 = shufflevector <8 x i16> %l1, <8 x i16> undef, <2 x i32> <i32 3, i32 7>
  %a1 = add <2 x i16> %s1, %s2
  %a2 = add <2 x i16> %s3, %s4
  %a3 = add <2 x i16> %a1, %a2
  store <2 x i16> %a3, ptr %dst
  ret void
}

define void @vld4_v4i16(ptr %src, ptr %dst) {
; CHECK-LABEL: vld4_v4i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    .pad #32
; CHECK-NEXT:    sub sp, #32
; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]
; CHECK-NEXT:    mov r4, sp
; CHECK-NEXT:    vshr.u32 q1, q0, #16
; CHECK-NEXT:    vmov.u16 r3, q0[3]
; CHECK-NEXT:    vstrh.32 q1, [r4, #8]
; CHECK-NEXT:    vldrh.u16 q1, [r0]
; CHECK-NEXT:    add r0, sp, #16
; CHECK-NEXT:    vshr.u32 q2, q1, #16
; CHECK-NEXT:    vmov.u16 r2, q1[3]
; CHECK-NEXT:    vstrh.32 q2, [r4]
; CHECK-NEXT:    vmov q2[2], q2[0], r2, r3
; CHECK-NEXT:    vmov.u16 r2, q0[7]
; CHECK-NEXT:    vmov.u16 r3, q1[7]
; CHECK-NEXT:    vmov q2[3], q2[1], r3, r2
; CHECK-NEXT:    vmov.u16 r2, q0[2]
; CHECK-NEXT:    vmov.u16 r3, q1[2]
; CHECK-NEXT:    vstrh.32 q0, [r0, #8]
; CHECK-NEXT:    vmov q3[2], q3[0], r3, r2
; CHECK-NEXT:    vmov.u16 r2, q0[6]
; CHECK-NEXT:    vmov.u16 r3, q1[6]
; CHECK-NEXT:    vstrh.32 q1, [r0]
; CHECK-NEXT:    vmov q3[3], q3[1], r3, r2
; CHECK-NEXT:    vldrw.u32 q1, [r4]
; CHECK-NEXT:    vadd.i32 q0, q3, q2
; CHECK-NEXT:    vldrw.u32 q2, [r0]
; CHECK-NEXT:    vadd.i32 q1, q2, q1
; CHECK-NEXT:    vadd.i32 q0, q1, q0
; CHECK-NEXT:    vstrh.32 q0, [r1]
; CHECK-NEXT:    add sp, #32
; CHECK-NEXT:    pop {r4, pc}
entry:
  %l1 = load <16 x i16>, ptr %src, align 2
  %s1 = shufflevector <16 x i16> %l1, <16 x i16> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  %s2 = shufflevector <16 x i16> %l1, <16 x i16> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
  %s3 = shufflevector <16 x i16> %l1, <16 x i16> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
  %s4 = shufflevector <16 x i16> %l1, <16 x i16> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
  %a1 = add <4 x i16> %s1, %s2
  %a2 = add <4 x i16> %s3, %s4
  %a3 = add <4 x i16> %a1, %a2
  store <4 x i16> %a3, ptr %dst
  ret void
}

define void @vld4_v8i16(ptr %src, ptr %dst) {
; CHECK-LABEL: vld4_v8i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld40.16 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld41.16 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld42.16 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld43.16 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vadd.i16 q2, q2, q3
; CHECK-NEXT:    vadd.i16 q0, q0, q1
; CHECK-NEXT:    vadd.i16 q0, q0, q2
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <32 x i16>, ptr %src, align 2
  %s1 = shufflevector <32 x i16> %l1, <32 x i16> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
  %s2 = shufflevector <32 x i16> %l1, <32 x i16> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
  %s3 = shufflevector <32 x i16> %l1, <32 x i16> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
  %s4 = shufflevector <32 x i16> %l1, <32 x i16> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
  %a1 = add <8 x i16> %s1, %s2
  %a2 = add <8 x i16> %s3, %s4
  %a3 = add <8 x i16> %a1, %a2
  store <8 x i16> %a3, ptr %dst
  ret void
}

define void @vld4_v16i16(ptr %src, ptr %dst) {
; CHECK-LABEL: vld4_v16i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
; CHECK-NEXT:    vpush {d8, d9, d10, d11}
; CHECK-NEXT:    vld40.16 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld41.16 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld42.16 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld43.16 {q0, q1, q2, q3}, [r0]!
; CHECK-NEXT:    vadd.i16 q4, q2, q3
; CHECK-NEXT:    vadd.i16 q5, q0, q1
; CHECK-NEXT:    vld40.16 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vadd.i16 q4, q5, q4
; CHECK-NEXT:    vld41.16 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld42.16 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld43.16 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vstrw.32 q4, [r1]
; CHECK-NEXT:    vadd.i16 q2, q2, q3
; CHECK-NEXT:    vadd.i16 q0, q0, q1
; CHECK-NEXT:    vadd.i16 q0, q0, q2
; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
; CHECK-NEXT:    vpop {d8, d9, d10, d11}
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <64 x i16>, ptr %src, align 2
  %s1 = shufflevector <64 x i16> %l1, <64 x i16> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
  %s2 = shufflevector <64 x i16> %l1, <64 x i16> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
  %s3 = shufflevector <64 x i16> %l1, <64 x i16> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
  %s4 = shufflevector <64 x i16> %l1, <64 x i16> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
  %a1 = add <16 x i16> %s1, %s2
  %a2 = add <16 x i16> %s3, %s4
  %a3 = add <16 x i16> %a1, %a2
  store <16 x i16> %a3, ptr %dst
  ret void
}

define void @vld4_v8i16_align1(ptr %src, ptr %dst) {
; CHECK-LABEL: vld4_v8i16_align1:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
; CHECK-NEXT:    vpush {d8, d9, d10, d11}
; CHECK-NEXT:    vldrb.u8 q1, [r0, #32]
; CHECK-NEXT:    vldrb.u8 q2, [r0, #48]
; CHECK-NEXT:    vmovx.f16 s18, s5
; CHECK-NEXT:    vmovx.f16 s0, s7
; CHECK-NEXT:    vins.f16 s18, s0
; CHECK-NEXT:    vmovx.f16 s19, s9
; CHECK-NEXT:    vmovx.f16 s0, s11
; CHECK-NEXT:    vins.f16 s5, s7
; CHECK-NEXT:    vins.f16 s19, s0
; CHECK-NEXT:    vldrb.u8 q0, [r0]
; CHECK-NEXT:    vins.f16 s9, s11
; CHECK-NEXT:    vmov.f32 s22, s5
; CHECK-NEXT:    vmovx.f16 s16, s1
; CHECK-NEXT:    vmovx.f16 s12, s3
; CHECK-NEXT:    vins.f16 s16, s12
; CHECK-NEXT:    vldrb.u8 q3, [r0, #16]
; CHECK-NEXT:    vins.f16 s1, s3
; CHECK-NEXT:    vmov.f32 s23, s9
; CHECK-NEXT:    vmovx.f16 s17, s13
; CHECK-NEXT:    vmovx.f16 s20, s15
; CHECK-NEXT:    vins.f16 s13, s15
; CHECK-NEXT:    vins.f16 s17, s20
; CHECK-NEXT:    vmov.f32 s20, s1
; CHECK-NEXT:    vmovx.f16 s1, s6
; CHECK-NEXT:    vmov.f32 s21, s13
; CHECK-NEXT:    vadd.i16 q4, q5, q4
; CHECK-NEXT:    vmovx.f16 s22, s4
; CHECK-NEXT:    vins.f16 s22, s1
; CHECK-NEXT:    vmovx.f16 s23, s8
; CHECK-NEXT:    vmovx.f16 s1, s10
; CHECK-NEXT:    vmovx.f16 s20, s0
; CHECK-NEXT:    vins.f16 s23, s1
; CHECK-NEXT:    vmovx.f16 s1, s2
; CHECK-NEXT:    vins.f16 s20, s1
; CHECK-NEXT:    vmovx.f16 s21, s12
; CHECK-NEXT:    vmovx.f16 s1, s14
; CHECK-NEXT:    vins.f16 s8, s10
; CHECK-NEXT:    vins.f16 s4, s6
; CHECK-NEXT:    vins.f16 s12, s14
; CHECK-NEXT:    vins.f16 s21, s1
; CHECK-NEXT:    vins.f16 s0, s2
; CHECK-NEXT:    vmov.f32 s3, s8
; CHECK-NEXT:    vmov.f32 s1, s12
; CHECK-NEXT:    vmov.f32 s2, s4
; CHECK-NEXT:    vadd.i16 q0, q0, q5
; CHECK-NEXT:    vadd.i16 q0, q0, q4
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    vpop {d8, d9, d10, d11}
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <32 x i16>, ptr %src, align 1
  %s1 = shufflevector <32 x i16> %l1, <32 x i16> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
  %s2 = shufflevector <32 x i16> %l1, <32 x i16> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
  %s3 = shufflevector <32 x i16> %l1, <32 x i16> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
  %s4 = shufflevector <32 x i16> %l1, <32 x i16> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
  %a1 = add <8 x i16> %s1, %s2
  %a2 = add <8 x i16> %s3, %s4
  %a3 = add <8 x i16> %a1, %a2
  store <8 x i16> %a3, ptr %dst
  ret void
}

; i8

define void @vld4_v2i8(ptr %src, ptr %dst) {
; CHECK-LABEL: vld4_v2i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u16 q0, [r0]
; CHECK-NEXT:    vmov.u16 r0, q0[7]
; CHECK-NEXT:    vmov.u16 r2, q0[6]
; CHECK-NEXT:    add r0, r2
; CHECK-NEXT:    vmov.u16 r2, q0[5]
; CHECK-NEXT:    vmov.u16 r3, q0[4]
; CHECK-NEXT:    add r2, r3
; CHECK-NEXT:    vmov.u16 r3, q0[0]
; CHECK-NEXT:    add r0, r2
; CHECK-NEXT:    strb r0, [r1, #1]
; CHECK-NEXT:    vmov.u16 r0, q0[3]
; CHECK-NEXT:    vmov.u16 r2, q0[2]
; CHECK-NEXT:    add r0, r2
; CHECK-NEXT:    vmov.u16 r2, q0[1]
; CHECK-NEXT:    add r2, r3
; CHECK-NEXT:    add r0, r2
; CHECK-NEXT:    strb r0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <8 x i8>, ptr %src, align 1
  %s1 = shufflevector <8 x i8> %l1, <8 x i8> undef, <2 x i32> <i32 0, i32 4>
  %s2 = shufflevector <8 x i8> %l1, <8 x i8> undef, <2 x i32> <i32 1, i32 5>
  %s3 = shufflevector <8 x i8> %l1, <8 x i8> undef, <2 x i32> <i32 2, i32 6>
  %s4 = shufflevector <8 x i8> %l1, <8 x i8> undef, <2 x i32> <i32 3, i32 7>
  %a1 = add <2 x i8> %s1, %s2
  %a2 = add <2 x i8> %s3, %s4
  %a3 = add <2 x i8> %a1, %a2
  store <2 x i8> %a3, ptr %dst
  ret void
}

define void @vld4_v4i8(ptr %src, ptr %dst) {
; CHECK-LABEL: vld4_v4i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u8 q0, [r0]
; CHECK-NEXT:    vmov.u8 r0, q0[10]
; CHECK-NEXT:    vmov.u8 r2, q0[2]
; CHECK-NEXT:    vmov q1[2], q1[0], r2, r0
; CHECK-NEXT:    vmov.u8 r0, q0[14]
; CHECK-NEXT:    vmov.u8 r2, q0[6]
; CHECK-NEXT:    vrev32.8 q2, q0
; CHECK-NEXT:    vmov q1[3], q1[1], r2, r0
; CHECK-NEXT:    vadd.i32 q1, q1, q2
; CHECK-NEXT:    vrev16.8 q2, q0
; CHECK-NEXT:    vadd.i32 q0, q0, q2
; CHECK-NEXT:    vadd.i32 q0, q0, q1
; CHECK-NEXT:    vstrb.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <16 x i8>, ptr %src, align 1
  %s1 = shufflevector <16 x i8> %l1, <16 x i8> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  %s2 = shufflevector <16 x i8> %l1, <16 x i8> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
  %s3 = shufflevector <16 x i8> %l1, <16 x i8> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
  %s4 = shufflevector <16 x i8> %l1, <16 x i8> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
  %a1 = add <4 x i8> %s1, %s2
  %a2 = add <4 x i8> %s3, %s4
  %a3 = add <4 x i8> %a1, %a2
  store <4 x i8> %a3, ptr %dst
  ret void
}

define void @vld4_v8i8(ptr %src, ptr %dst) {
; CHECK-LABEL: vld4_v8i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .pad #32
; CHECK-NEXT:    sub sp, #32
; CHECK-NEXT:    vldrb.u8 q0, [r0, #16]
; CHECK-NEXT:    mov r2, sp
; CHECK-NEXT:    vshr.u16 q1, q0, #8
; CHECK-NEXT:    vstrb.16 q1, [r2, #8]
; CHECK-NEXT:    vldrb.u8 q2, [r0]
; CHECK-NEXT:    add r0, sp, #16
; CHECK-NEXT:    vshr.u16 q1, q2, #8
; CHECK-NEXT:    vmov.u8 r3, q2[3]
; CHECK-NEXT:    vstrb.16 q1, [r2]
; CHECK-NEXT:    vmov.16 q1[0], r3
; CHECK-NEXT:    vmov.u8 r3, q2[7]
; CHECK-NEXT:    vstrb.16 q0, [r0, #8]
; CHECK-NEXT:    vmov.16 q1[1], r3
; CHECK-NEXT:    vmov.u8 r3, q2[11]
; CHECK-NEXT:    vmov.16 q1[2], r3
; CHECK-NEXT:    vmov.u8 r3, q2[15]
; CHECK-NEXT:    vmov.16 q1[3], r3
; CHECK-NEXT:    vmov.u8 r3, q0[3]
; CHECK-NEXT:    vmov.16 q1[4], r3
; CHECK-NEXT:    vmov.u8 r3, q0[7]
; CHECK-NEXT:    vmov.16 q1[5], r3
; CHECK-NEXT:    vmov.u8 r3, q0[11]
; CHECK-NEXT:    vmov.16 q1[6], r3
; CHECK-NEXT:    vmov.u8 r3, q0[15]
; CHECK-NEXT:    vmov.16 q1[7], r3
; CHECK-NEXT:    vmov.u8 r3, q2[2]
; CHECK-NEXT:    vmov.16 q3[0], r3
; CHECK-NEXT:    vmov.u8 r3, q2[6]
; CHECK-NEXT:    vmov.16 q3[1], r3
; CHECK-NEXT:    vmov.u8 r3, q2[10]
; CHECK-NEXT:    vmov.16 q3[2], r3
; CHECK-NEXT:    vmov.u8 r3, q2[14]
; CHECK-NEXT:    vmov.16 q3[3], r3
; CHECK-NEXT:    vmov.u8 r3, q0[2]
; CHECK-NEXT:    vmov.16 q3[4], r3
; CHECK-NEXT:    vmov.u8 r3, q0[6]
; CHECK-NEXT:    vmov.16 q3[5], r3
; CHECK-NEXT:    vmov.u8 r3, q0[10]
; CHECK-NEXT:    vmov.16 q3[6], r3
; CHECK-NEXT:    vmov.u8 r3, q0[14]
; CHECK-NEXT:    vstrb.16 q2, [r0]
; CHECK-NEXT:    vmov.16 q3[7], r3
; CHECK-NEXT:    vadd.i16 q0, q3, q1
; CHECK-NEXT:    vldrw.u32 q1, [r2]
; CHECK-NEXT:    vldrw.u32 q2, [r0]
; CHECK-NEXT:    vadd.i16 q1, q2, q1
; CHECK-NEXT:    vadd.i16 q0, q1, q0
; CHECK-NEXT:    vstrb.16 q0, [r1]
; CHECK-NEXT:    add sp, #32
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <32 x i8>, ptr %src, align 1
  %s1 = shufflevector <32 x i8> %l1, <32 x i8> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
  %s2 = shufflevector <32 x i8> %l1, <32 x i8> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
  %s3 = shufflevector <32 x i8> %l1, <32 x i8> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
  %s4 = shufflevector <32 x i8> %l1, <32 x i8> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
  %a1 = add <8 x i8> %s1, %s2
  %a2 = add <8 x i8> %s3, %s4
  %a3 = add <8 x i8> %a1, %a2
  store <8 x i8> %a3, ptr %dst
  ret void
}

define void @vld4_v16i8(ptr %src, ptr %dst) {
; CHECK-LABEL: vld4_v16i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld40.8 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld41.8 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld42.8 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld43.8 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vadd.i8 q2, q2, q3
; CHECK-NEXT:    vadd.i8 q0, q0, q1
; CHECK-NEXT:    vadd.i8 q0, q0, q2
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <64 x i8>, ptr %src, align 1
  %s1 = shufflevector <64 x i8> %l1, <64 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
  %s2 = shufflevector <64 x i8> %l1, <64 x i8> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
  %s3 = shufflevector <64 x i8> %l1, <64 x i8> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
  %s4 = shufflevector <64 x i8> %l1, <64 x i8> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
  %a1 = add <16 x i8> %s1, %s2
  %a2 = add <16 x i8> %s3, %s4
  %a3 = add <16 x i8> %a1, %a2
  store <16 x i8> %a3, ptr %dst
  ret void
}

; i64

define void @vld4_v2i64(ptr %src, ptr %dst) {
; CHECK-LABEL: vld4_v2i64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, lr}
; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vldrw.u32 q2, [r0, #48]
; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]
; CHECK-NEXT:    vmov.f32 s4, s2
; CHECK-NEXT:    vmov.f32 s5, s3
; CHECK-NEXT:    vmov.f32 s2, s8
; CHECK-NEXT:    vmov.f32 s3, s9
; CHECK-NEXT:    vmov lr, r12, d5
; CHECK-NEXT:    vldrw.u32 q2, [r0]
; CHECK-NEXT:    vmov r0, r8, d9
; CHECK-NEXT:    vmov.f32 s12, s10
; CHECK-NEXT:    vmov.f32 s13, s11
; CHECK-NEXT:    vmov r2, r3, d1
; CHECK-NEXT:    vmov.f32 s2, s16
; CHECK-NEXT:    vmov.f32 s3, s17
; CHECK-NEXT:    vmov r5, r6, d1
; CHECK-NEXT:    adds.w r2, r2, lr
; CHECK-NEXT:    adc.w r3, r3, r12
; CHECK-NEXT:    vmov r4, r12, d2
; CHECK-NEXT:    adds r0, r0, r5
; CHECK-NEXT:    vmov r5, r7, d0
; CHECK-NEXT:    adc.w r6, r6, r8
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w lr, r6, r3
; CHECK-NEXT:    vmov r3, r6, d6
; CHECK-NEXT:    adds r5, r5, r4
; CHECK-NEXT:    vmov r4, r2, d4
; CHECK-NEXT:    adc.w r7, r7, r12
; CHECK-NEXT:    adds r3, r3, r4
; CHECK-NEXT:    adcs r2, r6
; CHECK-NEXT:    adds r3, r3, r5
; CHECK-NEXT:    adcs r2, r7
; CHECK-NEXT:    vmov q0[2], q0[0], r3, r0
; CHECK-NEXT:    vmov q0[3], q0[1], r2, lr
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
entry:
  %l1 = load <8 x i64>, ptr %src, align 8
  %s1 = shufflevector <8 x i64> %l1, <8 x i64> undef, <2 x i32> <i32 0, i32 4>
  %s2 = shufflevector <8 x i64> %l1, <8 x i64> undef, <2 x i32> <i32 1, i32 5>
  %s3 = shufflevector <8 x i64> %l1, <8 x i64> undef, <2 x i32> <i32 2, i32 6>
  %s4 = shufflevector <8 x i64> %l1, <8 x i64> undef, <2 x i32> <i32 3, i32 7>
  %a1 = add <2 x i64> %s1, %s2
  %a2 = add <2 x i64> %s3, %s4
  %a3 = add <2 x i64> %a1, %a2
  store <2 x i64> %a3, ptr %dst
  ret void
}

define void @vld4_v4i64(ptr %src, ptr %dst) {
; CHECK-LABEL: vld4_v4i64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, lr}
; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, lr}
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    .pad #16
; CHECK-NEXT:    sub sp, #16
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vldrw.u32 q5, [r0, #48]
; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]
; CHECK-NEXT:    vldrw.u32 q6, [r0, #80]
; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
; CHECK-NEXT:    vmov.f32 s8, s2
; CHECK-NEXT:    vmov.f32 s9, s3
; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
; CHECK-NEXT:    vmov.f32 s2, s20
; CHECK-NEXT:    vldrw.u32 q7, [r0, #112]
; CHECK-NEXT:    vmov.f32 s3, s21
; CHECK-NEXT:    vmov r3, r2, d11
; CHECK-NEXT:    vldrw.u32 q5, [r0, #96]
; CHECK-NEXT:    vmov.f32 s0, s26
; CHECK-NEXT:    vmov.f32 s1, s27
; CHECK-NEXT:    vmov lr, r12, d9
; CHECK-NEXT:    vmov.f32 s12, s6
; CHECK-NEXT:    vmov.f32 s13, s7
; CHECK-NEXT:    vmov r4, r5, d1
; CHECK-NEXT:    vmov.f32 s2, s16
; CHECK-NEXT:    vmov.f32 s3, s17
; CHECK-NEXT:    vldrw.u32 q4, [r0, #64]
; CHECK-NEXT:    vmov.f32 s6, s28
; CHECK-NEXT:    vmov.f32 s7, s29
; CHECK-NEXT:    vmov.f32 s10, s20
; CHECK-NEXT:    vmov.f32 s11, s21
; CHECK-NEXT:    vmov r0, r6, d1
; CHECK-NEXT:    adds r7, r4, r3
; CHECK-NEXT:    vmov r4, r8, d0
; CHECK-NEXT:    adcs r5, r2
; CHECK-NEXT:    vmov r2, r3, d12
; CHECK-NEXT:    vmov.f32 s0, s18
; CHECK-NEXT:    vmov.f32 s1, s19
; CHECK-NEXT:    adds.w r0, r0, lr
; CHECK-NEXT:    adc.w r6, r6, r12
; CHECK-NEXT:    adds.w lr, r0, r7
; CHECK-NEXT:    adc.w r12, r6, r5
; CHECK-NEXT:    vmov r6, r5, d0
; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
; CHECK-NEXT:    adds r2, r2, r4
; CHECK-NEXT:    vmov r4, r0, d8
; CHECK-NEXT:    adc.w r3, r3, r8
; CHECK-NEXT:    adds r6, r6, r4
; CHECK-NEXT:    adcs r0, r5
; CHECK-NEXT:    adds.w r9, r6, r2
; CHECK-NEXT:    adc.w r8, r0, r3
; CHECK-NEXT:    vmov r5, r4, d15
; CHECK-NEXT:    vmov r3, r6, d3
; CHECK-NEXT:    vmov r7, r0, d5
; CHECK-NEXT:    adds r3, r3, r5
; CHECK-NEXT:    adcs r6, r4
; CHECK-NEXT:    vmov r5, r4, d11
; CHECK-NEXT:    adds r5, r5, r7
; CHECK-NEXT:    adcs r0, r4
; CHECK-NEXT:    adds r3, r3, r5
; CHECK-NEXT:    adc.w r10, r0, r6
; CHECK-NEXT:    vmov r4, r5, d4
; CHECK-NEXT:    vmov r6, r7, d0
; CHECK-NEXT:    vmov r2, r0, d2
; CHECK-NEXT:    vmov q1[2], q1[0], r9, r3
; CHECK-NEXT:    vmov q1[3], q1[1], r8, r10
; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
; CHECK-NEXT:    adds r4, r4, r6
; CHECK-NEXT:    adcs r5, r7
; CHECK-NEXT:    vmov r6, r7, d6
; CHECK-NEXT:    adds r2, r2, r6
; CHECK-NEXT:    adcs r0, r7
; CHECK-NEXT:    adds r2, r2, r4
; CHECK-NEXT:    vmov q0[2], q0[0], r2, lr
; CHECK-NEXT:    adcs r0, r5
; CHECK-NEXT:    vmov q0[3], q0[1], r0, r12
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    add sp, #16
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
entry:
  %l1 = load <16 x i64>, ptr %src, align 8
  %s1 = shufflevector <16 x i64> %l1, <16 x i64> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  %s2 = shufflevector <16 x i64> %l1, <16 x i64> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
  %s3 = shufflevector <16 x i64> %l1, <16 x i64> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
  %s4 = shufflevector <16 x i64> %l1, <16 x i64> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
  %a1 = add <4 x i64> %s1, %s2
  %a2 = add <4 x i64> %s3, %s4
  %a3 = add <4 x i64> %a1, %a2
  store <4 x i64> %a3, ptr %dst
  ret void
}

; f32

define void @vld4_v2f32(ptr %src, ptr %dst) {
; CHECK-LABEL: vld4_v2f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vmov.f32 s8, s7
; CHECK-NEXT:    vmov.f32 s9, s3
; CHECK-NEXT:    vmov.f32 s12, s6
; CHECK-NEXT:    vmov.f32 s13, s2
; CHECK-NEXT:    vadd.f32 q2, q3, q2
; CHECK-NEXT:    vmov.f32 s12, s5
; CHECK-NEXT:    vmov.f32 s13, s1
; CHECK-NEXT:    vmov.f32 s5, s0
; CHECK-NEXT:    vadd.f32 q0, q1, q3
; CHECK-NEXT:    vadd.f32 q0, q0, q2
; CHECK-NEXT:    vstmia r1, {s0, s1}
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <8 x float>, ptr %src, align 4
  %s1 = shufflevector <8 x float> %l1, <8 x float> undef, <2 x i32> <i32 0, i32 4>
  %s2 = shufflevector <8 x float> %l1, <8 x float> undef, <2 x i32> <i32 1, i32 5>
  %s3 = shufflevector <8 x float> %l1, <8 x float> undef, <2 x i32> <i32 2, i32 6>
  %s4 = shufflevector <8 x float> %l1, <8 x float> undef, <2 x i32> <i32 3, i32 7>
  %a1 = fadd <2 x float> %s1, %s2
  %a2 = fadd <2 x float> %s3, %s4
  %a3 = fadd <2 x float> %a1, %a2
  store <2 x float> %a3, ptr %dst
  ret void
}

define void @vld4_v4f32(ptr %src, ptr %dst) {
; CHECK-LABEL: vld4_v4f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld40.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld41.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld42.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld43.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vadd.f32 q2, q2, q3
; CHECK-NEXT:    vadd.f32 q0, q0, q1
; CHECK-NEXT:    vadd.f32 q0, q0, q2
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <16 x float>, ptr %src, align 4
  %s1 = shufflevector <16 x float> %l1, <16 x float> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  %s2 = shufflevector <16 x float> %l1, <16 x float> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
  %s3 = shufflevector <16 x float> %l1, <16 x float> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
  %s4 = shufflevector <16 x float> %l1, <16 x float> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
  %a1 = fadd <4 x float> %s1, %s2
  %a2 = fadd <4 x float> %s3, %s4
  %a3 = fadd <4 x float> %a1, %a2
  store <4 x float> %a3, ptr %dst
  ret void
}

define void @vld4_v8f32(ptr %src, ptr %dst) {
; CHECK-LABEL: vld4_v8f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
; CHECK-NEXT:    vpush {d8, d9, d10, d11}
; CHECK-NEXT:    vld40.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld41.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld42.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld43.32 {q0, q1, q2, q3}, [r0]!
; CHECK-NEXT:    vadd.f32 q4, q2, q3
; CHECK-NEXT:    vadd.f32 q5, q0, q1
; CHECK-NEXT:    vld40.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vadd.f32 q4, q5, q4
; CHECK-NEXT:    vld41.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld42.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld43.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vstrw.32 q4, [r1]
; CHECK-NEXT:    vadd.f32 q2, q2, q3
; CHECK-NEXT:    vadd.f32 q0, q0, q1
; CHECK-NEXT:    vadd.f32 q0, q0, q2
; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
; CHECK-NEXT:    vpop {d8, d9, d10, d11}
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <32 x float>, ptr %src, align 4
  %s1 = shufflevector <32 x float> %l1, <32 x float> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
  %s2 = shufflevector <32 x float> %l1, <32 x float> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
  %s3 = shufflevector <32 x float> %l1, <32 x float> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
  %s4 = shufflevector <32 x float> %l1, <32 x float> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
  %a1 = fadd <8 x float> %s1, %s2
  %a2 = fadd <8 x float> %s3, %s4
  %a3 = fadd <8 x float> %a1, %a2
  store <8 x float> %a3, ptr %dst
  ret void
}

define void @vld4_v16f32(ptr %src, ptr %dst) {
; CHECK-LABEL: vld4_v16f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    .pad #32
; CHECK-NEXT:    sub sp, #32
; CHECK-NEXT:    vld40.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    mov r2, r0
; CHECK-NEXT:    add.w r3, r0, #192
; CHECK-NEXT:    vld41.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld42.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    adds r0, #128
; CHECK-NEXT:    vld43.32 {q0, q1, q2, q3}, [r2]!
; CHECK-NEXT:    vadd.f32 q2, q2, q3
; CHECK-NEXT:    vld40.32 {q3, q4, q5, q6}, [r3]
; CHECK-NEXT:    vadd.f32 q0, q0, q1
; CHECK-NEXT:    vld41.32 {q3, q4, q5, q6}, [r3]
; CHECK-NEXT:    vadd.f32 q0, q0, q2
; CHECK-NEXT:    vstrw.32 q0, [sp, #16] @ 16-byte Spill
; CHECK-NEXT:    vld42.32 {q3, q4, q5, q6}, [r3]
; CHECK-NEXT:    vld43.32 {q3, q4, q5, q6}, [r3]
; CHECK-NEXT:    vadd.f32 q1, q5, q6
; CHECK-NEXT:    vadd.f32 q2, q3, q4
; CHECK-NEXT:    vadd.f32 q0, q2, q1
; CHECK-NEXT:    vld40.32 {q4, q5, q6, q7}, [r2]
; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
; CHECK-NEXT:    vld40.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld41.32 {q4, q5, q6, q7}, [r2]
; CHECK-NEXT:    vld41.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld42.32 {q4, q5, q6, q7}, [r2]
; CHECK-NEXT:    vld42.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld43.32 {q4, q5, q6, q7}, [r2]
; CHECK-NEXT:    vld43.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vadd.f32 q0, q0, q1
; CHECK-NEXT:    vadd.f32 q2, q2, q3
; CHECK-NEXT:    vadd.f32 q0, q0, q2
; CHECK-NEXT:    vadd.f32 q1, q6, q7
; CHECK-NEXT:    vadd.f32 q2, q4, q5
; CHECK-NEXT:    vstrw.32 q0, [r1, #32]
; CHECK-NEXT:    vadd.f32 q1, q2, q1
; CHECK-NEXT:    vldrw.u32 q2, [sp] @ 16-byte Reload
; CHECK-NEXT:    vldrw.u32 q0, [sp, #16] @ 16-byte Reload
; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
; CHECK-NEXT:    vstrw.32 q2, [r1, #48]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    add sp, #32
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <64 x float>, ptr %src, align 4
  %s1 = shufflevector <64 x float> %l1, <64 x float> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
  %s2 = shufflevector <64 x float> %l1, <64 x float> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
  %s3 = shufflevector
<64 x float> %l1, <64 x float> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62> 886 %s4 = shufflevector <64 x float> %l1, <64 x float> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63> 887 %a1 = fadd <16 x float> %s1, %s2 888 %a2 = fadd <16 x float> %s3, %s4 889 %a3 = fadd <16 x float> %a1, %a2 890 store <16 x float> %a3, ptr %dst 891 ret void 892} 893 894define void @vld4_v4f32_align1(ptr %src, ptr %dst) { 895; CHECK-LABEL: vld4_v4f32_align1: 896; CHECK: @ %bb.0: @ %entry 897; CHECK-NEXT: .vsave {d8, d9, d10, d11} 898; CHECK-NEXT: vpush {d8, d9, d10, d11} 899; CHECK-NEXT: vldrb.u8 q0, [r0, #48] 900; CHECK-NEXT: vldrb.u8 q1, [r0, #32] 901; CHECK-NEXT: vldrb.u8 q3, [r0, #16] 902; CHECK-NEXT: vldrb.u8 q2, [r0] 903; CHECK-NEXT: vmov.f32 s18, s7 904; CHECK-NEXT: vmov.f32 s16, s11 905; CHECK-NEXT: vmov.f32 s20, s10 906; CHECK-NEXT: vmov.f32 s17, s15 907; CHECK-NEXT: vmov.f32 s19, s3 908; CHECK-NEXT: vmov.f32 s21, s14 909; CHECK-NEXT: vmov.f32 s22, s6 910; CHECK-NEXT: vmov.f32 s23, s2 911; CHECK-NEXT: vadd.f32 q4, q5, q4 912; CHECK-NEXT: vmov.f32 s20, s9 913; CHECK-NEXT: vmov.f32 s21, s13 914; CHECK-NEXT: vmov.f32 s22, s5 915; CHECK-NEXT: vmov.f32 s23, s1 916; CHECK-NEXT: vmov.f32 s9, s12 917; CHECK-NEXT: vmov.f32 s10, s4 918; CHECK-NEXT: vmov.f32 s11, s0 919; CHECK-NEXT: vadd.f32 q0, q2, q5 920; CHECK-NEXT: vadd.f32 q0, q0, q4 921; CHECK-NEXT: vstrw.32 q0, [r1] 922; CHECK-NEXT: vpop {d8, d9, d10, d11} 923; CHECK-NEXT: bx lr 924entry: 925 %l1 = load <16 x float>, ptr %src, align 1 926 %s1 = shufflevector <16 x float> %l1, <16 x float> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12> 927 %s2 = shufflevector <16 x float> %l1, <16 x float> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13> 928 %s3 = shufflevector <16 x float> %l1, <16 x float> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14> 
929 %s4 = shufflevector <16 x float> %l1, <16 x float> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15> 930 %a1 = fadd <4 x float> %s1, %s2 931 %a2 = fadd <4 x float> %s3, %s4 932 %a3 = fadd <4 x float> %a1, %a2 933 store <4 x float> %a3, ptr %dst 934 ret void 935} 936 937; f16 938 939define void @vld4_v2f16(ptr %src, ptr %dst) { 940; CHECK-LABEL: vld4_v2f16: 941; CHECK: @ %bb.0: @ %entry 942; CHECK-NEXT: vldrh.u16 q0, [r0] 943; CHECK-NEXT: vmovx.f16 s8, s1 944; CHECK-NEXT: vmovx.f16 s4, s3 945; CHECK-NEXT: vins.f16 s8, s4 946; CHECK-NEXT: vmovx.f16 s12, s0 947; CHECK-NEXT: vmovx.f16 s4, s2 948; CHECK-NEXT: vins.f16 s1, s3 949; CHECK-NEXT: vins.f16 s12, s4 950; CHECK-NEXT: vmov.f32 s4, s1 951; CHECK-NEXT: vins.f16 s0, s2 952; CHECK-NEXT: vadd.f16 q1, q1, q2 953; CHECK-NEXT: vadd.f16 q0, q0, q3 954; CHECK-NEXT: vadd.f16 q0, q0, q1 955; CHECK-NEXT: vmov r0, s0 956; CHECK-NEXT: str r0, [r1] 957; CHECK-NEXT: bx lr 958entry: 959 %l1 = load <8 x half>, ptr %src, align 2 960 %s1 = shufflevector <8 x half> %l1, <8 x half> undef, <2 x i32> <i32 0, i32 4> 961 %s2 = shufflevector <8 x half> %l1, <8 x half> undef, <2 x i32> <i32 1, i32 5> 962 %s3 = shufflevector <8 x half> %l1, <8 x half> undef, <2 x i32> <i32 2, i32 6> 963 %s4 = shufflevector <8 x half> %l1, <8 x half> undef, <2 x i32> <i32 3, i32 7> 964 %a1 = fadd <2 x half> %s1, %s2 965 %a2 = fadd <2 x half> %s3, %s4 966 %a3 = fadd <2 x half> %a1, %a2 967 store <2 x half> %a3, ptr %dst 968 ret void 969} 970 971define void @vld4_v4f16(ptr %src, ptr %dst) { 972; CHECK-LABEL: vld4_v4f16: 973; CHECK: @ %bb.0: @ %entry 974; CHECK-NEXT: .vsave {d8} 975; CHECK-NEXT: vpush {d8} 976; CHECK-NEXT: vldrh.u16 q0, [r0] 977; CHECK-NEXT: vldrh.u16 q2, [r0, #16] 978; CHECK-NEXT: vmovx.f16 s4, s0 979; CHECK-NEXT: vmovx.f16 s6, s2 980; CHECK-NEXT: vins.f16 s0, s2 981; CHECK-NEXT: vmovx.f16 s12, s1 982; CHECK-NEXT: vmovx.f16 s2, s3 983; CHECK-NEXT: vins.f16 s4, s6 984; CHECK-NEXT: vmovx.f16 s5, s8 985; CHECK-NEXT: vmovx.f16 s6, s10 986; 
CHECK-NEXT: vins.f16 s12, s2 987; CHECK-NEXT: vmovx.f16 s13, s9 988; CHECK-NEXT: vmovx.f16 s2, s11 989; CHECK-NEXT: vins.f16 s1, s3 990; CHECK-NEXT: vins.f16 s9, s11 991; CHECK-NEXT: vins.f16 s8, s10 992; CHECK-NEXT: vmov.f32 s16, s1 993; CHECK-NEXT: vins.f16 s5, s6 994; CHECK-NEXT: vins.f16 s13, s2 995; CHECK-NEXT: vmov.f32 s1, s8 996; CHECK-NEXT: vmov.f32 s17, s9 997; CHECK-NEXT: vadd.f16 q0, q0, q1 998; CHECK-NEXT: vadd.f16 q3, q4, q3 999; CHECK-NEXT: vadd.f16 q0, q0, q3 1000; CHECK-NEXT: vmov r0, r2, d0 1001; CHECK-NEXT: strd r0, r2, [r1] 1002; CHECK-NEXT: vpop {d8} 1003; CHECK-NEXT: bx lr 1004entry: 1005 %l1 = load <16 x half>, ptr %src, align 2 1006 %s1 = shufflevector <16 x half> %l1, <16 x half> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12> 1007 %s2 = shufflevector <16 x half> %l1, <16 x half> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13> 1008 %s3 = shufflevector <16 x half> %l1, <16 x half> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14> 1009 %s4 = shufflevector <16 x half> %l1, <16 x half> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15> 1010 %a1 = fadd <4 x half> %s1, %s2 1011 %a2 = fadd <4 x half> %s3, %s4 1012 %a3 = fadd <4 x half> %a1, %a2 1013 store <4 x half> %a3, ptr %dst 1014 ret void 1015} 1016 1017define void @vld4_v8f16(ptr %src, ptr %dst) { 1018; CHECK-LABEL: vld4_v8f16: 1019; CHECK: @ %bb.0: @ %entry 1020; CHECK-NEXT: vld40.16 {q0, q1, q2, q3}, [r0] 1021; CHECK-NEXT: vld41.16 {q0, q1, q2, q3}, [r0] 1022; CHECK-NEXT: vld42.16 {q0, q1, q2, q3}, [r0] 1023; CHECK-NEXT: vld43.16 {q0, q1, q2, q3}, [r0] 1024; CHECK-NEXT: vadd.f16 q2, q2, q3 1025; CHECK-NEXT: vadd.f16 q0, q0, q1 1026; CHECK-NEXT: vadd.f16 q0, q0, q2 1027; CHECK-NEXT: vstrw.32 q0, [r1] 1028; CHECK-NEXT: bx lr 1029entry: 1030 %l1 = load <32 x half>, ptr %src, align 2 1031 %s1 = shufflevector <32 x half> %l1, <32 x half> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28> 1032 %s2 = shufflevector <32 x half> %l1, <32 x half> undef, <8 x i32> <i32 1, i32 5, 
i32 9, i32 13, i32 17, i32 21, i32 25, i32 29> 1033 %s3 = shufflevector <32 x half> %l1, <32 x half> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30> 1034 %s4 = shufflevector <32 x half> %l1, <32 x half> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31> 1035 %a1 = fadd <8 x half> %s1, %s2 1036 %a2 = fadd <8 x half> %s3, %s4 1037 %a3 = fadd <8 x half> %a1, %a2 1038 store <8 x half> %a3, ptr %dst 1039 ret void 1040} 1041 1042define void @vld4_v16f16(ptr %src, ptr %dst) { 1043; CHECK-LABEL: vld4_v16f16: 1044; CHECK: @ %bb.0: @ %entry 1045; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} 1046; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} 1047; CHECK-NEXT: vld40.16 {q0, q1, q2, q3}, [r0] 1048; CHECK-NEXT: vld41.16 {q0, q1, q2, q3}, [r0] 1049; CHECK-NEXT: vld42.16 {q0, q1, q2, q3}, [r0] 1050; CHECK-NEXT: vld43.16 {q0, q1, q2, q3}, [r0]! 1051; CHECK-NEXT: vld40.16 {q4, q5, q6, q7}, [r0] 1052; CHECK-NEXT: vadd.f16 q2, q2, q3 1053; CHECK-NEXT: vadd.f16 q0, q0, q1 1054; CHECK-NEXT: vld41.16 {q4, q5, q6, q7}, [r0] 1055; CHECK-NEXT: vadd.f16 q0, q0, q2 1056; CHECK-NEXT: vld42.16 {q4, q5, q6, q7}, [r0] 1057; CHECK-NEXT: vld43.16 {q4, q5, q6, q7}, [r0] 1058; CHECK-NEXT: vstrw.32 q0, [r1] 1059; CHECK-NEXT: vadd.f16 q6, q6, q7 1060; CHECK-NEXT: vadd.f16 q4, q4, q5 1061; CHECK-NEXT: vadd.f16 q4, q4, q6 1062; CHECK-NEXT: vstrw.32 q4, [r1, #16] 1063; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} 1064; CHECK-NEXT: bx lr 1065entry: 1066 %l1 = load <64 x half>, ptr %src, align 2 1067 %s1 = shufflevector <64 x half> %l1, <64 x half> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60> 1068 %s2 = shufflevector <64 x half> %l1, <64 x half> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61> 1069 %s3 = shufflevector <64 
x half> %l1, <64 x half> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62> 1070 %s4 = shufflevector <64 x half> %l1, <64 x half> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63> 1071 %a1 = fadd <16 x half> %s1, %s2 1072 %a2 = fadd <16 x half> %s3, %s4 1073 %a3 = fadd <16 x half> %a1, %a2 1074 store <16 x half> %a3, ptr %dst 1075 ret void 1076} 1077 1078define void @vld4_v8f16_align1(ptr %src, ptr %dst) { 1079; CHECK-LABEL: vld4_v8f16_align1: 1080; CHECK: @ %bb.0: @ %entry 1081; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} 1082; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} 1083; CHECK-NEXT: vldrb.u8 q0, [r0, #32] 1084; CHECK-NEXT: vldrb.u8 q2, [r0, #48] 1085; CHECK-NEXT: vmovx.f16 s18, s1 1086; CHECK-NEXT: vmovx.f16 s4, s3 1087; CHECK-NEXT: vins.f16 s18, s4 1088; CHECK-NEXT: vmovx.f16 s19, s9 1089; CHECK-NEXT: vmovx.f16 s4, s11 1090; CHECK-NEXT: vins.f16 s1, s3 1091; CHECK-NEXT: vins.f16 s19, s4 1092; CHECK-NEXT: vldrb.u8 q1, [r0] 1093; CHECK-NEXT: vmovx.f16 s22, s0 1094; CHECK-NEXT: vmovx.f16 s3, s2 1095; CHECK-NEXT: vmovx.f16 s16, s5 1096; CHECK-NEXT: vmovx.f16 s12, s7 1097; CHECK-NEXT: vins.f16 s16, s12 1098; CHECK-NEXT: vldrb.u8 q3, [r0, #16] 1099; CHECK-NEXT: vins.f16 s22, s3 1100; CHECK-NEXT: vmovx.f16 s23, s8 1101; CHECK-NEXT: vmovx.f16 s17, s13 1102; CHECK-NEXT: vmovx.f16 s20, s15 1103; CHECK-NEXT: vmovx.f16 s3, s10 1104; CHECK-NEXT: vins.f16 s17, s20 1105; CHECK-NEXT: vins.f16 s23, s3 1106; CHECK-NEXT: vmovx.f16 s20, s4 1107; CHECK-NEXT: vmovx.f16 s3, s6 1108; CHECK-NEXT: vins.f16 s9, s11 1109; CHECK-NEXT: vins.f16 s5, s7 1110; CHECK-NEXT: vins.f16 s13, s15 1111; CHECK-NEXT: vins.f16 s20, s3 1112; CHECK-NEXT: vmovx.f16 s21, s12 1113; CHECK-NEXT: vmovx.f16 s3, s14 1114; CHECK-NEXT: vins.f16 s8, s10 1115; CHECK-NEXT: vins.f16 s0, s2 1116; CHECK-NEXT: vins.f16 s12, 
s14 1117; CHECK-NEXT: vins.f16 s4, s6 1118; CHECK-NEXT: vmov.f32 s24, s5 1119; CHECK-NEXT: vins.f16 s21, s3 1120; CHECK-NEXT: vmov.f32 s26, s1 1121; CHECK-NEXT: vmov.f32 s27, s9 1122; CHECK-NEXT: vmov.f32 s25, s13 1123; CHECK-NEXT: vmov.f32 s6, s0 1124; CHECK-NEXT: vadd.f16 q4, q6, q4 1125; CHECK-NEXT: vmov.f32 s7, s8 1126; CHECK-NEXT: vmov.f32 s5, s12 1127; CHECK-NEXT: vadd.f16 q0, q1, q5 1128; CHECK-NEXT: vadd.f16 q0, q0, q4 1129; CHECK-NEXT: vstrw.32 q0, [r1] 1130; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} 1131; CHECK-NEXT: bx lr 1132entry: 1133 %l1 = load <32 x half>, ptr %src, align 1 1134 %s1 = shufflevector <32 x half> %l1, <32 x half> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28> 1135 %s2 = shufflevector <32 x half> %l1, <32 x half> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29> 1136 %s3 = shufflevector <32 x half> %l1, <32 x half> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30> 1137 %s4 = shufflevector <32 x half> %l1, <32 x half> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31> 1138 %a1 = fadd <8 x half> %s1, %s2 1139 %a2 = fadd <8 x half> %s3, %s4 1140 %a3 = fadd <8 x half> %a1, %a2 1141 store <8 x half> %a3, ptr %dst 1142 ret void 1143} 1144 1145; f64 1146 1147define void @vld4_v2f64(ptr %src, ptr %dst) { 1148; CHECK-LABEL: vld4_v2f64: 1149; CHECK: @ %bb.0: @ %entry 1150; CHECK-NEXT: vldrw.u32 q0, [r0, #48] 1151; CHECK-NEXT: vldrw.u32 q1, [r0, #32] 1152; CHECK-NEXT: vldrw.u32 q2, [r0] 1153; CHECK-NEXT: vadd.f64 d0, d0, d1 1154; CHECK-NEXT: vadd.f64 d1, d2, d3 1155; CHECK-NEXT: vldrw.u32 q1, [r0, #16] 1156; CHECK-NEXT: vadd.f64 d2, d2, d3 1157; CHECK-NEXT: vadd.f64 d3, d4, d5 1158; CHECK-NEXT: vadd.f64 d1, d1, d0 1159; CHECK-NEXT: vadd.f64 d0, d3, d2 1160; CHECK-NEXT: vstrw.32 q0, [r1] 1161; CHECK-NEXT: bx lr 1162entry: 1163 %l1 = load <8 x double>, ptr %src, align 8 1164 %s1 = shufflevector <8 x double> %l1, <8 x double> 
undef, <2 x i32> <i32 0, i32 4> 1165 %s2 = shufflevector <8 x double> %l1, <8 x double> undef, <2 x i32> <i32 1, i32 5> 1166 %s3 = shufflevector <8 x double> %l1, <8 x double> undef, <2 x i32> <i32 2, i32 6> 1167 %s4 = shufflevector <8 x double> %l1, <8 x double> undef, <2 x i32> <i32 3, i32 7> 1168 %a1 = fadd <2 x double> %s1, %s2 1169 %a2 = fadd <2 x double> %s3, %s4 1170 %a3 = fadd <2 x double> %a1, %a2 1171 store <2 x double> %a3, ptr %dst 1172 ret void 1173} 1174 1175define void @vld4_v4f64(ptr %src, ptr %dst) { 1176; CHECK-LABEL: vld4_v4f64: 1177; CHECK: @ %bb.0: @ %entry 1178; CHECK-NEXT: .vsave {d8, d9} 1179; CHECK-NEXT: vpush {d8, d9} 1180; CHECK-NEXT: vldrw.u32 q0, [r0, #112] 1181; CHECK-NEXT: vldrw.u32 q1, [r0, #96] 1182; CHECK-NEXT: vldrw.u32 q2, [r0, #64] 1183; CHECK-NEXT: vldrw.u32 q3, [r0, #32] 1184; CHECK-NEXT: vadd.f64 d0, d0, d1 1185; CHECK-NEXT: vldrw.u32 q4, [r0] 1186; CHECK-NEXT: vadd.f64 d1, d2, d3 1187; CHECK-NEXT: vldrw.u32 q1, [r0, #80] 1188; CHECK-NEXT: vadd.f64 d2, d2, d3 1189; CHECK-NEXT: vadd.f64 d3, d4, d5 1190; CHECK-NEXT: vldrw.u32 q2, [r0, #48] 1191; CHECK-NEXT: vadd.f64 d4, d4, d5 1192; CHECK-NEXT: vadd.f64 d5, d6, d7 1193; CHECK-NEXT: vldrw.u32 q3, [r0, #16] 1194; CHECK-NEXT: vadd.f64 d6, d6, d7 1195; CHECK-NEXT: vadd.f64 d7, d8, d9 1196; CHECK-NEXT: vadd.f64 d1, d1, d0 1197; CHECK-NEXT: vadd.f64 d0, d3, d2 1198; CHECK-NEXT: vadd.f64 d3, d5, d4 1199; CHECK-NEXT: vstrw.32 q0, [r1, #16] 1200; CHECK-NEXT: vadd.f64 d2, d7, d6 1201; CHECK-NEXT: vstrw.32 q1, [r1] 1202; CHECK-NEXT: vpop {d8, d9} 1203; CHECK-NEXT: bx lr 1204entry: 1205 %l1 = load <16 x double>, ptr %src, align 8 1206 %s1 = shufflevector <16 x double> %l1, <16 x double> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12> 1207 %s2 = shufflevector <16 x double> %l1, <16 x double> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13> 1208 %s3 = shufflevector <16 x double> %l1, <16 x double> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14> 1209 %s4 = shufflevector <16 x double> %l1, <16 x 
double> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15> 1210 %a1 = fadd <4 x double> %s1, %s2 1211 %a2 = fadd <4 x double> %s3, %s4 1212 %a3 = fadd <4 x double> %a1, %a2 1213 store <4 x double> %a3, ptr %dst 1214 ret void 1215} 1216