; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp,+fp64 -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=CHECK,CHECK-LV
; RUN: llc -early-live-intervals -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp,+fp64 -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=CHECK,CHECK-LIS
;
; Every function in this file has the same shape: load 3*N elements from %src,
; split them into three N-element vectors with stride-3 shufflevector masks
; (elements 0,3,6,... / 1,4,7,... / 2,5,8,...), add the three vectors together
; and store the N-element sum to %dst.
;
; The first invocation above is the default pipeline and the second adds
; -early-live-intervals. Output common to both is checked with the plain CHECK
; prefix; where the two differ, the function carries separate CHECK-LV (first
; invocation) and CHECK-LIS (second invocation) assertion blocks.

; i32

; Stride-3 split of <6 x i32>: masks <0,3>, <1,4>, <2,5>; the three <2 x i32>
; pieces are summed and stored.
define void @vld3_v2i32(ptr %src, ptr %dst) {
; CHECK-LABEL: vld3_v2i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: ldrd r0, r2, [r0, #16]
; CHECK-NEXT: vmov.f32 s6, s3
; CHECK-NEXT: vmov r12, lr, d0
; CHECK-NEXT: vmov r3, s6
; CHECK-NEXT: add r0, r3
; CHECK-NEXT: add.w r3, r12, lr
; CHECK-NEXT: add r0, r2
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: add r2, r3
; CHECK-NEXT: strd r2, r0, [r1]
; CHECK-NEXT: pop {r7, pc}
entry:
  %l1 = load <6 x i32>, ptr %src, align 4
  %s1 = shufflevector <6 x i32> %l1, <6 x i32> undef, <2 x i32> <i32 0, i32 3>
  %s2 = shufflevector <6 x i32> %l1, <6 x i32> undef, <2 x i32> <i32 1, i32 4>
  %s3 = shufflevector <6 x i32> %l1, <6 x i32> undef, <2 x i32> <i32 2, i32 5>
  %a1 = add <2 x i32> %s1, %s2
  %a = add <2 x i32> %a1, %s3
  store <2 x i32> %a, ptr %dst
  ret void
}

; Stride-3 split of <12 x i32> into three <4 x i32> vectors, which are summed
; and stored as one <4 x i32>.
define void @vld3_v4i32(ptr %src, ptr %dst) {
; CHECK-LABEL: vld3_v4i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .vsave {d8, d9}
; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: vldrw.u32 q4, [r0, #32]
; CHECK-NEXT: vmov.f32 s10, s2
; CHECK-NEXT: vmov.f32 s13, s0
; CHECK-NEXT: vmov.f32 s14, s3
; CHECK-NEXT: vmov.f32 s8, s4
; CHECK-NEXT: vmov.f32 s9, s7
; CHECK-NEXT: vmov.f32 s12, s5
; CHECK-NEXT: vmov.f32 s15, s18
; CHECK-NEXT: vmov.f32 s11, s17
; CHECK-NEXT: vadd.i32 q2, q2, q3
; CHECK-NEXT: vmov.f32 s0, s6
; CHECK-NEXT: vmov.f32 s2,
s16
; CHECK-NEXT: vmov.f32 s3, s19
; CHECK-NEXT: vadd.i32 q0, q2, q0
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: bx lr
entry:
  %l1 = load <12 x i32>, ptr %src, align 4
  %s1 = shufflevector <12 x i32> %l1, <12 x i32> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
  %s2 = shufflevector <12 x i32> %l1, <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
  %s3 = shufflevector <12 x i32> %l1, <12 x i32> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
  %a1 = add <4 x i32> %s1, %s2
  %a = add <4 x i32> %a1, %s3
  store <4 x i32> %a, ptr %dst
  ret void
}

; Stride-3 split of <24 x i32> into three <8 x i32> vectors, which are summed
; and stored as one <8 x i32>.
define void @vld3_v8i32(ptr %src, ptr %dst) {
; CHECK-LABEL: vld3_v8i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .vsave {d8, d9, d10, d11}
; CHECK-NEXT: vpush {d8, d9, d10, d11}
; CHECK-NEXT: vldrw.u32 q0, [r0, #64]
; CHECK-NEXT: vldrw.u32 q1, [r0, #48]
; CHECK-NEXT: vldrw.u32 q4, [r0, #80]
; CHECK-NEXT: vmov.f32 s10, s2
; CHECK-NEXT: vmov.f32 s13, s0
; CHECK-NEXT: vmov.f32 s14, s3
; CHECK-NEXT: vmov.f32 s8, s4
; CHECK-NEXT: vmov.f32 s9, s7
; CHECK-NEXT: vmov.f32 s12, s5
; CHECK-NEXT: vmov.f32 s15, s18
; CHECK-NEXT: vmov.f32 s11, s17
; CHECK-NEXT: vadd.i32 q2, q2, q3
; CHECK-NEXT: vmov.f32 s0, s6
; CHECK-NEXT: vmov.f32 s2, s16
; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
; CHECK-NEXT: vmov.f32 s3, s19
; CHECK-NEXT: vldrw.u32 q3, [r0, #32]
; CHECK-NEXT: vadd.i32 q0, q2, q0
; CHECK-NEXT: vldrw.u32 q2, [r0]
; CHECK-NEXT: vmov.f32 s17, s4
; CHECK-NEXT: vstrw.32 q0, [r1, #16]
; CHECK-NEXT: vmov.f32 s18, s7
; CHECK-NEXT: vmov.f32 s22, s6
; CHECK-NEXT: vmov.f32 s16, s9
; CHECK-NEXT: vmov.f32 s19, s14
; CHECK-NEXT: vmov.f32 s20, s8
; CHECK-NEXT: vmov.f32 s21, s11
; CHECK-NEXT: vmov.f32 s23, s13
; CHECK-NEXT: vadd.i32 q4, q5, q4
; CHECK-NEXT: vmov.f32 s4, s10
; CHECK-NEXT: vmov.f32 s6, s12
; CHECK-NEXT: vmov.f32 s7, s15
; CHECK-NEXT: vadd.i32 q1, q4, q1
; CHECK-NEXT: vstrw.32 q1, [r1]
;
CHECK-NEXT: vpop {d8, d9, d10, d11}
; CHECK-NEXT: bx lr
entry:
  %l1 = load <24 x i32>, ptr %src, align 4
  %s1 = shufflevector <24 x i32> %l1, <24 x i32> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
  %s2 = shufflevector <24 x i32> %l1, <24 x i32> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
  %s3 = shufflevector <24 x i32> %l1, <24 x i32> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
  %a1 = add <8 x i32> %s1, %s2
  %a = add <8 x i32> %a1, %s3
  store <8 x i32> %a, ptr %dst
  ret void
}

; Stride-3 split of <48 x i32> into three <16 x i32> vectors, which are summed
; and stored as one <16 x i32>.
define void @vld3_v16i32(ptr %src, ptr %dst) {
; CHECK-LABEL: vld3_v16i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vldrw.u32 q0, [r0, #64]
; CHECK-NEXT: vldrw.u32 q1, [r0, #48]
; CHECK-NEXT: vldrw.u32 q4, [r0, #80]
; CHECK-NEXT: vldrw.u32 q6, [r0, #176]
; CHECK-NEXT: vmov.f32 s10, s2
; CHECK-NEXT: vmov.f32 s13, s0
; CHECK-NEXT: vmov.f32 s14, s3
; CHECK-NEXT: vmov.f32 s8, s4
; CHECK-NEXT: vmov.f32 s9, s7
; CHECK-NEXT: vmov.f32 s12, s5
; CHECK-NEXT: vmov.f32 s15, s18
; CHECK-NEXT: vmov.f32 s11, s17
; CHECK-NEXT: vadd.i32 q2, q2, q3
; CHECK-NEXT: vmov.f32 s0, s6
; CHECK-NEXT: vmov.f32 s2, s16
; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
; CHECK-NEXT: vmov.f32 s3, s19
; CHECK-NEXT: vldrw.u32 q3, [r0, #32]
; CHECK-NEXT: vadd.i32 q0, q2, q0
; CHECK-NEXT: vldrw.u32 q2, [r0]
; CHECK-NEXT: vmov.f32 s17, s4
; CHECK-NEXT: vmov.f32 s18, s7
; CHECK-NEXT: vmov.f32 s22, s6
; CHECK-NEXT: vmov.f32 s16, s9
; CHECK-NEXT: vmov.f32 s19, s14
; CHECK-NEXT: vmov.f32 s20, s8
; CHECK-NEXT: vmov.f32 s21, s11
; CHECK-NEXT: vmov.f32 s23, s13
; CHECK-NEXT: vmov.f32 s4, s10
; CHECK-NEXT: vldrw.u32 q2, [r0, #160]
; CHECK-NEXT: vmov.f32 s6, s12
; CHECK-NEXT: vadd.i32 q4, q5, q4
; CHECK-NEXT: vmov.f32 s7, s15
; CHECK-NEXT: vldrw.u32 q3, [r0, #144]
; CHECK-NEXT: vadd.i32 q1, q4, q1
; CHECK-NEXT: vmov.f32 s18, s10
; CHECK-NEXT: vmov.f32 s21, s8
; CHECK-NEXT: vmov.f32 s22, s11
; CHECK-NEXT: vmov.f32 s16, s12
; CHECK-NEXT: vmov.f32 s17, s15
; CHECK-NEXT: vmov.f32 s20, s13
; CHECK-NEXT: vmov.f32 s23, s26
; CHECK-NEXT: vmov.f32 s19, s25
; CHECK-NEXT: vadd.i32 q4, q4, q5
; CHECK-NEXT: vmov.f32 s8, s14
; CHECK-NEXT: vmov.f32 s10, s24
; CHECK-NEXT: vldrw.u32 q3, [r0, #112]
; CHECK-NEXT: vmov.f32 s11, s27
; CHECK-NEXT: vldrw.u32 q5, [r0, #128]
; CHECK-NEXT: vadd.i32 q2, q4, q2
; CHECK-NEXT: vldrw.u32 q4, [r0, #96]
; CHECK-NEXT: vmov.f32 s25, s12
; CHECK-NEXT: vstrw.32 q2, [r1, #48]
; CHECK-NEXT: vmov.f32 s26, s15
; CHECK-NEXT: vstrw.32 q0, [r1, #16]
; CHECK-NEXT: vmov.f32 s30, s14
; CHECK-NEXT: vstrw.32 q1, [r1]
; CHECK-NEXT: vmov.f32 s24, s17
; CHECK-NEXT: vmov.f32 s27, s22
; CHECK-NEXT: vmov.f32 s28, s16
; CHECK-NEXT: vmov.f32 s29, s19
; CHECK-NEXT: vmov.f32 s31, s21
; CHECK-NEXT: vadd.i32 q6, q7, q6
; CHECK-NEXT: vmov.f32 s12, s18
; CHECK-NEXT: vmov.f32 s14, s20
; CHECK-NEXT: vmov.f32 s15, s23
; CHECK-NEXT: vadd.i32 q3, q6, q3
; CHECK-NEXT: vstrw.32 q3, [r1, #32]
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: bx lr
entry:
  %l1 = load <48 x i32>, ptr %src, align 4
  %s1 = shufflevector <48 x i32> %l1, <48 x i32> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
  %s2 = shufflevector <48 x i32> %l1, <48 x i32> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
  %s3 = shufflevector <48 x i32> %l1, <48 x i32> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32
32, i32 35, i32 38, i32 41, i32 44, i32 47>
  %a1 = add <16 x i32> %s1, %s2
  %a = add <16 x i32> %a1, %s3
  store <16 x i32> %a, ptr %dst
  ret void
}

; i16

; Stride-3 split of <6 x i16>: masks <0,3>, <1,4>, <2,5>; the three <2 x i16>
; pieces are summed and stored.
define void @vld3_v2i16(ptr %src, ptr %dst) {
; CHECK-LABEL: vld3_v2i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .pad #8
; CHECK-NEXT: sub sp, #8
; CHECK-NEXT: vldrh.u32 q0, [r0]
; CHECK-NEXT: ldr r2, [r0, #8]
; CHECK-NEXT: mov r3, sp
; CHECK-NEXT: str r2, [sp]
; CHECK-NEXT: vmov.f32 s6, s3
; CHECK-NEXT: vmov.f32 s8, s1
; CHECK-NEXT: vmov r0, s6
; CHECK-NEXT: vldrh.u32 q1, [r3]
; CHECK-NEXT: vmov.f32 s6, s4
; CHECK-NEXT: vmov.f32 s4, s2
; CHECK-NEXT: vmov.f32 s2, s5
; CHECK-NEXT: vmov r2, s6
; CHECK-NEXT: add r0, r2
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: add r0, r2
; CHECK-NEXT: strh r0, [r1, #2]
; CHECK-NEXT: vmov r0, s8
; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: add r0, r2
; CHECK-NEXT: vmov r2, s4
; CHECK-NEXT: add r0, r2
; CHECK-NEXT: strh r0, [r1]
; CHECK-NEXT: add sp, #8
; CHECK-NEXT: bx lr
entry:
  %l1 = load <6 x i16>, ptr %src, align 4
  %s1 = shufflevector <6 x i16> %l1, <6 x i16> undef, <2 x i32> <i32 0, i32 3>
  %s2 = shufflevector <6 x i16> %l1, <6 x i16> undef, <2 x i32> <i32 1, i32 4>
  %s3 = shufflevector <6 x i16> %l1, <6 x i16> undef, <2 x i32> <i32 2, i32 5>
  %a1 = add <2 x i16> %s1, %s2
  %a = add <2 x i16> %a1, %s3
  store <2 x i16> %a, ptr %dst
  ret void
}

; Stride-3 split of <12 x i16> into three <4 x i16> vectors, which are summed
; and stored as one <4 x i16>.
define void @vld3_v4i16(ptr %src, ptr %dst) {
; CHECK-LABEL: vld3_v4i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, lr}
; CHECK-NEXT: push {r4, r5, r6, lr}
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: vldrh.u32 q1, [r0, #16]
; CHECK-NEXT: vmov.u16 r5, q0[6]
; CHECK-NEXT: vmov.u16 r6, q0[0]
; CHECK-NEXT: vmov r0, r3, d2
; CHECK-NEXT: vmov.u16 lr, q0[2]
; CHECK-NEXT: vmov r2, r4, d3
; CHECK-NEXT: vmov q1[2], q1[0], r6, r5
; CHECK-NEXT: vmov.u16
r5, q0[7]
; CHECK-NEXT: vmov.u16 r6, q0[1]
; CHECK-NEXT: vmov q2[2], q2[0], r6, r5
; CHECK-NEXT: vmov.u16 r5, q0[3]
; CHECK-NEXT: vmov.u16 r6, q0[4]
; CHECK-NEXT: vmov q1[3], q1[1], r5, r3
; CHECK-NEXT: vmov q2[3], q2[1], r6, r2
; CHECK-NEXT: vmov.u16 r12, q0[5]
; CHECK-NEXT: vadd.i32 q0, q1, q2
; CHECK-NEXT: vmov q1[2], q1[0], lr, r0
; CHECK-NEXT: vmov q1[3], q1[1], r12, r4
; CHECK-NEXT: vadd.i32 q0, q0, q1
; CHECK-NEXT: vstrh.32 q0, [r1]
; CHECK-NEXT: pop {r4, r5, r6, pc}
entry:
  %l1 = load <12 x i16>, ptr %src, align 4
  %s1 = shufflevector <12 x i16> %l1, <12 x i16> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
  %s2 = shufflevector <12 x i16> %l1, <12 x i16> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
  %s3 = shufflevector <12 x i16> %l1, <12 x i16> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
  %a1 = add <4 x i16> %s1, %s2
  %a = add <4 x i16> %a1, %s3
  store <4 x i16> %a, ptr %dst
  ret void
}

; Stride-3 split of <24 x i16> into three <8 x i16> vectors, which are summed
; and stored as one <8 x i16>.
define void @vld3_v8i16(ptr %src, ptr %dst) {
; CHECK-LABEL: vld3_v8i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .vsave {d8, d9}
; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
; CHECK-NEXT: vldrw.u32 q3, [r0, #32]
; CHECK-NEXT: vmovx.f16 s6, s2
; CHECK-NEXT: vmov.f32 s4, s1
; CHECK-NEXT: vins.f16 s4, s6
; CHECK-NEXT: vmovx.f16 s6, s9
; CHECK-NEXT: vmov.f32 s5, s8
; CHECK-NEXT: vmovx.f16 s7, s12
; CHECK-NEXT: vins.f16 s5, s6
; CHECK-NEXT: vmov.f32 s6, s11
; CHECK-NEXT: vins.f16 s6, s7
; CHECK-NEXT: vmovx.f16 s16, s15
; CHECK-NEXT: vmov.f32 s7, s14
; CHECK-NEXT: vmovx.f16 s17, s3
; CHECK-NEXT: vins.f16 s7, s16
; CHECK-NEXT: vmovx.f16 s16, s0
; CHECK-NEXT: vins.f16 s16, s2
; CHECK-NEXT: vmovx.f16 s2, s1
; CHECK-NEXT: vins.f16 s0, s2
; CHECK-NEXT: vmovx.f16 s2, s8
; CHECK-NEXT: vins.f16 s3, s2
; CHECK-NEXT: vmovx.f16 s2, s11
; CHECK-NEXT: vmovx.f16 s8, s14
;
CHECK-NEXT: vmovx.f16 s18, s10
; CHECK-NEXT: vmovx.f16 s19, s13
; CHECK-NEXT: vins.f16 s10, s2
; CHECK-NEXT: vins.f16 s13, s8
; CHECK-NEXT: vmov.f32 s1, s3
; CHECK-NEXT: vins.f16 s18, s12
; CHECK-NEXT: vins.f16 s19, s15
; CHECK-NEXT: vins.f16 s17, s9
; CHECK-NEXT: vmov.f32 s2, s10
; CHECK-NEXT: vmov.f32 s3, s13
; CHECK-NEXT: vadd.i16 q0, q0, q4
; CHECK-NEXT: vadd.i16 q0, q0, q1
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: bx lr
entry:
  %l1 = load <24 x i16>, ptr %src, align 4
  %s1 = shufflevector <24 x i16> %l1, <24 x i16> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
  %s2 = shufflevector <24 x i16> %l1, <24 x i16> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
  %s3 = shufflevector <24 x i16> %l1, <24 x i16> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
  %a1 = add <8 x i16> %s1, %s2
  %a = add <8 x i16> %a1, %s3
  store <8 x i16> %a, ptr %dst
  ret void
}

; Stride-3 split of <48 x i16> into three <16 x i16> vectors, which are summed
; and stored as one <16 x i16>.
; The expected output differs between the two llc invocations for this
; function, so it has separate CHECK-LV and CHECK-LIS assertion blocks.
define void @vld3_v16i16(ptr %src, ptr %dst) {
; CHECK-LV-LABEL: vld3_v16i16:
; CHECK-LV: @ %bb.0: @ %entry
; CHECK-LV-NEXT: .vsave {d8, d9, d10, d11}
; CHECK-LV-NEXT: vpush {d8, d9, d10, d11}
; CHECK-LV-NEXT: vldrw.u32 q0, [r0, #48]
; CHECK-LV-NEXT: vldrw.u32 q2, [r0, #64]
; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #80]
; CHECK-LV-NEXT: vmovx.f16 s6, s2
; CHECK-LV-NEXT: vmov.f32 s4, s1
; CHECK-LV-NEXT: vins.f16 s4, s6
; CHECK-LV-NEXT: vmovx.f16 s6, s9
; CHECK-LV-NEXT: vmov.f32 s5, s8
; CHECK-LV-NEXT: vmovx.f16 s7, s12
; CHECK-LV-NEXT: vins.f16 s5, s6
; CHECK-LV-NEXT: vmov.f32 s6, s11
; CHECK-LV-NEXT: vins.f16 s6, s7
; CHECK-LV-NEXT: vmovx.f16 s16, s15
; CHECK-LV-NEXT: vmov.f32 s7, s14
; CHECK-LV-NEXT: vmovx.f16 s17, s3
; CHECK-LV-NEXT: vins.f16 s7, s16
; CHECK-LV-NEXT: vmovx.f16 s16, s0
; CHECK-LV-NEXT: vins.f16 s16, s2
; CHECK-LV-NEXT: vmovx.f16 s2, s1
; CHECK-LV-NEXT: vins.f16 s0, s2
; CHECK-LV-NEXT: vmovx.f16 s2, s8
; CHECK-LV-NEXT: vins.f16 s3, s2
; CHECK-LV-NEXT: vmovx.f16 s2, s11
; CHECK-LV-NEXT: vmovx.f16 s8, s14
; CHECK-LV-NEXT: vmovx.f16 s18, s10
; CHECK-LV-NEXT: vmovx.f16 s19, s13
; CHECK-LV-NEXT: vins.f16 s10, s2
; CHECK-LV-NEXT: vins.f16 s13, s8
; CHECK-LV-NEXT: vmov.f32 s1, s3
; CHECK-LV-NEXT: vins.f16 s18, s12
; CHECK-LV-NEXT: vins.f16 s19, s15
; CHECK-LV-NEXT: vmov.f32 s3, s13
; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #16]
; CHECK-LV-NEXT: vins.f16 s17, s9
; CHECK-LV-NEXT: vmov.f32 s2, s10
; CHECK-LV-NEXT: vadd.i16 q0, q0, q4
; CHECK-LV-NEXT: vldrw.u32 q2, [r0, #32]
; CHECK-LV-NEXT: vadd.i16 q0, q0, q1
; CHECK-LV-NEXT: vmovx.f16 s6, s14
; CHECK-LV-NEXT: vldrw.u32 q4, [r0]
; CHECK-LV-NEXT: vins.f16 s6, s8
; CHECK-LV-NEXT: vmov.f32 s22, s15
; CHECK-LV-NEXT: vmovx.f16 s8, s8
; CHECK-LV-NEXT: vins.f16 s22, s8
; CHECK-LV-NEXT: vmovx.f16 s8, s11
; CHECK-LV-NEXT: vmov.f32 s23, s10
; CHECK-LV-NEXT: vmovx.f16 s4, s16
; CHECK-LV-NEXT: vins.f16 s23, s8
; CHECK-LV-NEXT: vmovx.f16 s8, s17
; CHECK-LV-NEXT: vins.f16 s16, s8
; CHECK-LV-NEXT: vmovx.f16 s8, s12
; CHECK-LV-NEXT: vmovx.f16 s5, s19
; CHECK-LV-NEXT: vins.f16 s19, s8
; CHECK-LV-NEXT: vmovx.f16 s8, s15
; CHECK-LV-NEXT: vmovx.f16 s7, s9
; CHECK-LV-NEXT: vins.f16 s14, s8
; CHECK-LV-NEXT: vmovx.f16 s8, s10
; CHECK-LV-NEXT: vins.f16 s4, s18
; CHECK-LV-NEXT: vmov.f32 s20, s17
; CHECK-LV-NEXT: vmovx.f16 s18, s18
; CHECK-LV-NEXT: vins.f16 s9, s8
; CHECK-LV-NEXT: vins.f16 s5, s13
; CHECK-LV-NEXT: vins.f16 s20, s18
; CHECK-LV-NEXT: vmov.f32 s17, s19
; CHECK-LV-NEXT: vins.f16 s7, s11
; CHECK-LV-NEXT: vmovx.f16 s13, s13
; CHECK-LV-NEXT: vmov.f32 s21, s12
; CHECK-LV-NEXT: vmov.f32 s18, s14
; CHECK-LV-NEXT: vins.f16 s21, s13
; CHECK-LV-NEXT: vmov.f32 s19, s9
; CHECK-LV-NEXT: vstrw.32 q0, [r1, #16]
; CHECK-LV-NEXT: vadd.i16 q1, q4, q1
; CHECK-LV-NEXT: vadd.i16 q1, q1, q5
; CHECK-LV-NEXT: vstrw.32 q1, [r1]
; CHECK-LV-NEXT: vpop {d8, d9, d10, d11}
; CHECK-LV-NEXT: bx lr
;
; CHECK-LIS-LABEL: vld3_v16i16:
; CHECK-LIS: @ %bb.0: @ %entry
; CHECK-LIS-NEXT: .vsave {d8, d9, d10, d11}
; CHECK-LIS-NEXT: vpush {d8, d9, d10, d11}
; CHECK-LIS-NEXT: vldrw.u32 q0, [r0, #48]
; CHECK-LIS-NEXT: vldrw.u32 q2, [r0, #64]
; CHECK-LIS-NEXT: vldrw.u32 q3, [r0, #80]
; CHECK-LIS-NEXT: vmovx.f16 s6, s2
; CHECK-LIS-NEXT: vmov.f32 s4, s1
; CHECK-LIS-NEXT: vins.f16 s4, s6
; CHECK-LIS-NEXT: vmovx.f16 s6, s9
; CHECK-LIS-NEXT: vmov.f32 s5, s8
; CHECK-LIS-NEXT: vmovx.f16 s7, s12
; CHECK-LIS-NEXT: vins.f16 s5, s6
; CHECK-LIS-NEXT: vmov.f32 s6, s11
; CHECK-LIS-NEXT: vins.f16 s6, s7
; CHECK-LIS-NEXT: vmovx.f16 s16, s15
; CHECK-LIS-NEXT: vmov.f32 s7, s14
; CHECK-LIS-NEXT: vmovx.f16 s17, s3
; CHECK-LIS-NEXT: vins.f16 s7, s16
; CHECK-LIS-NEXT: vmovx.f16 s16, s0
; CHECK-LIS-NEXT: vins.f16 s16, s2
; CHECK-LIS-NEXT: vmovx.f16 s2, s1
; CHECK-LIS-NEXT: vins.f16 s0, s2
; CHECK-LIS-NEXT: vmovx.f16 s2, s8
; CHECK-LIS-NEXT: vins.f16 s3, s2
; CHECK-LIS-NEXT: vmovx.f16 s2, s11
; CHECK-LIS-NEXT: vmovx.f16 s8, s14
; CHECK-LIS-NEXT: vmovx.f16 s18, s10
; CHECK-LIS-NEXT: vmovx.f16 s19, s13
; CHECK-LIS-NEXT: vins.f16 s10, s2
; CHECK-LIS-NEXT: vins.f16 s13, s8
; CHECK-LIS-NEXT: vmov.f32 s1, s3
; CHECK-LIS-NEXT: vins.f16 s18, s12
; CHECK-LIS-NEXT: vins.f16 s19, s15
; CHECK-LIS-NEXT: vmov.f32 s3, s13
; CHECK-LIS-NEXT: vins.f16 s17, s9
; CHECK-LIS-NEXT: vmov.f32 s2, s10
; CHECK-LIS-NEXT: vldrw.u32 q2, [r0, #32]
; CHECK-LIS-NEXT: vadd.i16 q0, q0, q4
; CHECK-LIS-NEXT: vldrw.u32 q4, [r0, #16]
; CHECK-LIS-NEXT: vadd.i16 q0, q0, q1
; CHECK-LIS-NEXT: vldrw.u32 q3, [r0]
; CHECK-LIS-NEXT: vmovx.f16 s6, s18
; CHECK-LIS-NEXT: vmov.f32 s22, s19
; CHECK-LIS-NEXT: vins.f16 s6, s8
; CHECK-LIS-NEXT: vmovx.f16 s8, s8
; CHECK-LIS-NEXT: vins.f16 s22, s8
; CHECK-LIS-NEXT: vmovx.f16 s8, s11
; CHECK-LIS-NEXT: vmov.f32 s23, s10
; CHECK-LIS-NEXT: vmovx.f16 s4, s12
; CHECK-LIS-NEXT: vins.f16 s23, s8
; CHECK-LIS-NEXT: vmovx.f16 s8, s13
; CHECK-LIS-NEXT: vins.f16 s12, s8
; CHECK-LIS-NEXT: vmovx.f16 s8, s16
; CHECK-LIS-NEXT: vmovx.f16 s5, s15
; CHECK-LIS-NEXT: vins.f16 s15, s8
; CHECK-LIS-NEXT: vmovx.f16 s8, s19
; CHECK-LIS-NEXT: vins.f16 s4, s14
; CHECK-LIS-NEXT: vmov.f32 s20, s13
; CHECK-LIS-NEXT: vmovx.f16 s14, s14
; CHECK-LIS-NEXT: vins.f16 s18, s8
; CHECK-LIS-NEXT: vmovx.f16 s8, s10
; CHECK-LIS-NEXT: vmovx.f16 s7, s9
; CHECK-LIS-NEXT: vins.f16 s20, s14
; CHECK-LIS-NEXT: vmovx.f16 s14, s17
; CHECK-LIS-NEXT: vmov.f32 s21, s16
; CHECK-LIS-NEXT: vins.f16 s9, s8
; CHECK-LIS-NEXT: vins.f16 s21, s14
; CHECK-LIS-NEXT: vmov.f32 s13, s15
; CHECK-LIS-NEXT: vins.f16 s7, s11
; CHECK-LIS-NEXT: vins.f16 s5, s17
; CHECK-LIS-NEXT: vmov.f32 s14, s18
; CHECK-LIS-NEXT: vmov.f32 s15, s9
; CHECK-LIS-NEXT: vstrw.32 q0, [r1, #16]
; CHECK-LIS-NEXT: vadd.i16 q1, q3, q1
; CHECK-LIS-NEXT: vadd.i16 q1, q1, q5
; CHECK-LIS-NEXT: vstrw.32 q1, [r1]
; CHECK-LIS-NEXT: vpop {d8, d9, d10, d11}
; CHECK-LIS-NEXT: bx lr
entry:
  %l1 = load <48 x i16>, ptr %src, align 4
  %s1 = shufflevector <48 x i16> %l1, <48 x i16> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
  %s2 = shufflevector <48 x i16> %l1, <48 x i16> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
  %s3 = shufflevector <48 x i16> %l1, <48 x i16> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>
  %a1 = add <16 x i16> %s1, %s2
  %a = add
<16 x i16> %a1, %s3
  store <16 x i16> %a, ptr %dst
  ret void
}

; i8

; Stride-3 split of <6 x i8>: masks <0,3>, <1,4>, <2,5>; the three <2 x i8>
; pieces are summed and stored.
define void @vld3_v2i8(ptr %src, ptr %dst) {
; CHECK-LABEL: vld3_v2i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .pad #8
; CHECK-NEXT: sub sp, #8
; CHECK-NEXT: ldrd r0, r2, [r0]
; CHECK-NEXT: strd r0, r2, [sp]
; CHECK-NEXT: mov r0, sp
; CHECK-NEXT: vldrb.u16 q0, [r0]
; CHECK-NEXT: vmov.u16 r0, q0[4]
; CHECK-NEXT: vmov.u16 r2, q0[3]
; CHECK-NEXT: add r0, r2
; CHECK-NEXT: vmov.u16 r2, q0[5]
; CHECK-NEXT: add r0, r2
; CHECK-NEXT: strb r0, [r1, #1]
; CHECK-NEXT: vmov.u16 r0, q0[1]
; CHECK-NEXT: vmov.u16 r2, q0[0]
; CHECK-NEXT: add r0, r2
; CHECK-NEXT: vmov.u16 r2, q0[2]
; CHECK-NEXT: add r0, r2
; CHECK-NEXT: strb r0, [r1]
; CHECK-NEXT: add sp, #8
; CHECK-NEXT: bx lr
entry:
  %l1 = load <6 x i8>, ptr %src, align 4
  %s1 = shufflevector <6 x i8> %l1, <6 x i8> undef, <2 x i32> <i32 0, i32 3>
  %s2 = shufflevector <6 x i8> %l1, <6 x i8> undef, <2 x i32> <i32 1, i32 4>
  %s3 = shufflevector <6 x i8> %l1, <6 x i8> undef, <2 x i32> <i32 2, i32 5>
  %a1 = add <2 x i8> %s1, %s2
  %a = add <2 x i8> %a1, %s3
  store <2 x i8> %a, ptr %dst
  ret void
}

; Stride-3 split of <12 x i8> into three <4 x i8> vectors, which are summed
; and stored as one <4 x i8>.
define void @vld3_v4i8(ptr %src, ptr %dst) {
; CHECK-LABEL: vld3_v4i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: .pad #8
; CHECK-NEXT: sub sp, #8
; CHECK-NEXT: vldrb.u16 q0, [r0]
; CHECK-NEXT: ldr r0, [r0, #8]
; CHECK-NEXT: str r0, [sp]
; CHECK-NEXT: vmov.u16 r3, q0[6]
; CHECK-NEXT: vmov.u16 r4, q0[0]
; CHECK-NEXT: vmov q1[2], q1[0], r4, r3
; CHECK-NEXT: vmov.u16 r3, q0[7]
; CHECK-NEXT: vmov.u16 r4, q0[1]
; CHECK-NEXT: vmov.u16 r12, q0[5]
; CHECK-NEXT: vmov q2[2], q2[0], r4, r3
; CHECK-NEXT: mov r3, sp
; CHECK-NEXT: vmov.u16 lr, q0[2]
; CHECK-NEXT: vmov.u16 r2, q0[3]
; CHECK-NEXT: vmov.u16 r0, q0[4]
; CHECK-NEXT: vldrb.u16 q0, [r3]
; CHECK-NEXT: vmov.u16 r3, q0[2]
; CHECK-NEXT: vmov q2[3], q2[1], r0, r3
; CHECK-NEXT: vmov.u16 r0, q0[1]
; CHECK-NEXT: vmov q1[3], q1[1], r2, r0
; CHECK-NEXT: vmov.u16 r0, q0[0]
; CHECK-NEXT: vadd.i32 q1, q1, q2
; CHECK-NEXT: vmov q2[2], q2[0], lr, r0
; CHECK-NEXT: vmov.u16 r0, q0[3]
; CHECK-NEXT: vmov q2[3], q2[1], r12, r0
; CHECK-NEXT: vadd.i32 q0, q1, q2
; CHECK-NEXT: vstrb.32 q0, [r1]
; CHECK-NEXT: add sp, #8
; CHECK-NEXT: pop {r4, pc}
entry:
  %l1 = load <12 x i8>, ptr %src, align 4
  %s1 = shufflevector <12 x i8> %l1, <12 x i8> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
  %s2 = shufflevector <12 x i8> %l1, <12 x i8> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
  %s3 = shufflevector <12 x i8> %l1, <12 x i8> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
  %a1 = add <4 x i8> %s1, %s2
  %a = add <4 x i8> %a1, %s3
  store <4 x i8> %a, ptr %dst
  ret void
}

; Stride-3 split of <24 x i8> into three <8 x i8> vectors, which are summed
; and stored as one <8 x i8>.
define void @vld3_v8i8(ptr %src, ptr %dst) {
; CHECK-LABEL: vld3_v8i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .vsave {d8, d9}
; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: vldrb.u16 q1, [r0, #16]
; CHECK-NEXT: vmov.u8 r2, q0[1]
; CHECK-NEXT: vmov.u8 r0, q0[0]
; CHECK-NEXT: vmov.16 q2[0], r2
; CHECK-NEXT: vmov.u8 r2, q0[4]
; CHECK-NEXT: vmov.16 q3[0], r0
; CHECK-NEXT: vmov.u8 r0, q0[3]
; CHECK-NEXT: vmov.16 q2[1], r2
; CHECK-NEXT: vmov.u8 r2, q0[7]
; CHECK-NEXT: vmov.16 q3[1], r0
; CHECK-NEXT: vmov.u8 r0, q0[6]
; CHECK-NEXT: vmov.16 q2[2], r2
; CHECK-NEXT: vmov.u8 r2, q0[10]
; CHECK-NEXT: vmov.16 q3[2], r0
; CHECK-NEXT: vmov.u8 r0, q0[9]
; CHECK-NEXT: vmov.16 q2[3], r2
; CHECK-NEXT: vmov.u8 r2, q0[13]
; CHECK-NEXT: vmov.16 q3[3], r0
; CHECK-NEXT: vmov.u8 r0, q0[12]
; CHECK-NEXT: vmov.16 q2[4], r2
; CHECK-NEXT: vmov.16 q3[4], r0
; CHECK-NEXT: vmov.u8 r0, q0[15]
; CHECK-NEXT: vmovx.f16 s16, s6
; CHECK-NEXT: vmov.f32 s18, s5
; CHECK-NEXT: vmovx.f16 s11, s5
; CHECK-NEXT: vmov.16 q3[5], r0
; CHECK-NEXT: vins.f16 s18, s16
; CHECK-NEXT: vins.f16 s10, s4
; CHECK-NEXT: vins.f16 s11, s7
; CHECK-NEXT: vmov.f32 s15, s18
; CHECK-NEXT: vmov.u8 r0, q0[2]
; CHECK-NEXT: vadd.i16 q2, q3, q2
; CHECK-NEXT: vmov.16 q3[0], r0
; CHECK-NEXT: vmov.u8 r0, q0[5]
; CHECK-NEXT: vmov.16 q3[1], r0
; CHECK-NEXT: vmov.u8 r0, q0[8]
; CHECK-NEXT: vmov.16 q3[2], r0
; CHECK-NEXT: vmov.u8 r0, q0[11]
; CHECK-NEXT: vmov.16 q3[3], r0
; CHECK-NEXT: vmov.u8 r0, q0[14]
; CHECK-NEXT: vmov.16 q3[4], r0
; CHECK-NEXT: vmov.u16 r0, q1[1]
; CHECK-NEXT: vmovx.f16 s0, s7
; CHECK-NEXT: vmov.f32 s2, s6
; CHECK-NEXT: vins.f16 s2, s0
; CHECK-NEXT: vmov.16 q3[5], r0
; CHECK-NEXT: vmov.f32 s15, s2
; CHECK-NEXT: vadd.i16 q0, q2, q3
; CHECK-NEXT: vstrb.16 q0, [r1]
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: bx lr
entry:
  %l1 = load <24 x i8>, ptr %src, align 4
  %s1 = shufflevector <24 x i8> %l1, <24 x i8> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
  %s2 = shufflevector <24 x i8> %l1, <24 x i8> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
  %s3 = shufflevector <24 x i8> %l1, <24 x i8> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
  %a1 = add <8 x i8> %s1, %s2
  %a = add <8 x i8> %a1, %s3
  store <8 x i8> %a, ptr %dst
  ret void
}

; Stride-3 split of <48 x i8> into three <16 x i8> vectors, which are summed
; and stored as one <16 x i8>.
define void @vld3_v16i8(ptr %src, ptr %dst) {
; CHECK-LABEL: vld3_v16i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .vsave {d8, d9, d10, d11}
; CHECK-NEXT: vpush {d8, d9, d10, d11}
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
; CHECK-NEXT: vmov.u8 r2, q1[1]
; CHECK-NEXT: vmov.8 q3[0], r2
; CHECK-NEXT: vmov.u8 r2, q1[4]
; CHECK-NEXT: vmov.8 q3[1], r2
; CHECK-NEXT: vmov.u8 r2, q1[7]
; CHECK-NEXT: vmov.8 q3[2], r2
; CHECK-NEXT: vmov.u8 r2, q1[10]
; CHECK-NEXT: vmov.8 q3[3], r2
; CHECK-NEXT: vmov.u8 r2, q1[13]
; CHECK-NEXT: vmov.8 q3[4], r2
; CHECK-NEXT: vmov.u8 r2, q0[0]
; CHECK-NEXT: vmov.8 q3[5], r2
; CHECK-NEXT: vmov.u8 r2, q0[3]
; CHECK-NEXT: vmov.8 q3[6], r2
; CHECK-NEXT: vmov.u8 r2, q0[9]
; CHECK-NEXT: vmov.8 q4[8], r2
; CHECK-NEXT: vmov.u8 r2, q0[12]
; CHECK-NEXT: vmov.8 q4[9], r2
; CHECK-NEXT: vmov.u8 r2, q0[15]
; CHECK-NEXT: vmov.8 q4[10], r2
; CHECK-NEXT: vmov.u8 r0, q2[2]
; CHECK-NEXT: vmov.8 q4[11], r0
; CHECK-NEXT: vmov.u8 r0, q2[5]
; CHECK-NEXT: vmov.8 q4[12], r0
; CHECK-NEXT: vmov.u8 r0, q2[8]
; CHECK-NEXT: vmov.8 q4[13], r0
; CHECK-NEXT: vmov.u8 r0, q2[11]
; CHECK-NEXT: vmov.8 q4[14], r0
; CHECK-NEXT: vmov.u8 r0, q2[14]
; CHECK-NEXT: vmov.8 q4[15], r0
; CHECK-NEXT: vmov.u8 r0, q0[6]
; CHECK-NEXT: vmov.8 q3[7], r0
; CHECK-NEXT: vmov.u8 r0, q1[0]
; CHECK-NEXT: vmov.f32 s14, s18
; CHECK-NEXT: vmov.f32 s15, s19
; CHECK-NEXT: vmov.8 q4[0], r0
; CHECK-NEXT: vmov.u8 r0, q1[3]
; CHECK-NEXT: vmov.8 q4[1], r0
; CHECK-NEXT: vmov.u8 r0, q1[6]
; CHECK-NEXT: vmov.8 q4[2], r0
; CHECK-NEXT: vmov.u8 r0, q1[9]
; CHECK-NEXT: vmov.8 q4[3], r0
; CHECK-NEXT: vmov.u8 r0, q1[12]
; CHECK-NEXT: vmov.8 q4[4], r0
; CHECK-NEXT: vmov.u8 r0, q1[15]
; CHECK-NEXT: vmov.8 q4[5], r0
; CHECK-NEXT: vmov.u8 r0, q0[2]
; CHECK-NEXT: vmov.8 q4[6], r0
; CHECK-NEXT: vmov.u8 r0, q0[8]
; CHECK-NEXT: vmov.8 q5[8], r0
; CHECK-NEXT: vmov.u8 r0, q0[11]
; CHECK-NEXT: vmov.8 q5[9], r0
; CHECK-NEXT: vmov.u8 r0, q0[14]
; CHECK-NEXT: vmov.8 q5[10], r0
; CHECK-NEXT: vmov.u8 r0, q2[1]
; CHECK-NEXT: vmov.8 q5[11], r0
; CHECK-NEXT: vmov.u8 r0, q2[4]
; CHECK-NEXT: vmov.8 q5[12], r0
; CHECK-NEXT: vmov.u8 r0, q2[7]
; CHECK-NEXT: vmov.8 q5[13], r0
; CHECK-NEXT: vmov.u8 r0, q2[10]
; CHECK-NEXT: vmov.8 q5[14], r0
; CHECK-NEXT: vmov.u8 r0, q2[13]
; CHECK-NEXT: vmov.8 q5[15], r0
; CHECK-NEXT: vmov.u8 r0, q0[5]
; CHECK-NEXT: vmov.8 q4[7], r0
; CHECK-NEXT: vmov.u8 r0, q1[2]
; CHECK-NEXT: vmov.f32 s18, s22
; CHECK-NEXT: vmov.f32 s19, s23
; CHECK-NEXT: vadd.i8 q3, q4, q3
; CHECK-NEXT: vmov.8 q4[0], r0
; CHECK-NEXT: vmov.u8 r0, q1[5]
; CHECK-NEXT: vmov.8 q4[1], r0
; CHECK-NEXT: vmov.u8 r0, q1[8]
; CHECK-NEXT: vmov.8 q4[2], r0
; CHECK-NEXT: vmov.u8 r0, q1[11]
; CHECK-NEXT: vmov.8 q4[3], r0
; CHECK-NEXT: vmov.u8 r0, q1[14]
; CHECK-NEXT: vmov.8 q4[4], r0
; CHECK-NEXT: vmov.u8 r0, q0[1]
; CHECK-NEXT: vmov.8 q4[5], r0
; CHECK-NEXT: vmov.u8 r0, q0[4]
; CHECK-NEXT: vmov.8 q4[6], r0
; CHECK-NEXT: vmov.u8 r0, q0[10]
; CHECK-NEXT: vmov.8 q1[8], r0
; CHECK-NEXT: vmov.u8 r0, q0[13]
; CHECK-NEXT: vmov.8 q1[9], r0
; CHECK-NEXT: vmov.u8 r0, q2[0]
; CHECK-NEXT: vmov.8 q1[10], r0
; CHECK-NEXT: vmov.u8 r0, q2[3]
; CHECK-NEXT: vmov.8 q1[11], r0
; CHECK-NEXT: vmov.u8 r0, q2[6]
; CHECK-NEXT: vmov.8 q1[12], r0
; CHECK-NEXT: vmov.u8 r0, q2[9]
; CHECK-NEXT: vmov.8 q1[13], r0
; CHECK-NEXT: vmov.u8 r0, q2[12]
; CHECK-NEXT: vmov.8 q1[14], r0
; CHECK-NEXT: vmov.u8 r0, q2[15]
; CHECK-NEXT: vmov.8 q1[15], r0
; CHECK-NEXT: vmov.u8 r0, q0[7]
; CHECK-NEXT: vmov.8 q4[7], r0
; CHECK-NEXT: vmov.f32 s18, s6
; CHECK-NEXT: vmov.f32 s19, s7
; CHECK-NEXT: vadd.i8 q0, q3, q4
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: vpop {d8, d9, d10, d11}
; CHECK-NEXT: bx lr
entry:
  %l1 = load <48 x i8>, ptr %src, align 4
  %s1 = shufflevector <48 x i8> %l1, <48 x i8> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
  %s2 = shufflevector <48 x i8> %l1, <48 x i8> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
  %s3 = shufflevector <48 x i8>
%l1, <48 x i8> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>
  %a1 = add <16 x i8> %s1, %s2
  %a = add <16 x i8> %a1, %s3
  store <16 x i8> %a, ptr %dst
  ret void
}

; i64

; Stride-3 split of <6 x i64>: masks <0,3>, <1,4>, <2,5>; the three <2 x i64>
; pieces are summed and stored. The expected output performs each 64-bit add
; as an adds/adc pair on core registers.
define void @vld3_v2i64(ptr %src, ptr %dst) {
; CHECK-LABEL: vld3_v2i64:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr}
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: vldrw.u32 q1, [r0, #32]
; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
; CHECK-NEXT: vmov.f32 s12, s2
; CHECK-NEXT: vmov.f32 s13, s3
; CHECK-NEXT: vmov.f32 s2, s4
; CHECK-NEXT: vmov.f32 s3, s5
; CHECK-NEXT: vmov r0, r3, d5
; CHECK-NEXT: vmov r2, r4, d3
; CHECK-NEXT: vmov r6, r7, d0
; CHECK-NEXT: vmov r5, r8, d6
; CHECK-NEXT: vmov lr, r12, d1
; CHECK-NEXT: adds.w r0, r0, lr
; CHECK-NEXT: adc.w r3, r3, r12
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r2, r3, r4
; CHECK-NEXT: vmov r3, r4, d4
; CHECK-NEXT: adds r6, r6, r5
; CHECK-NEXT: adc.w r7, r7, r8
; CHECK-NEXT: adds r3, r3, r6
; CHECK-NEXT: adcs r7, r4
; CHECK-NEXT: vmov q0[2], q0[0], r3, r0
; CHECK-NEXT: vmov q0[3], q0[1], r7, r2
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
entry:
  %l1 = load <6 x i64>, ptr %src, align 4
  %s1 = shufflevector <6 x i64> %l1, <6 x i64> undef, <2 x i32> <i32 0, i32 3>
  %s2 = shufflevector <6 x i64> %l1, <6 x i64> undef, <2 x i32> <i32 1, i32 4>
  %s3 = shufflevector <6 x i64> %l1, <6 x i64> undef, <2 x i32> <i32 2, i32 5>
  %a1 = add <2 x i64> %s1, %s2
  %a = add <2 x i64> %a1, %s3
  store <2 x i64> %a, ptr %dst
  ret void
}

define void @vld3_v4i64(ptr %src, ptr %dst) {
; CHECK-LV-LABEL: vld3_v4i64:
; CHECK-LV: @ %bb.0: @ %entry
; CHECK-LV-NEXT: .save {r4, r5, r6, r7, r8, lr}
; CHECK-LV-NEXT: push.w {r4,
r5, r6, r7, r8, lr} 840; CHECK-LV-NEXT: .vsave {d8, d9, d10, d11, d12} 841; CHECK-LV-NEXT: vpush {d8, d9, d10, d11, d12} 842; CHECK-LV-NEXT: vldrw.u32 q0, [r0] 843; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #32] 844; CHECK-LV-NEXT: vldrw.u32 q2, [r0, #16] 845; CHECK-LV-NEXT: vldrw.u32 q5, [r0, #48] 846; CHECK-LV-NEXT: vmov.f32 s4, s2 847; CHECK-LV-NEXT: vldrw.u32 q4, [r0, #64] 848; CHECK-LV-NEXT: vmov.f32 s5, s3 849; CHECK-LV-NEXT: vmov.f32 s2, s12 850; CHECK-LV-NEXT: vmov.f32 s3, s13 851; CHECK-LV-NEXT: vmov r5, r4, d5 852; CHECK-LV-NEXT: vmov r3, r8, d7 853; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #80] 854; CHECK-LV-NEXT: vmov.f32 s24, s22 855; CHECK-LV-NEXT: vmov.f32 s25, s23 856; CHECK-LV-NEXT: vmov lr, r12, d1 857; CHECK-LV-NEXT: vmov.f32 s2, s12 858; CHECK-LV-NEXT: vmov.f32 s3, s13 859; CHECK-LV-NEXT: vmov r6, r7, d12 860; CHECK-LV-NEXT: adds.w r0, r5, lr 861; CHECK-LV-NEXT: adc.w r5, r4, r12 862; CHECK-LV-NEXT: adds.w lr, r0, r3 863; CHECK-LV-NEXT: vmov r4, r2, d10 864; CHECK-LV-NEXT: adc.w r12, r5, r8 865; CHECK-LV-NEXT: vmov r5, r0, d8 866; CHECK-LV-NEXT: adds r6, r6, r4 867; CHECK-LV-NEXT: adcs r2, r7 868; CHECK-LV-NEXT: adds r6, r6, r5 869; CHECK-LV-NEXT: adc.w r8, r2, r0 870; CHECK-LV-NEXT: vmov r7, r4, d1 871; CHECK-LV-NEXT: vmov r2, r5, d9 872; CHECK-LV-NEXT: vmov r3, r0, d0 873; CHECK-LV-NEXT: adds r2, r2, r7 874; CHECK-LV-NEXT: adc.w r7, r5, r4 875; CHECK-LV-NEXT: vmov r5, r4, d7 876; CHECK-LV-NEXT: adds r2, r2, r5 877; CHECK-LV-NEXT: adcs r7, r4 878; CHECK-LV-NEXT: vmov r5, r4, d2 879; CHECK-LV-NEXT: vmov q1[2], q1[0], r6, r2 880; CHECK-LV-NEXT: vmov q1[3], q1[1], r8, r7 881; CHECK-LV-NEXT: vstrw.32 q1, [r1, #16] 882; CHECK-LV-NEXT: adds r3, r3, r5 883; CHECK-LV-NEXT: adcs r0, r4 884; CHECK-LV-NEXT: vmov r4, r5, d4 885; CHECK-LV-NEXT: adds r3, r3, r4 886; CHECK-LV-NEXT: vmov q0[2], q0[0], r3, lr 887; CHECK-LV-NEXT: adcs r0, r5 888; CHECK-LV-NEXT: vmov q0[3], q0[1], r0, r12 889; CHECK-LV-NEXT: vstrw.32 q0, [r1] 890; CHECK-LV-NEXT: vpop {d8, d9, d10, d11, d12} 
891; CHECK-LV-NEXT: pop.w {r4, r5, r6, r7, r8, pc} 892; 893; CHECK-LIS-LABEL: vld3_v4i64: 894; CHECK-LIS: @ %bb.0: @ %entry 895; CHECK-LIS-NEXT: .save {r4, r5, r6, r7, r8, lr} 896; CHECK-LIS-NEXT: push.w {r4, r5, r6, r7, r8, lr} 897; CHECK-LIS-NEXT: .vsave {d8, d9, d10, d11, d12} 898; CHECK-LIS-NEXT: vpush {d8, d9, d10, d11, d12} 899; CHECK-LIS-NEXT: vldrw.u32 q0, [r0] 900; CHECK-LIS-NEXT: vldrw.u32 q3, [r0, #32] 901; CHECK-LIS-NEXT: vldrw.u32 q2, [r0, #16] 902; CHECK-LIS-NEXT: vldrw.u32 q5, [r0, #48] 903; CHECK-LIS-NEXT: vmov.f32 s4, s2 904; CHECK-LIS-NEXT: vldrw.u32 q4, [r0, #64] 905; CHECK-LIS-NEXT: vmov.f32 s5, s3 906; CHECK-LIS-NEXT: vmov.f32 s2, s12 907; CHECK-LIS-NEXT: vmov.f32 s3, s13 908; CHECK-LIS-NEXT: vmov r5, r4, d5 909; CHECK-LIS-NEXT: vmov r3, r8, d7 910; CHECK-LIS-NEXT: vldrw.u32 q3, [r0, #80] 911; CHECK-LIS-NEXT: vmov.f32 s24, s22 912; CHECK-LIS-NEXT: vmov.f32 s25, s23 913; CHECK-LIS-NEXT: vmov lr, r12, d1 914; CHECK-LIS-NEXT: vmov.f32 s2, s12 915; CHECK-LIS-NEXT: vmov.f32 s3, s13 916; CHECK-LIS-NEXT: vmov r7, r6, d12 917; CHECK-LIS-NEXT: adds.w r0, r5, lr 918; CHECK-LIS-NEXT: adc.w r5, r4, r12 919; CHECK-LIS-NEXT: adds.w lr, r0, r3 920; CHECK-LIS-NEXT: vmov r4, r2, d10 921; CHECK-LIS-NEXT: adc.w r12, r5, r8 922; CHECK-LIS-NEXT: vmov r5, r0, d8 923; CHECK-LIS-NEXT: adds r7, r7, r4 924; CHECK-LIS-NEXT: adcs r2, r6 925; CHECK-LIS-NEXT: adds r7, r7, r5 926; CHECK-LIS-NEXT: adc.w r8, r2, r0 927; CHECK-LIS-NEXT: vmov r6, r4, d1 928; CHECK-LIS-NEXT: vmov r2, r5, d9 929; CHECK-LIS-NEXT: vmov r3, r0, d0 930; CHECK-LIS-NEXT: adds r2, r2, r6 931; CHECK-LIS-NEXT: adc.w r6, r5, r4 932; CHECK-LIS-NEXT: vmov r5, r4, d7 933; CHECK-LIS-NEXT: adds r2, r2, r5 934; CHECK-LIS-NEXT: adcs r6, r4 935; CHECK-LIS-NEXT: vmov r5, r4, d2 936; CHECK-LIS-NEXT: vmov q1[2], q1[0], r7, r2 937; CHECK-LIS-NEXT: vmov q1[3], q1[1], r8, r6 938; CHECK-LIS-NEXT: vstrw.32 q1, [r1, #16] 939; CHECK-LIS-NEXT: adds r3, r3, r5 940; CHECK-LIS-NEXT: adcs r0, r4 941; CHECK-LIS-NEXT: vmov r4, r5, 
d4 942; CHECK-LIS-NEXT: adds r3, r3, r4 943; CHECK-LIS-NEXT: vmov q0[2], q0[0], r3, lr 944; CHECK-LIS-NEXT: adcs r0, r5 945; CHECK-LIS-NEXT: vmov q0[3], q0[1], r0, r12 946; CHECK-LIS-NEXT: vstrw.32 q0, [r1] 947; CHECK-LIS-NEXT: vpop {d8, d9, d10, d11, d12} 948; CHECK-LIS-NEXT: pop.w {r4, r5, r6, r7, r8, pc} 949entry: 950 %l1 = load <12 x i64>, ptr %src, align 4 951 %s1 = shufflevector <12 x i64> %l1, <12 x i64> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9> 952 %s2 = shufflevector <12 x i64> %l1, <12 x i64> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10> 953 %s3 = shufflevector <12 x i64> %l1, <12 x i64> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11> 954 %a1 = add <4 x i64> %s1, %s2 955 %a = add <4 x i64> %a1, %s3 956 store <4 x i64> %a, ptr %dst 957 ret void 958} 959 960; f32 961 962define void @vld3_v2f32(ptr %src, ptr %dst) { 963; CHECK-LABEL: vld3_v2f32: 964; CHECK: @ %bb.0: @ %entry 965; CHECK-NEXT: vldrw.u32 q2, [r0] 966; CHECK-NEXT: vldr s1, [r0, #16] 967; CHECK-NEXT: vldr s5, [r0, #20] 968; CHECK-NEXT: vmov.f32 s12, s8 969; CHECK-NEXT: vmov.f32 s13, s11 970; CHECK-NEXT: vmov.f32 s0, s9 971; CHECK-NEXT: vadd.f32 q0, q3, q0 972; CHECK-NEXT: vmov.f32 s4, s10 973; CHECK-NEXT: vadd.f32 q0, q0, q1 974; CHECK-NEXT: vstmia r1, {s0, s1} 975; CHECK-NEXT: bx lr 976entry: 977 %l1 = load <6 x float>, ptr %src, align 4 978 %s1 = shufflevector <6 x float> %l1, <6 x float> undef, <2 x i32> <i32 0, i32 3> 979 %s2 = shufflevector <6 x float> %l1, <6 x float> undef, <2 x i32> <i32 1, i32 4> 980 %s3 = shufflevector <6 x float> %l1, <6 x float> undef, <2 x i32> <i32 2, i32 5> 981 %a1 = fadd <2 x float> %s1, %s2 982 %a = fadd <2 x float> %a1, %s3 983 store <2 x float> %a, ptr %dst 984 ret void 985} 986 987define void @vld3_v4f32(ptr %src, ptr %dst) { 988; CHECK-LABEL: vld3_v4f32: 989; CHECK: @ %bb.0: @ %entry 990; CHECK-NEXT: .vsave {d8, d9} 991; CHECK-NEXT: vpush {d8, d9} 992; CHECK-NEXT: vldrw.u32 q0, [r0, #16] 993; CHECK-NEXT: vldrw.u32 q1, [r0] 994; CHECK-NEXT: vldrw.u32 q4, 
[r0, #32] 995; CHECK-NEXT: vmov.f32 s10, s2 996; CHECK-NEXT: vmov.f32 s13, s0 997; CHECK-NEXT: vmov.f32 s14, s3 998; CHECK-NEXT: vmov.f32 s8, s4 999; CHECK-NEXT: vmov.f32 s9, s7 1000; CHECK-NEXT: vmov.f32 s12, s5 1001; CHECK-NEXT: vmov.f32 s15, s18 1002; CHECK-NEXT: vmov.f32 s11, s17 1003; CHECK-NEXT: vadd.f32 q2, q2, q3 1004; CHECK-NEXT: vmov.f32 s0, s6 1005; CHECK-NEXT: vmov.f32 s2, s16 1006; CHECK-NEXT: vmov.f32 s3, s19 1007; CHECK-NEXT: vadd.f32 q0, q2, q0 1008; CHECK-NEXT: vstrw.32 q0, [r1] 1009; CHECK-NEXT: vpop {d8, d9} 1010; CHECK-NEXT: bx lr 1011entry: 1012 %l1 = load <12 x float>, ptr %src, align 4 1013 %s1 = shufflevector <12 x float> %l1, <12 x float> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9> 1014 %s2 = shufflevector <12 x float> %l1, <12 x float> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10> 1015 %s3 = shufflevector <12 x float> %l1, <12 x float> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11> 1016 %a1 = fadd <4 x float> %s1, %s2 1017 %a = fadd <4 x float> %a1, %s3 1018 store <4 x float> %a, ptr %dst 1019 ret void 1020} 1021 1022define void @vld3_v8f32(ptr %src, ptr %dst) { 1023; CHECK-LABEL: vld3_v8f32: 1024; CHECK: @ %bb.0: @ %entry 1025; CHECK-NEXT: .vsave {d8, d9, d10, d11} 1026; CHECK-NEXT: vpush {d8, d9, d10, d11} 1027; CHECK-NEXT: vldrw.u32 q0, [r0, #64] 1028; CHECK-NEXT: vldrw.u32 q1, [r0, #48] 1029; CHECK-NEXT: vldrw.u32 q4, [r0, #80] 1030; CHECK-NEXT: vmov.f32 s10, s2 1031; CHECK-NEXT: vmov.f32 s13, s0 1032; CHECK-NEXT: vmov.f32 s14, s3 1033; CHECK-NEXT: vmov.f32 s8, s4 1034; CHECK-NEXT: vmov.f32 s9, s7 1035; CHECK-NEXT: vmov.f32 s12, s5 1036; CHECK-NEXT: vmov.f32 s15, s18 1037; CHECK-NEXT: vmov.f32 s11, s17 1038; CHECK-NEXT: vadd.f32 q2, q2, q3 1039; CHECK-NEXT: vmov.f32 s0, s6 1040; CHECK-NEXT: vmov.f32 s2, s16 1041; CHECK-NEXT: vldrw.u32 q1, [r0, #16] 1042; CHECK-NEXT: vmov.f32 s3, s19 1043; CHECK-NEXT: vldrw.u32 q3, [r0, #32] 1044; CHECK-NEXT: vadd.f32 q0, q2, q0 1045; CHECK-NEXT: vldrw.u32 q2, [r0] 1046; CHECK-NEXT: vmov.f32 s17, s4 
1047; CHECK-NEXT: vstrw.32 q0, [r1, #16] 1048; CHECK-NEXT: vmov.f32 s18, s7 1049; CHECK-NEXT: vmov.f32 s22, s6 1050; CHECK-NEXT: vmov.f32 s16, s9 1051; CHECK-NEXT: vmov.f32 s19, s14 1052; CHECK-NEXT: vmov.f32 s20, s8 1053; CHECK-NEXT: vmov.f32 s21, s11 1054; CHECK-NEXT: vmov.f32 s23, s13 1055; CHECK-NEXT: vadd.f32 q4, q5, q4 1056; CHECK-NEXT: vmov.f32 s4, s10 1057; CHECK-NEXT: vmov.f32 s6, s12 1058; CHECK-NEXT: vmov.f32 s7, s15 1059; CHECK-NEXT: vadd.f32 q1, q4, q1 1060; CHECK-NEXT: vstrw.32 q1, [r1] 1061; CHECK-NEXT: vpop {d8, d9, d10, d11} 1062; CHECK-NEXT: bx lr 1063entry: 1064 %l1 = load <24 x float>, ptr %src, align 4 1065 %s1 = shufflevector <24 x float> %l1, <24 x float> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21> 1066 %s2 = shufflevector <24 x float> %l1, <24 x float> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22> 1067 %s3 = shufflevector <24 x float> %l1, <24 x float> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23> 1068 %a1 = fadd <8 x float> %s1, %s2 1069 %a = fadd <8 x float> %a1, %s3 1070 store <8 x float> %a, ptr %dst 1071 ret void 1072} 1073 1074define void @vld3_v16f32(ptr %src, ptr %dst) { 1075; CHECK-LABEL: vld3_v16f32: 1076; CHECK: @ %bb.0: @ %entry 1077; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} 1078; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} 1079; CHECK-NEXT: vldrw.u32 q0, [r0, #64] 1080; CHECK-NEXT: vldrw.u32 q1, [r0, #48] 1081; CHECK-NEXT: vldrw.u32 q4, [r0, #80] 1082; CHECK-NEXT: vldrw.u32 q6, [r0, #176] 1083; CHECK-NEXT: vmov.f32 s10, s2 1084; CHECK-NEXT: vmov.f32 s13, s0 1085; CHECK-NEXT: vmov.f32 s14, s3 1086; CHECK-NEXT: vmov.f32 s8, s4 1087; CHECK-NEXT: vmov.f32 s9, s7 1088; CHECK-NEXT: vmov.f32 s12, s5 1089; CHECK-NEXT: vmov.f32 s15, s18 1090; CHECK-NEXT: vmov.f32 s11, s17 1091; CHECK-NEXT: vadd.f32 q2, q2, q3 1092; CHECK-NEXT: vmov.f32 s0, s6 1093; CHECK-NEXT: vmov.f32 s2, s16 1094; CHECK-NEXT: vldrw.u32 q1, 
[r0, #16] 1095; CHECK-NEXT: vmov.f32 s3, s19 1096; CHECK-NEXT: vldrw.u32 q3, [r0, #32] 1097; CHECK-NEXT: vadd.f32 q0, q2, q0 1098; CHECK-NEXT: vldrw.u32 q2, [r0] 1099; CHECK-NEXT: vmov.f32 s17, s4 1100; CHECK-NEXT: vmov.f32 s18, s7 1101; CHECK-NEXT: vmov.f32 s22, s6 1102; CHECK-NEXT: vmov.f32 s16, s9 1103; CHECK-NEXT: vmov.f32 s19, s14 1104; CHECK-NEXT: vmov.f32 s20, s8 1105; CHECK-NEXT: vmov.f32 s21, s11 1106; CHECK-NEXT: vmov.f32 s23, s13 1107; CHECK-NEXT: vmov.f32 s4, s10 1108; CHECK-NEXT: vldrw.u32 q2, [r0, #160] 1109; CHECK-NEXT: vmov.f32 s6, s12 1110; CHECK-NEXT: vadd.f32 q4, q5, q4 1111; CHECK-NEXT: vmov.f32 s7, s15 1112; CHECK-NEXT: vldrw.u32 q3, [r0, #144] 1113; CHECK-NEXT: vadd.f32 q1, q4, q1 1114; CHECK-NEXT: vmov.f32 s18, s10 1115; CHECK-NEXT: vmov.f32 s21, s8 1116; CHECK-NEXT: vmov.f32 s22, s11 1117; CHECK-NEXT: vmov.f32 s16, s12 1118; CHECK-NEXT: vmov.f32 s17, s15 1119; CHECK-NEXT: vmov.f32 s20, s13 1120; CHECK-NEXT: vmov.f32 s23, s26 1121; CHECK-NEXT: vmov.f32 s19, s25 1122; CHECK-NEXT: vadd.f32 q4, q4, q5 1123; CHECK-NEXT: vmov.f32 s8, s14 1124; CHECK-NEXT: vmov.f32 s10, s24 1125; CHECK-NEXT: vldrw.u32 q3, [r0, #112] 1126; CHECK-NEXT: vmov.f32 s11, s27 1127; CHECK-NEXT: vldrw.u32 q5, [r0, #128] 1128; CHECK-NEXT: vadd.f32 q2, q4, q2 1129; CHECK-NEXT: vldrw.u32 q4, [r0, #96] 1130; CHECK-NEXT: vmov.f32 s25, s12 1131; CHECK-NEXT: vstrw.32 q2, [r1, #48] 1132; CHECK-NEXT: vmov.f32 s26, s15 1133; CHECK-NEXT: vstrw.32 q0, [r1, #16] 1134; CHECK-NEXT: vmov.f32 s30, s14 1135; CHECK-NEXT: vstrw.32 q1, [r1] 1136; CHECK-NEXT: vmov.f32 s24, s17 1137; CHECK-NEXT: vmov.f32 s27, s22 1138; CHECK-NEXT: vmov.f32 s28, s16 1139; CHECK-NEXT: vmov.f32 s29, s19 1140; CHECK-NEXT: vmov.f32 s31, s21 1141; CHECK-NEXT: vadd.f32 q6, q7, q6 1142; CHECK-NEXT: vmov.f32 s12, s18 1143; CHECK-NEXT: vmov.f32 s14, s20 1144; CHECK-NEXT: vmov.f32 s15, s23 1145; CHECK-NEXT: vadd.f32 q3, q6, q3 1146; CHECK-NEXT: vstrw.32 q3, [r1, #32] 1147; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, 
d15} 1148; CHECK-NEXT: bx lr 1149entry: 1150 %l1 = load <48 x float>, ptr %src, align 4 1151 %s1 = shufflevector <48 x float> %l1, <48 x float> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45> 1152 %s2 = shufflevector <48 x float> %l1, <48 x float> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46> 1153 %s3 = shufflevector <48 x float> %l1, <48 x float> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47> 1154 %a1 = fadd <16 x float> %s1, %s2 1155 %a = fadd <16 x float> %a1, %s3 1156 store <16 x float> %a, ptr %dst 1157 ret void 1158} 1159 1160; f16 1161 1162define void @vld3_v2f16(ptr %src, ptr %dst) { 1163; CHECK-LABEL: vld3_v2f16: 1164; CHECK: @ %bb.0: @ %entry 1165; CHECK-NEXT: ldrd r2, r3, [r0] 1166; CHECK-NEXT: ldr r0, [r0, #8] 1167; CHECK-NEXT: vmov.32 q0[1], r3 1168; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 1169; CHECK-NEXT: vmovx.f16 s8, s0 1170; CHECK-NEXT: vmovx.f16 s4, s2 1171; CHECK-NEXT: vins.f16 s8, s2 1172; CHECK-NEXT: vmovx.f16 s2, s1 1173; CHECK-NEXT: vins.f16 s1, s4 1174; CHECK-NEXT: vins.f16 s0, s2 1175; CHECK-NEXT: vadd.f16 q1, q0, q2 1176; CHECK-NEXT: vmov.f32 s0, s1 1177; CHECK-NEXT: vadd.f16 q0, q1, q0 1178; CHECK-NEXT: vmov r0, s0 1179; CHECK-NEXT: str r0, [r1] 1180; CHECK-NEXT: bx lr 1181entry: 1182 %l1 = load <6 x half>, ptr %src, align 4 1183 %s1 = shufflevector <6 x half> %l1, <6 x half> undef, <2 x i32> <i32 0, i32 3> 1184 %s2 = shufflevector <6 x half> %l1, <6 x half> undef, <2 x i32> <i32 1, i32 4> 1185 %s3 = shufflevector <6 x half> %l1, <6 x half> undef, <2 x i32> <i32 2, i32 5> 1186 %a1 = fadd <2 x half> %s1, %s2 1187 %a = fadd <2 x half> %a1, %s3 1188 store <2 x half> %a, ptr %dst 1189 ret void 1190} 1191 1192define void @vld3_v4f16(ptr %src, ptr %dst) { 1193; 
CHECK-LABEL: vld3_v4f16: 1194; CHECK: @ %bb.0: @ %entry 1195; CHECK-NEXT: ldrd r2, r3, [r0, #16] 1196; CHECK-NEXT: vldrw.u32 q1, [r0] 1197; CHECK-NEXT: vmov.32 q2[0], r2 1198; CHECK-NEXT: vmovx.f16 s12, s4 1199; CHECK-NEXT: vmov.32 q2[1], r3 1200; CHECK-NEXT: vmovx.f16 s13, s7 1201; CHECK-NEXT: vmovx.f16 s0, s9 1202; CHECK-NEXT: vmov.f32 s1, s8 1203; CHECK-NEXT: vins.f16 s1, s0 1204; CHECK-NEXT: vmovx.f16 s0, s5 1205; CHECK-NEXT: vins.f16 s4, s0 1206; CHECK-NEXT: vmovx.f16 s0, s6 1207; CHECK-NEXT: vins.f16 s5, s0 1208; CHECK-NEXT: vmovx.f16 s0, s8 1209; CHECK-NEXT: vins.f16 s7, s0 1210; CHECK-NEXT: vmov.f32 s0, s5 1211; CHECK-NEXT: vins.f16 s12, s6 1212; CHECK-NEXT: vins.f16 s13, s9 1213; CHECK-NEXT: vmov.f32 s5, s7 1214; CHECK-NEXT: vadd.f16 q1, q1, q3 1215; CHECK-NEXT: vadd.f16 q0, q1, q0 1216; CHECK-NEXT: vmov r0, r2, d0 1217; CHECK-NEXT: strd r0, r2, [r1] 1218; CHECK-NEXT: bx lr 1219entry: 1220 %l1 = load <12 x half>, ptr %src, align 4 1221 %s1 = shufflevector <12 x half> %l1, <12 x half> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9> 1222 %s2 = shufflevector <12 x half> %l1, <12 x half> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10> 1223 %s3 = shufflevector <12 x half> %l1, <12 x half> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11> 1224 %a1 = fadd <4 x half> %s1, %s2 1225 %a = fadd <4 x half> %a1, %s3 1226 store <4 x half> %a, ptr %dst 1227 ret void 1228} 1229 1230define void @vld3_v8f16(ptr %src, ptr %dst) { 1231; CHECK-LV-LABEL: vld3_v8f16: 1232; CHECK-LV: @ %bb.0: @ %entry 1233; CHECK-LV-NEXT: .vsave {d8, d9} 1234; CHECK-LV-NEXT: vpush {d8, d9} 1235; CHECK-LV-NEXT: vldrw.u32 q2, [r0, #16] 1236; CHECK-LV-NEXT: vldrw.u32 q0, [r0] 1237; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #32] 1238; CHECK-LV-NEXT: vmov.f32 s5, s8 1239; CHECK-LV-NEXT: vmovx.f16 s8, s8 1240; CHECK-LV-NEXT: vmovx.f16 s17, s3 1241; CHECK-LV-NEXT: vins.f16 s3, s8 1242; CHECK-LV-NEXT: vmovx.f16 s8, s11 1243; CHECK-LV-NEXT: vmovx.f16 s18, s10 1244; CHECK-LV-NEXT: vmovx.f16 s16, s0 1245; CHECK-LV-NEXT: 
vins.f16 s10, s8 1246; CHECK-LV-NEXT: vmovx.f16 s6, s2 1247; CHECK-LV-NEXT: vmov.f32 s4, s1 1248; CHECK-LV-NEXT: vmovx.f16 s8, s14 1249; CHECK-LV-NEXT: vmovx.f16 s19, s13 1250; CHECK-LV-NEXT: vins.f16 s4, s6 1251; CHECK-LV-NEXT: vmovx.f16 s6, s9 1252; CHECK-LV-NEXT: vins.f16 s16, s2 1253; CHECK-LV-NEXT: vmovx.f16 s2, s15 1254; CHECK-LV-NEXT: vmovx.f16 s7, s12 1255; CHECK-LV-NEXT: vins.f16 s18, s12 1256; CHECK-LV-NEXT: vmovx.f16 s12, s1 1257; CHECK-LV-NEXT: vins.f16 s13, s8 1258; CHECK-LV-NEXT: vins.f16 s5, s6 1259; CHECK-LV-NEXT: vmov.f32 s6, s11 1260; CHECK-LV-NEXT: vins.f16 s14, s2 1261; CHECK-LV-NEXT: vmov.f32 s1, s3 1262; CHECK-LV-NEXT: vins.f16 s19, s15 1263; CHECK-LV-NEXT: vins.f16 s17, s9 1264; CHECK-LV-NEXT: vins.f16 s0, s12 1265; CHECK-LV-NEXT: vmov.f32 s2, s10 1266; CHECK-LV-NEXT: vmov.f32 s3, s13 1267; CHECK-LV-NEXT: vins.f16 s6, s7 1268; CHECK-LV-NEXT: vmov.f32 s7, s14 1269; CHECK-LV-NEXT: vadd.f16 q0, q0, q4 1270; CHECK-LV-NEXT: vadd.f16 q0, q0, q1 1271; CHECK-LV-NEXT: vstrw.32 q0, [r1] 1272; CHECK-LV-NEXT: vpop {d8, d9} 1273; CHECK-LV-NEXT: bx lr 1274; 1275; CHECK-LIS-LABEL: vld3_v8f16: 1276; CHECK-LIS: @ %bb.0: @ %entry 1277; CHECK-LIS-NEXT: .vsave {d8, d9} 1278; CHECK-LIS-NEXT: vpush {d8, d9} 1279; CHECK-LIS-NEXT: vldrw.u32 q0, [r0] 1280; CHECK-LIS-NEXT: vldrw.u32 q2, [r0, #16] 1281; CHECK-LIS-NEXT: vldrw.u32 q4, [r0, #32] 1282; CHECK-LIS-NEXT: vmov.f32 s4, s1 1283; CHECK-LIS-NEXT: vmovx.f16 s6, s2 1284; CHECK-LIS-NEXT: vins.f16 s4, s6 1285; CHECK-LIS-NEXT: vmov.f32 s5, s8 1286; CHECK-LIS-NEXT: vmovx.f16 s6, s9 1287; CHECK-LIS-NEXT: vmovx.f16 s8, s8 1288; CHECK-LIS-NEXT: vmovx.f16 s13, s3 1289; CHECK-LIS-NEXT: vins.f16 s5, s6 1290; CHECK-LIS-NEXT: vins.f16 s3, s8 1291; CHECK-LIS-NEXT: vmov.f32 s6, s11 1292; CHECK-LIS-NEXT: vmovx.f16 s12, s16 1293; CHECK-LIS-NEXT: vmovx.f16 s8, s11 1294; CHECK-LIS-NEXT: vmovx.f16 s14, s10 1295; CHECK-LIS-NEXT: vins.f16 s6, s12 1296; CHECK-LIS-NEXT: vmovx.f16 s12, s0 1297; CHECK-LIS-NEXT: vins.f16 s10, s8 1298; 
CHECK-LIS-NEXT: vmovx.f16 s8, s18 1299; CHECK-LIS-NEXT: vmovx.f16 s15, s17 1300; CHECK-LIS-NEXT: vins.f16 s12, s2 1301; CHECK-LIS-NEXT: vmovx.f16 s2, s19 1302; CHECK-LIS-NEXT: vmovx.f16 s1, s1 1303; CHECK-LIS-NEXT: vins.f16 s17, s8 1304; CHECK-LIS-NEXT: vins.f16 s18, s2 1305; CHECK-LIS-NEXT: vins.f16 s0, s1 1306; CHECK-LIS-NEXT: vmov.f32 s1, s3 1307; CHECK-LIS-NEXT: vins.f16 s14, s16 1308; CHECK-LIS-NEXT: vins.f16 s15, s19 1309; CHECK-LIS-NEXT: vins.f16 s13, s9 1310; CHECK-LIS-NEXT: vmov.f32 s2, s10 1311; CHECK-LIS-NEXT: vmov.f32 s3, s17 1312; CHECK-LIS-NEXT: vmov.f32 s7, s18 1313; CHECK-LIS-NEXT: vadd.f16 q0, q0, q3 1314; CHECK-LIS-NEXT: vadd.f16 q0, q0, q1 1315; CHECK-LIS-NEXT: vstrw.32 q0, [r1] 1316; CHECK-LIS-NEXT: vpop {d8, d9} 1317; CHECK-LIS-NEXT: bx lr 1318entry: 1319 %l1 = load <24 x half>, ptr %src, align 4 1320 %s1 = shufflevector <24 x half> %l1, <24 x half> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21> 1321 %s2 = shufflevector <24 x half> %l1, <24 x half> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22> 1322 %s3 = shufflevector <24 x half> %l1, <24 x half> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23> 1323 %a1 = fadd <8 x half> %s1, %s2 1324 %a = fadd <8 x half> %a1, %s3 1325 store <8 x half> %a, ptr %dst 1326 ret void 1327} 1328 1329define void @vld3_v16f16(ptr %src, ptr %dst) { 1330; CHECK-LV-LABEL: vld3_v16f16: 1331; CHECK-LV: @ %bb.0: @ %entry 1332; CHECK-LV-NEXT: .vsave {d8, d9} 1333; CHECK-LV-NEXT: vpush {d8, d9} 1334; CHECK-LV-NEXT: vldrw.u32 q0, [r0, #48] 1335; CHECK-LV-NEXT: vldrw.u32 q2, [r0, #64] 1336; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #80] 1337; CHECK-LV-NEXT: vmovx.f16 s6, s2 1338; CHECK-LV-NEXT: vmov.f32 s4, s1 1339; CHECK-LV-NEXT: vins.f16 s4, s6 1340; CHECK-LV-NEXT: vmovx.f16 s6, s9 1341; CHECK-LV-NEXT: vmov.f32 s5, s8 1342; CHECK-LV-NEXT: vmovx.f16 s7, s12 1343; CHECK-LV-NEXT: vins.f16 s5, s6 1344; CHECK-LV-NEXT: vmov.f32 s6, s11 1345; 
CHECK-LV-NEXT: vins.f16 s6, s7 1346; CHECK-LV-NEXT: vmovx.f16 s16, s15 1347; CHECK-LV-NEXT: vmov.f32 s7, s14 1348; CHECK-LV-NEXT: vmovx.f16 s17, s3 1349; CHECK-LV-NEXT: vins.f16 s7, s16 1350; CHECK-LV-NEXT: vmovx.f16 s16, s0 1351; CHECK-LV-NEXT: vins.f16 s16, s2 1352; CHECK-LV-NEXT: vmovx.f16 s2, s1 1353; CHECK-LV-NEXT: vins.f16 s0, s2 1354; CHECK-LV-NEXT: vmovx.f16 s2, s8 1355; CHECK-LV-NEXT: vins.f16 s3, s2 1356; CHECK-LV-NEXT: vmovx.f16 s2, s11 1357; CHECK-LV-NEXT: vmovx.f16 s18, s10 1358; CHECK-LV-NEXT: vins.f16 s10, s2 1359; CHECK-LV-NEXT: vmovx.f16 s2, s14 1360; CHECK-LV-NEXT: vmovx.f16 s19, s13 1361; CHECK-LV-NEXT: vins.f16 s13, s2 1362; CHECK-LV-NEXT: vmov.f32 s1, s3 1363; CHECK-LV-NEXT: vins.f16 s18, s12 1364; CHECK-LV-NEXT: vins.f16 s19, s15 1365; CHECK-LV-NEXT: vmov.f32 s3, s13 1366; CHECK-LV-NEXT: vins.f16 s17, s9 1367; CHECK-LV-NEXT: vmov.f32 s2, s10 1368; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #16] 1369; CHECK-LV-NEXT: vadd.f16 q0, q0, q4 1370; CHECK-LV-NEXT: vadd.f16 q2, q0, q1 1371; CHECK-LV-NEXT: vldrw.u32 q0, [r0] 1372; CHECK-LV-NEXT: vldrw.u32 q1, [r0, #32] 1373; CHECK-LV-NEXT: vstrw.32 q2, [r1, #16] 1374; CHECK-LV-NEXT: vmovx.f16 s10, s2 1375; CHECK-LV-NEXT: vmov.f32 s8, s1 1376; CHECK-LV-NEXT: vins.f16 s8, s10 1377; CHECK-LV-NEXT: vmovx.f16 s10, s13 1378; CHECK-LV-NEXT: vmov.f32 s9, s12 1379; CHECK-LV-NEXT: vmovx.f16 s11, s4 1380; CHECK-LV-NEXT: vins.f16 s9, s10 1381; CHECK-LV-NEXT: vmov.f32 s10, s15 1382; CHECK-LV-NEXT: vins.f16 s10, s11 1383; CHECK-LV-NEXT: vmovx.f16 s16, s7 1384; CHECK-LV-NEXT: vmov.f32 s11, s6 1385; CHECK-LV-NEXT: vmovx.f16 s17, s3 1386; CHECK-LV-NEXT: vins.f16 s11, s16 1387; CHECK-LV-NEXT: vmovx.f16 s16, s0 1388; CHECK-LV-NEXT: vins.f16 s16, s2 1389; CHECK-LV-NEXT: vmovx.f16 s2, s1 1390; CHECK-LV-NEXT: vins.f16 s0, s2 1391; CHECK-LV-NEXT: vmovx.f16 s2, s12 1392; CHECK-LV-NEXT: vins.f16 s3, s2 1393; CHECK-LV-NEXT: vmovx.f16 s2, s15 1394; CHECK-LV-NEXT: vmovx.f16 s18, s14 1395; CHECK-LV-NEXT: vins.f16 s14, s2 1396; CHECK-LV-NEXT: 
vmovx.f16 s2, s6 1397; CHECK-LV-NEXT: vmovx.f16 s19, s5 1398; CHECK-LV-NEXT: vins.f16 s5, s2 1399; CHECK-LV-NEXT: vmov.f32 s1, s3 1400; CHECK-LV-NEXT: vins.f16 s18, s4 1401; CHECK-LV-NEXT: vins.f16 s19, s7 1402; CHECK-LV-NEXT: vins.f16 s17, s13 1403; CHECK-LV-NEXT: vmov.f32 s2, s14 1404; CHECK-LV-NEXT: vmov.f32 s3, s5 1405; CHECK-LV-NEXT: vadd.f16 q0, q0, q4 1406; CHECK-LV-NEXT: vadd.f16 q0, q0, q2 1407; CHECK-LV-NEXT: vstrw.32 q0, [r1] 1408; CHECK-LV-NEXT: vpop {d8, d9} 1409; CHECK-LV-NEXT: bx lr 1410; 1411; CHECK-LIS-LABEL: vld3_v16f16: 1412; CHECK-LIS: @ %bb.0: @ %entry 1413; CHECK-LIS-NEXT: .vsave {d8, d9} 1414; CHECK-LIS-NEXT: vpush {d8, d9} 1415; CHECK-LIS-NEXT: vldrw.u32 q0, [r0, #48] 1416; CHECK-LIS-NEXT: vldrw.u32 q2, [r0, #64] 1417; CHECK-LIS-NEXT: vldrw.u32 q3, [r0, #80] 1418; CHECK-LIS-NEXT: vmovx.f16 s6, s2 1419; CHECK-LIS-NEXT: vmov.f32 s4, s1 1420; CHECK-LIS-NEXT: vins.f16 s4, s6 1421; CHECK-LIS-NEXT: vmovx.f16 s6, s9 1422; CHECK-LIS-NEXT: vmov.f32 s5, s8 1423; CHECK-LIS-NEXT: vmovx.f16 s7, s12 1424; CHECK-LIS-NEXT: vins.f16 s5, s6 1425; CHECK-LIS-NEXT: vmov.f32 s6, s11 1426; CHECK-LIS-NEXT: vins.f16 s6, s7 1427; CHECK-LIS-NEXT: vmovx.f16 s16, s15 1428; CHECK-LIS-NEXT: vmov.f32 s7, s14 1429; CHECK-LIS-NEXT: vmovx.f16 s17, s3 1430; CHECK-LIS-NEXT: vins.f16 s7, s16 1431; CHECK-LIS-NEXT: vmovx.f16 s16, s0 1432; CHECK-LIS-NEXT: vins.f16 s16, s2 1433; CHECK-LIS-NEXT: vmovx.f16 s2, s1 1434; CHECK-LIS-NEXT: vins.f16 s0, s2 1435; CHECK-LIS-NEXT: vmovx.f16 s2, s8 1436; CHECK-LIS-NEXT: vins.f16 s3, s2 1437; CHECK-LIS-NEXT: vmovx.f16 s2, s11 1438; CHECK-LIS-NEXT: vmovx.f16 s18, s10 1439; CHECK-LIS-NEXT: vins.f16 s10, s2 1440; CHECK-LIS-NEXT: vmovx.f16 s2, s14 1441; CHECK-LIS-NEXT: vmovx.f16 s19, s13 1442; CHECK-LIS-NEXT: vins.f16 s13, s2 1443; CHECK-LIS-NEXT: vmov.f32 s1, s3 1444; CHECK-LIS-NEXT: vins.f16 s18, s12 1445; CHECK-LIS-NEXT: vins.f16 s19, s15 1446; CHECK-LIS-NEXT: vmov.f32 s3, s13 1447; CHECK-LIS-NEXT: vins.f16 s17, s9 1448; CHECK-LIS-NEXT: vmov.f32 
s2, s10 1449; CHECK-LIS-NEXT: vldrw.u32 q3, [r0, #16] 1450; CHECK-LIS-NEXT: vadd.f16 q0, q0, q4 1451; CHECK-LIS-NEXT: vldrw.u32 q2, [r0, #32] 1452; CHECK-LIS-NEXT: vadd.f16 q1, q0, q1 1453; CHECK-LIS-NEXT: vldrw.u32 q0, [r0] 1454; CHECK-LIS-NEXT: vstrw.32 q1, [r1, #16] 1455; CHECK-LIS-NEXT: vmov.f32 s5, s12 1456; CHECK-LIS-NEXT: vmovx.f16 s6, s2 1457; CHECK-LIS-NEXT: vmov.f32 s4, s1 1458; CHECK-LIS-NEXT: vins.f16 s4, s6 1459; CHECK-LIS-NEXT: vmovx.f16 s6, s13 1460; CHECK-LIS-NEXT: vins.f16 s5, s6 1461; CHECK-LIS-NEXT: vmov.f32 s6, s15 1462; CHECK-LIS-NEXT: vmovx.f16 s7, s8 1463; CHECK-LIS-NEXT: vmovx.f16 s16, s11 1464; CHECK-LIS-NEXT: vins.f16 s6, s7 1465; CHECK-LIS-NEXT: vmov.f32 s7, s10 1466; CHECK-LIS-NEXT: vins.f16 s7, s16 1467; CHECK-LIS-NEXT: vmovx.f16 s16, s0 1468; CHECK-LIS-NEXT: vins.f16 s16, s2 1469; CHECK-LIS-NEXT: vmovx.f16 s2, s1 1470; CHECK-LIS-NEXT: vins.f16 s0, s2 1471; CHECK-LIS-NEXT: vmovx.f16 s2, s12 1472; CHECK-LIS-NEXT: vmovx.f16 s17, s3 1473; CHECK-LIS-NEXT: vins.f16 s3, s2 1474; CHECK-LIS-NEXT: vmovx.f16 s2, s15 1475; CHECK-LIS-NEXT: vmovx.f16 s18, s14 1476; CHECK-LIS-NEXT: vins.f16 s14, s2 1477; CHECK-LIS-NEXT: vmovx.f16 s2, s10 1478; CHECK-LIS-NEXT: vmovx.f16 s19, s9 1479; CHECK-LIS-NEXT: vins.f16 s9, s2 1480; CHECK-LIS-NEXT: vmov.f32 s1, s3 1481; CHECK-LIS-NEXT: vins.f16 s18, s8 1482; CHECK-LIS-NEXT: vins.f16 s19, s11 1483; CHECK-LIS-NEXT: vins.f16 s17, s13 1484; CHECK-LIS-NEXT: vmov.f32 s2, s14 1485; CHECK-LIS-NEXT: vmov.f32 s3, s9 1486; CHECK-LIS-NEXT: vadd.f16 q0, q0, q4 1487; CHECK-LIS-NEXT: vadd.f16 q0, q0, q1 1488; CHECK-LIS-NEXT: vstrw.32 q0, [r1] 1489; CHECK-LIS-NEXT: vpop {d8, d9} 1490; CHECK-LIS-NEXT: bx lr 1491entry: 1492 %l1 = load <48 x half>, ptr %src, align 4 1493 %s1 = shufflevector <48 x half> %l1, <48 x half> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45> 1494 %s2 = shufflevector <48 x half> %l1, <48 x half> undef, <16 x i32> 
<i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46> 1495 %s3 = shufflevector <48 x half> %l1, <48 x half> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47> 1496 %a1 = fadd <16 x half> %s1, %s2 1497 %a = fadd <16 x half> %a1, %s3 1498 store <16 x half> %a, ptr %dst 1499 ret void 1500} 1501 1502; f64 1503 1504define void @vld3_v2f64(ptr %src, ptr %dst) { 1505; CHECK-LABEL: vld3_v2f64: 1506; CHECK: @ %bb.0: @ %entry 1507; CHECK-NEXT: vldrw.u32 q0, [r0, #32] 1508; CHECK-NEXT: vldrw.u32 q1, [r0, #16] 1509; CHECK-NEXT: vldrw.u32 q2, [r0] 1510; CHECK-NEXT: vadd.f64 d0, d3, d0 1511; CHECK-NEXT: vadd.f64 d3, d4, d5 1512; CHECK-NEXT: vadd.f64 d1, d0, d1 1513; CHECK-NEXT: vadd.f64 d0, d3, d2 1514; CHECK-NEXT: vstrw.32 q0, [r1] 1515; CHECK-NEXT: bx lr 1516entry: 1517 %l1 = load <6 x double>, ptr %src, align 4 1518 %s1 = shufflevector <6 x double> %l1, <6 x double> undef, <2 x i32> <i32 0, i32 3> 1519 %s2 = shufflevector <6 x double> %l1, <6 x double> undef, <2 x i32> <i32 1, i32 4> 1520 %s3 = shufflevector <6 x double> %l1, <6 x double> undef, <2 x i32> <i32 2, i32 5> 1521 %a1 = fadd <2 x double> %s1, %s2 1522 %a = fadd <2 x double> %a1, %s3 1523 store <2 x double> %a, ptr %dst 1524 ret void 1525} 1526 1527define void @vld3_v4f64(ptr %src, ptr %dst) { 1528; CHECK-LABEL: vld3_v4f64: 1529; CHECK: @ %bb.0: @ %entry 1530; CHECK-NEXT: .vsave {d8, d9} 1531; CHECK-NEXT: vpush {d8, d9} 1532; CHECK-NEXT: vldrw.u32 q1, [r0, #80] 1533; CHECK-NEXT: vldrw.u32 q0, [r0, #64] 1534; CHECK-NEXT: vldrw.u32 q2, [r0, #48] 1535; CHECK-NEXT: vldrw.u32 q3, [r0, #16] 1536; CHECK-NEXT: vadd.f64 d1, d1, d2 1537; CHECK-NEXT: vldrw.u32 q4, [r0] 1538; CHECK-NEXT: vadd.f64 d2, d4, d5 1539; CHECK-NEXT: vldrw.u32 q2, [r0, #32] 1540; CHECK-NEXT: vadd.f64 d4, d7, d4 1541; CHECK-NEXT: vadd.f64 d7, d8, d9 1542; CHECK-NEXT: vadd.f64 d1, d1, d3 1543; 
CHECK-NEXT: vadd.f64 d0, d2, d0 1544; CHECK-NEXT: vadd.f64 d3, d4, d5 1545; CHECK-NEXT: vstrw.32 q0, [r1, #16] 1546; CHECK-NEXT: vadd.f64 d2, d7, d6 1547; CHECK-NEXT: vstrw.32 q1, [r1] 1548; CHECK-NEXT: vpop {d8, d9} 1549; CHECK-NEXT: bx lr 1550entry: 1551 %l1 = load <12 x double>, ptr %src, align 4 1552 %s1 = shufflevector <12 x double> %l1, <12 x double> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9> 1553 %s2 = shufflevector <12 x double> %l1, <12 x double> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10> 1554 %s3 = shufflevector <12 x double> %l1, <12 x double> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11> 1555 %a1 = fadd <4 x double> %s1, %s2 1556 %a = fadd <4 x double> %a1, %s3 1557 store <4 x double> %a, ptr %dst 1558 ret void 1559} 1560