1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp,+fp64 -verify-machineinstrs %s -o - | FileCheck %s 3 4; i32 5 6define void @vst3_v2i32(ptr %src, ptr %dst) { 7; CHECK-LABEL: vst3_v2i32: 8; CHECK: @ %bb.0: @ %entry 9; CHECK-NEXT: .save {r4, lr} 10; CHECK-NEXT: push {r4, lr} 11; CHECK-NEXT: ldrd lr, r12, [r0] 12; CHECK-NEXT: ldrd r3, r2, [r0, #8] 13; CHECK-NEXT: ldrd r4, r0, [r0, #16] 14; CHECK-NEXT: vmov q1[2], q1[0], lr, r3 15; CHECK-NEXT: str r2, [r1, #16] 16; CHECK-NEXT: vmov.32 q0[0], r4 17; CHECK-NEXT: vmov q1[3], q1[1], r12, r2 18; CHECK-NEXT: vmov.32 q0[1], r0 19; CHECK-NEXT: vmov.f32 s8, s4 20; CHECK-NEXT: vmov.f32 s9, s6 21; CHECK-NEXT: str r0, [r1, #20] 22; CHECK-NEXT: vmov.f32 s10, s0 23; CHECK-NEXT: vmov.f32 s11, s5 24; CHECK-NEXT: vstrw.32 q2, [r1] 25; CHECK-NEXT: pop {r4, pc} 26entry: 27 %l1 = load <2 x i32>, ptr %src, align 4 28 %s2 = getelementptr <2 x i32>, ptr %src, i32 1 29 %l2 = load <2 x i32>, ptr %s2, align 4 30 %s3 = getelementptr <2 x i32>, ptr %src, i32 2 31 %l3 = load <2 x i32>, ptr %s3, align 4 32 %t1 = shufflevector <2 x i32> %l1, <2 x i32> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 33 %t2 = shufflevector <2 x i32> %l3, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> 34 %s = shufflevector <4 x i32> %t1, <4 x i32> %t2, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5> 35 store <6 x i32> %s, ptr %dst 36 ret void 37} 38 39define void @vst3_v4i32(ptr %src, ptr %dst) { 40; CHECK-LABEL: vst3_v4i32: 41; CHECK: @ %bb.0: @ %entry 42; CHECK-NEXT: .vsave {d8, d9} 43; CHECK-NEXT: vpush {d8, d9} 44; CHECK-NEXT: vldrw.u32 q2, [r0] 45; CHECK-NEXT: vldrw.u32 q3, [r0, #16] 46; CHECK-NEXT: vldrw.u32 q0, [r0, #32] 47; CHECK-NEXT: vmov.f32 s4, s8 48; CHECK-NEXT: vmov r0, r2, d0 49; CHECK-NEXT: vmov.f32 s5, s12 50; CHECK-NEXT: vmov.f32 s7, s9 51; CHECK-NEXT: vmov.f32 s16, s13 52; CHECK-NEXT: vmov.32 q1[2], r0 53; CHECK-NEXT: vmov.f32 s18, s10 
54; CHECK-NEXT: vstrw.32 q1, [r1] 55; CHECK-NEXT: vmov.f32 s19, s14 56; CHECK-NEXT: vmov.f32 s0, s2 57; CHECK-NEXT: vmov.32 q4[1], r2 58; CHECK-NEXT: vmov.f32 s1, s11 59; CHECK-NEXT: vstrw.32 q4, [r1, #16] 60; CHECK-NEXT: vmov.f32 s2, s15 61; CHECK-NEXT: vstrw.32 q0, [r1, #32] 62; CHECK-NEXT: vpop {d8, d9} 63; CHECK-NEXT: bx lr 64entry: 65 %l1 = load <4 x i32>, ptr %src, align 4 66 %s2 = getelementptr <4 x i32>, ptr %src, i32 1 67 %l2 = load <4 x i32>, ptr %s2, align 4 68 %s3 = getelementptr <4 x i32>, ptr %src, i32 2 69 %l3 = load <4 x i32>, ptr %s3, align 4 70 %t1 = shufflevector <4 x i32> %l1, <4 x i32> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 71 %t2 = shufflevector <4 x i32> %l3, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> 72 %s = shufflevector <8 x i32> %t1, <8 x i32> %t2, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11> 73 store <12 x i32> %s, ptr %dst 74 ret void 75} 76 77define void @vst3_v8i32(ptr %src, ptr %dst) { 78; CHECK-LABEL: vst3_v8i32: 79; CHECK: @ %bb.0: @ %entry 80; CHECK-NEXT: .save {r7, lr} 81; CHECK-NEXT: push {r7, lr} 82; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} 83; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} 84; CHECK-NEXT: vldrw.u32 q3, [r0] 85; CHECK-NEXT: vldrw.u32 q7, [r0, #48] 86; CHECK-NEXT: vldrw.u32 q0, [r0, #80] 87; CHECK-NEXT: vldrw.u32 q2, [r0, #64] 88; CHECK-NEXT: vmov.f32 s4, s12 89; CHECK-NEXT: vldrw.u32 q4, [r0, #32] 90; CHECK-NEXT: vmov.f32 s12, s28 91; CHECK-NEXT: vldrw.u32 q6, [r0, #16] 92; CHECK-NEXT: vmov r2, lr, d0 93; CHECK-NEXT: vmov r12, r3, d4 94; CHECK-NEXT: vmov.f32 s0, s2 95; CHECK-NEXT: vmov.f32 s2, s31 96; CHECK-NEXT: vmov.f32 s20, s29 97; CHECK-NEXT: vmov.f32 s9, s15 98; CHECK-NEXT: vmov.f32 s29, s12 99; CHECK-NEXT: vmov.f32 s5, s16 100; CHECK-NEXT: vmov.f32 s7, s13 101; CHECK-NEXT: vmov.f32 s22, s26 102; CHECK-NEXT: vmov.32 q1[2], r12 103; 
CHECK-NEXT: vmov.f32 s23, s30 104; CHECK-NEXT: vstrw.32 q1, [r1] 105; CHECK-NEXT: vmov.f32 s28, s24 106; CHECK-NEXT: vmov.32 q5[1], lr 107; CHECK-NEXT: vmov.f32 s31, s25 108; CHECK-NEXT: vstrw.32 q5, [r1, #64] 109; CHECK-NEXT: vmov.f32 s12, s17 110; CHECK-NEXT: vmov.32 q7[2], r2 111; CHECK-NEXT: vmov.f32 s15, s18 112; CHECK-NEXT: vstrw.32 q7, [r1, #48] 113; CHECK-NEXT: vmov.f32 s1, s27 114; CHECK-NEXT: vmov.32 q3[1], r3 115; CHECK-NEXT: vmov.f32 s8, s10 116; CHECK-NEXT: vstrw.32 q3, [r1, #16] 117; CHECK-NEXT: vmov.f32 s10, s19 118; CHECK-NEXT: vstrw.32 q0, [r1, #80] 119; CHECK-NEXT: vstrw.32 q2, [r1, #32] 120; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} 121; CHECK-NEXT: pop {r7, pc} 122entry: 123 %l1 = load <8 x i32>, ptr %src, align 4 124 %s2 = getelementptr <8 x i32>, ptr %src, i32 1 125 %l2 = load <8 x i32>, ptr %s2, align 4 126 %s3 = getelementptr <8 x i32>, ptr %src, i32 2 127 %l3 = load <8 x i32>, ptr %s3, align 4 128 %t1 = shufflevector <8 x i32> %l1, <8 x i32> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 129 %t2 = shufflevector <8 x i32> %l3, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 130 %s = shufflevector <16 x i32> %t1, <16 x i32> %t2, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23> 131 store <24 x i32> %s, ptr %dst 132 ret void 133} 134 135define void @vst3_v16i32(ptr %src, ptr %dst) { 136; CHECK-LABEL: vst3_v16i32: 137; CHECK: @ %bb.0: @ %entry 138; CHECK-NEXT: .save {r4, lr} 139; CHECK-NEXT: push {r4, lr} 140; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} 141; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} 142; CHECK-NEXT: .pad #160 143; CHECK-NEXT: sub sp, 
#160 144; CHECK-NEXT: vldrw.u32 q3, [r0, #160] 145; CHECK-NEXT: vldrw.u32 q0, [r0, #64] 146; CHECK-NEXT: vldrw.u32 q5, [r0, #128] 147; CHECK-NEXT: vldrw.u32 q1, [r0] 148; CHECK-NEXT: vstrw.32 q3, [sp, #144] @ 16-byte Spill 149; CHECK-NEXT: vldrw.u32 q3, [r0, #144] 150; CHECK-NEXT: vmov r12, r3, d10 151; CHECK-NEXT: vldrw.u32 q7, [r0, #176] 152; CHECK-NEXT: vstrw.32 q3, [sp, #80] @ 16-byte Spill 153; CHECK-NEXT: vldrw.u32 q3, [r0, #96] 154; CHECK-NEXT: vldrw.u32 q6, [r0, #32] 155; CHECK-NEXT: vmov.f32 s8, s1 156; CHECK-NEXT: vstrw.32 q3, [sp, #16] @ 16-byte Spill 157; CHECK-NEXT: vldrw.u32 q3, [r0, #80] 158; CHECK-NEXT: vmov.f32 s10, s6 159; CHECK-NEXT: vldrw.u32 q4, [r0, #112] 160; CHECK-NEXT: vmov.f32 s11, s2 161; CHECK-NEXT: vstrw.32 q3, [sp, #64] @ 16-byte Spill 162; CHECK-NEXT: vldrw.u32 q3, [r0, #48] 163; CHECK-NEXT: vmov.32 q2[1], r3 164; CHECK-NEXT: vstrw.32 q6, [sp] @ 16-byte Spill 165; CHECK-NEXT: vldrw.u32 q6, [r0, #16] 166; CHECK-NEXT: vstrw.32 q2, [r1, #16] 167; CHECK-NEXT: vmov.f32 s20, s22 168; CHECK-NEXT: vmov.f32 s22, s3 169; CHECK-NEXT: vstrw.32 q7, [sp, #48] @ 16-byte Spill 170; CHECK-NEXT: vmov.f32 s9, s0 171; CHECK-NEXT: vmov.f32 s0, s30 172; CHECK-NEXT: vmov.f32 s1, s15 173; CHECK-NEXT: vmov.f32 s2, s19 174; CHECK-NEXT: vmov.f32 s3, s31 175; CHECK-NEXT: vstrw.32 q0, [sp, #112] @ 16-byte Spill 176; CHECK-NEXT: vmov.f32 s8, s4 177; CHECK-NEXT: vmov.f32 s11, s5 178; CHECK-NEXT: vmov.f32 s0, s17 179; CHECK-NEXT: vstrw.32 q2, [sp, #128] @ 16-byte Spill 180; CHECK-NEXT: vmov.f32 s2, s14 181; CHECK-NEXT: vldrw.u32 q2, [sp, #16] @ 16-byte Reload 182; CHECK-NEXT: vmov.f32 s3, s18 183; CHECK-NEXT: vmov.f32 s21, s7 184; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload 185; CHECK-NEXT: vstrw.32 q0, [sp, #96] @ 16-byte Spill 186; CHECK-NEXT: vldrw.u32 q0, [sp, #144] @ 16-byte Reload 187; CHECK-NEXT: vstrw.32 q5, [r1, #32] 188; CHECK-NEXT: vmov.f32 s21, s7 189; CHECK-NEXT: vmov.f32 s20, s2 190; CHECK-NEXT: vmov.f32 s23, s3 191; CHECK-NEXT: vldrw.u32 q0, [sp, 
#64] @ 16-byte Reload 192; CHECK-NEXT: vmov.f32 s22, s11 193; CHECK-NEXT: vstrw.32 q5, [sp, #32] @ 16-byte Spill 194; CHECK-NEXT: vmov.f32 s21, s16 195; CHECK-NEXT: vmov.f32 s23, s13 196; CHECK-NEXT: vmov.f32 s16, s9 197; CHECK-NEXT: vmov.f32 s19, s10 198; CHECK-NEXT: vmov.f32 s13, s8 199; CHECK-NEXT: vldrw.u32 q2, [sp, #80] @ 16-byte Reload 200; CHECK-NEXT: vmov.f32 s18, s6 201; CHECK-NEXT: vmov.f64 d14, d4 202; CHECK-NEXT: vmov.f32 s15, s5 203; CHECK-NEXT: vmov.f32 s5, s27 204; CHECK-NEXT: vmov.f32 s8, s24 205; CHECK-NEXT: vmov.f32 s6, s3 206; CHECK-NEXT: vmov.f32 s9, s0 207; CHECK-NEXT: vmov.f32 s24, s1 208; CHECK-NEXT: vmov.f32 s27, s2 209; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload 210; CHECK-NEXT: vmov r0, r3, d14 211; CHECK-NEXT: vldrw.u32 q7, [sp, #48] @ 16-byte Reload 212; CHECK-NEXT: vmov.f32 s7, s11 213; CHECK-NEXT: vstrw.32 q0, [r1, #128] 214; CHECK-NEXT: vmov.f32 s11, s25 215; CHECK-NEXT: vldrw.u32 q0, [sp, #96] @ 16-byte Reload 216; CHECK-NEXT: vmov.f32 s20, s12 217; CHECK-NEXT: vmov.32 q6[1], r3 218; CHECK-NEXT: vmov.f32 s12, s4 219; CHECK-NEXT: vstrw.32 q6, [r1, #64] 220; CHECK-NEXT: vmov.f32 s4, s10 221; CHECK-NEXT: vmov.32 q2[2], r0 222; CHECK-NEXT: vmov r0, lr, d14 223; CHECK-NEXT: vldrw.u32 q7, [sp, #144] @ 16-byte Reload 224; CHECK-NEXT: vmov.32 q0[1], lr 225; CHECK-NEXT: vmov.32 q5[2], r0 226; CHECK-NEXT: vstrw.32 q0, [r1, #160] 227; CHECK-NEXT: vldrw.u32 q0, [sp, #112] @ 16-byte Reload 228; CHECK-NEXT: vmov r2, r4, d14 229; CHECK-NEXT: vstrw.32 q2, [r1, #48] 230; CHECK-NEXT: vstrw.32 q0, [r1, #176] 231; CHECK-NEXT: vldrw.u32 q0, [sp, #128] @ 16-byte Reload 232; CHECK-NEXT: vmov.32 q3[2], r2 233; CHECK-NEXT: vmov.32 q4[1], r4 234; CHECK-NEXT: vmov.32 q0[2], r12 235; CHECK-NEXT: vstrw.32 q1, [r1, #80] 236; CHECK-NEXT: vstrw.32 q3, [r1, #96] 237; CHECK-NEXT: vstrw.32 q4, [r1, #112] 238; CHECK-NEXT: vstrw.32 q5, [r1, #144] 239; CHECK-NEXT: vstrw.32 q0, [r1] 240; CHECK-NEXT: add sp, #160 241; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, 
d14, d15} 242; CHECK-NEXT: pop {r4, pc} 243entry: 244 %l1 = load <16 x i32>, ptr %src, align 4 245 %s2 = getelementptr <16 x i32>, ptr %src, i32 1 246 %l2 = load <16 x i32>, ptr %s2, align 4 247 %s3 = getelementptr <16 x i32>, ptr %src, i32 2 248 %l3 = load <16 x i32>, ptr %s3, align 4 249 %t1 = shufflevector <16 x i32> %l1, <16 x i32> %l2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 250 %t2 = shufflevector <16 x i32> %l3, <16 x i32> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 251 %s = shufflevector <32 x i32> %t1, <32 x i32> %t2, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47> 252 store <48 x i32> %s, ptr %dst 253 ret void 254} 255 256; i16 257 258define void @vst3_v2i16(ptr %src, ptr %dst) { 259; CHECK-LABEL: vst3_v2i16: 260; CHECK: @ %bb.0: @ %entry 261; CHECK-NEXT: .save {r4, lr} 262; CHECK-NEXT: push {r4, lr} 263; CHECK-NEXT: ldrh r2, [r0, #10] 264; CHECK-NEXT: ldrh r4, [r0, #8] 265; CHECK-NEXT: ldrh.w r12, [r0, #2] 266; CHECK-NEXT: ldrh.w lr, [r0] 267; CHECK-NEXT: vmov q0[2], q0[0], r4, r2 268; CHECK-NEXT: ldrh r3, [r0, #6] 269; CHECK-NEXT: ldrh r0, [r0, #4] 270; CHECK-NEXT: vmov q1[2], q1[0], r0, r3 271; CHECK-NEXT: vmov q2, q1 272; CHECK-NEXT: 
vmovnt.i32 q2, q0 273; CHECK-NEXT: vmov q0[2], q0[0], lr, r12 274; CHECK-NEXT: vmov r0, s10 275; CHECK-NEXT: vmov.f32 s1, s4 276; CHECK-NEXT: vmov.f32 s3, s2 277; CHECK-NEXT: vmov.32 q0[2], r4 278; CHECK-NEXT: vstrh.32 q0, [r1] 279; CHECK-NEXT: str r0, [r1, #8] 280; CHECK-NEXT: pop {r4, pc} 281entry: 282 %l1 = load <2 x i16>, ptr %src, align 4 283 %s2 = getelementptr <2 x i16>, ptr %src, i32 1 284 %l2 = load <2 x i16>, ptr %s2, align 4 285 %s3 = getelementptr <2 x i16>, ptr %src, i32 2 286 %l3 = load <2 x i16>, ptr %s3, align 4 287 %t1 = shufflevector <2 x i16> %l1, <2 x i16> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 288 %t2 = shufflevector <2 x i16> %l3, <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> 289 %s = shufflevector <4 x i16> %t1, <4 x i16> %t2, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5> 290 store <6 x i16> %s, ptr %dst 291 ret void 292} 293 294define void @vst3_v4i16(ptr %src, ptr %dst) { 295; CHECK-LABEL: vst3_v4i16: 296; CHECK: @ %bb.0: @ %entry 297; CHECK-NEXT: .save {r4, r5, r7, lr} 298; CHECK-NEXT: push {r4, r5, r7, lr} 299; CHECK-NEXT: vldrh.u32 q1, [r0] 300; CHECK-NEXT: vldrh.u32 q0, [r0, #8] 301; CHECK-NEXT: vldrh.u32 q2, [r0, #16] 302; CHECK-NEXT: vmov r0, r5, d2 303; CHECK-NEXT: vmov.f32 s5, s7 304; CHECK-NEXT: vmov r2, r3, d0 305; CHECK-NEXT: vmov lr, r4, d1 306; CHECK-NEXT: vmov.16 q0[0], r0 307; CHECK-NEXT: vmov.f32 s4, s10 308; CHECK-NEXT: vmov.16 q0[1], r2 309; CHECK-NEXT: vmov.f32 s7, s11 310; CHECK-NEXT: vmov r12, s6 311; CHECK-NEXT: vmov.32 q1[2], r4 312; CHECK-NEXT: vmov r0, r4, d4 313; CHECK-NEXT: vstrh.32 q1, [r1, #16] 314; CHECK-NEXT: vmov.16 q0[2], r0 315; CHECK-NEXT: vmov.16 q0[3], r5 316; CHECK-NEXT: vmov.16 q0[4], r3 317; CHECK-NEXT: vmov.16 q0[5], r4 318; CHECK-NEXT: vmov.16 q0[6], r12 319; CHECK-NEXT: vmov.16 q0[7], lr 320; CHECK-NEXT: vstrw.32 q0, [r1] 321; CHECK-NEXT: pop {r4, r5, r7, pc} 322entry: 323 %l1 = load <4 x i16>, ptr %src, align 4 324 %s2 = getelementptr <4 x i16>, ptr %src, i32 1 325 
%l2 = load <4 x i16>, ptr %s2, align 4 326 %s3 = getelementptr <4 x i16>, ptr %src, i32 2 327 %l3 = load <4 x i16>, ptr %s3, align 4 328 %t1 = shufflevector <4 x i16> %l1, <4 x i16> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 329 %t2 = shufflevector <4 x i16> %l3, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> 330 %s = shufflevector <8 x i16> %t1, <8 x i16> %t2, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11> 331 store <12 x i16> %s, ptr %dst 332 ret void 333} 334 335define void @vst3_v8i16(ptr %src, ptr %dst) { 336; CHECK-LABEL: vst3_v8i16: 337; CHECK: @ %bb.0: @ %entry 338; CHECK-NEXT: .vsave {d8, d9, d10, d11} 339; CHECK-NEXT: vpush {d8, d9, d10, d11} 340; CHECK-NEXT: vldrw.u32 q1, [r0] 341; CHECK-NEXT: vldrw.u32 q2, [r0, #16] 342; CHECK-NEXT: vmov.f32 s12, s7 343; CHECK-NEXT: vmov.u16 r2, q2[5] 344; CHECK-NEXT: vmov.16 q0[0], r2 345; CHECK-NEXT: vins.f16 s12, s11 346; CHECK-NEXT: vmov.f32 s1, s12 347; CHECK-NEXT: vmov.u16 r2, q2[7] 348; CHECK-NEXT: vldrw.u32 q3, [r0, #32] 349; CHECK-NEXT: vmov.16 q0[6], r2 350; CHECK-NEXT: vmov.f32 s2, s7 351; CHECK-NEXT: vmov.u16 r0, q2[3] 352; CHECK-NEXT: vmovx.f16 s7, s14 353; CHECK-NEXT: vmov.16 q4[2], r0 354; CHECK-NEXT: vins.f16 s0, s7 355; CHECK-NEXT: vmovx.f16 s7, s15 356; CHECK-NEXT: vins.f16 s3, s7 357; CHECK-NEXT: vmov.f32 s7, s6 358; CHECK-NEXT: vmovx.f16 s2, s2 359; CHECK-NEXT: vins.f16 s7, s10 360; CHECK-NEXT: vmov.f32 s20, s4 361; CHECK-NEXT: vins.f16 s15, s2 362; CHECK-NEXT: vmov.f32 s18, s7 363; CHECK-NEXT: vins.f16 s20, s8 364; CHECK-NEXT: vmov.f32 s7, s6 365; CHECK-NEXT: vmovx.f16 s6, s5 366; CHECK-NEXT: vmov.f32 s2, s15 367; CHECK-NEXT: vmovx.f16 s15, s13 368; CHECK-NEXT: vins.f16 s13, s6 369; CHECK-NEXT: vmovx.f16 s6, s7 370; CHECK-NEXT: vmov.u16 r0, q2[1] 371; CHECK-NEXT: vmovx.f16 s4, s4 372; CHECK-NEXT: vins.f16 s14, s6 373; CHECK-NEXT: vmovx.f16 s6, s12 374; CHECK-NEXT: 
vmov.16 q5[4], r0 375; CHECK-NEXT: vins.f16 s5, s9 376; CHECK-NEXT: vins.f16 s12, s4 377; CHECK-NEXT: vins.f16 s17, s15 378; CHECK-NEXT: vmov.f32 s16, s13 379; CHECK-NEXT: vins.f16 s22, s6 380; CHECK-NEXT: vmov.f32 s19, s14 381; CHECK-NEXT: vstrw.32 q0, [r1, #32] 382; CHECK-NEXT: vmov.f32 s23, s5 383; CHECK-NEXT: vstrw.32 q4, [r1, #16] 384; CHECK-NEXT: vmov.f32 s21, s12 385; CHECK-NEXT: vstrw.32 q5, [r1] 386; CHECK-NEXT: vpop {d8, d9, d10, d11} 387; CHECK-NEXT: bx lr 388entry: 389 %l1 = load <8 x i16>, ptr %src, align 4 390 %s2 = getelementptr <8 x i16>, ptr %src, i32 1 391 %l2 = load <8 x i16>, ptr %s2, align 4 392 %s3 = getelementptr <8 x i16>, ptr %src, i32 2 393 %l3 = load <8 x i16>, ptr %s3, align 4 394 %t1 = shufflevector <8 x i16> %l1, <8 x i16> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 395 %t2 = shufflevector <8 x i16> %l3, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 396 %s = shufflevector <16 x i16> %t1, <16 x i16> %t2, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23> 397 store <24 x i16> %s, ptr %dst 398 ret void 399} 400 401define void @vst3_v16i16(ptr %src, ptr %dst) { 402; CHECK-LABEL: vst3_v16i16: 403; CHECK: @ %bb.0: @ %entry 404; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} 405; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} 406; CHECK-NEXT: .pad #48 407; CHECK-NEXT: sub sp, #48 408; CHECK-NEXT: vldrw.u32 q2, [r0] 409; CHECK-NEXT: vldrw.u32 q1, [r0, #32] 410; CHECK-NEXT: vldrw.u32 q7, [r0, #80] 411; CHECK-NEXT: vmov.f32 s0, s11 412; CHECK-NEXT: vmov.u16 r2, q1[5] 413; CHECK-NEXT: vmov.16 q3[0], r2 414; CHECK-NEXT: vins.f16 s0, s7 415; CHECK-NEXT: vmov.f32 s2, s11 
416; CHECK-NEXT: vmov.u16 r2, q1[7] 417; CHECK-NEXT: vmov.f64 d12, d4 418; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill 419; CHECK-NEXT: vldrw.u32 q1, [r0, #16] 420; CHECK-NEXT: vmov.f32 s26, s10 421; CHECK-NEXT: vldrw.u32 q2, [r0, #64] 422; CHECK-NEXT: vmov.f32 s13, s0 423; CHECK-NEXT: vstrw.32 q6, [sp] @ 16-byte Spill 424; CHECK-NEXT: vmov.16 q3[6], r2 425; CHECK-NEXT: vmovx.f16 s0, s10 426; CHECK-NEXT: vins.f16 s12, s0 427; CHECK-NEXT: vmovx.f16 s0, s2 428; CHECK-NEXT: vmov.f32 s14, s11 429; CHECK-NEXT: vins.f16 s14, s0 430; CHECK-NEXT: vmov.f32 s20, s7 431; CHECK-NEXT: vmov q0, q3 432; CHECK-NEXT: vldrw.u32 q3, [r0, #48] 433; CHECK-NEXT: vmov.u16 r2, q3[5] 434; CHECK-NEXT: vins.f16 s20, s15 435; CHECK-NEXT: vmov.16 q4[0], r2 436; CHECK-NEXT: vmov.u16 r2, q3[7] 437; CHECK-NEXT: vmov.f32 s17, s20 438; CHECK-NEXT: vmovx.f16 s20, s31 439; CHECK-NEXT: vmov.16 q4[6], r2 440; CHECK-NEXT: vmov.f32 s18, s7 441; CHECK-NEXT: vmovx.f16 s7, s30 442; CHECK-NEXT: vins.f16 s16, s7 443; CHECK-NEXT: vmovx.f16 s7, s18 444; CHECK-NEXT: vins.f16 s31, s7 445; CHECK-NEXT: vmovx.f16 s7, s11 446; CHECK-NEXT: vins.f16 s3, s7 447; CHECK-NEXT: vins.f16 s19, s20 448; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill 449; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload 450; CHECK-NEXT: vmov.f32 s20, s24 451; CHECK-NEXT: vmovx.f16 s11, s8 452; CHECK-NEXT: vmov.f32 s7, s25 453; CHECK-NEXT: vins.f16 s20, s0 454; CHECK-NEXT: vmov.u16 r0, q0[1] 455; CHECK-NEXT: vins.f16 s7, s1 456; CHECK-NEXT: vmov.16 q5[4], r0 457; CHECK-NEXT: vmov.u16 r0, q3[1] 458; CHECK-NEXT: vmov.f32 s23, s7 459; CHECK-NEXT: vmovx.f16 s7, s24 460; CHECK-NEXT: vmov.f32 s24, s4 461; CHECK-NEXT: vins.f16 s8, s7 462; CHECK-NEXT: vins.f16 s24, s12 463; CHECK-NEXT: vmov.f32 s21, s8 464; CHECK-NEXT: vmov.f32 s8, s5 465; CHECK-NEXT: vmov.16 q6[4], r0 466; CHECK-NEXT: vins.f16 s8, s13 467; CHECK-NEXT: vmovx.f16 s4, s4 468; CHECK-NEXT: vmov.f32 s27, s8 469; CHECK-NEXT: vmovx.f16 s8, s28 470; CHECK-NEXT: vins.f16 s28, s4 
471; CHECK-NEXT: vmov.f32 s4, s6 472; CHECK-NEXT: vmov.u16 r0, q3[3] 473; CHECK-NEXT: vins.f16 s4, s14 474; CHECK-NEXT: vmov.16 q0[2], r0 475; CHECK-NEXT: vins.f16 s26, s8 476; CHECK-NEXT: vmov.f32 s2, s4 477; CHECK-NEXT: vmovx.f16 s4, s29 478; CHECK-NEXT: vins.f16 s1, s4 479; CHECK-NEXT: vmovx.f16 s4, s6 480; CHECK-NEXT: vmovx.f16 s0, s5 481; CHECK-NEXT: vins.f16 s30, s4 482; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload 483; CHECK-NEXT: vins.f16 s29, s0 484; CHECK-NEXT: vmov.f32 s0, s29 485; CHECK-NEXT: vins.f16 s22, s11 486; CHECK-NEXT: vmov.f32 s3, s30 487; CHECK-NEXT: vstrw.32 q5, [r1] 488; CHECK-NEXT: vmov.f32 s29, s5 489; CHECK-NEXT: vstrw.32 q0, [r1, #64] 490; CHECK-NEXT: vmov.f32 s30, s6 491; CHECK-NEXT: vmov.f32 s8, s6 492; CHECK-NEXT: vldrw.u32 q1, [sp, #32] @ 16-byte Reload 493; CHECK-NEXT: vmov.f32 s18, s31 494; CHECK-NEXT: vmov.u16 r0, q1[3] 495; CHECK-NEXT: vins.f16 s8, s6 496; CHECK-NEXT: vmov.16 q1[2], r0 497; CHECK-NEXT: vmov.f32 s25, s28 498; CHECK-NEXT: vmov.f32 s6, s8 499; CHECK-NEXT: vmovx.f16 s8, s9 500; CHECK-NEXT: vmovx.f16 s4, s29 501; CHECK-NEXT: vins.f16 s5, s8 502; CHECK-NEXT: vmovx.f16 s8, s30 503; CHECK-NEXT: vins.f16 s9, s4 504; CHECK-NEXT: vins.f16 s10, s8 505; CHECK-NEXT: vmov.f32 s4, s9 506; CHECK-NEXT: vmov.f32 s7, s10 507; CHECK-NEXT: vstrw.32 q6, [r1, #48] 508; CHECK-NEXT: vstrw.32 q1, [r1, #16] 509; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload 510; CHECK-NEXT: vstrw.32 q4, [r1, #80] 511; CHECK-NEXT: vstrw.32 q1, [r1, #32] 512; CHECK-NEXT: add sp, #48 513; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} 514; CHECK-NEXT: bx lr 515entry: 516 %l1 = load <16 x i16>, ptr %src, align 4 517 %s2 = getelementptr <16 x i16>, ptr %src, i32 1 518 %l2 = load <16 x i16>, ptr %s2, align 4 519 %s3 = getelementptr <16 x i16>, ptr %src, i32 2 520 %l3 = load <16 x i16>, ptr %s3, align 4 521 %t1 = shufflevector <16 x i16> %l1, <16 x i16> %l2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 
10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 522 %t2 = shufflevector <16 x i16> %l3, <16 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 523 %s = shufflevector <32 x i16> %t1, <32 x i16> %t2, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47> 524 store <48 x i16> %s, ptr %dst 525 ret void 526} 527 528; i8 529 530define void @vst3_v2i8(ptr %src, ptr %dst) { 531; CHECK-LABEL: vst3_v2i8: 532; CHECK: @ %bb.0: @ %entry 533; CHECK-NEXT: .save {r4, r5, r7, lr} 534; CHECK-NEXT: push {r4, r5, r7, lr} 535; CHECK-NEXT: .pad #16 536; CHECK-NEXT: sub sp, #16 537; CHECK-NEXT: ldrb r2, [r0] 538; CHECK-NEXT: mov r5, sp 539; CHECK-NEXT: ldrb r3, [r0, #2] 540; CHECK-NEXT: vmov.16 q0[0], r2 541; CHECK-NEXT: ldrb.w r12, [r0, #1] 542; CHECK-NEXT: ldrb.w lr, [r0, #3] 543; CHECK-NEXT: vmov.16 q0[1], r3 544; CHECK-NEXT: ldrb r4, [r0, #5] 545; CHECK-NEXT: ldrb r0, [r0, #4] 546; CHECK-NEXT: vmov.16 q0[2], r0 547; CHECK-NEXT: add r0, sp, #8 548; CHECK-NEXT: vmov.16 q0[3], r12 549; CHECK-NEXT: vmov.16 q0[4], lr 550; CHECK-NEXT: vmov.16 q0[5], r4 551; CHECK-NEXT: vstrb.16 q0, [r5] 552; CHECK-NEXT: vstrb.16 q0, [r0] 553; CHECK-NEXT: vldrh.u32 q0, [r0] 554; CHECK-NEXT: ldr r2, [sp] 555; CHECK-NEXT: str r2, [r1] 556; CHECK-NEXT: vmov r0, s2 557; CHECK-NEXT: strh r0, [r1, #4] 
558; CHECK-NEXT: add sp, #16 559; CHECK-NEXT: pop {r4, r5, r7, pc} 560entry: 561 %l1 = load <2 x i8>, ptr %src, align 4 562 %s2 = getelementptr <2 x i8>, ptr %src, i32 1 563 %l2 = load <2 x i8>, ptr %s2, align 4 564 %s3 = getelementptr <2 x i8>, ptr %src, i32 2 565 %l3 = load <2 x i8>, ptr %s3, align 4 566 %t1 = shufflevector <2 x i8> %l1, <2 x i8> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 567 %t2 = shufflevector <2 x i8> %l3, <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> 568 %s = shufflevector <4 x i8> %t1, <4 x i8> %t2, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5> 569 store <6 x i8> %s, ptr %dst 570 ret void 571} 572 573define void @vst3_v4i8(ptr %src, ptr %dst) { 574; CHECK-LABEL: vst3_v4i8: 575; CHECK: @ %bb.0: @ %entry 576; CHECK-NEXT: .save {r4, r5, r6, lr} 577; CHECK-NEXT: push {r4, r5, r6, lr} 578; CHECK-NEXT: vldrb.u32 q0, [r0, #4] 579; CHECK-NEXT: vldrb.u32 q1, [r0] 580; CHECK-NEXT: vmov r2, lr, d0 581; CHECK-NEXT: vmov r12, r3, d1 582; CHECK-NEXT: vldrb.u32 q0, [r0, #8] 583; CHECK-NEXT: vmov r0, r6, d3 584; CHECK-NEXT: vmov r4, r5, d1 585; CHECK-NEXT: vmov.8 q2[8], r4 586; CHECK-NEXT: vmov.8 q2[9], r6 587; CHECK-NEXT: vmov.8 q2[10], r3 588; CHECK-NEXT: vmov.8 q2[11], r5 589; CHECK-NEXT: vmov r3, s10 590; CHECK-NEXT: str r3, [r1, #8] 591; CHECK-NEXT: vmov r3, r4, d2 592; CHECK-NEXT: vmov.16 q1[0], r3 593; CHECK-NEXT: vmov r3, r5, d0 594; CHECK-NEXT: vmov.16 q1[1], r2 595; CHECK-NEXT: vmov.16 q1[2], r3 596; CHECK-NEXT: vmov.16 q1[3], r4 597; CHECK-NEXT: vmov.16 q1[4], lr 598; CHECK-NEXT: vmov.16 q1[5], r5 599; CHECK-NEXT: vmov.16 q1[6], r0 600; CHECK-NEXT: vmov.16 q1[7], r12 601; CHECK-NEXT: vstrb.16 q1, [r1] 602; CHECK-NEXT: pop {r4, r5, r6, pc} 603entry: 604 %l1 = load <4 x i8>, ptr %src, align 4 605 %s2 = getelementptr <4 x i8>, ptr %src, i32 1 606 %l2 = load <4 x i8>, ptr %s2, align 4 607 %s3 = getelementptr <4 x i8>, ptr %src, i32 2 608 %l3 = load <4 x i8>, ptr %s3, align 4 609 %t1 = shufflevector <4 x i8> %l1, <4 x i8> %l2, 
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 610 %t2 = shufflevector <4 x i8> %l3, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> 611 %s = shufflevector <8 x i8> %t1, <8 x i8> %t2, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11> 612 store <12 x i8> %s, ptr %dst 613 ret void 614} 615 616define void @vst3_v8i8(ptr %src, ptr %dst) { 617; CHECK-LABEL: vst3_v8i8: 618; CHECK: @ %bb.0: @ %entry 619; CHECK-NEXT: .vsave {d8, d9} 620; CHECK-NEXT: vpush {d8, d9} 621; CHECK-NEXT: vldrb.u16 q2, [r0, #8] 622; CHECK-NEXT: vldrb.u16 q1, [r0, #16] 623; CHECK-NEXT: vldrb.u16 q3, [r0] 624; CHECK-NEXT: vmovx.f16 s2, s6 625; CHECK-NEXT: vmovx.f16 s0, s10 626; CHECK-NEXT: vins.f16 s0, s2 627; CHECK-NEXT: vins.f16 s1, s11 628; CHECK-NEXT: vmovx.f16 s2, s7 629; CHECK-NEXT: vmovx.f16 s3, s11 630; CHECK-NEXT: vins.f16 s3, s2 631; CHECK-NEXT: vmovx.f16 s16, s1 632; CHECK-NEXT: vmov.f32 s1, s15 633; CHECK-NEXT: vmovx.f16 s18, s15 634; CHECK-NEXT: vmov.f32 s2, s7 635; CHECK-NEXT: vmov.u16 r0, q3[0] 636; CHECK-NEXT: vins.f16 s1, s16 637; CHECK-NEXT: vins.f16 s2, s18 638; CHECK-NEXT: vmov.8 q4[0], r0 639; CHECK-NEXT: vmov.u16 r0, q2[0] 640; CHECK-NEXT: vmov.8 q4[1], r0 641; CHECK-NEXT: vmov.u16 r0, q1[0] 642; CHECK-NEXT: vmov.8 q4[2], r0 643; CHECK-NEXT: vmov.u16 r0, q3[1] 644; CHECK-NEXT: vmov.8 q4[3], r0 645; CHECK-NEXT: vmov.u16 r0, q2[1] 646; CHECK-NEXT: vmov.8 q4[4], r0 647; CHECK-NEXT: vmov.u16 r0, q1[1] 648; CHECK-NEXT: vmov.8 q4[5], r0 649; CHECK-NEXT: vmov.u16 r0, q3[2] 650; CHECK-NEXT: vmov.8 q4[6], r0 651; CHECK-NEXT: vmov.u16 r0, q2[2] 652; CHECK-NEXT: vmov.8 q4[7], r0 653; CHECK-NEXT: vmov.u16 r0, q1[2] 654; CHECK-NEXT: vmov.8 q4[8], r0 655; CHECK-NEXT: vmov.u16 r0, q3[3] 656; CHECK-NEXT: vmov.8 q4[9], r0 657; CHECK-NEXT: vmov.u16 r0, q2[3] 658; CHECK-NEXT: vmov.8 q4[10], r0 659; CHECK-NEXT: vmov.u16 r0, q1[3] 660; CHECK-NEXT: vmov.8 q4[11], r0 661; CHECK-NEXT: 
vmov.u16 r0, q3[4] 662; CHECK-NEXT: vmov.8 q4[12], r0 663; CHECK-NEXT: vmov.u16 r0, q2[4] 664; CHECK-NEXT: vmov.8 q4[13], r0 665; CHECK-NEXT: vmov.u16 r0, q1[4] 666; CHECK-NEXT: vmov.8 q4[14], r0 667; CHECK-NEXT: vmov.u16 r0, q3[5] 668; CHECK-NEXT: vmov.8 q4[15], r0 669; CHECK-NEXT: vstrb.16 q0, [r1, #16] 670; CHECK-NEXT: vstrw.32 q4, [r1] 671; CHECK-NEXT: vpop {d8, d9} 672; CHECK-NEXT: bx lr 673entry: 674 %l1 = load <8 x i8>, ptr %src, align 4 675 %s2 = getelementptr <8 x i8>, ptr %src, i32 1 676 %l2 = load <8 x i8>, ptr %s2, align 4 677 %s3 = getelementptr <8 x i8>, ptr %src, i32 2 678 %l3 = load <8 x i8>, ptr %s3, align 4 679 %t1 = shufflevector <8 x i8> %l1, <8 x i8> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 680 %t2 = shufflevector <8 x i8> %l3, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 681 %s = shufflevector <16 x i8> %t1, <16 x i8> %t2, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23> 682 store <24 x i8> %s, ptr %dst 683 ret void 684} 685 686define void @vst3_v16i8(ptr %src, ptr %dst) { 687; CHECK-LABEL: vst3_v16i8: 688; CHECK: @ %bb.0: @ %entry 689; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} 690; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} 691; CHECK-NEXT: vldrw.u32 q2, [r0, #16] 692; CHECK-NEXT: vldrw.u32 q3, [r0] 693; CHECK-NEXT: vldrw.u32 q1, [r0, #32] 694; CHECK-NEXT: vmov.u8 r2, q2[5] 695; CHECK-NEXT: vmov.8 q4[0], r2 696; CHECK-NEXT: vmov.u8 r2, q3[6] 697; CHECK-NEXT: vmov.8 q4[2], r2 698; CHECK-NEXT: vmov.u8 r2, q2[6] 699; CHECK-NEXT: vmov.8 q4[3], r2 700; CHECK-NEXT: vmov.u8 r2, q3[7] 701; CHECK-NEXT: vmov.8 q4[5], r2 702; CHECK-NEXT: vmov.u8 r2, q2[7] 703; CHECK-NEXT: vmov.8 
q4[6], r2 704; CHECK-NEXT: vmov.u8 r2, q3[8] 705; CHECK-NEXT: vmov.8 q4[8], r2 706; CHECK-NEXT: vmov.u8 r2, q2[8] 707; CHECK-NEXT: vmov.8 q4[9], r2 708; CHECK-NEXT: vmov.u8 r2, q3[9] 709; CHECK-NEXT: vmov.8 q4[11], r2 710; CHECK-NEXT: vmov.u8 r2, q2[9] 711; CHECK-NEXT: vmov.8 q4[12], r2 712; CHECK-NEXT: vmov.u8 r2, q3[10] 713; CHECK-NEXT: vmov.8 q4[14], r2 714; CHECK-NEXT: vmov.u8 r2, q2[10] 715; CHECK-NEXT: vmov.8 q4[15], r2 716; CHECK-NEXT: vmov.u8 r0, q1[5] 717; CHECK-NEXT: vmov.u8 r2, q4[0] 718; CHECK-NEXT: vmov.8 q0[0], r2 719; CHECK-NEXT: vmov.8 q0[1], r0 720; CHECK-NEXT: vmov.u8 r0, q4[2] 721; CHECK-NEXT: vmov.8 q0[2], r0 722; CHECK-NEXT: vmov.u8 r0, q4[3] 723; CHECK-NEXT: vmov.8 q0[3], r0 724; CHECK-NEXT: vmov.u8 r0, q1[6] 725; CHECK-NEXT: vmov.8 q0[4], r0 726; CHECK-NEXT: vmov.u8 r0, q4[5] 727; CHECK-NEXT: vmov.8 q0[5], r0 728; CHECK-NEXT: vmov.u8 r0, q4[6] 729; CHECK-NEXT: vmov.8 q0[6], r0 730; CHECK-NEXT: vmov.u8 r0, q1[7] 731; CHECK-NEXT: vmov.8 q0[7], r0 732; CHECK-NEXT: vmov.u8 r0, q4[8] 733; CHECK-NEXT: vmov.8 q0[8], r0 734; CHECK-NEXT: vmov.u8 r0, q4[9] 735; CHECK-NEXT: vmov.8 q0[9], r0 736; CHECK-NEXT: vmov.u8 r0, q1[8] 737; CHECK-NEXT: vmov.8 q0[10], r0 738; CHECK-NEXT: vmov.u8 r0, q4[11] 739; CHECK-NEXT: vmov.8 q0[11], r0 740; CHECK-NEXT: vmov.u8 r0, q4[12] 741; CHECK-NEXT: vmov.8 q0[12], r0 742; CHECK-NEXT: vmov.u8 r0, q1[9] 743; CHECK-NEXT: vmov.8 q0[13], r0 744; CHECK-NEXT: vmov.u8 r0, q4[14] 745; CHECK-NEXT: vmov.8 q0[14], r0 746; CHECK-NEXT: vmov.u8 r0, q4[15] 747; CHECK-NEXT: vmov.8 q0[15], r0 748; CHECK-NEXT: vmov.u8 r0, q3[0] 749; CHECK-NEXT: vmov.8 q5[0], r0 750; CHECK-NEXT: vmov.u8 r0, q2[0] 751; CHECK-NEXT: vmov.8 q5[1], r0 752; CHECK-NEXT: vmov.u8 r0, q3[1] 753; CHECK-NEXT: vmov.8 q5[3], r0 754; CHECK-NEXT: vmov.u8 r0, q2[1] 755; CHECK-NEXT: vmov.8 q5[4], r0 756; CHECK-NEXT: vmov.u8 r0, q3[2] 757; CHECK-NEXT: vmov.8 q5[6], r0 758; CHECK-NEXT: vmov.u8 r0, q2[2] 759; CHECK-NEXT: vmov.8 q5[7], r0 760; CHECK-NEXT: vmov.u8 r0, q3[3] 761; 
CHECK-NEXT: vmov.8 q5[9], r0 762; CHECK-NEXT: vmov.u8 r0, q2[3] 763; CHECK-NEXT: vmov.8 q5[10], r0 764; CHECK-NEXT: vmov.u8 r0, q3[4] 765; CHECK-NEXT: vmov.8 q5[12], r0 766; CHECK-NEXT: vmov.u8 r0, q2[4] 767; CHECK-NEXT: vmov.8 q5[13], r0 768; CHECK-NEXT: vmov.u8 r0, q3[5] 769; CHECK-NEXT: vmov.8 q5[15], r0 770; CHECK-NEXT: vstrw.32 q0, [r1, #16] 771; CHECK-NEXT: vmov.u8 r0, q5[0] 772; CHECK-NEXT: vmov.8 q4[0], r0 773; CHECK-NEXT: vmov.u8 r0, q5[1] 774; CHECK-NEXT: vmov.8 q4[1], r0 775; CHECK-NEXT: vmov.u8 r0, q1[0] 776; CHECK-NEXT: vmov.8 q4[2], r0 777; CHECK-NEXT: vmov.u8 r0, q5[3] 778; CHECK-NEXT: vmov.8 q4[3], r0 779; CHECK-NEXT: vmov.u8 r0, q5[4] 780; CHECK-NEXT: vmov.8 q4[4], r0 781; CHECK-NEXT: vmov.u8 r0, q1[1] 782; CHECK-NEXT: vmov.8 q4[5], r0 783; CHECK-NEXT: vmov.u8 r0, q5[6] 784; CHECK-NEXT: vmov.8 q4[6], r0 785; CHECK-NEXT: vmov.u8 r0, q5[7] 786; CHECK-NEXT: vmov.8 q4[7], r0 787; CHECK-NEXT: vmov.u8 r0, q1[2] 788; CHECK-NEXT: vmov.8 q4[8], r0 789; CHECK-NEXT: vmov.u8 r0, q5[9] 790; CHECK-NEXT: vmov.8 q4[9], r0 791; CHECK-NEXT: vmov.u8 r0, q5[10] 792; CHECK-NEXT: vmov.8 q4[10], r0 793; CHECK-NEXT: vmov.u8 r0, q1[3] 794; CHECK-NEXT: vmov.8 q4[11], r0 795; CHECK-NEXT: vmov.u8 r0, q5[12] 796; CHECK-NEXT: vmov.8 q4[12], r0 797; CHECK-NEXT: vmov.u8 r0, q5[13] 798; CHECK-NEXT: vmov.8 q4[13], r0 799; CHECK-NEXT: vmov.u8 r0, q1[4] 800; CHECK-NEXT: vmov.8 q4[14], r0 801; CHECK-NEXT: vmov.u8 r0, q5[15] 802; CHECK-NEXT: vmov.8 q4[15], r0 803; CHECK-NEXT: vmov.u8 r0, q1[10] 804; CHECK-NEXT: vmov.8 q5[0], r0 805; CHECK-NEXT: vmov.u8 r0, q3[11] 806; CHECK-NEXT: vmov.8 q6[1], r0 807; CHECK-NEXT: vmov.u8 r0, q2[11] 808; CHECK-NEXT: vmov.8 q6[2], r0 809; CHECK-NEXT: vmov.u8 r0, q3[12] 810; CHECK-NEXT: vmov.8 q6[4], r0 811; CHECK-NEXT: vmov.u8 r0, q2[12] 812; CHECK-NEXT: vmov.8 q6[5], r0 813; CHECK-NEXT: vmov.u8 r0, q3[13] 814; CHECK-NEXT: vmov.8 q6[7], r0 815; CHECK-NEXT: vmov.u8 r0, q2[13] 816; CHECK-NEXT: vmov.8 q6[8], r0 817; CHECK-NEXT: vmov.u8 r0, q3[14] 818; 
CHECK-NEXT: vmov.8 q6[10], r0 819; CHECK-NEXT: vmov.u8 r0, q2[14] 820; CHECK-NEXT: vmov.8 q6[11], r0 821; CHECK-NEXT: vmov.u8 r0, q3[15] 822; CHECK-NEXT: vmov.8 q6[13], r0 823; CHECK-NEXT: vmov.u8 r0, q2[15] 824; CHECK-NEXT: vmov.8 q6[14], r0 825; CHECK-NEXT: vstrw.32 q4, [r1] 826; CHECK-NEXT: vmov.u8 r0, q6[1] 827; CHECK-NEXT: vmov.8 q5[1], r0 828; CHECK-NEXT: vmov.u8 r0, q6[2] 829; CHECK-NEXT: vmov.8 q5[2], r0 830; CHECK-NEXT: vmov.u8 r0, q1[11] 831; CHECK-NEXT: vmov.8 q5[3], r0 832; CHECK-NEXT: vmov.u8 r0, q6[4] 833; CHECK-NEXT: vmov.8 q5[4], r0 834; CHECK-NEXT: vmov.u8 r0, q6[5] 835; CHECK-NEXT: vmov.8 q5[5], r0 836; CHECK-NEXT: vmov.u8 r0, q1[12] 837; CHECK-NEXT: vmov.8 q5[6], r0 838; CHECK-NEXT: vmov.u8 r0, q6[7] 839; CHECK-NEXT: vmov.8 q5[7], r0 840; CHECK-NEXT: vmov.u8 r0, q6[8] 841; CHECK-NEXT: vmov.8 q5[8], r0 842; CHECK-NEXT: vmov.u8 r0, q1[13] 843; CHECK-NEXT: vmov.8 q5[9], r0 844; CHECK-NEXT: vmov.u8 r0, q6[10] 845; CHECK-NEXT: vmov.8 q5[10], r0 846; CHECK-NEXT: vmov.u8 r0, q6[11] 847; CHECK-NEXT: vmov.8 q5[11], r0 848; CHECK-NEXT: vmov.u8 r0, q1[14] 849; CHECK-NEXT: vmov.8 q5[12], r0 850; CHECK-NEXT: vmov.u8 r0, q6[13] 851; CHECK-NEXT: vmov.8 q5[13], r0 852; CHECK-NEXT: vmov.u8 r0, q6[14] 853; CHECK-NEXT: vmov.8 q5[14], r0 854; CHECK-NEXT: vmov.u8 r0, q1[15] 855; CHECK-NEXT: vmov.8 q5[15], r0 856; CHECK-NEXT: vstrw.32 q5, [r1, #32] 857; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} 858; CHECK-NEXT: bx lr 859entry: 860 %l1 = load <16 x i8>, ptr %src, align 4 861 %s2 = getelementptr <16 x i8>, ptr %src, i32 1 862 %l2 = load <16 x i8>, ptr %s2, align 4 863 %s3 = getelementptr <16 x i8>, ptr %src, i32 2 864 %l3 = load <16 x i8>, ptr %s3, align 4 865 %t1 = shufflevector <16 x i8> %l1, <16 x i8> %l2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 866 
%t2 = shufflevector <16 x i8> %l3, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %s = shufflevector <32 x i8> %t1, <32 x i8> %t2, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
  store <48 x i8> %s, ptr %dst
  ret void
}

; NOTE(review): the functions below all share one shape. Three vectors
; (a, b, c) are loaded from %src, a and b are concatenated, c is widened with
; undef lanes, and a final shufflevector picks lanes in the order
; a0, b0, c0, a1, b1, c1, ... before the result is stored to %dst.
; The FileCheck assertion lines inside each function are autogenerated
; (see the NOTE on the first line of the file); regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.

; i64

; Stride-3 interleave of three consecutive <2 x i64> vectors read from %src;
; the <6 x i64> result a0, b0, c0, a1, b1, c1 is stored to %dst.
define void @vst3_v2i64(ptr %src, ptr %dst) {
; CHECK-LABEL: vst3_v2i64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
; CHECK-NEXT:    vmov.f64 d6, d2
; CHECK-NEXT:    vmov.f64 d7, d1
; CHECK-NEXT:    vmov.f64 d1, d4
; CHECK-NEXT:    vstrw.32 q3, [r1, #16]
; CHECK-NEXT:    vmov.f64 d2, d5
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    vstrw.32 q1, [r1, #32]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <2 x i64>, ptr %src, align 4
  %s2 = getelementptr <2 x i64>, ptr %src, i32 1
  %l2 = load <2 x i64>, ptr %s2, align 4
  %s3 = getelementptr <2 x i64>, ptr %src, i32 2
  %l3 = load <2 x i64>, ptr %s3, align 4
  %t1 = shufflevector <2 x i64> %l1, <2 x i64> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %t2 = shufflevector <2 x i64> %l3, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %s = shufflevector <4 x i64> %t1, <4 x i64> %t2, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
  store <6 x i64> %s, ptr %dst
  ret void
}

; Stride-3 interleave of three consecutive <4 x i64> vectors read from %src;
; the <12 x i64> result a0, b0, c0, a1, b1, c1, ... is stored to %dst.
define void @vst3_v4i64(ptr %src, ptr %dst) {
; CHECK-LABEL: vst3_v4i64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    vldrw.u32 q7, [r0, #48]
; CHECK-NEXT:    vldrw.u32 q6, [r0, #32]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #80]
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vmov.f64 d6, d15
; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
; CHECK-NEXT:    vldrw.u32 q4, [r0, #64]
; CHECK-NEXT:    vmov.f64 d15, d13
; CHECK-NEXT:    vmov.f64 d7, d1
; CHECK-NEXT:    vmov.f64 d10, d2
; CHECK-NEXT:    vstrw.32 q3, [r1, #80]
; CHECK-NEXT:    vmov.f64 d11, d12
; CHECK-NEXT:    vmov.f64 d2, d8
; CHECK-NEXT:    vstrw.32 q5, [r1]
; CHECK-NEXT:    vmov.f64 d1, d5
; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
; CHECK-NEXT:    vmov.f64 d8, d15
; CHECK-NEXT:    vstrw.32 q0, [r1, #64]
; CHECK-NEXT:    vmov.f64 d12, d4
; CHECK-NEXT:    vstrw.32 q4, [r1, #32]
; CHECK-NEXT:    vmov.f64 d13, d14
; CHECK-NEXT:    vstrw.32 q6, [r1, #48]
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <4 x i64>, ptr %src, align 4
  %s2 = getelementptr <4 x i64>, ptr %src, i32 1
  %l2 = load <4 x i64>, ptr %s2, align 4
  %s3 = getelementptr <4 x i64>, ptr %src, i32 2
  %l3 = load <4 x i64>, ptr %s3, align 4
  %t1 = shufflevector <4 x i64> %l1, <4 x i64> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %t2 = shufflevector <4 x i64> %l3, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %s = shufflevector <8 x i64> %t1, <8 x i64> %t2, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
  store <12 x i64> %s, ptr %dst
  ret void
}

; f32

; Stride-3 interleave of three consecutive <2 x float> vectors read from %src;
; the <6 x float> result a0, b0, c0, a1, b1, c1 is stored to %dst.
define void @vst3_v2f32(ptr %src, ptr %dst) {
; CHECK-LABEL: vst3_v2f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r2, [r0, #20]
; CHECK-NEXT:    vldr s0, [r0]
; CHECK-NEXT:    vldr s3, [r0, #4]
; CHECK-NEXT:    vldr s1, [r0, #8]
; CHECK-NEXT:    vldr s2, [r0, #16]
; CHECK-NEXT:    ldr r0, [r0, #12]
; CHECK-NEXT:    strd r0, r2, [r1, #16]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <2 x float>, ptr %src, align 4
  %s2 = getelementptr <2 x float>, ptr %src, i32 1
  %l2 = load <2 x float>, ptr %s2, align 4
  %s3 = getelementptr <2 x float>, ptr %src, i32 2
  %l3 = load <2 x float>, ptr %s3, align 4
  %t1 = shufflevector <2 x float> %l1, <2 x float> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %t2 = shufflevector <2 x float> %l3, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %s = shufflevector <4 x float> %t1, <4 x float> %t2, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
  store <6 x float> %s, ptr %dst
  ret void
}

; Stride-3 interleave of three consecutive <4 x float> vectors read from %src;
; the <12 x float> result a0, b0, c0, a1, b1, c1, ... is stored to %dst.
define void @vst3_v4f32(ptr %src, ptr %dst) {
; CHECK-LABEL: vst3_v4f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vldrw.u32 q2, [r0]
; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
; CHECK-NEXT:    vmov.f32 s12, s1
; CHECK-NEXT:    vmov.f32 s13, s5
; CHECK-NEXT:    vmov.f32 s18, s4
; CHECK-NEXT:    vmov.f32 s4, s6
; CHECK-NEXT:    vmov.f32 s14, s10
; CHECK-NEXT:    vmov.f32 s15, s2
; CHECK-NEXT:    vmov.f32 s16, s8
; CHECK-NEXT:    vstrw.32 q3, [r1, #16]
; CHECK-NEXT:    vmov.f32 s17, s0
; CHECK-NEXT:    vmov.f32 s19, s9
; CHECK-NEXT:    vmov.f32 s5, s11
; CHECK-NEXT:    vstrw.32 q4, [r1]
; CHECK-NEXT:    vmov.f32 s6, s3
; CHECK-NEXT:    vstrw.32 q1, [r1, #32]
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <4 x float>, ptr %src, align 4
  %s2 = getelementptr <4 x float>, ptr %src, i32 1
  %l2 = load <4 x float>, ptr %s2, align 4
  %s3 = getelementptr <4 x float>, ptr %src, i32 2
  %l3 = load <4 x float>, ptr %s3, align 4
  %t1 = shufflevector <4 x float> %l1, <4 x float> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %t2 = shufflevector <4 x float> %l3, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %s = shufflevector <8 x float> %t1, <8 x float> %t2, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
  store <12 x float> %s, ptr %dst
  ret void
}

; Stride-3 interleave of three consecutive <8 x float> vectors read from %src;
; the <24 x float> result a0, b0, c0, a1, b1, c1, ... is stored to %dst.
define void @vst3_v8f32(ptr %src, ptr %dst) {
; CHECK-LABEL: vst3_v8f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    .pad #32
; CHECK-NEXT:    sub sp, #32
; CHECK-NEXT:    vldrw.u32 q0, [r0, #80]
; CHECK-NEXT:    vldrw.u32 q2, [r0, #48]
; CHECK-NEXT:    vldrw.u32 q3, [r0, #16]
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vstrw.32 q0, [sp, #16] @ 16-byte Spill
; CHECK-NEXT:    vmov.f32 s0, s2
; CHECK-NEXT:    vldrw.u32 q6, [sp, #16] @ 16-byte Reload
; CHECK-NEXT:    vmov.f32 s1, s15
; CHECK-NEXT:    vmov.f32 s2, s11
; CHECK-NEXT:    vldrw.u32 q7, [r0, #64]
; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]
; CHECK-NEXT:    vmov.f32 s0, s12
; CHECK-NEXT:    vmov.f32 s1, s8
; CHECK-NEXT:    vmov.f32 s3, s13
; CHECK-NEXT:    vmov.f32 s2, s24
; CHECK-NEXT:    vstrw.32 q0, [r1, #48]
; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
; CHECK-NEXT:    vmov.f32 s20, s4
; CHECK-NEXT:    vmov.f32 s23, s5
; CHECK-NEXT:    vstrw.32 q0, [r1, #80]
; CHECK-NEXT:    vmov.f32 s12, s9
; CHECK-NEXT:    vmov.f32 s15, s10
; CHECK-NEXT:    vmov.f32 s13, s25
; CHECK-NEXT:    vmov.f32 s9, s7
; CHECK-NEXT:    vstrw.32 q3, [r1, #64]
; CHECK-NEXT:    vmov.f32 s21, s16
; CHECK-NEXT:    vmov.f32 s22, s28
; CHECK-NEXT:    vmov.f32 s8, s30
; CHECK-NEXT:    vstrw.32 q5, [r1]
; CHECK-NEXT:    vmov.f32 s10, s19
; CHECK-NEXT:    vmov.f32 s11, s31
; CHECK-NEXT:    vmov.f32 s5, s29
; CHECK-NEXT:    vstrw.32 q2, [r1, #32]
; CHECK-NEXT:    vmov.f32 s4, s17
; CHECK-NEXT:    vmov.f32 s7, s18
; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
; CHECK-NEXT:    add sp, #32
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <8 x float>, ptr %src, align 4
  %s2 = getelementptr <8 x float>, ptr %src, i32 1
  %l2 = load <8 x float>, ptr %s2, align 4
  %s3 = getelementptr <8 x float>, ptr %src, i32 2
  %l3 = load <8 x float>, ptr %s3, align 4
  %t1 = shufflevector <8 x float> %l1, <8 x float> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %t2 = shufflevector <8 x float> %l3, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %s = shufflevector <16 x float> %t1, <16 x float> %t2, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
  store <24 x float> %s, ptr %dst
  ret void
}

; Stride-3 interleave of three consecutive <16 x float> vectors read from %src;
; the <48 x float> result a0, b0, c0, a1, b1, c1, ... is stored to %dst.
define void @vst3_v16f32(ptr %src, ptr %dst) {
; CHECK-LABEL: vst3_v16f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    .pad #128
; CHECK-NEXT:    sub sp, #128
; CHECK-NEXT:    vldrw.u32 q3, [r0, #176]
; CHECK-NEXT:    vldrw.u32 q2, [r0, #64]
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #128]
; CHECK-NEXT:    vstrw.32 q3, [sp, #112] @ 16-byte Spill
; CHECK-NEXT:    vldrw.u32 q3, [r0, #160]
; CHECK-NEXT:    vmov.f32 s24, s9
; CHECK-NEXT:    vldrw.u32 q5, [r0, #144]
; CHECK-NEXT:    vstrw.32 q3, [sp, #96] @ 16-byte Spill
; CHECK-NEXT:    vldrw.u32 q3, [r0, #96]
; CHECK-NEXT:    vmov.f32 s26, s6
; CHECK-NEXT:    vldrw.u32 q7, [r0, #112]
; CHECK-NEXT:    vstrw.32 q3, [sp, #32] @ 16-byte Spill
; CHECK-NEXT:    vldrw.u32 q3, [r0, #80]
; CHECK-NEXT:    vmov.f32 s27, s10
; CHECK-NEXT:    vldrw.u32 q4, [r0, #48]
; CHECK-NEXT:    vstrw.32 q3, [sp, #48] @ 16-byte Spill
; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
; CHECK-NEXT:    vmov.f32 s25, s1
; CHECK-NEXT:    vstrw.32 q3, [sp, #16] @ 16-byte Spill
; CHECK-NEXT:    vldrw.u32 q3, [r0, #16]
; CHECK-NEXT:    vstrw.32 q6, [r1, #16]
; CHECK-NEXT:    vmov.f32 s24, s2
; CHECK-NEXT:    vstrw.32 q3, [sp, #80] @ 16-byte Spill
; CHECK-NEXT:    vmov.f32 s27, s3
; CHECK-NEXT:    vmov.f32 s14, s0
; CHECK-NEXT:    vldrw.u32 q0, [sp, #112] @ 16-byte Reload
; CHECK-NEXT:    vmov.f32 s12, s4
; CHECK-NEXT:    vmov.f32 s15, s5
; CHECK-NEXT:    vmov.f32 s13, s8
; CHECK-NEXT:    vstrw.32 q3, [sp, #64] @ 16-byte Spill
; CHECK-NEXT:    vmov.f32 s25, s7
; CHECK-NEXT:    vmov.f32 s6, s0
; CHECK-NEXT:    vmov.f32 s13, s1
; CHECK-NEXT:    vmov.f32 s0, s2
; CHECK-NEXT:    vmov.f32 s4, s16
; CHECK-NEXT:    vmov.f32 s5, s28
; CHECK-NEXT:    vmov.f32 s7, s17
; CHECK-NEXT:    vmov.f32 s1, s19
; CHECK-NEXT:    vstrw.32 q1, [sp] @ 16-byte Spill
; CHECK-NEXT:    vmov.f32 s2, s31
; CHECK-NEXT:    vldrw.u32 q1, [sp, #32] @ 16-byte Reload
; CHECK-NEXT:    vmov.f32 s26, s11
; CHECK-NEXT:    vldrw.u32 q2, [sp, #16] @ 16-byte Reload
; CHECK-NEXT:    vstrw.32 q0, [sp, #112] @ 16-byte Spill
; CHECK-NEXT:    vldrw.u32 q0, [sp, #96] @ 16-byte Reload
; CHECK-NEXT:    vmov.f32 s15, s30
; CHECK-NEXT:    vstrw.32 q6, [r1, #32]
; CHECK-NEXT:    vmov.f32 s17, s1
; CHECK-NEXT:    vldrw.u32 q6, [sp, #80] @ 16-byte Reload
; CHECK-NEXT:    vmov.f32 s30, s0
; CHECK-NEXT:    vmov.f32 s0, s2
; CHECK-NEXT:    vmov.f32 s1, s11
; CHECK-NEXT:    vmov.f32 s2, s7
; CHECK-NEXT:    vmov.f32 s14, s18
; CHECK-NEXT:    vstrw.32 q0, [sp, #96] @ 16-byte Spill
; CHECK-NEXT:    vmov.f32 s18, s10
; CHECK-NEXT:    vldrw.u32 q0, [sp, #48] @ 16-byte Reload
; CHECK-NEXT:    vmov.f32 s28, s8
; CHECK-NEXT:    vmov.f32 s31, s9
; CHECK-NEXT:    vldrw.u32 q2, [sp, #80] @ 16-byte Reload
; CHECK-NEXT:    vmov.f32 s12, s29
; CHECK-NEXT:    vmov.f32 s29, s4
; CHECK-NEXT:    vstrw.32 q3, [r1, #160]
; CHECK-NEXT:    vmov.f32 s16, s5
; CHECK-NEXT:    vstrw.32 q7, [r1, #96]
; CHECK-NEXT:    vmov.f32 s19, s6
; CHECK-NEXT:    vmov.f32 s4, s8
; CHECK-NEXT:    vstrw.32 q4, [r1, #112]
; CHECK-NEXT:    vmov.f32 s6, s20
; CHECK-NEXT:    vmov.f32 s20, s22
; CHECK-NEXT:    vmov.f32 s5, s0
; CHECK-NEXT:    vmov.f32 s8, s1
; CHECK-NEXT:    vmov.f32 s11, s2
; CHECK-NEXT:    vmov.f32 s22, s3
; CHECK-NEXT:    vldrw.u32 q0, [sp, #96] @ 16-byte Reload
; CHECK-NEXT:    vmov.f32 s7, s9
; CHECK-NEXT:    vstrw.32 q0, [r1, #128]
; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
; CHECK-NEXT:    vmov.f32 s9, s21
; CHECK-NEXT:    vstrw.32 q1, [r1, #48]
; CHECK-NEXT:    vstrw.32 q0, [r1, #144]
; CHECK-NEXT:    vldrw.u32 q0, [sp, #112] @ 16-byte Reload
; CHECK-NEXT:    vmov.f32 s21, s27
; CHECK-NEXT:    vstrw.32 q2, [r1, #64]
; CHECK-NEXT:    vstrw.32 q0, [r1, #176]
; CHECK-NEXT:    vldrw.u32 q0, [sp, #64] @ 16-byte Reload
; CHECK-NEXT:    vstrw.32 q5, [r1, #80]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    add sp, #128
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <16 x float>, ptr %src, align 4
  %s2 = getelementptr <16 x float>, ptr %src, i32 1
  %l2 = load <16 x float>, ptr %s2, align 4
  %s3 = getelementptr <16 x float>, ptr %src, i32 2
  %l3 = load <16 x float>, ptr %s3, align 4
  %t1 = shufflevector <16 x float> %l1, <16 x float> %l2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %t2 = shufflevector <16 x float> %l3, <16 x float> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %s = shufflevector <32 x float> %t1, <32 x float> %t2, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
  store <48 x float> %s, ptr %dst
  ret void
}

; f16

; Stride-3 interleave of three consecutive <2 x half> vectors read from %src;
; the <6 x half> result a0, b0, c0, a1, b1, c1 is stored to %dst.
define void @vst3_v2f16(ptr %src, ptr %dst) {
; CHECK-LABEL: vst3_v2f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldrd r2, r3, [r0]
; CHECK-NEXT:    ldr r0, [r0, #8]
; CHECK-NEXT:    vmov.32 q0[0], r2
; CHECK-NEXT:    vmov.32 q0[1], r3
; CHECK-NEXT:    vmov.32 q1[0], r0
; CHECK-NEXT:    vmovx.f16 s2, s0
; CHECK-NEXT:    vmovx.f16 s6, s4
; CHECK-NEXT:    vins.f16 s4, s2
; CHECK-NEXT:    vmovx.f16 s2, s1
; CHECK-NEXT:    vins.f16 s0, s1
; CHECK-NEXT:    vmov.f32 s1, s4
; CHECK-NEXT:    vins.f16 s2, s6
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    vmov r0, r2, d0
; CHECK-NEXT:    stm r1!, {r0, r2, r3}
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <2 x half>, ptr %src, align 4
  %s2 = getelementptr <2 x half>, ptr %src, i32 1
  %l2 = load <2 x half>, ptr %s2, align 4
  %s3 = getelementptr <2 x half>, ptr %src, i32 2
  %l3 = load <2 x half>, ptr %s3, align 4
  %t1 = shufflevector <2 x half> %l1, <2 x half> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %t2 = shufflevector <2 x half> %l3, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %s = shufflevector <4 x half> %t1, <4 x half> %t2, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
  store <6 x half> %s, ptr %dst
  ret void
}

; Stride-3 interleave of three consecutive <4 x half> vectors read from %src;
; the <12 x half> result a0, b0, c0, a1, b1, c1, ... is stored to %dst.
define void @vst3_v4f16(ptr %src, ptr %dst) {
; CHECK-LABEL: vst3_v4f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    ldrd lr, r12, [r0]
; CHECK-NEXT:    ldrd r3, r2, [r0, #8]
; CHECK-NEXT:    ldrd r4, r0, [r0, #16]
; CHECK-NEXT:    vmov q0[2], q0[0], lr, r3
; CHECK-NEXT:    vmov.32 q1[0], r4
; CHECK-NEXT:    vmov q0[3], q0[1], r12, r2
; CHECK-NEXT:    vmov.32 q1[1], r0
; CHECK-NEXT:    vmovx.f16 s9, s3
; CHECK-NEXT:    vmovx.f16 s6, s0
; CHECK-NEXT:    vins.f16 s0, s2
; CHECK-NEXT:    vmovx.f16 s8, s4
; CHECK-NEXT:    vmovx.f16 s2, s2
; CHECK-NEXT:    vins.f16 s4, s6
; CHECK-NEXT:    vmovx.f16 s6, s1
; CHECK-NEXT:    vins.f16 s2, s8
; CHECK-NEXT:    vmovx.f16 s8, s5
; CHECK-NEXT:    vins.f16 s5, s6
; CHECK-NEXT:    vins.f16 s9, s8
; CHECK-NEXT:    vmov.f32 s8, s5
; CHECK-NEXT:    vins.f16 s1, s3
; CHECK-NEXT:    vmov r0, r2, d4
; CHECK-NEXT:    vmov q2, q0
; CHECK-NEXT:    vmov.f32 s9, s4
; CHECK-NEXT:    vmov.f32 s10, s2
; CHECK-NEXT:    vmov.f32 s11, s1
; CHECK-NEXT:    vstrw.32 q2, [r1]
; CHECK-NEXT:    strd r0, r2, [r1, #16]
; CHECK-NEXT:    pop {r4, pc}
entry:
  %l1 = load <4 x half>, ptr %src, align 4
  %s2 = getelementptr <4 x half>, ptr %src, i32 1
  %l2 = load <4 x half>, ptr %s2, align 4
  %s3 = getelementptr <4 x half>, ptr %src, i32 2
  %l3 = load <4 x half>, ptr %s3, align 4
  %t1 = shufflevector <4 x half> %l1, <4 x half> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %t2 = shufflevector <4 x half> %l3, <4 x half> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %s = shufflevector <8 x half> %t1, <8 x half> %t2, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
  store <12 x half> %s, ptr %dst
  ret void
}

; Stride-3 interleave of three consecutive <8 x half> vectors read from %src;
; the <24 x half> result a0, b0, c0, a1, b1, c1, ... is stored to %dst.
define void @vst3_v8f16(ptr %src, ptr %dst) {
; CHECK-LABEL: vst3_v8f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    vldrw.u32 q4, [r0, #16]
; CHECK-NEXT:    vldrw.u32 q3, [r0]
; CHECK-NEXT:    vmovx.f16 s0, s18
; CHECK-NEXT:    vmov.f32 s4, s15
; CHECK-NEXT:    vmov r2, s0
; CHECK-NEXT:    vins.f16 s4, s19
; CHECK-NEXT:    vmov.16 q0[0], r2
; CHECK-NEXT:    vmovx.f16 s10, s16
; CHECK-NEXT:    vmov.f32 s1, s4
; CHECK-NEXT:    vmovx.f16 s4, s19
; CHECK-NEXT:    vmov r2, s4
; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
; CHECK-NEXT:    vmov.16 q0[6], r2
; CHECK-NEXT:    vmov r0, s10
; CHECK-NEXT:    vmovx.f16 s8, s6
; CHECK-NEXT:    vmovx.f16 s2, s15
; CHECK-NEXT:    vins.f16 s0, s8
; CHECK-NEXT:    vmovx.f16 s8, s7
; CHECK-NEXT:    vins.f16 s3, s8
; CHECK-NEXT:    vmov.f32 s8, s12
; CHECK-NEXT:    vins.f16 s8, s16
; CHECK-NEXT:    vins.f16 s7, s2
; CHECK-NEXT:    vmov.f32 s2, s13
; CHECK-NEXT:    vmov.16 q2[4], r0
; CHECK-NEXT:    vins.f16 s2, s17
; CHECK-NEXT:    vmov.f32 s11, s2
; CHECK-NEXT:    vmovx.f16 s2, s12
; CHECK-NEXT:    vmovx.f16 s12, s4
; CHECK-NEXT:    vins.f16 s4, s2
; CHECK-NEXT:    vins.f16 s10, s12
; CHECK-NEXT:    vmovx.f16 s12, s17
; CHECK-NEXT:    vmov.f32 s2, s14
; CHECK-NEXT:    vmov r0, s12
; CHECK-NEXT:    vins.f16 s2, s18
; CHECK-NEXT:    vmov.16 q4[2], r0
; CHECK-NEXT:    vmovx.f16 s12, s5
; CHECK-NEXT:    vmov.f32 s18, s2
; CHECK-NEXT:    vmovx.f16 s2, s13
; CHECK-NEXT:    vins.f16 s5, s2
; CHECK-NEXT:    vmovx.f16 s2, s14
; CHECK-NEXT:    vins.f16 s6, s2
; CHECK-NEXT:    vmov.f32 s2, s7
; CHECK-NEXT:    vmov.f32 s9, s4
; CHECK-NEXT:    vins.f16 s17, s12
; CHECK-NEXT:    vmov.f32 s16, s5
; CHECK-NEXT:    vstrw.32 q0, [r1, #32]
; CHECK-NEXT:    vmov.f32 s19, s6
; CHECK-NEXT:    vstrw.32 q2, [r1]
; CHECK-NEXT:    vstrw.32 q4, [r1, #16]
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <8 x half>, ptr %src, align 4
  %s2 = getelementptr <8 x half>, ptr %src, i32 1
  %l2 = load <8 x half>, ptr %s2, align 4
  %s3 = getelementptr <8 x half>, ptr %src, i32 2
  %l3 = load <8 x half>, ptr %s3, align 4
  %t1 = shufflevector <8 x half> %l1, <8 x half> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %t2 = shufflevector <8 x half> %l3, <8 x half> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %s = shufflevector <16 x half> %t1, <16 x half> %t2, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
  store <24 x half> %s, ptr %dst
  ret void
}

; Stride-3 interleave of three consecutive <16 x half> vectors read from %src;
; the <48 x half> result a0, b0, c0, a1, b1, c1, ... is stored to %dst.
define void @vst3_v16f16(ptr %src, ptr %dst) {
; CHECK-LABEL: vst3_v16f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    .pad #48
; CHECK-NEXT:    sub sp, #48
; CHECK-NEXT:    vldrw.u32 q3, [r0, #16]
; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
; CHECK-NEXT:    vldrw.u32 q6, [r0, #32]
; CHECK-NEXT:    vmov.f32 s8, s12
; CHECK-NEXT:    vmovx.f16 s2, s4
; CHECK-NEXT:    vmov.f32 s0, s13
; CHECK-NEXT:    vins.f16 s8, s4
; CHECK-NEXT:    vmov r2, s2
; CHECK-NEXT:    vins.f16 s0, s5
; CHECK-NEXT:    vmov.16 q2[4], r2
; CHECK-NEXT:    vmov q4, q3
; CHECK-NEXT:    vmov.f32 s11, s0
; CHECK-NEXT:    vmovx.f16 s0, s16
; CHECK-NEXT:    vmov.f32 s12, s8
; CHECK-NEXT:    vmov.f64 d11, d9
; CHECK-NEXT:    vmov.f32 s21, s17
; CHECK-NEXT:    vmov.f64 d7, d5
; CHECK-NEXT:    vldrw.u32 q2, [r0, #80]
; CHECK-NEXT:    vmovx.f16 s2, s8
; CHECK-NEXT:    vins.f16 s8, s0
; CHECK-NEXT:    vins.f16 s14, s2
; CHECK-NEXT:    vmovx.f16 s2, s24
; CHECK-NEXT:    vstrw.32 q3, [sp, #16] @ 16-byte Spill
; CHECK-NEXT:    vldrw.u32 q3, [r0]
; CHECK-NEXT:    vmov r2, s2
; CHECK-NEXT:    vmov.f32 s16, s12
; CHECK-NEXT:    vmov.f32 s0, s13
; CHECK-NEXT:    vins.f16 s16, s24
; CHECK-NEXT:    vmov.16 q4[4], r2
; CHECK-NEXT:    vins.f16 s0, s25
; CHECK-NEXT:    vmov.f32 s19, s0
; CHECK-NEXT:    vmovx.f16 s0, s12
; CHECK-NEXT:    vmov.f64 d15, d13
; CHECK-NEXT:    vmov.f32 s17, s13
; CHECK-NEXT:    vmov.f32 s24, s16
; CHECK-NEXT:    vmov.f64 d13, d9
; CHECK-NEXT:    vmov.f64 d9, d7
; CHECK-NEXT:    vldrw.u32 q3, [r0, #64]
; CHECK-NEXT:    vmovx.f16 s2, s12
; CHECK-NEXT:    vins.f16 s12, s0
; CHECK-NEXT:    vins.f16 s26, s2
; CHECK-NEXT:    vmovx.f16 s2, s30
; CHECK-NEXT:    vmov.f32 s0, s19
; CHECK-NEXT:    vstrw.32 q6, [sp, #32] @ 16-byte Spill
; CHECK-NEXT:    vmov r0, s2
; CHECK-NEXT:    vins.f16 s0, s31
; CHECK-NEXT:    vmov.f32 s29, s25
; CHECK-NEXT:    vmov.16 q6[0], r0
; CHECK-NEXT:    vmov.f32 s25, s0
; CHECK-NEXT:    vmovx.f16 s0, s31
; CHECK-NEXT:    vmov r0, s0
; CHECK-NEXT:    vmovx.f16 s0, s14
; CHECK-NEXT:    vmov.16 q6[6], r0
; CHECK-NEXT:    vmovx.f16 s2, s15
; CHECK-NEXT:    vins.f16 s24, s0
; CHECK-NEXT:    vmovx.f16 s0, s19
; CHECK-NEXT:    vins.f16 s15, s0
; CHECK-NEXT:    vmovx.f16 s0, s6
; CHECK-NEXT:    vmov.f32 s4, s23
; CHECK-NEXT:    vins.f16 s27, s2
; CHECK-NEXT:    vmov r0, s0
; CHECK-NEXT:    vins.f16 s4, s7
; CHECK-NEXT:    vmov.16 q0[0], r0
; CHECK-NEXT:    vstrw.32 q7, [sp] @ 16-byte Spill
; CHECK-NEXT:    vmov.f32 s1, s4
; CHECK-NEXT:    vmovx.f16 s4, s7
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmovx.f16 s4, s10
; CHECK-NEXT:    vmov.16 q0[6], r0
; CHECK-NEXT:    vldrw.u32 q7, [sp, #32] @ 16-byte Reload
; CHECK-NEXT:    vins.f16 s0, s4
; CHECK-NEXT:    vmovx.f16 s4, s11
; CHECK-NEXT:    vmovx.f16 s2, s23
; CHECK-NEXT:    vins.f16 s3, s4
; CHECK-NEXT:    vmovx.f16 s4, s5
; CHECK-NEXT:    vins.f16 s11, s2
; CHECK-NEXT:    vmov.f32 s2, s22
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vins.f16 s2, s6
; CHECK-NEXT:    vmov.16 q1[2], r0
; CHECK-NEXT:    vmov.f32 s29, s12
; CHECK-NEXT:    vmovx.f16 s4, s21
; CHECK-NEXT:    vmovx.f16 s12, s9
; CHECK-NEXT:    vins.f16 s9, s4
; CHECK-NEXT:    vmovx.f16 s4, s22
; CHECK-NEXT:    vins.f16 s10, s4
; CHECK-NEXT:    vmov.f32 s21, s17
; CHECK-NEXT:    vmov.f32 s22, s18
; CHECK-NEXT:    vins.f16 s5, s12
; CHECK-NEXT:    vmov.f32 s4, s18
; CHECK-NEXT:    vldrw.u32 q4, [sp] @ 16-byte Reload
; CHECK-NEXT:    vstrw.32 q7, [sp, #32] @ 16-byte Spill
; CHECK-NEXT:    vmov.f32 s6, s2
; CHECK-NEXT:    vmovx.f16 s12, s17
; CHECK-NEXT:    vins.f16 s4, s18
; CHECK-NEXT:    vmov r0, s12
; CHECK-NEXT:    vmovx.f16 s12, s13
; CHECK-NEXT:    vmov.16 q7[2], r0
; CHECK-NEXT:    vmov.f32 s2, s11
; CHECK-NEXT:    vmov.f32 s30, s4
; CHECK-NEXT:    vmovx.f16 s4, s21
; CHECK-NEXT:    vins.f16 s13, s4
; CHECK-NEXT:    vmovx.f16 s4, s22
; CHECK-NEXT:    vins.f16 s14, s4
; CHECK-NEXT:    vldrw.u32 q5, [sp, #16] @ 16-byte Reload
; CHECK-NEXT:    vstrw.32 q0, [r1, #80]
; CHECK-NEXT:    vldrw.u32 q0, [sp, #32] @ 16-byte Reload
; CHECK-NEXT:    vmov.f32 s26, s15
; CHECK-NEXT:    vins.f16 s29, s12
; CHECK-NEXT:    vmov.f32 s21, s8
; CHECK-NEXT:    vstrw.32 q6, [r1, #32]
; CHECK-NEXT:    vmov.f32 s4, s9
; CHECK-NEXT:    vstrw.32 q5, [r1, #48]
; CHECK-NEXT:    vmov.f32 s7, s10
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    vmov.f32 s28, s13
; CHECK-NEXT:    vstrw.32 q1, [r1, #64]
; CHECK-NEXT:    vmov.f32 s31, s14
; CHECK-NEXT:    vstrw.32 q7, [r1, #16]
; CHECK-NEXT:    add sp, #48
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <16 x half>, ptr %src, align 4
  %s2 = getelementptr <16 x half>, ptr %src, i32 1
  %l2 = load <16 x half>, ptr %s2, align 4
  %s3 = getelementptr <16 x half>, ptr %src, i32 2
  %l3 = load <16 x half>, ptr %s3, align 4
  %t1 = shufflevector <16 x half> %l1, <16 x half> %l2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %t2 = shufflevector <16 x half> %l3, <16 x half> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %s = shufflevector <32 x half> %t1, <32 x half> %t2, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
  store <48 x half> %s, ptr %dst
  ret void
}

; f64

; Stride-3 interleave of three consecutive <2 x double> vectors read from
; %src; the <6 x double> result a0, b0, c0, a1, b1, c1 is stored to %dst.
define void @vst3_v2f64(ptr %src, ptr %dst) {
; CHECK-LABEL: vst3_v2f64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
; CHECK-NEXT:    vmov.f64 d6, d2
; CHECK-NEXT:    vmov.f64 d7, d1
; CHECK-NEXT:    vmov.f64 d1, d4
; CHECK-NEXT:    vstrw.32 q3, [r1, #16]
; CHECK-NEXT:    vmov.f64 d2, d5
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    vstrw.32 q1, [r1, #32]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <2 x double>, ptr %src, align 4
  %s2 = getelementptr <2 x double>, ptr %src, i32 1
  %l2 = load <2 x double>, ptr %s2, align 4
  %s3 = getelementptr <2 x double>, ptr %src, i32 2
  %l3 = load <2 x double>, ptr %s3, align 4
  %t1 = shufflevector <2 x double> %l1, <2 x double> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %t2 = shufflevector <2 x double> %l3, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %s = shufflevector <4 x double> %t1, <4 x double> %t2, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
  store <6 x double> %s, ptr %dst
  ret void
}

; Stride-3 interleave of three consecutive <4 x double> vectors read from
; %src; the <12 x double> result a0, b0, c0, a1, b1, c1, ... is stored to %dst.
define void @vst3_v4f64(ptr %src, ptr %dst) {
; CHECK-LABEL: vst3_v4f64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    vldrw.u32 q7, [r0, #48]
; CHECK-NEXT:    vldrw.u32 q6, [r0, #32]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #80]
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vmov.f64 d6, d15
; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
; CHECK-NEXT:    vldrw.u32 q4, [r0, #64]
; CHECK-NEXT:    vmov.f64 d15, d13
; CHECK-NEXT:    vmov.f64 d7, d1
; CHECK-NEXT:    vmov.f64 d10, d2
; CHECK-NEXT:    vstrw.32 q3, [r1, #80]
; CHECK-NEXT:    vmov.f64 d11, d12
; CHECK-NEXT:    vmov.f64 d2, d8
; CHECK-NEXT:    vstrw.32 q5, [r1]
; CHECK-NEXT:    vmov.f64 d1, d5
; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
; CHECK-NEXT:    vmov.f64 d8, d15
; CHECK-NEXT:    vstrw.32 q0, [r1, #64]
; CHECK-NEXT:    vmov.f64 d12, d4
; CHECK-NEXT:    vstrw.32 q4, [r1, #32]
; CHECK-NEXT:    vmov.f64 d13, d14
; CHECK-NEXT:    vstrw.32 q6, [r1, #48]
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <4 x double>, ptr %src, align 4
  %s2 = getelementptr <4 x double>, ptr %src, i32 1
  %l2 = load <4 x double>, ptr %s2, align 4
  %s3 = getelementptr <4 x double>, ptr %src, i32 2
  %l3 = load <4 x double>, ptr %s3, align 4
  %t1 = shufflevector <4 x double> %l1, <4 x double> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %t2 = shufflevector <4 x double> %l3, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %s = shufflevector <8 x double> %t1, <8 x double> %t2, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
  store <12 x double> %s, ptr %dst
  ret void
}