; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp,+fp64 -verify-machineinstrs %s -o - | FileCheck %s

; Codegen test for two-way interleaving stores with MVE enabled.
;
; Every function below has the same shape: it loads two vectors from
; consecutive vector-sized slots at %src, interleaves them element by element
; with a shufflevector (a0, b0, a1, b1, ...), and stores the double-width
; result to %dst.  The functions differ only in element type, element count
; and the alignment of the final store.
;
; The assertion comments inside each function record the instruction sequence
; expected from llc.  They are generated output: regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.

; i32

; Two <2 x i32> inputs.  The recorded output builds the <4 x i32> result from
; scalar loads and paired lane moves, then issues a single vstrw.32.
define void @vst2_v2i32(ptr %src, ptr %dst) {
; CHECK-LABEL: vst2_v2i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldm.w r0, {r2, r3, r12}
; CHECK-NEXT:    ldr r0, [r0, #12]
; CHECK-NEXT:    vmov q0[2], q0[0], r2, r3
; CHECK-NEXT:    vmov q0[3], q0[1], r12, r0
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <2 x i32>, ptr %src, align 4
  %s2 = getelementptr <2 x i32>, ptr %src, i32 1
  %l2 = load <2 x i32>, ptr %s2, align 4
  %s = shufflevector <2 x i32> %l1, <2 x i32> %l2, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  store <4 x i32> %s, ptr %dst, align 4
  ret void
}

; Two <4 x i32> inputs.  The recorded output is one vst20.32/vst21.32 pair.
define void @vst2_v4i32(ptr %src, ptr %dst) {
; CHECK-LABEL: vst2_v4i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vst20.32 {q0, q1}, [r1]
; CHECK-NEXT:    vst21.32 {q0, q1}, [r1]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <4 x i32>, ptr %src, align 4
  %s2 = getelementptr <4 x i32>, ptr %src, i32 1
  %l2 = load <4 x i32>, ptr %s2, align 4
  %s = shufflevector <4 x i32> %l1, <4 x i32> %l2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  store <8 x i32> %s, ptr %dst, align 4
  ret void
}

; Two <8 x i32> inputs.  The recorded output is two vst20.32/vst21.32 pairs;
; the first vst21.32 uses the "[r1]!" addressing form so the second pair can
; reuse r1.
define void @vst2_v8i32(ptr %src, ptr %dst) {
; CHECK-LABEL: vst2_v8i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vldrw.u32 q3, [r0, #48]
; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
; CHECK-NEXT:    vst20.32 {q0, q1}, [r1]
; CHECK-NEXT:    vst21.32 {q0, q1}, [r1]!
; CHECK-NEXT:    vst20.32 {q2, q3}, [r1]
; CHECK-NEXT:    vst21.32 {q2, q3}, [r1]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <8 x i32>, ptr %src, align 4
  %s2 = getelementptr <8 x i32>, ptr %src, i32 1
  %l2 = load <8 x i32>, ptr %s2, align 4
  %s = shufflevector <8 x i32> %l1, <8 x i32> %l2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  store <16 x i32> %s, ptr %dst, align 4
  ret void
}

; Two <16 x i32> inputs.  The recorded output is four vst20.32/vst21.32 pairs
; over q0-q7, with d8-d15 pushed on entry and popped before returning, and
; separate base registers (r0, r2) computed for the upper output blocks.
define void @vst2_v16i32(ptr %src, ptr %dst) {
; CHECK-LABEL: vst2_v16i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    vldrw.u32 q7, [r0, #64]
; CHECK-NEXT:    vldrw.u32 q6, [r0]
; CHECK-NEXT:    vldrw.u32 q1, [r0, #112]
; CHECK-NEXT:    vldrw.u32 q3, [r0, #96]
; CHECK-NEXT:    vldrw.u32 q5, [r0, #80]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
; CHECK-NEXT:    vldrw.u32 q4, [r0, #16]
; CHECK-NEXT:    add.w r0, r1, #96
; CHECK-NEXT:    add.w r2, r1, #64
; CHECK-NEXT:    vst20.32 {q6, q7}, [r1]
; CHECK-NEXT:    vst21.32 {q6, q7}, [r1]!
; CHECK-NEXT:    vst20.32 {q4, q5}, [r1]
; CHECK-NEXT:    vst20.32 {q2, q3}, [r2]
; CHECK-NEXT:    vst20.32 {q0, q1}, [r0]
; CHECK-NEXT:    vst21.32 {q4, q5}, [r1]
; CHECK-NEXT:    vst21.32 {q2, q3}, [r2]
; CHECK-NEXT:    vst21.32 {q0, q1}, [r0]
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <16 x i32>, ptr %src, align 4
  %s2 = getelementptr <16 x i32>, ptr %src, i32 1
  %l2 = load <16 x i32>, ptr %s2, align 4
  %s = shufflevector <16 x i32> %l1, <16 x i32> %l2, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
  store <32 x i32> %s, ptr %dst, align 4
  ret void
}

; Same IR as vst2_v4i32 except that the final store is only "align 1" (the
; loads stay "align 4").  The recorded output contains no vst20/vst21: lanes
; are rearranged with vmov.f32 and written with two vstrb.8 stores.
define void @vst2_v4i32_align1(ptr %src, ptr %dst) {
; CHECK-LABEL: vst2_v4i32_align1:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vmov.f32 s8, s6
; CHECK-NEXT:    vmov.f32 s9, s2
; CHECK-NEXT:    vmov.f32 s10, s7
; CHECK-NEXT:    vmov.f32 s11, s3
; CHECK-NEXT:    vmov.f32 s12, s4
; CHECK-NEXT:    vstrb.8 q2, [r1, #16]
; CHECK-NEXT:    vmov.f32 s13, s0
; CHECK-NEXT:    vmov.f32 s14, s5
; CHECK-NEXT:    vmov.f32 s15, s1
; CHECK-NEXT:    vstrb.8 q3, [r1]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <4 x i32>, ptr %src, align 4
  %s2 = getelementptr <4 x i32>, ptr %src, i32 1
  %l2 = load <4 x i32>, ptr %s2, align 4
  %s = shufflevector <4 x i32> %l1, <4 x i32> %l2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  store <8 x i32> %s, ptr %dst, align 1
  ret void
}

; i16

; Two <2 x i16> inputs.  The recorded output uses four ldrh scalar loads,
; paired lane moves into q0, and a single vstrh.32.
define void @vst2_v2i16(ptr %src, ptr %dst) {
; CHECK-LABEL: vst2_v2i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldrh r2, [r0, #2]
; CHECK-NEXT:    ldrh r3, [r0]
; CHECK-NEXT:    ldrh.w r12, [r0, #6]
; CHECK-NEXT:    ldrh r0, [r0, #4]
; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
; CHECK-NEXT:    vmov q0[3], q0[1], r0, r12
; CHECK-NEXT:    vstrh.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <2 x i16>, ptr %src, align 4
  %s2 = getelementptr <2 x i16>, ptr %src, i32 1
  %l2 = load <2 x i16>, ptr %s2, align 4
  %s = shufflevector <2 x i16> %l1, <2 x i16> %l2, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  store <4 x i16> %s, ptr %dst, align 2
  ret void
}

; Two <4 x i16> inputs.  The recorded output loads both with vldrh.u32,
; merges them with vmovnt.i32, and stores once with vstrh.16.
define void @vst2_v4i16(ptr %src, ptr %dst) {
; CHECK-LABEL: vst2_v4i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u32 q0, [r0, #8]
; CHECK-NEXT:    vldrh.u32 q1, [r0]
; CHECK-NEXT:    vmovnt.i32 q1, q0
; CHECK-NEXT:    vstrh.16 q1, [r1]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <4 x i16>, ptr %src, align 4
  %s2 = getelementptr <4 x i16>, ptr %src, i32 1
  %l2 = load <4 x i16>, ptr %s2, align 4
  %s = shufflevector <4 x i16> %l1, <4 x i16> %l2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  store <8 x i16> %s, ptr %dst, align 2
  ret void
}

; Two <8 x i16> inputs.  The recorded output is one vst20.16/vst21.16 pair.
define void @vst2_v8i16(ptr %src, ptr %dst) {
; CHECK-LABEL: vst2_v8i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vst20.16 {q0, q1}, [r1]
; CHECK-NEXT:    vst21.16 {q0, q1}, [r1]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <8 x i16>, ptr %src, align 4
  %s2 = getelementptr <8 x i16>, ptr %src, i32 1
  %l2 = load <8 x i16>, ptr %s2, align 4
  %s = shufflevector <8 x i16> %l1, <8 x i16> %l2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  store <16 x i16> %s, ptr %dst, align 2
  ret void
}

; Two <16 x i16> inputs.  The recorded output is two vst20.16/vst21.16 pairs,
; the first vst21.16 using the "[r1]!" addressing form.
define void @vst2_v16i16(ptr %src, ptr %dst) {
; CHECK-LABEL: vst2_v16i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vldrw.u32 q3, [r0, #48]
; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
; CHECK-NEXT:    vst20.16 {q0, q1}, [r1]
; CHECK-NEXT:    vst21.16 {q0, q1}, [r1]!
; CHECK-NEXT:    vst20.16 {q2, q3}, [r1]
; CHECK-NEXT:    vst21.16 {q2, q3}, [r1]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <16 x i16>, ptr %src, align 4
  %s2 = getelementptr <16 x i16>, ptr %src, i32 1
  %l2 = load <16 x i16>, ptr %s2, align 4
  %s = shufflevector <16 x i16> %l1, <16 x i16> %l2, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
  store <32 x i16> %s, ptr %dst, align 2
  ret void
}

; Same IR as vst2_v8i16 except that the final store is only "align 1".  The
; recorded output contains no vst20/vst21: half-word lanes are rearranged with
; vmovx.f16 / vins.f16 / vmov.f32 and written with two vstrb.8 stores.
define void @vst2_v8i16_align1(ptr %src, ptr %dst) {
; CHECK-LABEL: vst2_v8i16_align1:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q2, [r0]
; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
; CHECK-NEXT:    vmovx.f16 s1, s10
; CHECK-NEXT:    vmovx.f16 s0, s6
; CHECK-NEXT:    vins.f16 s10, s6
; CHECK-NEXT:    vmovx.f16 s3, s11
; CHECK-NEXT:    vmovx.f16 s6, s7
; CHECK-NEXT:    vins.f16 s11, s7
; CHECK-NEXT:    vins.f16 s3, s6
; CHECK-NEXT:    vmovx.f16 s6, s8
; CHECK-NEXT:    vins.f16 s8, s4
; CHECK-NEXT:    vmovx.f16 s4, s4
; CHECK-NEXT:    vmov q3, q2
; CHECK-NEXT:    vins.f16 s6, s4
; CHECK-NEXT:    vmovx.f16 s15, s9
; CHECK-NEXT:    vins.f16 s9, s5
; CHECK-NEXT:    vmovx.f16 s4, s5
; CHECK-NEXT:    vins.f16 s1, s0
; CHECK-NEXT:    vmov.f32 s0, s10
; CHECK-NEXT:    vins.f16 s15, s4
; CHECK-NEXT:    vmov.f32 s2, s11
; CHECK-NEXT:    vmov.f32 s13, s6
; CHECK-NEXT:    vstrb.8 q0, [r1, #16]
; CHECK-NEXT:    vmov.f32 s14, s9
; CHECK-NEXT:    vstrb.8 q3, [r1]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <8 x i16>, ptr %src, align 4
  %s2 = getelementptr <8 x i16>, ptr %src, i32 1
  %l2 = load <8 x i16>, ptr %s2, align 4
  %s = shufflevector <8 x i16> %l1, <8 x i16> %l2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  store <16 x i16> %s, ptr %dst, align 1
  ret void
}

; i8

; Two <2 x i8> inputs.  The recorded output uses four ldrb scalar loads,
; paired lane moves into q0, and a single vstrb.32.
define void @vst2_v2i8(ptr %src, ptr %dst) {
; CHECK-LABEL: vst2_v2i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldrb r2, [r0]
; CHECK-NEXT:    ldrb r3, [r0, #1]
; CHECK-NEXT:    ldrb.w r12, [r0, #2]
; CHECK-NEXT:    ldrb r0, [r0, #3]
; CHECK-NEXT:    vmov q0[2], q0[0], r2, r3
; CHECK-NEXT:    vmov q0[3], q0[1], r12, r0
; CHECK-NEXT:    vstrb.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <2 x i8>, ptr %src, align 4
  %s2 = getelementptr <2 x i8>, ptr %src, i32 1
  %l2 = load <2 x i8>, ptr %s2, align 4
  %s = shufflevector <2 x i8> %l1, <2 x i8> %l2, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  store <4 x i8> %s, ptr %dst, align 1
  ret void
}

; Two <4 x i8> inputs.  The recorded output loads both with vldrb.u32, merges
; them with vmovnt.i32, and stores once with vstrb.16.
define void @vst2_v4i8(ptr %src, ptr %dst) {
; CHECK-LABEL: vst2_v4i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u32 q0, [r0, #4]
; CHECK-NEXT:    vldrb.u32 q1, [r0]
; CHECK-NEXT:    vmovnt.i32 q1, q0
; CHECK-NEXT:    vstrb.16 q1, [r1]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <4 x i8>, ptr %src, align 4
  %s2 = getelementptr <4 x i8>, ptr %src, i32 1
  %l2 = load <4 x i8>, ptr %s2, align 4
  %s = shufflevector <4 x i8> %l1, <4 x i8> %l2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  store <8 x i8> %s, ptr %dst, align 1
  ret void
}

; Two <8 x i8> inputs.  The recorded output loads both with vldrb.u16, merges
; them with vmovnt.i16, and stores once with vstrb.8.
define void @vst2_v8i8(ptr %src, ptr %dst) {
; CHECK-LABEL: vst2_v8i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u16 q0, [r0, #8]
; CHECK-NEXT:    vldrb.u16 q1, [r0]
; CHECK-NEXT:    vmovnt.i16 q1, q0
; CHECK-NEXT:    vstrb.8 q1, [r1]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <8 x i8>, ptr %src, align 4
  %s2 = getelementptr <8 x i8>, ptr %src, i32 1
  %l2 = load <8 x i8>, ptr %s2, align 4
  %s = shufflevector <8 x i8> %l1, <8 x i8> %l2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  store <16 x i8> %s, ptr %dst, align 1
  ret void
}

; Two <16 x i8> inputs.  The recorded output is one vst20.8/vst21.8 pair.
define void @vst2_v16i8(ptr %src, ptr %dst) {
; CHECK-LABEL: vst2_v16i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vst20.8 {q0, q1}, [r1]
; CHECK-NEXT:    vst21.8 {q0, q1}, [r1]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <16 x i8>, ptr %src, align 4
  %s2 = getelementptr <16 x i8>, ptr %src, i32 1
  %l2 = load <16 x i8>, ptr %s2, align 4
  %s = shufflevector <16 x i8> %l1, <16 x i8> %l2, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
  store <32 x i8> %s, ptr %dst, align 1
  ret void
}

; i64

; Two <2 x i64> inputs.  The recorded output contains no vst20/vst21: the
; 64-bit halves are rearranged with vmov.f64 and written with two vstrw.32.
define void @vst2_v2i64(ptr %src, ptr %dst) {
; CHECK-LABEL: vst2_v2i64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vmov.f64 d4, d3
; CHECK-NEXT:    vmov.f64 d5, d1
; CHECK-NEXT:    vmov.f64 d3, d0
; CHECK-NEXT:    vstrw.32 q2, [r1, #16]
; CHECK-NEXT:    vstrw.32 q1, [r1]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <2 x i64>, ptr %src, align 4
  %s2 = getelementptr <2 x i64>, ptr %src, i32 1
  %l2 = load <2 x i64>, ptr %s2, align 4
  %s = shufflevector <2 x i64> %l1, <2 x i64> %l2, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  store <4 x i64> %s, ptr %dst, align 8
  ret void
}

; Two <4 x i64> inputs.  The recorded output rearranges 64-bit halves with
; vmov.f64, writes four vstrw.32 stores, and saves/restores d8-d9 around the
; body.
define void @vst2_v4i64(ptr %src, ptr %dst) {
; CHECK-LABEL: vst2_v4i64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
; CHECK-NEXT:    vldrw.u32 q2, [r0]
; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
; CHECK-NEXT:    vldrw.u32 q3, [r0, #16]
; CHECK-NEXT:    vmov.f64 d8, d4
; CHECK-NEXT:    vmov.f64 d9, d0
; CHECK-NEXT:    vmov.f64 d0, d5
; CHECK-NEXT:    vstrw.32 q4, [r1]
; CHECK-NEXT:    vmov.f64 d5, d2
; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
; CHECK-NEXT:    vmov.f64 d4, d6
; CHECK-NEXT:    vmov.f64 d2, d7
; CHECK-NEXT:    vstrw.32 q2, [r1, #32]
; CHECK-NEXT:    vstrw.32 q1, [r1, #48]
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <4 x i64>, ptr %src, align 4
  %s2 = getelementptr <4 x i64>, ptr %src, i32 1
  %l2 = load <4 x i64>, ptr %s2, align 4
  %s = shufflevector <4 x i64> %l1, <4 x i64> %l2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  store <8 x i64> %s, ptr %dst, align 8
  ret void
}

; f32

; Two <2 x float> inputs.  The recorded output loads the four scalars with
; vldr directly into the interleaved s-register positions of q0 and stores
; once with vstrw.32.
define void @vst2_v2f32(ptr %src, ptr %dst) {
; CHECK-LABEL: vst2_v2f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldr s0, [r0]
; CHECK-NEXT:    vldr s2, [r0, #4]
; CHECK-NEXT:    vldr s1, [r0, #8]
; CHECK-NEXT:    vldr s3, [r0, #12]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <2 x float>, ptr %src, align 4
  %s2 = getelementptr <2 x float>, ptr %src, i32 1
  %l2 = load <2 x float>, ptr %s2, align 4
  %s = shufflevector <2 x float> %l1, <2 x float> %l2, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  store <4 x float> %s, ptr %dst, align 4
  ret void
}

; Two <4 x float> inputs.  The recorded output is one vst20.32/vst21.32 pair,
; identical to the sequence recorded for vst2_v4i32.
define void @vst2_v4f32(ptr %src, ptr %dst) {
; CHECK-LABEL: vst2_v4f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vst20.32 {q0, q1}, [r1]
; CHECK-NEXT:    vst21.32 {q0, q1}, [r1]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <4 x float>, ptr %src, align 4
  %s2 = getelementptr <4 x float>, ptr %src, i32 1
  %l2 = load <4 x float>, ptr %s2, align 4
  %s = shufflevector <4 x float> %l1, <4 x float> %l2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  store <8 x float> %s, ptr %dst, align 4
  ret void
}

; Two <8 x float> inputs.  The recorded output is two vst20.32/vst21.32 pairs,
; the first vst21.32 using the "[r1]!" addressing form.
define void @vst2_v8f32(ptr %src, ptr %dst) {
; CHECK-LABEL: vst2_v8f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vldrw.u32 q3, [r0, #48]
; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
; CHECK-NEXT:    vst20.32 {q0, q1}, [r1]
; CHECK-NEXT:    vst21.32 {q0, q1}, [r1]!
; CHECK-NEXT:    vst20.32 {q2, q3}, [r1]
; CHECK-NEXT:    vst21.32 {q2, q3}, [r1]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <8 x float>, ptr %src, align 4
  %s2 = getelementptr <8 x float>, ptr %src, i32 1
  %l2 = load <8 x float>, ptr %s2, align 4
  %s = shufflevector <8 x float> %l1, <8 x float> %l2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  store <16 x float> %s, ptr %dst, align 4
  ret void
}

; Two <16 x float> inputs.  The recorded output is four vst20.32/vst21.32
; pairs over q0-q7, with d8-d15 pushed on entry and popped before returning.
define void @vst2_v16f32(ptr %src, ptr %dst) {
; CHECK-LABEL: vst2_v16f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    vldrw.u32 q7, [r0, #64]
; CHECK-NEXT:    vldrw.u32 q6, [r0]
; CHECK-NEXT:    vldrw.u32 q1, [r0, #112]
; CHECK-NEXT:    vldrw.u32 q3, [r0, #96]
; CHECK-NEXT:    vldrw.u32 q5, [r0, #80]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
; CHECK-NEXT:    vldrw.u32 q4, [r0, #16]
; CHECK-NEXT:    add.w r0, r1, #96
; CHECK-NEXT:    add.w r2, r1, #64
; CHECK-NEXT:    vst20.32 {q6, q7}, [r1]
; CHECK-NEXT:    vst21.32 {q6, q7}, [r1]!
; CHECK-NEXT:    vst20.32 {q4, q5}, [r1]
; CHECK-NEXT:    vst20.32 {q2, q3}, [r2]
; CHECK-NEXT:    vst20.32 {q0, q1}, [r0]
; CHECK-NEXT:    vst21.32 {q4, q5}, [r1]
; CHECK-NEXT:    vst21.32 {q2, q3}, [r2]
; CHECK-NEXT:    vst21.32 {q0, q1}, [r0]
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <16 x float>, ptr %src, align 4
  %s2 = getelementptr <16 x float>, ptr %src, i32 1
  %l2 = load <16 x float>, ptr %s2, align 4
  %s = shufflevector <16 x float> %l1, <16 x float> %l2, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
  store <32 x float> %s, ptr %dst, align 4
  ret void
}

; Same IR as vst2_v4f32 except that the final store is only "align 1".  The
; recorded output contains no vst20/vst21: lanes are rearranged with vmov.f32
; and written with two vstrb.8 stores.
define void @vst2_v4f32_align1(ptr %src, ptr %dst) {
; CHECK-LABEL: vst2_v4f32_align1:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vmov.f32 s8, s6
; CHECK-NEXT:    vmov.f32 s9, s2
; CHECK-NEXT:    vmov.f32 s10, s7
; CHECK-NEXT:    vmov.f32 s11, s3
; CHECK-NEXT:    vmov.f32 s12, s4
; CHECK-NEXT:    vstrb.8 q2, [r1, #16]
; CHECK-NEXT:    vmov.f32 s13, s0
; CHECK-NEXT:    vmov.f32 s14, s5
; CHECK-NEXT:    vmov.f32 s15, s1
; CHECK-NEXT:    vstrb.8 q3, [r1]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <4 x float>, ptr %src, align 4
  %s2 = getelementptr <4 x float>, ptr %src, i32 1
  %l2 = load <4 x float>, ptr %s2, align 4
  %s = shufflevector <4 x float> %l1, <4 x float> %l2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  store <8 x float> %s, ptr %dst, align 1
  ret void
}

; f16

; Two <2 x half> inputs.  The recorded output loads both inputs as words with
; ldrd, rearranges the halves with vmovx.f16 / vins.f16, and writes the result
; back with two scalar str instructions.
define void @vst2_v2f16(ptr %src, ptr %dst) {
; CHECK-LABEL: vst2_v2f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldrd r0, r2, [r0]
; CHECK-NEXT:    vmov.32 q1[0], r0
; CHECK-NEXT:    vmov.32 q0[0], r2
; CHECK-NEXT:    vmovx.f16 s5, s4
; CHECK-NEXT:    vins.f16 s4, s0
; CHECK-NEXT:    vmovx.f16 s0, s0
; CHECK-NEXT:    vins.f16 s5, s0
; CHECK-NEXT:    vmov r0, r2, d2
; CHECK-NEXT:    str r2, [r1, #4]
; CHECK-NEXT:    str r0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <2 x half>, ptr %src, align 4
  %s2 = getelementptr <2 x half>, ptr %src, i32 1
  %l2 = load <2 x half>, ptr %s2, align 4
  %s = shufflevector <2 x half> %l1, <2 x half> %l2, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  store <4 x half> %s, ptr %dst, align 2
  ret void
}

; Two <4 x half> inputs.  The recorded output loads both inputs with ldrd,
; rearranges the halves with vmovx.f16 / vins.f16 / vmov.f32, and stores once
; with vstrh.16.
define void @vst2_v4f16(ptr %src, ptr %dst) {
; CHECK-LABEL: vst2_v4f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldrd r2, r12, [r0]
; CHECK-NEXT:    ldrd r3, r0, [r0, #8]
; CHECK-NEXT:    vmov.32 q0[0], r2
; CHECK-NEXT:    vmov.32 q1[0], r3
; CHECK-NEXT:    vmov.32 q0[1], r12
; CHECK-NEXT:    vmov.32 q1[1], r0
; CHECK-NEXT:    vmovx.f16 s2, s0
; CHECK-NEXT:    vins.f16 s0, s4
; CHECK-NEXT:    vmovx.f16 s4, s4
; CHECK-NEXT:    vins.f16 s2, s4
; CHECK-NEXT:    vmovx.f16 s4, s1
; CHECK-NEXT:    vins.f16 s1, s5
; CHECK-NEXT:    vmovx.f16 s6, s5
; CHECK-NEXT:    vmov q2, q0
; CHECK-NEXT:    vins.f16 s4, s6
; CHECK-NEXT:    vmov.f32 s9, s2
; CHECK-NEXT:    vmov.f32 s10, s1
; CHECK-NEXT:    vmov.f32 s11, s4
; CHECK-NEXT:    vstrh.16 q2, [r1]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <4 x half>, ptr %src, align 4
  %s2 = getelementptr <4 x half>, ptr %src, i32 1
  %l2 = load <4 x half>, ptr %s2, align 4
  %s = shufflevector <4 x half> %l1, <4 x half> %l2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  store <8 x half> %s, ptr %dst, align 2
  ret void
}

; Two <8 x half> inputs.  The recorded output is one vst20.16/vst21.16 pair,
; identical to the sequence recorded for vst2_v8i16.
define void @vst2_v8f16(ptr %src, ptr %dst) {
; CHECK-LABEL: vst2_v8f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vst20.16 {q0, q1}, [r1]
; CHECK-NEXT:    vst21.16 {q0, q1}, [r1]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <8 x half>, ptr %src, align 4
  %s2 = getelementptr <8 x half>, ptr %src, i32 1
  %l2 = load <8 x half>, ptr %s2, align 4
  %s = shufflevector <8 x half> %l1, <8 x half> %l2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  store <16 x half> %s, ptr %dst, align 2
  ret void
}

; Two <16 x half> inputs.  The recorded output is two vst20.16/vst21.16 pairs,
; the first vst21.16 using the "[r1]!" addressing form.
define void @vst2_v16f16(ptr %src, ptr %dst) {
; CHECK-LABEL: vst2_v16f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
; CHECK-NEXT:    vldrw.u32 q2, [r0]
; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vst20.16 {q2, q3}, [r1]
; CHECK-NEXT:    vst21.16 {q2, q3}, [r1]!
; CHECK-NEXT:    vst20.16 {q0, q1}, [r1]
; CHECK-NEXT:    vst21.16 {q0, q1}, [r1]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <16 x half>, ptr %src, align 4
  %s2 = getelementptr <16 x half>, ptr %src, i32 1
  %l2 = load <16 x half>, ptr %s2, align 4
  %s = shufflevector <16 x half> %l1, <16 x half> %l2, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
  store <32 x half> %s, ptr %dst, align 2
  ret void
}

; Same IR as vst2_v8f16 except that the final store is only "align 1".  The
; recorded output contains no vst20/vst21: half-word lanes are rearranged with
; vmovx.f16 / vins.f16 / vmov.f32 and written with two vstrb.8 stores.
define void @vst2_v8f16_align1(ptr %src, ptr %dst) {
; CHECK-LABEL: vst2_v8f16_align1:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
; CHECK-NEXT:    vmovx.f16 s1, s6
; CHECK-NEXT:    vmovx.f16 s0, s10
; CHECK-NEXT:    vins.f16 s1, s0
; CHECK-NEXT:    vmovx.f16 s3, s7
; CHECK-NEXT:    vmovx.f16 s0, s11
; CHECK-NEXT:    vins.f16 s6, s10
; CHECK-NEXT:    vins.f16 s3, s0
; CHECK-NEXT:    vmovx.f16 s10, s4
; CHECK-NEXT:    vmovx.f16 s0, s8
; CHECK-NEXT:    vins.f16 s7, s11
; CHECK-NEXT:    vins.f16 s4, s8
; CHECK-NEXT:    vins.f16 s10, s0
; CHECK-NEXT:    vmovx.f16 s8, s5
; CHECK-NEXT:    vins.f16 s5, s9
; CHECK-NEXT:    vmovx.f16 s0, s9
; CHECK-NEXT:    vmov q3, q1
; CHECK-NEXT:    vins.f16 s8, s0
; CHECK-NEXT:    vmov.f32 s0, s6
; CHECK-NEXT:    vmov.f32 s2, s7
; CHECK-NEXT:    vmov.f32 s13, s10
; CHECK-NEXT:    vstrb.8 q0, [r1, #16]
; CHECK-NEXT:    vmov.f32 s14, s5
; CHECK-NEXT:    vmov.f32 s15, s8
; CHECK-NEXT:    vstrb.8 q3, [r1]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <8 x half>, ptr %src, align 4
  %s2 = getelementptr <8 x half>, ptr %src, i32 1
  %l2 = load <8 x half>, ptr %s2, align 4
  %s = shufflevector <8 x half> %l1, <8 x half> %l2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  store <16 x half> %s, ptr %dst, align 1
  ret void
}

; f64

; Two <2 x double> inputs.  The recorded output matches vst2_v2i64: vmov.f64
; rearrangement followed by two vstrw.32 stores.
define void @vst2_v2f64(ptr %src, ptr %dst) {
; CHECK-LABEL: vst2_v2f64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vmov.f64 d4, d3
; CHECK-NEXT:    vmov.f64 d5, d1
; CHECK-NEXT:    vmov.f64 d3, d0
; CHECK-NEXT:    vstrw.32 q2, [r1, #16]
; CHECK-NEXT:    vstrw.32 q1, [r1]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <2 x double>, ptr %src, align 4
  %s2 = getelementptr <2 x double>, ptr %src, i32 1
  %l2 = load <2 x double>, ptr %s2, align 4
  %s = shufflevector <2 x double> %l1, <2 x double> %l2, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  store <4 x double> %s, ptr %dst, align 8
  ret void
}

; Two <4 x double> inputs.  The recorded output matches vst2_v4i64: vmov.f64
; rearrangement, four vstrw.32 stores, and d8-d9 saved/restored around the
; body.
define void @vst2_v4f64(ptr %src, ptr %dst) {
; CHECK-LABEL: vst2_v4f64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
; CHECK-NEXT:    vldrw.u32 q2, [r0]
; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
; CHECK-NEXT:    vldrw.u32 q3, [r0, #16]
; CHECK-NEXT:    vmov.f64 d8, d4
; CHECK-NEXT:    vmov.f64 d9, d0
; CHECK-NEXT:    vmov.f64 d0, d5
; CHECK-NEXT:    vstrw.32 q4, [r1]
; CHECK-NEXT:    vmov.f64 d5, d2
; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
; CHECK-NEXT:    vmov.f64 d4, d6
; CHECK-NEXT:    vmov.f64 d2, d7
; CHECK-NEXT:    vstrw.32 q2, [r1, #32]
; CHECK-NEXT:    vstrw.32 q1, [r1, #48]
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <4 x double>, ptr %src, align 4
  %s2 = getelementptr <4 x double>, ptr %src, i32 1
  %l2 = load <4 x double>, ptr %s2, align 4
  %s = shufflevector <4 x double> %l1, <4 x double> %l2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  store <8 x double> %s, ptr %dst, align 8
  ret void
}