; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -enable-arm-maskedldst %s -o - | FileCheck %s

; i32

; Expand
define arm_aapcs_vfpcc void @ptr_v2i32(<2 x i32> %v, ptr %offptr) {
; CHECK-LABEL: ptr_v2i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: ldrd r0, r1, [r0]
; CHECK-NEXT: str r2, [r0]
; CHECK-NEXT: vmov r0, s2
; CHECK-NEXT: str r0, [r1]
; CHECK-NEXT: bx lr
entry:
  %offs = load <2 x ptr>, ptr %offptr, align 4
  call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> %v, <2 x ptr> %offs, i32 4, <2 x i1> <i1 true, i1 true>)
  ret void
}

; VSTRW.32 Qd, [offs, 0]
define arm_aapcs_vfpcc void @ptr_v4i32(<4 x i32> %v, ptr %offptr) {
; CHECK-LABEL: ptr_v4i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: vstrw.32 q0, [q1]
; CHECK-NEXT: bx lr
entry:
  %offs = load <4 x ptr>, ptr %offptr, align 4
  call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %v, <4 x ptr> %offs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Expand
define arm_aapcs_vfpcc void @ptr_v8i32(<8 x i32> %v, ptr %offptr) {
; CHECK-LABEL: ptr_v8i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, lr}
; CHECK-NEXT: push {r4, r5, r6, r7, lr}
; CHECK-NEXT: vldrw.u32 q2, [r0]
; CHECK-NEXT: vmov r3, r4, d0
; CHECK-NEXT: vmov r1, r2, d4
; CHECK-NEXT: vmov lr, r12, d5
; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
; CHECK-NEXT: vmov r0, r5, d1
; CHECK-NEXT: str r3, [r1]
; CHECK-NEXT: vmov r1, r7, d4
; CHECK-NEXT: str r4, [r2]
; CHECK-NEXT: vmov r2, r4, d5
; CHECK-NEXT: str.w r0, [lr]
; CHECK-NEXT: vmov r0, r3, d2
; CHECK-NEXT: str.w r5, [r12]
; CHECK-NEXT: vmov r5, r6, d3
; CHECK-NEXT: str r0, [r1]
; CHECK-NEXT: str r3, [r7]
; CHECK-NEXT: str r5, [r2]
; CHECK-NEXT: str r6, [r4]
; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
entry:
  %offs = load <8 x ptr>, ptr %offptr, align 4
  call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %v, <8 x ptr> %offs, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Expand
define arm_aapcs_vfpcc void @ptr_v16i32(<16 x i32> %v, ptr %offptr) {
; CHECK-LABEL: ptr_v16i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, lr}
; CHECK-NEXT: push {r4, r5, r6, r7, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: vldrw.u32 q4, [r0]
; CHECK-NEXT: vmov r3, r4, d0
; CHECK-NEXT: vldrw.u32 q5, [r0, #32]
; CHECK-NEXT: vldrw.u32 q6, [r0, #16]
; CHECK-NEXT: vmov r1, r2, d8
; CHECK-NEXT: vmov lr, r12, d9
; CHECK-NEXT: vldrw.u32 q4, [r0, #48]
; CHECK-NEXT: vmov r0, r5, d1
; CHECK-NEXT: str r3, [r1]
; CHECK-NEXT: vmov r1, r3, d12
; CHECK-NEXT: str r4, [r2]
; CHECK-NEXT: vmov r2, r7, d13
; CHECK-NEXT: str.w r0, [lr]
; CHECK-NEXT: vmov r0, r4, d2
; CHECK-NEXT: str.w r5, [r12]
; CHECK-NEXT: vmov r5, r6, d3
; CHECK-NEXT: str r0, [r1]
; CHECK-NEXT: vmov r0, r1, d10
; CHECK-NEXT: str r4, [r3]
; CHECK-NEXT: vmov r3, r4, d11
; CHECK-NEXT: str r5, [r2]
; CHECK-NEXT: vmov r2, r5, d4
; CHECK-NEXT: str r6, [r7]
; CHECK-NEXT: vmov r7, r6, d5
; CHECK-NEXT: str r2, [r0]
; CHECK-NEXT: vmov r0, r2, d8
; CHECK-NEXT: str r5, [r1]
; CHECK-NEXT: vmov r1, r5, d9
; CHECK-NEXT: str r7, [r3]
; CHECK-NEXT: vmov r3, r7, d6
; CHECK-NEXT: str r6, [r4]
; CHECK-NEXT: vmov r6, r4, d7
; CHECK-NEXT: str r3, [r0]
; CHECK-NEXT: str r7, [r2]
; CHECK-NEXT: str r6, [r1]
; CHECK-NEXT: str r4, [r5]
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
entry:
  %offs = load <16 x ptr>, ptr %offptr, align 4
  call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %v, <16 x ptr> %offs, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; f32

; Expand
define arm_aapcs_vfpcc void @ptr_v2f32(<2 x float> %v, ptr %offptr) {
; CHECK-LABEL: ptr_v2f32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldrd r0, r1, [r0]
; CHECK-NEXT: vstr s0, [r0]
; CHECK-NEXT: vstr s1, [r1]
; CHECK-NEXT: bx lr
entry:
  %offs = load <2 x ptr>, ptr %offptr, align 4
  call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> %v, <2 x ptr> %offs, i32 4, <2 x i1> <i1 true, i1 true>)
  ret void
}

; VSTRW.32 Qd, [offs, 0]
define arm_aapcs_vfpcc void @ptr_v4f32(<4 x float> %v, ptr %offptr) {
; CHECK-LABEL: ptr_v4f32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: vstrw.32 q0, [q1]
; CHECK-NEXT: bx lr
entry:
  %offs = load <4 x ptr>, ptr %offptr, align 4
  call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> %v, <4 x ptr> %offs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Expand
define arm_aapcs_vfpcc void @ptr_v8f32(<8 x float> %v, ptr %offptr) {
; CHECK-LABEL: ptr_v8f32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r7, lr}
; CHECK-NEXT: push {r4, r5, r7, lr}
; CHECK-NEXT: vldrw.u32 q2, [r0]
; CHECK-NEXT: vmov r1, lr, d4
; CHECK-NEXT: vmov r3, r12, d5
; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
; CHECK-NEXT: vmov r0, r2, d4
; CHECK-NEXT: vmov r4, r5, d5
; CHECK-NEXT: vstr s0, [r1]
; CHECK-NEXT: vstr s1, [lr]
; CHECK-NEXT: vstr s2, [r3]
; CHECK-NEXT: vstr s3, [r12]
; CHECK-NEXT: vstr s4, [r0]
; CHECK-NEXT: vstr s5, [r2]
; CHECK-NEXT: vstr s6, [r4]
; CHECK-NEXT: vstr s7, [r5]
; CHECK-NEXT: pop {r4, r5, r7, pc}
entry:
  %offs = load <8 x ptr>, ptr %offptr, align 4
  call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> %v, <8 x ptr> %offs, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; i16

; Expand.
define arm_aapcs_vfpcc void @ptr_i16(<8 x i16> %v, ptr %offptr) {
; CHECK-LABEL: ptr_i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, lr}
; CHECK-NEXT: push {r4, r5, r6, lr}
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: vmov.u16 r6, q0[0]
; CHECK-NEXT: vmov r1, r2, d2
; CHECK-NEXT: vmov r3, r12, d3
; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
; CHECK-NEXT: vmov r0, lr, d2
; CHECK-NEXT: vmov r4, r5, d3
; CHECK-NEXT: strh r6, [r1]
; CHECK-NEXT: vmov.u16 r1, q0[1]
; CHECK-NEXT: strh r1, [r2]
; CHECK-NEXT: vmov.u16 r1, q0[2]
; CHECK-NEXT: strh r1, [r3]
; CHECK-NEXT: vmov.u16 r1, q0[3]
; CHECK-NEXT: strh.w r1, [r12]
; CHECK-NEXT: vmov.u16 r1, q0[4]
; CHECK-NEXT: strh r1, [r0]
; CHECK-NEXT: vmov.u16 r0, q0[5]
; CHECK-NEXT: strh.w r0, [lr]
; CHECK-NEXT: vmov.u16 r0, q0[6]
; CHECK-NEXT: strh r0, [r4]
; CHECK-NEXT: vmov.u16 r0, q0[7]
; CHECK-NEXT: strh r0, [r5]
; CHECK-NEXT: pop {r4, r5, r6, pc}
entry:
  %offs = load <8 x ptr>, ptr %offptr, align 4
  call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> %v, <8 x ptr> %offs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Expand
define arm_aapcs_vfpcc void @ptr_v2i16_trunc(<2 x i32> %v, ptr %offptr) {
; CHECK-LABEL: ptr_v2i16_trunc:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: ldrd r0, r1, [r0]
; CHECK-NEXT: strh r2, [r0]
; CHECK-NEXT: vmov r0, s2
; CHECK-NEXT: strh r0, [r1]
; CHECK-NEXT: bx lr
entry:
  %offs = load <2 x ptr>, ptr %offptr, align 4
  %ext = trunc <2 x i32> %v to <2 x i16>
  call void @llvm.masked.scatter.v2i16.v2p0(<2 x i16> %ext, <2 x ptr> %offs, i32 2, <2 x i1> <i1 true, i1 true>)
  ret void
}

define arm_aapcs_vfpcc void @ptr_v4i16_trunc(<4 x i32> %v, ptr %offptr) {
; CHECK-LABEL: ptr_v4i16_trunc:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: movs r0, #0
; CHECK-NEXT: vstrh.32 q0, [r0, q1]
; CHECK-NEXT: bx lr
entry:
  %offs = load <4 x ptr>, ptr %offptr, align 4
  %ext = trunc <4 x i32> %v to <4 x i16>
  call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> %ext, <4 x ptr> %offs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  ret void
}

define arm_aapcs_vfpcc void @ptr_v4i16_dup(i32 %v, <4 x ptr> %offs) {
; CHECK-LABEL: ptr_v4i16_dup:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vdup.32 q1, r0
; CHECK-NEXT: movs r1, #0
; CHECK-NEXT: vmovlb.u16 q1, q1
; CHECK-NEXT: vstrh.32 q1, [r1, q0]
; CHECK-NEXT: bx lr
entry:
  %ext = trunc i32 %v to i16
  %splatinsert = insertelement <4 x i16> poison, i16 %ext, i32 0
  %splat = shufflevector <4 x i16> %splatinsert, <4 x i16> poison, <4 x i32> zeroinitializer
  call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> %splat, <4 x ptr> %offs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Expand
define arm_aapcs_vfpcc void @ptr_v8i16_trunc(<8 x i32> %v, ptr %offptr) {
; CHECK-LABEL: ptr_v8i16_trunc:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, lr}
; CHECK-NEXT: push {r4, r5, r6, r7, lr}
; CHECK-NEXT: vldrw.u32 q2, [r0]
; CHECK-NEXT: vmov r3, r4, d0
; CHECK-NEXT: vmov r1, r2, d4
; CHECK-NEXT: vmov lr, r12, d5
; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
; CHECK-NEXT: vmov r0, r5, d1
; CHECK-NEXT: strh r3, [r1]
; CHECK-NEXT: vmov r1, r7, d4
; CHECK-NEXT: strh r4, [r2]
; CHECK-NEXT: vmov r2, r4, d5
; CHECK-NEXT: strh.w r0, [lr]
; CHECK-NEXT: vmov r0, r3, d2
; CHECK-NEXT: strh.w r5, [r12]
; CHECK-NEXT: vmov r5, r6, d3
; CHECK-NEXT: strh r0, [r1]
; CHECK-NEXT: strh r3, [r7]
; CHECK-NEXT: strh r5, [r2]
; CHECK-NEXT: strh r6, [r4]
; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
entry:
  %offs = load <8 x ptr>, ptr %offptr, align 4
  %ext = trunc <8 x i32> %v to <8 x i16>
  call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> %ext, <8 x ptr> %offs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; f16

; Expand.
define arm_aapcs_vfpcc void @ptr_f16(<8 x half> %v, ptr %offptr) {
; CHECK-LABEL: ptr_f16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q2, [r0]
; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
; CHECK-NEXT: vmov r0, r1, d4
; CHECK-NEXT: vstr.16 s0, [r0]
; CHECK-NEXT: vmovx.f16 s0, s0
; CHECK-NEXT: vstr.16 s0, [r1]
; CHECK-NEXT: vmov r0, r1, d5
; CHECK-NEXT: vmovx.f16 s0, s1
; CHECK-NEXT: vstr.16 s1, [r0]
; CHECK-NEXT: vstr.16 s0, [r1]
; CHECK-NEXT: vmov r0, r1, d2
; CHECK-NEXT: vmovx.f16 s0, s2
; CHECK-NEXT: vstr.16 s2, [r0]
; CHECK-NEXT: vstr.16 s0, [r1]
; CHECK-NEXT: vmov r0, r1, d3
; CHECK-NEXT: vmovx.f16 s0, s3
; CHECK-NEXT: vstr.16 s3, [r0]
; CHECK-NEXT: vstr.16 s0, [r1]
; CHECK-NEXT: bx lr
entry:
  %offs = load <8 x ptr>, ptr %offptr, align 4
  call void @llvm.masked.scatter.v8f16.v8p0(<8 x half> %v, <8 x ptr> %offs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

define arm_aapcs_vfpcc void @ptr_v4f16(<4 x half> %v, ptr %offptr) {
; CHECK-LABEL: ptr_v4f16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: vmov r0, r1, d2
; CHECK-NEXT: vstr.16 s0, [r0]
; CHECK-NEXT: vmovx.f16 s0, s0
; CHECK-NEXT: vstr.16 s0, [r1]
; CHECK-NEXT: vmov r0, r1, d3
; CHECK-NEXT: vmovx.f16 s0, s1
; CHECK-NEXT: vstr.16 s1, [r0]
; CHECK-NEXT: vstr.16 s0, [r1]
; CHECK-NEXT: bx lr
entry:
  %offs = load <4 x ptr>, ptr %offptr, align 4
  call void @llvm.masked.scatter.v4f16.v4p0(<4 x half> %v, <4 x ptr> %offs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  ret void
}

define arm_aapcs_vfpcc void @ptr_v4f16_dup(half %v, <4 x ptr> %offs) {
; CHECK-LABEL: ptr_v4f16_dup:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov r0, r1, d2
; CHECK-NEXT: vmov r2, r3, d3
; CHECK-NEXT: vstr.16 s0, [r0]
; CHECK-NEXT: vstr.16 s0, [r1]
; CHECK-NEXT: vstr.16 s0, [r2]
; CHECK-NEXT: vstr.16 s0, [r3]
; CHECK-NEXT: bx lr
entry:
  %splatinsert = insertelement <4 x half> poison, half %v, i32 0
  %splat = shufflevector <4 x half> %splatinsert, <4 x half> poison, <4 x i32> zeroinitializer
  call void @llvm.masked.scatter.v4f16.v4p0(<4 x half> %splat, <4 x ptr> %offs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; i8

; Expand.
define arm_aapcs_vfpcc void @ptr_i8(<16 x i8> %v, ptr %offptr) {
; CHECK-LABEL: ptr_i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr}
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
; CHECK-NEXT: vldrw.u32 q3, [r0, #32]
; CHECK-NEXT: vmov.u8 r6, q0[0]
; CHECK-NEXT: vmov r1, r2, d2
; CHECK-NEXT: vmov.u8 r5, q0[4]
; CHECK-NEXT: vmov r3, r12, d3
; CHECK-NEXT: vldrw.u32 q1, [r0, #48]
; CHECK-NEXT: vmov lr, r4, d4
; CHECK-NEXT: vmov.u8 r7, q0[6]
; CHECK-NEXT: vmov r0, r8, d5
; CHECK-NEXT: strb r6, [r1]
; CHECK-NEXT: vmov.u8 r1, q0[1]
; CHECK-NEXT: strb r1, [r2]
; CHECK-NEXT: vmov.u8 r6, q0[2]
; CHECK-NEXT: vmov r1, r9, d6
; CHECK-NEXT: strb r6, [r3]
; CHECK-NEXT: vmov.u8 r3, q0[3]
; CHECK-NEXT: vmov.u8 r2, q0[8]
; CHECK-NEXT: strb.w r3, [r12]
; CHECK-NEXT: vmov r3, r6, d7
; CHECK-NEXT: strb.w r5, [lr]
; CHECK-NEXT: vmov.u8 r5, q0[5]
; CHECK-NEXT: strb r5, [r4]
; CHECK-NEXT: vmov r5, r4, d2
; CHECK-NEXT: strb r7, [r0]
; CHECK-NEXT: vmov.u8 r0, q0[7]
; CHECK-NEXT: strb.w r0, [r8]
; CHECK-NEXT: vmov r0, r7, d3
; CHECK-NEXT: strb r2, [r1]
; CHECK-NEXT: vmov.u8 r1, q0[9]
; CHECK-NEXT: strb.w r1, [r9]
; CHECK-NEXT: vmov.u8 r1, q0[10]
; CHECK-NEXT: strb r1, [r3]
; CHECK-NEXT: vmov.u8 r1, q0[11]
; CHECK-NEXT: strb r1, [r6]
; CHECK-NEXT: vmov.u8 r1, q0[12]
; CHECK-NEXT: strb r1, [r5]
; CHECK-NEXT: vmov.u8 r1, q0[13]
; CHECK-NEXT: strb r1, [r4]
; CHECK-NEXT: vmov.u8 r1, q0[14]
; CHECK-NEXT: strb r1, [r0]
; CHECK-NEXT: vmov.u8 r0, q0[15]
; CHECK-NEXT: strb r0, [r7]
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc}
entry:
  %offs = load <16 x ptr>, ptr %offptr, align 4
  call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> %v, <16 x ptr> %offs, i32 2, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Expand
define arm_aapcs_vfpcc void @ptr_v8i8_trunc16(<8 x i16> %v, ptr %offptr) {
; CHECK-LABEL: ptr_v8i8_trunc16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, lr}
; CHECK-NEXT: push {r4, r5, r6, lr}
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: vmov.u16 r6, q0[0]
; CHECK-NEXT: vmov r1, r2, d2
; CHECK-NEXT: vmov r3, r12, d3
; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
; CHECK-NEXT: vmov r0, lr, d2
; CHECK-NEXT: vmov r4, r5, d3
; CHECK-NEXT: strb r6, [r1]
; CHECK-NEXT: vmov.u16 r1, q0[1]
; CHECK-NEXT: strb r1, [r2]
; CHECK-NEXT: vmov.u16 r1, q0[2]
; CHECK-NEXT: strb r1, [r3]
; CHECK-NEXT: vmov.u16 r1, q0[3]
; CHECK-NEXT: strb.w r1, [r12]
; CHECK-NEXT: vmov.u16 r1, q0[4]
; CHECK-NEXT: strb r1, [r0]
; CHECK-NEXT: vmov.u16 r0, q0[5]
; CHECK-NEXT: strb.w r0, [lr]
; CHECK-NEXT: vmov.u16 r0, q0[6]
; CHECK-NEXT: strb r0, [r4]
; CHECK-NEXT: vmov.u16 r0, q0[7]
; CHECK-NEXT: strb r0, [r5]
; CHECK-NEXT: pop {r4, r5, r6, pc}
entry:
  %offs = load <8 x ptr>, ptr %offptr, align 4
  %ext = trunc <8 x i16> %v to <8 x i8>
  call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> %ext, <8 x ptr> %offs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

define arm_aapcs_vfpcc void @ptr_v4i8_trunc32(<4 x i32> %v, ptr %offptr) {
; CHECK-LABEL: ptr_v4i8_trunc32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: movs r0, #0
; CHECK-NEXT: vstrb.32 q0, [r0, q1]
; CHECK-NEXT: bx lr
entry:
  %offs = load <4 x ptr>, ptr %offptr, align 4
  %ext = trunc <4 x i32> %v to <4 x i8>
  call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %ext, <4 x ptr> %offs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Expand
define arm_aapcs_vfpcc void @ptr_v8i8_trunc32(<8 x i32> %v, ptr %offptr) {
; CHECK-LABEL: ptr_v8i8_trunc32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, lr}
; CHECK-NEXT: push {r4, r5, r6, r7, lr}
; CHECK-NEXT: vldrw.u32 q2, [r0]
; CHECK-NEXT: vmov r3, r4, d0
; CHECK-NEXT: vmov r1, r2, d4
; CHECK-NEXT: vmov lr, r12, d5
; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
; CHECK-NEXT: vmov r0, r5, d1
; CHECK-NEXT: strb r3, [r1]
; CHECK-NEXT: vmov r1, r7, d4
; CHECK-NEXT: strb r4, [r2]
; CHECK-NEXT: vmov r2, r4, d5
; CHECK-NEXT: strb.w r0, [lr]
; CHECK-NEXT: vmov r0, r3, d2
; CHECK-NEXT: strb.w r5, [r12]
; CHECK-NEXT: vmov r5, r6, d3
; CHECK-NEXT: strb r0, [r1]
; CHECK-NEXT: strb r3, [r7]
; CHECK-NEXT: strb r5, [r2]
; CHECK-NEXT: strb r6, [r4]
; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
entry:
  %offs = load <8 x ptr>, ptr %offptr, align 4
  %ext = trunc <8 x i32> %v to <8 x i8>
  call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> %ext, <8 x ptr> %offs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; loops

define void @foo_ptr_p_int32_t(ptr %dest, ptr %src, i32 %n) {
; CHECK-LABEL: foo_ptr_p_int32_t:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: bic r3, r2, #15
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: bxlt lr
; CHECK-NEXT: .LBB19_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r1], #16
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vptt.i32 ne, q0, zr
; CHECK-NEXT: vldrwt.u32 q1, [r0], #16
; CHECK-NEXT: vstrwt.32 q1, [q0]
; CHECK-NEXT: bne .LBB19_1
; CHECK-NEXT: @ %bb.2: @ %for.end
; CHECK-NEXT: bx lr
entry:
  %and = and i32 %n, -16
  %cmp11 = icmp sgt i32 %and, 0
  br i1 %cmp11, label %vector.body, label %for.end

vector.body:                                      ; preds = %entry, %vector.body
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds ptr, ptr %src, i32 %index
  %wide.load = load <4 x ptr>, ptr %0, align 4
  %1 = icmp ne <4 x ptr> %wide.load, zeroinitializer
  %2 = getelementptr inbounds i32, ptr %dest, i32 %index
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.v4p0(ptr %2, i32 4, <4 x i1> %1, <4 x i32> undef)
  call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %wide.masked.load, <4 x ptr> %wide.load, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %3 = icmp eq i32 %index.next, %n
  br i1 %3, label %for.end, label %vector.body

for.end:                                          ; preds = %vector.body, %entry
  ret void
}

define void @foo_ptr_p_float(ptr %dest, ptr %src, i32 %n) {
; CHECK-LABEL: foo_ptr_p_float:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: bic r3, r2, #15
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: bxlt lr
; CHECK-NEXT: .LBB20_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r1], #16
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vptt.i32 ne, q0, zr
; CHECK-NEXT: vldrwt.u32 q1, [r0], #16
; CHECK-NEXT: vstrwt.32 q1, [q0]
; CHECK-NEXT: bne .LBB20_1
; CHECK-NEXT: @ %bb.2: @ %for.end
; CHECK-NEXT: bx lr
entry:
  %and = and i32 %n, -16
  %cmp11 = icmp sgt i32 %and, 0
  br i1 %cmp11, label %vector.body, label %for.end

vector.body:                                      ; preds = %entry, %vector.body
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds ptr, ptr %src, i32 %index
  %wide.load = load <4 x ptr>, ptr %0, align 4
  %1 = icmp ne <4 x ptr> %wide.load, zeroinitializer
  %2 = getelementptr inbounds float, ptr %dest, i32 %index
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.v4p0(ptr %2, i32 4, <4 x i1> %1, <4 x i32> undef)
  %3 = bitcast <4 x ptr> %wide.load to <4 x ptr>
  call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %wide.masked.load, <4 x ptr> %3, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %4 = icmp eq i32 %index.next, %n
  br i1 %4, label %for.end, label %vector.body

for.end:                                          ; preds = %vector.body, %entry
  ret void
}

; VSTRW.32 Qd, [P, 4]
define arm_aapcs_vfpcc void @qi4(<4 x i32> %v, <4 x ptr> %p) {
; CHECK-LABEL: qi4:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: movs r0, #16
; CHECK-NEXT: vadd.i32 q1, q1, r0
; CHECK-NEXT: vstrw.32 q0, [q1]
; CHECK-NEXT: bx lr
entry:
  %g = getelementptr inbounds i32, <4 x ptr> %p, i32 4
  call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %v, <4 x ptr> %g, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  ret void
}

declare void @llvm.masked.scatter.v2i16.v2p0(<2 x i16>, <2 x ptr>, i32, <2 x i1>)
declare void @llvm.masked.scatter.v2i32.v2p0(<2 x i32>, <2 x ptr>, i32, <2 x i1>)
declare void @llvm.masked.scatter.v2f32.v2p0(<2 x float>, <2 x ptr>, i32, <2 x i1>)
declare void @llvm.masked.scatter.v4i8.v4p0(<4 x i8>, <4 x ptr>, i32, <4 x i1>)
declare void @llvm.masked.scatter.v4i16.v4p0(<4 x i16>, <4 x ptr>, i32, <4 x i1>)
declare void @llvm.masked.scatter.v4f16.v4p0(<4 x half>, <4 x ptr>, i32, <4 x i1>)
declare void @llvm.masked.scatter.v4i32.v4p0(<4 x i32>, <4 x ptr>, i32, <4 x i1>)
declare void @llvm.masked.scatter.v4f32.v4p0(<4 x float>, <4 x ptr>, i32, <4 x i1>)
declare void @llvm.masked.scatter.v8i8.v8p0(<8 x i8>, <8 x ptr>, i32, <8 x i1>)
declare void @llvm.masked.scatter.v8i16.v8p0(<8 x i16>, <8 x ptr>, i32, <8 x i1>)
declare void @llvm.masked.scatter.v8f16.v8p0(<8 x half>, <8 x ptr>, i32, <8 x i1>)
declare void @llvm.masked.scatter.v8i32.v8p0(<8 x i32>, <8 x ptr>, i32, <8 x i1>)
declare void @llvm.masked.scatter.v8f32.v8p0(<8 x float>, <8 x ptr>, i32, <8 x i1>)
declare void @llvm.masked.scatter.v16i8.v16p0(<16 x i8>, <16 x ptr>, i32, <16 x i1>)
declare void @llvm.masked.scatter.v16i32.v16p0(<16 x i32>, <16 x ptr>, i32, <16 x i1>)
declare <4 x i32> @llvm.masked.load.v4i32.v4p0(ptr, i32, <4 x i1>, <4 x i32>)