; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -enable-arm-maskedldst %s -o - | FileCheck %s

; Codegen tests for @llvm.masked.gather.* from vectors of pointers on MVE,
; covering i32/f32/i16/f16/i8 element types at various vector widths,
; sign/zero extension of the gathered values, gathers inside tail-predicated
; loops, and GEP-with-constant-offset address formation.

; i32

define arm_aapcs_vfpcc <2 x i32> @ptr_v2i32(ptr %offptr) {
; CHECK-LABEL: ptr_v2i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldrd r0, r1, [r0]
; CHECK-NEXT:    ldr r1, [r1]
; CHECK-NEXT:    ldr r0, [r0]
; CHECK-NEXT:    vmov q0[2], q0[0], r0, r1
; CHECK-NEXT:    bx lr
entry:
  %offs = load <2 x ptr>, ptr %offptr, align 4
  %gather = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> %offs, i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
  ret <2 x i32> %gather
}

define arm_aapcs_vfpcc <4 x i32> @ptr_v4i32(ptr %offptr) {
; CHECK-LABEL: ptr_v4i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vldrw.u32 q0, [q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x ptr>, ptr %offptr, align 4
  %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %offs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  ret <4 x i32> %gather
}

define arm_aapcs_vfpcc <8 x i32> @ptr_v8i32(ptr %offptr) {
; CHECK-LABEL: ptr_v8i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vmov r1, r2, d1
; CHECK-NEXT:    vmov r3, r12, d0
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmov r0, lr, d1
; CHECK-NEXT:    ldr r7, [r2]
; CHECK-NEXT:    vmov r2, r4, d0
; CHECK-NEXT:    ldr r6, [r1]
; CHECK-NEXT:    ldr r3, [r3]
; CHECK-NEXT:    ldr r0, [r0]
; CHECK-NEXT:    ldr.w r1, [r12]
; CHECK-NEXT:    vmov q1[2], q1[0], r3, r6
; CHECK-NEXT:    ldr.w r5, [lr]
; CHECK-NEXT:    vmov q1[3], q1[1], r1, r7
; CHECK-NEXT:    ldr r2, [r2]
; CHECK-NEXT:    ldr r4, [r4]
; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
; CHECK-NEXT:    vmov q0[3], q0[1], r4, r5
; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
entry:
  %offs = load <8 x ptr>, ptr %offptr, align 4
  %gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %offs, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
  ret <8 x i32> %gather
}

define arm_aapcs_vfpcc <16 x i32> @ptr_v16i32(ptr %offptr) {
; CHECK-LABEL: ptr_v16i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
; CHECK-NEXT:    vmov r1, r2, d1
; CHECK-NEXT:    vmov r3, lr, d0
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmov r4, r5, d1
; CHECK-NEXT:    ldr r7, [r2]
; CHECK-NEXT:    vmov r2, r6, d0
; CHECK-NEXT:    ldr.w r12, [r1]
; CHECK-NEXT:    ldr r3, [r3]
; CHECK-NEXT:    ldr r4, [r4]
; CHECK-NEXT:    ldr r5, [r5]
; CHECK-NEXT:    vmov q3[2], q3[0], r3, r12
; CHECK-NEXT:    ldr.w r1, [lr]
; CHECK-NEXT:    vmov q3[3], q3[1], r1, r7
; CHECK-NEXT:    ldr r2, [r2]
; CHECK-NEXT:    ldr r6, [r6]
; CHECK-NEXT:    vmov q0[2], q0[0], r2, r4
; CHECK-NEXT:    vmov r2, r4, d3
; CHECK-NEXT:    vmov q0[3], q0[1], r6, r5
; CHECK-NEXT:    vmov r6, r5, d2
; CHECK-NEXT:    ldr r2, [r2]
; CHECK-NEXT:    ldr r6, [r6]
; CHECK-NEXT:    ldr r5, [r5]
; CHECK-NEXT:    vmov q1[2], q1[0], r6, r2
; CHECK-NEXT:    ldr r6, [r4]
; CHECK-NEXT:    vmov r0, r2, d5
; CHECK-NEXT:    vmov q1[3], q1[1], r5, r6
; CHECK-NEXT:    vmov r6, r5, d4
; CHECK-NEXT:    ldr r0, [r0]
; CHECK-NEXT:    ldr r6, [r6]
; CHECK-NEXT:    ldr r2, [r2]
; CHECK-NEXT:    ldr r5, [r5]
; CHECK-NEXT:    vmov q2[2], q2[0], r6, r0
; CHECK-NEXT:    vmov q2[3], q2[1], r5, r2
; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
entry:
  %offs = load <16 x ptr>, ptr %offptr, align 4
  %gather = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> %offs, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
  ret <16 x i32> %gather
}

; f32

define arm_aapcs_vfpcc <2 x float> @ptr_v2f32(ptr %offptr) {
; CHECK-LABEL: ptr_v2f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldrd r0, r1, [r0]
; CHECK-NEXT:    vldr s1, [r1]
; CHECK-NEXT:    vldr s0, [r0]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <2 x ptr>, ptr %offptr, align 4
  %gather = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> %offs, i32 4, <2 x i1> <i1 true, i1 true>, <2 x float> undef)
  ret <2 x float> %gather
}

define arm_aapcs_vfpcc <4 x float> @ptr_v4f32(ptr %offptr) {
; CHECK-LABEL: ptr_v4f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vldrw.u32 q0, [q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x ptr>, ptr %offptr, align 4
  %gather = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %offs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
  ret <4 x float> %gather
}

define arm_aapcs_vfpcc <8 x float> @ptr_v8f32(ptr %offptr) {
; CHECK-LABEL: ptr_v8f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r7, lr}
; CHECK-NEXT:    push {r4, r5, r7, lr}
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmov r12, r2, d1
; CHECK-NEXT:    vmov lr, r1, d0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vmov r0, r3, d1
; CHECK-NEXT:    vmov r4, r5, d0
; CHECK-NEXT:    vldr s3, [r2]
; CHECK-NEXT:    vldr s2, [r12]
; CHECK-NEXT:    vldr s1, [r1]
; CHECK-NEXT:    vldr s0, [lr]
; CHECK-NEXT:    vldr s7, [r3]
; CHECK-NEXT:    vldr s6, [r0]
; CHECK-NEXT:    vldr s5, [r5]
; CHECK-NEXT:    vldr s4, [r4]
; CHECK-NEXT:    pop {r4, r5, r7, pc}
entry:
  %offs = load <8 x ptr>, ptr %offptr, align 4
  %gather = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> %offs, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef)
  ret <8 x float> %gather
}

; i16

define arm_aapcs_vfpcc <8 x i16> @ptr_i16(ptr %offptr) {
; CHECK-LABEL: ptr_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, lr}
; CHECK-NEXT:    push {r4, r5, r6, lr}
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vmov r1, r2, d0
; CHECK-NEXT:    vmov r3, r12, d1
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmov r4, r5, d0
; CHECK-NEXT:    vmov r0, lr, d1
; CHECK-NEXT:    ldrh r1, [r1]
; CHECK-NEXT:    ldrh r6, [r3]
; CHECK-NEXT:    ldrh r2, [r2]
; CHECK-NEXT:    ldrh r4, [r4]
; CHECK-NEXT:    ldrh r5, [r5]
; CHECK-NEXT:    vmov.16 q0[0], r4
; CHECK-NEXT:    ldrh r0, [r0]
; CHECK-NEXT:    vmov.16 q0[1], r5
; CHECK-NEXT:    ldrh.w r3, [lr]
; CHECK-NEXT:    vmov.16 q0[2], r0
; CHECK-NEXT:    ldrh.w r12, [r12]
; CHECK-NEXT:    vmov.16 q0[3], r3
; CHECK-NEXT:    vmov.16 q0[4], r1
; CHECK-NEXT:    vmov.16 q0[5], r2
; CHECK-NEXT:    vmov.16 q0[6], r6
; CHECK-NEXT:    vmov.16 q0[7], r12
; CHECK-NEXT:    pop {r4, r5, r6, pc}
entry:
  %offs = load <8 x ptr>, ptr %offptr, align 4
  %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %offs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
  ret <8 x i16> %gather
}

define arm_aapcs_vfpcc <2 x i32> @ptr_v2i16_sext(ptr %offptr) {
; CHECK-LABEL: ptr_v2i16_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldrd r0, r1, [r0]
; CHECK-NEXT:    ldrsh.w r1, [r1]
; CHECK-NEXT:    ldrsh.w r0, [r0]
; CHECK-NEXT:    vmov q0[2], q0[0], r0, r1
; CHECK-NEXT:    asrs r1, r1, #31
; CHECK-NEXT:    asrs r0, r0, #31
; CHECK-NEXT:    vmov q0[3], q0[1], r0, r1
; CHECK-NEXT:    bx lr
entry:
  %offs = load <2 x ptr>, ptr %offptr, align 4
  %gather = call <2 x i16> @llvm.masked.gather.v2i16.v2p0(<2 x ptr> %offs, i32 2, <2 x i1> <i1 true, i1 true>, <2 x i16> undef)
  %ext = sext <2 x i16> %gather to <2 x i32>
  ret <2 x i32> %ext
}

define arm_aapcs_vfpcc <2 x i32> @ptr_v2i16_zext(ptr %offptr) {
; CHECK-LABEL: ptr_v2i16_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldrd r0, r1, [r0]
; CHECK-NEXT:    vmov.i64 q0, #0xffff
; CHECK-NEXT:    ldrh r1, [r1]
; CHECK-NEXT:    ldrh r0, [r0]
; CHECK-NEXT:    vmov q1[2], q1[0], r0, r1
; CHECK-NEXT:    vand q0, q1, q0
; CHECK-NEXT:    bx lr
entry:
  %offs = load <2 x ptr>, ptr %offptr, align 4
  %gather = call <2 x i16> @llvm.masked.gather.v2i16.v2p0(<2 x ptr> %offs, i32 2, <2 x i1> <i1 true, i1 true>, <2 x i16> undef)
  %ext = zext <2 x i16> %gather to <2 x i32>
  ret <2 x i32> %ext
}

define arm_aapcs_vfpcc <4 x i32> @ptr_v4i16_sext(ptr %offptr) {
; CHECK-LABEL: ptr_v4i16_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    movs r1, #0
; CHECK-NEXT:    vldrh.s32 q0, [r1, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x ptr>, ptr %offptr, align 4
  %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> %offs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i16> undef)
  %ext = sext <4 x i16> %gather to <4 x i32>
  ret <4 x i32> %ext
}

define arm_aapcs_vfpcc <4 x i32> @ptr_v4i16_zext(ptr %offptr) {
; CHECK-LABEL: ptr_v4i16_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    movs r1, #0
; CHECK-NEXT:    vldrh.u32 q0, [r1, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x ptr>, ptr %offptr, align 4
  %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> %offs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i16> undef)
  %ext = zext <4 x i16> %gather to <4 x i32>
  ret <4 x i32> %ext
}

define arm_aapcs_vfpcc <4 x i16> @ptr_v4i16(ptr %offptr) {
; CHECK-LABEL: ptr_v4i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    movs r1, #0
; CHECK-NEXT:    vldrh.u32 q0, [r1, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x ptr>, ptr %offptr, align 4
  %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> %offs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i16> undef)
  ret <4 x i16> %gather
}

define arm_aapcs_vfpcc <8 x i32> @ptr_v8i16_sext(ptr %offptr) {
; CHECK-LABEL: ptr_v8i16_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
; CHECK-NEXT:    .pad #16
; CHECK-NEXT:    sub sp, #16
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vmov r3, r1, d1
; CHECK-NEXT:    vmov r12, r2, d0
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmov lr, r0, d1
; CHECK-NEXT:    ldrh r7, [r1]
; CHECK-NEXT:    ldrh.w r1, [r12]
; CHECK-NEXT:    ldrh r2, [r2]
; CHECK-NEXT:    ldrh r4, [r0]
; CHECK-NEXT:    vmov r0, r5, d0
; CHECK-NEXT:    ldrh.w r6, [lr]
; CHECK-NEXT:    ldrh r3, [r3]
; CHECK-NEXT:    ldrh r0, [r0]
; CHECK-NEXT:    ldrh r5, [r5]
; CHECK-NEXT:    vmov.16 q0[0], r0
; CHECK-NEXT:    mov r0, sp
; CHECK-NEXT:    vmov.16 q0[1], r5
; CHECK-NEXT:    vmov.16 q0[2], r6
; CHECK-NEXT:    vmov.16 q0[3], r4
; CHECK-NEXT:    vmov.16 q0[4], r1
; CHECK-NEXT:    vmov.16 q0[5], r2
; CHECK-NEXT:    vmov.16 q0[6], r3
; CHECK-NEXT:    vmov.16 q0[7], r7
; CHECK-NEXT:    vstrw.32 q0, [r0]
; CHECK-NEXT:    vldrh.s32 q0, [r0]
; CHECK-NEXT:    vldrh.s32 q1, [r0, #8]
; CHECK-NEXT:    add sp, #16
; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
entry:
  %offs = load <8 x ptr>, ptr %offptr, align 4
  %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %offs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
  %ext = sext <8 x i16> %gather to <8 x i32>
  ret <8 x i32> %ext
}

define arm_aapcs_vfpcc <8 x i32> @ptr_v8i16_zext(ptr %offptr) {
; CHECK-LABEL: ptr_v8i16_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
; CHECK-NEXT:    .pad #16
; CHECK-NEXT:    sub sp, #16
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vmov r3, r1, d1
; CHECK-NEXT:    vmov r12, r2, d0
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmov lr, r0, d1
; CHECK-NEXT:    ldrh r7, [r1]
; CHECK-NEXT:    ldrh.w r1, [r12]
; CHECK-NEXT:    ldrh r2, [r2]
; CHECK-NEXT:    ldrh r4, [r0]
; CHECK-NEXT:    vmov r0, r5, d0
; CHECK-NEXT:    ldrh.w r6, [lr]
; CHECK-NEXT:    ldrh r3, [r3]
; CHECK-NEXT:    ldrh r0, [r0]
; CHECK-NEXT:    ldrh r5, [r5]
; CHECK-NEXT:    vmov.16 q0[0], r0
; CHECK-NEXT:    mov r0, sp
; CHECK-NEXT:    vmov.16 q0[1], r5
; CHECK-NEXT:    vmov.16 q0[2], r6
; CHECK-NEXT:    vmov.16 q0[3], r4
; CHECK-NEXT:    vmov.16 q0[4], r1
; CHECK-NEXT:    vmov.16 q0[5], r2
; CHECK-NEXT:    vmov.16 q0[6], r3
; CHECK-NEXT:    vmov.16 q0[7], r7
; CHECK-NEXT:    vstrw.32 q0, [r0]
; CHECK-NEXT:    vldrh.u32 q0, [r0]
; CHECK-NEXT:    vldrh.u32 q1, [r0, #8]
; CHECK-NEXT:    add sp, #16
; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
entry:
  %offs = load <8 x ptr>, ptr %offptr, align 4
  %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %offs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
  %ext = zext <8 x i16> %gather to <8 x i32>
  ret <8 x i32> %ext
}

; f16

define arm_aapcs_vfpcc <8 x half> @ptr_f16(ptr %offptr) {
; CHECK-LABEL: ptr_f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmov r1, r2, d0
; CHECK-NEXT:    vldr.16 s4, [r2]
; CHECK-NEXT:    vldr.16 s0, [r1]
; CHECK-NEXT:    vmov r1, r2, d1
; CHECK-NEXT:    vins.f16 s0, s4
; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
; CHECK-NEXT:    vldr.16 s1, [r1]
; CHECK-NEXT:    vldr.16 s2, [r2]
; CHECK-NEXT:    vmov r0, r1, d2
; CHECK-NEXT:    vins.f16 s1, s2
; CHECK-NEXT:    vldr.16 s4, [r1]
; CHECK-NEXT:    vldr.16 s2, [r0]
; CHECK-NEXT:    vmov r0, r1, d3
; CHECK-NEXT:    vldr.16 s3, [r0]
; CHECK-NEXT:    vins.f16 s2, s4
; CHECK-NEXT:    vldr.16 s4, [r1]
; CHECK-NEXT:    vins.f16 s3, s4
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x ptr>, ptr %offptr, align 4
  %gather = call <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr> %offs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x half> undef)
  ret <8 x half> %gather
}

define arm_aapcs_vfpcc <4 x half> @ptr_v4f16(ptr %offptr) {
; CHECK-LABEL: ptr_v4f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmov r0, r1, d0
; CHECK-NEXT:    vldr.16 s4, [r1]
; CHECK-NEXT:    vldr.16 s0, [r0]
; CHECK-NEXT:    vmov r0, r1, d1
; CHECK-NEXT:    vldr.16 s2, [r1]
; CHECK-NEXT:    vldr.16 s1, [r0]
; CHECK-NEXT:    vins.f16 s0, s4
; CHECK-NEXT:    vins.f16 s1, s2
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x ptr>, ptr %offptr, align 4
  %gather = call <4 x half> @llvm.masked.gather.v4f16.v4p0(<4 x ptr> %offs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x half> undef)
  ret <4 x half> %gather
}

; i8

define arm_aapcs_vfpcc <16 x i8> @ptr_i8(ptr %offptr) {
; CHECK-LABEL: ptr_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
; CHECK-NEXT:    vldrw.u32 q2, [r0]
; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
; CHECK-NEXT:    vmov r1, r2, d0
; CHECK-NEXT:    vmov r6, r7, d4
; CHECK-NEXT:    vmov r4, r3, d1
; CHECK-NEXT:    ldrb r5, [r1]
; CHECK-NEXT:    ldrb r1, [r2]
; CHECK-NEXT:    ldrb r2, [r6]
; CHECK-NEXT:    ldrb.w r12, [r3]
; CHECK-NEXT:    vmov.8 q0[0], r2
; CHECK-NEXT:    vmov r2, r3, d3
; CHECK-NEXT:    ldrb.w lr, [r4]
; CHECK-NEXT:    ldrb r4, [r2]
; CHECK-NEXT:    ldrb r2, [r3]
; CHECK-NEXT:    ldrb r3, [r7]
; CHECK-NEXT:    vmov.8 q0[1], r3
; CHECK-NEXT:    vmov r3, r6, d5
; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
; CHECK-NEXT:    ldrb r3, [r3]
; CHECK-NEXT:    ldrb r6, [r6]
; CHECK-NEXT:    vmov.8 q0[2], r3
; CHECK-NEXT:    vmov r0, r3, d4
; CHECK-NEXT:    vmov.8 q0[3], r6
; CHECK-NEXT:    ldrb r0, [r0]
; CHECK-NEXT:    ldrb r3, [r3]
; CHECK-NEXT:    vmov.8 q0[4], r0
; CHECK-NEXT:    vmov.8 q0[5], r3
; CHECK-NEXT:    vmov r0, r3, d5
; CHECK-NEXT:    ldrb r0, [r0]
; CHECK-NEXT:    ldrb r3, [r3]
; CHECK-NEXT:    vmov.8 q0[6], r0
; CHECK-NEXT:    vmov.8 q0[7], r3
; CHECK-NEXT:    vmov r0, r3, d2
; CHECK-NEXT:    ldrb r0, [r0]
; CHECK-NEXT:    ldrb r3, [r3]
; CHECK-NEXT:    vmov.8 q0[8], r0
; CHECK-NEXT:    vmov.8 q0[9], r3
; CHECK-NEXT:    vmov.8 q0[10], r4
; CHECK-NEXT:    vmov.8 q0[11], r2
; CHECK-NEXT:    vmov.8 q0[12], r5
; CHECK-NEXT:    vmov.8 q0[13], r1
; CHECK-NEXT:    vmov.8 q0[14], lr
; CHECK-NEXT:    vmov.8 q0[15], r12
; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
entry:
  %offs = load <16 x ptr>, ptr %offptr, align 4
  %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> %offs, i32 2, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
  ret <16 x i8> %gather
}

define arm_aapcs_vfpcc <8 x i16> @ptr_v8i8_sext16(ptr %offptr) {
; CHECK-LABEL: ptr_v8i8_sext16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vmov r3, r1, d1
; CHECK-NEXT:    vmov r12, r2, d0
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmov r4, r5, d0
; CHECK-NEXT:    vmov lr, r0, d1
; CHECK-NEXT:    ldrb r7, [r1]
; CHECK-NEXT:    ldrb.w r1, [r12]
; CHECK-NEXT:    ldrb r2, [r2]
; CHECK-NEXT:    ldrb r4, [r4]
; CHECK-NEXT:    ldrb r5, [r5]
; CHECK-NEXT:    vmov.16 q0[0], r4
; CHECK-NEXT:    ldrb.w r6, [lr]
; CHECK-NEXT:    vmov.16 q0[1], r5
; CHECK-NEXT:    ldrb r0, [r0]
; CHECK-NEXT:    vmov.16 q0[2], r6
; CHECK-NEXT:    ldrb r3, [r3]
; CHECK-NEXT:    vmov.16 q0[3], r0
; CHECK-NEXT:    vmov.16 q0[4], r1
; CHECK-NEXT:    vmov.16 q0[5], r2
; CHECK-NEXT:    vmov.16 q0[6], r3
; CHECK-NEXT:    vmov.16 q0[7], r7
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
entry:
  %offs = load <8 x ptr>, ptr %offptr, align 4
  %gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> %offs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> undef)
  %ext = sext <8 x i8> %gather to <8 x i16>
  ret <8 x i16> %ext
}

define arm_aapcs_vfpcc <8 x i16> @ptr_v8i8_zext16(ptr %offptr) {
; CHECK-LABEL: ptr_v8i8_zext16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vmov r3, r1, d1
; CHECK-NEXT:    vmov r12, r2, d0
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmov r4, r5, d0
; CHECK-NEXT:    vmov lr, r0, d1
; CHECK-NEXT:    ldrb r7, [r1]
; CHECK-NEXT:    ldrb.w r1, [r12]
; CHECK-NEXT:    ldrb r2, [r2]
; CHECK-NEXT:    ldrb r4, [r4]
; CHECK-NEXT:    ldrb r5, [r5]
; CHECK-NEXT:    vmov.16 q0[0], r4
; CHECK-NEXT:    ldrb.w r6, [lr]
; CHECK-NEXT:    vmov.16 q0[1], r5
; CHECK-NEXT:    ldrb r0, [r0]
; CHECK-NEXT:    vmov.16 q0[2], r6
; CHECK-NEXT:    ldrb r3, [r3]
; CHECK-NEXT:    vmov.16 q0[3], r0
; CHECK-NEXT:    vmov.16 q0[4], r1
; CHECK-NEXT:    vmov.16 q0[5], r2
; CHECK-NEXT:    vmov.16 q0[6], r3
; CHECK-NEXT:    vmov.16 q0[7], r7
; CHECK-NEXT:    vmovlb.u8 q0, q0
; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
entry:
  %offs = load <8 x ptr>, ptr %offptr, align 4
  %gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> %offs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> undef)
  %ext = zext <8 x i8> %gather to <8 x i16>
  ret <8 x i16> %ext
}

define arm_aapcs_vfpcc <8 x i8> @ptr_v8i8(ptr %offptr) {
; CHECK-LABEL: ptr_v8i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, lr}
; CHECK-NEXT:    push {r4, r5, r6, lr}
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vmov r1, r2, d0
; CHECK-NEXT:    vmov r3, r12, d1
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmov r4, r5, d0
; CHECK-NEXT:    vmov r0, lr, d1
; CHECK-NEXT:    ldrb r1, [r1]
; CHECK-NEXT:    ldrb r6, [r3]
; CHECK-NEXT:    ldrb r2, [r2]
; CHECK-NEXT:    ldrb r4, [r4]
; CHECK-NEXT:    ldrb r5, [r5]
; CHECK-NEXT:    vmov.16 q0[0], r4
; CHECK-NEXT:    ldrb r0, [r0]
; CHECK-NEXT:    vmov.16 q0[1], r5
; CHECK-NEXT:    ldrb.w r3, [lr]
; CHECK-NEXT:    vmov.16 q0[2], r0
; CHECK-NEXT:    ldrb.w r12, [r12]
; CHECK-NEXT:    vmov.16 q0[3], r3
; CHECK-NEXT:    vmov.16 q0[4], r1
; CHECK-NEXT:    vmov.16 q0[5], r2
; CHECK-NEXT:    vmov.16 q0[6], r6
; CHECK-NEXT:    vmov.16 q0[7], r12
; CHECK-NEXT:    pop {r4, r5, r6, pc}
entry:
  %offs = load <8 x ptr>, ptr %offptr, align 4
  %gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> %offs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> undef)
  ret <8 x i8> %gather
}

define arm_aapcs_vfpcc <4 x i32> @ptr_v4i8_sext32(ptr %offptr) {
; CHECK-LABEL: ptr_v4i8_sext32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    movs r1, #0
; CHECK-NEXT:    vldrb.s32 q0, [r1, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x ptr>, ptr %offptr, align 4
  %gather = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %offs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i8> undef)
  %ext = sext <4 x i8> %gather to <4 x i32>
  ret <4 x i32> %ext
}

define arm_aapcs_vfpcc <4 x i32> @ptr_v4i8_zext32(ptr %offptr) {
; CHECK-LABEL: ptr_v4i8_zext32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    movs r1, #0
; CHECK-NEXT:    vldrb.u32 q0, [r1, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x ptr>, ptr %offptr, align 4
  %gather = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %offs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i8> undef)
  %ext = zext <4 x i8> %gather to <4 x i32>
  ret <4 x i32> %ext
}

define arm_aapcs_vfpcc <4 x i8> @ptr_v4i8(ptr %offptr) {
; CHECK-LABEL: ptr_v4i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    movs r1, #0
; CHECK-NEXT:    vldrb.u32 q0, [r1, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x ptr>, ptr %offptr, align 4
  %gather = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %offs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i8> undef)
  ret <4 x i8> %gather
}

define arm_aapcs_vfpcc <8 x i32> @ptr_v8i8_sext32(ptr %offptr) {
; CHECK-LABEL: ptr_v8i8_sext32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vmov r1, r2, d1
; CHECK-NEXT:    vmov r3, r12, d0
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmov r0, lr, d1
; CHECK-NEXT:    ldrb r7, [r2]
; CHECK-NEXT:    vmov r2, r4, d0
; CHECK-NEXT:    ldrb r6, [r1]
; CHECK-NEXT:    ldrb r3, [r3]
; CHECK-NEXT:    ldrb r0, [r0]
; CHECK-NEXT:    ldrb.w r1, [r12]
; CHECK-NEXT:    vmov q1[2], q1[0], r3, r6
; CHECK-NEXT:    ldrb.w r5, [lr]
; CHECK-NEXT:    vmov q1[3], q1[1], r1, r7
; CHECK-NEXT:    vmovlb.s8 q1, q1
; CHECK-NEXT:    vmovlb.s16 q1, q1
; CHECK-NEXT:    ldrb r2, [r2]
; CHECK-NEXT:    ldrb r4, [r4]
; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
; CHECK-NEXT:    vmov q0[3], q0[1], r4, r5
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
entry:
  %offs = load <8 x ptr>, ptr %offptr, align 4
  %gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> %offs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> undef)
  %ext = sext <8 x i8> %gather to <8 x i32>
  ret <8 x i32> %ext
}

define arm_aapcs_vfpcc <8 x i32> @ptr_v8i8_zext32(ptr %offptr) {
; CHECK-LABEL: ptr_v8i8_zext32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vmov.i32 q1, #0xff
; CHECK-NEXT:    vmov r1, r2, d1
; CHECK-NEXT:    vmov r12, r3, d0
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmov r4, r5, d0
; CHECK-NEXT:    vmov r0, lr, d1
; CHECK-NEXT:    ldrb r7, [r2]
; CHECK-NEXT:    ldrb r1, [r1]
; CHECK-NEXT:    ldrb.w r2, [r12]
; CHECK-NEXT:    ldrb r4, [r4]
; CHECK-NEXT:    ldrb r0, [r0]
; CHECK-NEXT:    vmov q2[2], q2[0], r2, r1
; CHECK-NEXT:    ldrb r3, [r3]
; CHECK-NEXT:    ldrb.w r6, [lr]
; CHECK-NEXT:    vmov q0[2], q0[0], r4, r0
; CHECK-NEXT:    ldrb r5, [r5]
; CHECK-NEXT:    vmov q2[3], q2[1], r3, r7
; CHECK-NEXT:    vmov q0[3], q0[1], r5, r6
; CHECK-NEXT:    vand q0, q0, q1
; CHECK-NEXT:    vand q1, q2, q1
; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
entry:
  %offs = load <8 x ptr>, ptr %offptr, align 4
  %gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> %offs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> undef)
  %ext = zext <8 x i8> %gather to <8 x i32>
  ret <8 x i32> %ext
}

; loops

define void @foo_ptr_p_int32_t(ptr %dest, ptr %src, i32 %n) {
; CHECK-LABEL: foo_ptr_p_int32_t:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    bic r2, r2, #15
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB26_1: @ %vector.body.preheader
; CHECK-NEXT:    subs r2, #4
; CHECK-NEXT:    movs r3, #1
; CHECK-NEXT:    add.w lr, r3, r2, lsr #2
; CHECK-NEXT:  .LBB26_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
; CHECK-NEXT:    vptt.i32 ne, q0, zr
; CHECK-NEXT:    vldrwt.u32 q1, [q0]
; CHECK-NEXT:    vstrwt.32 q1, [r0], #16
; CHECK-NEXT:    le lr, .LBB26_2
; CHECK-NEXT:  @ %bb.3: @ %for.end
; CHECK-NEXT:    pop {r7, pc}
entry:
  %and = and i32 %n, -16
  %cmp11 = icmp sgt i32 %and, 0
  br i1 %cmp11, label %vector.body, label %for.end

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ %index.next, %vector.body ], [ 0, %entry ]
  %i = getelementptr inbounds ptr, ptr %src, i32 %index
  %wide.load = load <4 x ptr>, ptr %i, align 4
  %i2 = icmp ne <4 x ptr> %wide.load, zeroinitializer
  %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %wide.load, i32 4, <4 x i1> %i2, <4 x i32> undef)
  %i3 = getelementptr inbounds i32, ptr %dest, i32 %index
  call void @llvm.masked.store.v4i32.p0(<4 x i32> %wide.masked.gather, ptr %i3, i32 4, <4 x i1> %i2)
  %index.next = add i32 %index, 4
  %i5 = icmp eq i32 %index.next, %and
  br i1 %i5, label %for.end, label %vector.body

for.end:                                          ; preds = %vector.body, %entry
  ret void
}

define void @foo_ptr_p_float(ptr %dest, ptr %src, i32 %n) {
; CHECK-LABEL: foo_ptr_p_float:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    bic r2, r2, #15
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB27_1: @ %vector.body.preheader
; CHECK-NEXT:    subs r2, #4
; CHECK-NEXT:    movs r3, #1
; CHECK-NEXT:    add.w lr, r3, r2, lsr #2
; CHECK-NEXT:  .LBB27_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
; CHECK-NEXT:    vptt.i32 ne, q0, zr
; CHECK-NEXT:    vldrwt.u32 q1, [q0]
; CHECK-NEXT:    vstrwt.32 q1, [r0], #16
; CHECK-NEXT:    le lr, .LBB27_2
; CHECK-NEXT:  @ %bb.3: @ %for.end
; CHECK-NEXT:    pop {r7, pc}
entry:
  %and = and i32 %n, -16
  %cmp11 = icmp sgt i32 %and, 0
  br i1 %cmp11, label %vector.body, label %for.end

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ %index.next, %vector.body ], [ 0, %entry ]
  %i = getelementptr inbounds ptr, ptr %src, i32 %index
  %wide.load = load <4 x ptr>, ptr %i, align 4
  %i2 = icmp ne <4 x ptr> %wide.load, zeroinitializer
  %i3 = bitcast <4 x ptr> %wide.load to <4 x ptr>
  %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %i3, i32 4, <4 x i1> %i2, <4 x i32> undef)
  %i4 = getelementptr inbounds float, ptr %dest, i32 %index
  call void @llvm.masked.store.v4i32.p0(<4 x i32> %wide.masked.gather, ptr %i4, i32 4, <4 x i1> %i2)
  %index.next = add i32 %index, 4
  %i6 = icmp eq i32 %index.next, %and
  br i1 %i6, label %for.end, label %vector.body

for.end:                                          ; preds = %vector.body, %entry
  ret void
}

define arm_aapcs_vfpcc <4 x i32> @qi4(<4 x ptr> %p) {
; CHECK-LABEL: qi4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    movs r0, #16
; CHECK-NEXT:    vadd.i32 q1, q0, r0
; CHECK-NEXT:    vldrw.u32 q0, [q1]
; CHECK-NEXT:    bx lr
entry:
  %g = getelementptr inbounds i32, <4 x ptr> %p, i32 4
  %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %g, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  ret <4 x i32> %gather
}

define arm_aapcs_vfpcc <8 x i32> @sext_unsigned_unscaled_i8_i8_toi64(ptr %base, ptr %offptr) {
; CHECK-LABEL: sext_unsigned_unscaled_i8_i8_toi64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u16 q0, [r1]
; CHECK-NEXT:    vldrb.u16 q1, [r0, q0]
; CHECK-NEXT:    vmov.u16 r0, q1[2]
; CHECK-NEXT:    vmov.u16 r1, q1[0]
; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
; CHECK-NEXT:    vmov.u16 r0, q1[3]
; CHECK-NEXT:    vmov.u16 r1, q1[1]
; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
; CHECK-NEXT:    vmov.u16 r0, q1[6]
; CHECK-NEXT:    vmov.u16 r1, q1[4]
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
; CHECK-NEXT:    vmov.u16 r0, q1[7]
; CHECK-NEXT:    vmov.u16 r1, q1[5]
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
; CHECK-NEXT:    vmovlb.s8 q1, q2
; CHECK-NEXT:    vmovlb.s16 q1, q1
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i8>, ptr %offptr, align 1
  %offs.zext = zext <8 x i8> %offs to <8 x i32>
  %ptrs = getelementptr inbounds i8, ptr %base, <8 x i32> %offs.zext
  %gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> %ptrs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> undef)
  %gather.sext = sext <8 x i8> %gather to <8 x i32>
  ret <8 x i32> %gather.sext
}

define arm_aapcs_vfpcc <4 x i32> @gepconstoff_i32(ptr %base) {
; CHECK-LABEL: gepconstoff_i32:
; CHECK:       @ %bb.0: @ %bb
; CHECK-NEXT:    adr r1, .LCPI30_0
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    vldrw.u32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT:    bx lr
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  @ %bb.1:
; CHECK-NEXT:  .LCPI30_0:
; CHECK-NEXT:    .long 0 @ 0x0
; CHECK-NEXT:    .long 4 @ 0x4
; CHECK-NEXT:    .long 8 @ 0x8
; CHECK-NEXT:    .long 12 @ 0xc
bb:
  %a = getelementptr i32, ptr %base, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  %g = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %a, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> poison)
  ret <4 x i32> %g
}

define arm_aapcs_vfpcc <4 x i32> @gepconstoff_i8(ptr %base) {
; CHECK-LABEL: gepconstoff_i8:
; CHECK:       @ %bb.0: @ %bb
; CHECK-NEXT:    adr r1, .LCPI31_0
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    vldrw.u32 q0, [r0, q1]
; CHECK-NEXT:    bx lr
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  @ %bb.1:
; CHECK-NEXT:  .LCPI31_0:
; CHECK-NEXT:    .long 4294967292 @ 0xfffffffc
; CHECK-NEXT:    .long 12 @ 0xc
; CHECK-NEXT:    .long 28 @ 0x1c
; CHECK-NEXT:    .long 44 @ 0x2c
bb:
  %a = getelementptr i8, ptr %base, <4 x i32> <i32 0, i32 16, i32 32, i32 48>
  %b = bitcast <4 x ptr> %a to <4 x ptr>
  %c = getelementptr inbounds i32, <4 x ptr> %b, i32 -1
  %g = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %c, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> poison)
  ret <4 x i32> %g
}

define arm_aapcs_vfpcc <4 x i32> @gepconstoff3_i16(ptr %base) {
; CHECK-LABEL: gepconstoff3_i16:
; CHECK:       @ %bb.0: @ %bb
; CHECK-NEXT:    adr r1, .LCPI32_0
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    vldrw.u32 q0, [r0, q1]
; CHECK-NEXT:    bx lr
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  @ %bb.1:
; CHECK-NEXT:  .LCPI32_0:
; CHECK-NEXT:    .long 12 @ 0xc
; CHECK-NEXT:    .long 18 @ 0x12
; CHECK-NEXT:    .long 58 @ 0x3a
; CHECK-NEXT:    .long 280 @ 0x118
bb:
  %a = getelementptr i16, ptr %base, <4 x i32> <i32 0, i32 16, i32 32, i32 48>
  %b = bitcast <4 x ptr> %a to <4 x ptr>
  %c = getelementptr i8, <4 x ptr> %b, <4 x i32> <i32 16, i32 -10, i32 -2, i32 188>
  %d = bitcast <4 x ptr> %c to <4 x ptr>
  %e = getelementptr inbounds i32, <4 x ptr> %d, i32 -1
  %g = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %e, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> poison)
  ret <4 x i32> %g
}

declare <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x i32>)
declare <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i32>)
declare <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i32>)
declare <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x i32>)
declare <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x float>)
declare <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x float>)
declare <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x float>)
declare <2 x i16> @llvm.masked.gather.v2i16.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x i16>)
declare <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i16>)
declare <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i16>)
declare <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x i16>)
declare <4 x half> @llvm.masked.gather.v4f16.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x half>)
declare <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x half>)
declare <16 x half> @llvm.masked.gather.v16f16.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x half>)
declare <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i8>)
declare <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i8>)
declare <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x i8>)
declare <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr>, i32, <32 x i1>, <32 x i8>)
declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32, <4 x i1>)