; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s

; VLDRB.u16 Qd, [base, offs]
define arm_aapcs_vfpcc void @ext_unscaled_i8_i16(ptr %base, ptr %offptr, <8 x i16> %input) {
; CHECK-LABEL: ext_unscaled_i8_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q1, [r1]
; CHECK-NEXT:    vstrb.16 q0, [r0, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i16>, ptr %offptr, align 2
  %offs.zext = zext <8 x i16> %offs to <8 x i32>
  %ptrs = getelementptr inbounds i8, ptr %base, <8 x i32> %offs.zext
  %t = trunc <8 x i16> %input to <8 x i8>
  call void @llvm.masked.scatter.v8i8(<8 x i8> %t, <8 x ptr> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; VLDRB.u16 Qd, [base, offs]
define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i16_i8(ptr %base, ptr %offptr, <8 x i16> %input) {
; CHECK-LABEL: trunc_unsigned_unscaled_i16_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u16 q1, [r1]
; CHECK-NEXT:    vstrb.16 q0, [r0, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i8>, ptr %offptr, align 1
  %offs.zext = zext <8 x i8> %offs to <8 x i32>
  %byte_ptrs = getelementptr inbounds i8, ptr %base, <8 x i32> %offs.zext
  %input.trunc = trunc <8 x i16> %input to <8 x i8>
  call void @llvm.masked.scatter.v8i8(<8 x i8> %input.trunc, <8 x ptr> %byte_ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; VLDRH.16 Qd, [base, offs]
define arm_aapcs_vfpcc void @unscaled_i16_i16(ptr %base, ptr %offptr, <8 x i16> %input) {
; CHECK-LABEL: unscaled_i16_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q1, [r1]
; CHECK-NEXT:    vstrh.16 q0, [r0, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i16>, ptr %offptr, align 2
  %offs.zext = zext <8 x i16> %offs to <8 x i32>
  %byte_ptrs = getelementptr inbounds i8, ptr %base, <8 x i32> %offs.zext
  %ptrs = bitcast <8 x ptr> %byte_ptrs to <8 x ptr>
  call void @llvm.masked.scatter.v8i16(<8 x i16> %input, <8 x ptr> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; VLDRH.s16 Qd, [base, offs]
define arm_aapcs_vfpcc void @unscaled_v8f16_i16(ptr %base, ptr %offptr, <8 x half> %input) {
; CHECK-LABEL: unscaled_v8f16_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q1, [r1]
; CHECK-NEXT:    vstrh.16 q0, [r0, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i16>, ptr %offptr, align 2
  %offs.zext = zext <8 x i16> %offs to <8 x i32>
  %byte_ptrs = getelementptr inbounds i8, ptr %base, <8 x i32> %offs.zext
  %ptrs = bitcast <8 x ptr> %byte_ptrs to <8 x ptr>
  call void @llvm.masked.scatter.v8f16(<8 x half> %input, <8 x ptr> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Expand - sext offsets
define arm_aapcs_vfpcc void @unscaled_v8i16_sext(ptr %base, ptr %offptr, <8 x i16> %input) {
; CHECK-LABEL: unscaled_v8i16_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, lr}
; CHECK-NEXT:    push {r4, r5, r6, lr}
; CHECK-NEXT:    vldrh.s32 q1, [r1]
; CHECK-NEXT:    vmov.u16 r6, q0[0]
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vmov r2, r3, d2
; CHECK-NEXT:    vmov r12, lr, d3
; CHECK-NEXT:    vldrh.s32 q1, [r1, #8]
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vmov r0, r1, d2
; CHECK-NEXT:    vmov r4, r5, d3
; CHECK-NEXT:    strh r6, [r2]
; CHECK-NEXT:    vmov.u16 r2, q0[1]
; CHECK-NEXT:    strh r2, [r3]
; CHECK-NEXT:    vmov.u16 r2, q0[2]
; CHECK-NEXT:    strh.w r2, [r12]
; CHECK-NEXT:    vmov.u16 r2, q0[3]
; CHECK-NEXT:    strh.w r2, [lr]
; CHECK-NEXT:    vmov.u16 r2, q0[4]
; CHECK-NEXT:    strh r2, [r0]
; CHECK-NEXT:    vmov.u16 r0, q0[5]
; CHECK-NEXT:    strh r0, [r1]
; CHECK-NEXT:    vmov.u16 r0, q0[6]
; CHECK-NEXT:    strh r0, [r4]
; CHECK-NEXT:    vmov.u16 r0, q0[7]
; CHECK-NEXT:    strh r0, [r5]
; CHECK-NEXT:    pop {r4, r5, r6, pc}
entry:
  %offs = load <8 x i16>, ptr %offptr, align 2
  %offs.sext = sext <8 x i16> %offs to <8 x i32>
  %byte_ptrs = getelementptr inbounds i8, ptr %base, <8 x i32> %offs.sext
  %ptrs = bitcast <8 x ptr> %byte_ptrs to <8 x ptr>
  call void @llvm.masked.scatter.v8i16(<8 x i16> %input, <8 x ptr> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Expand - sext offsets
define arm_aapcs_vfpcc void @unscaled_v8f16_sext(ptr %base, ptr %offptr, <8 x half> %input) {
; CHECK-LABEL: unscaled_v8f16_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.s32 q2, [r1]
; CHECK-NEXT:    vldrh.s32 q1, [r1, #8]
; CHECK-NEXT:    vadd.i32 q2, q2, r0
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vmov r1, r2, d4
; CHECK-NEXT:    vstr.16 s0, [r1]
; CHECK-NEXT:    vmovx.f16 s0, s0
; CHECK-NEXT:    vstr.16 s0, [r2]
; CHECK-NEXT:    vmov r1, r2, d5
; CHECK-NEXT:    vmovx.f16 s0, s1
; CHECK-NEXT:    vstr.16 s1, [r1]
; CHECK-NEXT:    vstr.16 s0, [r2]
; CHECK-NEXT:    vmov r0, r1, d2
; CHECK-NEXT:    vmovx.f16 s0, s2
; CHECK-NEXT:    vstr.16 s2, [r0]
; CHECK-NEXT:    vstr.16 s0, [r1]
; CHECK-NEXT:    vmov r0, r1, d3
; CHECK-NEXT:    vmovx.f16 s0, s3
; CHECK-NEXT:    vstr.16 s3, [r0]
; CHECK-NEXT:    vstr.16 s0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i16>, ptr %offptr, align 2
  %offs.sext = sext <8 x i16> %offs to <8 x i32>
  %byte_ptrs = getelementptr inbounds i8, ptr %base, <8 x i32> %offs.sext
  %ptrs = bitcast <8 x ptr> %byte_ptrs to <8 x ptr>
  call void @llvm.masked.scatter.v8f16(<8 x half> %input, <8 x ptr> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Expand - i32 offsets
define arm_aapcs_vfpcc void @unscaled_v8i16_noext(ptr %base, ptr %offptr, <8 x i16> %input) {
; CHECK-LABEL: unscaled_v8i16_noext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, lr}
; CHECK-NEXT:    push {r4, r5, r6, lr}
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    vmov.u16 r6, q0[0]
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vmov r2, r3, d2
; CHECK-NEXT:    vmov r12, lr, d3
; CHECK-NEXT:    vldrw.u32 q1, [r1, #16]
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vmov r0, r1, d2
; CHECK-NEXT:    vmov r4, r5, d3
; CHECK-NEXT:    strh r6, [r2]
; CHECK-NEXT:    vmov.u16 r2, q0[1]
; CHECK-NEXT:    strh r2, [r3]
; CHECK-NEXT:    vmov.u16 r2, q0[2]
; CHECK-NEXT:    strh.w r2, [r12]
; CHECK-NEXT:    vmov.u16 r2, q0[3]
; CHECK-NEXT:    strh.w r2, [lr]
; CHECK-NEXT:    vmov.u16 r2, q0[4]
; CHECK-NEXT:    strh r2, [r0]
; CHECK-NEXT:    vmov.u16 r0, q0[5]
; CHECK-NEXT:    strh r0, [r1]
; CHECK-NEXT:    vmov.u16 r0, q0[6]
; CHECK-NEXT:    strh r0, [r4]
; CHECK-NEXT:    vmov.u16 r0, q0[7]
; CHECK-NEXT:    strh r0, [r5]
; CHECK-NEXT:    pop {r4, r5, r6, pc}
entry:
  %offs = load <8 x i32>, ptr %offptr, align 4
  %byte_ptrs = getelementptr inbounds i8, ptr %base, <8 x i32> %offs
  %ptrs = bitcast <8 x ptr> %byte_ptrs to <8 x ptr>
  call void @llvm.masked.scatter.v8i16(<8 x i16> %input, <8 x ptr> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Expand - i32 offsets
define arm_aapcs_vfpcc void @unscaled_v8f16_noext(ptr %base, ptr %offptr, <8 x half> %input) {
; CHECK-LABEL: unscaled_v8f16_noext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q2, [r1]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #16]
; CHECK-NEXT:    vadd.i32 q2, q2, r0
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vmov r1, r2, d4
; CHECK-NEXT:    vstr.16 s0, [r1]
; CHECK-NEXT:    vmovx.f16 s0, s0
; CHECK-NEXT:    vstr.16 s0, [r2]
; CHECK-NEXT:    vmov r1, r2, d5
; CHECK-NEXT:    vmovx.f16 s0, s1
; CHECK-NEXT:    vstr.16 s1, [r1]
; CHECK-NEXT:    vstr.16 s0, [r2]
; CHECK-NEXT:    vmov r0, r1, d2
; CHECK-NEXT:    vmovx.f16 s0, s2
; CHECK-NEXT:    vstr.16 s2, [r0]
; CHECK-NEXT:    vstr.16 s0, [r1]
; CHECK-NEXT:    vmov r0, r1, d3
; CHECK-NEXT:    vmovx.f16 s0, s3
; CHECK-NEXT:    vstr.16 s3, [r0]
; CHECK-NEXT:    vstr.16 s0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i32>, ptr %offptr, align 4
  %byte_ptrs = getelementptr inbounds i8, ptr %base, <8 x i32> %offs
  %ptrs = bitcast <8 x ptr> %byte_ptrs to <8 x ptr>
  call void @llvm.masked.scatter.v8f16(<8 x half> %input, <8 x ptr> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; VLDRH.16 Qd, [base, zext(offs)]
define arm_aapcs_vfpcc void @unsigned_unscaled_i16_i8(ptr %base, ptr %offptr, <8 x i16> %input) {
; CHECK-LABEL: unsigned_unscaled_i16_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u16 q1, [r1]
; CHECK-NEXT:    vstrh.16 q0, [r0, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i8>, ptr %offptr, align 1
  %offs.zext = zext <8 x i8> %offs to <8 x i32>
  %byte_ptrs = getelementptr inbounds i8, ptr %base, <8 x i32> %offs.zext
  %ptrs = bitcast <8 x ptr> %byte_ptrs to <8 x ptr>
  call void @llvm.masked.scatter.v8i16(<8 x i16> %input, <8 x ptr> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; VLDRH.16 Qd, [base, zext(offs)]
define arm_aapcs_vfpcc void @unsigned_unscaled_f16_i8(ptr %base, ptr %offptr, <8 x half> %input) {
; CHECK-LABEL: unsigned_unscaled_f16_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u16 q1, [r1]
; CHECK-NEXT:    vstrh.16 q0, [r0, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i8>, ptr %offptr, align 1
  %offs.zext = zext <8 x i8> %offs to <8 x i32>
  %byte_ptrs = getelementptr inbounds i8, ptr %base, <8 x i32> %offs.zext
  %ptrs = bitcast <8 x ptr> %byte_ptrs to <8 x ptr>
  call void @llvm.masked.scatter.v8f16(<8 x half> %input, <8 x ptr> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Expand - sext offsets
define arm_aapcs_vfpcc void @trunc_signed_unscaled_i64_i8(ptr %base, ptr %offptr, <8 x i64> %input) {
; CHECK-LABEL: trunc_signed_unscaled_i64_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r7, lr}
; CHECK-NEXT:    push {r4, r5, r7, lr}
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    vldrb.s32 q4, [r1]
; CHECK-NEXT:    vmov r4, s0
; CHECK-NEXT:    vadd.i32 q4, q4, r0
; CHECK-NEXT:    vmov r2, r3, d8
; CHECK-NEXT:    vmov r12, lr, d9
; CHECK-NEXT:    vldrb.s32 q4, [r1, #4]
; CHECK-NEXT:    vadd.i32 q4, q4, r0
; CHECK-NEXT:    vmov r0, r1, d8
; CHECK-NEXT:    strh r4, [r2]
; CHECK-NEXT:    vmov r2, s2
; CHECK-NEXT:    vmov r4, r5, d9
; CHECK-NEXT:    strh r2, [r3]
; CHECK-NEXT:    vmov r2, s4
; CHECK-NEXT:    strh.w r2, [r12]
; CHECK-NEXT:    vmov r2, s6
; CHECK-NEXT:    strh.w r2, [lr]
; CHECK-NEXT:    vmov r2, s8
; CHECK-NEXT:    strh r2, [r0]
; CHECK-NEXT:    vmov r0, s10
; CHECK-NEXT:    strh r0, [r1]
; CHECK-NEXT:    vmov r0, s12
; CHECK-NEXT:    strh r0, [r4]
; CHECK-NEXT:    vmov r0, s14
; CHECK-NEXT:    strh r0, [r5]
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    pop {r4, r5, r7, pc}
entry:
  %offs = load <8 x i8>, ptr %offptr, align 1
  %offs.sext = sext <8 x i8> %offs to <8 x i32>
  %byte_ptrs = getelementptr inbounds i8, ptr %base, <8 x i32> %offs.sext
  %ptrs = bitcast <8 x ptr> %byte_ptrs to <8 x ptr>
  %input.trunc = trunc <8 x i64> %input to <8 x i16>
  call void @llvm.masked.scatter.v8i16(<8 x i16> %input.trunc, <8 x ptr> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i64_i8(ptr %base, ptr %offptr, <8 x i64> %input) {
; CHECK-LABEL: trunc_unsigned_unscaled_i64_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    vmov.16 q4[0], r3
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    vmov.16 q4[1], r3
; CHECK-NEXT:    vmov r3, s4
; CHECK-NEXT:    vmov.16 q4[2], r3
; CHECK-NEXT:    vmov r3, s6
; CHECK-NEXT:    vmov.16 q4[3], r3
; CHECK-NEXT:    vmov r3, s8
; CHECK-NEXT:    vmov.16 q4[4], r3
; CHECK-NEXT:    vmov r3, s10
; CHECK-NEXT:    vmov.16 q4[5], r3
; CHECK-NEXT:    vmov r3, s12
; CHECK-NEXT:    vmov r2, s14
; CHECK-NEXT:    vmov.16 q4[6], r3
; CHECK-NEXT:    vldrb.u16 q0, [r1]
; CHECK-NEXT:    vmov.16 q4[7], r2
; CHECK-NEXT:    vstrh.16 q4, [r0, q0]
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i8>, ptr %offptr, align 1
  %offs.zext = zext <8 x i8> %offs to <8 x i32>
  %byte_ptrs = getelementptr inbounds i8, ptr %base, <8 x i32> %offs.zext
  %ptrs = bitcast <8 x ptr> %byte_ptrs to <8 x ptr>
  %input.trunc = trunc <8 x i64> %input to <8 x i16>
  call void @llvm.masked.scatter.v8i16(<8 x i16> %input.trunc, <8 x ptr> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Expand - sext offsets
define arm_aapcs_vfpcc void @trunc_signed_unscaled_i32_i8(ptr %base, ptr %offptr, <8 x i32> %input) {
; CHECK-LABEL: trunc_signed_unscaled_i32_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
; CHECK-NEXT:    vldrb.s32 q2, [r1]
; CHECK-NEXT:    vmov r4, r5, d0
; CHECK-NEXT:    vadd.i32 q2, q2, r0
; CHECK-NEXT:    vmov r2, r3, d4
; CHECK-NEXT:    vmov r12, lr, d5
; CHECK-NEXT:    vldrb.s32 q2, [r1, #4]
; CHECK-NEXT:    vadd.i32 q2, q2, r0
; CHECK-NEXT:    vmov r0, r6, d1
; CHECK-NEXT:    strh r4, [r2]
; CHECK-NEXT:    vmov r2, r7, d4
; CHECK-NEXT:    strh r5, [r3]
; CHECK-NEXT:    vmov r3, r5, d5
; CHECK-NEXT:    strh.w r0, [r12]
; CHECK-NEXT:    vmov r0, r1, d2
; CHECK-NEXT:    strh.w r6, [lr]
; CHECK-NEXT:    vmov r6, r4, d3
; CHECK-NEXT:    strh r0, [r2]
; CHECK-NEXT:    strh r1, [r7]
; CHECK-NEXT:    strh r6, [r3]
; CHECK-NEXT:    strh r4, [r5]
; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
entry:
  %offs = load <8 x i8>, ptr %offptr, align 1
  %offs.sext = sext <8 x i8> %offs to <8 x i32>
  %byte_ptrs = getelementptr inbounds i8, ptr %base, <8 x i32> %offs.sext
  %ptrs = bitcast <8 x ptr> %byte_ptrs to <8 x ptr>
  %input.trunc = trunc <8 x i32> %input to <8 x i16>
  call void @llvm.masked.scatter.v8i16(<8 x i16> %input.trunc, <8 x ptr> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i32_i8(ptr %base, ptr %offptr, <8 x i32> %input) {
; CHECK-LABEL: trunc_unsigned_unscaled_i32_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .pad #16
; CHECK-NEXT:    sub sp, #16
; CHECK-NEXT:    mov r2, sp
; CHECK-NEXT:    vstrh.32 q1, [r2, #8]
; CHECK-NEXT:    vstrh.32 q0, [r2]
; CHECK-NEXT:    vldrb.u16 q0, [r1]
; CHECK-NEXT:    vldrw.u32 q1, [r2]
; CHECK-NEXT:    vstrh.16 q1, [r0, q0]
; CHECK-NEXT:    add sp, #16
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i8>, ptr %offptr, align 1
  %offs.zext = zext <8 x i8> %offs to <8 x i32>
  %byte_ptrs = getelementptr inbounds i8, ptr %base, <8 x i32> %offs.zext
  %ptrs = bitcast <8 x ptr> %byte_ptrs to <8 x ptr>
  %input.trunc = trunc <8 x i32> %input to <8 x i16>
  call void @llvm.masked.scatter.v8i16(<8 x i16> %input.trunc, <8 x ptr> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Expand - sext offsets
define arm_aapcs_vfpcc void @trunc_signed_unscaled_i16_i8(ptr %base, ptr %offptr, <8 x i16> %input) {
; CHECK-LABEL: trunc_signed_unscaled_i16_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, lr}
; CHECK-NEXT:    push {r4, r5, r6, lr}
; CHECK-NEXT:    vldrb.s32 q1, [r1]
; CHECK-NEXT:    vmov.u16 r6, q0[0]
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vmov r2, r3, d2
; CHECK-NEXT:    vmov r12, lr, d3
; CHECK-NEXT:    vldrb.s32 q1, [r1, #4]
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vmov r0, r1, d2
; CHECK-NEXT:    vmov r4, r5, d3
; CHECK-NEXT:    strb r6, [r2]
; CHECK-NEXT:    vmov.u16 r2, q0[1]
; CHECK-NEXT:    strb r2, [r3]
; CHECK-NEXT:    vmov.u16 r2, q0[2]
; CHECK-NEXT:    strb.w r2, [r12]
; CHECK-NEXT:    vmov.u16 r2, q0[3]
; CHECK-NEXT:    strb.w r2, [lr]
; CHECK-NEXT:    vmov.u16 r2, q0[4]
; CHECK-NEXT:    strb r2, [r0]
; CHECK-NEXT:    vmov.u16 r0, q0[5]
; CHECK-NEXT:    strb r0, [r1]
; CHECK-NEXT:    vmov.u16 r0, q0[6]
; CHECK-NEXT:    strb r0, [r4]
; CHECK-NEXT:    vmov.u16 r0, q0[7]
; CHECK-NEXT:    strb r0, [r5]
; CHECK-NEXT:    pop {r4, r5, r6, pc}
entry:
  %offs = load <8 x i8>, ptr %offptr, align 1
  %offs.sext = sext <8 x i8> %offs to <8 x i32>
  %byte_ptrs = getelementptr inbounds i8, ptr %base, <8 x i32> %offs.sext
  %input.trunc = trunc <8 x i16> %input to <8 x i8>
  call void @llvm.masked.scatter.v8i8(<8 x i8> %input.trunc, <8 x ptr> %byte_ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

declare void @llvm.masked.scatter.v8i8(<8 x i8>, <8 x ptr>, i32, <8 x i1>)
declare void @llvm.masked.scatter.v8i16(<8 x i16>, <8 x ptr>, i32, <8 x i1>)
declare void @llvm.masked.scatter.v8f16(<8 x half>, <8 x ptr>, i32, <8 x i1>)