; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s

define arm_aapcs_vfpcc <4 x i32> @sext_trunc_i32(<4 x i32> %a) {
; CHECK-LABEL: sext_trunc_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    bx lr
entry:
  %sa = sext <4 x i32> %a to <4 x i64>
  %t = trunc <4 x i64> %sa to <4 x i32>
  ret <4 x i32> %t
}

define arm_aapcs_vfpcc <8 x i16> @sext_trunc_i16(<8 x i16> %a) {
; CHECK-LABEL: sext_trunc_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    bx lr
entry:
  %sa = sext <8 x i16> %a to <8 x i32>
  %t = trunc <8 x i32> %sa to <8 x i16>
  ret <8 x i16> %t
}

define arm_aapcs_vfpcc <16 x i8> @sext_trunc_i8(<16 x i8> %a) {
; CHECK-LABEL: sext_trunc_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    bx lr
entry:
  %sa = sext <16 x i8> %a to <16 x i16>
  %t = trunc <16 x i16> %sa to <16 x i8>
  ret <16 x i8> %t
}

define arm_aapcs_vfpcc <4 x i32> @zext_trunc_i32(<4 x i32> %a) {
; CHECK-LABEL: zext_trunc_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    bx lr
entry:
  %sa = zext <4 x i32> %a to <4 x i64>
  %t = trunc <4 x i64> %sa to <4 x i32>
  ret <4 x i32> %t
}

define arm_aapcs_vfpcc <8 x i16> @zext_trunc_i16(<8 x i16> %a) {
; CHECK-LABEL: zext_trunc_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    bx lr
entry:
  %sa = zext <8 x i16> %a to <8 x i32>
  %t = trunc <8 x i32> %sa to <8 x i16>
  ret <8 x i16> %t
}

define arm_aapcs_vfpcc <16 x i8> @zext_trunc_i8(<16 x i8> %a) {
; CHECK-LABEL: zext_trunc_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    bx lr
entry:
  %sa = zext <16 x i8> %a to <16 x i16>
  %t = trunc <16 x i16> %sa to <16 x i8>
  ret <16 x i8> %t
}

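; With an add between the extend and the truncate, the expected codegen differs by
; element size: the v4i64 case is lowered to lane extracts and scalar adds in GPRs,
; while the narrower cases add the bottom and top halves in-register (vrev + vadd)
; and merge the two halves back together with vmovnt.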
define arm_aapcs_vfpcc <4 x i32> @ext_add_trunc_i32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: ext_add_trunc_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.f32 s8, s6
; CHECK-NEXT:    vmov.f32 s6, s7
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    vmov.f32 s8, s2
; CHECK-NEXT:    vmov.f32 s2, s3
; CHECK-NEXT:    vmov r1, s8
; CHECK-NEXT:    vmov r2, s2
; CHECK-NEXT:    vmov.f32 s2, s5
; CHECK-NEXT:    add.w r12, r1, r0
; CHECK-NEXT:    vmov r1, s6
; CHECK-NEXT:    vmov r0, s0
; CHECK-NEXT:    add r1, r2
; CHECK-NEXT:    vmov r2, s2
; CHECK-NEXT:    vmov.f32 s2, s1
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    add r2, r3
; CHECK-NEXT:    vmov r3, s4
; CHECK-NEXT:    add r0, r3
; CHECK-NEXT:    vmov q0[2], q0[0], r0, r12
; CHECK-NEXT:    vmov q0[3], q0[1], r2, r1
; CHECK-NEXT:    bx lr
entry:
  %sa = sext <4 x i32> %a to <4 x i64>
  %sb = zext <4 x i32> %b to <4 x i64>
  %add = add <4 x i64> %sa, %sb
  %t = trunc <4 x i64> %add to <4 x i32>
  ret <4 x i32> %t
}

define arm_aapcs_vfpcc <8 x i16> @ext_add_trunc_v8i16(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: ext_add_trunc_v8i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vrev32.16 q3, q0
; CHECK-NEXT:    vrev32.16 q2, q1
; CHECK-NEXT:    vadd.i32 q2, q3, q2
; CHECK-NEXT:    vadd.i32 q0, q0, q1
; CHECK-NEXT:    vmovnt.i32 q0, q2
; CHECK-NEXT:    bx lr
entry:
  %sa = sext <8 x i16> %a to <8 x i32>
  %sb = zext <8 x i16> %b to <8 x i32>
  %add = add <8 x i32> %sa, %sb
  %t = trunc <8 x i32> %add to <8 x i16>
  ret <8 x i16> %t
}

define arm_aapcs_vfpcc <16 x i8> @ext_add_trunc_v16i8(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: ext_add_trunc_v16i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vrev16.8 q3, q0
; CHECK-NEXT:    vrev16.8 q2, q1
; CHECK-NEXT:    vadd.i16 q2, q3, q2
; CHECK-NEXT:    vadd.i16 q0, q0, q1
; CHECK-NEXT:    vmovnt.i16 q0, q2
; CHECK-NEXT:    bx lr
entry:
  %sa = sext <16 x i8> %a to <16 x i16>
  %sb = zext <16 x i8> %b to <16 x i16>
  %add = add <16 x i16> %sa, %sb
  %t = trunc <16 x i16> %add to <16 x i8>
  ret <16 x i8> %t
}

define arm_aapcs_vfpcc <16 x i16> @ext_add_trunc_v16i16(<16 x i16> %a, <16 x i16> %b) {
; CHECK-LABEL: ext_add_trunc_v16i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
; CHECK-NEXT:    vpush {d8, d9, d10, d11}
; CHECK-NEXT:    vrev32.16 q5, q0
; CHECK-NEXT:    vrev32.16 q4, q2
; CHECK-NEXT:    vadd.i32 q0, q0, q2
; CHECK-NEXT:    vadd.i32 q4, q5, q4
; CHECK-NEXT:    vmovnt.i32 q0, q4
; CHECK-NEXT:    vrev32.16 q4, q1
; CHECK-NEXT:    vrev32.16 q2, q3
; CHECK-NEXT:    vadd.i32 q1, q1, q3
; CHECK-NEXT:    vadd.i32 q2, q4, q2
; CHECK-NEXT:    vmovnt.i32 q1, q2
; CHECK-NEXT:    vpop {d8, d9, d10, d11}
; CHECK-NEXT:    bx lr
entry:
  %sa = sext <16 x i16> %a to <16 x i32>
  %sb = zext <16 x i16> %b to <16 x i32>
  %add = add <16 x i32> %sa, %sb
  %t = trunc <16 x i32> %add to <16 x i16>
  ret <16 x i16> %t
}

define arm_aapcs_vfpcc <32 x i8> @ext_add_trunc_v32i8(<32 x i8> %a, <32 x i8> %b) {
; CHECK-LABEL: ext_add_trunc_v32i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
; CHECK-NEXT:    vpush {d8, d9, d10, d11}
; CHECK-NEXT:    vrev16.8 q5, q0
; CHECK-NEXT:    vrev16.8 q4, q2
; CHECK-NEXT:    vadd.i16 q0, q0, q2
; CHECK-NEXT:    vadd.i16 q4, q5, q4
; CHECK-NEXT:    vmovnt.i16 q0, q4
; CHECK-NEXT:    vrev16.8 q4, q1
; CHECK-NEXT:    vrev16.8 q2, q3
; CHECK-NEXT:    vadd.i16 q1, q1, q3
; CHECK-NEXT:    vadd.i16 q2, q4, q2
; CHECK-NEXT:    vmovnt.i16 q1, q2
; CHECK-NEXT:    vpop {d8, d9, d10, d11}
; CHECK-NEXT:    bx lr
entry:
  %sa = sext <32 x i8> %a to <32 x i16>
  %sb = zext <32 x i8> %b to <32 x i16>
  %add = add <32 x i16> %sa, %sb
  %t = trunc <32 x i16> %add to <32 x i8>
  ret <32 x i8> %t
}

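; The same pattern with an ashr by 1 after the add: the i16 and i8 cases can use
; vhadd on the bottom/top halves, the i64 case is still expanded to scalar
; add/asrl sequences, and the i8-to-i32 variant goes via the stack with widening
; loads and narrowing stores.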
define arm_aapcs_vfpcc <4 x i32> @ext_add_ashr_trunc_i32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: ext_add_ashr_trunc_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
; CHECK-NEXT:    vmov.f32 s12, s6
; CHECK-NEXT:    vmov.i64 q2, #0xffffffff
; CHECK-NEXT:    vmov.f32 s6, s5
; CHECK-NEXT:    vmov.f32 s14, s7
; CHECK-NEXT:    vand q1, q1, q2
; CHECK-NEXT:    vmov r3, r7, d2
; CHECK-NEXT:    vand q3, q3, q2
; CHECK-NEXT:    vmov.f32 s4, s2
; CHECK-NEXT:    vmov r0, r1, d6
; CHECK-NEXT:    vmov.f32 s2, s3
; CHECK-NEXT:    vmov lr, r12, d7
; CHECK-NEXT:    vmov r2, s4
; CHECK-NEXT:    asrs r5, r2, #31
; CHECK-NEXT:    adds r2, r2, r0
; CHECK-NEXT:    vmov r0, s2
; CHECK-NEXT:    adcs r1, r5
; CHECK-NEXT:    vmov r5, s0
; CHECK-NEXT:    asrl r2, r1, #1
; CHECK-NEXT:    asrs r1, r0, #31
; CHECK-NEXT:    adds.w r0, r0, lr
; CHECK-NEXT:    adc.w r1, r1, r12
; CHECK-NEXT:    asrs r4, r5, #31
; CHECK-NEXT:    adds r6, r5, r3
; CHECK-NEXT:    vmov r3, r5, d3
; CHECK-NEXT:    vmov.f32 s6, s1
; CHECK-NEXT:    asrl r0, r1, #1
; CHECK-NEXT:    adcs r7, r4
; CHECK-NEXT:    asrl r6, r7, #1
; CHECK-NEXT:    vmov q0[2], q0[0], r6, r2
; CHECK-NEXT:    vmov r1, s6
; CHECK-NEXT:    adds r6, r1, r3
; CHECK-NEXT:    asr.w r2, r1, #31
; CHECK-NEXT:    adc.w r1, r2, r5
; CHECK-NEXT:    asrl r6, r1, #1
; CHECK-NEXT:    vmov q0[3], q0[1], r6, r0
; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
entry:
  %sa = sext <4 x i32> %a to <4 x i64>
  %sb = zext <4 x i32> %b to <4 x i64>
  %add = add <4 x i64> %sa, %sb
  %sh = ashr <4 x i64> %add, <i64 1, i64 1, i64 1, i64 1>
  %t = trunc <4 x i64> %sh to <4 x i32>
  ret <4 x i32> %t
}

define arm_aapcs_vfpcc <8 x i16> @ext_add_ashr_trunc_i16(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: ext_add_ashr_trunc_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlt.u16 q2, q1
; CHECK-NEXT:    vmovlt.s16 q3, q0
; CHECK-NEXT:    vmovlb.u16 q1, q1
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    vhadd.s32 q2, q3, q2
; CHECK-NEXT:    vhadd.s32 q0, q0, q1
; CHECK-NEXT:    vmovnt.i32 q0, q2
; CHECK-NEXT:    bx lr
entry:
  %sa = sext <8 x i16> %a to <8 x i32>
  %sb = zext <8 x i16> %b to <8 x i32>
  %add = add <8 x i32> %sa, %sb
  %sh = ashr <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %t = trunc <8 x i32> %sh to <8 x i16>
  ret <8 x i16> %t
}

define arm_aapcs_vfpcc <16 x i8> @ext_add_ashr_trunc_i8(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: ext_add_ashr_trunc_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlt.u8 q2, q1
; CHECK-NEXT:    vmovlt.s8 q3, q0
; CHECK-NEXT:    vmovlb.u8 q1, q1
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vhadd.s16 q2, q3, q2
; CHECK-NEXT:    vhadd.s16 q0, q0, q1
; CHECK-NEXT:    vmovnt.i16 q0, q2
; CHECK-NEXT:    bx lr
entry:
  %sa = sext <16 x i8> %a to <16 x i16>
  %sb = zext <16 x i8> %b to <16 x i16>
  %add = add <16 x i16> %sa, %sb
  %sh = ashr <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %t = trunc <16 x i16> %sh to <16 x i8>
  ret <16 x i8> %t
}

define arm_aapcs_vfpcc <16 x i8> @ext_add_ashr_trunc_i8i32(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: ext_add_ashr_trunc_i8i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .pad #48
; CHECK-NEXT:    sub sp, #48
; CHECK-NEXT:    add r0, sp, #16
; CHECK-NEXT:    mov r1, sp
; CHECK-NEXT:    vstrw.32 q1, [r0]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    vldrb.u16 q0, [r0, #8]
; CHECK-NEXT:    vldrb.s16 q1, [r1, #8]
; CHECK-NEXT:    add r2, sp, #32
; CHECK-NEXT:    vhadd.s16 q0, q1, q0
; CHECK-NEXT:    vstrb.16 q0, [r2, #8]
; CHECK-NEXT:    vldrb.u16 q0, [r0]
; CHECK-NEXT:    vldrb.s16 q1, [r1]
; CHECK-NEXT:    vhadd.s16 q0, q1, q0
; CHECK-NEXT:    vstrb.16 q0, [r2]
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    add sp, #48
; CHECK-NEXT:    bx lr
entry:
  %sa = sext <16 x i8> %a to <16 x i32>
  %sb = zext <16 x i8> %b to <16 x i32>
  %add = add <16 x i32> %sa, %sb
  %sh = ashr <16 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %t = trunc <16 x i32> %sh to <16 x i8>
  ret <16 x i8> %t
}

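; A longer chain of shifts, a multiply and a select applied to the extended
; values. The i16 and i8 versions stay in vector registers, handling the top and
; bottom lanes separately (vmovlt/vmovlb); the i64 version is scalarised.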
define arm_aapcs_vfpcc <4 x i32> @ext_ops_trunc_i32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: ext_ops_trunc_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, lr}
; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, lr}
; CHECK-NEXT:    vmov.f32 s8, s2
; CHECK-NEXT:    vmov.f32 s2, s3
; CHECK-NEXT:    vmov.f32 s10, s7
; CHECK-NEXT:    vmov r10, s8
; CHECK-NEXT:    vmov.f32 s8, s6
; CHECK-NEXT:    vmov.f32 s6, s5
; CHECK-NEXT:    vmov r2, s8
; CHECK-NEXT:    asr.w r0, r10, #31
; CHECK-NEXT:    adds.w r6, r10, r2
; CHECK-NEXT:    eor.w r7, r10, r2
; CHECK-NEXT:    adc r3, r0, #0
; CHECK-NEXT:    asrl r6, r3, r2
; CHECK-NEXT:    subs r0, r6, r2
; CHECK-NEXT:    vmov r6, s2
; CHECK-NEXT:    sbc lr, r3, #0
; CHECK-NEXT:    vmov r3, s10
; CHECK-NEXT:    vmov.f32 s2, s1
; CHECK-NEXT:    umull r0, r8, r0, r2
; CHECK-NEXT:    asrs r5, r6, #31
; CHECK-NEXT:    adds r4, r6, r3
; CHECK-NEXT:    adc r5, r5, #0
; CHECK-NEXT:    eor.w r1, r6, r3
; CHECK-NEXT:    asrl r4, r5, r3
; CHECK-NEXT:    subs r4, r4, r3
; CHECK-NEXT:    sbc r5, r5, #0
; CHECK-NEXT:    orrs.w r7, r7, r10, asr #31
; CHECK-NEXT:    umull r4, r12, r4, r3
; CHECK-NEXT:    csetm r9, eq
; CHECK-NEXT:    orrs.w r1, r1, r6, asr #31
; CHECK-NEXT:    mov.w r7, #0
; CHECK-NEXT:    csetm r1, eq
; CHECK-NEXT:    bfi r7, r9, #0, #8
; CHECK-NEXT:    mla r5, r5, r3, r12
; CHECK-NEXT:    bfi r7, r1, #8, #8
; CHECK-NEXT:    rsbs r1, r6, #0
; CHECK-NEXT:    vmsr p0, r7
; CHECK-NEXT:    mla r7, lr, r2, r8
; CHECK-NEXT:    lsll r4, r5, r1
; CHECK-NEXT:    rsb.w r1, r10, #0
; CHECK-NEXT:    lsll r4, r5, r3
; CHECK-NEXT:    lsll r0, r7, r1
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    vmov r1, s6
; CHECK-NEXT:    lsll r0, r7, r2
; CHECK-NEXT:    vmov q3[2], q3[0], r0, r4
; CHECK-NEXT:    mov.w r12, #0
; CHECK-NEXT:    vpsel q2, q3, q2
; CHECK-NEXT:    adds r2, r3, r1
; CHECK-NEXT:    asr.w r0, r3, #31
; CHECK-NEXT:    adc r5, r0, #0
; CHECK-NEXT:    asrl r2, r5, r1
; CHECK-NEXT:    subs r0, r2, r1
; CHECK-NEXT:    vmov r2, s0
; CHECK-NEXT:    sbc r8, r5, #0
; CHECK-NEXT:    umull r4, lr, r0, r1
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    asrs r5, r2, #31
; CHECK-NEXT:    adds r6, r2, r0
; CHECK-NEXT:    adc r7, r5, #0
; CHECK-NEXT:    mla r5, r8, r1, lr
; CHECK-NEXT:    asrl r6, r7, r0
; CHECK-NEXT:    subs.w r8, r6, r0
; CHECK-NEXT:    eor.w r6, r2, r0
; CHECK-NEXT:    sbc lr, r7, #0
; CHECK-NEXT:    eor.w r7, r3, r1
; CHECK-NEXT:    orrs.w r6, r6, r2, asr #31
; CHECK-NEXT:    orr.w r7, r7, r3, asr #31
; CHECK-NEXT:    csetm r6, eq
; CHECK-NEXT:    cmp r7, #0
; CHECK-NEXT:    bfi r12, r6, #0, #8
; CHECK-NEXT:    csetm r6, eq
; CHECK-NEXT:    bfi r12, r6, #8, #8
; CHECK-NEXT:    umull r6, r7, r8, r0
; CHECK-NEXT:    rsb.w r8, r3, #0
; CHECK-NEXT:    lsll r4, r5, r8
; CHECK-NEXT:    vmsr p0, r12
; CHECK-NEXT:    mla r3, lr, r0, r7
; CHECK-NEXT:    lsll r4, r5, r1
; CHECK-NEXT:    rsbs r1, r2, #0
; CHECK-NEXT:    lsll r6, r3, r1
; CHECK-NEXT:    lsll r6, r3, r0
; CHECK-NEXT:    vmov q0[2], q0[0], r6, r4
; CHECK-NEXT:    vpsel q0, q0, q1
; CHECK-NEXT:    vmov.f32 s1, s2
; CHECK-NEXT:    vmov.f32 s2, s8
; CHECK-NEXT:    vmov.f32 s3, s10
; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
entry:
  %sa = sext <4 x i32> %a to <4 x i64>
  %sb = zext <4 x i32> %b to <4 x i64>
  %add = add <4 x i64> %sa, %sb
  %ashr = ashr <4 x i64> %add, %sb
  %sub = sub <4 x i64> %ashr, %sb
  %mul = mul <4 x i64> %sub, %sb
  %lshr = lshr <4 x i64> %mul, %sa
  %shl = shl <4 x i64> %lshr, %sb
  %cmp = icmp eq <4 x i64> %sa, %sb
  %sel = select <4 x i1> %cmp, <4 x i64> %shl, <4 x i64> %sb
  %t = trunc <4 x i64> %sel to <4 x i32>
  ret <4 x i32> %t
}

define arm_aapcs_vfpcc <8 x i16> @ext_ops_trunc_i16(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: ext_ops_trunc_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
; CHECK-NEXT:    vpush {d8, d9, d10, d11}
; CHECK-NEXT:    vmovlt.u16 q2, q1
; CHECK-NEXT:    vmovlt.s16 q3, q0
; CHECK-NEXT:    vadd.i32 q4, q3, q2
; CHECK-NEXT:    vneg.s32 q5, q2
; CHECK-NEXT:    vshl.s32 q4, q4, q5
; CHECK-NEXT:    vneg.s32 q5, q3
; CHECK-NEXT:    vsub.i32 q4, q4, q2
; CHECK-NEXT:    vmul.i32 q4, q4, q2
; CHECK-NEXT:    vmovlb.u16 q1, q1
; CHECK-NEXT:    vshl.u32 q4, q4, q5
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    vpt.i32 eq, q3, q2
; CHECK-NEXT:    vshlt.u32 q2, q4, q2
; CHECK-NEXT:    vadd.i32 q3, q0, q1
; CHECK-NEXT:    vneg.s32 q4, q1
; CHECK-NEXT:    vshl.s32 q3, q3, q4
; CHECK-NEXT:    vneg.s32 q4, q0
; CHECK-NEXT:    vsub.i32 q3, q3, q1
; CHECK-NEXT:    vmul.i32 q3, q3, q1
; CHECK-NEXT:    vshl.u32 q3, q3, q4
; CHECK-NEXT:    vpt.i32 eq, q0, q1
; CHECK-NEXT:    vshlt.u32 q1, q3, q1
; CHECK-NEXT:    vmovnt.i32 q1, q2
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    vpop {d8, d9, d10, d11}
; CHECK-NEXT:    bx lr
entry:
  %sa = sext <8 x i16> %a to <8 x i32>
  %sb = zext <8 x i16> %b to <8 x i32>
  %add = add <8 x i32> %sa, %sb
  %ashr = ashr <8 x i32> %add, %sb
  %sub = sub <8 x i32> %ashr, %sb
  %mul = mul <8 x i32> %sub, %sb
  %lshr = lshr <8 x i32> %mul, %sa
  %shl = shl <8 x i32> %lshr, %sb
  %cmp = icmp eq <8 x i32> %sa, %sb
  %sel = select <8 x i1> %cmp, <8 x i32> %shl, <8 x i32> %sb
  %t = trunc <8 x i32> %sel to <8 x i16>
  ret <8 x i16> %t
}

define arm_aapcs_vfpcc <16 x i8> @ext_ops_trunc_i8(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: ext_ops_trunc_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
; CHECK-NEXT:    vpush {d8, d9, d10, d11}
; CHECK-NEXT:    vmovlt.u8 q2, q1
; CHECK-NEXT:    vmovlt.s8 q3, q0
; CHECK-NEXT:    vadd.i16 q4, q3, q2
; CHECK-NEXT:    vneg.s16 q5, q2
; CHECK-NEXT:    vshl.s16 q4, q4, q5
; CHECK-NEXT:    vneg.s16 q5, q3
; CHECK-NEXT:    vsub.i16 q4, q4, q2
; CHECK-NEXT:    vmul.i16 q4, q4, q2
; CHECK-NEXT:    vmovlb.u8 q1, q1
; CHECK-NEXT:    vshl.u16 q4, q4, q5
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vpt.i16 eq, q3, q2
; CHECK-NEXT:    vshlt.u16 q2, q4, q2
; CHECK-NEXT:    vadd.i16 q3, q0, q1
; CHECK-NEXT:    vneg.s16 q4, q1
; CHECK-NEXT:    vshl.s16 q3, q3, q4
; CHECK-NEXT:    vneg.s16 q4, q0
; CHECK-NEXT:    vsub.i16 q3, q3, q1
; CHECK-NEXT:    vmul.i16 q3, q3, q1
; CHECK-NEXT:    vshl.u16 q3, q3, q4
; CHECK-NEXT:    vpt.i16 eq, q0, q1
; CHECK-NEXT:    vshlt.u16 q1, q3, q1
; CHECK-NEXT:    vmovnt.i16 q1, q2
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    vpop {d8, d9, d10, d11}
; CHECK-NEXT:    bx lr
entry:
  %sa = sext <16 x i8> %a to <16 x i16>
  %sb = zext <16 x i8> %b to <16 x i16>
  %add = add <16 x i16> %sa, %sb
  %ashr = ashr <16 x i16> %add, %sb
  %sub = sub <16 x i16> %ashr, %sb
  %mul = mul <16 x i16> %sub, %sb
  %lshr = lshr <16 x i16> %mul, %sa
  %shl = shl <16 x i16> %lshr, %sb
  %cmp = icmp eq <16 x i16> %sa, %sb
  %sel = select <16 x i1> %cmp, <16 x i16> %shl, <16 x i16> %sb
  %t = trunc <16 x i16> %sel to <16 x i8>
  ret <16 x i8> %t
}

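; Saturating add/sub, abs and min/max intrinsics applied to the extended values,
; again performed on the bottom and top halves separately and recombined with
; vmovnt.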
define arm_aapcs_vfpcc <8 x i16> @ext_intrinsics_trunc_i16(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: ext_intrinsics_trunc_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    vmovlb.u16 q2, q1
; CHECK-NEXT:    vmovlb.s16 q3, q0
; CHECK-NEXT:    vadd.i32 q4, q3, q2
; CHECK-NEXT:    vmovlt.u16 q1, q1
; CHECK-NEXT:    vqadd.u32 q4, q4, q2
; CHECK-NEXT:    vmovlt.s16 q0, q0
; CHECK-NEXT:    vqsub.s32 q4, q4, q3
; CHECK-NEXT:    vqsub.u32 q4, q4, q2
; CHECK-NEXT:    vabs.s32 q4, q4
; CHECK-NEXT:    vmin.s32 q4, q4, q3
; CHECK-NEXT:    vmax.s32 q4, q4, q2
; CHECK-NEXT:    vmin.u32 q3, q4, q3
; CHECK-NEXT:    vadd.i32 q4, q0, q1
; CHECK-NEXT:    vqadd.u32 q4, q4, q1
; CHECK-NEXT:    vqsub.s32 q4, q4, q0
; CHECK-NEXT:    vqsub.u32 q4, q4, q1
; CHECK-NEXT:    vabs.s32 q4, q4
; CHECK-NEXT:    vmin.s32 q4, q4, q0
; CHECK-NEXT:    vmax.s32 q4, q4, q1
; CHECK-NEXT:    vmin.u32 q0, q4, q0
; CHECK-NEXT:    vmax.u32 q1, q0, q1
; CHECK-NEXT:    vmax.u32 q0, q3, q2
; CHECK-NEXT:    vmovnt.i32 q0, q1
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    bx lr
entry:
  %sa = sext <8 x i16> %a to <8 x i32>
  %sb = zext <8 x i16> %b to <8 x i32>
  %sadd = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> %sa, <8 x i32> %sb)
  %uadd = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> %sadd, <8 x i32> %sb)
  %ssub = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> %uadd, <8 x i32> %sa)
  %usub = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> %ssub, <8 x i32> %sb)
  %abs = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %usub, i1 true)
  %smin = call <8 x i32> @llvm.smin.v8i32(<8 x i32> %abs, <8 x i32> %sa)
  %smax = call <8 x i32> @llvm.smax.v8i32(<8 x i32> %smin, <8 x i32> %sb)
  %umin = call <8 x i32> @llvm.umin.v8i32(<8 x i32> %smax, <8 x i32> %sa)
  %umax = call <8 x i32> @llvm.umax.v8i32(<8 x i32> %umin, <8 x i32> %sb)
  %t = trunc <8 x i32> %umax to <8 x i16>
  ret <8 x i16> %t
}

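; The floating-point equivalent: f16 lanes are extended to f32 with vcvtb/vcvtt,
; the intrinsics are applied, and the result is narrowed back to f16.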
define arm_aapcs_vfpcc <8 x half> @ext_fpintrinsics_trunc_half(<8 x half> %a, <8 x half> %b) {
; CHECK-LABEL: ext_fpintrinsics_trunc_half:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
; CHECK-NEXT:    vpush {d8, d9, d10, d11}
; CHECK-NEXT:    vcvtb.f32.f16 q2, q0
; CHECK-NEXT:    vcvtb.f32.f16 q4, q1
; CHECK-NEXT:    vrintm.f32 q3, q2
; CHECK-NEXT:    vrintx.f32 q5, q4
; CHECK-NEXT:    vabs.f32 q3, q3
; CHECK-NEXT:    vrinta.f32 q4, q4
; CHECK-NEXT:    vminnm.f32 q3, q3, q2
; CHECK-NEXT:    vrintp.f32 q2, q2
; CHECK-NEXT:    vmaxnm.f32 q3, q3, q5
; CHECK-NEXT:    vcvtt.f32.f16 q0, q0
; CHECK-NEXT:    vfma.f32 q2, q3, q4
; CHECK-NEXT:    vrintm.f32 q3, q0
; CHECK-NEXT:    vabs.f32 q3, q3
; CHECK-NEXT:    vcvtt.f32.f16 q1, q1
; CHECK-NEXT:    vminnm.f32 q3, q3, q0
; CHECK-NEXT:    vrintx.f32 q4, q1
; CHECK-NEXT:    vmaxnm.f32 q3, q3, q4
; CHECK-NEXT:    vrinta.f32 q1, q1
; CHECK-NEXT:    vrintp.f32 q0, q0
; CHECK-NEXT:    vrintz.f32 q2, q2
; CHECK-NEXT:    vfma.f32 q0, q3, q1
; CHECK-NEXT:    vrintz.f32 q1, q0
; CHECK-NEXT:    vcvtb.f16.f32 q0, q2
; CHECK-NEXT:    vcvtt.f16.f32 q0, q1
; CHECK-NEXT:    vpop {d8, d9, d10, d11}
; CHECK-NEXT:    bx lr
entry:
  %sa = fpext <8 x half> %a to <8 x float>
  %sb = fpext <8 x half> %b to <8 x float>
  %floor = call <8 x float> @llvm.floor.v8f32(<8 x float> %sa)
  %rint = call <8 x float> @llvm.rint.v8f32(<8 x float> %sb)
  %ceil = call <8 x float> @llvm.ceil.v8f32(<8 x float> %sa)
  %round = call <8 x float> @llvm.round.v8f32(<8 x float> %sb)
  %abs = call <8 x float> @llvm.fabs.v8f32(<8 x float> %floor)
  %min = call <8 x float> @llvm.minnum.v8f32(<8 x float> %abs, <8 x float> %sa)
  %max = call <8 x float> @llvm.maxnum.v8f32(<8 x float> %min, <8 x float> %rint)
  %fma = call <8 x float> @llvm.fma.v8f32(<8 x float> %max, <8 x float> %round, <8 x float> %ceil)
  %trunc = call <8 x float> @llvm.trunc.v8f32(<8 x float> %fma)
  %t = fptrunc <8 x float> %trunc to <8 x half>
  ret <8 x half> %t
}

declare <8 x i32> @llvm.abs.v8i32(<8 x i32>, i1)
declare <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32>, <8 x i32>)
declare <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32>, <8 x i32>)
declare <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32>, <8 x i32>)
declare <8 x i32> @llvm.usub.sat.v8i32(<8 x i32>, <8 x i32>)
declare <8 x i32> @llvm.smin.v8i32(<8 x i32>, <8 x i32>)
declare <8 x i32> @llvm.smax.v8i32(<8 x i32>, <8 x i32>)
declare <8 x i32> @llvm.umin.v8i32(<8 x i32>, <8 x i32>)
declare <8 x i32> @llvm.umax.v8i32(<8 x i32>, <8 x i32>)
declare <8 x float> @llvm.fabs.v8f32(<8 x float>)
declare <8 x float> @llvm.minnum.v8f32(<8 x float>, <8 x float>)
declare <8 x float> @llvm.maxnum.v8f32(<8 x float>, <8 x float>)
declare <8 x float> @llvm.fma.v8f32(<8 x float>, <8 x float>, <8 x float>)
declare <8 x float> @llvm.ceil.v8f32(<8 x float>)
declare <8 x float> @llvm.floor.v8f32(<8 x float>)
declare <8 x float> @llvm.rint.v8f32(<8 x float>)
declare <8 x float> @llvm.round.v8f32(<8 x float>)
declare <8 x float> @llvm.trunc.v8f32(<8 x float>)