; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve,+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-LV
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-LV,CHECKFP
; RUN: llc -early-live-intervals -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve,+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-LIS
; RUN: llc -early-live-intervals -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-LIS,CHECKFP

; Code generation tests for shufflevector on 128-bit results, compiled for
; thumbv8.1m.main with either +mve,+fullfp16 or +mve.fp. The last two run
; lines repeat the first two with -early-live-intervals. Each function applies
; one constant shuffle mask (or a few masks followed by adds) and the
; autogenerated assertions pin the exact instruction sequence.
; Do not edit the assertion lines by hand; regenerate them with
; utils/update_llc_test_checks.py.

; i32

; Mask <3,2,1,0>: all four lanes reversed.
define arm_aapcs_vfpcc <4 x i32> @shuffle1_i32(<4 x i32> %src) {
; CHECK-LABEL: shuffle1_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.f32 s4, s3
; CHECK-NEXT:    vmov.f32 s5, s2
; CHECK-NEXT:    vmov.f32 s6, s1
; CHECK-NEXT:    vmov.f32 s7, s0
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %out = shufflevector <4 x i32> %src, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x i32> %out
}

; Mask <0,1,2,3>: identity, only the return is expected.
define arm_aapcs_vfpcc <4 x i32> @shuffle2_i32(<4 x i32> %src) {
; CHECK-LABEL: shuffle2_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    bx lr
entry:
  %out = shufflevector <4 x i32> %src, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i32> %out
}

; Mask <3,1,2,0>: lanes 0 and 3 exchanged, lanes 1 and 2 kept.
define arm_aapcs_vfpcc <4 x i32> @shuffle3_i32(<4 x i32> %src) {
; CHECK-LABEL: shuffle3_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.f32 s4, s3
; CHECK-NEXT:    vmov.f32 s5, s1
; CHECK-NEXT:    vmov.f32 s6, s2
; CHECK-NEXT:    vmov.f32 s7, s0
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %out = shufflevector <4 x i32> %src, <4 x i32> undef, <4 x i32> <i32 3, i32 1, i32 2, i32 0>
  ret <4 x i32> %out
}

; Mask <1,0,3,2>: adjacent lanes swapped within each pair.
define arm_aapcs_vfpcc <4 x i32> @shuffle5_i32(<4 x i32> %src) {
; CHECK-LABEL: shuffle5_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vrev64.32 q1, q0
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %out = shufflevector <4 x i32> %src, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  ret <4 x i32> %out
}

; Mask <undef,undef,undef,3>: the only defined lane is already in place.
define arm_aapcs_vfpcc <4 x i32> @shuffle6_i32(<4 x i32> %src) {
; CHECK-LABEL: shuffle6_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    bx lr
entry:
  %out = shufflevector <4 x i32> %src, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 3>
  ret <4 x i32> %out
}

; Identity on %src1 except lane 2, which takes %src1 lane 1.
define arm_aapcs_vfpcc <4 x i32> @oneoff11_i32(<4 x i32> %src1, <4 x i32> %src2) {
; CHECK-LABEL: oneoff11_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.f32 s2, s1
; CHECK-NEXT:    bx lr
entry:
  %out = shufflevector <4 x i32> %src1, <4 x i32> %src2, <4 x i32> <i32 0, i32 1, i32 1, i32 3>
  ret <4 x i32> %out
}

; Identity on %src1 except lane 0, which takes %src2 lane 0 (mask index 4).
define arm_aapcs_vfpcc <4 x i32> @oneoff12_i32(<4 x i32> %src1, <4 x i32> %src2) {
; CHECK-LABEL: oneoff12_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.f32 s0, s4
; CHECK-NEXT:    bx lr
entry:
  %out = shufflevector <4 x i32> %src1, <4 x i32> %src2, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
  ret <4 x i32> %out
}

; Identity on %src2 except lane 3, which takes %src1 lane 0.
define arm_aapcs_vfpcc <4 x i32> @oneoff21_i32(<4 x i32> %src1, <4 x i32> %src2) {
; CHECK-LABEL: oneoff21_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.f32 s7, s0
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %out = shufflevector <4 x i32> %src1, <4 x i32> %src2, <4 x i32> <i32 4, i32 5, i32 6, i32 0>
  ret <4 x i32> %out
}

; Identity on %src2 except lane 2, which takes %src2 lane 0 (mask index 4).
define arm_aapcs_vfpcc <4 x i32> @oneoff22_i32(<4 x i32> %src1, <4 x i32> %src2) {
; CHECK-LABEL: oneoff22_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    vmov.f32 s2, s0
; CHECK-NEXT:    bx lr
entry:
  %out = shufflevector <4 x i32> %src1, <4 x i32> %src2, <4 x i32> <i32 4, i32 5, i32 4, i32 7>
  ret <4 x i32> %out
}

; Mask <0,4,undef,3>: %src1 with lane 1 taken from %src2 lane 0 and lane 2 undefined.
define arm_aapcs_vfpcc <4 x i32> @oneoffundef_i32(<4 x i32> %src1, <4 x i32> %src2) {
; CHECK-LABEL: oneoffundef_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.f32 s1, s4
; CHECK-NEXT:    bx lr
entry:
  %out = shufflevector <4 x i32> %src1, <4 x i32> %src2, <4 x i32> <i32 0, i32 4, i32 undef, i32 3>
  ret <4 x i32> %out
}

; Stride-2 de-interleave of an 8-element source (even and odd lanes), then added.
define arm_aapcs_vfpcc <4 x i32> @shuffle2step_i32(<8 x i32> %src) {
; CHECK-LABEL: shuffle2step_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.f32 s8, s1
; CHECK-NEXT:    vmov.f32 s9, s3
; CHECK-NEXT:    vmov.f32 s1, s2
; CHECK-NEXT:    vmov.f32 s10, s5
; CHECK-NEXT:    vmov.f32 s11, s7
; CHECK-NEXT:    vmov.f32 s2, s4
; CHECK-NEXT:    vmov.f32 s3, s6
; CHECK-NEXT:    vadd.i32 q0, q0, q2
; CHECK-NEXT:    bx lr
entry:
  %s1 = shufflevector <8 x i32> %src, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %s2 = shufflevector <8 x i32> %src, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %r = add <4 x i32> %s1, %s2
  ret <4 x i32> %r
}

; Stride-3 de-interleave; only indices 0..11 of the 16-element source are read.
define arm_aapcs_vfpcc <4 x i32> @shuffle3step_i32(<16 x i32> %src) {
; CHECK-LABEL: shuffle3step_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    vmov.f32 s13, s4
; CHECK-NEXT:    vmov.f32 s14, s7
; CHECK-NEXT:    vmov.f32 s18, s6
; CHECK-NEXT:    vmov.f32 s12, s1
; CHECK-NEXT:    vmov.f32 s15, s10
; CHECK-NEXT:    vmov.f32 s16, s0
; CHECK-NEXT:    vmov.f32 s17, s3
; CHECK-NEXT:    vmov.f32 s19, s9
; CHECK-NEXT:    vadd.i32 q3, q4, q3
; CHECK-NEXT:    vmov.f32 s4, s2
; CHECK-NEXT:    vmov.f32 s6, s8
; CHECK-NEXT:    vmov.f32 s7, s11
; CHECK-NEXT:    vadd.i32 q0, q3, q1
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    bx lr
entry:
  %s1 = shufflevector <16 x i32> %src, <16 x i32> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
  %s2 = shufflevector <16 x i32> %src, <16 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
  %s3 = shufflevector <16 x i32> %src, <16 x i32> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
  %a = add <4 x i32> %s1, %s2
  %r = add <4 x i32> %a, %s3
  ret <4 x i32> %r
}

; Stride-4 de-interleave of a 16-element source into four vectors, then summed.
define arm_aapcs_vfpcc <4 x i32> @shuffle4step_i32(<16 x i32> %src) {
; CHECK-LABEL: shuffle4step_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
; CHECK-NEXT:    vpush {d8, d9, d10, d11}
; CHECK-NEXT:    vmov.f32 s16, s3
; CHECK-NEXT:    vmov.f32 s20, s2
; CHECK-NEXT:    vmov.f32 s17, s7
; CHECK-NEXT:    vmov.f32 s18, s11
; CHECK-NEXT:    vmov.f32 s19, s15
; CHECK-NEXT:    vmov.f32 s21, s6
; CHECK-NEXT:    vmov.f32 s22, s10
; CHECK-NEXT:    vmov.f32 s23, s14
; CHECK-NEXT:    vadd.i32 q4, q5, q4
; CHECK-NEXT:    vmov.f32 s20, s1
; CHECK-NEXT:    vmov.f32 s21, s5
; CHECK-NEXT:    vmov.f32 s22, s9
; CHECK-NEXT:    vmov.f32 s23, s13
; CHECK-NEXT:    vmov.f32 s1, s4
; CHECK-NEXT:    vmov.f32 s2, s8
; CHECK-NEXT:    vmov.f32 s3, s12
; CHECK-NEXT:    vadd.i32 q0, q0, q5
; CHECK-NEXT:    vadd.i32 q0, q0, q4
; CHECK-NEXT:    vpop {d8, d9, d10, d11}
; CHECK-NEXT:    bx lr
entry:
  %s1 = shufflevector <16 x i32> %src, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  %s2 = shufflevector <16 x i32> %src, <16 x i32> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
  %s3 = shufflevector <16 x i32> %src, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
  %s4 = shufflevector <16 x i32> %src, <16 x i32> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
  %a1 = add <4 x i32> %s1, %s2
  %a2 = add <4 x i32> %s3, %s4
  %r = add <4 x i32> %a1, %a2
  ret <4 x i32> %r
}

; i16

; Mask <7..0>: all eight lanes reversed.
define arm_aapcs_vfpcc <8 x i16> @shuffle1_i16(<8 x i16> %src) {
; CHECK-LABEL: shuffle1_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vrev64.16 q1, q0
; CHECK-NEXT:    vmov.f32 s0, s6
; CHECK-NEXT:    vmov.f32 s1, s7
; CHECK-NEXT:    vmov.f32 s2, s4
; CHECK-NEXT:    vmov.f32 s3, s5
; CHECK-NEXT:    bx lr
entry:
  %out = shufflevector <8 x i16> %src, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  ret <8 x i16> %out
}

; Mask <0..7>: identity, only the return is expected.
define arm_aapcs_vfpcc <8 x i16> @shuffle2_i16(<8 x i16> %src) {
; CHECK-LABEL: shuffle2_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    bx lr
entry:
  %out = shufflevector <8 x i16> %src, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %out
}

; Mask <4,5,7,6,3,1,2,0>: irregular single-source permutation.
define arm_aapcs_vfpcc <8 x i16> @shuffle3_i16(<8 x i16> %src) {
; CHECK-LABEL: shuffle3_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov q1, q0
; CHECK-NEXT:    vmovx.f16 s2, s5
; CHECK-NEXT:    vmovx.f16 s0, s4
; CHECK-NEXT:    vins.f16 s5, s4
; CHECK-NEXT:    vins.f16 s2, s0
; CHECK-NEXT:    vmov.f32 s3, s5
; CHECK-NEXT:    vmovx.f16 s1, s7
; CHECK-NEXT:    vmov.f32 s0, s6
; CHECK-NEXT:    vins.f16 s1, s7
; CHECK-NEXT:    bx lr
entry:
  %out = shufflevector <8 x i16> %src, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 7, i32 6, i32 3, i32 1, i32 2, i32 0>
  ret <8 x i16> %out
}

; Mask <3,2,1,0,7,6,5,4>: lanes reversed within each group of four.
define arm_aapcs_vfpcc <8 x i16> @shuffle5_i16(<8 x i16> %src) {
; CHECK-LABEL: shuffle5_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vrev64.16 q1, q0
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %out = shufflevector <8 x i16> %src, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
  ret <8 x i16> %out
}

; Mask <1,0,3,2,5,4,7,6>: adjacent lanes swapped within each pair.
define arm_aapcs_vfpcc <8 x i16> @shuffle6_i16(<8 x i16> %src) {
; CHECK-LABEL: shuffle6_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vrev32.16 q0, q0
; CHECK-NEXT:    bx lr
entry:
  %out = shufflevector <8 x i16> %src, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  ret <8 x i16> %out
}

; Identity on %src1 except lane 2, which takes %src1 lane 1.
define arm_aapcs_vfpcc <8 x i16> @oneoff11_i16(<8 x i16> %src1, <8 x i16> %src2) {
; CHECK-LABEL: oneoff11_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.u16 r0, q0[1]
; CHECK-NEXT:    vmov.16 q0[2], r0
; CHECK-NEXT:    bx lr
entry:
  %out = shufflevector <8 x i16> %src1, <8 x i16> %src2, <8 x i32> <i32 0, i32 1, i32 1, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %out
}

; Identity on %src1 except lane 0, which takes %src2 lane 0 (mask index 8).
define arm_aapcs_vfpcc <8 x i16> @oneoff12_i16(<8 x i16> %src1, <8 x i16> %src2) {
; CHECK-LABEL: oneoff12_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.u16 r0, q1[0]
; CHECK-NEXT:    vmov.16 q0[0], r0
; CHECK-NEXT:    bx lr
entry:
  %out = shufflevector <8 x i16> %src1, <8 x i16> %src2, <8 x i32> <i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %out
}

; Identity on %src2 except lane 3, which takes %src1 lane 0.
define arm_aapcs_vfpcc <8 x i16> @oneoff21_i16(<8 x i16> %src1, <8 x i16> %src2) {
; CHECK-LABEL: oneoff21_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vins.f16 s5, s0
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %out = shufflevector <8 x i16> %src1, <8 x i16> %src2, <8 x i32> <i32 8, i32 9, i32 10, i32 0, i32 12, i32 13, i32 14, i32 15>
  ret <8 x i16> %out
}

; Identity on %src2 except lane 0, which takes %src2 lane 6 (mask index 14).
define arm_aapcs_vfpcc <8 x i16> @oneoff22_i16(<8 x i16> %src1, <8 x i16> %src2) {
; CHECK-LABEL: oneoff22_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    vmov.u16 r0, q1[6]
; CHECK-NEXT:    vmov.16 q0[0], r0
; CHECK-NEXT:    bx lr
entry:
  %out = shufflevector <8 x i16> %src1, <8 x i16> %src2, <8 x i32> <i32 14, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <8 x i16> %out
}

; %src2 with lanes 2 and 3 undefined and lane 5 taken from %src1 lane 3.
define arm_aapcs_vfpcc <8 x i16> @oneoffundef_i16(<8 x i16> %src1, <8 x i16> %src2) {
; CHECK-LABEL: oneoffundef_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.u16 r0, q0[3]
; CHECK-NEXT:    vmov.16 q1[5], r0
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %out = shufflevector <8 x i16> %src1, <8 x i16> %src2, <8 x i32> <i32 8, i32 9, i32 undef, i32 undef, i32 12, i32 3, i32 14, i32 15>
  ret <8 x i16> %out
}

; Stride-2 de-interleave of a 16-element source (even and odd lanes), then added.
define arm_aapcs_vfpcc <8 x i16> @shuffle2step_i16(<16 x i16> %src) {
; CHECK-LABEL: shuffle2step_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .pad #32
; CHECK-NEXT:    sub sp, #32
; CHECK-NEXT:    mov r0, sp
; CHECK-NEXT:    vshr.u32 q2, q1, #16
; CHECK-NEXT:    vstrh.32 q2, [r0, #8]
; CHECK-NEXT:    vshr.u32 q2, q0, #16
; CHECK-NEXT:    add r1, sp, #16
; CHECK-NEXT:    vstrh.32 q2, [r0]
; CHECK-NEXT:    vstrh.32 q1, [r1, #8]
; CHECK-NEXT:    vstrh.32 q0, [r1]
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    vadd.i16 q0, q1, q0
; CHECK-NEXT:    add sp, #32
; CHECK-NEXT:    bx lr
entry:
  %s1 = shufflevector <16 x i16> %src, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %s2 = shufflevector <16 x i16> %src, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %r = add <8 x i16> %s1, %s2
  ret <8 x i16> %r
}

; Stride-3 de-interleave; only indices 0..23 of the 32-element source are read.
define arm_aapcs_vfpcc <8 x i16> @shuffle3step_i16(<32 x i16> %src) {
; CHECK-LABEL: shuffle3step_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    vmovx.f16 s12, s0
; CHECK-NEXT:    vmov.f32 s16, s1
; CHECK-NEXT:    vins.f16 s12, s2
; CHECK-NEXT:    vmovx.f16 s2, s2
; CHECK-NEXT:    vins.f16 s16, s2
; CHECK-NEXT:    vmovx.f16 s2, s5
; CHECK-NEXT:    vmov.f32 s17, s4
; CHECK-NEXT:    vmovx.f16 s13, s3
; CHECK-NEXT:    vins.f16 s17, s2
; CHECK-NEXT:    vmov.f32 s18, s7
; CHECK-NEXT:    vmovx.f16 s2, s8
; CHECK-NEXT:    vmov.f32 s19, s10
; CHECK-NEXT:    vins.f16 s18, s2
; CHECK-NEXT:    vmovx.f16 s2, s11
; CHECK-NEXT:    vins.f16 s19, s2
; CHECK-NEXT:    vmovx.f16 s2, s1
; CHECK-NEXT:    vins.f16 s0, s2
; CHECK-NEXT:    vmovx.f16 s2, s4
; CHECK-NEXT:    vins.f16 s3, s2
; CHECK-NEXT:    vmovx.f16 s2, s7
; CHECK-NEXT:    vmovx.f16 s4, s10
; CHECK-NEXT:    vmovx.f16 s14, s6
; CHECK-NEXT:    vmovx.f16 s15, s9
; CHECK-NEXT:    vins.f16 s6, s2
; CHECK-NEXT:    vins.f16 s9, s4
; CHECK-NEXT:    vmov.f32 s1, s3
; CHECK-NEXT:    vins.f16 s14, s8
; CHECK-NEXT:    vins.f16 s15, s11
; CHECK-NEXT:    vins.f16 s13, s5
; CHECK-NEXT:    vmov.f32 s2, s6
; CHECK-NEXT:    vmov.f32 s3, s9
; CHECK-NEXT:    vadd.i16 q0, q0, q3
; CHECK-NEXT:    vadd.i16 q0, q0, q4
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    bx lr
entry:
  %s1 = shufflevector <32 x i16> %src, <32 x i16> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
  %s2 = shufflevector <32 x i16> %src, <32 x i16> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
  %s3 = shufflevector <32 x i16> %src, <32 x i16> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
  %a = add <8 x i16> %s1, %s2
  %r = add <8 x i16> %a, %s3
  ret <8 x i16> %r
}

; Stride-4 de-interleave of a 32-element source into four vectors, then summed.
define arm_aapcs_vfpcc <8 x i16> @shuffle4step_i16(<32 x i16> %src) {
; CHECK-LABEL: shuffle4step_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
; CHECK-NEXT:    vpush {d8, d9, d10, d11}
; CHECK-NEXT:    vmovx.f16 s18, s9
; CHECK-NEXT:    vmovx.f16 s16, s11
; CHECK-NEXT:    vins.f16 s18, s16
; CHECK-NEXT:    vmovx.f16 s19, s13
; CHECK-NEXT:    vmovx.f16 s16, s15
; CHECK-NEXT:    vmovx.f16 s20, s3
; CHECK-NEXT:    vins.f16 s19, s16
; CHECK-NEXT:    vmovx.f16 s16, s1
; CHECK-NEXT:    vins.f16 s16, s20
; CHECK-NEXT:    vmovx.f16 s17, s5
; CHECK-NEXT:    vmovx.f16 s20, s7
; CHECK-NEXT:    vins.f16 s9, s11
; CHECK-NEXT:    vins.f16 s13, s15
; CHECK-NEXT:    vins.f16 s5, s7
; CHECK-NEXT:    vins.f16 s1, s3
; CHECK-NEXT:    vins.f16 s17, s20
; CHECK-NEXT:    vmov.f32 s20, s1
; CHECK-NEXT:    vmovx.f16 s1, s10
; CHECK-NEXT:    vmov.f32 s22, s9
; CHECK-NEXT:    vmov.f32 s23, s13
; CHECK-NEXT:    vmov.f32 s21, s5
; CHECK-NEXT:    vadd.i16 q4, q5, q4
; CHECK-NEXT:    vmovx.f16 s22, s8
; CHECK-NEXT:    vins.f16 s22, s1
; CHECK-NEXT:    vmovx.f16 s23, s12
; CHECK-NEXT:    vmovx.f16 s1, s14
; CHECK-NEXT:    vmovx.f16 s20, s0
; CHECK-NEXT:    vins.f16 s23, s1
; CHECK-NEXT:    vmovx.f16 s1, s2
; CHECK-NEXT:    vins.f16 s20, s1
; CHECK-NEXT:    vmovx.f16 s21, s4
; CHECK-NEXT:    vmovx.f16 s1, s6
; CHECK-NEXT:    vins.f16 s12, s14
; CHECK-NEXT:    vins.f16 s8, s10
; CHECK-NEXT:    vins.f16 s4, s6
; CHECK-NEXT:    vins.f16 s21, s1
; CHECK-NEXT:    vins.f16 s0, s2
; CHECK-NEXT:    vmov.f32 s3, s12
; CHECK-NEXT:    vmov.f32 s1, s4
; CHECK-NEXT:    vmov.f32 s2, s8
; CHECK-NEXT:    vadd.i16 q0, q0, q5
; CHECK-NEXT:    vadd.i16 q0, q0, q4
; CHECK-NEXT:    vpop {d8, d9, d10, d11}
; CHECK-NEXT:    bx lr
entry:
  %s1 = shufflevector <32 x i16> %src, <32 x i16> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
  %s2 = shufflevector <32 x i16> %src, <32 x i16> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
  %s3 = shufflevector <32 x i16> %src, <32 x i16> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
  %s4 = shufflevector <32 x i16> %src, <32 x i16> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
  %a1 = add <8 x i16> %s1, %s2
  %a2 = add <8 x i16> %s3, %s4
  %r = add <8 x i16> %a1, %a2
  ret <8 x i16> %r
}

; i8

; Mask <15..0>: all sixteen lanes reversed.
define arm_aapcs_vfpcc <16 x i8> @shuffle1_i8(<16 x i8> %src) {
; CHECK-LABEL: shuffle1_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vrev64.8 q1, q0
; CHECK-NEXT:    vmov.f32 s0, s6
; CHECK-NEXT:    vmov.f32 s1, s7
; CHECK-NEXT:    vmov.f32 s2, s4
; CHECK-NEXT:    vmov.f32 s3, s5
; CHECK-NEXT:    bx lr
entry:
  %out = shufflevector <16 x i8> %src, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  ret <16 x i8> %out
}

; Mask <0..15>: identity, only the return is expected.
define arm_aapcs_vfpcc <16 x i8> @shuffle2_i8(<16 x i8> %src) {
; CHECK-LABEL: shuffle2_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    bx lr
entry:
  %out = shufflevector <16 x i8> %src, <16 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %out
}

; Irregular single-source permutation; every lane is moved individually.
define arm_aapcs_vfpcc <16 x i8> @shuffle3_i8(<16 x i8> %src) {
; CHECK-LABEL: shuffle3_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov q1, q0
; CHECK-NEXT:    vmov.u8 r0, q0[4]
; CHECK-NEXT:    vmov.8 q0[0], r0
; CHECK-NEXT:    vmov.u8 r0, q1[5]
; CHECK-NEXT:    vmov.8 q0[1], r0
; CHECK-NEXT:    vmov.u8 r0, q1[15]
; CHECK-NEXT:    vmov.8 q0[2], r0
; CHECK-NEXT:    vmov.u8 r0, q1[7]
; CHECK-NEXT:    vmov.8 q0[3], r0
; CHECK-NEXT:    vmov.u8 r0, q1[14]
; CHECK-NEXT:    vmov.8 q0[4], r0
; CHECK-NEXT:    vmov.u8 r0, q1[9]
; CHECK-NEXT:    vmov.8 q0[5], r0
; CHECK-NEXT:    vmov.u8 r0, q1[6]
; CHECK-NEXT:    vmov.8 q0[6], r0
; CHECK-NEXT:    vmov.u8 r0, q1[3]
; CHECK-NEXT:    vmov.8 q0[7], r0
; CHECK-NEXT:    vmov.u8 r0, q1[10]
; CHECK-NEXT:    vmov.8 q0[8], r0
; CHECK-NEXT:    vmov.u8 r0, q1[12]
; CHECK-NEXT:    vmov.8 q0[9], r0
; CHECK-NEXT:    vmov.u8 r0, q1[1]
; CHECK-NEXT:    vmov.8 q0[10], r0
; CHECK-NEXT:    vmov.u8 r0, q1[13]
; CHECK-NEXT:    vmov.8 q0[11], r0
; CHECK-NEXT:    vmov.u8 r0, q1[2]
; CHECK-NEXT:    vmov.8 q0[12], r0
; CHECK-NEXT:    vmov.u8 r0, q1[8]
; CHECK-NEXT:    vmov.8 q0[13], r0
; CHECK-NEXT:    vmov.u8 r0, q1[0]
; CHECK-NEXT:    vmov.8 q0[14], r0
; CHECK-NEXT:    vmov.u8 r0, q1[11]
; CHECK-NEXT:    vmov.8 q0[15], r0
; CHECK-NEXT:    bx lr
entry:
  %out = shufflevector <16 x i8> %src, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 15, i32 7, i32 14, i32 9, i32 6, i32 3, i32 10, i32 12, i32 1, i32 13, i32 2, i32 8, i32 0, i32 11>
  ret <16 x i8> %out
}

; Lanes reversed within each group of eight.
define arm_aapcs_vfpcc <16 x i8> @shuffle5_i8(<16 x i8> %src) {
; CHECK-LABEL: shuffle5_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vrev64.8 q1, q0
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %out = shufflevector <16 x i8> %src, <16 x i8> undef, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
  ret <16 x i8> %out
}

; Lanes reversed within each group of four.
define arm_aapcs_vfpcc <16 x i8> @shuffle6_i8(<16 x i8> %src) {
; CHECK-LABEL: shuffle6_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vrev32.8 q0, q0
; CHECK-NEXT:    bx lr
entry:
  %out = shufflevector <16 x i8> %src, <16 x i8> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
  ret <16 x i8> %out
}

; Adjacent lanes swapped within each pair.
define arm_aapcs_vfpcc <16 x i8> @shuffle7_i8(<16 x i8> %src) {
; CHECK-LABEL: shuffle7_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vrev16.8 q0, q0
; CHECK-NEXT:    bx lr
entry:
  %out = shufflevector <16 x i8> %src, <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
  ret <16 x i8> %out
}

; Identity on %src1 except lane 2, which takes %src1 lane 1.
define arm_aapcs_vfpcc <16 x i8> @oneoff11_i8(<16 x i8> %src1, <16 x i8> %src2) {
; CHECK-LABEL: oneoff11_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.u8 r0, q0[1]
; CHECK-NEXT:    vmov.8 q0[2], r0
; CHECK-NEXT:    bx lr
entry:
  %out = shufflevector <16 x i8> %src1, <16 x i8> %src2, <16 x i32> <i32 0, i32 1, i32 1, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %out
}

; Identity on %src1 except lane 0, which takes %src2 lane 4 (mask index 20).
define arm_aapcs_vfpcc <16 x i8> @oneoff12_i8(<16 x i8> %src1, <16 x i8> %src2) {
; CHECK-LABEL: oneoff12_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.u8 r0, q1[4]
; CHECK-NEXT:    vmov.8 q0[0], r0
; CHECK-NEXT:    bx lr
entry:
  %out = shufflevector <16 x i8> %src1, <16 x i8> %src2, <16 x i32> <i32 20, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %out
}

; Identity on %src2 except lane 3, which takes %src1 lane 0.
define arm_aapcs_vfpcc <16 x i8> @oneoff21_i8(<16 x i8> %src1, <16 x i8> %src2) {
; CHECK-LABEL: oneoff21_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.u8 r0, q0[0]
; CHECK-NEXT:    vmov.8 q1[3], r0
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %out = shufflevector <16 x i8> %src1, <16 x i8> %src2, <16 x i32> <i32 16, i32 17, i32 18, i32 0, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <16 x i8> %out
}

; Identity on %src2 except lane 9, which takes %src2 lane 15 (mask index 31).
define arm_aapcs_vfpcc <16 x i8> @oneoff22_i8(<16 x i8> %src1, <16 x i8> %src2) {
; CHECK-LABEL: oneoff22_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    vmov.u8 r0, q1[15]
; CHECK-NEXT:    vmov.8 q0[9], r0
; CHECK-NEXT:    bx lr
entry:
  %out = shufflevector <16 x i8> %src1, <16 x i8> %src2, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 31, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <16 x i8> %out
}

; %src1 with lanes 0, 4, 8 and 12 undefined and lane 1 taken from %src1 lane 2.
define arm_aapcs_vfpcc <16 x i8> @oneoffundef_i8(<16 x i8> %src1, <16 x i8> %src2) {
; CHECK-LABEL: oneoffundef_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.u8 r0, q0[2]
; CHECK-NEXT:    vmov.8 q0[1], r0
; CHECK-NEXT:    bx lr
entry:
  %out = shufflevector <16 x i8> %src1, <16 x i8> %src2, <16 x i32> <i32 undef, i32 2, i32 2, i32 3, i32 undef, i32 5, i32 6, i32 7, i32 undef, i32 9, i32 10, i32 11, i32 undef, i32 13, i32 14, i32 15>
  ret <16 x i8> %out
}

; Stride-2 de-interleave of a 32-element source (even and odd lanes), then added.
define arm_aapcs_vfpcc <16 x i8> @shuffle2step_i8(<32 x i8> %src) {
; CHECK-LABEL: shuffle2step_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .pad #32
; CHECK-NEXT:    sub sp, #32
; CHECK-NEXT:    mov r0, sp
; CHECK-NEXT:    vshr.u16 q2, q1, #8
; CHECK-NEXT:    vstrb.16 q2, [r0, #8]
; CHECK-NEXT:    vshr.u16 q2, q0, #8
; CHECK-NEXT:    add r1, sp, #16
; CHECK-NEXT:    vstrb.16 q2, [r0]
; CHECK-NEXT:    vstrb.16 q1, [r1, #8]
; CHECK-NEXT:    vstrb.16 q0, [r1]
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    vadd.i8 q0, q1, q0
; CHECK-NEXT:    add sp, #32
; CHECK-NEXT:    bx lr
entry:
  %s1 = shufflevector <32 x i8> %src, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  %s2 = shufflevector <32 x i8> %src, <32 x i8> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  %r = add <16 x i8> %s1, %s2
  ret <16 x i8> %r
}

; Stride-3 de-interleave; only indices 0..47 of the 64-element source are read.
define arm_aapcs_vfpcc <16 x i8> @shuffle3step_i8(<64 x i8> %src) {
; CHECK-LABEL: shuffle3step_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
; CHECK-NEXT:    vpush {d8, d9, d10, d11}
; CHECK-NEXT:    vmov.u8 r0, q0[1]
; CHECK-NEXT:    vmov.8 q3[0], r0
; CHECK-NEXT:    vmov.u8 r0, q0[4]
; CHECK-NEXT:    vmov.8 q3[1], r0
; CHECK-NEXT:    vmov.u8 r0, q0[7]
; CHECK-NEXT:    vmov.8 q3[2], r0
; CHECK-NEXT:    vmov.u8 r0, q0[10]
; CHECK-NEXT:    vmov.8 q3[3], r0
; CHECK-NEXT:    vmov.u8 r0, q0[13]
; CHECK-NEXT:    vmov.8 q3[4], r0
; CHECK-NEXT:    vmov.u8 r0, q1[0]
; CHECK-NEXT:    vmov.8 q3[5], r0
; CHECK-NEXT:    vmov.u8 r0, q1[3]
; CHECK-NEXT:    vmov.8 q3[6], r0
; CHECK-NEXT:    vmov.u8 r0, q1[9]
; CHECK-NEXT:    vmov.8 q4[8], r0
; CHECK-NEXT:    vmov.u8 r0, q1[12]
; CHECK-NEXT:    vmov.8 q4[9], r0
; CHECK-NEXT:    vmov.u8 r0, q1[15]
; CHECK-NEXT:    vmov.8 q4[10], r0
; CHECK-NEXT:    vmov.u8 r0, q2[2]
; CHECK-NEXT:    vmov.8 q4[11], r0
; CHECK-NEXT:    vmov.u8 r0, q2[5]
; CHECK-NEXT:    vmov.8 q4[12], r0
; CHECK-NEXT:    vmov.u8 r0, q2[8]
; CHECK-NEXT:    vmov.8 q4[13], r0
; CHECK-NEXT:    vmov.u8 r0, q2[11]
; CHECK-NEXT:    vmov.8 q4[14], r0
; CHECK-NEXT:    vmov.u8 r0, q2[14]
; CHECK-NEXT:    vmov.8 q4[15], r0
; CHECK-NEXT:    vmov.u8 r0, q1[6]
; CHECK-NEXT:    vmov.8 q3[7], r0
; CHECK-NEXT:    vmov.u8 r0, q0[0]
; CHECK-NEXT:    vmov.f32 s14, s18
; CHECK-NEXT:    vmov.f32 s15, s19
; CHECK-NEXT:    vmov.8 q4[0], r0
; CHECK-NEXT:    vmov.u8 r0, q0[3]
; CHECK-NEXT:    vmov.8 q4[1], r0
; CHECK-NEXT:    vmov.u8 r0, q0[6]
; CHECK-NEXT:    vmov.8 q4[2], r0
; CHECK-NEXT:    vmov.u8 r0, q0[9]
; CHECK-NEXT:    vmov.8 q4[3], r0
; CHECK-NEXT:    vmov.u8 r0, q0[12]
; CHECK-NEXT:    vmov.8 q4[4], r0
; CHECK-NEXT:    vmov.u8 r0, q0[15]
; CHECK-NEXT:    vmov.8 q4[5], r0
; CHECK-NEXT:    vmov.u8 r0, q1[2]
; CHECK-NEXT:    vmov.8 q4[6], r0
; CHECK-NEXT:    vmov.u8 r0, q1[8]
; CHECK-NEXT:    vmov.8 q5[8], r0
; CHECK-NEXT:    vmov.u8 r0, q1[11]
; CHECK-NEXT:    vmov.8 q5[9], r0
; CHECK-NEXT:    vmov.u8 r0, q1[14]
; CHECK-NEXT:    vmov.8 q5[10], r0
; CHECK-NEXT:    vmov.u8 r0, q2[1]
; CHECK-NEXT:    vmov.8 q5[11], r0
; CHECK-NEXT:    vmov.u8 r0, q2[4]
; CHECK-NEXT:    vmov.8 q5[12], r0
; CHECK-NEXT:    vmov.u8 r0, q2[7]
; CHECK-NEXT:    vmov.8 q5[13], r0
; CHECK-NEXT:    vmov.u8 r0, q2[10]
; CHECK-NEXT:    vmov.8 q5[14], r0
; CHECK-NEXT:    vmov.u8 r0, q2[13]
; CHECK-NEXT:    vmov.8 q5[15], r0
; CHECK-NEXT:    vmov.u8 r0, q1[5]
; CHECK-NEXT:    vmov.8 q4[7], r0
; CHECK-NEXT:    vmov.u8 r0, q0[2]
; CHECK-NEXT:    vmov.f32 s18, s22
; CHECK-NEXT:    vmov.f32 s19, s23
; CHECK-NEXT:    vadd.i8 q3, q4, q3
; CHECK-NEXT:    vmov.8 q4[0], r0
; CHECK-NEXT:    vmov.u8 r0, q0[5]
; CHECK-NEXT:    vmov.8 q4[1], r0
; CHECK-NEXT:    vmov.u8 r0, q0[8]
; CHECK-NEXT:    vmov.8 q4[2], r0
; CHECK-NEXT:    vmov.u8 r0, q0[11]
; CHECK-NEXT:    vmov.8 q4[3], r0
; CHECK-NEXT:    vmov.u8 r0, q0[14]
; CHECK-NEXT:    vmov.8 q4[4], r0
; CHECK-NEXT:    vmov.u8 r0, q1[1]
; CHECK-NEXT:    vmov.8 q4[5], r0
; CHECK-NEXT:    vmov.u8 r0, q1[4]
; CHECK-NEXT:    vmov.8 q4[6], r0
; CHECK-NEXT:    vmov.u8 r0, q1[10]
; CHECK-NEXT:    vmov.8 q0[8], r0
; CHECK-NEXT:    vmov.u8 r0, q1[13]
; CHECK-NEXT:    vmov.8 q0[9], r0
; CHECK-NEXT:    vmov.u8 r0, q2[0]
; CHECK-NEXT:    vmov.8 q0[10], r0
; CHECK-NEXT:    vmov.u8 r0, q2[3]
; CHECK-NEXT:    vmov.8 q0[11], r0
; CHECK-NEXT:    vmov.u8 r0, q2[6]
; CHECK-NEXT:    vmov.8 q0[12], r0
; CHECK-NEXT:    vmov.u8 r0, q2[9]
; CHECK-NEXT:    vmov.8 q0[13], r0
; CHECK-NEXT:    vmov.u8 r0, q2[12]
; CHECK-NEXT:    vmov.8 q0[14], r0
; CHECK-NEXT:    vmov.u8 r0, q2[15]
; CHECK-NEXT:    vmov.8 q0[15], r0
; CHECK-NEXT:    vmov.u8 r0, q1[7]
; CHECK-NEXT:    vmov.8 q4[7], r0
; CHECK-NEXT:    vmov.f32 s18, s2
; CHECK-NEXT:    vmov.f32 s19, s3
; CHECK-NEXT:    vadd.i8 q0, q3, q4
; CHECK-NEXT:    vpop {d8, d9, d10, d11}
; CHECK-NEXT:    bx lr
entry:
  %s1 = shufflevector <64 x i8> %src, <64 x i8> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
  %s2 = shufflevector <64 x i8> %src, <64 x i8> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
  %s3 = shufflevector <64 x i8> %src, <64 x i8> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>
  %a = add <16 x i8> %s1, %s2
  %r = add <16 x i8> %a, %s3
  ret <16 x i8> %r
}

; Stride-4 de-interleave of a 64-element source into four vectors, then summed.
define arm_aapcs_vfpcc <16 x i8> @shuffle4step_i8(<64 x i8> %src) {
; CHECK-LABEL: shuffle4step_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT:    vmov.u8 r0, q0[3]
; CHECK-NEXT:    vmov.8 q4[0], r0
; CHECK-NEXT:    vmov.u8 r0, q0[7]
; CHECK-NEXT:    vmov.8 q4[1], r0
; CHECK-NEXT:    vmov.u8 r0, q0[11]
; CHECK-NEXT:    vmov.8 q4[2], r0
; CHECK-NEXT:    vmov.u8 r0, q0[15]
; CHECK-NEXT:    vmov.8 q4[3], r0
; CHECK-NEXT:    vmov.u8 r0, q1[3]
; CHECK-NEXT:    vmov.8 q4[4], r0
; CHECK-NEXT:    vmov.u8 r0, q1[7]
; CHECK-NEXT:    vmov.8 q4[5], r0
; CHECK-NEXT:    vmov.u8 r0, q1[11]
; CHECK-NEXT:    vmov.8 q4[6], r0
; CHECK-NEXT:    vmov.u8 r0, q2[3]
; CHECK-NEXT:    vmov.8 q5[8], r0
; CHECK-NEXT:    vmov.u8 r0, q2[7]
; CHECK-NEXT:    vmov.8 q5[9], r0
; CHECK-NEXT:    vmov.u8 r0, q2[11]
; CHECK-NEXT:    vmov.8 q5[10], r0
; CHECK-NEXT:    vmov.u8 r0, q2[15]
; CHECK-NEXT:    vmov.8 q5[11], r0
; CHECK-NEXT:    vmov.u8 r0, q3[3]
; CHECK-NEXT:    vmov.8 q5[12], r0
; CHECK-NEXT:    vmov.u8 r0, q3[7]
; CHECK-NEXT:    vmov.8 q5[13], r0
; CHECK-NEXT:    vmov.u8 r0, q3[11]
; CHECK-NEXT:    vmov.8 q5[14], r0
; CHECK-NEXT:    vmov.u8 r0, q3[15]
; CHECK-NEXT:    vmov.8 q5[15], r0
; CHECK-NEXT:    vmov.u8 r0, q1[15]
; CHECK-NEXT:    vmov.8 q4[7], r0
; CHECK-NEXT:    vmov.u8 r0, q0[2]
; CHECK-NEXT:    vmov.f32 s18, s22
; CHECK-NEXT:    vmov.f32 s19, s23
; CHECK-NEXT:    vmov.8 q5[0], r0
; CHECK-NEXT:    vmov.u8 r0, q0[6]
; CHECK-NEXT:    vmov.8 q5[1], r0
; CHECK-NEXT:    vmov.u8 r0, q0[10]
; CHECK-NEXT:    vmov.8 q5[2], r0
; CHECK-NEXT:    vmov.u8 r0, q0[14]
; CHECK-NEXT:    vmov.8 q5[3], r0
; CHECK-NEXT:    vmov.u8 r0, q1[2]
; CHECK-NEXT:    vmov.8 q5[4], r0
; CHECK-NEXT:    vmov.u8 r0, q1[6]
; CHECK-NEXT:    vmov.8 q5[5], r0
; CHECK-NEXT:    vmov.u8 r0, q1[10]
; CHECK-NEXT:    vmov.8 q5[6], r0
; CHECK-NEXT:    vmov.u8 r0, q2[2]
; CHECK-NEXT:    vmov.8 q6[8], r0
; CHECK-NEXT:    vmov.u8 r0, q2[6]
; CHECK-NEXT:    vmov.8 q6[9], r0
; CHECK-NEXT:    vmov.u8 r0, q2[10]
; CHECK-NEXT:    vmov.8 q6[10], r0
; CHECK-NEXT:    vmov.u8 r0, q2[14]
; CHECK-NEXT:    vmov.8 q6[11], r0
; CHECK-NEXT:    vmov.u8 r0, q3[2]
; CHECK-NEXT:    vmov.8 q6[12], r0
; CHECK-NEXT:    vmov.u8 r0, q3[6]
; CHECK-NEXT:    vmov.8 q6[13], r0
; CHECK-NEXT:    vmov.u8 r0, q3[10]
; CHECK-NEXT:    vmov.8 q6[14], r0
; CHECK-NEXT:    vmov.u8 r0, q3[14]
; CHECK-NEXT:    vmov.8 q6[15], r0
; CHECK-NEXT:    vmov.u8 r0, q1[14]
; CHECK-NEXT:    vmov.8 q5[7], r0
; CHECK-NEXT:    vmov.u8 r0, q0[1]
; CHECK-NEXT:    vmov.f32 s22, s26
; CHECK-NEXT:    vmov.f32 s23, s27
; CHECK-NEXT:    vadd.i8 q4, q5, q4
; CHECK-NEXT:    vmov.8 q5[0], r0
; CHECK-NEXT:    vmov.u8 r0, q0[5]
; CHECK-NEXT:    vmov.8 q5[1], r0
; CHECK-NEXT:    vmov.u8 r0, q0[9]
; CHECK-NEXT:    vmov.8 q5[2], r0
; CHECK-NEXT:    vmov.u8 r0, q0[13]
; CHECK-NEXT:    vmov.8 q5[3], r0
; CHECK-NEXT:    vmov.u8 r0, q1[1]
; CHECK-NEXT:    vmov.8 q5[4], r0
; CHECK-NEXT:    vmov.u8 r0, q1[5]
; CHECK-NEXT:    vmov.8 q5[5], r0
; CHECK-NEXT:    vmov.u8 r0, q1[9]
; CHECK-NEXT:    vmov.8 q5[6], r0
; CHECK-NEXT:    vmov.u8 r0, q2[1]
; CHECK-NEXT:    vmov.8 q6[8], r0
; CHECK-NEXT:    vmov.u8 r0, q2[5]
; CHECK-NEXT:    vmov.8 q6[9], r0
; CHECK-NEXT:    vmov.u8 r0, q2[9]
; CHECK-NEXT:    vmov.8 q6[10], r0
; CHECK-NEXT:    vmov.u8 r0, q2[13]
; CHECK-NEXT:    vmov.8 q6[11], r0
; CHECK-NEXT:    vmov.u8 r0, q3[1]
; CHECK-NEXT:    vmov.8 q6[12], r0
; CHECK-NEXT:    vmov.u8 r0, q3[5]
; CHECK-NEXT:    vmov.8 q6[13], r0
; CHECK-NEXT:    vmov.u8 r0, q3[9]
; CHECK-NEXT:    vmov.8 q6[14], r0
; CHECK-NEXT:    vmov.u8 r0, q3[13]
; CHECK-NEXT:    vmov.8 q6[15], r0
; CHECK-NEXT:    vmov.u8 r0, q1[13]
; CHECK-NEXT:    vmov.8 q5[7], r0
; CHECK-NEXT:    vmov.u8 r0, q0[0]
; CHECK-NEXT:    vmov.f32 s22, s26
; CHECK-NEXT:    vmov.f32 s23, s27
; CHECK-NEXT:    vmov.8 q6[0], r0
; CHECK-NEXT:    vmov.u8 r0, q0[4]
; CHECK-NEXT:    vmov.8 q6[1], r0
; CHECK-NEXT:    vmov.u8 r0, q0[8]
; CHECK-NEXT:    vmov.8 q6[2], r0
; CHECK-NEXT:    vmov.u8 r0, q0[12]
; CHECK-NEXT:    vmov.8 q6[3], r0
; CHECK-NEXT:    vmov.u8 r0, q1[0]
; CHECK-NEXT:    vmov.8 q6[4], r0
; CHECK-NEXT:    vmov.u8 r0, q1[4]
; CHECK-NEXT:    vmov.8 q6[5], r0
; CHECK-NEXT:    vmov.u8 r0, q1[8]
; CHECK-NEXT:    vmov.8 q6[6], r0
; CHECK-NEXT:    vmov.u8 r0, q2[0]
; CHECK-NEXT:    vmov.8 q0[8], r0
; CHECK-NEXT:    vmov.u8 r0, q2[4]
; CHECK-NEXT:    vmov.8 q0[9], r0
; CHECK-NEXT:    vmov.u8 r0, q2[8]
; CHECK-NEXT:    vmov.8 q0[10], r0
; CHECK-NEXT:    vmov.u8 r0, q2[12]
; CHECK-NEXT:    vmov.8 q0[11], r0
; CHECK-NEXT:    vmov.u8 r0, q3[0]
; CHECK-NEXT:    vmov.8 q0[12], r0
; CHECK-NEXT:    vmov.u8 r0, q3[4]
; CHECK-NEXT:    vmov.8 q0[13], r0
; CHECK-NEXT:    vmov.u8 r0, q3[8]
; CHECK-NEXT:    vmov.8 q0[14], r0
; CHECK-NEXT:    vmov.u8 r0, q3[12]
; CHECK-NEXT:    vmov.8 q0[15], r0
; CHECK-NEXT:    vmov.u8 r0, q1[12]
; CHECK-NEXT:    vmov.8 q6[7], r0
; CHECK-NEXT:    vmov.f32 s26, s2
; CHECK-NEXT:    vmov.f32 s27, s3
; CHECK-NEXT:    vadd.i8 q0, q6, q5
; CHECK-NEXT:    vadd.i8 q0, q0, q4
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT:    bx lr
entry:
  %s1 = shufflevector <64 x i8> %src, <64 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
  %s2 = shufflevector <64 x i8> %src, <64 x i8> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
  %s3 = shufflevector <64 x i8> %src, <64 x i8> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
  %s4 = shufflevector <64 x i8> %src, <64 x i8> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
  %a1 = add <16 x i8> %s1, %s2
  %a2 = add <16 x i8> %s3, %s4
  %r = add <16 x i8> %a1, %a2
  ret <16 x i8> %r
}

; i64

; Mask <0,1>: identity, only the return is expected.
define arm_aapcs_vfpcc <2 x i64> @shuffle1_i64(<2 x i64> %src) {
; CHECK-LABEL: shuffle1_i64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    bx lr
entry:
  %out = shufflevector <2 x i64> %src, <2 x i64> undef, <2 x i32> <i32 0, i32 1>
  ret <2 x i64> %out
}

; Mask <1,0>: the two 64-bit lanes exchanged.
define arm_aapcs_vfpcc <2 x i64> @shuffle2_i64(<2 x i64> %src) {
; CHECK-LABEL: shuffle2_i64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.f32 s4, s2
; CHECK-NEXT:    vmov.f32 s6, s0
; CHECK-NEXT:    vmov.f32 s5, s3
; CHECK-NEXT:    vmov.f32 s7, s1
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %out = shufflevector <2 x i64> %src, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
  ret <2 x i64> %out
}

define arm_aapcs_vfpcc <2 x i64> @shuffle3_i64(<2 x i64> %src) {
; CHECK-LABEL: shuffle3_i64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    bx lr
entry:
  %out = shufflevector <2
x i64> %src, <2 x i64> undef, <2 x i32> <i32 undef, i32 1> 943 ret <2 x i64> %out 944} 945 946; f32 947 948define arm_aapcs_vfpcc <4 x float> @shuffle1_f32(<4 x float> %src) { 949; CHECK-LABEL: shuffle1_f32: 950; CHECK: @ %bb.0: @ %entry 951; CHECK-NEXT: vmov.f32 s4, s3 952; CHECK-NEXT: vmov.f32 s5, s2 953; CHECK-NEXT: vmov.f32 s6, s1 954; CHECK-NEXT: vmov.f32 s7, s0 955; CHECK-NEXT: vmov q0, q1 956; CHECK-NEXT: bx lr 957entry: 958 %out = shufflevector <4 x float> %src, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> 959 ret <4 x float> %out 960} 961 962define arm_aapcs_vfpcc <4 x float> @shuffle2_f32(<4 x float> %src) { 963; CHECK-LABEL: shuffle2_f32: 964; CHECK: @ %bb.0: @ %entry 965; CHECK-NEXT: bx lr 966entry: 967 %out = shufflevector <4 x float> %src, <4 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 968 ret <4 x float> %out 969} 970 971define arm_aapcs_vfpcc <4 x float> @shuffle3_f32(<4 x float> %src) { 972; CHECK-LABEL: shuffle3_f32: 973; CHECK: @ %bb.0: @ %entry 974; CHECK-NEXT: vmov.f32 s4, s3 975; CHECK-NEXT: vmov.f32 s5, s1 976; CHECK-NEXT: vmov.f32 s6, s2 977; CHECK-NEXT: vmov.f32 s7, s0 978; CHECK-NEXT: vmov q0, q1 979; CHECK-NEXT: bx lr 980entry: 981 %out = shufflevector <4 x float> %src, <4 x float> undef, <4 x i32> <i32 3, i32 1, i32 2, i32 0> 982 ret <4 x float> %out 983} 984 985define arm_aapcs_vfpcc <4 x float> @shuffle5_f32(<4 x float> %src) { 986; CHECK-LABEL: shuffle5_f32: 987; CHECK: @ %bb.0: @ %entry 988; CHECK-NEXT: vrev64.32 q1, q0 989; CHECK-NEXT: vmov q0, q1 990; CHECK-NEXT: bx lr 991entry: 992 %out = shufflevector <4 x float> %src, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2> 993 ret <4 x float> %out 994} 995 996define arm_aapcs_vfpcc <4 x float> @oneoff11_f32(<4 x float> %src1, <4 x float> %src2) { 997; CHECK-LABEL: oneoff11_f32: 998; CHECK: @ %bb.0: @ %entry 999; CHECK-NEXT: vmov.f32 s2, s1 1000; CHECK-NEXT: bx lr 1001entry: 1002 %out = shufflevector <4 x float> %src1, <4 x float> %src2, <4 x i32> <i32 
0, i32 1, i32 1, i32 3> 1003 ret <4 x float> %out 1004} 1005 1006define arm_aapcs_vfpcc <4 x float> @oneoff12_f32(<4 x float> %src1, <4 x float> %src2) { 1007; CHECK-LABEL: oneoff12_f32: 1008; CHECK: @ %bb.0: @ %entry 1009; CHECK-NEXT: vmov.f32 s0, s4 1010; CHECK-NEXT: bx lr 1011entry: 1012 %out = shufflevector <4 x float> %src1, <4 x float> %src2, <4 x i32> <i32 4, i32 1, i32 2, i32 3> 1013 ret <4 x float> %out 1014} 1015 1016define arm_aapcs_vfpcc <4 x float> @oneoff21_f32(<4 x float> %src1, <4 x float> %src2) { 1017; CHECK-LABEL: oneoff21_f32: 1018; CHECK: @ %bb.0: @ %entry 1019; CHECK-NEXT: vmov.f32 s7, s0 1020; CHECK-NEXT: vmov q0, q1 1021; CHECK-NEXT: bx lr 1022entry: 1023 %out = shufflevector <4 x float> %src1, <4 x float> %src2, <4 x i32> <i32 4, i32 5, i32 6, i32 0> 1024 ret <4 x float> %out 1025} 1026 1027define arm_aapcs_vfpcc <4 x float> @oneoff22_f32(<4 x float> %src1, <4 x float> %src2) { 1028; CHECK-LABEL: oneoff22_f32: 1029; CHECK: @ %bb.0: @ %entry 1030; CHECK-NEXT: vmov q0, q1 1031; CHECK-NEXT: vmov.f32 s2, s0 1032; CHECK-NEXT: bx lr 1033entry: 1034 %out = shufflevector <4 x float> %src1, <4 x float> %src2, <4 x i32> <i32 4, i32 5, i32 4, i32 7> 1035 ret <4 x float> %out 1036} 1037 1038define arm_aapcs_vfpcc <4 x float> @shuffle2step_f32(<8 x float> %src) { 1039; CHECKFP-LABEL: shuffle2step_f32: 1040; CHECKFP: @ %bb.0: @ %entry 1041; CHECKFP-NEXT: vmov.f32 s8, s1 1042; CHECKFP-NEXT: vmov.f32 s9, s3 1043; CHECKFP-NEXT: vmov.f32 s1, s2 1044; CHECKFP-NEXT: vmov.f32 s10, s5 1045; CHECKFP-NEXT: vmov.f32 s11, s7 1046; CHECKFP-NEXT: vmov.f32 s2, s4 1047; CHECKFP-NEXT: vmov.f32 s3, s6 1048; CHECKFP-NEXT: vadd.f32 q0, q0, q2 1049; CHECKFP-NEXT: bx lr 1050entry: 1051 %s1 = shufflevector <8 x float> %src, <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> 1052 %s2 = shufflevector <8 x float> %src, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7> 1053 %r = fadd <4 x float> %s1, %s2 1054 ret <4 x float> %r 1055} 1056 1057define arm_aapcs_vfpcc 
<4 x float> @shuffle3step_f32(<16 x float> %src) { 1058; CHECKFP-LABEL: shuffle3step_f32: 1059; CHECKFP: @ %bb.0: @ %entry 1060; CHECKFP-NEXT: .vsave {d8, d9} 1061; CHECKFP-NEXT: vpush {d8, d9} 1062; CHECKFP-NEXT: vmov.f32 s13, s4 1063; CHECKFP-NEXT: vmov.f32 s14, s7 1064; CHECKFP-NEXT: vmov.f32 s18, s6 1065; CHECKFP-NEXT: vmov.f32 s12, s1 1066; CHECKFP-NEXT: vmov.f32 s15, s10 1067; CHECKFP-NEXT: vmov.f32 s16, s0 1068; CHECKFP-NEXT: vmov.f32 s17, s3 1069; CHECKFP-NEXT: vmov.f32 s19, s9 1070; CHECKFP-NEXT: vadd.f32 q3, q4, q3 1071; CHECKFP-NEXT: vmov.f32 s4, s2 1072; CHECKFP-NEXT: vmov.f32 s6, s8 1073; CHECKFP-NEXT: vmov.f32 s7, s11 1074; CHECKFP-NEXT: vadd.f32 q0, q3, q1 1075; CHECKFP-NEXT: vpop {d8, d9} 1076; CHECKFP-NEXT: bx lr 1077entry: 1078 %s1 = shufflevector <16 x float> %src, <16 x float> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9> 1079 %s2 = shufflevector <16 x float> %src, <16 x float> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10> 1080 %s3 = shufflevector <16 x float> %src, <16 x float> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11> 1081 %a = fadd <4 x float> %s1, %s2 1082 %r = fadd <4 x float> %a, %s3 1083 ret <4 x float> %r 1084} 1085 1086define arm_aapcs_vfpcc <4 x float> @shuffle4step_f32(<16 x float> %src) { 1087; CHECKFP-LABEL: shuffle4step_f32: 1088; CHECKFP: @ %bb.0: @ %entry 1089; CHECKFP-NEXT: .vsave {d8, d9, d10, d11} 1090; CHECKFP-NEXT: vpush {d8, d9, d10, d11} 1091; CHECKFP-NEXT: vmov.f32 s16, s3 1092; CHECKFP-NEXT: vmov.f32 s20, s2 1093; CHECKFP-NEXT: vmov.f32 s17, s7 1094; CHECKFP-NEXT: vmov.f32 s18, s11 1095; CHECKFP-NEXT: vmov.f32 s19, s15 1096; CHECKFP-NEXT: vmov.f32 s21, s6 1097; CHECKFP-NEXT: vmov.f32 s22, s10 1098; CHECKFP-NEXT: vmov.f32 s23, s14 1099; CHECKFP-NEXT: vadd.f32 q4, q5, q4 1100; CHECKFP-NEXT: vmov.f32 s20, s1 1101; CHECKFP-NEXT: vmov.f32 s21, s5 1102; CHECKFP-NEXT: vmov.f32 s22, s9 1103; CHECKFP-NEXT: vmov.f32 s23, s13 1104; CHECKFP-NEXT: vmov.f32 s1, s4 1105; CHECKFP-NEXT: vmov.f32 s2, s8 1106; CHECKFP-NEXT: vmov.f32 
s3, s12 1107; CHECKFP-NEXT: vadd.f32 q0, q0, q5 1108; CHECKFP-NEXT: vadd.f32 q0, q0, q4 1109; CHECKFP-NEXT: vpop {d8, d9, d10, d11} 1110; CHECKFP-NEXT: bx lr 1111entry: 1112 %s1 = shufflevector <16 x float> %src, <16 x float> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12> 1113 %s2 = shufflevector <16 x float> %src, <16 x float> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13> 1114 %s3 = shufflevector <16 x float> %src, <16 x float> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14> 1115 %s4 = shufflevector <16 x float> %src, <16 x float> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15> 1116 %a1 = fadd <4 x float> %s1, %s2 1117 %a2 = fadd <4 x float> %s3, %s4 1118 %r = fadd <4 x float> %a1, %a2 1119 ret <4 x float> %r 1120} 1121 1122; f16 1123 1124define arm_aapcs_vfpcc <8 x half> @shuffle1_f16(<8 x half> %src) { 1125; CHECK-LABEL: shuffle1_f16: 1126; CHECK: @ %bb.0: @ %entry 1127; CHECK-NEXT: vrev64.16 q1, q0 1128; CHECK-NEXT: vmov.f32 s0, s6 1129; CHECK-NEXT: vmov.f32 s1, s7 1130; CHECK-NEXT: vmov.f32 s2, s4 1131; CHECK-NEXT: vmov.f32 s3, s5 1132; CHECK-NEXT: bx lr 1133entry: 1134 %out = shufflevector <8 x half> %src, <8 x half> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> 1135 ret <8 x half> %out 1136} 1137 1138define arm_aapcs_vfpcc <8 x half> @shuffle2_f16(<8 x half> %src) { 1139; CHECK-LABEL: shuffle2_f16: 1140; CHECK: @ %bb.0: @ %entry 1141; CHECK-NEXT: bx lr 1142entry: 1143 %out = shufflevector <8 x half> %src, <8 x half> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1144 ret <8 x half> %out 1145} 1146 1147define arm_aapcs_vfpcc <8 x half> @shuffle3_f16(<8 x half> %src) { 1148; CHECK-LABEL: shuffle3_f16: 1149; CHECK: @ %bb.0: @ %entry 1150; CHECK-NEXT: vmov q1, q0 1151; CHECK-NEXT: vmovx.f16 s2, s5 1152; CHECK-NEXT: vmovx.f16 s0, s4 1153; CHECK-NEXT: vins.f16 s5, s4 1154; CHECK-NEXT: vins.f16 s2, s0 1155; CHECK-NEXT: vmov.f32 s3, s5 1156; CHECK-NEXT: vmovx.f16 s1, s7 1157; CHECK-NEXT: vmov.f32 s0, s6 1158; 
CHECK-NEXT: vins.f16 s1, s7 1159; CHECK-NEXT: bx lr 1160entry: 1161 %out = shufflevector <8 x half> %src, <8 x half> undef, <8 x i32> <i32 4, i32 5, i32 7, i32 6, i32 3, i32 1, i32 2, i32 0> 1162 ret <8 x half> %out 1163} 1164 1165define arm_aapcs_vfpcc <8 x half> @shuffle5_f16(<8 x half> %src) { 1166; CHECK-LABEL: shuffle5_f16: 1167; CHECK: @ %bb.0: @ %entry 1168; CHECK-NEXT: vrev64.16 q1, q0 1169; CHECK-NEXT: vmov q0, q1 1170; CHECK-NEXT: bx lr 1171entry: 1172 %out = shufflevector <8 x half> %src, <8 x half> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4> 1173 ret <8 x half> %out 1174} 1175 1176define arm_aapcs_vfpcc <8 x half> @shuffle6_f16(<8 x half> %src) { 1177; CHECK-LABEL: shuffle6_f16: 1178; CHECK: @ %bb.0: @ %entry 1179; CHECK-NEXT: vrev32.16 q0, q0 1180; CHECK-NEXT: bx lr 1181entry: 1182 %out = shufflevector <8 x half> %src, <8 x half> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6> 1183 ret <8 x half> %out 1184} 1185 1186define arm_aapcs_vfpcc <8 x half> @oneoff11_f16(<8 x half> %src1, <8 x half> %src2) { 1187; CHECK-LABEL: oneoff11_f16: 1188; CHECK: @ %bb.0: @ %entry 1189; CHECK-NEXT: vmovx.f16 s4, s0 1190; CHECK-NEXT: vmov r0, s4 1191; CHECK-NEXT: vmov.16 q0[2], r0 1192; CHECK-NEXT: bx lr 1193entry: 1194 %out = shufflevector <8 x half> %src1, <8 x half> %src2, <8 x i32> <i32 0, i32 1, i32 1, i32 3, i32 4, i32 5, i32 6, i32 7> 1195 ret <8 x half> %out 1196} 1197 1198define arm_aapcs_vfpcc <8 x half> @oneoff12_f16(<8 x half> %src1, <8 x half> %src2) { 1199; CHECK-LABEL: oneoff12_f16: 1200; CHECK: @ %bb.0: @ %entry 1201; CHECK-NEXT: vmov r0, s4 1202; CHECK-NEXT: vmov.16 q0[0], r0 1203; CHECK-NEXT: bx lr 1204entry: 1205 %out = shufflevector <8 x half> %src1, <8 x half> %src2, <8 x i32> <i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1206 ret <8 x half> %out 1207} 1208 1209define arm_aapcs_vfpcc <8 x half> @oneoff21_f16(<8 x half> %src1, <8 x half> %src2) { 1210; CHECK-LABEL: oneoff21_f16: 1211; 
CHECK: @ %bb.0: @ %entry 1212; CHECK-NEXT: vins.f16 s5, s0 1213; CHECK-NEXT: vmov q0, q1 1214; CHECK-NEXT: bx lr 1215entry: 1216 %out = shufflevector <8 x half> %src1, <8 x half> %src2, <8 x i32> <i32 8, i32 9, i32 10, i32 0, i32 12, i32 13, i32 14, i32 15> 1217 ret <8 x half> %out 1218} 1219 1220define arm_aapcs_vfpcc <8 x half> @oneoff22_f16(<8 x half> %src1, <8 x half> %src2) { 1221; CHECK-LABEL: oneoff22_f16: 1222; CHECK: @ %bb.0: @ %entry 1223; CHECK-NEXT: vmov q0, q1 1224; CHECK-NEXT: vmov r0, s3 1225; CHECK-NEXT: vmov.16 q0[0], r0 1226; CHECK-NEXT: bx lr 1227entry: 1228 %out = shufflevector <8 x half> %src1, <8 x half> %src2, <8 x i32> <i32 14, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 1229 ret <8 x half> %out 1230} 1231 1232define arm_aapcs_vfpcc <8 x half> @shuffle2step_f16(<16 x half> %src) { 1233; CHECKFP-LABEL: shuffle2step_f16: 1234; CHECKFP: @ %bb.0: @ %entry 1235; CHECKFP-NEXT: vmovx.f16 s8, s0 1236; CHECKFP-NEXT: vmovx.f16 s10, s1 1237; CHECKFP-NEXT: vins.f16 s8, s10 1238; CHECKFP-NEXT: vmovx.f16 s9, s2 1239; CHECKFP-NEXT: vmovx.f16 s10, s3 1240; CHECKFP-NEXT: vmovx.f16 s12, s5 1241; CHECKFP-NEXT: vins.f16 s9, s10 1242; CHECKFP-NEXT: vmovx.f16 s10, s4 1243; CHECKFP-NEXT: vins.f16 s10, s12 1244; CHECKFP-NEXT: vmovx.f16 s11, s6 1245; CHECKFP-NEXT: vmovx.f16 s12, s7 1246; CHECKFP-NEXT: vins.f16 s2, s3 1247; CHECKFP-NEXT: vins.f16 s6, s7 1248; CHECKFP-NEXT: vins.f16 s4, s5 1249; CHECKFP-NEXT: vins.f16 s0, s1 1250; CHECKFP-NEXT: vmov.f32 s1, s2 1251; CHECKFP-NEXT: vins.f16 s11, s12 1252; CHECKFP-NEXT: vmov.f32 s2, s4 1253; CHECKFP-NEXT: vmov.f32 s3, s6 1254; CHECKFP-NEXT: vadd.f16 q0, q0, q2 1255; CHECKFP-NEXT: bx lr 1256entry: 1257 %s1 = shufflevector <16 x half> %src, <16 x half> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> 1258 %s2 = shufflevector <16 x half> %src, <16 x half> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> 1259 %r = fadd <8 x half> %s1, %s2 1260 ret <8 x half> 
%r 1261} 1262 1263define arm_aapcs_vfpcc <8 x half> @shuffle3step_f16(<32 x half> %src) { 1264; CHECKFP-LABEL: shuffle3step_f16: 1265; CHECKFP: @ %bb.0: @ %entry 1266; CHECKFP-NEXT: .vsave {d8, d9} 1267; CHECKFP-NEXT: vpush {d8, d9} 1268; CHECKFP-NEXT: vmov.f32 s13, s4 1269; CHECKFP-NEXT: vmovx.f16 s4, s4 1270; CHECKFP-NEXT: vmovx.f16 s17, s3 1271; CHECKFP-NEXT: vins.f16 s3, s4 1272; CHECKFP-NEXT: vmovx.f16 s4, s7 1273; CHECKFP-NEXT: vmovx.f16 s18, s6 1274; CHECKFP-NEXT: vmovx.f16 s16, s0 1275; CHECKFP-NEXT: vins.f16 s6, s4 1276; CHECKFP-NEXT: vmovx.f16 s14, s2 1277; CHECKFP-NEXT: vmov.f32 s12, s1 1278; CHECKFP-NEXT: vmovx.f16 s4, s10 1279; CHECKFP-NEXT: vmovx.f16 s19, s9 1280; CHECKFP-NEXT: vins.f16 s12, s14 1281; CHECKFP-NEXT: vmovx.f16 s14, s5 1282; CHECKFP-NEXT: vins.f16 s16, s2 1283; CHECKFP-NEXT: vmovx.f16 s2, s11 1284; CHECKFP-NEXT: vmovx.f16 s15, s8 1285; CHECKFP-NEXT: vins.f16 s18, s8 1286; CHECKFP-NEXT: vmovx.f16 s8, s1 1287; CHECKFP-NEXT: vins.f16 s9, s4 1288; CHECKFP-NEXT: vins.f16 s13, s14 1289; CHECKFP-NEXT: vmov.f32 s14, s7 1290; CHECKFP-NEXT: vins.f16 s10, s2 1291; CHECKFP-NEXT: vmov.f32 s1, s3 1292; CHECKFP-NEXT: vins.f16 s19, s11 1293; CHECKFP-NEXT: vins.f16 s17, s5 1294; CHECKFP-NEXT: vins.f16 s0, s8 1295; CHECKFP-NEXT: vmov.f32 s2, s6 1296; CHECKFP-NEXT: vmov.f32 s3, s9 1297; CHECKFP-NEXT: vins.f16 s14, s15 1298; CHECKFP-NEXT: vmov.f32 s15, s10 1299; CHECKFP-NEXT: vadd.f16 q0, q0, q4 1300; CHECKFP-NEXT: vadd.f16 q0, q0, q3 1301; CHECKFP-NEXT: vpop {d8, d9} 1302; CHECKFP-NEXT: bx lr 1303entry: 1304 %s1 = shufflevector <32 x half> %src, <32 x half> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21> 1305 %s2 = shufflevector <32 x half> %src, <32 x half> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22> 1306 %s3 = shufflevector <32 x half> %src, <32 x half> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23> 1307 %a = fadd <8 x half> %s1, %s2 1308 %r = fadd <8 x half> 
%a, %s3 1309 ret <8 x half> %r 1310} 1311 1312define arm_aapcs_vfpcc <8 x half> @shuffle4step_f16(<32 x half> %src) { 1313; CHECKFP-LABEL: shuffle4step_f16: 1314; CHECKFP: @ %bb.0: @ %entry 1315; CHECKFP-NEXT: .vsave {d8, d9, d10, d11, d12, d13} 1316; CHECKFP-NEXT: vpush {d8, d9, d10, d11, d12, d13} 1317; CHECKFP-NEXT: vmovx.f16 s18, s9 1318; CHECKFP-NEXT: vmovx.f16 s16, s11 1319; CHECKFP-NEXT: vins.f16 s18, s16 1320; CHECKFP-NEXT: vmovx.f16 s19, s13 1321; CHECKFP-NEXT: vmovx.f16 s16, s15 1322; CHECKFP-NEXT: vmovx.f16 s22, s8 1323; CHECKFP-NEXT: vins.f16 s19, s16 1324; CHECKFP-NEXT: vmovx.f16 s16, s1 1325; CHECKFP-NEXT: vmovx.f16 s20, s3 1326; CHECKFP-NEXT: vins.f16 s1, s3 1327; CHECKFP-NEXT: vmovx.f16 s3, s10 1328; CHECKFP-NEXT: vins.f16 s16, s20 1329; CHECKFP-NEXT: vmovx.f16 s17, s5 1330; CHECKFP-NEXT: vmovx.f16 s20, s7 1331; CHECKFP-NEXT: vins.f16 s22, s3 1332; CHECKFP-NEXT: vmovx.f16 s23, s12 1333; CHECKFP-NEXT: vmovx.f16 s3, s14 1334; CHECKFP-NEXT: vins.f16 s17, s20 1335; CHECKFP-NEXT: vins.f16 s23, s3 1336; CHECKFP-NEXT: vmovx.f16 s20, s0 1337; CHECKFP-NEXT: vmovx.f16 s3, s2 1338; CHECKFP-NEXT: vins.f16 s9, s11 1339; CHECKFP-NEXT: vins.f16 s13, s15 1340; CHECKFP-NEXT: vins.f16 s5, s7 1341; CHECKFP-NEXT: vins.f16 s20, s3 1342; CHECKFP-NEXT: vmovx.f16 s21, s4 1343; CHECKFP-NEXT: vmovx.f16 s3, s6 1344; CHECKFP-NEXT: vins.f16 s8, s10 1345; CHECKFP-NEXT: vins.f16 s12, s14 1346; CHECKFP-NEXT: vins.f16 s4, s6 1347; CHECKFP-NEXT: vins.f16 s21, s3 1348; CHECKFP-NEXT: vins.f16 s0, s2 1349; CHECKFP-NEXT: vmov.f32 s24, s1 1350; CHECKFP-NEXT: vmov.f32 s26, s9 1351; CHECKFP-NEXT: vmov.f32 s27, s13 1352; CHECKFP-NEXT: vmov.f32 s25, s5 1353; CHECKFP-NEXT: vmov.f32 s2, s8 1354; CHECKFP-NEXT: vadd.f16 q4, q6, q4 1355; CHECKFP-NEXT: vmov.f32 s3, s12 1356; CHECKFP-NEXT: vmov.f32 s1, s4 1357; CHECKFP-NEXT: vadd.f16 q0, q0, q5 1358; CHECKFP-NEXT: vadd.f16 q0, q0, q4 1359; CHECKFP-NEXT: vpop {d8, d9, d10, d11, d12, d13} 1360; CHECKFP-NEXT: bx lr 1361entry: 1362 %s1 = shufflevector 
<32 x half> %src, <32 x half> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28> 1363 %s2 = shufflevector <32 x half> %src, <32 x half> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29> 1364 %s3 = shufflevector <32 x half> %src, <32 x half> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30> 1365 %s4 = shufflevector <32 x half> %src, <32 x half> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31> 1366 %a1 = fadd <8 x half> %s1, %s2 1367 %a2 = fadd <8 x half> %s3, %s4 1368 %r = fadd <8 x half> %a1, %a2 1369 ret <8 x half> %r 1370} 1371 1372; f64 1373 1374define arm_aapcs_vfpcc <2 x double> @shuffle1_f64(<2 x double> %src) { 1375; CHECK-LABEL: shuffle1_f64: 1376; CHECK: @ %bb.0: @ %entry 1377; CHECK-NEXT: bx lr 1378entry: 1379 %out = shufflevector <2 x double> %src, <2 x double> undef, <2 x i32> <i32 0, i32 1> 1380 ret <2 x double> %out 1381} 1382 1383define arm_aapcs_vfpcc <2 x double> @shuffle2_f64(<2 x double> %src) { 1384; CHECK-LABEL: shuffle2_f64: 1385; CHECK: @ %bb.0: @ %entry 1386; CHECK-NEXT: vmov.f32 s4, s2 1387; CHECK-NEXT: vmov.f32 s6, s0 1388; CHECK-NEXT: vmov.f32 s5, s3 1389; CHECK-NEXT: vmov.f32 s7, s1 1390; CHECK-NEXT: vmov q0, q1 1391; CHECK-NEXT: bx lr 1392entry: 1393 %out = shufflevector <2 x double> %src, <2 x double> undef, <2 x i32> <i32 1, i32 0> 1394 ret <2 x double> %out 1395} 1396 1397define arm_aapcs_vfpcc <2 x double> @shuffle3_f64(<2 x double> %src) { 1398; CHECK-LABEL: shuffle3_f64: 1399; CHECK: @ %bb.0: @ %entry 1400; CHECK-NEXT: bx lr 1401entry: 1402 %out = shufflevector <2 x double> %src, <2 x double> undef, <2 x i32> <i32 undef, i32 1> 1403 ret <2 x double> %out 1404} 1405 1406define arm_aapcs_vfpcc <4 x double> @shuffle4_f64(<2 x double> %src1, <2 x double> %src2) { 1407; CHECK-LABEL: shuffle4_f64: 1408; CHECK: @ %bb.0: @ %entry 1409; CHECK-NEXT: vmov.f32 s8, s6 1410; CHECK-NEXT: vmov.f32 s6, s0 1411; CHECK-NEXT: vmov.f32 
s9, s7 1412; CHECK-NEXT: vmov.f32 s7, s1 1413; CHECK-NEXT: vmov.f32 s10, s2 1414; CHECK-NEXT: vmov.f32 s11, s3 1415; CHECK-NEXT: vmov q0, q2 1416; CHECK-NEXT: bx lr 1417entry: 1418 %out = shufflevector <2 x double> %src1, <2 x double> %src2, <4 x i32> <i32 3, i32 1, i32 2, i32 0> 1419 ret <4 x double> %out 1420} 1421define arm_aapcs_vfpcc <4 x double> @shuffle5_f64(<2 x double> %src1, <2 x double> %src2) { 1422; CHECK-LABEL: shuffle5_f64: 1423; CHECK: @ %bb.0: @ %entry 1424; CHECK-NEXT: vmov.f32 s8, s6 1425; CHECK-NEXT: vmov.f32 s10, s4 1426; CHECK-NEXT: vmov.f32 s4, s2 1427; CHECK-NEXT: vmov.f32 s6, s0 1428; CHECK-NEXT: vmov.f32 s9, s7 1429; CHECK-NEXT: vmov.f32 s11, s5 1430; CHECK-NEXT: vmov.f32 s5, s3 1431; CHECK-NEXT: vmov.f32 s7, s1 1432; CHECK-NEXT: vmov q0, q2 1433; CHECK-NEXT: bx lr 1434entry: 1435 %out = shufflevector <2 x double> %src1, <2 x double> %src2, <4 x i32> <i32 3, i32 2, i32 1, i32 0> 1436 ret <4 x double> %out 1437} 1438define arm_aapcs_vfpcc <2 x double> @shuffle6_f64(<2 x double> %src1, <2 x double> %src2) { 1439; CHECK-LABEL: shuffle6_f64: 1440; CHECK: @ %bb.0: @ %entry 1441; CHECK-NEXT: vmov.f32 s2, s6 1442; CHECK-NEXT: vmov.f32 s3, s7 1443; CHECK-NEXT: bx lr 1444entry: 1445 %out = shufflevector <2 x double> %src1, <2 x double> %src2, <2 x i32> <i32 0, i32 3> 1446 ret <2 x double> %out 1447} 1448define arm_aapcs_vfpcc <2 x double> @shuffle7_f64(<2 x double> %src1, <2 x double> %src2) { 1449; CHECK-LABEL: shuffle7_f64: 1450; CHECK: @ %bb.0: @ %entry 1451; CHECK-NEXT: vmov.f32 s0, s6 1452; CHECK-NEXT: vmov.f32 s1, s7 1453; CHECK-NEXT: bx lr 1454entry: 1455 %out = shufflevector <2 x double> %src1, <2 x double> %src2, <2 x i32> <i32 3, i32 1> 1456 ret <2 x double> %out 1457} 1458define arm_aapcs_vfpcc <2 x double> @shuffle8_f64(<2 x double> %src1, <2 x double> %src2) { 1459; CHECK-LABEL: shuffle8_f64: 1460; CHECK: @ %bb.0: @ %entry 1461; CHECK-NEXT: vmov.f32 s6, s2 1462; CHECK-NEXT: vmov.f32 s7, s3 1463; CHECK-NEXT: vmov q0, q1 1464; 
CHECK-NEXT: bx lr 1465entry: 1466 %out = shufflevector <2 x double> %src1, <2 x double> %src2, <2 x i32> <i32 2, i32 1> 1467 ret <2 x double> %out 1468} 1469define arm_aapcs_vfpcc <8 x double> @shuffle9_f64(<4 x double> %src1, <4 x double> %src2) { 1470; CHECK-LV-LABEL: shuffle9_f64: 1471; CHECK-LV: @ %bb.0: @ %entry 1472; CHECK-LV-NEXT: .vsave {d8, d9, d10, d11} 1473; CHECK-LV-NEXT: vpush {d8, d9, d10, d11} 1474; CHECK-LV-NEXT: vmov q5, q2 1475; CHECK-LV-NEXT: vmov.f32 s16, s0 1476; CHECK-LV-NEXT: vmov.f32 s18, s20 1477; CHECK-LV-NEXT: vmov.f32 s20, s2 1478; CHECK-LV-NEXT: vmov.f32 s10, s12 1479; CHECK-LV-NEXT: vmov.f32 s19, s21 1480; CHECK-LV-NEXT: vmov.f32 s8, s4 1481; CHECK-LV-NEXT: vmov.f32 s17, s1 1482; CHECK-LV-NEXT: vmov.f32 s21, s3 1483; CHECK-LV-NEXT: vmov q0, q4 1484; CHECK-LV-NEXT: vmov.f32 s12, s6 1485; CHECK-LV-NEXT: vmov.f32 s11, s13 1486; CHECK-LV-NEXT: vmov.f32 s9, s5 1487; CHECK-LV-NEXT: vmov.f32 s13, s7 1488; CHECK-LV-NEXT: vmov q1, q5 1489; CHECK-LV-NEXT: vpop {d8, d9, d10, d11} 1490; CHECK-LV-NEXT: bx lr 1491; 1492; CHECK-LIS-LABEL: shuffle9_f64: 1493; CHECK-LIS: @ %bb.0: @ %entry 1494; CHECK-LIS-NEXT: .vsave {d8, d9, d10, d11} 1495; CHECK-LIS-NEXT: vpush {d8, d9, d10, d11} 1496; CHECK-LIS-NEXT: vmov q5, q2 1497; CHECK-LIS-NEXT: vmov q4, q0 1498; CHECK-LIS-NEXT: vmov.f32 s2, s20 1499; CHECK-LIS-NEXT: vmov.f32 s20, s18 1500; CHECK-LIS-NEXT: vmov.f32 s10, s12 1501; CHECK-LIS-NEXT: vmov.f32 s3, s21 1502; CHECK-LIS-NEXT: vmov.f32 s8, s4 1503; CHECK-LIS-NEXT: vmov.f32 s21, s19 1504; CHECK-LIS-NEXT: vmov.f32 s12, s6 1505; CHECK-LIS-NEXT: vmov.f32 s11, s13 1506; CHECK-LIS-NEXT: vmov.f32 s9, s5 1507; CHECK-LIS-NEXT: vmov.f32 s13, s7 1508; CHECK-LIS-NEXT: vmov q1, q5 1509; CHECK-LIS-NEXT: vpop {d8, d9, d10, d11} 1510; CHECK-LIS-NEXT: bx lr 1511entry: 1512 %out = shufflevector <4 x double> %src1, <4 x double> %src2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7> 1513 ret <8 x double> %out 1514} 1515 1516 1517 1518 1519define 
arm_aapcs_vfpcc <4 x i64> @shuffle4_i64(<2 x i64> %src1, <2 x i64> %src2) { 1520; CHECK-LABEL: shuffle4_i64: 1521; CHECK: @ %bb.0: @ %entry 1522; CHECK-NEXT: vmov.f32 s8, s6 1523; CHECK-NEXT: vmov.f32 s6, s0 1524; CHECK-NEXT: vmov.f32 s9, s7 1525; CHECK-NEXT: vmov.f32 s7, s1 1526; CHECK-NEXT: vmov.f32 s10, s2 1527; CHECK-NEXT: vmov.f32 s11, s3 1528; CHECK-NEXT: vmov q0, q2 1529; CHECK-NEXT: bx lr 1530entry: 1531 %out = shufflevector <2 x i64> %src1, <2 x i64> %src2, <4 x i32> <i32 3, i32 1, i32 2, i32 0> 1532 ret <4 x i64> %out 1533} 1534define arm_aapcs_vfpcc <4 x i64> @shuffle5_i64(<2 x i64> %src1, <2 x i64> %src2) { 1535; CHECK-LABEL: shuffle5_i64: 1536; CHECK: @ %bb.0: @ %entry 1537; CHECK-NEXT: vmov.f32 s8, s6 1538; CHECK-NEXT: vmov.f32 s10, s4 1539; CHECK-NEXT: vmov.f32 s4, s2 1540; CHECK-NEXT: vmov.f32 s6, s0 1541; CHECK-NEXT: vmov.f32 s9, s7 1542; CHECK-NEXT: vmov.f32 s11, s5 1543; CHECK-NEXT: vmov.f32 s5, s3 1544; CHECK-NEXT: vmov.f32 s7, s1 1545; CHECK-NEXT: vmov q0, q2 1546; CHECK-NEXT: bx lr 1547entry: 1548 %out = shufflevector <2 x i64> %src1, <2 x i64> %src2, <4 x i32> <i32 3, i32 2, i32 1, i32 0> 1549 ret <4 x i64> %out 1550} 1551define arm_aapcs_vfpcc <2 x i64> @shuffle6_i64(<2 x i64> %src1, <2 x i64> %src2) { 1552; CHECK-LABEL: shuffle6_i64: 1553; CHECK: @ %bb.0: @ %entry 1554; CHECK-NEXT: vmov.f32 s2, s6 1555; CHECK-NEXT: vmov.f32 s3, s7 1556; CHECK-NEXT: bx lr 1557entry: 1558 %out = shufflevector <2 x i64> %src1, <2 x i64> %src2, <2 x i32> <i32 0, i32 3> 1559 ret <2 x i64> %out 1560} 1561define arm_aapcs_vfpcc <2 x i64> @shuffle7_i64(<2 x i64> %src1, <2 x i64> %src2) { 1562; CHECK-LABEL: shuffle7_i64: 1563; CHECK: @ %bb.0: @ %entry 1564; CHECK-NEXT: vmov.f32 s0, s6 1565; CHECK-NEXT: vmov.f32 s1, s7 1566; CHECK-NEXT: bx lr 1567entry: 1568 %out = shufflevector <2 x i64> %src1, <2 x i64> %src2, <2 x i32> <i32 3, i32 1> 1569 ret <2 x i64> %out 1570} 1571define arm_aapcs_vfpcc <2 x i64> @shuffle8_i64(<2 x i64> %src1, <2 x i64> %src2) { 1572; 
CHECK-LABEL: shuffle8_i64: 1573; CHECK: @ %bb.0: @ %entry 1574; CHECK-NEXT: vmov.f32 s6, s2 1575; CHECK-NEXT: vmov.f32 s7, s3 1576; CHECK-NEXT: vmov q0, q1 1577; CHECK-NEXT: bx lr 1578entry: 1579 %out = shufflevector <2 x i64> %src1, <2 x i64> %src2, <2 x i32> <i32 2, i32 1> 1580 ret <2 x i64> %out 1581} 1582define arm_aapcs_vfpcc <8 x i64> @shuffle9_i64(<4 x i64> %src1, <4 x i64> %src2) { 1583; CHECK-LV-LABEL: shuffle9_i64: 1584; CHECK-LV: @ %bb.0: @ %entry 1585; CHECK-LV-NEXT: .vsave {d8, d9, d10, d11} 1586; CHECK-LV-NEXT: vpush {d8, d9, d10, d11} 1587; CHECK-LV-NEXT: vmov q5, q2 1588; CHECK-LV-NEXT: vmov.f32 s16, s0 1589; CHECK-LV-NEXT: vmov.f32 s18, s20 1590; CHECK-LV-NEXT: vmov.f32 s20, s2 1591; CHECK-LV-NEXT: vmov.f32 s10, s12 1592; CHECK-LV-NEXT: vmov.f32 s19, s21 1593; CHECK-LV-NEXT: vmov.f32 s8, s4 1594; CHECK-LV-NEXT: vmov.f32 s17, s1 1595; CHECK-LV-NEXT: vmov.f32 s21, s3 1596; CHECK-LV-NEXT: vmov q0, q4 1597; CHECK-LV-NEXT: vmov.f32 s12, s6 1598; CHECK-LV-NEXT: vmov.f32 s11, s13 1599; CHECK-LV-NEXT: vmov.f32 s9, s5 1600; CHECK-LV-NEXT: vmov.f32 s13, s7 1601; CHECK-LV-NEXT: vmov q1, q5 1602; CHECK-LV-NEXT: vpop {d8, d9, d10, d11} 1603; CHECK-LV-NEXT: bx lr 1604; 1605; CHECK-LIS-LABEL: shuffle9_i64: 1606; CHECK-LIS: @ %bb.0: @ %entry 1607; CHECK-LIS-NEXT: .vsave {d8, d9, d10, d11} 1608; CHECK-LIS-NEXT: vpush {d8, d9, d10, d11} 1609; CHECK-LIS-NEXT: vmov q5, q2 1610; CHECK-LIS-NEXT: vmov q4, q0 1611; CHECK-LIS-NEXT: vmov.f32 s2, s20 1612; CHECK-LIS-NEXT: vmov.f32 s20, s18 1613; CHECK-LIS-NEXT: vmov.f32 s10, s12 1614; CHECK-LIS-NEXT: vmov.f32 s3, s21 1615; CHECK-LIS-NEXT: vmov.f32 s8, s4 1616; CHECK-LIS-NEXT: vmov.f32 s21, s19 1617; CHECK-LIS-NEXT: vmov.f32 s12, s6 1618; CHECK-LIS-NEXT: vmov.f32 s11, s13 1619; CHECK-LIS-NEXT: vmov.f32 s9, s5 1620; CHECK-LIS-NEXT: vmov.f32 s13, s7 1621; CHECK-LIS-NEXT: vmov q1, q5 1622; CHECK-LIS-NEXT: vpop {d8, d9, d10, d11} 1623; CHECK-LIS-NEXT: bx lr 1624entry: 1625 %out = shufflevector <4 x i64> %src1, <4 x i64> %src2, <8 
x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7> 1626 ret <8 x i64> %out 1627} 1628 1629 1630define arm_aapcs_vfpcc <4 x i32> @insert_i32(i32 %a) { 1631; CHECK-LABEL: insert_i32: 1632; CHECK: @ %bb.0: @ %entry 1633; CHECK-NEXT: vmov.32 q0[0], r0 1634; CHECK-NEXT: bx lr 1635entry: 1636 %res = insertelement <4 x i32> undef, i32 %a, i32 0 1637 ret <4 x i32> %res 1638} 1639 1640define arm_aapcs_vfpcc <8 x i16> @insert_i16(i16 %a) { 1641; CHECK-LABEL: insert_i16: 1642; CHECK: @ %bb.0: @ %entry 1643; CHECK-NEXT: vmov.16 q0[0], r0 1644; CHECK-NEXT: bx lr 1645entry: 1646 %res = insertelement <8 x i16> undef, i16 %a, i32 0 1647 ret <8 x i16> %res 1648} 1649 1650define arm_aapcs_vfpcc <16 x i8> @insert_i8(i8 %a) { 1651; CHECK-LABEL: insert_i8: 1652; CHECK: @ %bb.0: @ %entry 1653; CHECK-NEXT: vmov.8 q0[0], r0 1654; CHECK-NEXT: bx lr 1655entry: 1656 %res = insertelement <16 x i8> undef, i8 %a, i32 0 1657 ret <16 x i8> %res 1658} 1659 1660define arm_aapcs_vfpcc <2 x i64> @insert_i64(i64 %a) { 1661; CHECK-LABEL: insert_i64: 1662; CHECK: @ %bb.0: @ %entry 1663; CHECK-NEXT: vmov.32 q0[0], r0 1664; CHECK-NEXT: vmov.32 q0[1], r1 1665; CHECK-NEXT: bx lr 1666entry: 1667 %res = insertelement <2 x i64> undef, i64 %a, i32 0 1668 ret <2 x i64> %res 1669} 1670 1671define arm_aapcs_vfpcc <4 x float> @insert_f32(float %a) { 1672; CHECK-LABEL: insert_f32: 1673; CHECK: @ %bb.0: @ %entry 1674; CHECK-NEXT: bx lr 1675entry: 1676 %res = insertelement <4 x float> undef, float %a, i32 0 1677 ret <4 x float> %res 1678} 1679 1680define arm_aapcs_vfpcc <8 x half> @insert_f16(half %a) { 1681; CHECK-LABEL: insert_f16: 1682; CHECK: @ %bb.0: @ %entry 1683; CHECK-NEXT: bx lr 1684entry: 1685 %res = insertelement <8 x half> undef, half %a, i32 0 1686 ret <8 x half> %res 1687} 1688 1689define arm_aapcs_vfpcc <2 x double> @insert_f64(double %a) { 1690; CHECK-LABEL: insert_f64: 1691; CHECK: @ %bb.0: @ %entry 1692; CHECK-NEXT: bx lr 1693entry: 1694 %res = insertelement <2 x double> undef, double 
%a, i32 0 1695 ret <2 x double> %res 1696} 1697 1698define arm_aapcs_vfpcc i64 @scalar_to_vector_i32(<8 x i16> %v) { 1699; CHECK-LABEL: scalar_to_vector_i32: 1700; CHECK: @ %bb.0: @ %entry 1701; CHECK-NEXT: .pad #8 1702; CHECK-NEXT: sub sp, #8 1703; CHECK-NEXT: adr r2, .LCPI88_0 1704; CHECK-NEXT: vmov.u16 r0, q0[0] 1705; CHECK-NEXT: vldrw.u32 q0, [r2] 1706; CHECK-NEXT: mov r1, sp 1707; CHECK-NEXT: vmov.32 q0[0], r0 1708; CHECK-NEXT: vstrh.32 q0, [r1] 1709; CHECK-NEXT: ldrd r0, r1, [sp], #8 1710; CHECK-NEXT: bx lr 1711; CHECK-NEXT: .p2align 4 1712; CHECK-NEXT: @ %bb.1: 1713; CHECK-NEXT: .LCPI88_0: 1714; CHECK-NEXT: .zero 4 1715; CHECK-NEXT: .long 7 @ 0x7 1716; CHECK-NEXT: .long 1 @ 0x1 1717; CHECK-NEXT: .long 9 @ 0x9 1718entry: 1719 %f = shufflevector <8 x i16> %v, <8 x i16> <i16 undef, i16 7, i16 1, i16 9, i16 undef, i16 undef, i16 undef, i16 undef>, <4 x i32> <i32 0, i32 9, i32 10, i32 11> 1720 %0 = bitcast <4 x i16> %f to i64 1721 ret i64 %0 1722} 1723 1724 1725define arm_aapcs_vfpcc i32 @extract_i32_0(<4 x i32> %a) { 1726; CHECK-LABEL: extract_i32_0: 1727; CHECK: @ %bb.0: @ %entry 1728; CHECK-NEXT: vmov r0, s0 1729; CHECK-NEXT: bx lr 1730entry: 1731 %res = extractelement <4 x i32> %a, i32 0 1732 ret i32 %res 1733} 1734 1735define arm_aapcs_vfpcc i32 @extract_i32_3(<4 x i32> %a) { 1736; CHECK-LABEL: extract_i32_3: 1737; CHECK: @ %bb.0: @ %entry 1738; CHECK-NEXT: vmov r0, s3 1739; CHECK-NEXT: bx lr 1740entry: 1741 %res = extractelement <4 x i32> %a, i32 3 1742 ret i32 %res 1743} 1744 1745define arm_aapcs_vfpcc i16 @extract_i16_0(<8 x i16> %a) { 1746; CHECK-LABEL: extract_i16_0: 1747; CHECK: @ %bb.0: @ %entry 1748; CHECK-NEXT: vmov.u16 r0, q0[0] 1749; CHECK-NEXT: bx lr 1750entry: 1751 %res = extractelement <8 x i16> %a, i32 0 1752 ret i16 %res 1753} 1754 1755define arm_aapcs_vfpcc i16 @extract_i16_3(<8 x i16> %a) { 1756; CHECK-LABEL: extract_i16_3: 1757; CHECK: @ %bb.0: @ %entry 1758; CHECK-NEXT: vmov.u16 r0, q0[3] 1759; CHECK-NEXT: bx lr 1760entry: 1761 %res = 
extractelement <8 x i16> %a, i32 3 1762 ret i16 %res 1763} 1764 1765define arm_aapcs_vfpcc i8 @extract_i8_0(<16 x i8> %a) { 1766; CHECK-LABEL: extract_i8_0: 1767; CHECK: @ %bb.0: @ %entry 1768; CHECK-NEXT: vmov.u8 r0, q0[0] 1769; CHECK-NEXT: bx lr 1770entry: 1771 %res = extractelement <16 x i8> %a, i32 0 1772 ret i8 %res 1773} 1774 1775define arm_aapcs_vfpcc i8 @extract_i8_3(<16 x i8> %a) { 1776; CHECK-LABEL: extract_i8_3: 1777; CHECK: @ %bb.0: @ %entry 1778; CHECK-NEXT: vmov.u8 r0, q0[3] 1779; CHECK-NEXT: bx lr 1780entry: 1781 %res = extractelement <16 x i8> %a, i32 3 1782 ret i8 %res 1783} 1784 1785define arm_aapcs_vfpcc i64 @extract_i64_0(<2 x i64> %a) { 1786; CHECK-LABEL: extract_i64_0: 1787; CHECK: @ %bb.0: @ %entry 1788; CHECK-NEXT: vmov r0, r1, d0 1789; CHECK-NEXT: bx lr 1790entry: 1791 %res = extractelement <2 x i64> %a, i32 0 1792 ret i64 %res 1793} 1794 1795define arm_aapcs_vfpcc i64 @extract_i64_1(<2 x i64> %a) { 1796; CHECK-LABEL: extract_i64_1: 1797; CHECK: @ %bb.0: @ %entry 1798; CHECK-NEXT: vmov r0, r1, d1 1799; CHECK-NEXT: bx lr 1800entry: 1801 %res = extractelement <2 x i64> %a, i32 1 1802 ret i64 %res 1803} 1804 1805define arm_aapcs_vfpcc float @extract_f32_0(<4 x float> %a) { 1806; CHECK-LABEL: extract_f32_0: 1807; CHECK: @ %bb.0: @ %entry 1808; CHECK-NEXT: bx lr 1809entry: 1810 %res = extractelement <4 x float> %a, i32 0 1811 ret float %res 1812} 1813 1814define arm_aapcs_vfpcc float @extract_f32_3(<4 x float> %a) { 1815; CHECK-LABEL: extract_f32_3: 1816; CHECK: @ %bb.0: @ %entry 1817; CHECK-NEXT: vmov.f32 s0, s3 1818; CHECK-NEXT: bx lr 1819entry: 1820 %res = extractelement <4 x float> %a, i32 3 1821 ret float %res 1822} 1823 1824define arm_aapcs_vfpcc half @extract_f16_0(<8 x half> %a) { 1825; CHECK-LABEL: extract_f16_0: 1826; CHECK: @ %bb.0: @ %entry 1827; CHECK-NEXT: bx lr 1828entry: 1829 %res = extractelement <8 x half> %a, i32 0 1830 ret half %res 1831} 1832 1833define arm_aapcs_vfpcc half @extract_f16_3(<8 x half> %a) { 1834; CHECK-LABEL: 
extract_f16_3: 1835; CHECK: @ %bb.0: @ %entry 1836; CHECK-NEXT: vmovx.f16 s0, s1 1837; CHECK-NEXT: bx lr 1838entry: 1839 %res = extractelement <8 x half> %a, i32 3 1840 ret half %res 1841} 1842 1843define arm_aapcs_vfpcc double @extract_f64_0(<2 x double> %a) { 1844; CHECK-LABEL: extract_f64_0: 1845; CHECK: @ %bb.0: @ %entry 1846; CHECK-NEXT: bx lr 1847entry: 1848 %res = extractelement <2 x double> %a, i32 0 1849 ret double %res 1850} 1851 1852define arm_aapcs_vfpcc double @extract_f64_1(<2 x double> %a) { 1853; CHECK-LABEL: extract_f64_1: 1854; CHECK: @ %bb.0: @ %entry 1855; CHECK-NEXT: vmov.f32 s0, s2 1856; CHECK-NEXT: vmov.f32 s1, s3 1857; CHECK-NEXT: bx lr 1858entry: 1859 %res = extractelement <2 x double> %a, i32 1 1860 ret double %res 1861} 1862 1863