; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=arm-eabi -float-abi=soft -mattr=+neon -verify-machineinstrs | FileCheck %s

; Splat of a scalar argument written as a full insertelement chain over a
; zeroinitializer vector. Every lane is overwritten with %A, and the expected
; output is a single vdup from r0 followed by the moves that return the vector
; in core registers.

; 8 x i8 splat into a D register.
define <8 x i8> @v_dup8(i8 %A) nounwind {
; CHECK-LABEL: v_dup8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vdup.8 d16, r0
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %lane0 = insertelement <8 x i8> zeroinitializer, i8 %A, i32 0
  %lane1 = insertelement <8 x i8> %lane0, i8 %A, i32 1
  %lane2 = insertelement <8 x i8> %lane1, i8 %A, i32 2
  %lane3 = insertelement <8 x i8> %lane2, i8 %A, i32 3
  %lane4 = insertelement <8 x i8> %lane3, i8 %A, i32 4
  %lane5 = insertelement <8 x i8> %lane4, i8 %A, i32 5
  %lane6 = insertelement <8 x i8> %lane5, i8 %A, i32 6
  %lane7 = insertelement <8 x i8> %lane6, i8 %A, i32 7
  ret <8 x i8> %lane7
}

; 4 x i16 splat into a D register.
define <4 x i16> @v_dup16(i16 %A) nounwind {
; CHECK-LABEL: v_dup16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vdup.16 d16, r0
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %lane0 = insertelement <4 x i16> zeroinitializer, i16 %A, i32 0
  %lane1 = insertelement <4 x i16> %lane0, i16 %A, i32 1
  %lane2 = insertelement <4 x i16> %lane1, i16 %A, i32 2
  %lane3 = insertelement <4 x i16> %lane2, i16 %A, i32 3
  ret <4 x i16> %lane3
}

; 2 x i32 splat into a D register.
define <2 x i32> @v_dup32(i32 %A) nounwind {
; CHECK-LABEL: v_dup32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vdup.32 d16, r0
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %lane0 = insertelement <2 x i32> zeroinitializer, i32 %A, i32 0
  %lane1 = insertelement <2 x i32> %lane0, i32 %A, i32 1
  ret <2 x i32> %lane1
}

; 2 x float splat; the expected output is the same vdup.32 as the i32 case.
define <2 x float> @v_dupfloat(float %A) nounwind {
; CHECK-LABEL: v_dupfloat:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vdup.32 d16, r0
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %lane0 = insertelement <2 x float> zeroinitializer, float %A, i32 0
  %lane1 = insertelement <2 x float> %lane0, float %A, i32 1
  ret <2 x float> %lane1
}

; 16 x i8 splat into a Q register; the result is returned in r0-r3.
define <16 x i8> @v_dupQ8(i8 %A) nounwind {
; CHECK-LABEL: v_dupQ8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vdup.8 q8, r0
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %lane0 = insertelement <16 x i8> zeroinitializer, i8 %A, i32 0
  %lane1 = insertelement <16 x i8> %lane0, i8 %A, i32 1
  %lane2 = insertelement <16 x i8> %lane1, i8 %A, i32 2
  %lane3 = insertelement <16 x i8> %lane2, i8 %A, i32 3
  %lane4 = insertelement <16 x i8> %lane3, i8 %A, i32 4
  %lane5 = insertelement <16 x i8> %lane4, i8 %A, i32 5
  %lane6 = insertelement <16 x i8> %lane5, i8 %A, i32 6
  %lane7 = insertelement <16 x i8> %lane6, i8 %A, i32 7
  %lane8 = insertelement <16 x i8> %lane7, i8 %A, i32 8
  %lane9 = insertelement <16 x i8> %lane8, i8 %A, i32 9
  %lane10 = insertelement <16 x i8> %lane9, i8 %A, i32 10
  %lane11 = insertelement <16 x i8> %lane10, i8 %A, i32 11
  %lane12 = insertelement <16 x i8> %lane11, i8 %A, i32 12
  %lane13 = insertelement <16 x i8> %lane12, i8 %A, i32 13
  %lane14 = insertelement <16 x i8> %lane13, i8 %A, i32 14
  %lane15 = insertelement <16 x i8> %lane14, i8 %A, i32 15
  ret <16 x i8> %lane15
}

; 8 x i16 splat into a Q register.
define <8 x i16> @v_dupQ16(i16 %A) nounwind {
; CHECK-LABEL: v_dupQ16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vdup.16 q8, r0
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %lane0 = insertelement <8 x i16> zeroinitializer, i16 %A, i32 0
  %lane1 = insertelement <8 x i16> %lane0, i16 %A, i32 1
  %lane2 = insertelement <8 x i16> %lane1, i16 %A, i32 2
  %lane3 = insertelement <8 x i16> %lane2, i16 %A, i32 3
  %lane4 = insertelement <8 x i16> %lane3, i16 %A, i32 4
  %lane5 = insertelement <8 x i16> %lane4, i16 %A, i32 5
  %lane6 = insertelement <8 x i16> %lane5, i16 %A, i32 6
  %lane7 = insertelement <8 x i16> %lane6, i16 %A, i32 7
  ret <8 x i16> %lane7
}

; 4 x i32 splat into a Q register.
define <4 x i32> @v_dupQ32(i32 %A) nounwind {
; CHECK-LABEL: v_dupQ32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vdup.32 q8, r0
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %lane0 = insertelement <4 x i32> zeroinitializer, i32 %A, i32 0
  %lane1 = insertelement <4 x i32> %lane0, i32 %A, i32 1
  %lane2 = insertelement <4 x i32> %lane1, i32 %A, i32 2
  %lane3 = insertelement <4 x i32> %lane2, i32 %A, i32 3
  ret <4 x i32> %lane3
}

; 4 x float splat into a Q register.
define <4 x float> @v_dupQfloat(float %A) nounwind {
; CHECK-LABEL: v_dupQfloat:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vdup.32 q8, r0
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %lane0 = insertelement <4 x float> zeroinitializer, float %A, i32 0
  %lane1 = insertelement <4 x float> %lane0, float %A, i32 1
  %lane2 = insertelement <4 x float> %lane1, float %A, i32 2
  %lane3 = insertelement <4 x float> %lane2, float %A, i32 3
  ret <4 x float> %lane3
}

; Check to make sure it works with shuffles, too.
; Each test below inserts %A into lane 0 of an undef vector and broadcasts that
; lane with a zeroinitializer shuffle mask; the expected output matches the
; insertelement-chain tests above.

define <8 x i8> @v_shuffledup8(i8 %A) nounwind {
; CHECK-LABEL: v_shuffledup8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vdup.8 d16, r0
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %ins = insertelement <8 x i8> undef, i8 %A, i32 0
  %splat = shufflevector <8 x i8> %ins, <8 x i8> undef, <8 x i32> zeroinitializer
  ret <8 x i8> %splat
}

define <4 x i16> @v_shuffledup16(i16 %A) nounwind {
; CHECK-LABEL: v_shuffledup16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vdup.16 d16, r0
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %ins = insertelement <4 x i16> undef, i16 %A, i32 0
  %splat = shufflevector <4 x i16> %ins, <4 x i16> undef, <4 x i32> zeroinitializer
  ret <4 x i16> %splat
}

define <2 x i32> @v_shuffledup32(i32 %A) nounwind {
; CHECK-LABEL: v_shuffledup32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vdup.32 d16, r0
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %ins = insertelement <2 x i32> undef, i32 %A, i32 0
  %splat = shufflevector <2 x i32> %ins, <2 x i32> undef, <2 x i32> zeroinitializer
  ret <2 x i32> %splat
}

define <2 x float> @v_shuffledupfloat(float %A) nounwind {
; CHECK-LABEL: v_shuffledupfloat:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vdup.32 d16, r0
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %ins = insertelement <2 x float> undef, float %A, i32 0
  %splat = shufflevector <2 x float> %ins, <2 x float> undef, <2 x i32> zeroinitializer
  ret <2 x float> %splat
}

define <16 x i8> @v_shuffledupQ8(i8 %A) nounwind {
; CHECK-LABEL: v_shuffledupQ8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vdup.8 q8, r0
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %ins = insertelement <16 x i8> undef, i8 %A, i32 0
  %splat = shufflevector <16 x i8> %ins, <16 x i8> undef, <16 x i32> zeroinitializer
  ret <16 x i8> %splat
}

define <8 x i16> @v_shuffledupQ16(i16 %A) nounwind {
; CHECK-LABEL: v_shuffledupQ16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vdup.16 q8, r0
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %ins = insertelement <8 x i16> undef, i16 %A, i32 0
  %splat = shufflevector <8 x i16> %ins, <8 x i16> undef, <8 x i32> zeroinitializer
  ret <8 x i16> %splat
}

define <4 x i32> @v_shuffledupQ32(i32 %A) nounwind {
; CHECK-LABEL: v_shuffledupQ32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vdup.32 q8, r0
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %ins = insertelement <4 x i32> undef, i32 %A, i32 0
  %splat = shufflevector <4 x i32> %ins, <4 x i32> undef, <4 x i32> zeroinitializer
  ret <4 x i32> %splat
}

define <4 x float> @v_shuffledupQfloat(float %A) nounwind {
; CHECK-LABEL: v_shuffledupQfloat:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vdup.32 q8, r0
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %ins = insertelement <4 x float> undef, float %A, i32 0
  %splat = shufflevector <4 x float> %ins, <4 x float> undef, <4 x i32> zeroinitializer
  ret <4 x float> %splat
}

; Lane splats: a vector is loaded from memory and lane 1 is broadcast with an
; all-ones shuffle mask. The expected output is a vldr followed by a
; lane-indexed vdup (dN[1]).

define <8 x i8> @vduplane8(<8 x i8>* %A) nounwind {
; CHECK-LABEL: vduplane8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vdup.8 d16, d16[1]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %vec = load <8 x i8>, <8 x i8>* %A
  %splat = shufflevector <8 x i8> %vec, <8 x i8> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
  ret <8 x i8> %splat
}

define <4 x i16> @vduplane16(<4 x i16>* %A) nounwind {
; CHECK-LABEL: vduplane16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vdup.16 d16, d16[1]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %vec = load <4 x i16>, <4 x i16>* %A
  %splat = shufflevector <4 x i16> %vec, <4 x i16> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
  ret <4 x i16> %splat
}

define <2 x i32> @vduplane32(<2 x i32>* %A) nounwind {
; CHECK-LABEL: vduplane32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vdup.32 d16, d16[1]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %vec = load <2 x i32>, <2 x i32>* %A
  %splat = shufflevector <2 x i32> %vec, <2 x i32> undef, <2 x i32> < i32 1, i32 1 >
  ret <2 x i32> %splat
}

define <2 x float> @vduplanefloat(<2 x float>* %A) nounwind {
; CHECK-LABEL: vduplanefloat:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vdup.32 d16, d16[1]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %vec = load <2 x float>, <2 x float>* %A
  %splat = shufflevector <2 x float> %vec, <2 x float> undef, <2 x i32> < i32 1, i32 1 >
  ret <2 x float> %splat
}

; Widening lane splats: the source is a 64-bit vector, the shuffle result is a
; 128-bit vector, so the lane-indexed vdup writes a Q register.

define <16 x i8> @vduplaneQ8(<8 x i8>* %A) nounwind {
; CHECK-LABEL: vduplaneQ8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vdup.8 q8, d16[1]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %vec = load <8 x i8>, <8 x i8>* %A
  %splat = shufflevector <8 x i8> %vec, <8 x i8> undef, <16 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
  ret <16 x i8> %splat
}

define <8 x i16> @vduplaneQ16(<4 x i16>* %A) nounwind {
; CHECK-LABEL: vduplaneQ16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vdup.16 q8, d16[1]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %vec = load <4 x i16>, <4 x i16>* %A
  %splat = shufflevector <4 x i16> %vec, <4 x i16> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
  ret <8 x i16> %splat
}

define <4 x i32> @vduplaneQ32(<2 x i32>* %A) nounwind {
; CHECK-LABEL: vduplaneQ32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vdup.32 q8, d16[1]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %vec = load <2 x i32>, <2 x i32>* %A
  %splat = shufflevector <2 x i32> %vec, <2 x i32> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
  ret <4 x i32> %splat
}

define <4 x float> @vduplaneQfloat(<2 x float>* %A) nounwind {
; CHECK-LABEL: vduplaneQfloat:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vdup.32 q8, d16[1]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %vec = load <2 x float>, <2 x float>* %A
  %splat = shufflevector <2 x float> %vec, <2 x float> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
  ret <4 x float> %splat
}

; 64-bit element splats of a vector argument. With the vector passed and
; returned in r0-r3, the expected output is just a pair of core-register moves
; and no NEON instruction.

; Broadcast element 1: the r2/r3 half is copied into r0/r1.
define <2 x i64> @foo(<2 x i64> %arg0_int64x1_t) nounwind readnone {
; CHECK-LABEL: foo:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    mov r1, r3
; CHECK-NEXT:    mov pc, lr
entry:
  %hi.splat = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
  ret <2 x i64> %hi.splat
}

; Broadcast element 0: the r0/r1 half is copied into r2/r3.
define <2 x i64> @bar(<2 x i64> %arg0_int64x1_t) nounwind readnone {
; CHECK-LABEL: bar:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    mov r2, r0
; CHECK-NEXT:    mov r3, r1
; CHECK-NEXT:    mov pc, lr
entry:
  %lo.splat = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
  ret <2 x i64> %lo.splat
}

; Same as foo, with double elements.
define <2 x double> @baz(<2 x double> %arg0_int64x1_t) nounwind readnone {
; CHECK-LABEL: baz:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    mov r1, r3
; CHECK-NEXT:    mov pc, lr
entry:
  %hi.splat = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> <i32 1, i32 1>
  ret <2 x double> %hi.splat
}

; Same as bar, with double elements.
define <2 x double> @qux(<2 x double> %arg0_int64x1_t) nounwind readnone {
; CHECK-LABEL: qux:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    mov r2, r0
; CHECK-NEXT:    mov r3, r1
; CHECK-NEXT:    mov pc, lr
entry:
  %lo.splat = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> <i32 0, i32 0>
  ret <2 x double> %lo.splat
}

; Radar 7373643
; Splat of the constant -128: the expected output materializes it with a
; vmov.i8 immediate (#0x80) and contains no vdup.
define void @redundantVdup(<8 x i8>* %ptr) nounwind {
; CHECK-LABEL: redundantVdup:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmov.i8 d16, #0x80
; CHECK-NEXT:    vstr d16, [r0]
; CHECK-NEXT:    mov pc, lr
  %ins = insertelement <8 x i8> undef, i8 -128, i32 0
  %splat = shufflevector <8 x i8> %ins, <8 x i8> undef, <8 x i32> zeroinitializer
  store <8 x i8> %splat, <8 x i8>* %ptr, align 8
  ret void
}

; Three lanes hold %x and the last lane holds %y: the expected output splats
; %x with vdup.32 and then overwrites lane 3 (d17[1]) with %y.
define <4 x i32> @tdupi(i32 %x, i32 %y) {
; CHECK-LABEL: tdupi:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vdup.32 q8, r0
; CHECK-NEXT:    vmov.32 d17[1], r1
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %lane0 = insertelement <4 x i32> undef, i32 %x, i32 0
  %lane1 = insertelement <4 x i32> %lane0, i32 %x, i32 1
  %lane2 = insertelement <4 x i32> %lane1, i32 %x, i32 2
  %lane3 = insertelement <4 x i32> %lane2, i32 %y, i32 3
  ret <4 x i32> %lane3
}

; Float variant of tdupi: the splat lands in q0 and %y is written to s3.
define <4 x float> @tdupf(float %x, float %y) {
; CHECK-LABEL: tdupf:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vdup.32 q0, r0
; CHECK-NEXT:    vmov s3, r1
; CHECK-NEXT:    vmov r0, r1, d0
; CHECK-NEXT:    vmov r2, r3, d1
; CHECK-NEXT:    mov pc, lr
  %lane0 = insertelement <4 x float> undef, float %x, i32 0
  %lane1 = insertelement <4 x float> %lane0, float %x, i32 1
  %lane2 = insertelement <4 x float> %lane1, float %x, i32 2
  %lane3 = insertelement <4 x float> %lane2, float %y, i32 3
  ret <4 x float> %lane3
}

; This test checks that when splatting an element from a vector into another,
; the value isn't moved out to GPRs first.
define <4 x i32> @tduplane(<4 x i32> %invec) {
; CHECK-LABEL: tduplane:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmov d16, r0, r1
; CHECK-NEXT:    mov r0, #255
; CHECK-NEXT:    vdup.32 q8, d16[1]
; CHECK-NEXT:    vmov.32 d17[1], r0
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %in = extractelement <4 x i32> %invec, i32 1
  %lane0 = insertelement <4 x i32> undef, i32 %in, i32 0
  %lane1 = insertelement <4 x i32> %lane0, i32 %in, i32 1
  %lane2 = insertelement <4 x i32> %lane1, i32 %in, i32 2
  %lane3 = insertelement <4 x i32> %lane2, i32 255, i32 3
  ret <4 x i32> %lane3
}

; The check_* tests extract one element at a constant index from a 128-bit
; vector and insert it into lanes 0 and 1 of a 64-bit vector. The expected
; output is a lane-indexed vdup straight from the source D register.

define <2 x float> @check_f32(<4 x float> %v) nounwind {
; CHECK-LABEL: check_f32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmov d17, r2, r3
; CHECK-NEXT:    vmov d16, r0, r1
; CHECK-NEXT:    vdup.32 d16, d17[1]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %elt = extractelement <4 x float> %v, i32 3
  %lane0 = insertelement <2 x float> undef, float %elt, i32 0
  %lane1 = insertelement <2 x float> %lane0, float %elt, i32 1
  ret <2 x float> %lane1
}

define <2 x i32> @check_i32(<4 x i32> %v) nounwind {
; CHECK-LABEL: check_i32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmov d17, r2, r3
; CHECK-NEXT:    vmov d16, r0, r1
; CHECK-NEXT:    vdup.32 d16, d17[1]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %elt = extractelement <4 x i32> %v, i32 3
  %lane0 = insertelement <2 x i32> undef, i32 %elt, i32 0
  %lane1 = insertelement <2 x i32> %lane0, i32 %elt, i32 1
  ret <2 x i32> %lane1
}

; Only lanes 0 and 1 of the <4 x i16> result are written; lanes 2 and 3 stay
; undef.
define <4 x i16> @check_i16(<8 x i16> %v) nounwind {
; CHECK-LABEL: check_i16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmov d17, r2, r3
; CHECK-NEXT:    vmov d16, r0, r1
; CHECK-NEXT:    vdup.16 d16, d16[3]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %elt = extractelement <8 x i16> %v, i32 3
  %lane0 = insertelement <4 x i16> undef, i16 %elt, i32 0
  %lane1 = insertelement <4 x i16> %lane0, i16 %elt, i32 1
  ret <4 x i16> %lane1
}

; Only lanes 0 and 1 of the <8 x i8> result are written; lanes 2 to 7 stay
; undef.
define <8 x i8> @check_i8(<16 x i8> %v) nounwind {
; CHECK-LABEL: check_i8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmov d17, r2, r3
; CHECK-NEXT:    vmov d16, r0, r1
; CHECK-NEXT:    vdup.8 d16, d16[3]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %elt = extractelement <16 x i8> %v, i32 3
  %lane0 = insertelement <8 x i8> undef, i8 %elt, i32 0
  %lane1 = insertelement <8 x i8> %lane0, i8 %elt, i32 1
  ret <8 x i8> %lane1
}

; Check that an SPR splat produces a vdup.

; %q is sign-extended and converted to float in s0, then broadcast to both
; lanes with a lane-indexed vdup.32 before the vector subtract.
define <2 x float> @check_spr_splat2(<2 x float> %p, i16 %q) {
; CHECK-LABEL: check_spr_splat2:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    lsl r2, r2, #16
; CHECK-NEXT:    vmov d16, r0, r1
; CHECK-NEXT:    asr r2, r2, #16
; CHECK-NEXT:    vmov s0, r2
; CHECK-NEXT:    vcvt.f32.s32 s0, s0
; CHECK-NEXT:    vdup.32 d17, d0[0]
; CHECK-NEXT:    vsub.f32 d16, d17, d16
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %qf = sitofp i16 %q to float
  %ins = insertelement <2 x float> undef, float %qf, i32 0
  %splat = shufflevector <2 x float> %ins, <2 x float> undef, <2 x i32> zeroinitializer
  %diff = fsub <2 x float> %splat, %p
  ret <2 x float> %diff
}

; 128-bit variant: %p occupies r0-r3, so %q is loaded from the stack with
; ldrsh, and the splat targets a Q register.
define <4 x float> @check_spr_splat4(<4 x float> %p, i16 %q) {
; CHECK-LABEL: check_spr_splat4:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    ldrsh r12, [sp]
; CHECK-NEXT:    vmov d17, r2, r3
; CHECK-NEXT:    vmov d16, r0, r1
; CHECK-NEXT:    vmov s0, r12
; CHECK-NEXT:    vcvt.f32.s32 s0, s0
; CHECK-NEXT:    vdup.32 q9, d0[0]
; CHECK-NEXT:    vsub.f32 q8, q9, q8
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %qf = sitofp i16 %q to float
  %ins = insertelement <4 x float> undef, float %qf, i32 0
  %splat = shufflevector <4 x float> %ins, <4 x float> undef, <4 x i32> zeroinitializer
  %diff = fsub <4 x float> %splat, %p
  ret <4 x float> %diff
}
; Same codegen as the test above: the converted scalar lives in s0 and is
; splatted with vdup.32 from d0[0], so the shuffle index is irrelevant.
define <4 x float> @check_spr_splat4_lane1(<4 x float> %p, i16 %q) {
; CHECK-LABEL: check_spr_splat4_lane1:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    ldrsh r12, [sp]
; CHECK-NEXT:    vmov d17, r2, r3
; CHECK-NEXT:    vmov d16, r0, r1
; CHECK-NEXT:    vmov s0, r12
; CHECK-NEXT:    vcvt.f32.s32 s0, s0
; CHECK-NEXT:    vdup.32 q9, d0[0]
; CHECK-NEXT:    vsub.f32 q8, q9, q8
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  ; The scalar is inserted into lane 1 and broadcast from lane 1, instead of
  ; lane 0 as in check_spr_splat4.
  %qf = sitofp i16 %q to float
  %ins = insertelement <4 x float> undef, float %qf, i32 1
  %splat = shufflevector <4 x float> %ins, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %diff = fsub <4 x float> %splat, %p
  ret <4 x float> %diff
}

; Also make sure that variable-index extractelts, where we could almost have
; generated a vdup, are handled without crashing. The expected output spills
; the vector to an aligned stack slot and reloads the selected byte into all
; lanes with vld1.8.

define <8 x i8> @check_i8_varidx(<16 x i8> %v, i32 %idx) {
; CHECK-LABEL: check_i8_varidx:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    .save {r11}
; CHECK-NEXT:    push {r11}
; CHECK-NEXT:    .setfp r11, sp
; CHECK-NEXT:    mov r11, sp
; CHECK-NEXT:    .pad #28
; CHECK-NEXT:    sub sp, sp, #28
; CHECK-NEXT:    bic sp, sp, #15
; CHECK-NEXT:    ldr r12, [r11, #4]
; CHECK-NEXT:    vmov d17, r2, r3
; CHECK-NEXT:    vmov d16, r0, r1
; CHECK-NEXT:    mov r1, sp
; CHECK-NEXT:    and r0, r12, #15
; CHECK-NEXT:    vst1.64 {d16, d17}, [r1:128], r0
; CHECK-NEXT:    vld1.8 {d16[]}, [r1]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov sp, r11
; CHECK-NEXT:    pop {r11}
; CHECK-NEXT:    mov pc, lr
  %elt = extractelement <16 x i8> %v, i32 %idx
  %lane0 = insertelement <8 x i8> undef, i8 %elt, i32 0
  %lane1 = insertelement <8 x i8> %lane0, i8 %elt, i32 1
  ret <8 x i8> %lane1
}