; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s --check-prefixes=CHECK,CHECK-SD
; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -global-isel | FileCheck %s --check-prefixes=CHECK,CHECK-GI

; Scalar splats into 64-bit vectors, spelled as a chain of insertelement
; instructions that writes the same scalar into every lane. Both selectors
; are expected to emit a single DUP for each of these.

define <8 x i8> @v_dup8(i8 %A) nounwind {
; CHECK-LABEL: v_dup8:
; CHECK: // %bb.0:
; CHECK-NEXT: dup.8b v0, w0
; CHECK-NEXT: ret
  %lane0 = insertelement <8 x i8> zeroinitializer, i8 %A, i32 0
  %lane1 = insertelement <8 x i8> %lane0, i8 %A, i32 1
  %lane2 = insertelement <8 x i8> %lane1, i8 %A, i32 2
  %lane3 = insertelement <8 x i8> %lane2, i8 %A, i32 3
  %lane4 = insertelement <8 x i8> %lane3, i8 %A, i32 4
  %lane5 = insertelement <8 x i8> %lane4, i8 %A, i32 5
  %lane6 = insertelement <8 x i8> %lane5, i8 %A, i32 6
  %lane7 = insertelement <8 x i8> %lane6, i8 %A, i32 7
  ret <8 x i8> %lane7
}

define <4 x i16> @v_dup16(i16 %A) nounwind {
; CHECK-LABEL: v_dup16:
; CHECK: // %bb.0:
; CHECK-NEXT: dup.4h v0, w0
; CHECK-NEXT: ret
  %lane0 = insertelement <4 x i16> zeroinitializer, i16 %A, i32 0
  %lane1 = insertelement <4 x i16> %lane0, i16 %A, i32 1
  %lane2 = insertelement <4 x i16> %lane1, i16 %A, i32 2
  %lane3 = insertelement <4 x i16> %lane2, i16 %A, i32 3
  ret <4 x i16> %lane3
}

define <2 x i32> @v_dup32(i32 %A) nounwind {
; CHECK-LABEL: v_dup32:
; CHECK: // %bb.0:
; CHECK-NEXT: dup.2s v0, w0
; CHECK-NEXT: ret
  %lane0 = insertelement <2 x i32> zeroinitializer, i32 %A, i32 0
  %lane1 = insertelement <2 x i32> %lane0, i32 %A, i32 1
  ret <2 x i32> %lane1
}

; The float argument already lives in s0, so the expected form is the
; lane-indexed DUP rather than a DUP from a general-purpose register.
define <2 x float> @v_dupfloat(float %A) nounwind {
; CHECK-LABEL: v_dupfloat:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
; CHECK-NEXT: dup.2s v0, v0[0]
; CHECK-NEXT: ret
  %lane0 = insertelement <2 x float> zeroinitializer, float %A, i32 0
  %lane1 = insertelement <2 x float> %lane0, float %A, i32 1
  ret <2 x float> %lane1
}

; 128-bit (Q register) variants of the insertelement-chain splats.

define <16 x i8> @v_dupQ8(i8 %A) nounwind {
; CHECK-LABEL: v_dupQ8:
; CHECK: // %bb.0:
; CHECK-NEXT: dup.16b v0, w0
; CHECK-NEXT: ret
  %lane0 = insertelement <16 x i8> zeroinitializer, i8 %A, i32 0
  %lane1 = insertelement <16 x i8> %lane0, i8 %A, i32 1
  %lane2 = insertelement <16 x i8> %lane1, i8 %A, i32 2
  %lane3 = insertelement <16 x i8> %lane2, i8 %A, i32 3
  %lane4 = insertelement <16 x i8> %lane3, i8 %A, i32 4
  %lane5 = insertelement <16 x i8> %lane4, i8 %A, i32 5
  %lane6 = insertelement <16 x i8> %lane5, i8 %A, i32 6
  %lane7 = insertelement <16 x i8> %lane6, i8 %A, i32 7
  %lane8 = insertelement <16 x i8> %lane7, i8 %A, i32 8
  %lane9 = insertelement <16 x i8> %lane8, i8 %A, i32 9
  %lane10 = insertelement <16 x i8> %lane9, i8 %A, i32 10
  %lane11 = insertelement <16 x i8> %lane10, i8 %A, i32 11
  %lane12 = insertelement <16 x i8> %lane11, i8 %A, i32 12
  %lane13 = insertelement <16 x i8> %lane12, i8 %A, i32 13
  %lane14 = insertelement <16 x i8> %lane13, i8 %A, i32 14
  %lane15 = insertelement <16 x i8> %lane14, i8 %A, i32 15
  ret <16 x i8> %lane15
}

define <8 x i16> @v_dupQ16(i16 %A) nounwind {
; CHECK-LABEL: v_dupQ16:
; CHECK: // %bb.0:
; CHECK-NEXT: dup.8h v0, w0
; CHECK-NEXT: ret
  %lane0 = insertelement <8 x i16> zeroinitializer, i16 %A, i32 0
  %lane1 = insertelement <8 x i16> %lane0, i16 %A, i32 1
  %lane2 = insertelement <8 x i16> %lane1, i16 %A, i32 2
  %lane3 = insertelement <8 x i16> %lane2, i16 %A, i32 3
  %lane4 = insertelement <8 x i16> %lane3, i16 %A, i32 4
  %lane5 = insertelement <8 x i16> %lane4, i16 %A, i32 5
  %lane6 = insertelement <8 x i16> %lane5, i16 %A, i32 6
  %lane7 = insertelement <8 x i16> %lane6, i16 %A, i32 7
  ret <8 x i16> %lane7
}

define <4 x i32> @v_dupQ32(i32 %A) nounwind {
; CHECK-LABEL: v_dupQ32:
; CHECK: // %bb.0:
; CHECK-NEXT: dup.4s v0, w0
; CHECK-NEXT: ret
  %lane0 = insertelement <4 x i32> zeroinitializer, i32 %A, i32 0
  %lane1 = insertelement <4 x i32> %lane0, i32 %A, i32 1
  %lane2 = insertelement <4 x i32> %lane1, i32 %A, i32 2
  %lane3 = insertelement <4 x i32> %lane2, i32 %A, i32 3
  ret <4 x i32> %lane3
}

; Splat of the constant 10 where the same constant is also stored as a scalar.
; The expected output materializes the vector with MOVI and the scalar with a
; separate MOV feeding the STRH.
define <4 x i16> @v_dup16_const(i16 %y, ptr %p) {
; CHECK-LABEL: v_dup16_const:
; CHECK: // %bb.0:
; CHECK-NEXT: movi.4h v0, #10
; CHECK-NEXT: mov w8, #10 // =0xa
; CHECK-NEXT: strh w8, [x1]
; CHECK-NEXT: ret
  %ins = insertelement <4 x i16> undef, i16 10, i32 0
  %splat = shufflevector <4 x i16> %ins, <4 x i16> undef, <4 x i32> zeroinitializer
  store i16 10, ptr %p
  ret <4 x i16> %splat
}

define <4 x float> @v_dupQfloat(float %A) nounwind {
; CHECK-LABEL: v_dupQfloat:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
; CHECK-NEXT: dup.4s v0, v0[0]
; CHECK-NEXT: ret
  %lane0 = insertelement <4 x float> zeroinitializer, float %A, i32 0
  %lane1 = insertelement <4 x float> %lane0, float %A, i32 1
  %lane2 = insertelement <4 x float> %lane1, float %A, i32 2
  %lane3 = insertelement <4 x float> %lane2, float %A, i32 3
  ret <4 x float> %lane3
}

; Check to make sure it works with shuffles, too.

; Splat written as "insert the scalar into lane 0, then shuffle with an
; all-zero mask". The expected code is the same single DUP as for the
; insertelement chains.

define <8 x i8> @v_shuffledup8(i8 %A) nounwind {
; CHECK-LABEL: v_shuffledup8:
; CHECK: // %bb.0:
; CHECK-NEXT: dup.8b v0, w0
; CHECK-NEXT: ret
  %ins = insertelement <8 x i8> undef, i8 %A, i32 0
  %splat = shufflevector <8 x i8> %ins, <8 x i8> undef, <8 x i32> zeroinitializer
  ret <8 x i8> %splat
}

define <4 x i16> @v_shuffledup16(i16 %A) nounwind {
; CHECK-LABEL: v_shuffledup16:
; CHECK: // %bb.0:
; CHECK-NEXT: dup.4h v0, w0
; CHECK-NEXT: ret
  %ins = insertelement <4 x i16> undef, i16 %A, i32 0
  %splat = shufflevector <4 x i16> %ins, <4 x i16> undef, <4 x i32> zeroinitializer
  ret <4 x i16> %splat
}

define <2 x i32> @v_shuffledup32(i32 %A) nounwind {
; CHECK-LABEL: v_shuffledup32:
; CHECK: // %bb.0:
; CHECK-NEXT: dup.2s v0, w0
; CHECK-NEXT: ret
  %ins = insertelement <2 x i32> undef, i32 %A, i32 0
  %splat = shufflevector <2 x i32> %ins, <2 x i32> undef, <2 x i32> zeroinitializer
  ret <2 x i32> %splat
}

define <2 x float> @v_shuffledupfloat(float %A) nounwind {
; CHECK-LABEL: v_shuffledupfloat:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
; CHECK-NEXT: dup.2s v0, v0[0]
; CHECK-NEXT: ret
  %ins = insertelement <2 x float> undef, float %A, i32 0
  %splat = shufflevector <2 x float> %ins, <2 x float> undef, <2 x i32> zeroinitializer
  ret <2 x float> %splat
}

define <16 x i8> @v_shuffledupQ8(i8 %A) nounwind {
; CHECK-LABEL: v_shuffledupQ8:
; CHECK: // %bb.0:
; CHECK-NEXT: dup.16b v0, w0
; CHECK-NEXT: ret
  %ins = insertelement <16 x i8> undef, i8 %A, i32 0
  %splat = shufflevector <16 x i8> %ins, <16 x i8> undef, <16 x i32> zeroinitializer
  ret <16 x i8> %splat
}

define <8 x i16> @v_shuffledupQ16(i16 %A) nounwind {
; CHECK-LABEL: v_shuffledupQ16:
; CHECK: // %bb.0:
; CHECK-NEXT: dup.8h v0, w0
; CHECK-NEXT: ret
  %ins = insertelement <8 x i16> undef, i16 %A, i32 0
  %splat = shufflevector <8 x i16> %ins, <8 x i16> undef, <8 x i32> zeroinitializer
  ret <8 x i16> %splat
}

define <4 x i32> @v_shuffledupQ32(i32 %A) nounwind {
; CHECK-LABEL: v_shuffledupQ32:
; CHECK: // %bb.0:
; CHECK-NEXT: dup.4s v0, w0
; CHECK-NEXT: ret
  %ins = insertelement <4 x i32> undef, i32 %A, i32 0
  %splat = shufflevector <4 x i32> %ins, <4 x i32> undef, <4 x i32> zeroinitializer
  ret <4 x i32> %splat
}

define <4 x float> @v_shuffledupQfloat(float %A) nounwind {
; CHECK-LABEL: v_shuffledupQfloat:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
; CHECK-NEXT: dup.4s v0, v0[0]
; CHECK-NEXT: ret
  %ins = insertelement <4 x float> undef, float %A, i32 0
  %splat = shufflevector <4 x float> %ins, <4 x float> undef, <4 x i32> zeroinitializer
  ret <4 x float> %splat
}

; Splat of lane 1 of a 64-bit vector argument into a vector of the same width;
; expected to select the lane-indexed DUP.

define <8 x i8> @vduplane8(<8 x i8> %A) nounwind {
; CHECK-LABEL: vduplane8:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: dup.8b v0, v0[1]
; CHECK-NEXT: ret
  %splat = shufflevector <8 x i8> %A, <8 x i8> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i8> %splat
}

define <4 x i16> @vduplane16(<4 x i16> %A) nounwind {
; CHECK-LABEL: vduplane16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: dup.4h v0, v0[1]
; CHECK-NEXT: ret
  %splat = shufflevector <4 x i16> %A, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i16> %splat
}

define <2 x i32> @vduplane32(<2 x i32> %A) nounwind {
; CHECK-LABEL: vduplane32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: dup.2s v0, v0[1]
; CHECK-NEXT: ret
  %splat = shufflevector <2 x i32> %A, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
  ret <2 x i32> %splat
}

define <2 x float> @vduplanefloat(<2 x float> %A) nounwind {
; CHECK-LABEL: vduplanefloat:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: dup.2s v0, v0[1]
; CHECK-NEXT: ret
  %splat = shufflevector <2 x float> %A, <2 x float> undef, <2 x i32> <i32 1, i32 1>
  ret <2 x float> %splat
}

; Same lane-1 splat, but the shuffle result is twice as wide as the 64-bit
; source vector.

define <16 x i8> @vduplaneQ8(<8 x i8> %A) nounwind {
; CHECK-LABEL: vduplaneQ8:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: dup.16b v0, v0[1]
; CHECK-NEXT: ret
  %splat = shufflevector <8 x i8> %A, <8 x i8> undef, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <16 x i8> %splat
}

define <8 x i16> @vduplaneQ16(<4 x i16> %A) nounwind {
; CHECK-LABEL: vduplaneQ16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: dup.8h v0, v0[1]
; CHECK-NEXT: ret
  %splat = shufflevector <4 x i16> %A, <4 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i16> %splat
}

define <4 x i32> @vduplaneQ32(<2 x i32> %A) nounwind {
; CHECK-LABEL: vduplaneQ32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: dup.4s v0, v0[1]
; CHECK-NEXT: ret
  %splat = shufflevector <2 x i32> %A, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %splat
}

define <4 x float> @vduplaneQfloat(<2 x float> %A) nounwind {
; CHECK-LABEL: vduplaneQfloat:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: dup.4s v0, v0[1]
; CHECK-NEXT: ret
  %splat = shufflevector <2 x float> %A, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  ret <4 x float> %splat
}

; 64-bit element splats of lane 1 and lane 0, for both i64 and double.
; The "entry" block label is kept because it shows up in the expected
; "// %entry" assembly comment.

define <2 x i64> @foo(<2 x i64> %arg0_int64x1_t) nounwind readnone {
; CHECK-LABEL: foo:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: dup.2d v0, v0[1]
; CHECK-NEXT: ret
entry:
  %splat = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
  ret <2 x i64> %splat
}

define <2 x i64> @bar(<2 x i64> %arg0_int64x1_t) nounwind readnone {
; CHECK-LABEL: bar:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: dup.2d v0, v0[0]
; CHECK-NEXT: ret
entry:
  %splat = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
  ret <2 x i64> %splat
}

define <2 x double> @baz(<2 x double> %arg0_int64x1_t) nounwind readnone {
; CHECK-LABEL: baz:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: dup.2d v0, v0[1]
; CHECK-NEXT: ret
entry:
  %splat = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> <i32 1, i32 1>
  ret <2 x double> %splat
}

define <2 x double> @qux(<2 x double> %arg0_int64x1_t) nounwind readnone {
; CHECK-LABEL: qux:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: dup.2d v0, v0[0]
; CHECK-NEXT: ret
entry:
  %splat = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> <i32 0, i32 0>
  ret <2 x double> %splat
}

; Vectors built from two different scalars: these are not splats, and the
; expected output contains per-lane moves and no DUP. SelectionDAG and
; GlobalISel differ in how lane 0 is written (FMOV vs. a lane MOV).

define <2 x i32> @f(i32 %a, i32 %b) nounwind readnone {
; CHECK-SD-LABEL: f:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: fmov s0, w0
; CHECK-SD-NEXT: mov.s v0[1], w1
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: f:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: mov.s v0[0], w0
; CHECK-GI-NEXT: mov.s v0[1], w1
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
  %lane0 = insertelement <2 x i32> undef, i32 %a, i32 0
  %lane1 = insertelement <2 x i32> %lane0, i32 %b, i32 1
  ret <2 x i32> %lane1
}

define <4 x i32> @g(i32 %a, i32 %b) nounwind readnone {
; CHECK-SD-LABEL: g:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: fmov s0, w0
; CHECK-SD-NEXT: mov.s v0[1], w1
; CHECK-SD-NEXT: mov.s v0[2], w1
; CHECK-SD-NEXT: mov.s v0[3], w0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: g:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: mov.s v0[0], w0
; CHECK-GI-NEXT: mov.s v0[1], w1
; CHECK-GI-NEXT: mov.s v0[2], w1
; CHECK-GI-NEXT: mov.s v0[3], w0
; CHECK-GI-NEXT: ret
  %lane0 = insertelement <4 x i32> undef, i32 %a, i32 0
  %lane1 = insertelement <4 x i32> %lane0, i32 %b, i32 1
  %lane2 = insertelement <4 x i32> %lane1, i32 %b, i32 2
  %lane3 = insertelement <4 x i32> %lane2, i32 %a, i32 3
  ret <4 x i32> %lane3
}

define <2 x i64> @h(i64 %a, i64 %b) nounwind readnone {
; CHECK-SD-LABEL: h:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: fmov d0, x0
; CHECK-SD-NEXT: mov.d v0[1], x1
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: h:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: mov.d v0[0], x0
; CHECK-GI-NEXT: mov.d v0[1], x1
; CHECK-GI-NEXT: ret
  %lane0 = insertelement <2 x i64> undef, i64 %a, i32 0
  %lane1 = insertelement <2 x i64> %lane0, i64 %b, i32 1
  ret <2 x i64> %lane1
}

; We used to spot this as a BUILD_VECTOR implementable by dup, but assumed that
; the single value needed was of the same type as the vector. This is false if
; the scalar corresponding to the vector type is illegal (e.g. a <4 x i16>
; BUILD_VECTOR will have an i32 as its source). In that case, the operation is
; not a simple "dup vD.4h, vN.h[idx]" after all, and we crashed.
;
; *However*, it is a dup vD.4h, vN.h[2*idx].
define <4 x i16> @test_build_illegal(<4 x i32> %in) {
; CHECK-SD-LABEL: test_build_illegal:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: dup.4h v0, v0[6]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: test_build_illegal:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: mov.s w8, v0[3]
; CHECK-GI-NEXT: mov.h v0[3], w8
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
  ; Take i32 lane 3, truncate it to i16 and place it in lane 3 of an otherwise
  ; undefined <4 x i16>. SelectionDAG is expected to express this as a DUP of
  ; halfword lane 6 (= 2 * 3) of the input register.
  %wide = extractelement <4 x i32> %in, i32 3
  %narrow = trunc i32 %wide to i16
  %res = insertelement <4 x i16> undef, i16 %narrow, i32 3

  ret <4 x i16> %res
}

; We used to inherit an already extract_subvectored v4i16 from
; SelectionDAGBuilder here. We then added a DUPLANE on top of that, preventing
; the formation of an indexed-by-7 MLS.
define <4 x i16> @test_high_splat(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
; CHECK-SD-LABEL: test_high_splat:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: mls.4h v0, v1, v2[7]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: test_high_splat:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: dup.8h v2, v2[7]
; CHECK-GI-NEXT: mls.4h v0, v2, v1
; CHECK-GI-NEXT: ret
entry:
  ; a - splat(v[7]) * b. SelectionDAG is expected to fold the splat into the
  ; by-element MLS; GlobalISel is expected to keep a separate DUP.
  %splat = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
  %prod = mul <4 x i16> %splat, %b
  %res = sub <4 x i16> %a, %prod
  ret <4 x i16> %res
}

; Also test the DUP path in the PerfectShuffle generator.

; Two-input shuffles with mask <0, 0, 4, 5>: lane 0 of %a twice, followed by
; the low two lanes of %b. SelectionDAG is expected to use TRN1 plus a lane
; move; GlobalISel is expected to fall back to a TBL with a constant-pool mask.

define <4 x i16> @test_perfectshuffle_dupext_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
; CHECK-SD-LABEL: test_perfectshuffle_dupext_v4i16:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: trn1.4h v0, v0, v0
; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-SD-NEXT: mov.s v0[1], v1[0]
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: test_perfectshuffle_dupext_v4i16:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT: adrp x8, .LCPI34_0
; CHECK-GI-NEXT: mov.d v0[1], v1[0]
; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI34_0]
; CHECK-GI-NEXT: tbl.16b v0, { v0 }, v1
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
  %shuf = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5>
  ret <4 x i16> %shuf
}

define <4 x half> @test_perfectshuffle_dupext_v4f16(<4 x half> %a, <4 x half> %b) nounwind {
; CHECK-SD-LABEL: test_perfectshuffle_dupext_v4f16:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: trn1.4h v0, v0, v0
; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-SD-NEXT: mov.s v0[1], v1[0]
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: test_perfectshuffle_dupext_v4f16:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT: adrp x8, .LCPI35_0
; CHECK-GI-NEXT: mov.d v0[1], v1[0]
; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI35_0]
; CHECK-GI-NEXT: tbl.16b v0, { v0 }, v1
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
  %shuf = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5>
  ret <4 x half> %shuf
}

define <4 x i32> @test_perfectshuffle_dupext_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; CHECK-SD-LABEL: test_perfectshuffle_dupext_v4i32:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: trn1.4s v0, v0, v0
; CHECK-SD-NEXT: mov.d v0[1], v1[0]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: test_perfectshuffle_dupext_v4i32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: adrp x8, .LCPI36_0
; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI36_0]
; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
; CHECK-GI-NEXT: tbl.16b v0, { v0, v1 }, v2
; CHECK-GI-NEXT: ret
  %shuf = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5>
  ret <4 x i32> %shuf
}

define <4 x float> @test_perfectshuffle_dupext_v4f32(<4 x float> %a, <4 x float> %b) nounwind {
; CHECK-SD-LABEL: test_perfectshuffle_dupext_v4f32:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: trn1.4s v0, v0, v0
; CHECK-SD-NEXT: mov.d v0[1], v1[0]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: test_perfectshuffle_dupext_v4f32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: adrp x8, .LCPI37_0
; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI37_0]
; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
; CHECK-GI-NEXT: tbl.16b v0, { v0, v1 }, v2
; CHECK-GI-NEXT: ret
  %shuf = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5>
  ret <4 x float> %shuf
}

; The first shuffle yields <x1, x2, x0, x0>. The second one only reads lanes 2
; and 3 of that result, so it is really a splat of lane 0 of %x; SelectionDAG
; is expected to recognize this and emit dup.4s v0, v0[0].
define void @disguised_dup(<4 x float> %x, ptr %p1, ptr %p2) {
; CHECK-SD-LABEL: disguised_dup:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: ext.16b v1, v0, v0, #4
; CHECK-SD-NEXT: mov.s v1[2], v0[0]
; CHECK-SD-NEXT: dup.4s v0, v0[0]
; CHECK-SD-NEXT: str q1, [x0]
; CHECK-SD-NEXT: str q0, [x1]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: disguised_dup:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: adrp x8, .LCPI38_1
; CHECK-GI-NEXT: // kill: def $q0 killed $q0 def $q0_q1
; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI38_1]
; CHECK-GI-NEXT: adrp x8, .LCPI38_0
; CHECK-GI-NEXT: tbl.16b v0, { v0, v1 }, v2
; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI38_0]
; CHECK-GI-NEXT: tbl.16b v2, { v0, v1 }, v2
; CHECK-GI-NEXT: str q0, [x0]
; CHECK-GI-NEXT: str q2, [x1]
; CHECK-GI-NEXT: ret
  %rot = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 0>
  %splat = shufflevector <4 x float> %rot, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 2, i32 3>
  store <4 x float> %rot, ptr %p1, align 8
  store <4 x float> %splat, ptr %p2, align 8
  ret void
}

; Splat constants that the expected SelectionDAG output builds with a MOV/MOVK
; pair in a general-purpose register followed by a DUP. The expected GlobalISel
; output loads them from the constant pool instead.

define <2 x i32> @dup_const2(<2 x i32> %A) nounwind {
; CHECK-SD-LABEL: dup_const2:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: mov w8, #32770 // =0x8002
; CHECK-SD-NEXT: movk w8, #128, lsl #16
; CHECK-SD-NEXT: dup.2s v1, w8
; CHECK-SD-NEXT: add.2s v0, v0, v1
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: dup_const2:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: adrp x8, .LCPI39_0
; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI39_0]
; CHECK-GI-NEXT: add.2s v0, v0, v1
; CHECK-GI-NEXT: ret
  %sum = add <2 x i32> %A, <i32 8421378, i32 8421378>
  ret <2 x i32> %sum
}

; Only the low half of the 128-bit add is returned; the expected SelectionDAG
; output narrows the add (and the constant splat) to 64 bits.
define <2 x i32> @dup_const4_ext(<4 x i32> %A) nounwind {
; CHECK-SD-LABEL: dup_const4_ext:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: mov w8, #32769 // =0x8001
; CHECK-SD-NEXT: movk w8, #128, lsl #16
; CHECK-SD-NEXT: dup.2s v1, w8
; CHECK-SD-NEXT: add.2s v0, v0, v1
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: dup_const4_ext:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: adrp x8, .LCPI40_0
; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI40_0]
; CHECK-GI-NEXT: add.4s v0, v0, v1
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
  %sum = add <4 x i32> %A, <i32 8421377, i32 8421377, i32 8421377, i32 8421377>
  %lo = shufflevector <4 x i32> %sum, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  ret <2 x i32> %lo
}

; The same constant is splatted at both 64-bit and 128-bit width; the expected
; SelectionDAG output shares one dup.4s between the two adds.
define <4 x i32> @dup_const24(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C) nounwind {
; CHECK-SD-LABEL: dup_const24:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: mov w8, #32768 // =0x8000
; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-SD-NEXT: movk w8, #128, lsl #16
; CHECK-SD-NEXT: dup.4s v3, w8
; CHECK-SD-NEXT: add.2s v0, v0, v3
; CHECK-SD-NEXT: mov.d v0[1], v1[0]
; CHECK-SD-NEXT: add.4s v1, v2, v3
; CHECK-SD-NEXT: eor.16b v0, v1, v0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: dup_const24:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: adrp x8, .LCPI41_1
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT: ldr d3, [x8, :lo12:.LCPI41_1]
; CHECK-GI-NEXT: adrp x8, .LCPI41_0
; CHECK-GI-NEXT: add.2s v0, v0, v3
; CHECK-GI-NEXT: ldr q3, [x8, :lo12:.LCPI41_0]
; CHECK-GI-NEXT: mov.d v0[1], v1[0]
; CHECK-GI-NEXT: add.4s v1, v2, v3
; CHECK-GI-NEXT: eor.16b v0, v1, v0
; CHECK-GI-NEXT: ret
  %sumA = add <2 x i32> %A, <i32 8421376, i32 8421376>
  %cat = shufflevector <2 x i32> %sumA, <2 x i32> %B, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %sumC = add <4 x i32> %C, <i32 8421376, i32 8421376, i32 8421376, i32 8421376>
  %res = xor <4 x i32> %sumC, %cat
  ret <4 x i32> %res
}

; Splats whose source vector comes from a bitcast of a scalar or of a vector
; with a different element type.

define <8 x i16> @bitcast_i64_v8i16(i64 %a) {
; CHECK-SD-LABEL: bitcast_i64_v8i16:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: dup.8h v0, w0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: bitcast_i64_v8i16:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: fmov d0, x0
; CHECK-GI-NEXT: dup.8h v0, v0[0]
; CHECK-GI-NEXT: ret
  %cast = bitcast i64 %a to <4 x i16>
  %splat = shufflevector <4 x i16> %cast, <4 x i16> poison, <8 x i32> zeroinitializer
  ret <8 x i16> %splat
}

define <8 x i16> @bitcast_i64_v8i16_lane1(i64 %a) {
; CHECK-LABEL: bitcast_i64_v8i16_lane1:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov d0, x0
; CHECK-NEXT: dup.8h v0, v0[1]
; CHECK-NEXT: ret
  %cast = bitcast i64 %a to <4 x i16>
  %splat = shufflevector <4 x i16> %cast, <4 x i16> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i16> %splat
}

define <8 x i16> @bitcast_f64_v8i16(double %a) {
; CHECK-LABEL: bitcast_f64_v8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: dup.8h v0, v0[0]
; CHECK-NEXT: ret
  %cast = bitcast double %a to <4 x i16>
  %splat = shufflevector <4 x i16> %cast, <4 x i16> poison, <8 x i32> zeroinitializer
  ret <8 x i16> %splat
}

define <8 x half> @bitcast_i64_v8f16(i64 %a) {
; CHECK-LABEL: bitcast_i64_v8f16:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov d0, x0
; CHECK-NEXT: dup.8h v0, v0[0]
; CHECK-NEXT: ret
  %cast = bitcast i64 %a to <4 x half>
  %splat = shufflevector <4 x half> %cast, <4 x half> poison, <8 x i32> zeroinitializer
  ret <8 x half> %splat
}

; NOTE(review): despite the "v2f64" in the name, every type in this function is
; an integer type (i64 -> <1 x i64> -> <2 x i64>).
define <2 x i64> @bitcast_i64_v2f64(i64 %a) {
; CHECK-SD-LABEL: bitcast_i64_v2f64:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: fmov d0, x0
; CHECK-SD-NEXT: dup.2d v0, v0[0]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: bitcast_i64_v2f64:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: dup.2d v0, x0
; CHECK-GI-NEXT: ret
  %cast = bitcast i64 %a to <1 x i64>
  %splat = shufflevector <1 x i64> %cast, <1 x i64> poison, <2 x i32> zeroinitializer
  ret <2 x i64> %splat
}

define <2 x i64> @bitcast_v2f64_v2i64(<2 x double> %a) {
; CHECK-LABEL: bitcast_v2f64_v2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: dup.2d v0, v0[0]
; CHECK-NEXT: ret
  %cast = bitcast <2 x double> %a to <2 x i64>
  %splat = shufflevector <2 x i64> %cast, <2 x i64> poison, <2 x i32> zeroinitializer
  ret <2 x i64> %splat
}

define <2 x i64> @bitcast_v8i16_v2i64(<8 x i16> %a) {
; CHECK-LABEL: bitcast_v8i16_v2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: dup.2d v0, v0[0]
; CHECK-NEXT: ret
  %cast = bitcast <8 x i16> %a to <2 x i64>
  %splat = shufflevector <2 x i64> %cast, <2 x i64> poison, <2 x i32> zeroinitializer
  ret <2 x i64> %splat
}

; NOTE(review): the name says "v2f64", but the argument type is <2 x i64>.
define <8 x i16> @bitcast_v2f64_v8i16(<2 x i64> %a) {
; CHECK-LABEL: bitcast_v2f64_v8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: dup.8h v0, v0[0]
; CHECK-NEXT: ret
  %cast = bitcast <2 x i64> %a to <8 x i16>
  %splat = shufflevector <8 x i16> %cast, <8 x i16> poison, <8 x i32> zeroinitializer
  ret <8 x i16> %splat
}

; Constant i16 splat (9211 = 0x23fb). The expected SelectionDAG output is a
; MOV + DUP; the expected GlobalISel output is a constant-pool load.
define <4 x i16> @dup_i16_v4i16_constant() {
; CHECK-SD-LABEL: dup_i16_v4i16_constant:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: mov w8, #9211 // =0x23fb
; CHECK-SD-NEXT: dup.4h v0, w8
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: dup_i16_v4i16_constant:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: adrp x8, .LCPI50_0
; CHECK-GI-NEXT: ldr d0, [x8, :lo12:.LCPI50_0]
; CHECK-GI-NEXT: ret
  ret <4 x i16> <i16 9211, i16 9211, i16 9211, i16 9211>
}
