; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=arm-eabi -mattr=+v8.6a,+neon -float-abi=hard < %s | FileCheck %s --check-prefixes=CHECK,CHECK-NOFP16
; RUN: llc -mtriple=arm-eabi -mattr=+v8.6a,+neon,+bf16 -float-abi=hard < %s | FileCheck %s --check-prefixes=CHECK,CHECK-NOFP16
; RUN: llc -mtriple=arm-eabi -mattr=+v8.6a,+neon,+fullfp16,+bf16 -float-abi=hard < %s | FileCheck %s --check-prefixes=CHECK,CHECK-FP16

; Code generation tests for shuffles, splats and selects on <4 x bfloat> and
; <8 x bfloat> vectors. The first two llc invocations above (with and without
; +bf16) share one set of expectations; the +fullfp16 invocation has its own
; set for the functions where the emitted code differs.

; Two-vector aggregates returned by the zip/uzp/trn tests below.
%struct.float16x4x2_t = type { [2 x <4 x bfloat>] }
%struct.float16x8x2_t = type { [2 x <8 x bfloat>] }

; Bitwise select: bfloat operands are bitcast to i8 vectors around the vbsl
; intrinsic call.
define dso_local <4 x bfloat> @test_vbsl_bf16(<4 x i16> %a, <4 x bfloat> %b, <4 x bfloat> %c) {
; CHECK-LABEL: test_vbsl_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vbsl d0, d1, d2
; CHECK-NEXT:    bx lr
entry:
  %0 = bitcast <4 x i16> %a to <8 x i8>
  %1 = bitcast <4 x bfloat> %b to <8 x i8>
  %2 = bitcast <4 x bfloat> %c to <8 x i8>
  %vbsl_v.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %0, <8 x i8> %1, <8 x i8> %2)
  %3 = bitcast <8 x i8> %vbsl_v.i to <4 x bfloat>
  ret <4 x bfloat> %3
}

define dso_local <8 x bfloat> @test_vbslq_bf16(<8 x i16> %a, <8 x bfloat> %b, <8 x bfloat> %c) {
; CHECK-LABEL: test_vbslq_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vbsl q0, q1, q2
; CHECK-NEXT:    bx lr
entry:
  %0 = bitcast <8 x i16> %a to <16 x i8>
  %1 = bitcast <8 x bfloat> %b to <16 x i8>
  %2 = bitcast <8 x bfloat> %c to <16 x i8>
  %vbslq_v.i = tail call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %0, <16 x i8> %1, <16 x i8> %2)
  %3 = bitcast <16 x i8> %vbslq_v.i to <8 x bfloat>
  ret <8 x bfloat> %3
}

; Zip: the two shuffles interleave the low halves and the high halves of %a
; and %b; both results are returned in one aggregate.
define dso_local %struct.float16x4x2_t @test_vzip_bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; CHECK-LABEL: test_vzip_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vzip.16 d0, d1
; CHECK-NEXT:    bx lr
entry:
  %vzip.i = shufflevector <4 x bfloat> %a, <4 x bfloat> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %vzip1.i = shufflevector <4 x bfloat> %a, <4 x bfloat> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  %.fca.0.0.insert = insertvalue %struct.float16x4x2_t undef, <4 x bfloat> %vzip.i, 0, 0
  %.fca.0.1.insert = insertvalue %struct.float16x4x2_t %.fca.0.0.insert, <4 x bfloat> %vzip1.i, 0, 1
  ret %struct.float16x4x2_t %.fca.0.1.insert
}

define dso_local %struct.float16x8x2_t @test_vzipq_bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; CHECK-LABEL: test_vzipq_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vzip.16 q0, q1
; CHECK-NEXT:    bx lr
entry:
  %vzip.i = shufflevector <8 x bfloat> %a, <8 x bfloat> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
  %vzip1.i = shufflevector <8 x bfloat> %a, <8 x bfloat> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  %.fca.0.0.insert = insertvalue %struct.float16x8x2_t undef, <8 x bfloat> %vzip.i, 0, 0
  %.fca.0.1.insert = insertvalue %struct.float16x8x2_t %.fca.0.0.insert, <8 x bfloat> %vzip1.i, 0, 1
  ret %struct.float16x8x2_t %.fca.0.1.insert
}

; Unzip: the two shuffles pick the even-indexed and the odd-indexed elements
; of the concatenation of %a and %b.
define dso_local %struct.float16x4x2_t @test_vuzp_bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; CHECK-LABEL: test_vuzp_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vuzp.16 d0, d1
; CHECK-NEXT:    bx lr
entry:
  %vuzp.i = shufflevector <4 x bfloat> %a, <4 x bfloat> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %vuzp1.i = shufflevector <4 x bfloat> %a, <4 x bfloat> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %.fca.0.0.insert = insertvalue %struct.float16x4x2_t undef, <4 x bfloat> %vuzp.i, 0, 0
  %.fca.0.1.insert = insertvalue %struct.float16x4x2_t %.fca.0.0.insert, <4 x bfloat> %vuzp1.i, 0, 1
  ret %struct.float16x4x2_t %.fca.0.1.insert
}

define dso_local %struct.float16x8x2_t @test_vuzpq_bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; CHECK-LABEL: test_vuzpq_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vuzp.16 q0, q1
; CHECK-NEXT:    bx lr
entry:
  %vuzp.i = shufflevector <8 x bfloat> %a, <8 x bfloat> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %vuzp1.i = shufflevector <8 x bfloat> %a, <8 x bfloat> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %.fca.0.0.insert = insertvalue %struct.float16x8x2_t undef, <8 x bfloat> %vuzp.i, 0, 0
  %.fca.0.1.insert = insertvalue %struct.float16x8x2_t %.fca.0.0.insert, <8 x bfloat> %vuzp1.i, 0, 1
  ret %struct.float16x8x2_t %.fca.0.1.insert
}

; Transpose: the two shuffles pair up the even-indexed, then the odd-indexed,
; elements of %a and %b.
define dso_local %struct.float16x4x2_t @test_vtrn_bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; CHECK-LABEL: test_vtrn_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vtrn.16 d0, d1
; CHECK-NEXT:    bx lr
entry:
  %vtrn.i = shufflevector <4 x bfloat> %a, <4 x bfloat> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %vtrn1.i = shufflevector <4 x bfloat> %a, <4 x bfloat> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  %.fca.0.0.insert = insertvalue %struct.float16x4x2_t undef, <4 x bfloat> %vtrn.i, 0, 0
  %.fca.0.1.insert = insertvalue %struct.float16x4x2_t %.fca.0.0.insert, <4 x bfloat> %vtrn1.i, 0, 1
  ret %struct.float16x4x2_t %.fca.0.1.insert
}

define dso_local %struct.float16x8x2_t @test_vtrnq_bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; CHECK-LABEL: test_vtrnq_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vtrn.16 q0, q1
; CHECK-NEXT:    bx lr
entry:
  %vtrn.i = shufflevector <8 x bfloat> %a, <8 x bfloat> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
  %vtrn1.i = shufflevector <8 x bfloat> %a, <8 x bfloat> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %.fca.0.0.insert = insertvalue %struct.float16x8x2_t undef, <8 x bfloat> %vtrn.i, 0, 0
  %.fca.0.1.insert = insertvalue %struct.float16x8x2_t %.fca.0.0.insert, <8 x bfloat> %vtrn1.i, 0, 1
  ret %struct.float16x8x2_t %.fca.0.1.insert
}

; Scalar splats: the bfloat value is taken from the low 16 bits of the float
; argument (bitcast to i32, trunc to i16, bitcast to bfloat) and broadcast to
; every lane. Expectations differ between the two feature sets.
define dso_local <4 x bfloat> @test_vmov_n_bf16(float %a.coerce) {
; CHECK-NOFP16-LABEL: test_vmov_n_bf16:
; CHECK-NOFP16:       @ %bb.0: @ %entry
; CHECK-NOFP16-NEXT:    .pad #4
; CHECK-NOFP16-NEXT:    sub sp, sp, #4
; CHECK-NOFP16-NEXT:    vmov r0, s0
; CHECK-NOFP16-NEXT:    strh r0, [sp, #2]
; CHECK-NOFP16-NEXT:    add r0, sp, #2
; CHECK-NOFP16-NEXT:    vld1.16 {d0[]}, [r0:16]
; CHECK-NOFP16-NEXT:    add sp, sp, #4
; CHECK-NOFP16-NEXT:    bx lr
;
; CHECK-FP16-LABEL: test_vmov_n_bf16:
; CHECK-FP16:       @ %bb.0: @ %entry
; CHECK-FP16-NEXT:    @ kill: def $s0 killed $s0 def $d0
; CHECK-FP16-NEXT:    vdup.16 d0, d0[0]
; CHECK-FP16-NEXT:    bx lr
entry:
  %0 = bitcast float %a.coerce to i32
  %tmp.0.extract.trunc = trunc i32 %0 to i16
  %1 = bitcast i16 %tmp.0.extract.trunc to bfloat
  %vecinit = insertelement <4 x bfloat> undef, bfloat %1, i32 0
  %vecinit4 = shufflevector <4 x bfloat> %vecinit, <4 x bfloat> undef, <4 x i32> zeroinitializer
  ret <4 x bfloat> %vecinit4
}

define dso_local <8 x bfloat> @test_vmovq_n_bf16(float %a.coerce) {
; CHECK-NOFP16-LABEL: test_vmovq_n_bf16:
; CHECK-NOFP16:       @ %bb.0: @ %entry
; CHECK-NOFP16-NEXT:    .pad #4
; CHECK-NOFP16-NEXT:    sub sp, sp, #4
; CHECK-NOFP16-NEXT:    vmov r0, s0
; CHECK-NOFP16-NEXT:    strh r0, [sp, #2]
; CHECK-NOFP16-NEXT:    add r0, sp, #2
; CHECK-NOFP16-NEXT:    vld1.16 {d0[], d1[]}, [r0:16]
; CHECK-NOFP16-NEXT:    add sp, sp, #4
; CHECK-NOFP16-NEXT:    bx lr
;
; CHECK-FP16-LABEL: test_vmovq_n_bf16:
; CHECK-FP16:       @ %bb.0: @ %entry
; CHECK-FP16-NEXT:    @ kill: def $s0 killed $s0 def $d0
; CHECK-FP16-NEXT:    vdup.16 q0, d0[0]
; CHECK-FP16-NEXT:    bx lr
entry:
  %0 = bitcast float %a.coerce to i32
  %tmp.0.extract.trunc = trunc i32 %0 to i16
  %1 = bitcast i16 %tmp.0.extract.trunc to bfloat
  %vecinit = insertelement <8 x bfloat> undef, bfloat %1, i32 0
  %vecinit8 = shufflevector <8 x bfloat> %vecinit, <8 x bfloat> undef, <8 x i32> zeroinitializer
  ret <8 x bfloat> %vecinit8
}

define dso_local <4 x bfloat> @test_vdup_n_bf16(float %a.coerce) {
; CHECK-NOFP16-LABEL: test_vdup_n_bf16:
; CHECK-NOFP16:       @ %bb.0: @ %entry
; CHECK-NOFP16-NEXT:    .pad #4
; CHECK-NOFP16-NEXT:    sub sp, sp, #4
; CHECK-NOFP16-NEXT:    vmov r0, s0
; CHECK-NOFP16-NEXT:    strh r0, [sp, #2]
; CHECK-NOFP16-NEXT:    add r0, sp, #2
; CHECK-NOFP16-NEXT:    vld1.16 {d0[]}, [r0:16]
; CHECK-NOFP16-NEXT:    add sp, sp, #4
; CHECK-NOFP16-NEXT:    bx lr
;
; CHECK-FP16-LABEL: test_vdup_n_bf16:
; CHECK-FP16:       @ %bb.0: @ %entry
; CHECK-FP16-NEXT:    @ kill: def $s0 killed $s0 def $d0
; CHECK-FP16-NEXT:    vdup.16 d0, d0[0]
; CHECK-FP16-NEXT:    bx lr
entry:
  %0 = bitcast float %a.coerce to i32
  %tmp.0.extract.trunc = trunc i32 %0 to i16
  %1 = bitcast i16 %tmp.0.extract.trunc to bfloat
  %vecinit = insertelement <4 x bfloat> undef, bfloat %1, i32 0
  %vecinit4 = shufflevector <4 x bfloat> %vecinit, <4 x bfloat> undef, <4 x i32> zeroinitializer
  ret <4 x bfloat> %vecinit4
}

define dso_local <8 x bfloat> @test_vdupq_n_bf16(float %a.coerce) {
; CHECK-NOFP16-LABEL: test_vdupq_n_bf16:
; CHECK-NOFP16:       @ %bb.0: @ %entry
; CHECK-NOFP16-NEXT:    .pad #4
; CHECK-NOFP16-NEXT:    sub sp, sp, #4
; CHECK-NOFP16-NEXT:    vmov r0, s0
; CHECK-NOFP16-NEXT:    strh r0, [sp, #2]
; CHECK-NOFP16-NEXT:    add r0, sp, #2
; CHECK-NOFP16-NEXT:    vld1.16 {d0[], d1[]}, [r0:16]
; CHECK-NOFP16-NEXT:    add sp, sp, #4
; CHECK-NOFP16-NEXT:    bx lr
;
; CHECK-FP16-LABEL: test_vdupq_n_bf16:
; CHECK-FP16:       @ %bb.0: @ %entry
; CHECK-FP16-NEXT:    @ kill: def $s0 killed $s0 def $d0
; CHECK-FP16-NEXT:    vdup.16 q0, d0[0]
; CHECK-FP16-NEXT:    bx lr
entry:
  %0 = bitcast float %a.coerce to i32
  %tmp.0.extract.trunc = trunc i32 %0 to i16
  %1 = bitcast i16 %tmp.0.extract.trunc to bfloat
  %vecinit = insertelement <8 x bfloat> undef, bfloat %1, i32 0
  %vecinit8 = shufflevector <8 x bfloat> %vecinit, <8 x bfloat> undef, <8 x i32> zeroinitializer
  ret <8 x bfloat> %vecinit8
}

; Lane splats: broadcast element 3 of %a to every lane of the result.
define dso_local <4 x bfloat> @test_vdup_lane_bf16(<4 x bfloat> %a) {
; CHECK-LABEL: test_vdup_lane_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vdup.16 d0, d0[3]
; CHECK-NEXT:    bx lr
entry:
  %shuffle = shufflevector <4 x bfloat> %a, <4 x bfloat> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
  ret <4 x bfloat> %shuffle
}

define dso_local <8 x bfloat> @test_vdupq_lane_bf16(<4 x bfloat> %a) {
; CHECK-LABEL: test_vdupq_lane_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    @ kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    vdup.16 q0, d0[3]
; CHECK-NEXT:    bx lr
entry:
  %shuffle = shufflevector <4 x bfloat> %a, <4 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  ret <8 x bfloat> %shuffle
}

; Extract: a window of consecutive elements taken from the concatenation of
; the two shuffle operands.
define dso_local <4 x bfloat> @test_vext_bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; CHECK-LABEL: test_vext_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vext.16 d0, d0, d1, #2
; CHECK-NEXT:    bx lr
entry:
  %vext = shufflevector <4 x bfloat> %a, <4 x bfloat> %b, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  ret <4 x bfloat> %vext
}

define dso_local <8 x bfloat> @test_vextq_bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; CHECK-LABEL: test_vextq_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vext.16 q0, q0, q1, #5
; CHECK-NEXT:    bx lr
entry:
  %vext = shufflevector <8 x bfloat> %a, <8 x bfloat> %b, <8 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12>
  ret <8 x bfloat> %vext
}

; Four-element window starting at element 4 of an eight-element input.
define dso_local <4 x bfloat> @test_vext_aligned_bf16(<8 x bfloat> %a) {
; CHECK-LABEL: test_vext_aligned_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.f64 d0, d1
; CHECK-NEXT:    bx lr
entry:
  %vext = shufflevector <8 x bfloat> %a, <8 x bfloat> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  ret <4 x bfloat> %vext
}

; Four-element window starting at element 3 of an eight-element input.
define dso_local <4 x bfloat> @test_vext_unaligned_bf16(<8 x bfloat> %a) {
; CHECK-LABEL: test_vext_unaligned_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vext.16 d0, d0, d1, #3
; CHECK-NEXT:    bx lr
entry:
  %vext = shufflevector <8 x bfloat> %a, <8 x bfloat> undef, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
  ret <4 x bfloat> %vext
}

; Stride-3 extraction from a <32 x bfloat> input, starting at element 0, 1
; and 2 respectively. Expectations differ between the two feature sets.
define arm_aapcs_vfpcc <8 x bfloat> @shuffle3step0_bf16(<32 x bfloat> %src) {
; CHECK-NOFP16-LABEL: shuffle3step0_bf16:
; CHECK-NOFP16:       @ %bb.0: @ %entry
; CHECK-NOFP16-NEXT:    vmov r1, s0
; CHECK-NOFP16-NEXT:    vmov.u16 r0, d0[3]
; CHECK-NOFP16-NEXT:    vrev32.16 d16, d3
; CHECK-NOFP16-NEXT:    vext.16 d17, d4, d5, #2
; CHECK-NOFP16-NEXT:    vext.16 d16, d16, d3, #1
; CHECK-NOFP16-NEXT:    vext.16 d16, d17, d16, #2
; CHECK-NOFP16-NEXT:    vext.16 d16, d16, d17, #1
; CHECK-NOFP16-NEXT:    vext.16 d17, d16, d16, #1
; CHECK-NOFP16-NEXT:    vmov.16 d16[0], r1
; CHECK-NOFP16-NEXT:    vmov.16 d16[1], r0
; CHECK-NOFP16-NEXT:    vmov r0, s3
; CHECK-NOFP16-NEXT:    vmov.16 d16[2], r0
; CHECK-NOFP16-NEXT:    vmov.u16 r0, d2[1]
; CHECK-NOFP16-NEXT:    vmov.16 d16[3], r0
; CHECK-NOFP16-NEXT:    vorr q0, q8, q8
; CHECK-NOFP16-NEXT:    bx lr
;
; CHECK-FP16-LABEL: shuffle3step0_bf16:
; CHECK-FP16:       @ %bb.0: @ %entry
; CHECK-FP16-NEXT:    vmov r1, s0
; CHECK-FP16-NEXT:    vext.16 d17, d4, d5, #2
; CHECK-FP16-NEXT:    vmovx.f16 s8, s1
; CHECK-FP16-NEXT:    vrev32.16 d16, d3
; CHECK-FP16-NEXT:    vmov r0, s8
; CHECK-FP16-NEXT:    vext.16 d16, d16, d3, #1
; CHECK-FP16-NEXT:    vext.16 d16, d17, d16, #2
; CHECK-FP16-NEXT:    vext.16 d16, d16, d17, #1
; CHECK-FP16-NEXT:    vext.16 d17, d16, d16, #1
; CHECK-FP16-NEXT:    vmov.16 d16[0], r1
; CHECK-FP16-NEXT:    vmov.16 d16[1], r0
; CHECK-FP16-NEXT:    vmov r0, s3
; CHECK-FP16-NEXT:    vmovx.f16 s0, s4
; CHECK-FP16-NEXT:    vmov.16 d16[2], r0
; CHECK-FP16-NEXT:    vmov r0, s0
; CHECK-FP16-NEXT:    vmov.16 d16[3], r0
; CHECK-FP16-NEXT:    vorr q0, q8, q8
; CHECK-FP16-NEXT:    bx lr
entry:
  %s1 = shufflevector <32 x bfloat> %src, <32 x bfloat> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
  ret <8 x bfloat> %s1
}

define arm_aapcs_vfpcc <8 x bfloat> @shuffle3step1_bf16(<32 x bfloat> %src) {
; CHECK-NOFP16-LABEL: shuffle3step1_bf16:
; CHECK-NOFP16:       @ %bb.0: @ %entry
; CHECK-NOFP16-NEXT:    vorr q3, q0, q0
; CHECK-NOFP16-NEXT:    vmov.u16 r1, d6[1]
; CHECK-NOFP16-NEXT:    vmov r0, s14
; CHECK-NOFP16-NEXT:    vmov.16 d0[0], r1
; CHECK-NOFP16-NEXT:    vmov.16 d0[1], r0
; CHECK-NOFP16-NEXT:    vmov.u16 r0, d7[3]
; CHECK-NOFP16-NEXT:    vmov.16 d0[2], r0
; CHECK-NOFP16-NEXT:    vmov r0, s5
; CHECK-NOFP16-NEXT:    vdup.16 q1, d3[1]
; CHECK-NOFP16-NEXT:    vmov r1, s4
; CHECK-NOFP16-NEXT:    vmov.16 d0[3], r0
; CHECK-NOFP16-NEXT:    vmov r0, s8
; CHECK-NOFP16-NEXT:    vmov.16 d1[0], r1
; CHECK-NOFP16-NEXT:    vmov.16 d1[1], r0
; CHECK-NOFP16-NEXT:    vmov.u16 r0, d4[3]
; CHECK-NOFP16-NEXT:    vmov.16 d1[2], r0
; CHECK-NOFP16-NEXT:    vmov r0, s11
; CHECK-NOFP16-NEXT:    vmov.16 d1[3], r0
; CHECK-NOFP16-NEXT:    bx lr
;
; CHECK-FP16-LABEL: shuffle3step1_bf16:
; CHECK-FP16:       @ %bb.0: @ %entry
; CHECK-FP16-NEXT:    vorr q3, q0, q0
; CHECK-FP16-NEXT:    vmovx.f16 s0, s12
; CHECK-FP16-NEXT:    vmovx.f16 s12, s15
; CHECK-FP16-NEXT:    vmov r1, s0
; CHECK-FP16-NEXT:    vmov r0, s14
; CHECK-FP16-NEXT:    vmov.16 d0[0], r1
; CHECK-FP16-NEXT:    vmov.16 d0[1], r0
; CHECK-FP16-NEXT:    vmov r0, s12
; CHECK-FP16-NEXT:    vmov.16 d0[2], r0
; CHECK-FP16-NEXT:    vmov r0, s5
; CHECK-FP16-NEXT:    vdup.16 q1, d3[1]
; CHECK-FP16-NEXT:    vmov r1, s4
; CHECK-FP16-NEXT:    vmovx.f16 s4, s9
; CHECK-FP16-NEXT:    vmov.16 d0[3], r0
; CHECK-FP16-NEXT:    vmov r0, s8
; CHECK-FP16-NEXT:    vmov.16 d1[0], r1
; CHECK-FP16-NEXT:    vmov.16 d1[1], r0
; CHECK-FP16-NEXT:    vmov r0, s4
; CHECK-FP16-NEXT:    vmov.16 d1[2], r0
; CHECK-FP16-NEXT:    vmov r0, s11
; CHECK-FP16-NEXT:    vmov.16 d1[3], r0
; CHECK-FP16-NEXT:    bx lr
entry:
  %s1 = shufflevector <32 x bfloat> %src, <32 x bfloat> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
  ret <8 x bfloat> %s1
}

define arm_aapcs_vfpcc <8 x bfloat> @shuffle3step2_bf16(<32 x bfloat> %src) {
; CHECK-NOFP16-LABEL: shuffle3step2_bf16:
; CHECK-NOFP16:       @ %bb.0: @ %entry
; CHECK-NOFP16-NEXT:    vext.16 d16, d0, d1, #2
; CHECK-NOFP16-NEXT:    vmov.u16 r0, d4[1]
; CHECK-NOFP16-NEXT:    vext.16 d17, d16, d2, #3
; CHECK-NOFP16-NEXT:    vext.16 d16, d2, d16, #1
; CHECK-NOFP16-NEXT:    vdup.16 q1, d3[2]
; CHECK-NOFP16-NEXT:    vext.16 d16, d16, d17, #2
; CHECK-NOFP16-NEXT:    vmov r1, s4
; CHECK-NOFP16-NEXT:    vext.16 d0, d16, d16, #1
; CHECK-NOFP16-NEXT:    vmov.16 d1[0], r1
; CHECK-NOFP16-NEXT:    vmov.16 d1[1], r0
; CHECK-NOFP16-NEXT:    vmov r0, s10
; CHECK-NOFP16-NEXT:    vmov.16 d1[2], r0
; CHECK-NOFP16-NEXT:    vmov.u16 r0, d5[3]
; CHECK-NOFP16-NEXT:    vmov.16 d1[3], r0
; CHECK-NOFP16-NEXT:    bx lr
;
; CHECK-FP16-LABEL: shuffle3step2_bf16:
; CHECK-FP16:       @ %bb.0: @ %entry
; CHECK-FP16-NEXT:    vext.16 d16, d0, d1, #2
; CHECK-FP16-NEXT:    vmovx.f16 s12, s8
; CHECK-FP16-NEXT:    vmov r0, s12
; CHECK-FP16-NEXT:    vext.16 d17, d16, d2, #3
; CHECK-FP16-NEXT:    vext.16 d16, d2, d16, #1
; CHECK-FP16-NEXT:    vdup.16 q1, d3[2]
; CHECK-FP16-NEXT:    vext.16 d16, d16, d17, #2
; CHECK-FP16-NEXT:    vmov r1, s4
; CHECK-FP16-NEXT:    vmovx.f16 s4, s11
; CHECK-FP16-NEXT:    vext.16 d0, d16, d16, #1
; CHECK-FP16-NEXT:    vmov.16 d1[0], r1
; CHECK-FP16-NEXT:    vmov.16 d1[1], r0
; CHECK-FP16-NEXT:    vmov r0, s10
; CHECK-FP16-NEXT:    vmov.16 d1[2], r0
; CHECK-FP16-NEXT:    vmov r0, s4
; CHECK-FP16-NEXT:    vmov.16 d1[3], r0
; CHECK-FP16-NEXT:    bx lr
entry:
  %s1 = shufflevector <32 x bfloat> %src, <32 x bfloat> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
  ret <8 x bfloat> %s1
}


; Reversals: element order is reversed within each group of four elements
; (vrev64 tests) or within each pair of elements (vrev32 tests).
define dso_local <4 x bfloat> @test_vrev64_bf16(<4 x bfloat> %a) {
; CHECK-LABEL: test_vrev64_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vrev64.16 d0, d0
; CHECK-NEXT:    bx lr
entry:
  %shuffle.i = shufflevector <4 x bfloat> %a, <4 x bfloat> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x bfloat> %shuffle.i
}

define dso_local <8 x bfloat> @test_vrev64q_bf16(<8 x bfloat> %a) {
; CHECK-LABEL: test_vrev64q_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vrev64.16 q0, q0
; CHECK-NEXT:    bx lr
entry:
  %shuffle.i = shufflevector <8 x bfloat> %a, <8 x bfloat> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
  ret <8 x bfloat> %shuffle.i
}

define dso_local <4 x bfloat> @test_vrev32_bf16(<4 x bfloat> %a) {
; CHECK-LABEL: test_vrev32_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vrev32.16 d0, d0
; CHECK-NEXT:    bx lr
entry:
  %shuffle.i = shufflevector <4 x bfloat> %a, <4 x bfloat> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  ret <4 x bfloat> %shuffle.i
}

define dso_local <8 x bfloat> @test_vrev32q_bf16(<8 x bfloat> %a) {
; CHECK-LABEL: test_vrev32q_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vrev32.16 q0, q0
; CHECK-NEXT:    bx lr
entry:
  %shuffle.i = shufflevector <8 x bfloat> %a, <8 x bfloat> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  ret <8 x bfloat> %shuffle.i
}

; Load-and-splat: one bfloat is loaded from %b and placed in every lane,
; built with four insertelement instructions in the 4-lane variant and with
; insertelement plus a zero-mask shuffle in the 8-lane variant.
define <4 x bfloat> @test_vld_dup1_4xbfloat(ptr %b) {
; CHECK-LABEL: test_vld_dup1_4xbfloat:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld1.16 {d0[]}, [r0:16]
; CHECK-NEXT:    bx lr
entry:
  %b1 = load bfloat, ptr %b, align 2
  %vecinit = insertelement <4 x bfloat> undef, bfloat %b1, i32 0
  %vecinit2 = insertelement <4 x bfloat> %vecinit, bfloat %b1, i32 1
  %vecinit3 = insertelement <4 x bfloat> %vecinit2, bfloat %b1, i32 2
  %vecinit4 = insertelement <4 x bfloat> %vecinit3, bfloat %b1, i32 3
  ret <4 x bfloat> %vecinit4
}

define <8 x bfloat> @test_vld_dup1_8xbfloat(ptr %b) local_unnamed_addr {
; CHECK-LABEL: test_vld_dup1_8xbfloat:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld1.16 {d0[], d1[]}, [r0:16]
; CHECK-NEXT:    bx lr
entry:
  %b1 = load bfloat, ptr %b, align 2
  %vecinit = insertelement <8 x bfloat> undef, bfloat %b1, i32 0
  %vecinit8 = shufflevector <8 x bfloat> %vecinit, <8 x bfloat> undef, <8 x i32> zeroinitializer
  ret <8 x bfloat> %vecinit8
}

; Concatenation of %a with itself into an eight-element result.
define <8 x bfloat> @test_shufflevector8xbfloat(<4 x bfloat> %a) {
; CHECK-LABEL: test_shufflevector8xbfloat:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    @ kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    vmov.f64 d1, d0
; CHECK-NEXT:    bx lr
entry:
  %r = shufflevector <4 x bfloat> %a, <4 x bfloat> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x bfloat> %r
}

declare <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8>, <8 x i8>, <8 x i8>)
declare <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)