; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64 -mattr=+v8.6a,+neon < %s | FileCheck %s
; RUN: llc -mtriple=aarch64 -mattr=+v8.6a,+neon,+bf16 < %s | FileCheck %s
; RUN: llc -mtriple=aarch64 -mattr=+v8.6a,+neon,+fullfp16,+bf16 < %s | FileCheck %s

; Code-generation tests for bfloat NEON shuffles on AArch64: zip/uzp/trn
; permutes, scalar and lane splats, ext windows, stride-3 table lookups,
; element reversal, load-and-replicate, and vector concatenation.
; The three llc invocations above differ only in -mattr (with and without
; +bf16 / +fullfp16) and share the default FileCheck prefix, so every
; configuration is expected to emit the same assembly.

; Two-vector aggregates returned by the zip/uzp/trn tests. The type names say
; float16, but the element type of both members is bfloat.
%struct.float16x4x2_t = type { [2 x <4 x bfloat>] }
%struct.float16x8x2_t = type { [2 x <8 x bfloat>] }

; Interleave %a and %b: member 0 takes the low halves (mask 0,4,1,5), member 1
; the high halves (mask 2,6,3,7). Expected to select zip1/zip2 on .4h.
define dso_local %struct.float16x4x2_t @test_vzip_bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; CHECK-LABEL: test_vzip_bf16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: zip1 v2.4h, v0.4h, v1.4h
; CHECK-NEXT: zip2 v1.4h, v0.4h, v1.4h
; CHECK-NEXT: fmov d0, d2
; CHECK-NEXT: ret
entry:
  %zip.lo = shufflevector <4 x bfloat> %a, <4 x bfloat> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %zip.hi = shufflevector <4 x bfloat> %a, <4 x bfloat> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  %pair.0 = insertvalue %struct.float16x4x2_t undef, <4 x bfloat> %zip.lo, 0, 0
  %pair.1 = insertvalue %struct.float16x4x2_t %pair.0, <4 x bfloat> %zip.hi, 0, 1
  ret %struct.float16x4x2_t %pair.1
}

; 128-bit form of the interleave above; expected to select zip1/zip2 on .8h.
define dso_local %struct.float16x8x2_t @test_vzipq_bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; CHECK-LABEL: test_vzipq_bf16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: zip1 v2.8h, v0.8h, v1.8h
; CHECK-NEXT: zip2 v1.8h, v0.8h, v1.8h
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
entry:
  %zip.lo = shufflevector <8 x bfloat> %a, <8 x bfloat> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
  %zip.hi = shufflevector <8 x bfloat> %a, <8 x bfloat> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  %pair.0 = insertvalue %struct.float16x8x2_t undef, <8 x bfloat> %zip.lo, 0, 0
  %pair.1 = insertvalue %struct.float16x8x2_t %pair.0, <8 x bfloat> %zip.hi, 0, 1
  ret %struct.float16x8x2_t %pair.1
}

; De-interleave %a:%b: member 0 takes the even-indexed elements, member 1 the
; odd-indexed ones. Expected to select uzp1/uzp2 on .4h.
define dso_local %struct.float16x4x2_t @test_vuzp_bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; CHECK-LABEL: test_vuzp_bf16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: uzp1 v2.4h, v0.4h, v1.4h
; CHECK-NEXT: uzp2 v1.4h, v0.4h, v1.4h
; CHECK-NEXT: fmov d0, d2
; CHECK-NEXT: ret
entry:
  %uzp.even = shufflevector <4 x bfloat> %a, <4 x bfloat> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %uzp.odd = shufflevector <4 x bfloat> %a, <4 x bfloat> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %pair.0 = insertvalue %struct.float16x4x2_t undef, <4 x bfloat> %uzp.even, 0, 0
  %pair.1 = insertvalue %struct.float16x4x2_t %pair.0, <4 x bfloat> %uzp.odd, 0, 1
  ret %struct.float16x4x2_t %pair.1
}

; 128-bit form of the de-interleave above; expected to select uzp1/uzp2 on .8h.
define dso_local %struct.float16x8x2_t @test_vuzpq_bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; CHECK-LABEL: test_vuzpq_bf16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: uzp1 v2.8h, v0.8h, v1.8h
; CHECK-NEXT: uzp2 v1.8h, v0.8h, v1.8h
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
entry:
  %uzp.even = shufflevector <8 x bfloat> %a, <8 x bfloat> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %uzp.odd = shufflevector <8 x bfloat> %a, <8 x bfloat> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %pair.0 = insertvalue %struct.float16x8x2_t undef, <8 x bfloat> %uzp.even, 0, 0
  %pair.1 = insertvalue %struct.float16x8x2_t %pair.0, <8 x bfloat> %uzp.odd, 0, 1
  ret %struct.float16x8x2_t %pair.1
}

; Transpose pairs of elements: member 0 uses mask 0,4,2,6 and member 1 uses
; mask 1,5,3,7. Expected to select trn1/trn2 on .4h.
define dso_local %struct.float16x4x2_t @test_vtrn_bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; CHECK-LABEL: test_vtrn_bf16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: trn1 v2.4h, v0.4h, v1.4h
; CHECK-NEXT: trn2 v1.4h, v0.4h, v1.4h
; CHECK-NEXT: fmov d0, d2
; CHECK-NEXT: ret
entry:
  %trn.even = shufflevector <4 x bfloat> %a, <4 x bfloat> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %trn.odd = shufflevector <4 x bfloat> %a, <4 x bfloat> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  %pair.0 = insertvalue %struct.float16x4x2_t undef, <4 x bfloat> %trn.even, 0, 0
  %pair.1 = insertvalue %struct.float16x4x2_t %pair.0, <4 x bfloat> %trn.odd, 0, 1
  ret %struct.float16x4x2_t %pair.1
}

; 128-bit form of the transpose above; expected to select trn1/trn2 on .8h.
define dso_local %struct.float16x8x2_t @test_vtrnq_bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; CHECK-LABEL: test_vtrnq_bf16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: trn1 v2.8h, v0.8h, v1.8h
; CHECK-NEXT: trn2 v1.8h, v0.8h, v1.8h
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
entry:
  %trn.even = shufflevector <8 x bfloat> %a, <8 x bfloat> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
  %trn.odd = shufflevector <8 x bfloat> %a, <8 x bfloat> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %pair.0 = insertvalue %struct.float16x8x2_t undef, <8 x bfloat> %trn.even, 0, 0
  %pair.1 = insertvalue %struct.float16x8x2_t %pair.0, <8 x bfloat> %trn.odd, 0, 1
  ret %struct.float16x8x2_t %pair.1
}

; The scalar arrives as a float; its low 16 bits are reinterpreted as a bfloat
; and splatted to all four lanes. Expected to select a dup from lane 0.
define dso_local <4 x bfloat> @test_vmov_n_bf16(float %a.coerce) {
; CHECK-LABEL: test_vmov_n_bf16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
; CHECK-NEXT: dup v0.4h, v0.h[0]
; CHECK-NEXT: ret
entry:
  %bits = bitcast float %a.coerce to i32
  %lo16 = trunc i32 %bits to i16
  %elt = bitcast i16 %lo16 to bfloat
  %lane0 = insertelement <4 x bfloat> undef, bfloat %elt, i32 0
  %splat = shufflevector <4 x bfloat> %lane0, <4 x bfloat> undef, <4 x i32> zeroinitializer
  ret <4 x bfloat> %splat
}

; Eight-lane form of the scalar splat above.
define dso_local <8 x bfloat> @test_vmovq_n_bf16(float %a.coerce) {
; CHECK-LABEL: test_vmovq_n_bf16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
; CHECK-NEXT: dup v0.8h, v0.h[0]
; CHECK-NEXT: ret
entry:
  %bits = bitcast float %a.coerce to i32
  %lo16 = trunc i32 %bits to i16
  %elt = bitcast i16 %lo16 to bfloat
  %lane0 = insertelement <8 x bfloat> undef, bfloat %elt, i32 0
  %splat = shufflevector <8 x bfloat> %lane0, <8 x bfloat> undef, <8 x i32> zeroinitializer
  ret <8 x bfloat> %splat
}

; Same instruction sequence as test_vmov_n_bf16, under the vdup_n name.
define dso_local <4 x bfloat> @test_vdup_n_bf16(float %a.coerce) {
; CHECK-LABEL: test_vdup_n_bf16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
; CHECK-NEXT: dup v0.4h, v0.h[0]
; CHECK-NEXT: ret
entry:
  %bits = bitcast float %a.coerce to i32
  %lo16 = trunc i32 %bits to i16
  %elt = bitcast i16 %lo16 to bfloat
  %lane0 = insertelement <4 x bfloat> undef, bfloat %elt, i32 0
  %splat = shufflevector <4 x bfloat> %lane0, <4 x bfloat> undef, <4 x i32> zeroinitializer
  ret <4 x bfloat> %splat
}

; Same instruction sequence as test_vmovq_n_bf16, under the vdupq_n name.
define dso_local <8 x bfloat> @test_vdupq_n_bf16(float %a.coerce) {
; CHECK-LABEL: test_vdupq_n_bf16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
; CHECK-NEXT: dup v0.8h, v0.h[0]
; CHECK-NEXT: ret
entry:
  %bits = bitcast float %a.coerce to i32
  %lo16 = trunc i32 %bits to i16
  %elt = bitcast i16 %lo16 to bfloat
  %lane0 = insertelement <8 x bfloat> undef, bfloat %elt, i32 0
  %splat = shufflevector <8 x bfloat> %lane0, <8 x bfloat> undef, <8 x i32> zeroinitializer
  ret <8 x bfloat> %splat
}

; Splat lane 3 of %a across four lanes; expected to select dup from h[3].
define dso_local <4 x bfloat> @test_vdup_lane_bf16(<4 x bfloat> %a) {
; CHECK-LABEL: test_vdup_lane_bf16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: dup v0.4h, v0.h[3]
; CHECK-NEXT: ret
entry:
  %splat = shufflevector <4 x bfloat> %a, <4 x bfloat> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
  ret <4 x bfloat> %splat
}

; Splat lane 3 of the four-lane %a across an eight-lane result.
define dso_local <8 x bfloat> @test_vdupq_lane_bf16(<4 x bfloat> %a) {
; CHECK-LABEL: test_vdupq_lane_bf16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: dup v0.8h, v0.h[3]
; CHECK-NEXT: ret
entry:
  %splat = shufflevector <4 x bfloat> %a, <4 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  ret <8 x bfloat> %splat
}

; Four consecutive elements of %a:%b starting at element 2; the expected ext
; uses byte offset #4.
define dso_local <4 x bfloat> @test_vext_bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; CHECK-LABEL: test_vext_bf16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ext v0.8b, v0.8b, v1.8b, #4
; CHECK-NEXT: ret
entry:
  %window = shufflevector <4 x bfloat> %a, <4 x bfloat> %b, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  ret <4 x bfloat> %window
}

; Eight consecutive elements of %a:%b starting at element 5; the expected ext
; uses byte offset #10.
define dso_local <8 x bfloat> @test_vextq_bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; CHECK-LABEL: test_vextq_bf16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #10
; CHECK-NEXT: ret
entry:
  %window = shufflevector <8 x bfloat> %a, <8 x bfloat> %b, <8 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12>
  ret <8 x bfloat> %window
}

; Extract the upper half (elements 4..7) of an eight-lane vector; the expected
; ext rotates the register onto itself by #8.
define dso_local <4 x bfloat> @test_vext_aligned_bf16(<8 x bfloat> %a) {
; CHECK-LABEL: test_vext_aligned_bf16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
entry:
  %window = shufflevector <8 x bfloat> %a, <8 x bfloat> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  ret <4 x bfloat> %window
}

; Extract elements 3..6, a window that straddles the two 64-bit halves; the
; expected ext rotates the register onto itself by #6.
define dso_local <4 x bfloat> @test_vext_unaligned_bf16(<8 x bfloat> %a) {
; CHECK-LABEL: test_vext_unaligned_bf16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #6
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
entry:
  %window = shufflevector <8 x bfloat> %a, <8 x bfloat> undef, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
  ret <4 x bfloat> %window
}

; Every third element of a 32-lane source, starting at element 0. The expected
; code is two chained tbl lookups whose index vectors come from the constant
; pool. The number in the .LCPI labels (16 here, 17 and 18 in the next two
; functions) lines up with the function's zero-based position in this file, so
; adding or moving functions above presumably requires regenerating the
; assertions.
define <8 x bfloat> @shuffle3step0_bf16(<32 x bfloat> %src) {
; CHECK-LABEL: shuffle3step0_bf16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: adrp x8, .LCPI16_0
; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
; CHECK-NEXT: mov v3.16b, v2.16b
; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI16_0]
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
; CHECK-NEXT: adrp x8, .LCPI16_1
; CHECK-NEXT: tbl v2.16b, { v0.16b, v1.16b }, v4.16b
; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI16_1]
; CHECK-NEXT: tbl v0.16b, { v2.16b, v3.16b }, v0.16b
; CHECK-NEXT: ret
entry:
  %picked = shufflevector <32 x bfloat> %src, <32 x bfloat> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
  ret <8 x bfloat> %picked
}

; Every third element of a 32-lane source, starting at element 1.
define <8 x bfloat> @shuffle3step1_bf16(<32 x bfloat> %src) {
; CHECK-LABEL: shuffle3step1_bf16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: adrp x8, .LCPI17_0
; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
; CHECK-NEXT: mov v3.16b, v2.16b
; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI17_0]
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
; CHECK-NEXT: adrp x8, .LCPI17_1
; CHECK-NEXT: tbl v2.16b, { v0.16b, v1.16b }, v4.16b
; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI17_1]
; CHECK-NEXT: tbl v0.16b, { v2.16b, v3.16b }, v0.16b
; CHECK-NEXT: ret
entry:
  %picked = shufflevector <32 x bfloat> %src, <32 x bfloat> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
  ret <8 x bfloat> %picked
}

; Every third element of a 32-lane source, starting at element 2.
define <8 x bfloat> @shuffle3step2_bf16(<32 x bfloat> %src) {
; CHECK-LABEL: shuffle3step2_bf16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: adrp x8, .LCPI18_0
; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
; CHECK-NEXT: mov v3.16b, v2.16b
; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI18_0]
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
; CHECK-NEXT: adrp x8, .LCPI18_1
; CHECK-NEXT: tbl v2.16b, { v0.16b, v1.16b }, v4.16b
; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI18_1]
; CHECK-NEXT: tbl v0.16b, { v2.16b, v3.16b }, v0.16b
; CHECK-NEXT: ret
entry:
  %picked = shufflevector <32 x bfloat> %src, <32 x bfloat> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
  ret <8 x bfloat> %picked
}


; Reverse the four elements of %a (mask 3,2,1,0); expected to select rev64.
define dso_local <4 x bfloat> @test_vrev64_bf16(<4 x bfloat> %a) {
; CHECK-LABEL: test_vrev64_bf16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: rev64 v0.4h, v0.4h
; CHECK-NEXT: ret
entry:
  %reversed = shufflevector <4 x bfloat> %a, <4 x bfloat> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x bfloat> %reversed
}

; Reverse the elements within each group of four (mask 3,2,1,0,7,6,5,4);
; expected to select rev64 on .8h.
define dso_local <8 x bfloat> @test_vrev64q_bf16(<8 x bfloat> %a) {
; CHECK-LABEL: test_vrev64q_bf16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: rev64 v0.8h, v0.8h
; CHECK-NEXT: ret
entry:
  %reversed = shufflevector <8 x bfloat> %a, <8 x bfloat> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
  ret <8 x bfloat> %reversed
}

; Swap the elements of each adjacent pair (mask 1,0,3,2); expected to select
; rev32 on .4h.
define dso_local <4 x bfloat> @test_vrev32_bf16(<4 x bfloat> %a) {
; CHECK-LABEL: test_vrev32_bf16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: rev32 v0.4h, v0.4h
; CHECK-NEXT: ret
entry:
  %reversed = shufflevector <4 x bfloat> %a, <4 x bfloat> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  ret <4 x bfloat> %reversed
}

; Eight-lane form of the pairwise swap above; expected to select rev32 on .8h.
define dso_local <8 x bfloat> @test_vrev32q_bf16(<8 x bfloat> %a) {
; CHECK-LABEL: test_vrev32q_bf16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: rev32 v0.8h, v0.8h
; CHECK-NEXT: ret
entry:
  %reversed = shufflevector <8 x bfloat> %a, <8 x bfloat> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  ret <8 x bfloat> %reversed
}

; Load one bfloat and write it into each of the four lanes with separate
; insertelement instructions; expected to fold into a single ld1r.
define <4 x bfloat> @test_vld_dup1_4xbfloat(ptr %b) {
; CHECK-LABEL: test_vld_dup1_4xbfloat:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ld1r { v0.4h }, [x0]
; CHECK-NEXT: ret
entry:
  %scalar = load bfloat, ptr %b, align 2
  %fill.0 = insertelement <4 x bfloat> undef, bfloat %scalar, i32 0
  %fill.1 = insertelement <4 x bfloat> %fill.0, bfloat %scalar, i32 1
  %fill.2 = insertelement <4 x bfloat> %fill.1, bfloat %scalar, i32 2
  %fill.3 = insertelement <4 x bfloat> %fill.2, bfloat %scalar, i32 3
  ret <4 x bfloat> %fill.3
}

; Load one bfloat, place it in lane 0 and splat it with a zero mask; expected
; to fold into a single ld1r on .8h.
define <8 x bfloat> @test_vld_dup1_8xbfloat(ptr %b) local_unnamed_addr {
; CHECK-LABEL: test_vld_dup1_8xbfloat:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ld1r { v0.8h }, [x0]
; CHECK-NEXT: ret
entry:
  %scalar = load bfloat, ptr %b, align 2
  %lane0 = insertelement <8 x bfloat> undef, bfloat %scalar, i32 0
  %splat = shufflevector <8 x bfloat> %lane0, <8 x bfloat> undef, <8 x i32> zeroinitializer
  ret <8 x bfloat> %splat
}

; Concatenate %a with itself (both shuffle operands are %a, identity mask
; 0..7); expected to copy the low 64-bit half into the high half.
define <8 x bfloat> @test_shufflevector8xbfloat(<4 x bfloat> %a) {
; CHECK-LABEL: test_shufflevector8xbfloat:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: mov v0.d[1], v0.d[0]
; CHECK-NEXT: ret
entry:
  %concat = shufflevector <4 x bfloat> %a, <4 x bfloat> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x bfloat> %concat
}
