; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512bf16 -mattr=+avx512vl --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bf16 -mattr=+avx512vl --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64

declare <8 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float>, <4 x float>) #1

define <2 x i64> @test_mm_cvtne2ps2bf16_128(<4 x float> %A, <4 x float> %B) local_unnamed_addr #0 {
; CHECK-LABEL: test_mm_cvtne2ps2bf16_128:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vcvtne2ps2bf16 %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7f,0x08,0x72,0xc1]
; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
entry:
  %0 = tail call <8 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %A, <4 x float> %B) #2
  %1 = bitcast <8 x i16> %0 to <2 x i64>
  ret <2 x i64> %1
}

define <2 x i64> @test_mm_maskz_cvtne2ps2bf16_128(<4 x float> %A, <4 x float> %B, i8 zeroext %U) local_unnamed_addr #0 {
; X86-LABEL: test_mm_maskz_cvtne2ps2bf16_128:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vcvtne2ps2bf16 %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7f,0x08,0x72,0xc1]
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
; X86-NEXT:    vmovdqu16 %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0xff,0x89,0x6f,0xc0]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_maskz_cvtne2ps2bf16_128:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vcvtne2ps2bf16 %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7f,0x08,0x72,0xc1]
; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT:    vmovdqu16 %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0xff,0x89,0x6f,0xc0]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = tail call <8 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %A, <4 x float> %B) #2
  %1 = bitcast i8 %U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i16> %0, <8 x i16> zeroinitializer
  %3 = bitcast <8 x i16> %2 to <2 x i64>
  ret <2 x i64> %3
}

define <2 x i64> @test_mm_mask_cvtne2ps2bf16_128(<2 x i64> %C, i8 zeroext %U, <4 x float> %A, <4 x float> %B) local_unnamed_addr #0 {
; X86-LABEL: test_mm_mask_cvtne2ps2bf16_128:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vcvtne2ps2bf16 %xmm2, %xmm1, %xmm1 # encoding: [0x62,0xf2,0x77,0x08,0x72,0xca]
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
; X86-NEXT:    vmovdqu16 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0xc1]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_mask_cvtne2ps2bf16_128:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vcvtne2ps2bf16 %xmm2, %xmm1, %xmm1 # encoding: [0x62,0xf2,0x77,0x08,0x72,0xca]
; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT:    vmovdqu16 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0xc1]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = tail call <8 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %A, <4 x float> %B) #2
  %1 = bitcast <2 x i64> %C to <8 x i16>
  %2 = bitcast i8 %U to <8 x i1>
  %3 = select <8 x i1> %2, <8 x i16> %0, <8 x i16> %1
  %4 = bitcast <8 x i16> %3 to <2 x i64>
  ret <2 x i64> %4
}

declare <16 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float>, <8 x float>) #3

define <4 x i64> @test_mm256_cvtne2ps2bf16_256(<8 x float> %A, <8 x float> %B) local_unnamed_addr #1 {
; CHECK-LABEL: test_mm256_cvtne2ps2bf16_256:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vcvtne2ps2bf16 %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf2,0x7f,0x28,0x72,0xc1]
; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
entry:
  %0 = tail call <16 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %A, <8 x float> %B) #4
  %1 = bitcast <16 x i16> %0 to <4 x i64>
  ret <4 x i64> %1
}

define <4 x i64> @test_mm256_maskz_cvtne2ps2bf16_256(<8 x float> %A, <8 x float> %B, i16 zeroext %U) local_unnamed_addr #1 {
; X86-LABEL: test_mm256_maskz_cvtne2ps2bf16_256:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vcvtne2ps2bf16 %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf2,0x7f,0x28,0x72,0xc1]
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
; X86-NEXT:    vmovdqu16 %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0xff,0xa9,0x6f,0xc0]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_maskz_cvtne2ps2bf16_256:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vcvtne2ps2bf16 %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf2,0x7f,0x28,0x72,0xc1]
; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT:    vmovdqu16 %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0xff,0xa9,0x6f,0xc0]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = tail call <16 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %A, <8 x float> %B) #4
  %1 = bitcast i16 %U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x i16> %0, <16 x i16> zeroinitializer
  %3 = bitcast <16 x i16> %2 to <4 x i64>
  ret <4 x i64> %3
}

define <4 x i64> @test_mm256_mask_cvtne2ps2bf16_256(<4 x i64> %C, i16 zeroext %U, <8 x float> %A, <8 x float> %B) local_unnamed_addr #1 {
; X86-LABEL: test_mm256_mask_cvtne2ps2bf16_256:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vcvtne2ps2bf16 %ymm2, %ymm1, %ymm1 # encoding: [0x62,0xf2,0x77,0x28,0x72,0xca]
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
; X86-NEXT:    vmovdqu16 %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf1,0xff,0x29,0x6f,0xc1]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_mask_cvtne2ps2bf16_256:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vcvtne2ps2bf16 %ymm2, %ymm1, %ymm1 # encoding: [0x62,0xf2,0x77,0x28,0x72,0xca]
; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT:    vmovdqu16 %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf1,0xff,0x29,0x6f,0xc1]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = tail call <16 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %A, <8 x float> %B) #4
  %1 = bitcast <4 x i64> %C to <16 x i16>
  %2 = bitcast i16 %U to <16 x i1>
  %3 = select <16 x i1> %2, <16 x i16> %0, <16 x i16> %1
  %4 = bitcast <16 x i16> %3 to <4 x i64>
  ret <4 x i64> %4
}

declare <8 x i16> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float>) #3

define <2 x i64> @test_mm256_cvtneps2bf16_256(<8 x float> %A) local_unnamed_addr #2 {
; CHECK-LABEL: test_mm256_cvtneps2bf16_256:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vcvtneps2bf16 %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x72,0xc0]
; CHECK-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
entry:
  %0 = tail call <8 x i16> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> %A) #4
  %1 = bitcast <8 x i16> %0 to <2 x i64>
  ret <2 x i64> %1
}

define <2 x i64> @test_mm256_maskz_cvtneps2bf16_256(<8 x float> %A, i8 zeroext %U) local_unnamed_addr #2 {
; X86-LABEL: test_mm256_maskz_cvtneps2bf16_256:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vcvtneps2bf16 %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x72,0xc0]
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
; X86-NEXT:    vmovdqu16 %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0xff,0x89,0x6f,0xc0]
; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_maskz_cvtneps2bf16_256:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vcvtneps2bf16 %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x72,0xc0]
; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT:    vmovdqu16 %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0xff,0x89,0x6f,0xc0]
; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = tail call <8 x i16> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> %A) #4
  %1 = bitcast i8 %U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i16> %0, <8 x i16> zeroinitializer
  %3 = bitcast <8 x i16> %2 to <2 x i64>
  ret <2 x i64> %3
}

define <2 x i64> @test_mm256_mask_cvtneps2bf16_256(<2 x i64> %C, i8 zeroext %U, <8 x float> %A) local_unnamed_addr #2 {
; X86-LABEL: test_mm256_mask_cvtneps2bf16_256:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vcvtneps2bf16 %ymm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x28,0x72,0xc9]
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
; X86-NEXT:    vmovdqu16 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0xc1]
; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_mask_cvtneps2bf16_256:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vcvtneps2bf16 %ymm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x28,0x72,0xc9]
; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT:    vmovdqu16 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0xc1]
; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = tail call <8 x i16> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> %A) #4
  %1 = bitcast <2 x i64> %C to <8 x i16>
  %2 = bitcast i8 %U to <8 x i1>
  %3 = select <8 x i1> %2, <8 x i16> %0, <8 x i16> %1
  %4 = bitcast <8 x i16> %3 to <2 x i64>
  ret <2 x i64> %4
}

declare <8 x i16> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float>, <8 x i16>, <4 x i1>) #3

define <2 x i64> @test_mm128_cvtneps2bf16_128(<4 x float> %A) local_unnamed_addr #2 {
; CHECK-LABEL: test_mm128_cvtneps2bf16_128:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vcvtneps2bf16 %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x72,0xc0]
; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
entry:
  %0 = tail call <8 x i16> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %A, <8 x i16> undef, <4 x i1> <i1 true, i1 true, i1 true, i1 true>) #4
  %1 = bitcast <8 x i16> %0 to <2 x i64>
  ret <2 x i64> %1
}

define <2 x i64> @test_mm128_maskz_cvtneps2bf16_128(<4 x float> %A, i8 zeroext %U) local_unnamed_addr #2 {
; X86-LABEL: test_mm128_maskz_cvtneps2bf16_128:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
; X86-NEXT:    vcvtneps2bf16 %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x72,0xc0]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm128_maskz_cvtneps2bf16_128:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT:    vcvtneps2bf16 %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x72,0xc0]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = bitcast i8 %U to <8 x i1>
  %1 = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = tail call <8 x i16> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %A, <8 x i16> zeroinitializer, <4 x i1> %1) #4
  %3 = bitcast <8 x i16> %2 to <2 x i64>
  ret <2 x i64> %3
}

define <2 x i64> @test_mm128_mask_cvtneps2bf16_128(<2 x i64> %C, i8 zeroext %U, <4 x float> %A) local_unnamed_addr #2 {
; X86-LABEL: test_mm128_mask_cvtneps2bf16_128:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
; X86-NEXT:    vcvtneps2bf16 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x72,0xc1]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm128_mask_cvtneps2bf16_128:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT:    vcvtneps2bf16 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x72,0xc1]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = bitcast i8 %U to <8 x i1>
  %1 = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = bitcast <2 x i64> %C to <8 x i16>
  %3 = tail call <8 x i16> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %A, <8 x i16> %2, <4 x i1> %1) #4
  %4 = bitcast <8 x i16> %3 to <2 x i64>
  ret <2 x i64> %4
}

; Make sure we don't fold a select into the 128 bit form of cvtneps2bf16. It
; always writes zeros to bits 127:64 regardless of mask.
define <2 x i64> @test_mm128_cvtneps2bf16_128_select(<2 x i64> %C, i8 zeroext %U, <4 x float> %A) local_unnamed_addr #2 {
; X86-LABEL: test_mm128_cvtneps2bf16_128_select:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
; X86-NEXT:    vcvtneps2bf16 %xmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x08,0x72,0xc9]
; X86-NEXT:    vmovdqu16 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0xc1]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm128_cvtneps2bf16_128_select:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT:    vcvtneps2bf16 %xmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x08,0x72,0xc9]
; X64-NEXT:    vmovdqu16 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0xc1]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = bitcast i8 %U to <8 x i1>
  %1 = bitcast <2 x i64> %C to <8 x i16>
  %2 = tail call <8 x i16> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %A, <8 x i16> undef, <4 x i1> <i1 true, i1 true, i1 true, i1 true>) #4
  %3 = select <8 x i1> %0, <8 x i16> %2, <8 x i16> %1
  %4 = bitcast <8 x i16> %3 to <2 x i64>
  ret <2 x i64> %4
}

declare <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float>, <8 x i32>, <8 x i32>) #3

define <8 x float> @test_mm256_dpbf16ps_256(<8 x float> %E, <8 x i32> %A, <8 x i32> %B) local_unnamed_addr #2 {
; CHECK-LABEL: test_mm256_dpbf16ps_256:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vdpbf16ps %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x76,0x28,0x52,0xc2]
; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
entry:
  %0 = tail call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> %E, <8 x i32> %A, <8 x i32> %B) #4
  ret <8 x float> %0
}

define <8 x float> @test_mm256_maskz_dpbf16ps_256(<8 x float> %E, <8 x i32> %A, <8 x i32> %B, i8 zeroext %U) local_unnamed_addr #2 {
; X86-LABEL: test_mm256_maskz_dpbf16ps_256:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
; X86-NEXT:    vdpbf16ps %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0xa9,0x52,0xc2]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_maskz_dpbf16ps_256:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT:    vdpbf16ps %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0xa9,0x52,0xc2]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = tail call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> %E, <8 x i32> %A, <8 x i32> %B) #4
  %1 = bitcast i8 %U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer
  ret <8 x float> %2
}
define <8 x float> @test_mm256_mask_dpbf16ps_256(i8 zeroext %U, <8 x float> %E, <8 x i32> %A, <8 x i32> %B) local_unnamed_addr #2 {
; X86-LABEL: test_mm256_mask_dpbf16ps_256:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
; X86-NEXT:    vdpbf16ps %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x76,0x29,0x52,0xc2]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_mask_dpbf16ps_256:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT:    vdpbf16ps %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x76,0x29,0x52,0xc2]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = tail call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> %E, <8 x i32> %A, <8 x i32> %B) #4
  %1 = bitcast i8 %U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %E
  ret <8 x float> %2
}

declare <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float>, <4 x i32>, <4 x i32>) #3

define <4 x float> @test_mm128_dpbf16ps_128(<4 x float> %E, <4 x i32> %A, <4 x i32> %B) local_unnamed_addr #2 {
; CHECK-LABEL: test_mm128_dpbf16ps_128:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vdpbf16ps %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x76,0x08,0x52,0xc2]
; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
entry:
  %0 = tail call <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> %E, <4 x i32> %A, <4 x i32> %B) #4
  ret <4 x float> %0
}

define <4 x float> @test_mm128_maskz_dpbf16ps_128(<4 x float> %E, <4 x i32> %A, <4 x i32> %B, i4 zeroext %U) local_unnamed_addr #2 {
; X86-LABEL: test_mm128_maskz_dpbf16ps_128:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
; X86-NEXT:    vdpbf16ps %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0x89,0x52,0xc2]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm128_maskz_dpbf16ps_128:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT:    vdpbf16ps %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0x89,0x52,0xc2]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = tail call <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> %E, <4 x i32> %A, <4 x i32> %B) #4
  %1 = bitcast i4 %U to <4 x i1>
  %2 = select <4 x i1> %1, <4 x float> %0, <4 x float> zeroinitializer
  ret <4 x float> %2
}
define <4 x float> @test_mm128_mask_dpbf16ps_128(i4 zeroext %U, <4 x float> %E, <4 x i32> %A, <4 x i32> %B) local_unnamed_addr #2 {
; X86-LABEL: test_mm128_mask_dpbf16ps_128:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
; X86-NEXT:    vdpbf16ps %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x76,0x09,0x52,0xc2]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm128_mask_dpbf16ps_128:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT:    vdpbf16ps %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x76,0x09,0x52,0xc2]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = tail call <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> %E, <4 x i32> %A, <4 x i32> %B) #4
  %1 = bitcast i4 %U to <4 x i1>
  %2 = select <4 x i1> %1, <4 x float> %0, <4 x float> %E
  ret <4 x float> %2
}