; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-linux-gnu -mattr=avx512bf16,avx512fp16,avx512vl | FileCheck %s --check-prefixes=X86
; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck %s --check-prefixes=CHECK,SSE2
; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=avx512bf16,avx512vl | FileCheck %s --check-prefixes=CHECK,AVX,F16,BF16
; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=avx512bf16,avx512fp16,avx512vl | FileCheck %s --check-prefixes=CHECK,AVX,F16,FP16
; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=avxneconvert,f16c | FileCheck %s --check-prefixes=CHECK,AVX,BF16,AVXNC

define void @add(ptr %pa, ptr %pb, ptr %pc) nounwind {
; X86-LABEL: add:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movzwl (%edx), %edx
; X86-NEXT:    shll $16, %edx
; X86-NEXT:    vmovd %edx, %xmm0
; X86-NEXT:    movzwl (%ecx), %ecx
; X86-NEXT:    shll $16, %ecx
; X86-NEXT:    vmovd %ecx, %xmm1
; X86-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; X86-NEXT:    vcvtneps2bf16 %xmm0, %xmm0
; X86-NEXT:    vpextrw $0, %xmm0, (%eax)
; X86-NEXT:    retl
;
; SSE2-LABEL: add:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pushq %rbx
; SSE2-NEXT:    movq %rdx, %rbx
; SSE2-NEXT:    movzwl (%rsi), %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm1
; SSE2-NEXT:    movzwl (%rdi), %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movw %ax, (%rbx)
; SSE2-NEXT:    popq %rbx
; SSE2-NEXT:    retq
;
; F16-LABEL: add:
; F16:       # %bb.0:
; F16-NEXT:    movzwl (%rsi), %eax
; F16-NEXT:    shll $16, %eax
; F16-NEXT:    vmovd %eax, %xmm0
; F16-NEXT:    movzwl (%rdi), %eax
; F16-NEXT:    shll $16, %eax
; F16-NEXT:    vmovd %eax, %xmm1
; F16-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; F16-NEXT:    vcvtneps2bf16 %xmm0, %xmm0
; F16-NEXT:    vpextrw $0, %xmm0, (%rdx)
; F16-NEXT:    retq
;
; AVXNC-LABEL: add:
; AVXNC:       # %bb.0:
; AVXNC-NEXT:    movzwl (%rsi), %eax
; AVXNC-NEXT:    shll $16, %eax
; AVXNC-NEXT:    vmovd %eax, %xmm0
; AVXNC-NEXT:    movzwl (%rdi), %eax
; AVXNC-NEXT:    shll $16, %eax
; AVXNC-NEXT:    vmovd %eax, %xmm1
; AVXNC-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVXNC-NEXT:    {vex} vcvtneps2bf16 %xmm0, %xmm0
; AVXNC-NEXT:    vpextrw $0, %xmm0, (%rdx)
; AVXNC-NEXT:    retq
  %a = load bfloat, ptr %pa
  %b = load bfloat, ptr %pb
  %add = fadd bfloat %a, %b
  store bfloat %add, ptr %pc
  ret void
}

define bfloat @add2(bfloat %a, bfloat %b) nounwind {
; X86-LABEL: add2:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    shll $16, %eax
; X86-NEXT:    vmovd %eax, %xmm0
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    shll $16, %eax
; X86-NEXT:    vmovd %eax, %xmm1
; X86-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; X86-NEXT:    vcvtneps2bf16 %xmm0, %xmm0
; X86-NEXT:    vmovw %xmm0, %eax
; X86-NEXT:    vmovw %eax, %xmm0
; X86-NEXT:    retl
;
; SSE2-LABEL: add2:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pushq %rax
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    pextrw $0, %xmm1, %ecx
; SSE2-NEXT:    shll $16, %ecx
; SSE2-NEXT:    movd %ecx, %xmm1
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    popq %rax
; SSE2-NEXT:    retq
;
; FP16-LABEL: add2:
; FP16:       # %bb.0:
; FP16-NEXT:    vmovw %xmm0, %eax
; FP16-NEXT:    vmovw %xmm1, %ecx
; FP16-NEXT:    shll $16, %ecx
; FP16-NEXT:    vmovd %ecx, %xmm0
; FP16-NEXT:    shll $16, %eax
; FP16-NEXT:    vmovd %eax, %xmm1
; FP16-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; FP16-NEXT:    vcvtneps2bf16 %xmm0, %xmm0
; FP16-NEXT:    vmovw %xmm0, %eax
; FP16-NEXT:    vmovw %eax, %xmm0
; FP16-NEXT:    retq
;
; AVXNC-LABEL: add2:
; AVXNC:       # %bb.0:
; AVXNC-NEXT:    vpextrw $0, %xmm0, %eax
; AVXNC-NEXT:    vpextrw $0, %xmm1, %ecx
; AVXNC-NEXT:    shll $16, %ecx
; AVXNC-NEXT:    vmovd %ecx, %xmm0
; AVXNC-NEXT:    shll $16, %eax
; AVXNC-NEXT:    vmovd %eax, %xmm1
; AVXNC-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVXNC-NEXT:    {vex} vcvtneps2bf16 %xmm0, %xmm0
; AVXNC-NEXT:    vmovd %xmm0, %eax
; AVXNC-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
; AVXNC-NEXT:    retq
  %add = fadd bfloat %a, %b
  ret bfloat %add
}

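; None of these targets has a direct f64 -> bf16 conversion, so each fptrunc
; from double lowers to a __truncdfbf2 libcall; the fadd itself is still done
; in f32 by shifting the bf16 bits into the high half of an i32.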
define void @add_double(ptr %pa, ptr %pb, ptr %pc) nounwind {
; X86-LABEL: add_double:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebx
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    subl $16, %esp
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT:    vmovsd %xmm0, (%esp)
; X86-NEXT:    calll __truncdfbf2
; X86-NEXT:    vmovw %xmm0, %edi
; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT:    vmovsd %xmm0, (%esp)
; X86-NEXT:    calll __truncdfbf2
; X86-NEXT:    vmovw %xmm0, %eax
; X86-NEXT:    shll $16, %eax
; X86-NEXT:    vmovd %eax, %xmm0
; X86-NEXT:    shll $16, %edi
; X86-NEXT:    vmovd %edi, %xmm1
; X86-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; X86-NEXT:    vcvtneps2bf16 %xmm0, %xmm0
; X86-NEXT:    vmovw %xmm0, %eax
; X86-NEXT:    shll $16, %eax
; X86-NEXT:    vmovd %eax, %xmm0
; X86-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovsd %xmm0, (%esi)
; X86-NEXT:    addl $16, %esp
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    popl %ebx
; X86-NEXT:    retl
;
; SSE2-LABEL: add_double:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pushq %rbp
; SSE2-NEXT:    pushq %r14
; SSE2-NEXT:    pushq %rbx
; SSE2-NEXT:    movq %rdx, %rbx
; SSE2-NEXT:    movq %rsi, %r14
; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    callq __truncdfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebp
; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    callq __truncdfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm1
; SSE2-NEXT:    shll $16, %ebp
; SSE2-NEXT:    movd %ebp, %xmm0
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    cvtss2sd %xmm0, %xmm0
; SSE2-NEXT:    movsd %xmm0, (%rbx)
; SSE2-NEXT:    popq %rbx
; SSE2-NEXT:    popq %r14
; SSE2-NEXT:    popq %rbp
; SSE2-NEXT:    retq
;
; FP16-LABEL: add_double:
; FP16:       # %bb.0:
; FP16-NEXT:    pushq %rbp
; FP16-NEXT:    pushq %r14
; FP16-NEXT:    pushq %rbx
; FP16-NEXT:    movq %rdx, %rbx
; FP16-NEXT:    movq %rsi, %r14
; FP16-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; FP16-NEXT:    callq __truncdfbf2@PLT
; FP16-NEXT:    vmovw %xmm0, %ebp
; FP16-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; FP16-NEXT:    callq __truncdfbf2@PLT
; FP16-NEXT:    vmovw %xmm0, %eax
; FP16-NEXT:    shll $16, %eax
; FP16-NEXT:    vmovd %eax, %xmm0
; FP16-NEXT:    shll $16, %ebp
; FP16-NEXT:    vmovd %ebp, %xmm1
; FP16-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; FP16-NEXT:    vcvtneps2bf16 %xmm0, %xmm0
; FP16-NEXT:    vmovw %xmm0, %eax
; FP16-NEXT:    shll $16, %eax
; FP16-NEXT:    vmovd %eax, %xmm0
; FP16-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; FP16-NEXT:    vmovsd %xmm0, (%rbx)
; FP16-NEXT:    popq %rbx
; FP16-NEXT:    popq %r14
; FP16-NEXT:    popq %rbp
; FP16-NEXT:    retq
;
; AVXNC-LABEL: add_double:
; AVXNC:       # %bb.0:
; AVXNC-NEXT:    pushq %rbp
; AVXNC-NEXT:    pushq %r14
; AVXNC-NEXT:    pushq %rbx
; AVXNC-NEXT:    movq %rdx, %rbx
; AVXNC-NEXT:    movq %rsi, %r14
; AVXNC-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVXNC-NEXT:    callq __truncdfbf2@PLT
; AVXNC-NEXT:    vpextrw $0, %xmm0, %ebp
; AVXNC-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVXNC-NEXT:    callq __truncdfbf2@PLT
; AVXNC-NEXT:    vpextrw $0, %xmm0, %eax
; AVXNC-NEXT:    shll $16, %eax
; AVXNC-NEXT:    vmovd %eax, %xmm0
; AVXNC-NEXT:    shll $16, %ebp
; AVXNC-NEXT:    vmovd %ebp, %xmm1
; AVXNC-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVXNC-NEXT:    {vex} vcvtneps2bf16 %xmm0, %xmm0
; AVXNC-NEXT:    vmovd %xmm0, %eax
; AVXNC-NEXT:    shll $16, %eax
; AVXNC-NEXT:    vmovd %eax, %xmm0
; AVXNC-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVXNC-NEXT:    vmovsd %xmm0, (%rbx)
; AVXNC-NEXT:    popq %rbx
; AVXNC-NEXT:    popq %r14
; AVXNC-NEXT:    popq %rbp
; AVXNC-NEXT:    retq
  %la = load double, ptr %pa
  %a = fptrunc double %la to bfloat
  %lb = load double, ptr %pb
  %b = fptrunc double %lb to bfloat
  %add = fadd bfloat %a, %b
  %dadd = fpext bfloat %add to double
  store double %dadd, ptr %pc
  ret void
}

define double @add_double2(double %da, double %db) nounwind {
; X86-LABEL: add_double2:
; X86:       # %bb.0:
; X86-NEXT:    pushl %esi
; X86-NEXT:    subl $24, %esp
; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT:    vmovsd %xmm0, (%esp)
; X86-NEXT:    calll __truncdfbf2
; X86-NEXT:    vmovw %xmm0, %esi
; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT:    vmovsd %xmm0, (%esp)
; X86-NEXT:    calll __truncdfbf2
; X86-NEXT:    vmovw %xmm0, %eax
; X86-NEXT:    shll $16, %eax
; X86-NEXT:    vmovd %eax, %xmm0
; X86-NEXT:    shll $16, %esi
; X86-NEXT:    vmovd %esi, %xmm1
; X86-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; X86-NEXT:    vcvtneps2bf16 %xmm0, %xmm0
; X86-NEXT:    vmovw %xmm0, %eax
; X86-NEXT:    shll $16, %eax
; X86-NEXT:    vmovd %eax, %xmm0
; X86-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    fldl {{[0-9]+}}(%esp)
; X86-NEXT:    addl $24, %esp
; X86-NEXT:    popl %esi
; X86-NEXT:    retl
;
; SSE2-LABEL: add_double2:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pushq %rbx
; SSE2-NEXT:    subq $16, %rsp
; SSE2-NEXT:    movsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE2-NEXT:    callq __truncdfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebx
; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero
; SSE2-NEXT:    callq __truncdfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm1
; SSE2-NEXT:    shll $16, %ebx
; SSE2-NEXT:    movd %ebx, %xmm0
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    cvtss2sd %xmm0, %xmm0
; SSE2-NEXT:    addq $16, %rsp
; SSE2-NEXT:    popq %rbx
; SSE2-NEXT:    retq
;
; FP16-LABEL: add_double2:
; FP16:       # %bb.0:
; FP16-NEXT:    pushq %rbx
; FP16-NEXT:    subq $16, %rsp
; FP16-NEXT:    vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; FP16-NEXT:    callq __truncdfbf2@PLT
; FP16-NEXT:    vmovw %xmm0, %ebx
; FP16-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
; FP16-NEXT:    # xmm0 = mem[0],zero
; FP16-NEXT:    callq __truncdfbf2@PLT
; FP16-NEXT:    vmovw %xmm0, %eax
; FP16-NEXT:    shll $16, %eax
; FP16-NEXT:    vmovd %eax, %xmm0
; FP16-NEXT:    shll $16, %ebx
; FP16-NEXT:    vmovd %ebx, %xmm1
; FP16-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; FP16-NEXT:    vcvtneps2bf16 %xmm0, %xmm0
; FP16-NEXT:    vmovw %xmm0, %eax
; FP16-NEXT:    shll $16, %eax
; FP16-NEXT:    vmovd %eax, %xmm0
; FP16-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; FP16-NEXT:    addq $16, %rsp
; FP16-NEXT:    popq %rbx
; FP16-NEXT:    retq
;
; AVXNC-LABEL: add_double2:
; AVXNC:       # %bb.0:
; AVXNC-NEXT:    pushq %rbx
; AVXNC-NEXT:    subq $16, %rsp
; AVXNC-NEXT:    vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVXNC-NEXT:    callq __truncdfbf2@PLT
; AVXNC-NEXT:    vpextrw $0, %xmm0, %ebx
; AVXNC-NEXT:    vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload
; AVXNC-NEXT:    # xmm0 = mem[0],zero
; AVXNC-NEXT:    callq __truncdfbf2@PLT
; AVXNC-NEXT:    vpextrw $0, %xmm0, %eax
; AVXNC-NEXT:    shll $16, %eax
; AVXNC-NEXT:    vmovd %eax, %xmm0
; AVXNC-NEXT:    shll $16, %ebx
; AVXNC-NEXT:    vmovd %ebx, %xmm1
; AVXNC-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVXNC-NEXT:    {vex} vcvtneps2bf16 %xmm0, %xmm0
; AVXNC-NEXT:    vmovd %xmm0, %eax
; AVXNC-NEXT:    shll $16, %eax
; AVXNC-NEXT:    vmovd %eax, %xmm0
; AVXNC-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVXNC-NEXT:    addq $16, %rsp
; AVXNC-NEXT:    popq %rbx
; AVXNC-NEXT:    retq
  %a = fptrunc double %da to bfloat
  %b = fptrunc double %db to bfloat
  %add = fadd bfloat %a, %b
  %dadd = fpext bfloat %add to double
  ret double %dadd
}

define void @add_constant(ptr %pa, ptr %pc) nounwind {
; X86-LABEL: add_constant:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movzwl (%ecx), %ecx
; X86-NEXT:    shll $16, %ecx
; X86-NEXT:    vmovd %ecx, %xmm0
; X86-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-NEXT:    vcvtneps2bf16 %xmm0, %xmm0
; X86-NEXT:    vpextrw $0, %xmm0, (%eax)
; X86-NEXT:    retl
;
; SSE2-LABEL: add_constant:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pushq %rbx
; SSE2-NEXT:    movq %rsi, %rbx
; SSE2-NEXT:    movzwl (%rdi), %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movw %ax, (%rbx)
; SSE2-NEXT:    popq %rbx
; SSE2-NEXT:    retq
;
; F16-LABEL: add_constant:
; F16:       # %bb.0:
; F16-NEXT:    movzwl (%rdi), %eax
; F16-NEXT:    shll $16, %eax
; F16-NEXT:    vmovd %eax, %xmm0
; F16-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; F16-NEXT:    vcvtneps2bf16 %xmm0, %xmm0
; F16-NEXT:    vpextrw $0, %xmm0, (%rsi)
; F16-NEXT:    retq
;
; AVXNC-LABEL: add_constant:
; AVXNC:       # %bb.0:
; AVXNC-NEXT:    movzwl (%rdi), %eax
; AVXNC-NEXT:    shll $16, %eax
; AVXNC-NEXT:    vmovd %eax, %xmm0
; AVXNC-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVXNC-NEXT:    {vex} vcvtneps2bf16 %xmm0, %xmm0
; AVXNC-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVXNC-NEXT:    retq
  %a = load bfloat, ptr %pa
  %add = fadd bfloat %a, 1.0
  store bfloat %add, ptr %pc
  ret void
}

define bfloat @add_constant2(bfloat %a) nounwind {
; X86-LABEL: add_constant2:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    shll $16, %eax
; X86-NEXT:    vmovd %eax, %xmm0
; X86-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-NEXT:    vcvtneps2bf16 %xmm0, %xmm0
; X86-NEXT:    vmovw %xmm0, %eax
; X86-NEXT:    vmovw %eax, %xmm0
; X86-NEXT:    retl
;
; SSE2-LABEL: add_constant2:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pushq %rax
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    popq %rax
; SSE2-NEXT:    retq
;
; FP16-LABEL: add_constant2:
; FP16:       # %bb.0:
; FP16-NEXT:    vmovw %xmm0, %eax
; FP16-NEXT:    shll $16, %eax
; FP16-NEXT:    vmovd %eax, %xmm0
; FP16-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; FP16-NEXT:    vcvtneps2bf16 %xmm0, %xmm0
; FP16-NEXT:    vmovw %xmm0, %eax
; FP16-NEXT:    vmovw %eax, %xmm0
; FP16-NEXT:    retq
;
; AVXNC-LABEL: add_constant2:
; AVXNC:       # %bb.0:
; AVXNC-NEXT:    vpextrw $0, %xmm0, %eax
; AVXNC-NEXT:    shll $16, %eax
; AVXNC-NEXT:    vmovd %eax, %xmm0
; AVXNC-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVXNC-NEXT:    {vex} vcvtneps2bf16 %xmm0, %xmm0
; AVXNC-NEXT:    vmovd %xmm0, %eax
; AVXNC-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
; AVXNC-NEXT:    retq
  %add = fadd bfloat %a, 1.0
  ret bfloat %add
}

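; Storing a bfloat constant needs no conversion at all: 1.0 is emitted as its
; raw i16 bit pattern 0x3F80.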
define void @store_constant(ptr %pc) nounwind {
; X86-LABEL: store_constant:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movw $16256, (%eax) # imm = 0x3F80
; X86-NEXT:    retl
;
; CHECK-LABEL: store_constant:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movw $16256, (%rdi) # imm = 0x3F80
; CHECK-NEXT:    retq
  store bfloat 1.0, ptr %pc
  ret void
}

define void @fold_ext_trunc(ptr %pa, ptr %pc) nounwind {
; X86-LABEL: fold_ext_trunc:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movzwl (%ecx), %ecx
; X86-NEXT:    movw %cx, (%eax)
; X86-NEXT:    retl
;
; CHECK-LABEL: fold_ext_trunc:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movzwl (%rdi), %eax
; CHECK-NEXT:    movw %ax, (%rsi)
; CHECK-NEXT:    retq
  %a = load bfloat, ptr %pa
  %ext = fpext bfloat %a to float
  %trunc = fptrunc float %ext to bfloat
  store bfloat %trunc, ptr %pc
  ret void
}

define bfloat @fold_ext_trunc2(bfloat %a) nounwind {
; X86-LABEL: fold_ext_trunc2:
; X86:       # %bb.0:
; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    retl
;
; CHECK-LABEL: fold_ext_trunc2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %ext = fpext bfloat %a to float
  %trunc = fptrunc float %ext to bfloat
  ret bfloat %trunc
}

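; Vector bfloat fadd is legalized by widening to v8f32 (zero-extend each word,
; then shift left by 16), adding in f32, and narrowing with vcvtneps2bf16 where
; available; plain SSE2 has no conversion instruction and scalarizes through
; __truncsfbf2.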
define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind {
; X86-LABEL: addv:
; X86:       # %bb.0:
; X86-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X86-NEXT:    vpslld $16, %ymm1, %ymm1
; X86-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X86-NEXT:    vpslld $16, %ymm0, %ymm0
; X86-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; X86-NEXT:    vcvtneps2bf16 %ymm0, %xmm0
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; SSE2-LABEL: addv:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pushq %rbp
; SSE2-NEXT:    pushq %r15
; SSE2-NEXT:    pushq %r14
; SSE2-NEXT:    pushq %r13
; SSE2-NEXT:    pushq %r12
; SSE2-NEXT:    pushq %rbx
; SSE2-NEXT:    subq $56, %rsp
; SSE2-NEXT:    movq %xmm0, %rcx
; SSE2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE2-NEXT:    movq %rcx, %rax
; SSE2-NEXT:    shrq $48, %rax
; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE2-NEXT:    movq %xmm1, %rdx
; SSE2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE2-NEXT:    movq %rdx, %rax
; SSE2-NEXT:    shrq $48, %rax
; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE2-NEXT:    movq %rcx, %rax
; SSE2-NEXT:    shrq $32, %rax
; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE2-NEXT:    movq %rdx, %rax
; SSE2-NEXT:    shrq $32, %rax
; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    movq %xmm0, %r15
; SSE2-NEXT:    movq %r15, %rbx
; SSE2-NEXT:    shrq $48, %rbx
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1,1]
; SSE2-NEXT:    movq %xmm1, %r14
; SSE2-NEXT:    movq %r14, %rbp
; SSE2-NEXT:    shrq $48, %rbp
; SSE2-NEXT:    movq %r15, %r12
; SSE2-NEXT:    shrq $32, %r12
; SSE2-NEXT:    movq %r14, %r13
; SSE2-NEXT:    shrq $32, %r13
; SSE2-NEXT:    movl %r14d, %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm1
; SSE2-NEXT:    movl %r15d, %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %eax
; SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE2-NEXT:    andl $-65536, %r14d # imm = 0xFFFF0000
; SSE2-NEXT:    movd %r14d, %xmm1
; SSE2-NEXT:    andl $-65536, %r15d # imm = 0xFFFF0000
; SSE2-NEXT:    movd %r15d, %xmm0
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %r15d
; SSE2-NEXT:    shll $16, %r15d
; SSE2-NEXT:    addl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 4-byte Folded Reload
; SSE2-NEXT:    shll $16, %r13d
; SSE2-NEXT:    movd %r13d, %xmm1
; SSE2-NEXT:    shll $16, %r12d
; SSE2-NEXT:    movd %r12d, %xmm0
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %r14d
; SSE2-NEXT:    shll $16, %ebp
; SSE2-NEXT:    movd %ebp, %xmm1
; SSE2-NEXT:    shll $16, %ebx
; SSE2-NEXT:    movd %ebx, %xmm0
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebx
; SSE2-NEXT:    shll $16, %ebx
; SSE2-NEXT:    orl %r14d, %ebx
; SSE2-NEXT:    shlq $32, %rbx
; SSE2-NEXT:    orq %r15, %rbx
; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
; SSE2-NEXT:    movl %r15d, %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm1
; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
; SSE2-NEXT:    movl %r14d, %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %ebp
; SSE2-NEXT:    movq %r15, %rax
; SSE2-NEXT:    andl $-65536, %eax # imm = 0xFFFF0000
; SSE2-NEXT:    movd %eax, %xmm1
; SSE2-NEXT:    movq %r14, %rax
; SSE2-NEXT:    andl $-65536, %eax # imm = 0xFFFF0000
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %r14d
; SSE2-NEXT:    shll $16, %r14d
; SSE2-NEXT:    orl %ebp, %r14d
; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm1
; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %ebp
; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm1
; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    orl %ebp, %eax
; SSE2-NEXT:    shlq $32, %rax
; SSE2-NEXT:    orq %r14, %rax
; SSE2-NEXT:    movq %rax, %xmm0
; SSE2-NEXT:    movq %rbx, %xmm1
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    addq $56, %rsp
; SSE2-NEXT:    popq %rbx
; SSE2-NEXT:    popq %r12
; SSE2-NEXT:    popq %r13
; SSE2-NEXT:    popq %r14
; SSE2-NEXT:    popq %r15
; SSE2-NEXT:    popq %rbp
; SSE2-NEXT:    retq
;
; F16-LABEL: addv:
; F16:       # %bb.0:
; F16-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; F16-NEXT:    vpslld $16, %ymm1, %ymm1
; F16-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; F16-NEXT:    vpslld $16, %ymm0, %ymm0
; F16-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; F16-NEXT:    vcvtneps2bf16 %ymm0, %xmm0
; F16-NEXT:    vzeroupper
; F16-NEXT:    retq
;
; AVXNC-LABEL: addv:
; AVXNC:       # %bb.0:
; AVXNC-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVXNC-NEXT:    vpslld $16, %ymm1, %ymm1
; AVXNC-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVXNC-NEXT:    vpslld $16, %ymm0, %ymm0
; AVXNC-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVXNC-NEXT:    {vex} vcvtneps2bf16 %ymm0, %xmm0
; AVXNC-NEXT:    vzeroupper
; AVXNC-NEXT:    retq
  %add = fadd <8 x bfloat> %a, %b
  ret <8 x bfloat> %add
}

define <2 x bfloat> @pr62997(bfloat %a, bfloat %b) {
; X86-LABEL: pr62997:
; X86:       # %bb.0:
; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-NEXT:    retl
;
; SSE2-LABEL: pr62997:
; SSE2:       # %bb.0:
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT:    retq
;
; BF16-LABEL: pr62997:
; BF16:       # %bb.0:
; BF16-NEXT:    vpextrw $0, %xmm1, %eax
; BF16-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
; BF16-NEXT:    retq
;
; FP16-LABEL: pr62997:
; FP16:       # %bb.0:
; FP16-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; FP16-NEXT:    retq
  %1 = insertelement <2 x bfloat> undef, bfloat %a, i64 0
  %2 = insertelement <2 x bfloat> %1, bfloat %b, i64 1
  ret <2 x bfloat> %2
}

define <32 x bfloat> @pr63017() {
; X86-LABEL: pr63017:
; X86:       # %bb.0:
; X86-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; X86-NEXT:    retl
;
; SSE2-LABEL: pr63017:
; SSE2:       # %bb.0:
; SSE2-NEXT:    xorps %xmm0, %xmm0
; SSE2-NEXT:    xorps %xmm1, %xmm1
; SSE2-NEXT:    xorps %xmm2, %xmm2
; SSE2-NEXT:    xorps %xmm3, %xmm3
; SSE2-NEXT:    retq
;
; F16-LABEL: pr63017:
; F16:       # %bb.0:
; F16-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; F16-NEXT:    retq
;
; AVXNC-LABEL: pr63017:
; AVXNC:       # %bb.0:
; AVXNC-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVXNC-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVXNC-NEXT:    retq
  ret <32 x bfloat> zeroinitializer
}

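; Masked load of <32 x bfloat> with a poison mask: AVX512BF16 targets keep a
; masked vmovdqu16, while the SSE2 and AVXNC fallbacks expand to explicit
; test-and-branch code around ordinary loads.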
define <32 x bfloat> @pr63017_2() nounwind {
; X86-LABEL: pr63017_2:
; X86:       # %bb.0:
; X86-NEXT:    vpbroadcastw {{.*#+}} zmm0 = [-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0]
; X86-NEXT:    vmovdqu16 (%eax), %zmm0 {%k1}
; X86-NEXT:    retl
;
; SSE2-LABEL: pr63017_2:
; SSE2:       # %bb.0:
; SSE2-NEXT:    xorl %eax, %eax
; SSE2-NEXT:    testb %al, %al
; SSE2-NEXT:    jne .LBB12_1
; SSE2-NEXT:  # %bb.2: # %cond.load
; SSE2-NEXT:    movzwl (%rax), %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    jmp .LBB12_3
; SSE2-NEXT:  .LBB12_1:
; SSE2-NEXT:    movd {{.*#+}} xmm0 = [-1.0E+0,0.0E+0,0.0E+0,0.0E+0]
; SSE2-NEXT:  .LBB12_3:
; SSE2-NEXT:    pushq %r14
; SSE2-NEXT:    pushq %rbx
; SSE2-NEXT:    subq $88, %rsp
; SSE2-NEXT:    movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebx
; SSE2-NEXT:    shll $16, %ebx
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %r14d
; SSE2-NEXT:    orl %ebx, %r14d
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebx
; SSE2-NEXT:    shll $16, %ebx
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %eax
; SSE2-NEXT:    orl %ebx, %eax
; SSE2-NEXT:    shlq $32, %rax
; SSE2-NEXT:    orq %r14, %rax
; SSE2-NEXT:    movq %rax, %xmm0
; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebx
; SSE2-NEXT:    shll $16, %ebx
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %r14d
; SSE2-NEXT:    orl %ebx, %r14d
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebx
; SSE2-NEXT:    shll $16, %ebx
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %eax
; SSE2-NEXT:    orl %ebx, %eax
; SSE2-NEXT:    shlq $32, %rax
; SSE2-NEXT:    orq %r14, %rax
; SSE2-NEXT:    movq %rax, %xmm0
; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebx
; SSE2-NEXT:    shll $16, %ebx
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %r14d
; SSE2-NEXT:    orl %ebx, %r14d
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebx
; SSE2-NEXT:    shll $16, %ebx
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %eax
; SSE2-NEXT:    orl %ebx, %eax
; SSE2-NEXT:    shlq $32, %rax
; SSE2-NEXT:    orq %r14, %rax
; SSE2-NEXT:    movq %rax, %xmm0
; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebx
; SSE2-NEXT:    shll $16, %ebx
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %r14d
; SSE2-NEXT:    orl %ebx, %r14d
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebx
; SSE2-NEXT:    shll $16, %ebx
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %eax
; SSE2-NEXT:    orl %ebx, %eax
; SSE2-NEXT:    shlq $32, %rax
; SSE2-NEXT:    orq %r14, %rax
; SSE2-NEXT:    movq %rax, %xmm0
; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebx
; SSE2-NEXT:    shll $16, %ebx
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %r14d
; SSE2-NEXT:    orl %ebx, %r14d
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebx
; SSE2-NEXT:    shll $16, %ebx
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %eax
; SSE2-NEXT:    orl %ebx, %eax
; SSE2-NEXT:    shlq $32, %rax
; SSE2-NEXT:    orq %r14, %rax
; SSE2-NEXT:    movq %rax, %xmm0
; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebx
; SSE2-NEXT:    shll $16, %ebx
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %r14d
; SSE2-NEXT:    orl %ebx, %r14d
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebx
; SSE2-NEXT:    shll $16, %ebx
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %eax
; SSE2-NEXT:    orl %ebx, %eax
; SSE2-NEXT:    shlq $32, %rax
; SSE2-NEXT:    orq %r14, %rax
; SSE2-NEXT:    movq %rax, %xmm0
; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebx
; SSE2-NEXT:    shll $16, %ebx
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %r14d
; SSE2-NEXT:    orl %ebx, %r14d
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebx
; SSE2-NEXT:    shll $16, %ebx
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %eax
; SSE2-NEXT:    orl %ebx, %eax
; SSE2-NEXT:    shlq $32, %rax
; SSE2-NEXT:    orq %r14, %rax
; SSE2-NEXT:    movq %rax, %xmm0
; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebx
; SSE2-NEXT:    shll $16, %ebx
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %r14d
; SSE2-NEXT:    orl %ebx, %r14d
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebx
; SSE2-NEXT:    shll $16, %ebx
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %eax
; SSE2-NEXT:    orl %ebx, %eax
; SSE2-NEXT:    shlq $32, %rax
; SSE2-NEXT:    orq %r14, %rax
; SSE2-NEXT:    movq %rax, %xmm0
; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE2-NEXT:    addq $88, %rsp
; SSE2-NEXT:    popq %rbx
; SSE2-NEXT:    popq %r14
; SSE2-NEXT:    retq
;
; FP16-LABEL: pr63017_2:
; FP16:       # %bb.0:
; FP16-NEXT:    vpbroadcastw {{.*#+}} zmm0 = [-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0]
; FP16-NEXT:    vmovdqu16 (%rax), %zmm0 {%k1}
; FP16-NEXT:    retq
;
; AVXNC-LABEL: pr63017_2:
; AVXNC:       # %bb.0:
; AVXNC-NEXT:    vbroadcastss {{.*#+}} ymm0 = [49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024]
; AVXNC-NEXT:    xorl %eax, %eax
; AVXNC-NEXT:    testb %al, %al
; AVXNC-NEXT:    jne .LBB12_2
; AVXNC-NEXT:  # %bb.1: # %cond.load
; AVXNC-NEXT:    vmovups (%rax), %ymm0
; AVXNC-NEXT:  .LBB12_2:
; AVXNC-NEXT:    vmovaps %ymm0, %ymm1
; AVXNC-NEXT:    retq
  %1 = call <32 x bfloat> @llvm.masked.load.v32bf16.p0(ptr poison, i32 2, <32 x i1> poison, <32 x bfloat> <bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80>)
  ret <32 x bfloat> %1
}

define <32 x bfloat> @pr62997_3(<32 x bfloat> %0, bfloat %1) {
; X86-LABEL: pr62997_3:
; X86:       # %bb.0:
; X86-NEXT:    vpinsrw $1, {{[0-9]+}}(%esp), %xmm0, %xmm1
; X86-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
; X86-NEXT:    retl
;
; SSE2-LABEL: pr62997_3:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movq %xmm0, %rax
; SSE2-NEXT:    movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
; SSE2-NEXT:    andq %rax, %rcx
; SSE2-NEXT:    movzwl %ax, %eax
; SSE2-NEXT:    pextrw $0, %xmm4, %edx
; SSE2-NEXT:    shll $16, %edx
; SSE2-NEXT:    orl %eax, %edx
; SSE2-NEXT:    orq %rcx, %rdx
; SSE2-NEXT:    movq %rdx, %xmm4
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE2-NEXT:    retq
;
; FP16-LABEL: pr62997_3:
; FP16:       # %bb.0:
; FP16-NEXT:    vmovw %xmm1, %eax
; FP16-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm1
; FP16-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
; FP16-NEXT:    retq
;
; AVXNC-LABEL: pr62997_3:
; AVXNC:       # %bb.0:
; AVXNC-NEXT:    vpextrw $0, %xmm2, %eax
; AVXNC-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm2
; AVXNC-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVXNC-NEXT:    retq
  %3 = insertelement <32 x bfloat> %0, bfloat %1, i64 1
  ret <32 x bfloat> %3
}

declare <32 x bfloat> @llvm.masked.load.v32bf16.p0(ptr, i32, <32 x i1>, <32 x bfloat>)

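; fpext from bfloat is a pure bit operation: interleaving each word with zeros
; (equivalently, zero-extending and shifting left by 16) yields the f32 bits.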
define <4 x float> @pr64460_1(<4 x bfloat> %a) {
; X86-LABEL: pr64460_1:
; X86:       # %bb.0:
; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X86-NEXT:    retl
;
; SSE2-LABEL: pr64460_1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; AVX-LABEL: pr64460_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX-NEXT:    retq
  %b = fpext <4 x bfloat> %a to <4 x float>
  ret <4 x float> %b
}

define <8 x float> @pr64460_2(<8 x bfloat> %a) {
; X86-LABEL: pr64460_2:
; X86:       # %bb.0:
; X86-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X86-NEXT:    vpslld $16, %ymm0, %ymm0
; X86-NEXT:    retl
;
; SSE2-LABEL: pr64460_2:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; AVX-LABEL: pr64460_2:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT:    vpslld $16, %ymm0, %ymm0
; AVX-NEXT:    retq
  %b = fpext <8 x bfloat> %a to <8 x float>
  ret <8 x float> %b
}

define <16 x float> @pr64460_3(<16 x bfloat> %a) {
; X86-LABEL: pr64460_3:
; X86:       # %bb.0:
; X86-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; X86-NEXT:    vpslld $16, %zmm0, %zmm0
; X86-NEXT:    retl
;
; SSE2-LABEL: pr64460_3:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm3, %xmm3
; SSE2-NEXT:    pxor %xmm5, %xmm5
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
; SSE2-NEXT:    pxor %xmm4, %xmm4
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; SSE2-NEXT:    movdqa %xmm5, %xmm0
; SSE2-NEXT:    movdqa %xmm4, %xmm1
; SSE2-NEXT:    retq
;
; F16-LABEL: pr64460_3:
; F16:       # %bb.0:
; F16-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; F16-NEXT:    vpslld $16, %zmm0, %zmm0
; F16-NEXT:    retq
;
; AVXNC-LABEL: pr64460_3:
; AVXNC:       # %bb.0:
; AVXNC-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVXNC-NEXT:    vpslld $16, %ymm1, %ymm2
; AVXNC-NEXT:    vextracti128 $1, %ymm0, %xmm0
; AVXNC-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVXNC-NEXT:    vpslld $16, %ymm0, %ymm1
; AVXNC-NEXT:    vmovdqa %ymm2, %ymm0
; AVXNC-NEXT:    retq
  %b = fpext <16 x bfloat> %a to <16 x float>
  ret <16 x float> %b
}

define <8 x double> @pr64460_4(<8 x bfloat> %a) {
; X86-LABEL: pr64460_4:
; X86:       # %bb.0:
; X86-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X86-NEXT:    vpslld $16, %ymm0, %ymm0
; X86-NEXT:    vcvtps2pd %ymm0, %zmm0
; X86-NEXT:    retl
;
; SSE2-LABEL: pr64460_4:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm3, %xmm3
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT:    cvtps2pd %xmm1, %xmm4
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
; SSE2-NEXT:    cvtps2pd %xmm3, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT:    cvtps2pd %xmm0, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
; SSE2-NEXT:    cvtps2pd %xmm0, %xmm3
; SSE2-NEXT:    movaps %xmm4, %xmm0
; SSE2-NEXT:    retq
;
; F16-LABEL: pr64460_4:
; F16:       # %bb.0:
; F16-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; F16-NEXT:    vpslld $16, %ymm0, %ymm0
; F16-NEXT:    vcvtps2pd %ymm0, %zmm0
; F16-NEXT:    retq
;
; AVXNC-LABEL: pr64460_4:
; AVXNC:       # %bb.0:
; AVXNC-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVXNC-NEXT:    vpslld $16, %ymm0, %ymm1
; AVXNC-NEXT:    vcvtps2pd %xmm1, %ymm0
; AVXNC-NEXT:    vextracti128 $1, %ymm1, %xmm1
; AVXNC-NEXT:    vcvtps2pd %xmm1, %ymm1
; AVXNC-NEXT:    retq
  %b = fpext <8 x bfloat> %a to <8 x double>
  ret <8 x double> %b
}

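; fptrunc from f32 maps directly onto vcvtneps2bf16 (the v4f32 case is widened
; to ymm first); without that instruction each element goes through the
; __truncsfbf2 libcall, and f64 sources always use __truncdfbf2.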
define <4 x bfloat> @fptrunc_v4f32(<4 x float> %a) nounwind {
; X86-LABEL: fptrunc_v4f32:
; X86:       # %bb.0:
; X86-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-NEXT:    vcvtneps2bf16 %ymm0, %xmm0
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; SSE2-LABEL: fptrunc_v4f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    subq $72, %rsp
; SSE2-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    addq $72, %rsp
; SSE2-NEXT:    retq
;
; F16-LABEL: fptrunc_v4f32:
; F16:       # %bb.0:
; F16-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; F16-NEXT:    vcvtneps2bf16 %ymm0, %xmm0
; F16-NEXT:    vzeroupper
; F16-NEXT:    retq
;
; AVXNC-LABEL: fptrunc_v4f32:
; AVXNC:       # %bb.0:
; AVXNC-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; AVXNC-NEXT:    {vex} vcvtneps2bf16 %ymm0, %xmm0
; AVXNC-NEXT:    vzeroupper
; AVXNC-NEXT:    retq
  %b = fptrunc <4 x float> %a to <4 x bfloat>
  ret <4 x bfloat> %b
}

define <8 x bfloat> @fptrunc_v8f32(<8 x float> %a) nounwind {
; X86-LABEL: fptrunc_v8f32:
; X86:       # %bb.0:
; X86-NEXT:    vcvtneps2bf16 %ymm0, %xmm0
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; SSE2-LABEL: fptrunc_v8f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pushq %rbp
; SSE2-NEXT:    pushq %r14
; SSE2-NEXT:    pushq %rbx
; SSE2-NEXT:    subq $32, %rsp
; SSE2-NEXT:    movaps %xmm1, (%rsp) # 16-byte Spill
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebx
; SSE2-NEXT:    shll $16, %ebx
; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %r14d
; SSE2-NEXT:    orl %ebx, %r14d
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebp
; SSE2-NEXT:    shll $16, %ebp
; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %ebx
; SSE2-NEXT:    orl %ebp, %ebx
; SSE2-NEXT:    shlq $32, %rbx
; SSE2-NEXT:    orq %r14, %rbx
; SSE2-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebp
; SSE2-NEXT:    shll $16, %ebp
; SSE2-NEXT:    movdqa (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %r14d
; SSE2-NEXT:    orl %ebp, %r14d
; SSE2-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebp
; SSE2-NEXT:    shll $16, %ebp
; SSE2-NEXT:    movdqa (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %eax
; SSE2-NEXT:    orl %ebp, %eax
; SSE2-NEXT:    shlq $32, %rax
; SSE2-NEXT:    orq %r14, %rax
; SSE2-NEXT:    movq %rax, %xmm1
; SSE2-NEXT:    movq %rbx, %xmm0
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    addq $32, %rsp
; SSE2-NEXT:    popq %rbx
; SSE2-NEXT:    popq %r14
; SSE2-NEXT:    popq %rbp
; SSE2-NEXT:    retq
;
; F16-LABEL: fptrunc_v8f32:
; F16:       # %bb.0:
; F16-NEXT:    vcvtneps2bf16 %ymm0, %xmm0
; F16-NEXT:    vzeroupper
; F16-NEXT:    retq
;
; AVXNC-LABEL: fptrunc_v8f32:
; AVXNC:       # %bb.0:
; AVXNC-NEXT:    {vex} vcvtneps2bf16 %ymm0, %xmm0
; AVXNC-NEXT:    vzeroupper
; AVXNC-NEXT:    retq
  %b = fptrunc <8 x float> %a to <8 x bfloat>
  ret <8 x bfloat> %b
}

define <16 x bfloat> @fptrunc_v16f32(<16 x float> %a) nounwind {
; X86-LABEL: fptrunc_v16f32:
; X86:       # %bb.0:
; X86-NEXT:    vcvtneps2bf16 %zmm0, %ymm0
; X86-NEXT:    retl
;
; SSE2-LABEL: fptrunc_v16f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pushq %rbp
; SSE2-NEXT:    pushq %r15
; SSE2-NEXT:    pushq %r14
; SSE2-NEXT:    pushq %r12
; SSE2-NEXT:    pushq %rbx
; SSE2-NEXT:    subq $64, %rsp
; SSE2-NEXT:    movaps %xmm3, (%rsp) # 16-byte Spill
; SSE2-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movaps %xmm2, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[1,1]
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebx
; SSE2-NEXT:    shll $16, %ebx
; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %r14d
; SSE2-NEXT:    orl %ebx, %r14d
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebp
; SSE2-NEXT:    shll $16, %ebp
; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %ebx
; SSE2-NEXT:    orl %ebp, %ebx
; SSE2-NEXT:    shlq $32, %rbx
; SSE2-NEXT:    orq %r14, %rbx
; SSE2-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebp
; SSE2-NEXT:    shll $16, %ebp
; SSE2-NEXT:    movdqa (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %r15d
; SSE2-NEXT:    orl %ebp, %r15d
; SSE2-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebp
; SSE2-NEXT:    shll $16, %ebp
; SSE2-NEXT:    movdqa (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %r14d
; SSE2-NEXT:    orl %ebp, %r14d
; SSE2-NEXT:    shlq $32, %r14
; SSE2-NEXT:    orq %r15, %r14
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebp
; SSE2-NEXT:    shll $16, %ebp
; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %r12d
; SSE2-NEXT:    orl %ebp, %r12d
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebp
; SSE2-NEXT:    shll $16, %ebp
; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %r15d
; SSE2-NEXT:    orl %ebp, %r15d
; SSE2-NEXT:    shlq $32, %r15
; SSE2-NEXT:    orq %r12, %r15
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebp
; SSE2-NEXT:    shll $16, %ebp
; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %r12d
; SSE2-NEXT:    orl %ebp, %r12d
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebp
; SSE2-NEXT:    shll $16, %ebp
; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %eax
; SSE2-NEXT:    orl %ebp, %eax
; SSE2-NEXT:    shlq $32, %rax
; SSE2-NEXT:    orq %r12, %rax
; SSE2-NEXT:    movq %rax, %xmm1
; SSE2-NEXT:    movq %r15, %xmm0
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    movq %r14, %xmm2
; SSE2-NEXT:    movq %rbx, %xmm1
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE2-NEXT:    addq $64, %rsp
; SSE2-NEXT:    popq %rbx
; SSE2-NEXT:    popq %r12
; SSE2-NEXT:    popq %r14
; SSE2-NEXT:    popq %r15
; SSE2-NEXT:    popq %rbp
; SSE2-NEXT:    retq
;
; F16-LABEL: fptrunc_v16f32:
; F16:       # %bb.0:
; F16-NEXT:    vcvtneps2bf16 %zmm0, %ymm0
; F16-NEXT:    retq
;
; AVXNC-LABEL: fptrunc_v16f32:
; AVXNC:       # %bb.0:
; AVXNC-NEXT:    {vex} vcvtneps2bf16 %ymm0, %xmm0
; AVXNC-NEXT:    {vex} vcvtneps2bf16 %ymm1, %xmm1
; AVXNC-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVXNC-NEXT:    retq
  %b = fptrunc <16 x float> %a to <16 x bfloat>
  ret <16 x bfloat> %b
}

define <8 x bfloat> @fptrunc_v8f64(<8 x double> %a) nounwind {
; X86-LABEL: fptrunc_v8f64:
; X86:       # %bb.0:
; X86-NEXT:    subl $204, %esp
; X86-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 64-byte Spill
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X86-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vmovlps %xmm0, (%esp)
; X86-NEXT:    vzeroupper
; X86-NEXT:    calll __truncdfbf2
; X86-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vmovhps %xmm0, (%esp)
; X86-NEXT:    calll __truncdfbf2
; X86-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %zmm0 # 64-byte Reload
; X86-NEXT:    vmovlps %xmm0, (%esp)
; X86-NEXT:    vzeroupper
; X86-NEXT:    calll __truncdfbf2
; X86-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %zmm0 # 64-byte Reload
; X86-NEXT:    vmovhps %xmm0, (%esp)
; X86-NEXT:    vzeroupper
; X86-NEXT:    calll __truncdfbf2
; X86-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %zmm0 # 64-byte Reload
; X86-NEXT:    vextractf32x4 $2, %zmm0, %xmm0
; X86-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vmovlps %xmm0, (%esp)
; X86-NEXT:    vzeroupper
; X86-NEXT:    calll __truncdfbf2
; X86-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vmovhps %xmm0, (%esp)
; X86-NEXT:    calll __truncdfbf2
; X86-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %zmm0 # 64-byte Reload
; X86-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
; X86-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vmovlps %xmm0, (%esp)
; X86-NEXT:    vzeroupper
; X86-NEXT:    calll __truncdfbf2
; X86-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vmovhps %xmm0, (%esp)
; X86-NEXT:    calll __truncdfbf2
; X86-NEXT:    vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
; X86-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X86-NEXT:    vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
; X86-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; X86-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; X86-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X86-NEXT:    vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
; X86-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; X86-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; X86-NEXT:    vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 16-byte Reload
; X86-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
; X86-NEXT:    # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
; X86-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; X86-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; X86-NEXT:    addl $204, %esp
; X86-NEXT:    retl
;
; SSE2-LABEL: fptrunc_v8f64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pushq %rbp
; SSE2-NEXT:    pushq %r14
; SSE2-NEXT:    pushq %rbx
; SSE2-NEXT:    subq $64, %rsp
; SSE2-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movaps %xmm1, (%rsp) # 16-byte Spill
SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1534; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1] 1535; SSE2-NEXT: callq __truncdfbf2@PLT 1536; SSE2-NEXT: pextrw $0, %xmm0, %ebx 1537; SSE2-NEXT: shll $16, %ebx 1538; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1539; SSE2-NEXT: callq __truncdfbf2@PLT 1540; SSE2-NEXT: pextrw $0, %xmm0, %eax 1541; SSE2-NEXT: movzwl %ax, %r14d 1542; SSE2-NEXT: orl %ebx, %r14d 1543; SSE2-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload 1544; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1] 1545; SSE2-NEXT: callq __truncdfbf2@PLT 1546; SSE2-NEXT: pextrw $0, %xmm0, %ebp 1547; SSE2-NEXT: shll $16, %ebp 1548; SSE2-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload 1549; SSE2-NEXT: callq __truncdfbf2@PLT 1550; SSE2-NEXT: pextrw $0, %xmm0, %eax 1551; SSE2-NEXT: movzwl %ax, %ebx 1552; SSE2-NEXT: orl %ebp, %ebx 1553; SSE2-NEXT: shlq $32, %rbx 1554; SSE2-NEXT: orq %r14, %rbx 1555; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1556; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1] 1557; SSE2-NEXT: callq __truncdfbf2@PLT 1558; SSE2-NEXT: pextrw $0, %xmm0, %ebp 1559; SSE2-NEXT: shll $16, %ebp 1560; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1561; SSE2-NEXT: callq __truncdfbf2@PLT 1562; SSE2-NEXT: pextrw $0, %xmm0, %eax 1563; SSE2-NEXT: movzwl %ax, %r14d 1564; SSE2-NEXT: orl %ebp, %r14d 1565; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1566; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1] 1567; SSE2-NEXT: callq __truncdfbf2@PLT 1568; SSE2-NEXT: pextrw $0, %xmm0, %ebp 1569; SSE2-NEXT: shll $16, %ebp 1570; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1571; SSE2-NEXT: callq __truncdfbf2@PLT 1572; SSE2-NEXT: pextrw $0, %xmm0, %eax 1573; SSE2-NEXT: movzwl %ax, %eax 1574; SSE2-NEXT: orl %ebp, %eax 1575; SSE2-NEXT: shlq $32, %rax 1576; SSE2-NEXT: orq %r14, %rax 1577; SSE2-NEXT: movq %rax, %xmm1 1578; SSE2-NEXT: movq %rbx, %xmm0 1579; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1580; SSE2-NEXT: addq $64, %rsp 1581; SSE2-NEXT: popq %rbx 1582; SSE2-NEXT: popq %r14 1583; SSE2-NEXT: popq %rbp 1584; SSE2-NEXT: retq 1585; 1586; FP16-LABEL: fptrunc_v8f64: 1587; FP16: # %bb.0: 1588; FP16-NEXT: subq $184, %rsp 1589; FP16-NEXT: vmovupd %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1590; FP16-NEXT: vextractf128 $1, %ymm0, %xmm0 1591; FP16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1592; FP16-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] 1593; FP16-NEXT: vzeroupper 1594; FP16-NEXT: callq __truncdfbf2@PLT 1595; FP16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1596; FP16-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1597; FP16-NEXT: callq __truncdfbf2@PLT 1598; FP16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1599; FP16-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1600; FP16-NEXT: # xmm0 = mem[1,0] 1601; FP16-NEXT: callq __truncdfbf2@PLT 1602; FP16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1603; FP16-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 1604; FP16-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1605; FP16-NEXT: vzeroupper 1606; FP16-NEXT: callq __truncdfbf2@PLT 1607; FP16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1608; FP16-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 1609; FP16-NEXT: vextractf32x4 $2, %zmm0, %xmm0 1610; FP16-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 1611; FP16-NEXT: 
vshufpd {{.*#+}} xmm0 = xmm0[1,0] 1612; FP16-NEXT: vzeroupper 1613; FP16-NEXT: callq __truncdfbf2@PLT 1614; FP16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1615; FP16-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 1616; FP16-NEXT: callq __truncdfbf2@PLT 1617; FP16-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 1618; FP16-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 1619; FP16-NEXT: vextractf32x4 $3, %zmm0, %xmm0 1620; FP16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1621; FP16-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] 1622; FP16-NEXT: vzeroupper 1623; FP16-NEXT: callq __truncdfbf2@PLT 1624; FP16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1625; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1626; FP16-NEXT: callq __truncdfbf2@PLT 1627; FP16-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1628; FP16-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 1629; FP16-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload 1630; FP16-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 1631; FP16-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] 1632; FP16-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 1633; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 1634; FP16-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 1635; FP16-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] 1636; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 1637; FP16-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload 1638; FP16-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] 1639; FP16-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 1640; FP16-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1641; FP16-NEXT: addq $184, %rsp 1642; FP16-NEXT: retq 1643; 1644; AVXNC-LABEL: fptrunc_v8f64: 1645; AVXNC: # %bb.0: 1646; AVXNC-NEXT: pushq %rbp 1647; AVXNC-NEXT: pushq %r15 1648; AVXNC-NEXT: pushq %r14 1649; AVXNC-NEXT: pushq %r13 1650; AVXNC-NEXT: pushq %r12 1651; AVXNC-NEXT: pushq %rbx 1652; AVXNC-NEXT: subq $168, %rsp 1653; AVXNC-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill 1654; AVXNC-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1655; AVXNC-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1656; AVXNC-NEXT: vzeroupper 1657; AVXNC-NEXT: callq __truncdfbf2@PLT 1658; AVXNC-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1659; AVXNC-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1660; AVXNC-NEXT: # xmm0 = mem[1,0] 1661; AVXNC-NEXT: callq __truncdfbf2@PLT 1662; AVXNC-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1663; AVXNC-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 1664; AVXNC-NEXT: vextractf128 $1, %ymm0, %xmm0 1665; AVXNC-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1666; AVXNC-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] 1667; AVXNC-NEXT: vzeroupper 1668; AVXNC-NEXT: callq __truncdfbf2@PLT 1669; AVXNC-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1670; AVXNC-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 1671; AVXNC-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1672; AVXNC-NEXT: vzeroupper 1673; AVXNC-NEXT: callq __truncdfbf2@PLT 1674; AVXNC-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1675; AVXNC-NEXT: vpermilpd $1, (%rsp), %xmm0 
# 16-byte Folded Reload 1676; AVXNC-NEXT: # xmm0 = mem[1,0] 1677; AVXNC-NEXT: callq __truncdfbf2@PLT 1678; AVXNC-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1679; AVXNC-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 1680; AVXNC-NEXT: vextractf128 $1, %ymm0, %xmm0 1681; AVXNC-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1682; AVXNC-NEXT: vzeroupper 1683; AVXNC-NEXT: callq __truncdfbf2@PLT 1684; AVXNC-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 1685; AVXNC-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1686; AVXNC-NEXT: # xmm0 = mem[1,0] 1687; AVXNC-NEXT: callq __truncdfbf2@PLT 1688; AVXNC-NEXT: vpextrw $0, %xmm0, %ebx 1689; AVXNC-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload 1690; AVXNC-NEXT: vpextrw $0, %xmm0, %ebp 1691; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1692; AVXNC-NEXT: vpextrw $0, %xmm0, %r14d 1693; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1694; AVXNC-NEXT: vpextrw $0, %xmm0, %r15d 1695; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1696; AVXNC-NEXT: vpextrw $0, %xmm0, %r12d 1697; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1698; AVXNC-NEXT: vpextrw $0, %xmm0, %r13d 1699; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1700; AVXNC-NEXT: callq __truncdfbf2@PLT 1701; AVXNC-NEXT: vpextrw $0, %xmm0, %eax 1702; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1703; AVXNC-NEXT: vpinsrw $1, %r13d, %xmm0, %xmm0 1704; AVXNC-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 1705; AVXNC-NEXT: vpinsrw $3, %r12d, %xmm0, %xmm0 1706; AVXNC-NEXT: vpinsrw $4, %r15d, %xmm0, %xmm0 1707; AVXNC-NEXT: vpinsrw $5, %r14d, %xmm0, %xmm0 1708; AVXNC-NEXT: vpinsrw $6, %ebp, %xmm0, %xmm0 1709; AVXNC-NEXT: vpinsrw $7, %ebx, %xmm0, %xmm0 1710; AVXNC-NEXT: addq $168, %rsp 1711; AVXNC-NEXT: popq %rbx 1712; AVXNC-NEXT: popq %r12 1713; AVXNC-NEXT: popq %r13 1714; AVXNC-NEXT: popq %r14 1715; AVXNC-NEXT: popq %r15 1716; AVXNC-NEXT: popq %rbp 1717; AVXNC-NEXT: retq 1718 %b = fptrunc <8 x double> %a to <8 x bfloat> 1719 ret <8 x bfloat> %b 1720} 1721 1722define <32 x bfloat> @test_v8bf16_v32bf16(ptr %0) { 1723; X86-LABEL: test_v8bf16_v32bf16: 1724; X86: # %bb.0: 1725; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1726; X86-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 1727; X86-NEXT: retl 1728; 1729; SSE2-LABEL: test_v8bf16_v32bf16: 1730; SSE2: # %bb.0: 1731; SSE2-NEXT: movaps (%rdi), %xmm0 1732; SSE2-NEXT: movaps %xmm0, %xmm1 1733; SSE2-NEXT: movaps %xmm0, %xmm2 1734; SSE2-NEXT: movaps %xmm0, %xmm3 1735; SSE2-NEXT: retq 1736; 1737; F16-LABEL: test_v8bf16_v32bf16: 1738; F16: # %bb.0: 1739; F16-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 1740; F16-NEXT: retq 1741; 1742; AVXNC-LABEL: test_v8bf16_v32bf16: 1743; AVXNC: # %bb.0: 1744; AVXNC-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] 1745; AVXNC-NEXT: vmovaps %ymm0, %ymm1 1746; AVXNC-NEXT: retq 1747 %2 = load <8 x bfloat>, ptr %0, align 16 1748 %3 = shufflevector <8 x bfloat> %2, <8 x bfloat> %2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 1749 ret <32 x bfloat> %3 1750} 1751 1752define <16 x bfloat> @concat_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) { 1753; X86-LABEL: concat_v8bf16: 1754; X86: # %bb.0: 1755; X86-NEXT: # 
kill: def $xmm0 killed $xmm0 def $ymm0 1756; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1757; X86-NEXT: retl 1758; 1759; SSE2-LABEL: concat_v8bf16: 1760; SSE2: # %bb.0: 1761; SSE2-NEXT: retq 1762; 1763; AVX-LABEL: concat_v8bf16: 1764; AVX: # %bb.0: 1765; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1766; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1767; AVX-NEXT: retq 1768 %a = shufflevector <8 x bfloat> %x, <8 x bfloat> %y, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 1769 ret <16 x bfloat> %a 1770} 1771 1772define <8 x bfloat> @extract_v32bf16_v8bf16(<32 x bfloat> %x) { 1773; X86-LABEL: extract_v32bf16_v8bf16: 1774; X86: # %bb.0: 1775; X86-NEXT: vextractf128 $1, %ymm0, %xmm0 1776; X86-NEXT: vzeroupper 1777; X86-NEXT: retl 1778; 1779; SSE2-LABEL: extract_v32bf16_v8bf16: 1780; SSE2: # %bb.0: 1781; SSE2-NEXT: pextrw $0, %xmm1, %eax 1782; SSE2-NEXT: pextrw $1, %xmm1, %ecx 1783; SSE2-NEXT: shll $16, %ecx 1784; SSE2-NEXT: orl %eax, %ecx 1785; SSE2-NEXT: pextrw $2, %xmm1, %eax 1786; SSE2-NEXT: pextrw $3, %xmm1, %edx 1787; SSE2-NEXT: shll $16, %edx 1788; SSE2-NEXT: orl %eax, %edx 1789; SSE2-NEXT: shlq $32, %rdx 1790; SSE2-NEXT: orq %rcx, %rdx 1791; SSE2-NEXT: pextrw $4, %xmm1, %eax 1792; SSE2-NEXT: pextrw $5, %xmm1, %ecx 1793; SSE2-NEXT: shll $16, %ecx 1794; SSE2-NEXT: orl %eax, %ecx 1795; SSE2-NEXT: pextrw $6, %xmm1, %eax 1796; SSE2-NEXT: pextrw $7, %xmm1, %esi 1797; SSE2-NEXT: shll $16, %esi 1798; SSE2-NEXT: orl %eax, %esi 1799; SSE2-NEXT: shlq $32, %rsi 1800; SSE2-NEXT: orq %rcx, %rsi 1801; SSE2-NEXT: movq %rsi, %xmm1 1802; SSE2-NEXT: movq %rdx, %xmm0 1803; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1804; SSE2-NEXT: retq 1805; 1806; AVX-LABEL: extract_v32bf16_v8bf16: 1807; AVX: # %bb.0: 1808; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 1809; AVX-NEXT: vzeroupper 1810; AVX-NEXT: retq 1811 %a = shufflevector <32 x bfloat> %x, <32 x bfloat> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 1812 ret <8 x bfloat> %a 1813} 1814 1815define <16 x bfloat> @concat_zero_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) { 1816; X86-LABEL: concat_zero_v8bf16: 1817; X86: # %bb.0: 1818; X86-NEXT: vmovaps %xmm0, %xmm0 1819; X86-NEXT: retl 1820; 1821; SSE2-LABEL: concat_zero_v8bf16: 1822; SSE2: # %bb.0: 1823; SSE2-NEXT: xorps %xmm1, %xmm1 1824; SSE2-NEXT: retq 1825; 1826; AVX-LABEL: concat_zero_v8bf16: 1827; AVX: # %bb.0: 1828; AVX-NEXT: vmovaps %xmm0, %xmm0 1829; AVX-NEXT: retq 1830 %a = shufflevector <8 x bfloat> %x, <8 x bfloat> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 1831 ret <16 x bfloat> %a 1832} 1833 1834define <16 x bfloat> @concat_dup_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) { 1835; X86-LABEL: concat_dup_v8bf16: 1836; X86: # %bb.0: 1837; X86-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 1838; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1839; X86-NEXT: retl 1840; 1841; SSE2-LABEL: concat_dup_v8bf16: 1842; SSE2: # %bb.0: 1843; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0] 1844; SSE2-NEXT: retq 1845; 1846; AVX-LABEL: concat_dup_v8bf16: 1847; AVX: # %bb.0: 1848; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 1849; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1850; AVX-NEXT: retq 1851 %a = shufflevector <8 x bfloat> %x, <8 x bfloat> %y, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 1852 ret <16 x 
bfloat> %a 1853} 1854 1855define float @trunc_ext(float %a) nounwind { 1856; X86-LABEL: trunc_ext: 1857; X86: # %bb.0: 1858; X86-NEXT: pushl %eax 1859; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero 1860; X86-NEXT: vcvtneps2bf16 %xmm0, %xmm0 1861; X86-NEXT: vmovw %xmm0, %eax 1862; X86-NEXT: shll $16, %eax 1863; X86-NEXT: vmovd %eax, %xmm0 1864; X86-NEXT: vmovd %xmm0, (%esp) 1865; X86-NEXT: flds (%esp) 1866; X86-NEXT: popl %eax 1867; X86-NEXT: retl 1868; 1869; SSE2-LABEL: trunc_ext: 1870; SSE2: # %bb.0: 1871; SSE2-NEXT: pushq %rax 1872; SSE2-NEXT: callq __truncsfbf2@PLT 1873; SSE2-NEXT: pextrw $0, %xmm0, %eax 1874; SSE2-NEXT: shll $16, %eax 1875; SSE2-NEXT: movd %eax, %xmm0 1876; SSE2-NEXT: popq %rax 1877; SSE2-NEXT: retq 1878; 1879; FP16-LABEL: trunc_ext: 1880; FP16: # %bb.0: 1881; FP16-NEXT: vcvtneps2bf16 %xmm0, %xmm0 1882; FP16-NEXT: vmovw %xmm0, %eax 1883; FP16-NEXT: shll $16, %eax 1884; FP16-NEXT: vmovd %eax, %xmm0 1885; FP16-NEXT: retq 1886; 1887; AVXNC-LABEL: trunc_ext: 1888; AVXNC: # %bb.0: 1889; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm0, %xmm0 1890; AVXNC-NEXT: vmovd %xmm0, %eax 1891; AVXNC-NEXT: shll $16, %eax 1892; AVXNC-NEXT: vmovd %eax, %xmm0 1893; AVXNC-NEXT: retq 1894 %b = fptrunc float %a to bfloat 1895 %c = fpext bfloat %b to float 1896 ret float %c 1897} 1898 1899define void @PR92471(ptr %0, ptr %1) nounwind { 1900; X86-LABEL: PR92471: 1901; X86: # %bb.0: 1902; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1903; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx 1904; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1905; X86-NEXT: vpinsrd $1, 4(%ecx), %xmm0, %xmm0 1906; X86-NEXT: vpinsrd $2, 8(%ecx), %xmm0, %xmm0 1907; X86-NEXT: vpinsrw $6, 12(%ecx), %xmm0, %xmm0 1908; X86-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1909; X86-NEXT: vpslld $16, %ymm0, %ymm0 1910; X86-NEXT: vextracti128 $1, %ymm0, %xmm1 1911; X86-NEXT: vpextrd $2, %xmm1, 24(%eax) 1912; X86-NEXT: vpextrd $1, %xmm1, 20(%eax) 1913; X86-NEXT: vmovd %xmm1, 16(%eax) 1914; X86-NEXT: vmovdqu %xmm0, (%eax) 1915; X86-NEXT: vzeroupper 1916; X86-NEXT: retl 1917; 1918; SSE2-LABEL: PR92471: 1919; SSE2: # %bb.0: 1920; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 1921; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1922; SSE2-NEXT: pinsrw $2, 12(%rdi), %xmm1 1923; SSE2-NEXT: pxor %xmm2, %xmm2 1924; SSE2-NEXT: pxor %xmm3, %xmm3 1925; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] 1926; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] 1927; SSE2-NEXT: movdqu %xmm2, (%rsi) 1928; SSE2-NEXT: movq %xmm3, 16(%rsi) 1929; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] 1930; SSE2-NEXT: movd %xmm0, 24(%rsi) 1931; SSE2-NEXT: retq 1932; 1933; AVX-LABEL: PR92471: 1934; AVX: # %bb.0: 1935; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 1936; AVX-NEXT: vpinsrd $2, 8(%rdi), %xmm0, %xmm0 1937; AVX-NEXT: vpinsrw $6, 12(%rdi), %xmm0, %xmm0 1938; AVX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1939; AVX-NEXT: vpslld $16, %ymm0, %ymm0 1940; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1 1941; AVX-NEXT: vpextrd $2, %xmm1, 24(%rsi) 1942; AVX-NEXT: vmovq %xmm1, 16(%rsi) 1943; AVX-NEXT: vmovdqu %xmm0, (%rsi) 1944; AVX-NEXT: vzeroupper 1945; AVX-NEXT: retq 1946 %3 = load <7 x bfloat>, ptr %0, align 2 1947 %4 = fpext <7 x bfloat> %3 to <7 x float> 1948 store <7 x float> %4, ptr 
%1, align 4 1949 ret void 1950} 1951 1952define bfloat @PR108936(x86_fp80 %0) nounwind { 1953; X86-LABEL: PR108936: 1954; X86: # %bb.0: 1955; X86-NEXT: subl $12, %esp 1956; X86-NEXT: fldt {{[0-9]+}}(%esp) 1957; X86-NEXT: fstpt (%esp) 1958; X86-NEXT: calll __truncxfbf2 1959; X86-NEXT: addl $12, %esp 1960; X86-NEXT: retl 1961; 1962; CHECK-LABEL: PR108936: 1963; CHECK: # %bb.0: 1964; CHECK-NEXT: subq $24, %rsp 1965; CHECK-NEXT: fldt {{[0-9]+}}(%rsp) 1966; CHECK-NEXT: fstpt (%rsp) 1967; CHECK-NEXT: callq __truncxfbf2@PLT 1968; CHECK-NEXT: addq $24, %rsp 1969; CHECK-NEXT: retq 1970 %2 = fptrunc x86_fp80 %0 to bfloat 1971 ret bfloat %2 1972} 1973 1974define bfloat @PR115710(fp128 %0) nounwind { 1975; X86-LABEL: PR115710: 1976; X86: # %bb.0: 1977; X86-NEXT: subl $28, %esp 1978; X86-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0 1979; X86-NEXT: vmovups %xmm0, (%esp) 1980; X86-NEXT: calll __trunctfbf2 1981; X86-NEXT: addl $28, %esp 1982; X86-NEXT: retl 1983; 1984; CHECK-LABEL: PR115710: 1985; CHECK: # %bb.0: 1986; CHECK-NEXT: pushq %rax 1987; CHECK-NEXT: callq __trunctfbf2@PLT 1988; CHECK-NEXT: popq %rax 1989; CHECK-NEXT: retq 1990 %2 = fptrunc fp128 %0 to bfloat 1991 ret bfloat %2 1992} 1993