; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX512VL
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512vl,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512DQVL

; PR32546: the AND of two compare results should be performed in mask
; registers (kandw, or kandb with AVX512DQ) rather than in GPRs.
define <4 x i64> @PR32546(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> %d) {
; AVX512VL-LABEL: PR32546:
; AVX512VL:       ## %bb.0: ## %entry
; AVX512VL-NEXT:    vcmpltps %ymm1, %ymm0, %k0
; AVX512VL-NEXT:    vcmpltps %ymm3, %ymm2, %k1
; AVX512VL-NEXT:    kandw %k0, %k1, %k0
; AVX512VL-NEXT:    kmovw %k0, %eax
; AVX512VL-NEXT:    movzbl %al, %eax
; AVX512VL-NEXT:    vpbroadcastd %eax, %ymm0
; AVX512VL-NEXT:    retq
;
; AVX512DQVL-LABEL: PR32546:
; AVX512DQVL:       ## %bb.0: ## %entry
; AVX512DQVL-NEXT:    vcmpltps %ymm1, %ymm0, %k0
; AVX512DQVL-NEXT:    vcmpltps %ymm3, %ymm2, %k1
; AVX512DQVL-NEXT:    kandb %k0, %k1, %k0
; AVX512DQVL-NEXT:    kmovb %k0, %eax
; AVX512DQVL-NEXT:    vpbroadcastd %eax, %ymm0
; AVX512DQVL-NEXT:    retq
entry:
  %0 = tail call i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> %a, <8 x float> %b, i32 1, i8 -1)
  %1 = tail call i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> %c, <8 x float> %d, i32 1, i8 -1)
  %and17 = and i8 %1, %0
  %and = zext i8 %and17 to i32
  %2 = insertelement <8 x i32> undef, i32 %and, i32 0
  %vecinit7.i = shufflevector <8 x i32> %2, <8 x i32> undef, <8 x i32> zeroinitializer
  %3 = bitcast <8 x i32> %vecinit7.i to <4 x i64>
  ret <4 x i64> %3
}

; PR32547: concatenating two i8 compare masks via zext/shl/or should fold
; into a single kunpckbw feeding the masked store.
define void @PR32547(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> %d, ptr %p) {
; CHECK-LABEL: PR32547:
; CHECK:       ## %bb.0: ## %entry
; CHECK-NEXT:    vcmpltps %ymm1, %ymm0, %k0
; CHECK-NEXT:    vcmpltps %ymm3, %ymm2, %k1
; CHECK-NEXT:    kunpckbw %k1, %k0, %k1
; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    vmovaps %zmm0, (%rdi) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
entry:
  %0 = tail call i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> %a, <8 x float> %b, i32 1, i8 -1)
  %1 = tail call i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> %c, <8 x float> %d, i32 1, i8 -1)
  %conv.i = zext i8 %0 to i16
  %conv.i18 = zext i8 %1 to i16
  %shl = shl nuw i16 %conv.i, 8
  %or = or i16 %shl, %conv.i18
  %2 = bitcast i16 %or to <16 x i1>
  tail call void @llvm.masked.store.v16f32.p0(<16 x float> zeroinitializer, ptr %p, i32 64, <16 x i1> %2)
  ret void
}

; Same as PR32547, but with the operands of the final or commuted; codegen
; should be identical.
define void @PR32547_swap(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> %d, ptr %p) {
; CHECK-LABEL: PR32547_swap:
; CHECK:       ## %bb.0: ## %entry
; CHECK-NEXT:    vcmpltps %ymm1, %ymm0, %k0
; CHECK-NEXT:    vcmpltps %ymm3, %ymm2, %k1
; CHECK-NEXT:    kunpckbw %k1, %k0, %k1
; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    vmovaps %zmm0, (%rdi) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
entry:
  %0 = tail call i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> %a, <8 x float> %b, i32 1, i8 -1)
  %1 = tail call i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> %c, <8 x float> %d, i32 1, i8 -1)
  %conv.i = zext i8 %0 to i16
  %conv.i18 = zext i8 %1 to i16
  %shl = shl nuw i16 %conv.i, 8
  %or = or i16 %conv.i18, %shl
  %2 = bitcast i16 %or to <16 x i1>
  tail call void @llvm.masked.store.v16f32.p0(<16 x float> zeroinitializer, ptr %p, i32 64, <16 x i1> %2)
  ret void
}

; Combining two 4-bit masks: with AVX512DQ the byte mask instructions
; (kshiftlb/korb) keep the combine in k-registers; without it the shift goes
; through a GPR (shlb) before being moved back to a mask register.
define void @mask_cmp_128(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, ptr %p) {
; AVX512VL-LABEL: mask_cmp_128:
; AVX512VL:       ## %bb.0: ## %entry
; AVX512VL-NEXT:    vcmpltps %xmm1, %xmm0, %k0
; AVX512VL-NEXT:    kmovw %k0, %eax
; AVX512VL-NEXT:    vcmpltps %xmm3, %xmm2, %k0
; AVX512VL-NEXT:    shlb $4, %al
; AVX512VL-NEXT:    kmovw %eax, %k1
; AVX512VL-NEXT:    korw %k1, %k0, %k1
; AVX512VL-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT:    vmovaps %ymm0, (%rdi) {%k1}
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512DQVL-LABEL: mask_cmp_128:
; AVX512DQVL:       ## %bb.0: ## %entry
; AVX512DQVL-NEXT:    vcmpltps %xmm1, %xmm0, %k0
; AVX512DQVL-NEXT:    vcmpltps %xmm3, %xmm2, %k1
; AVX512DQVL-NEXT:    kshiftlb $4, %k0, %k0
; AVX512DQVL-NEXT:    korb %k0, %k1, %k1
; AVX512DQVL-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512DQVL-NEXT:    vmovaps %ymm0, (%rdi) {%k1}
; AVX512DQVL-NEXT:    vzeroupper
; AVX512DQVL-NEXT:    retq
entry:
  %0 = tail call i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> %a, <4 x float> %b, i32 1, i8 -1)
  %1 = tail call i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> %c, <4 x float> %d, i32 1, i8 -1)
  %shl = shl nuw i8 %0, 4
  %or = or i8 %1, %shl
  %2 = bitcast i8 %or to <8 x i1>
  tail call void @llvm.masked.store.v8f32.p0(<8 x float> zeroinitializer, ptr %p, i32 64, <8 x i1> %2)
  ret void
}

; The xor of the two compare masks feeds a select that picks zero where a bit
; is set, so the load mask is its complement and folds to a single kxnorw.
define <16 x float> @mask_cmp_512(<16 x float> %a, <16 x float> %b, <16 x float> %c, <16 x float> %d, ptr %p) {
; CHECK-LABEL: mask_cmp_512:
; CHECK:       ## %bb.0: ## %entry
; CHECK-NEXT:    vcmpltps {sae}, %zmm1, %zmm0, %k0
; CHECK-NEXT:    vcmpltps %zmm3, %zmm2, %k1
; CHECK-NEXT:    kxnorw %k1, %k0, %k1
; CHECK-NEXT:    vmovaps (%rdi), %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
entry:
  %0 = tail call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 1, i16 -1, i32 8)
  %1 = tail call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %c, <16 x float> %d, i32 1, i16 -1, i32 4)
  %2 = load <16 x float>, ptr %p
  %3 = xor i16 %0, %1
  %4 = bitcast i16 %3 to <16 x i1>
  %5 = select <16 x i1> %4, <16 x float> zeroinitializer, <16 x float> %2
  ret <16 x float> %5
}

declare i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float>, <4 x float>, i32, i8)
declare i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float>, <8 x float>, i32, i8)
declare i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float>, <16 x float>, i32, i16, i32)
declare void @llvm.masked.store.v8f32.p0(<8 x float>, ptr, i32, <8 x i1>)
declare void @llvm.masked.store.v16f32.p0(<16 x float>, ptr, i32, <16 x i1>)