; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 -mattr=prefer-256-bit | FileCheck %s --check-prefixes=CHECK,CHECK-SKX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 -mattr=prefer-256-bit,avx512vbmi | FileCheck %s --check-prefixes=CHECK,CHECK-SKX,CHECK-SKX-VBMI
; Make sure CPUs default to prefer-256-bit. avx512vnni isn't interesting as it just adds an isel peephole for vpmaddwd+vpaddd.
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 | FileCheck %s --check-prefixes=CHECK,CHECK-AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-avx512vnni -mcpu=cascadelake | FileCheck %s --check-prefixes=CHECK,CHECK-AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-avx512vnni -mcpu=cooperlake | FileCheck %s --check-prefixes=CHECK,CHECK-AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=cannonlake | FileCheck %s --check-prefixes=CHECK,CHECK-VBMI,CHECK-VBMI1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-avx512vnni -mcpu=icelake-client | FileCheck %s --check-prefixes=CHECK,CHECK-VBMI,CHECK-GFNI
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-avx512vnni -mcpu=icelake-server | FileCheck %s --check-prefixes=CHECK,CHECK-VBMI,CHECK-GFNI
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-avx512vnni -mcpu=tigerlake | FileCheck %s --check-prefixes=CHECK,CHECK-VBMI,CHECK-GFNI

; This file primarily contains tests for specific places in X86ISelLowering.cpp
; that needed to be made aware of the legalizer not allowing 512-bit vectors
; due to prefer-256-bit, even though AVX512 is enabled.

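; Each test body uses vector types wider than 256 bits. In the
; "min-legal-vector-width"="256" variants the legalizer must split those
; operations into 256-bit (ymm) halves, while the "512" variants allow a
; single 512-bit (zmm) instruction, as the paired CHECK lines show.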
define dso_local void @add256(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="256" {
; CHECK-LABEL: add256:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %ymm0
; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
; CHECK-NEXT: vpaddd 32(%rsi), %ymm1, %ymm1
; CHECK-NEXT: vpaddd (%rsi), %ymm0, %ymm0
; CHECK-NEXT: vmovdqa %ymm0, (%rdx)
; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %d = load <16 x i32>, ptr %a
  %e = load <16 x i32>, ptr %b
  %f = add <16 x i32> %d, %e
  store <16 x i32> %f, ptr %c
  ret void
}

define dso_local void @add512(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="512" {
; CHECK-LABEL: add512:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
; CHECK-NEXT: vpaddd (%rsi), %zmm0, %zmm0
; CHECK-NEXT: vmovdqa64 %zmm0, (%rdx)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %d = load <16 x i32>, ptr %a
  %e = load <16 x i32>, ptr %b
  %f = add <16 x i32> %d, %e
  store <16 x i32> %f, ptr %c
  ret void
}

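; The vpavgb idiom (zext to i32, add 1, add, lshr 1, trunc) must still be
; recognized when 512-bit vectors are illegal: at 256 bits it becomes two ymm
; vpavgb instructions, at 512 bits a single zmm vpavgb.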
define dso_local void @avg_v64i8_256(ptr %a, ptr %b) "min-legal-vector-width"="256" {
; CHECK-LABEL: avg_v64i8_256:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %ymm0
; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
; CHECK-NEXT: vpavgb (%rsi), %ymm0, %ymm0
; CHECK-NEXT: vpavgb 32(%rsi), %ymm1, %ymm1
; CHECK-NEXT: vmovdqu %ymm1, (%rax)
; CHECK-NEXT: vmovdqu %ymm0, (%rax)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %1 = load <64 x i8>, ptr %a
  %2 = load <64 x i8>, ptr %b
  %3 = zext <64 x i8> %1 to <64 x i32>
  %4 = zext <64 x i8> %2 to <64 x i32>
  %5 = add nuw nsw <64 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <64 x i32> %5, %4
  %7 = lshr <64 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <64 x i32> %7 to <64 x i8>
  store <64 x i8> %8, ptr undef, align 4
  ret void
}


define dso_local void @avg_v64i8_512(ptr %a, ptr %b) "min-legal-vector-width"="512" {
; CHECK-LABEL: avg_v64i8_512:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
; CHECK-NEXT: vpavgb (%rsi), %zmm0, %zmm0
; CHECK-NEXT: vmovdqu64 %zmm0, (%rax)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %1 = load <64 x i8>, ptr %a
  %2 = load <64 x i8>, ptr %b
  %3 = zext <64 x i8> %1 to <64 x i32>
  %4 = zext <64 x i8> %2 to <64 x i32>
  %5 = add nuw nsw <64 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <64 x i32> %5, %4
  %7 = lshr <64 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <64 x i32> %7 to <64 x i8>
  store <64 x i8> %8, ptr undef, align 4
  ret void
}

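; A sign-extended i16 multiply whose odd and even results are added pairwise
; should be matched to vpmaddwd at both widths.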
define dso_local void @pmaddwd_32_256(ptr %APtr, ptr %BPtr, ptr %CPtr) "min-legal-vector-width"="256" {
; CHECK-LABEL: pmaddwd_32_256:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %ymm0
; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
; CHECK-NEXT: vpmaddwd 32(%rsi), %ymm1, %ymm1
; CHECK-NEXT: vpmaddwd (%rsi), %ymm0, %ymm0
; CHECK-NEXT: vmovdqa %ymm0, (%rdx)
; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %A = load <32 x i16>, ptr %APtr
  %B = load <32 x i16>, ptr %BPtr
  %a = sext <32 x i16> %A to <32 x i32>
  %b = sext <32 x i16> %B to <32 x i32>
  %m = mul nsw <32 x i32> %a, %b
  %odd = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  %even = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  %ret = add <16 x i32> %odd, %even
  store <16 x i32> %ret, ptr %CPtr
  ret void
}

define dso_local void @pmaddwd_32_512(ptr %APtr, ptr %BPtr, ptr %CPtr) "min-legal-vector-width"="512" {
; CHECK-LABEL: pmaddwd_32_512:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
; CHECK-NEXT: vpmaddwd (%rsi), %zmm0, %zmm0
; CHECK-NEXT: vmovdqa64 %zmm0, (%rdx)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %A = load <32 x i16>, ptr %APtr
  %B = load <32 x i16>, ptr %BPtr
  %a = sext <32 x i16> %A to <32 x i32>
  %b = sext <32 x i16> %B to <32 x i32>
  %m = mul nsw <32 x i32> %a, %b
  %odd = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  %even = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  %ret = add <16 x i32> %odd, %even
  store <16 x i32> %ret, ptr %CPtr
  ret void
}

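; The unsigned saturating subtract pattern (icmp ult, select of the max,
; sub) should be matched to vpsubusb at both widths.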
define dso_local void @psubus_64i8_max_256(ptr %xptr, ptr %yptr, ptr %zptr) "min-legal-vector-width"="256" {
; CHECK-LABEL: psubus_64i8_max_256:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %ymm0
; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
; CHECK-NEXT: vpsubusb 32(%rsi), %ymm1, %ymm1
; CHECK-NEXT: vpsubusb (%rsi), %ymm0, %ymm0
; CHECK-NEXT: vmovdqa %ymm0, (%rdx)
; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %x = load <64 x i8>, ptr %xptr
  %y = load <64 x i8>, ptr %yptr
  %cmp = icmp ult <64 x i8> %x, %y
  %max = select <64 x i1> %cmp, <64 x i8> %y, <64 x i8> %x
  %res = sub <64 x i8> %max, %y
  store <64 x i8> %res, ptr %zptr
  ret void
}

define dso_local void @psubus_64i8_max_512(ptr %xptr, ptr %yptr, ptr %zptr) "min-legal-vector-width"="512" {
; CHECK-LABEL: psubus_64i8_max_512:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
; CHECK-NEXT: vpsubusb (%rsi), %zmm0, %zmm0
; CHECK-NEXT: vmovdqa64 %zmm0, (%rdx)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %x = load <64 x i8>, ptr %xptr
  %y = load <64 x i8>, ptr %yptr
  %cmp = icmp ult <64 x i8> %x, %y
  %max = select <64 x i1> %cmp, <64 x i8> %y, <64 x i8> %x
  %res = sub <64 x i8> %max, %y
  store <64 x i8> %res, ptr %zptr
  ret void
}

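; A vectorized i8 dot-product loop: the sext/mul/add reduction should use
; vpmaddwd+vpaddd inside the loop, with the horizontal reduction done after it.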
define dso_local i32 @_Z9test_charPcS_i_256(ptr nocapture readonly, ptr nocapture readonly, i32) "min-legal-vector-width"="256" {
; CHECK-SKX-LABEL: _Z9test_charPcS_i_256:
; CHECK-SKX: # %bb.0: # %entry
; CHECK-SKX-NEXT: movl %edx, %eax
; CHECK-SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; CHECK-SKX-NEXT: xorl %ecx, %ecx
; CHECK-SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-SKX-NEXT: .p2align 4
; CHECK-SKX-NEXT: .LBB8_1: # %vector.body
; CHECK-SKX-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-SKX-NEXT: vpmovsxbw 16(%rdi,%rcx), %ymm3
; CHECK-SKX-NEXT: vpmovsxbw (%rdi,%rcx), %ymm4
; CHECK-SKX-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm5
; CHECK-SKX-NEXT: vpmaddwd %ymm3, %ymm5, %ymm3
; CHECK-SKX-NEXT: vpaddd %ymm2, %ymm3, %ymm2
; CHECK-SKX-NEXT: vpmovsxbw (%rsi,%rcx), %ymm3
; CHECK-SKX-NEXT: vpmaddwd %ymm4, %ymm3, %ymm3
; CHECK-SKX-NEXT: vpaddd %ymm1, %ymm3, %ymm1
; CHECK-SKX-NEXT: addq $32, %rcx
; CHECK-SKX-NEXT: cmpq %rcx, %rax
; CHECK-SKX-NEXT: jne .LBB8_1
; CHECK-SKX-NEXT: # %bb.2: # %middle.block
; CHECK-SKX-NEXT: vpaddd %ymm0, %ymm1, %ymm1
; CHECK-SKX-NEXT: vpaddd %ymm0, %ymm2, %ymm0
; CHECK-SKX-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; CHECK-SKX-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; CHECK-SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-SKX-NEXT: vpsrlq $32, %xmm0, %xmm1
; CHECK-SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-SKX-NEXT: vmovd %xmm0, %eax
; CHECK-SKX-NEXT: vzeroupper
; CHECK-SKX-NEXT: retq
;
; CHECK-AVX512-LABEL: _Z9test_charPcS_i_256:
; CHECK-AVX512: # %bb.0: # %entry
; CHECK-AVX512-NEXT: movl %edx, %eax
; CHECK-AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0
; CHECK-AVX512-NEXT: xorl %ecx, %ecx
; CHECK-AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-AVX512-NEXT: .p2align 4
; CHECK-AVX512-NEXT: .LBB8_1: # %vector.body
; CHECK-AVX512-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-AVX512-NEXT: vpmovsxbw 16(%rdi,%rcx), %ymm3
; CHECK-AVX512-NEXT: vpmovsxbw (%rdi,%rcx), %ymm4
; CHECK-AVX512-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm5
; CHECK-AVX512-NEXT: vpmaddwd %ymm3, %ymm5, %ymm3
; CHECK-AVX512-NEXT: vpaddd %ymm2, %ymm3, %ymm2
; CHECK-AVX512-NEXT: vpmovsxbw (%rsi,%rcx), %ymm3
; CHECK-AVX512-NEXT: vpmaddwd %ymm4, %ymm3, %ymm3
; CHECK-AVX512-NEXT: vpaddd %ymm1, %ymm3, %ymm1
; CHECK-AVX512-NEXT: addq $32, %rcx
; CHECK-AVX512-NEXT: cmpq %rcx, %rax
; CHECK-AVX512-NEXT: jne .LBB8_1
; CHECK-AVX512-NEXT: # %bb.2: # %middle.block
; CHECK-AVX512-NEXT: vpaddd %ymm0, %ymm1, %ymm1
; CHECK-AVX512-NEXT: vpaddd %ymm0, %ymm2, %ymm0
; CHECK-AVX512-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; CHECK-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-AVX512-NEXT: vpsrlq $32, %xmm0, %xmm1
; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-AVX512-NEXT: vmovd %xmm0, %eax
; CHECK-AVX512-NEXT: vzeroupper
; CHECK-AVX512-NEXT: retq
;
; CHECK-VBMI-LABEL: _Z9test_charPcS_i_256:
; CHECK-VBMI: # %bb.0: # %entry
; CHECK-VBMI-NEXT: movl %edx, %eax
; CHECK-VBMI-NEXT: vpxor %xmm0, %xmm0, %xmm0
; CHECK-VBMI-NEXT: xorl %ecx, %ecx
; CHECK-VBMI-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-VBMI-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-VBMI-NEXT: .p2align 4
; CHECK-VBMI-NEXT: .LBB8_1: # %vector.body
; CHECK-VBMI-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-VBMI-NEXT: vpmovsxbw 16(%rdi,%rcx), %ymm3
; CHECK-VBMI-NEXT: vpmovsxbw (%rdi,%rcx), %ymm4
; CHECK-VBMI-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm5
; CHECK-VBMI-NEXT: vpmaddwd %ymm3, %ymm5, %ymm3
; CHECK-VBMI-NEXT: vpaddd %ymm2, %ymm3, %ymm2
; CHECK-VBMI-NEXT: vpmovsxbw (%rsi,%rcx), %ymm3
; CHECK-VBMI-NEXT: vpmaddwd %ymm4, %ymm3, %ymm3
; CHECK-VBMI-NEXT: vpaddd %ymm1, %ymm3, %ymm1
; CHECK-VBMI-NEXT: addq $32, %rcx
; CHECK-VBMI-NEXT: cmpq %rcx, %rax
; CHECK-VBMI-NEXT: jne .LBB8_1
; CHECK-VBMI-NEXT: # %bb.2: # %middle.block
; CHECK-VBMI-NEXT: vpaddd %ymm0, %ymm1, %ymm1
; CHECK-VBMI-NEXT: vpaddd %ymm0, %ymm2, %ymm0
; CHECK-VBMI-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; CHECK-VBMI-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-VBMI-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-VBMI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; CHECK-VBMI-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-VBMI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; CHECK-VBMI-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-VBMI-NEXT: vmovd %xmm0, %eax
; CHECK-VBMI-NEXT: vzeroupper
; CHECK-VBMI-NEXT: retq
entry:
  %3 = zext i32 %2 to i64
  br label %vector.body

vector.body:
  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
  %vec.phi = phi <32 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
  %4 = getelementptr inbounds i8, ptr %0, i64 %index
  %5 = bitcast ptr %4 to ptr
  %wide.load = load <32 x i8>, ptr %5, align 1
  %6 = sext <32 x i8> %wide.load to <32 x i32>
  %7 = getelementptr inbounds i8, ptr %1, i64 %index
  %8 = bitcast ptr %7 to ptr
  %wide.load14 = load <32 x i8>, ptr %8, align 1
  %9 = sext <32 x i8> %wide.load14 to <32 x i32>
  %10 = mul nsw <32 x i32> %9, %6
  %11 = add nsw <32 x i32> %10, %vec.phi
  %index.next = add i64 %index, 32
  %12 = icmp eq i64 %index.next, %3
  br i1 %12, label %middle.block, label %vector.body

middle.block:
  %rdx.shuf1 = shufflevector <32 x i32> %11, <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx1 = add <32 x i32> %11, %rdx.shuf1
  %rdx.shuf = shufflevector <32 x i32> %bin.rdx1, <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <32 x i32> %bin.rdx1, %rdx.shuf
  %rdx.shuf15 = shufflevector <32 x i32> %bin.rdx, <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx32 = add <32 x i32> %bin.rdx, %rdx.shuf15
  %rdx.shuf17 = shufflevector <32 x i32> %bin.rdx32, <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx18 = add <32 x i32> %bin.rdx32, %rdx.shuf17
  %rdx.shuf19 = shufflevector <32 x i32> %bin.rdx18, <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx20 = add <32 x i32> %bin.rdx18, %rdx.shuf19
  %13 = extractelement <32 x i32> %bin.rdx20, i32 0
  ret i32 %13
}

define dso_local i32 @_Z9test_charPcS_i_512(ptr nocapture readonly, ptr nocapture readonly, i32) "min-legal-vector-width"="512" {
; CHECK-SKX-LABEL: _Z9test_charPcS_i_512:
; CHECK-SKX: # %bb.0: # %entry
; CHECK-SKX-NEXT: movl %edx, %eax
; CHECK-SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; CHECK-SKX-NEXT: xorl %ecx, %ecx
; CHECK-SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-SKX-NEXT: .p2align 4
; CHECK-SKX-NEXT: .LBB9_1: # %vector.body
; CHECK-SKX-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-SKX-NEXT: vpmovsxbw (%rdi,%rcx), %zmm2
; CHECK-SKX-NEXT: vpmovsxbw (%rsi,%rcx), %zmm3
; CHECK-SKX-NEXT: vpmaddwd %zmm2, %zmm3, %zmm2
; CHECK-SKX-NEXT: vpaddd %zmm1, %zmm2, %zmm1
; CHECK-SKX-NEXT: addq $32, %rcx
; CHECK-SKX-NEXT: cmpq %rcx, %rax
; CHECK-SKX-NEXT: jne .LBB9_1
; CHECK-SKX-NEXT: # %bb.2: # %middle.block
; CHECK-SKX-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-SKX-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; CHECK-SKX-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; CHECK-SKX-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; CHECK-SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-SKX-NEXT: vpsrlq $32, %xmm0, %xmm1
; CHECK-SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-SKX-NEXT: vmovd %xmm0, %eax
; CHECK-SKX-NEXT: vzeroupper
; CHECK-SKX-NEXT: retq
;
; CHECK-AVX512-LABEL: _Z9test_charPcS_i_512:
; CHECK-AVX512: # %bb.0: # %entry
; CHECK-AVX512-NEXT: movl %edx, %eax
; CHECK-AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0
; CHECK-AVX512-NEXT: xorl %ecx, %ecx
; CHECK-AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-AVX512-NEXT: .p2align 4
; CHECK-AVX512-NEXT: .LBB9_1: # %vector.body
; CHECK-AVX512-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-AVX512-NEXT: vpmovsxbw (%rdi,%rcx), %zmm2
; CHECK-AVX512-NEXT: vpmovsxbw (%rsi,%rcx), %zmm3
; CHECK-AVX512-NEXT: vpmaddwd %zmm2, %zmm3, %zmm2
; CHECK-AVX512-NEXT: vpaddd %zmm1, %zmm2, %zmm1
; CHECK-AVX512-NEXT: addq $32, %rcx
; CHECK-AVX512-NEXT: cmpq %rcx, %rax
; CHECK-AVX512-NEXT: jne .LBB9_1
; CHECK-AVX512-NEXT: # %bb.2: # %middle.block
; CHECK-AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; CHECK-AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; CHECK-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-AVX512-NEXT: vpsrlq $32, %xmm0, %xmm1
; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-AVX512-NEXT: vmovd %xmm0, %eax
; CHECK-AVX512-NEXT: vzeroupper
; CHECK-AVX512-NEXT: retq
;
; CHECK-VBMI-LABEL: _Z9test_charPcS_i_512:
; CHECK-VBMI: # %bb.0: # %entry
; CHECK-VBMI-NEXT: movl %edx, %eax
; CHECK-VBMI-NEXT: vpxor %xmm0, %xmm0, %xmm0
; CHECK-VBMI-NEXT: xorl %ecx, %ecx
; CHECK-VBMI-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-VBMI-NEXT: .p2align 4
; CHECK-VBMI-NEXT: .LBB9_1: # %vector.body
; CHECK-VBMI-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-VBMI-NEXT: vpmovsxbw (%rdi,%rcx), %zmm2
; CHECK-VBMI-NEXT: vpmovsxbw (%rsi,%rcx), %zmm3
; CHECK-VBMI-NEXT: vpmaddwd %zmm2, %zmm3, %zmm2
; CHECK-VBMI-NEXT: vpaddd %zmm1, %zmm2, %zmm1
; CHECK-VBMI-NEXT: addq $32, %rcx
; CHECK-VBMI-NEXT: cmpq %rcx, %rax
; CHECK-VBMI-NEXT: jne .LBB9_1
; CHECK-VBMI-NEXT: # %bb.2: # %middle.block
; CHECK-VBMI-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-VBMI-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; CHECK-VBMI-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; CHECK-VBMI-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-VBMI-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-VBMI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; CHECK-VBMI-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-VBMI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; CHECK-VBMI-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-VBMI-NEXT: vmovd %xmm0, %eax
; CHECK-VBMI-NEXT: vzeroupper
; CHECK-VBMI-NEXT: retq
entry:
  %3 = zext i32 %2 to i64
  br label %vector.body

vector.body:
  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
  %vec.phi = phi <32 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
  %4 = getelementptr inbounds i8, ptr %0, i64 %index
  %5 = bitcast ptr %4 to ptr
  %wide.load = load <32 x i8>, ptr %5, align 1
  %6 = sext <32 x i8> %wide.load to <32 x i32>
  %7 = getelementptr inbounds i8, ptr %1, i64 %index
  %8 = bitcast ptr %7 to ptr
  %wide.load14 = load <32 x i8>, ptr %8, align 1
  %9 = sext <32 x i8> %wide.load14 to <32 x i32>
  %10 = mul nsw <32 x i32> %9, %6
  %11 = add nsw <32 x i32> %10, %vec.phi
  %index.next = add i64 %index, 32
  %12 = icmp eq i64 %index.next, %3
  br i1 %12, label %middle.block, label %vector.body

middle.block:
  %rdx.shuf1 = shufflevector <32 x i32> %11, <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx1 = add <32 x i32> %11, %rdx.shuf1
  %rdx.shuf = shufflevector <32 x i32> %bin.rdx1, <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <32 x i32> %bin.rdx1, %rdx.shuf
  %rdx.shuf15 = shufflevector <32 x i32> %bin.rdx, <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx32 = add <32 x i32> %bin.rdx, %rdx.shuf15
  %rdx.shuf17 = shufflevector <32 x i32> %bin.rdx32, <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx18 = add <32 x i32> %bin.rdx32, %rdx.shuf17
  %rdx.shuf19 = shufflevector <32 x i32> %bin.rdx18, <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx20 = add <32 x i32> %bin.rdx18, %rdx.shuf19
  %13 = extractelement <32 x i32> %bin.rdx20, i32 0
  ret i32 %13
}

@a = dso_local global [1024 x i8] zeroinitializer, align 16
@b = dso_local global [1024 x i8] zeroinitializer, align 16

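; A sum-of-absolute-differences loop over the globals above: the sub/icmp/
; select/add reduction should form vpsadbw, which operates on 128-bit chunks;
; only the width of the vpaddd accumulator changes between the two variants.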
define dso_local i32 @sad_16i8_256() "min-legal-vector-width"="256" {
; CHECK-SKX-LABEL: sad_16i8_256:
; CHECK-SKX: # %bb.0: # %entry
; CHECK-SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; CHECK-SKX-NEXT: movq $-1024, %rax # imm = 0xFC00
; CHECK-SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-SKX-NEXT: .p2align 4
; CHECK-SKX-NEXT: .LBB10_1: # %vector.body
; CHECK-SKX-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-SKX-NEXT: vmovdqu a+1024(%rax), %xmm2
; CHECK-SKX-NEXT: vpsadbw b+1024(%rax), %xmm2, %xmm2
; CHECK-SKX-NEXT: vpaddd %ymm1, %ymm2, %ymm1
; CHECK-SKX-NEXT: addq $4, %rax
; CHECK-SKX-NEXT: jne .LBB10_1
; CHECK-SKX-NEXT: # %bb.2: # %middle.block
; CHECK-SKX-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; CHECK-SKX-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; CHECK-SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-SKX-NEXT: vpsrlq $32, %xmm0, %xmm1
; CHECK-SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-SKX-NEXT: vmovd %xmm0, %eax
; CHECK-SKX-NEXT: vzeroupper
; CHECK-SKX-NEXT: retq
;
; CHECK-AVX512-LABEL: sad_16i8_256:
; CHECK-AVX512: # %bb.0: # %entry
; CHECK-AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0
; CHECK-AVX512-NEXT: movq $-1024, %rax # imm = 0xFC00
; CHECK-AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-AVX512-NEXT: .p2align 4
; CHECK-AVX512-NEXT: .LBB10_1: # %vector.body
; CHECK-AVX512-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-AVX512-NEXT: vmovdqu a+1024(%rax), %xmm2
; CHECK-AVX512-NEXT: vpsadbw b+1024(%rax), %xmm2, %xmm2
; CHECK-AVX512-NEXT: vpaddd %ymm1, %ymm2, %ymm1
; CHECK-AVX512-NEXT: addq $4, %rax
; CHECK-AVX512-NEXT: jne .LBB10_1
; CHECK-AVX512-NEXT: # %bb.2: # %middle.block
; CHECK-AVX512-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; CHECK-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-AVX512-NEXT: vpsrlq $32, %xmm0, %xmm1
; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-AVX512-NEXT: vmovd %xmm0, %eax
; CHECK-AVX512-NEXT: vzeroupper
; CHECK-AVX512-NEXT: retq
;
; CHECK-VBMI-LABEL: sad_16i8_256:
; CHECK-VBMI: # %bb.0: # %entry
; CHECK-VBMI-NEXT: vpxor %xmm0, %xmm0, %xmm0
; CHECK-VBMI-NEXT: movq $-1024, %rax # imm = 0xFC00
; CHECK-VBMI-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-VBMI-NEXT: .p2align 4
; CHECK-VBMI-NEXT: .LBB10_1: # %vector.body
; CHECK-VBMI-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-VBMI-NEXT: vmovdqu a+1024(%rax), %xmm2
; CHECK-VBMI-NEXT: vpsadbw b+1024(%rax), %xmm2, %xmm2
; CHECK-VBMI-NEXT: vpaddd %ymm1, %ymm2, %ymm1
; CHECK-VBMI-NEXT: addq $4, %rax
; CHECK-VBMI-NEXT: jne .LBB10_1
; CHECK-VBMI-NEXT: # %bb.2: # %middle.block
; CHECK-VBMI-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; CHECK-VBMI-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-VBMI-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-VBMI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; CHECK-VBMI-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-VBMI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; CHECK-VBMI-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-VBMI-NEXT: vmovd %xmm0, %eax
; CHECK-VBMI-NEXT: vzeroupper
; CHECK-VBMI-NEXT: retq
entry:
  br label %vector.body

vector.body:
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.phi = phi <16 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ]
  %0 = getelementptr inbounds [1024 x i8], ptr @a, i64 0, i64 %index
  %1 = bitcast ptr %0 to ptr
  %wide.load = load <16 x i8>, ptr %1, align 4
  %2 = zext <16 x i8> %wide.load to <16 x i32>
  %3 = getelementptr inbounds [1024 x i8], ptr @b, i64 0, i64 %index
  %4 = bitcast ptr %3 to ptr
  %wide.load1 = load <16 x i8>, ptr %4, align 4
  %5 = zext <16 x i8> %wide.load1 to <16 x i32>
  %6 = sub nsw <16 x i32> %2, %5
  %7 = icmp sgt <16 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
  %8 = sub nsw <16 x i32> zeroinitializer, %6
  %9 = select <16 x i1> %7, <16 x i32> %6, <16 x i32> %8
  %10 = add nsw <16 x i32> %9, %vec.phi
  %index.next = add i64 %index, 4
  %11 = icmp eq i64 %index.next, 1024
  br i1 %11, label %middle.block, label %vector.body

middle.block:
  %rdx.shuf = shufflevector <16 x i32> %10, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <16 x i32> %10, %rdx.shuf
  %rdx.shuf2 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx2 = add <16 x i32> %bin.rdx, %rdx.shuf2
  %rdx.shuf3 = shufflevector <16 x i32> %bin.rdx2, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx3 = add <16 x i32> %bin.rdx2, %rdx.shuf3
  %rdx.shuf4 = shufflevector <16 x i32> %bin.rdx3, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx4 = add <16 x i32> %bin.rdx3, %rdx.shuf4
  %12 = extractelement <16 x i32> %bin.rdx4, i32 0
  ret i32 %12
}

define dso_local i32 @sad_16i8_512() "min-legal-vector-width"="512" {
; CHECK-SKX-LABEL: sad_16i8_512:
; CHECK-SKX: # %bb.0: # %entry
; CHECK-SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; CHECK-SKX-NEXT: movq $-1024, %rax # imm = 0xFC00
; CHECK-SKX-NEXT: .p2align 4
; CHECK-SKX-NEXT: .LBB11_1: # %vector.body
; CHECK-SKX-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-SKX-NEXT: vmovdqu a+1024(%rax), %xmm1
; CHECK-SKX-NEXT: vpsadbw b+1024(%rax), %xmm1, %xmm1
; CHECK-SKX-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-SKX-NEXT: addq $4, %rax
; CHECK-SKX-NEXT: jne .LBB11_1
; CHECK-SKX-NEXT: # %bb.2: # %middle.block
; CHECK-SKX-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; CHECK-SKX-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; CHECK-SKX-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; CHECK-SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-SKX-NEXT: vpsrlq $32, %xmm0, %xmm1
; CHECK-SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-SKX-NEXT: vmovd %xmm0, %eax
; CHECK-SKX-NEXT: vzeroupper
; CHECK-SKX-NEXT: retq
;
; CHECK-AVX512-LABEL: sad_16i8_512:
; CHECK-AVX512: # %bb.0: # %entry
; CHECK-AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0
; CHECK-AVX512-NEXT: movq $-1024, %rax # imm = 0xFC00
; CHECK-AVX512-NEXT: .p2align 4
; CHECK-AVX512-NEXT: .LBB11_1: # %vector.body
; CHECK-AVX512-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-AVX512-NEXT: vmovdqu a+1024(%rax), %xmm1
; CHECK-AVX512-NEXT: vpsadbw b+1024(%rax), %xmm1, %xmm1
; CHECK-AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-AVX512-NEXT: addq $4, %rax
; CHECK-AVX512-NEXT: jne .LBB11_1
; CHECK-AVX512-NEXT: # %bb.2: # %middle.block
; CHECK-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; CHECK-AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; CHECK-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-AVX512-NEXT: vpsrlq $32, %xmm0, %xmm1
; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-AVX512-NEXT: vmovd %xmm0, %eax
; CHECK-AVX512-NEXT: vzeroupper
; CHECK-AVX512-NEXT: retq
;
; CHECK-VBMI-LABEL: sad_16i8_512:
; CHECK-VBMI: # %bb.0: # %entry
; CHECK-VBMI-NEXT: vpxor %xmm0, %xmm0, %xmm0
; CHECK-VBMI-NEXT: movq $-1024, %rax # imm = 0xFC00
; CHECK-VBMI-NEXT: .p2align 4
; CHECK-VBMI-NEXT: .LBB11_1: # %vector.body
; CHECK-VBMI-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-VBMI-NEXT: vmovdqu a+1024(%rax), %xmm1
; CHECK-VBMI-NEXT: vpsadbw b+1024(%rax), %xmm1, %xmm1
; CHECK-VBMI-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-VBMI-NEXT: addq $4, %rax
; CHECK-VBMI-NEXT: jne .LBB11_1
; CHECK-VBMI-NEXT: # %bb.2: # %middle.block
; CHECK-VBMI-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; CHECK-VBMI-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; CHECK-VBMI-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-VBMI-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-VBMI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; CHECK-VBMI-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-VBMI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; CHECK-VBMI-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-VBMI-NEXT: vmovd %xmm0, %eax
; CHECK-VBMI-NEXT: vzeroupper
; CHECK-VBMI-NEXT: retq
entry:
  br label %vector.body

vector.body:
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.phi = phi <16 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ]
  %0 = getelementptr inbounds [1024 x i8], ptr @a, i64 0, i64 %index
  %1 = bitcast ptr %0 to ptr
  %wide.load = load <16 x i8>, ptr %1, align 4
  %2 = zext <16 x i8> %wide.load to <16 x i32>
  %3 = getelementptr inbounds [1024 x i8], ptr @b, i64 0, i64 %index
  %4 = bitcast ptr %3 to ptr
  %wide.load1 = load <16 x i8>, ptr %4, align 4
  %5 = zext <16 x i8> %wide.load1 to <16 x i32>
  %6 = sub nsw <16 x i32> %2, %5
  %7 = icmp sgt <16 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
  %8 = sub nsw <16 x i32> zeroinitializer, %6
  %9 = select <16 x i1> %7, <16 x i32> %6, <16 x i32> %8
  %10 = add nsw <16 x i32> %9, %vec.phi
  %index.next = add i64 %index, 4
  %11 = icmp eq i64 %index.next, 1024
  br i1 %11, label %middle.block, label %vector.body

middle.block:
  %rdx.shuf = shufflevector <16 x i32> %10, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <16 x i32> %10, %rdx.shuf
  %rdx.shuf2 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx2 = add <16 x i32> %bin.rdx, %rdx.shuf2
  %rdx.shuf3 = shufflevector <16 x i32> %bin.rdx2, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx3 = add <16 x i32> %bin.rdx2, %rdx.shuf3
  %rdx.shuf4 = shufflevector <16 x i32> %bin.rdx3, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx4 = add <16 x i32> %bin.rdx3, %rdx.shuf4
  %12 = extractelement <16 x i32> %bin.rdx4, i32 0
  ret i32 %12
}

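; sitofp/uitofp from a <16 x i1> mask: the mask is materialized with vpmovm2d
; and converted; at 256 bits the k-register is split with kshiftrw first.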
define dso_local void @sbto16f32_256(<16 x i16> %a, ptr %res) "min-legal-vector-width"="256" {
; CHECK-LABEL: sbto16f32_256:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmovw2m %ymm0, %k0
; CHECK-NEXT: kshiftrw $8, %k0, %k1
; CHECK-NEXT: vpmovm2d %k1, %ymm0
; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0
; CHECK-NEXT: vpmovm2d %k0, %ymm1
; CHECK-NEXT: vcvtdq2ps %ymm1, %ymm1
; CHECK-NEXT: vmovaps %ymm1, (%rdi)
; CHECK-NEXT: vmovaps %ymm0, 32(%rdi)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %mask = icmp slt <16 x i16> %a, zeroinitializer
  %1 = sitofp <16 x i1> %mask to <16 x float>
  store <16 x float> %1, ptr %res
  ret void
}

define dso_local void @sbto16f32_512(<16 x i16> %a, ptr %res) "min-legal-vector-width"="512" {
; CHECK-LABEL: sbto16f32_512:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmovw2m %ymm0, %k0
; CHECK-NEXT: vpmovm2d %k0, %zmm0
; CHECK-NEXT: vcvtdq2ps %zmm0, %zmm0
; CHECK-NEXT: vmovaps %zmm0, (%rdi)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %mask = icmp slt <16 x i16> %a, zeroinitializer
  %1 = sitofp <16 x i1> %mask to <16 x float>
  store <16 x float> %1, ptr %res
  ret void
}

define dso_local void @sbto16f64_256(<16 x i16> %a, ptr %res) "min-legal-vector-width"="256" {
; CHECK-LABEL: sbto16f64_256:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmovw2m %ymm0, %k0
; CHECK-NEXT: kshiftrw $8, %k0, %k1
; CHECK-NEXT: vpmovm2d %k1, %ymm0
; CHECK-NEXT: vcvtdq2pd %xmm0, %ymm1
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
; CHECK-NEXT: vcvtdq2pd %xmm0, %ymm0
; CHECK-NEXT: vpmovm2d %k0, %ymm2
; CHECK-NEXT: vcvtdq2pd %xmm2, %ymm3
; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm2
; CHECK-NEXT: vcvtdq2pd %xmm2, %ymm2
; CHECK-NEXT: vmovaps %ymm2, 32(%rdi)
; CHECK-NEXT: vmovaps %ymm3, (%rdi)
; CHECK-NEXT: vmovaps %ymm0, 96(%rdi)
; CHECK-NEXT: vmovaps %ymm1, 64(%rdi)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %mask = icmp slt <16 x i16> %a, zeroinitializer
  %1 = sitofp <16 x i1> %mask to <16 x double>
  store <16 x double> %1, ptr %res
  ret void
}

define dso_local void @sbto16f64_512(<16 x i16> %a, ptr %res) "min-legal-vector-width"="512" {
; CHECK-LABEL: sbto16f64_512:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmovw2m %ymm0, %k0
; CHECK-NEXT: vpmovm2d %k0, %zmm0
; CHECK-NEXT: vcvtdq2pd %ymm0, %zmm1
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; CHECK-NEXT: vcvtdq2pd %ymm0, %zmm0
; CHECK-NEXT: vmovaps %zmm0, 64(%rdi)
; CHECK-NEXT: vmovaps %zmm1, (%rdi)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %mask = icmp slt <16 x i16> %a, zeroinitializer
  %1 = sitofp <16 x i1> %mask to <16 x double>
  store <16 x double> %1, ptr %res
  ret void
}

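; The unsigned versions additionally shift the sign-extended mask right by 31
; (vpsrld $31) to produce 0/1 before the conversion.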
define dso_local void @ubto16f32_256(<16 x i16> %a, ptr %res) "min-legal-vector-width"="256" {
; CHECK-LABEL: ubto16f32_256:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmovw2m %ymm0, %k0
; CHECK-NEXT: kshiftrw $8, %k0, %k1
; CHECK-NEXT: vpmovm2d %k1, %ymm0
; CHECK-NEXT: vpsrld $31, %ymm0, %ymm0
; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0
; CHECK-NEXT: vpmovm2d %k0, %ymm1
; CHECK-NEXT: vpsrld $31, %ymm1, %ymm1
; CHECK-NEXT: vcvtdq2ps %ymm1, %ymm1
; CHECK-NEXT: vmovaps %ymm1, (%rdi)
; CHECK-NEXT: vmovaps %ymm0, 32(%rdi)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %mask = icmp slt <16 x i16> %a, zeroinitializer
  %1 = uitofp <16 x i1> %mask to <16 x float>
  store <16 x float> %1, ptr %res
  ret void
}

define dso_local void @ubto16f32_512(<16 x i16> %a, ptr %res) "min-legal-vector-width"="512" {
; CHECK-LABEL: ubto16f32_512:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmovw2m %ymm0, %k0
; CHECK-NEXT: vpmovm2d %k0, %zmm0
; CHECK-NEXT: vpsrld $31, %zmm0, %zmm0
; CHECK-NEXT: vcvtdq2ps %zmm0, %zmm0
; CHECK-NEXT: vmovaps %zmm0, (%rdi)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %mask = icmp slt <16 x i16> %a, zeroinitializer
  %1 = uitofp <16 x i1> %mask to <16 x float>
  store <16 x float> %1, ptr %res
  ret void
}

define dso_local void @ubto16f64_256(<16 x i16> %a, ptr %res) "min-legal-vector-width"="256" {
; CHECK-LABEL: ubto16f64_256:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmovw2m %ymm0, %k0
; CHECK-NEXT: kshiftrw $8, %k0, %k1
; CHECK-NEXT: vpmovm2d %k1, %ymm0
; CHECK-NEXT: vpsrld $31, %ymm0, %ymm0
; CHECK-NEXT: vcvtdq2pd %xmm0, %ymm1
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
; CHECK-NEXT: vcvtdq2pd %xmm0, %ymm0
; CHECK-NEXT: vpmovm2d %k0, %ymm2
; CHECK-NEXT: vpsrld $31, %ymm2, %ymm2
; CHECK-NEXT: vcvtdq2pd %xmm2, %ymm3
; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm2
; CHECK-NEXT: vcvtdq2pd %xmm2, %ymm2
; CHECK-NEXT: vmovaps %ymm2, 32(%rdi)
; CHECK-NEXT: vmovaps %ymm3, (%rdi)
; CHECK-NEXT: vmovaps %ymm0, 96(%rdi)
; CHECK-NEXT: vmovaps %ymm1, 64(%rdi)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %mask = icmp slt <16 x i16> %a, zeroinitializer
  %1 = uitofp <16 x i1> %mask to <16 x double>
  store <16 x double> %1, ptr %res
  ret void
}

define dso_local void @ubto16f64_512(<16 x i16> %a, ptr %res) "min-legal-vector-width"="512" {
; CHECK-LABEL: ubto16f64_512:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmovw2m %ymm0, %k0
; CHECK-NEXT: vpmovm2d %k0, %zmm0
; CHECK-NEXT: vpsrld $31, %zmm0, %zmm0
; CHECK-NEXT: vcvtdq2pd %ymm0, %zmm1
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; CHECK-NEXT: vcvtdq2pd %ymm0, %zmm0
; CHECK-NEXT: vmovaps %zmm0, 64(%rdi)
; CHECK-NEXT: vmovaps %zmm1, (%rdi)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %mask = icmp slt <16 x i16> %a, zeroinitializer
  %1 = uitofp <16 x i1> %mask to <16 x double>
  store <16 x double> %1, ptr %res
  ret void
}

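; fptoui/fptosi to <16 x i1> used as a select mask: at 256 bits the two
; 8-element masks are rejoined with kunpckbw. The unsigned version shifts
; bit 0 into the sign bit with vpslld $31 before vpmovd2m; the signed version
; can use vpmovd2m directly.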
define <16 x i16> @test_16f32toub_256(ptr %ptr, <16 x i16> %passthru) "min-legal-vector-width"="256" {
; CHECK-LABEL: test_16f32toub_256:
; CHECK: # %bb.0:
; CHECK-NEXT: vcvttps2dq (%rdi), %ymm1
; CHECK-NEXT: vpslld $31, %ymm1, %ymm1
; CHECK-NEXT: vpmovd2m %ymm1, %k0
; CHECK-NEXT: vcvttps2dq 32(%rdi), %ymm1
; CHECK-NEXT: vpslld $31, %ymm1, %ymm1
; CHECK-NEXT: vpmovd2m %ymm1, %k1
; CHECK-NEXT: kunpckbw %k0, %k1, %k1
; CHECK-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
  %a = load <16 x float>, ptr %ptr
  %mask = fptoui <16 x float> %a to <16 x i1>
  %select = select <16 x i1> %mask, <16 x i16> %passthru, <16 x i16> zeroinitializer
  ret <16 x i16> %select
}

define <16 x i16> @test_16f32toub_512(ptr %ptr, <16 x i16> %passthru) "min-legal-vector-width"="512" {
; CHECK-LABEL: test_16f32toub_512:
; CHECK: # %bb.0:
; CHECK-NEXT: vcvttps2dq (%rdi), %zmm1
; CHECK-NEXT: vpslld $31, %zmm1, %zmm1
; CHECK-NEXT: vpmovd2m %zmm1, %k1
; CHECK-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
  %a = load <16 x float>, ptr %ptr
  %mask = fptoui <16 x float> %a to <16 x i1>
  %select = select <16 x i1> %mask, <16 x i16> %passthru, <16 x i16> zeroinitializer
  ret <16 x i16> %select
}

define <16 x i16> @test_16f32tosb_256(ptr %ptr, <16 x i16> %passthru) "min-legal-vector-width"="256" {
; CHECK-LABEL: test_16f32tosb_256:
; CHECK: # %bb.0:
; CHECK-NEXT: vcvttps2dq (%rdi), %ymm1
; CHECK-NEXT: vpmovd2m %ymm1, %k0
; CHECK-NEXT: vcvttps2dq 32(%rdi), %ymm1
; CHECK-NEXT: vpmovd2m %ymm1, %k1
; CHECK-NEXT: kunpckbw %k0, %k1, %k1
; CHECK-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
  %a = load <16 x float>, ptr %ptr
  %mask = fptosi <16 x float> %a to <16 x i1>
  %select = select <16 x i1> %mask, <16 x i16> %passthru, <16 x i16> zeroinitializer
  ret <16 x i16> %select
}

define <16 x i16> @test_16f32tosb_512(ptr %ptr, <16 x i16> %passthru) "min-legal-vector-width"="512" {
; CHECK-LABEL: test_16f32tosb_512:
; CHECK: # %bb.0:
; CHECK-NEXT: vcvttps2dq (%rdi), %zmm1
; CHECK-NEXT: vpmovd2m %zmm1, %k1
; CHECK-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
  %a = load <16 x float>, ptr %ptr
  %mask = fptosi <16 x float> %a to <16 x i1>
  %select = select <16 x i1> %mask, <16 x i16> %passthru, <16 x i16> zeroinitializer
  ret <16 x i16> %select
}

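; <64 x i8> multiply has no direct instruction: the bytes are split into even
; and odd halves multiplied with vpmaddubsw, then recombined, using vpermt2b/
; vpermi2b when VBMI is available and vpsllw/vpternlogq otherwise.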
define dso_local void @mul256(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="256" {
; CHECK-SKX-VBMI-LABEL: mul256:
; CHECK-SKX-VBMI: # %bb.0:
; CHECK-SKX-VBMI-NEXT: vmovdqa (%rdi), %ymm0
; CHECK-SKX-VBMI-NEXT: vmovdqa 32(%rdi), %ymm1
; CHECK-SKX-VBMI-NEXT: vmovdqa (%rsi), %ymm2
; CHECK-SKX-VBMI-NEXT: vmovdqa 32(%rsi), %ymm3
; CHECK-SKX-VBMI-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; CHECK-SKX-VBMI-NEXT: vpandn %ymm3, %ymm4, %ymm5
; CHECK-SKX-VBMI-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm5
; CHECK-SKX-VBMI-NEXT: vpand %ymm4, %ymm3, %ymm3
; CHECK-SKX-VBMI-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1
; CHECK-SKX-VBMI-NEXT: vmovdqa {{.*#+}} ymm3 = [0,32,2,34,4,36,6,38,8,40,10,42,12,44,14,46,16,48,18,50,20,52,22,54,24,56,26,58,28,60,30,62]
; CHECK-SKX-VBMI-NEXT: vpermt2b %ymm5, %ymm3, %ymm1
; CHECK-SKX-VBMI-NEXT: vpandn %ymm2, %ymm4, %ymm5
; CHECK-SKX-VBMI-NEXT: vpmaddubsw %ymm5, %ymm0, %ymm5
; CHECK-SKX-VBMI-NEXT: vpand %ymm4, %ymm2, %ymm2
; CHECK-SKX-VBMI-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0
; CHECK-SKX-VBMI-NEXT: vpermt2b %ymm5, %ymm3, %ymm0
; CHECK-SKX-VBMI-NEXT: vmovdqa %ymm0, (%rdx)
; CHECK-SKX-VBMI-NEXT: vmovdqa %ymm1, 32(%rdx)
; CHECK-SKX-VBMI-NEXT: vzeroupper
; CHECK-SKX-VBMI-NEXT: retq
;
; CHECK-AVX512-LABEL: mul256:
; CHECK-AVX512: # %bb.0:
; CHECK-AVX512-NEXT: vmovdqa (%rdi), %ymm0
; CHECK-AVX512-NEXT: vmovdqa 32(%rdi), %ymm1
; CHECK-AVX512-NEXT: vmovdqa (%rsi), %ymm2
; CHECK-AVX512-NEXT: vmovdqa 32(%rsi), %ymm3
; CHECK-AVX512-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; CHECK-AVX512-NEXT: vpand %ymm4, %ymm3, %ymm5
; CHECK-AVX512-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm5
; CHECK-AVX512-NEXT: vpandn %ymm3, %ymm4, %ymm3
; CHECK-AVX512-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1
; CHECK-AVX512-NEXT: vpsllw $8, %ymm1, %ymm1
; CHECK-AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm5 & ymm4)
; CHECK-AVX512-NEXT: vpand %ymm4, %ymm2, %ymm3
; CHECK-AVX512-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3
; CHECK-AVX512-NEXT: vpandn %ymm2, %ymm4, %ymm2
; CHECK-AVX512-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0
; CHECK-AVX512-NEXT: vpsllw $8, %ymm0, %ymm0
; CHECK-AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm3 & ymm4)
; CHECK-AVX512-NEXT: vmovdqa %ymm0, (%rdx)
; CHECK-AVX512-NEXT: vmovdqa %ymm1, 32(%rdx)
; CHECK-AVX512-NEXT: vzeroupper
; CHECK-AVX512-NEXT: retq
;
; CHECK-VBMI-LABEL: mul256:
; CHECK-VBMI: # %bb.0:
; CHECK-VBMI-NEXT: vmovdqa (%rdi), %ymm0
; CHECK-VBMI-NEXT: vmovdqa 32(%rdi), %ymm1
; CHECK-VBMI-NEXT: vmovdqa (%rsi), %ymm2
; CHECK-VBMI-NEXT: vmovdqa 32(%rsi), %ymm3
; CHECK-VBMI-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; CHECK-VBMI-NEXT: vpandn %ymm3, %ymm4, %ymm5
; CHECK-VBMI-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm5
; CHECK-VBMI-NEXT: vpand %ymm4, %ymm3, %ymm3
; CHECK-VBMI-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1
; CHECK-VBMI-NEXT: vmovdqa {{.*#+}} ymm3 = [0,32,2,34,4,36,6,38,8,40,10,42,12,44,14,46,16,48,18,50,20,52,22,54,24,56,26,58,28,60,30,62]
; CHECK-VBMI-NEXT: vpermt2b %ymm5, %ymm3, %ymm1
; CHECK-VBMI-NEXT: vpandn %ymm2, %ymm4, %ymm5
; CHECK-VBMI-NEXT: vpmaddubsw %ymm5, %ymm0, %ymm5
; CHECK-VBMI-NEXT: vpand %ymm4, %ymm2, %ymm2
; CHECK-VBMI-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0
; CHECK-VBMI-NEXT: vpermt2b %ymm5, %ymm3, %ymm0
; CHECK-VBMI-NEXT: vmovdqa %ymm0, (%rdx)
; CHECK-VBMI-NEXT: vmovdqa %ymm1, 32(%rdx)
; CHECK-VBMI-NEXT: vzeroupper
; CHECK-VBMI-NEXT: retq
  %d = load <64 x i8>, ptr %a
  %e = load <64 x i8>, ptr %b
  %f = mul <64 x i8> %d, %e
  store <64 x i8> %f, ptr %c
  ret void
}

define dso_local void @mul512(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="512" {
; CHECK-SKX-VBMI-LABEL: mul512:
; CHECK-SKX-VBMI: # %bb.0:
; CHECK-SKX-VBMI-NEXT: vmovdqa64 (%rdi), %zmm0
; CHECK-SKX-VBMI-NEXT: vmovdqa64 (%rsi), %zmm1
; CHECK-SKX-VBMI-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; CHECK-SKX-VBMI-NEXT: vpandnq %zmm1, %zmm2, %zmm3
; CHECK-SKX-VBMI-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3
; CHECK-SKX-VBMI-NEXT: vpandq %zmm2, %zmm1, %zmm1
; CHECK-SKX-VBMI-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0
; CHECK-SKX-VBMI-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,64,2,66,4,68,6,70,8,72,10,74,12,76,14,78,16,80,18,82,20,84,22,86,24,88,26,90,28,92,30,94,32,96,34,98,36,100,38,102,40,104,42,106,44,108,46,110,48,112,50,114,52,116,54,118,56,120,58,122,60,124,62,126]
; CHECK-SKX-VBMI-NEXT: vpermi2b %zmm3, %zmm0, %zmm1
; CHECK-SKX-VBMI-NEXT: vmovdqa64 %zmm1, (%rdx)
; CHECK-SKX-VBMI-NEXT: vzeroupper
; CHECK-SKX-VBMI-NEXT: retq
;
; CHECK-AVX512-LABEL: mul512:
; CHECK-AVX512: # %bb.0:
; CHECK-AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; CHECK-AVX512-NEXT: vmovdqa64 (%rsi), %zmm1
; CHECK-AVX512-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; CHECK-AVX512-NEXT: vpandq %zmm2, %zmm1, %zmm3
; CHECK-AVX512-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3
; CHECK-AVX512-NEXT: vpandnq %zmm1, %zmm2, %zmm1
; CHECK-AVX512-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0
; CHECK-AVX512-NEXT: vpsllw $8, %zmm0, %zmm0
; CHECK-AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm3 & zmm2)
; CHECK-AVX512-NEXT: vmovdqa64 %zmm0, (%rdx)
; CHECK-AVX512-NEXT: vzeroupper
; CHECK-AVX512-NEXT: retq
;
; CHECK-VBMI-LABEL: mul512:
; CHECK-VBMI: # %bb.0:
; CHECK-VBMI-NEXT: vmovdqa64 (%rdi), %zmm0
; CHECK-VBMI-NEXT: vmovdqa64 (%rsi), %zmm1
; CHECK-VBMI-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; CHECK-VBMI-NEXT: vpandnq %zmm1, %zmm2, %zmm3
; CHECK-VBMI-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3
; CHECK-VBMI-NEXT: vpandq %zmm2, %zmm1, %zmm1
; CHECK-VBMI-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0
; CHECK-VBMI-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,64,2,66,4,68,6,70,8,72,10,74,12,76,14,78,16,80,18,82,20,84,22,86,24,88,26,90,28,92,30,94,32,96,34,98,36,100,38,102,40,104,42,106,44,108,46,110,48,112,50,114,52,116,54,118,56,120,58,122,60,124,62,126]
; CHECK-VBMI-NEXT: vpermi2b %zmm3, %zmm0, %zmm1
; CHECK-VBMI-NEXT: vmovdqa64 %zmm1, (%rdx)
; CHECK-VBMI-NEXT: vzeroupper
; CHECK-VBMI-NEXT: retq
  %d = load <64 x i8>, ptr %a
  %e = load <64 x i8>, ptr %b
  %f = mul <64 x i8> %d, %e
  store <64 x i8> %f, ptr %c
  ret void
}

; This threw an assertion at one point.
define <4 x i32> @mload_v4i32(<4 x i32> %trigger, ptr %addr, <4 x i32> %dst) "min-legal-vector-width"="256" {
; CHECK-LABEL: mload_v4i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
; CHECK-NEXT: vpblendmd (%rdi), %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
  %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
  ret <4 x i32> %res
}
declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32, <4 x i1>, <4 x i32>)

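; Truncates from types wider than 256 bits: each ymm half is truncated with a
; vpmov* instruction and the pieces are reassembled with vpunpck*/vinserti128.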
define <16 x i32> @trunc_v16i64_v16i32(ptr %x) nounwind "min-legal-vector-width"="256" {
; CHECK-LABEL: trunc_v16i64_v16i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %ymm0
; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
; CHECK-NEXT: vmovdqa 64(%rdi), %ymm2
; CHECK-NEXT: vmovdqa 96(%rdi), %ymm3
; CHECK-NEXT: vpmovqd %ymm0, %xmm0
; CHECK-NEXT: vpmovqd %ymm1, %xmm1
; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; CHECK-NEXT: vpmovqd %ymm2, %xmm1
; CHECK-NEXT: vpmovqd %ymm3, %xmm2
; CHECK-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; CHECK-NEXT: retq
  %a = load <16 x i64>, ptr %x
  %b = trunc <16 x i64> %a to <16 x i32>
  ret <16 x i32> %b
}

define <16 x i8> @trunc_v16i64_v16i8(ptr %x) nounwind "min-legal-vector-width"="256" {
; CHECK-LABEL: trunc_v16i64_v16i8:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %ymm0
; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
; CHECK-NEXT: vmovdqa 64(%rdi), %ymm2
; CHECK-NEXT: vmovdqa 96(%rdi), %ymm3
; CHECK-NEXT: vpmovqb %ymm3, %xmm3
; CHECK-NEXT: vpmovqb %ymm2, %xmm2
; CHECK-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; CHECK-NEXT: vpmovqb %ymm1, %xmm1
; CHECK-NEXT: vpmovqb %ymm0, %xmm0
; CHECK-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %a = load <16 x i64>, ptr %x
  %b = trunc <16 x i64> %a to <16 x i8>
  ret <16 x i8> %b
}

define <16 x i8> @trunc_v16i32_v16i8(ptr %x) nounwind "min-legal-vector-width"="256" {
; CHECK-LABEL: trunc_v16i32_v16i8:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %ymm0
; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
; CHECK-NEXT: vpmovdb %ymm1, %xmm1
; CHECK-NEXT: vpmovdb %ymm0, %xmm0
; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %a = load <16 x i32>, ptr %x
  %b = trunc <16 x i32> %a to <16 x i8>
  ret <16 x i8> %b
}

define <8 x i8> @trunc_v8i64_v8i8(ptr %x) nounwind "min-legal-vector-width"="256" {
; CHECK-LABEL: trunc_v8i64_v8i8:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %ymm0
; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
; CHECK-NEXT: vpmovqb %ymm1, %xmm1
; CHECK-NEXT: vpmovqb %ymm0, %xmm0
; CHECK-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %a = load <8 x i64>, ptr %x
  %b = trunc <8 x i64> %a to <8 x i8>
  ret <8 x i8> %b
}

define <8 x i16> @trunc_v8i64_v8i16(ptr %x) nounwind "min-legal-vector-width"="256" {
; CHECK-LABEL: trunc_v8i64_v8i16:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %ymm0
; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
; CHECK-NEXT: vpmovqw %ymm1, %xmm1
; CHECK-NEXT: vpmovqw %ymm0, %xmm0
; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %a = load <8 x i64>, ptr %x
  %b = trunc <8 x i64> %a to <8 x i16>
  ret <8 x i16> %b
}

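; When the truncate's input is shifted so the discarded bits are known, pack
; instructions with a vpermq fixup (or a single vpermi2w/vpermi2b with VBMI)
; can be used instead.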
define <8 x i32> @trunc_v8i64_v8i32_zeroes(ptr %x) nounwind "min-legal-vector-width"="256" {
; CHECK-LABEL: trunc_v8i64_v8i32_zeroes:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsrlq $48, 32(%rdi), %ymm0
; CHECK-NEXT: vpsrlq $48, (%rdi), %ymm1
; CHECK-NEXT: vpackusdw %ymm0, %ymm1, %ymm0
; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; CHECK-NEXT: retq
  %a = load <8 x i64>, ptr %x
  %b = lshr <8 x i64> %a, <i64 48, i64 48, i64 48, i64 48, i64 48, i64 48, i64 48, i64 48>
  %c = trunc <8 x i64> %b to <8 x i32>
  ret <8 x i32> %c
}

define <16 x i16> @trunc_v16i32_v16i16_zeroes(ptr %x) nounwind "min-legal-vector-width"="256" {
; CHECK-LABEL: trunc_v16i32_v16i16_zeroes:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %ymm1
; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
; CHECK-NEXT: vpermi2w 32(%rdi), %ymm1, %ymm0
; CHECK-NEXT: retq
  %a = load <16 x i32>, ptr %x
  %b = lshr <16 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %c = trunc <16 x i32> %b to <16 x i16>
  ret <16 x i16> %c
}

define <32 x i8> @trunc_v32i16_v32i8_zeroes(ptr %x) nounwind "min-legal-vector-width"="256" {
; CHECK-SKX-VBMI-LABEL: trunc_v32i16_v32i8_zeroes:
; CHECK-SKX-VBMI: # %bb.0:
; CHECK-SKX-VBMI-NEXT: vmovdqa (%rdi), %ymm1
; CHECK-SKX-VBMI-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63]
; CHECK-SKX-VBMI-NEXT: vpermi2b 32(%rdi), %ymm1, %ymm0
; CHECK-SKX-VBMI-NEXT: retq
;
; CHECK-AVX512-LABEL: trunc_v32i16_v32i8_zeroes:
; CHECK-AVX512: # %bb.0:
; CHECK-AVX512-NEXT: vpsrlw $8, 32(%rdi), %ymm0
; CHECK-AVX512-NEXT: vpsrlw $8, (%rdi), %ymm1
; CHECK-AVX512-NEXT: vpackuswb %ymm0, %ymm1, %ymm0
; CHECK-AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; CHECK-AVX512-NEXT: retq
;
; CHECK-VBMI-LABEL: trunc_v32i16_v32i8_zeroes:
; CHECK-VBMI: # %bb.0:
; CHECK-VBMI-NEXT: vmovdqa (%rdi), %ymm1
; CHECK-VBMI-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63]
; CHECK-VBMI-NEXT: vpermi2b 32(%rdi), %ymm1, %ymm0
; CHECK-VBMI-NEXT: retq
  %a = load <32 x i16>, ptr %x
  %b = lshr <32 x i16> %a, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %c = trunc <32 x i16> %b to <32 x i8>
  ret <32 x i8> %c
}

define <8 x i32> @trunc_v8i64_v8i32_sign(ptr %x) nounwind "min-legal-vector-width"="256" {
; CHECK-LABEL: trunc_v8i64_v8i32_sign:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsraq $48, 32(%rdi), %ymm0
; CHECK-NEXT: vpsraq $48, (%rdi), %ymm1
; CHECK-NEXT: vpackssdw %ymm0, %ymm1, %ymm0
; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; CHECK-NEXT: retq
  %a = load <8 x i64>, ptr %x
  %b = ashr <8 x i64> %a, <i64 48, i64 48, i64 48, i64 48, i64 48, i64 48, i64 48, i64 48>
  %c = trunc <8 x i64> %b to <8 x i32>
  ret <8 x i32> %c
}

@trunc_v16i32_v16i16_sign(ptr %x) nounwind "min-legal-vector-width"="256" { 1182; CHECK-LABEL: trunc_v16i32_v16i16_sign: 1183; CHECK: # %bb.0: 1184; CHECK-NEXT: vmovdqa (%rdi), %ymm1 1185; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] 1186; CHECK-NEXT: vpermi2w 32(%rdi), %ymm1, %ymm0 1187; CHECK-NEXT: retq 1188 %a = load <16 x i32>, ptr %x 1189 %b = ashr <16 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> 1190 %c = trunc <16 x i32> %b to <16 x i16> 1191 ret <16 x i16> %c 1192} 1193 1194define <32 x i8> @trunc_v32i16_v32i8_sign(ptr %x) nounwind "min-legal-vector-width"="256" { 1195; CHECK-SKX-VBMI-LABEL: trunc_v32i16_v32i8_sign: 1196; CHECK-SKX-VBMI: # %bb.0: 1197; CHECK-SKX-VBMI-NEXT: vmovdqa (%rdi), %ymm1 1198; CHECK-SKX-VBMI-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63] 1199; CHECK-SKX-VBMI-NEXT: vpermi2b 32(%rdi), %ymm1, %ymm0 1200; CHECK-SKX-VBMI-NEXT: retq 1201; 1202; CHECK-AVX512-LABEL: trunc_v32i16_v32i8_sign: 1203; CHECK-AVX512: # %bb.0: 1204; CHECK-AVX512-NEXT: vpsrlw $8, 32(%rdi), %ymm0 1205; CHECK-AVX512-NEXT: vpsrlw $8, (%rdi), %ymm1 1206; CHECK-AVX512-NEXT: vpackuswb %ymm0, %ymm1, %ymm0 1207; CHECK-AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] 1208; CHECK-AVX512-NEXT: retq 1209; 1210; CHECK-VBMI-LABEL: trunc_v32i16_v32i8_sign: 1211; CHECK-VBMI: # %bb.0: 1212; CHECK-VBMI-NEXT: vmovdqa (%rdi), %ymm1 1213; CHECK-VBMI-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63] 1214; CHECK-VBMI-NEXT: vpermi2b 32(%rdi), %ymm1, %ymm0 1215; CHECK-VBMI-NEXT: retq 1216 %a = load <32 x i16>, ptr %x 1217 %b = ashr <32 x i16> %a, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> 1218 %c = trunc <32 x i16> %b to <32 x i8> 1219 ret <32 x i8> %c 1220} 1221 1222define dso_local void @zext_v16i8_v16i64(<16 x i8> %x, ptr %y) nounwind "min-legal-vector-width"="256" { 1223; CHECK-LABEL: zext_v16i8_v16i64: 1224; CHECK: # %bb.0: 1225; CHECK-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 1226; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] 1227; CHECK-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero 1228; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm1 1229; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] 1230; CHECK-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero 1231; CHECK-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero 1232; CHECK-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero 1233; CHECK-NEXT: vmovdqa %ymm0, (%rdi) 1234; CHECK-NEXT: vmovdqa %ymm1, 64(%rdi) 1235; CHECK-NEXT: vmovdqa %ymm3, 96(%rdi) 1236; CHECK-NEXT: vmovdqa %ymm2, 32(%rdi) 1237; CHECK-NEXT: vzeroupper 1238; CHECK-NEXT: 
retq 1239 %a = zext <16 x i8> %x to <16 x i64> 1240 store <16 x i64> %a, ptr %y 1241 ret void 1242} 1243 1244define dso_local void @sext_v16i8_v16i64(<16 x i8> %x, ptr %y) nounwind "min-legal-vector-width"="256" { 1245; CHECK-LABEL: sext_v16i8_v16i64: 1246; CHECK: # %bb.0: 1247; CHECK-NEXT: vpmovsxbw %xmm0, %ymm1 1248; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] 1249; CHECK-NEXT: vpmovsxwq %xmm2, %ymm2 1250; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm1 1251; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] 1252; CHECK-NEXT: vpmovsxwq %xmm3, %ymm3 1253; CHECK-NEXT: vpmovsxwq %xmm1, %ymm1 1254; CHECK-NEXT: vpmovsxbq %xmm0, %ymm0 1255; CHECK-NEXT: vmovdqa %ymm0, (%rdi) 1256; CHECK-NEXT: vmovdqa %ymm1, 64(%rdi) 1257; CHECK-NEXT: vmovdqa %ymm3, 96(%rdi) 1258; CHECK-NEXT: vmovdqa %ymm2, 32(%rdi) 1259; CHECK-NEXT: vzeroupper 1260; CHECK-NEXT: retq 1261 %a = sext <16 x i8> %x to <16 x i64> 1262 store <16 x i64> %a, ptr %y 1263 ret void 1264} 1265 1266define dso_local void @vselect_split_v8i16_setcc(<8 x i16> %s, <8 x i16> %t, ptr %p, ptr %q, ptr %r) "min-legal-vector-width"="256" { 1267; CHECK-LABEL: vselect_split_v8i16_setcc: 1268; CHECK: # %bb.0: 1269; CHECK-NEXT: vmovdqa (%rsi), %ymm2 1270; CHECK-NEXT: vmovdqa 32(%rsi), %ymm3 1271; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 1272; CHECK-NEXT: kshiftrb $4, %k1, %k2 1273; CHECK-NEXT: vmovdqa64 32(%rdi), %ymm3 {%k2} 1274; CHECK-NEXT: vmovdqa64 (%rdi), %ymm2 {%k1} 1275; CHECK-NEXT: vmovdqa %ymm2, (%rdx) 1276; CHECK-NEXT: vmovdqa %ymm3, 32(%rdx) 1277; CHECK-NEXT: vzeroupper 1278; CHECK-NEXT: retq 1279 %x = load <8 x i64>, ptr %p 1280 %y = load <8 x i64>, ptr %q 1281 %a = icmp eq <8 x i16> %s, %t 1282 %b = select <8 x i1> %a, <8 x i64> %x, <8 x i64> %y 1283 store <8 x i64> %b, ptr %r 1284 ret void 1285} 1286 1287define dso_local void @vselect_split_v8i32_setcc(<8 x i32> %s, <8 x i32> %t, ptr %p, ptr %q, ptr %r) "min-legal-vector-width"="256" { 1288; CHECK-LABEL: vselect_split_v8i32_setcc: 1289; CHECK: # %bb.0: 1290; CHECK-NEXT: vmovdqa (%rsi), %ymm2 1291; CHECK-NEXT: vmovdqa 32(%rsi), %ymm3 1292; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 1293; CHECK-NEXT: kshiftrb $4, %k1, %k2 1294; CHECK-NEXT: vmovdqa64 32(%rdi), %ymm3 {%k2} 1295; CHECK-NEXT: vmovdqa64 (%rdi), %ymm2 {%k1} 1296; CHECK-NEXT: vmovdqa %ymm2, (%rdx) 1297; CHECK-NEXT: vmovdqa %ymm3, 32(%rdx) 1298; CHECK-NEXT: vzeroupper 1299; CHECK-NEXT: retq 1300 %x = load <8 x i64>, ptr %p 1301 %y = load <8 x i64>, ptr %q 1302 %a = icmp eq <8 x i32> %s, %t 1303 %b = select <8 x i1> %a, <8 x i64> %x, <8 x i64> %y 1304 store <8 x i64> %b, ptr %r 1305 ret void 1306} 1307 1308define dso_local void @vselect_split_v16i8_setcc(<16 x i8> %s, <16 x i8> %t, ptr %p, ptr %q, ptr %r) "min-legal-vector-width"="256" { 1309; CHECK-LABEL: vselect_split_v16i8_setcc: 1310; CHECK: # %bb.0: 1311; CHECK-NEXT: vmovdqa (%rsi), %ymm2 1312; CHECK-NEXT: vmovdqa 32(%rsi), %ymm3 1313; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k1 1314; CHECK-NEXT: kshiftrw $8, %k1, %k2 1315; CHECK-NEXT: vmovdqa32 32(%rdi), %ymm3 {%k2} 1316; CHECK-NEXT: vmovdqa32 (%rdi), %ymm2 {%k1} 1317; CHECK-NEXT: vmovdqa %ymm2, (%rdx) 1318; CHECK-NEXT: vmovdqa %ymm3, 32(%rdx) 1319; CHECK-NEXT: vzeroupper 1320; CHECK-NEXT: retq 1321 %x = load <16 x i32>, ptr %p 1322 %y = load <16 x i32>, ptr %q 1323 %a = icmp eq <16 x i8> %s, %t 1324 %b = select <16 x i1> %a, <16 x i32> %x, <16 x i32> %y 1325 store <16 x i32> %b, ptr %r 1326 ret void 1327} 1328 1329define dso_local void @vselect_split_v16i16_setcc(<16 x i16> %s, <16 x i16> %t, ptr %p, ptr %q, ptr %r) 
"min-legal-vector-width"="256" { 1330; CHECK-LABEL: vselect_split_v16i16_setcc: 1331; CHECK: # %bb.0: 1332; CHECK-NEXT: vmovdqa (%rsi), %ymm2 1333; CHECK-NEXT: vmovdqa 32(%rsi), %ymm3 1334; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 1335; CHECK-NEXT: kshiftrw $8, %k1, %k2 1336; CHECK-NEXT: vmovdqa32 32(%rdi), %ymm3 {%k2} 1337; CHECK-NEXT: vmovdqa32 (%rdi), %ymm2 {%k1} 1338; CHECK-NEXT: vmovdqa %ymm2, (%rdx) 1339; CHECK-NEXT: vmovdqa %ymm3, 32(%rdx) 1340; CHECK-NEXT: vzeroupper 1341; CHECK-NEXT: retq 1342 %x = load <16 x i32>, ptr %p 1343 %y = load <16 x i32>, ptr %q 1344 %a = icmp eq <16 x i16> %s, %t 1345 %b = select <16 x i1> %a, <16 x i32> %x, <16 x i32> %y 1346 store <16 x i32> %b, ptr %r 1347 ret void 1348} 1349 1350define <16 x i8> @trunc_packus_v16i32_v16i8(ptr %p) "min-legal-vector-width"="256" { 1351; CHECK-LABEL: trunc_packus_v16i32_v16i8: 1352; CHECK: # %bb.0: 1353; CHECK-NEXT: vmovdqa (%rdi), %ymm0 1354; CHECK-NEXT: vpackusdw 32(%rdi), %ymm0, %ymm0 1355; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] 1356; CHECK-NEXT: vpmovuswb %ymm0, %xmm0 1357; CHECK-NEXT: vzeroupper 1358; CHECK-NEXT: retq 1359 %a = load <16 x i32>, ptr %p 1360 %b = icmp slt <16 x i32> %a, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255> 1361 %c = select <16 x i1> %b, <16 x i32> %a, <16 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255> 1362 %d = icmp sgt <16 x i32> %c, zeroinitializer 1363 %e = select <16 x i1> %d, <16 x i32> %c, <16 x i32> zeroinitializer 1364 %f = trunc <16 x i32> %e to <16 x i8> 1365 ret <16 x i8> %f 1366} 1367 1368define dso_local void @trunc_packus_v16i32_v16i8_store(ptr %p, ptr %q) "min-legal-vector-width"="256" { 1369; CHECK-LABEL: trunc_packus_v16i32_v16i8_store: 1370; CHECK: # %bb.0: 1371; CHECK-NEXT: vmovdqa (%rdi), %ymm0 1372; CHECK-NEXT: vpackusdw 32(%rdi), %ymm0, %ymm0 1373; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] 1374; CHECK-NEXT: vpmovuswb %ymm0, (%rsi) 1375; CHECK-NEXT: vzeroupper 1376; CHECK-NEXT: retq 1377 %a = load <16 x i32>, ptr %p 1378 %b = icmp slt <16 x i32> %a, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255> 1379 %c = select <16 x i1> %b, <16 x i32> %a, <16 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255> 1380 %d = icmp sgt <16 x i32> %c, zeroinitializer 1381 %e = select <16 x i1> %d, <16 x i32> %c, <16 x i32> zeroinitializer 1382 %f = trunc <16 x i32> %e to <16 x i8> 1383 store <16 x i8> %f, ptr %q 1384 ret void 1385} 1386 1387define <64 x i1> @v64i1_argument_return(<64 x i1> %x) "min-legal-vector-width"="256" { 1388; CHECK-LABEL: v64i1_argument_return: 1389; CHECK: # %bb.0: 1390; CHECK-NEXT: retq 1391 ret <64 x i1> %x 1392} 1393 1394define dso_local void @v64i1_shuffle(ptr %x, ptr %y) "min-legal-vector-width"="256" { 1395; CHECK-LABEL: v64i1_shuffle: 1396; CHECK: # %bb.0: # %entry 1397; CHECK-NEXT: vmovdqa (%rdi), %ymm1 1398; CHECK-NEXT: vmovdqa 32(%rdi), %ymm0 1399; CHECK-NEXT: vptestnmb %ymm1, %ymm1, %k0 1400; CHECK-NEXT: kshiftrd $1, %k0, %k1 1401; CHECK-NEXT: kshiftlq $63, %k0, %k2 1402; CHECK-NEXT: kshiftrq $62, %k2, %k2 1403; CHECK-NEXT: kshiftlq $63, %k1, %k1 1404; CHECK-NEXT: kshiftrq $63, %k1, %k1 1405; CHECK-NEXT: korq %k2, %k1, %k1 1406; CHECK-NEXT: movq 
$-5, %rax 1407; CHECK-NEXT: kmovq %rax, %k2 1408; CHECK-NEXT: kandq %k2, %k1, %k1 1409; CHECK-NEXT: kshiftrd $3, %k0, %k2 1410; CHECK-NEXT: kshiftlq $63, %k2, %k2 1411; CHECK-NEXT: kshiftrq $61, %k2, %k2 1412; CHECK-NEXT: korq %k2, %k1, %k1 1413; CHECK-NEXT: movq $-9, %rax 1414; CHECK-NEXT: kmovq %rax, %k2 1415; CHECK-NEXT: kandq %k2, %k1, %k1 1416; CHECK-NEXT: kshiftrd $2, %k0, %k2 1417; CHECK-NEXT: kshiftlq $63, %k2, %k2 1418; CHECK-NEXT: kshiftrq $60, %k2, %k2 1419; CHECK-NEXT: korq %k2, %k1, %k1 1420; CHECK-NEXT: movq $-17, %rax 1421; CHECK-NEXT: kmovq %rax, %k2 1422; CHECK-NEXT: kandq %k2, %k1, %k1 1423; CHECK-NEXT: kshiftrd $5, %k0, %k2 1424; CHECK-NEXT: kshiftlq $63, %k2, %k2 1425; CHECK-NEXT: kshiftrq $59, %k2, %k2 1426; CHECK-NEXT: korq %k2, %k1, %k1 1427; CHECK-NEXT: movq $-33, %rax 1428; CHECK-NEXT: kmovq %rax, %k2 1429; CHECK-NEXT: kandq %k2, %k1, %k1 1430; CHECK-NEXT: kshiftrd $4, %k0, %k2 1431; CHECK-NEXT: kshiftlq $63, %k2, %k2 1432; CHECK-NEXT: kshiftrq $58, %k2, %k2 1433; CHECK-NEXT: korq %k2, %k1, %k1 1434; CHECK-NEXT: movq $-65, %rax 1435; CHECK-NEXT: kmovq %rax, %k2 1436; CHECK-NEXT: kandq %k2, %k1, %k1 1437; CHECK-NEXT: kshiftrd $7, %k0, %k2 1438; CHECK-NEXT: kshiftlq $63, %k2, %k2 1439; CHECK-NEXT: kshiftrq $57, %k2, %k2 1440; CHECK-NEXT: korq %k2, %k1, %k1 1441; CHECK-NEXT: movq $-129, %rax 1442; CHECK-NEXT: kmovq %rax, %k2 1443; CHECK-NEXT: kandq %k2, %k1, %k1 1444; CHECK-NEXT: kshiftrd $6, %k0, %k2 1445; CHECK-NEXT: kshiftlq $63, %k2, %k2 1446; CHECK-NEXT: kshiftrq $56, %k2, %k2 1447; CHECK-NEXT: korq %k2, %k1, %k1 1448; CHECK-NEXT: movq $-257, %rax # imm = 0xFEFF 1449; CHECK-NEXT: kmovq %rax, %k2 1450; CHECK-NEXT: kandq %k2, %k1, %k1 1451; CHECK-NEXT: kshiftrd $9, %k0, %k2 1452; CHECK-NEXT: kshiftlq $63, %k2, %k2 1453; CHECK-NEXT: kshiftrq $55, %k2, %k2 1454; CHECK-NEXT: korq %k2, %k1, %k1 1455; CHECK-NEXT: movq $-513, %rax # imm = 0xFDFF 1456; CHECK-NEXT: kmovq %rax, %k2 1457; CHECK-NEXT: kandq %k2, %k1, %k1 1458; CHECK-NEXT: kshiftrd $8, %k0, %k2 1459; CHECK-NEXT: kshiftlq $63, %k2, %k2 1460; CHECK-NEXT: kshiftrq $54, %k2, %k2 1461; CHECK-NEXT: korq %k2, %k1, %k1 1462; CHECK-NEXT: movq $-1025, %rax # imm = 0xFBFF 1463; CHECK-NEXT: kmovq %rax, %k2 1464; CHECK-NEXT: kandq %k2, %k1, %k1 1465; CHECK-NEXT: kshiftrd $11, %k0, %k2 1466; CHECK-NEXT: kshiftlq $63, %k2, %k2 1467; CHECK-NEXT: kshiftrq $53, %k2, %k2 1468; CHECK-NEXT: korq %k2, %k1, %k1 1469; CHECK-NEXT: movq $-2049, %rax # imm = 0xF7FF 1470; CHECK-NEXT: kmovq %rax, %k2 1471; CHECK-NEXT: kandq %k2, %k1, %k1 1472; CHECK-NEXT: kshiftrd $10, %k0, %k2 1473; CHECK-NEXT: kshiftlq $63, %k2, %k2 1474; CHECK-NEXT: kshiftrq $52, %k2, %k2 1475; CHECK-NEXT: korq %k2, %k1, %k1 1476; CHECK-NEXT: movq $-4097, %rax # imm = 0xEFFF 1477; CHECK-NEXT: kmovq %rax, %k2 1478; CHECK-NEXT: kandq %k2, %k1, %k1 1479; CHECK-NEXT: kshiftrd $13, %k0, %k2 1480; CHECK-NEXT: kshiftlq $63, %k2, %k2 1481; CHECK-NEXT: kshiftrq $51, %k2, %k2 1482; CHECK-NEXT: korq %k2, %k1, %k1 1483; CHECK-NEXT: movq $-8193, %rax # imm = 0xDFFF 1484; CHECK-NEXT: kmovq %rax, %k2 1485; CHECK-NEXT: kandq %k2, %k1, %k1 1486; CHECK-NEXT: kshiftrd $12, %k0, %k2 1487; CHECK-NEXT: kshiftlq $63, %k2, %k2 1488; CHECK-NEXT: kshiftrq $50, %k2, %k2 1489; CHECK-NEXT: korq %k2, %k1, %k1 1490; CHECK-NEXT: movq $-16385, %rax # imm = 0xBFFF 1491; CHECK-NEXT: kmovq %rax, %k2 1492; CHECK-NEXT: kandq %k2, %k1, %k1 1493; CHECK-NEXT: kshiftrd $15, %k0, %k2 1494; CHECK-NEXT: kshiftlq $63, %k2, %k2 1495; CHECK-NEXT: kshiftrq $49, %k2, %k2 1496; CHECK-NEXT: korq %k2, %k1, %k1 1497; 
CHECK-NEXT: movq $-32769, %rax # imm = 0xFFFF7FFF 1498; CHECK-NEXT: kmovq %rax, %k2 1499; CHECK-NEXT: kandq %k2, %k1, %k1 1500; CHECK-NEXT: kshiftrd $14, %k0, %k2 1501; CHECK-NEXT: kshiftlq $63, %k2, %k2 1502; CHECK-NEXT: kshiftrq $48, %k2, %k2 1503; CHECK-NEXT: korq %k2, %k1, %k1 1504; CHECK-NEXT: movq $-65537, %rax # imm = 0xFFFEFFFF 1505; CHECK-NEXT: kmovq %rax, %k2 1506; CHECK-NEXT: kandq %k2, %k1, %k1 1507; CHECK-NEXT: kshiftrd $17, %k0, %k2 1508; CHECK-NEXT: kshiftlq $63, %k2, %k2 1509; CHECK-NEXT: kshiftrq $47, %k2, %k2 1510; CHECK-NEXT: korq %k2, %k1, %k1 1511; CHECK-NEXT: movq $-131073, %rax # imm = 0xFFFDFFFF 1512; CHECK-NEXT: kmovq %rax, %k2 1513; CHECK-NEXT: kandq %k2, %k1, %k1 1514; CHECK-NEXT: kshiftrd $16, %k0, %k2 1515; CHECK-NEXT: kshiftlq $63, %k2, %k2 1516; CHECK-NEXT: kshiftrq $46, %k2, %k2 1517; CHECK-NEXT: korq %k2, %k1, %k1 1518; CHECK-NEXT: movq $-262145, %rax # imm = 0xFFFBFFFF 1519; CHECK-NEXT: kmovq %rax, %k2 1520; CHECK-NEXT: kandq %k2, %k1, %k1 1521; CHECK-NEXT: kshiftrd $19, %k0, %k2 1522; CHECK-NEXT: kshiftlq $63, %k2, %k2 1523; CHECK-NEXT: kshiftrq $45, %k2, %k2 1524; CHECK-NEXT: korq %k2, %k1, %k1 1525; CHECK-NEXT: movq $-524289, %rax # imm = 0xFFF7FFFF 1526; CHECK-NEXT: kmovq %rax, %k2 1527; CHECK-NEXT: kandq %k2, %k1, %k1 1528; CHECK-NEXT: kshiftrd $18, %k0, %k2 1529; CHECK-NEXT: kshiftlq $63, %k2, %k2 1530; CHECK-NEXT: kshiftrq $44, %k2, %k2 1531; CHECK-NEXT: korq %k2, %k1, %k1 1532; CHECK-NEXT: movq $-1048577, %rax # imm = 0xFFEFFFFF 1533; CHECK-NEXT: kmovq %rax, %k2 1534; CHECK-NEXT: kandq %k2, %k1, %k1 1535; CHECK-NEXT: kshiftrd $21, %k0, %k2 1536; CHECK-NEXT: kshiftlq $63, %k2, %k2 1537; CHECK-NEXT: kshiftrq $43, %k2, %k2 1538; CHECK-NEXT: korq %k2, %k1, %k1 1539; CHECK-NEXT: movq $-2097153, %rax # imm = 0xFFDFFFFF 1540; CHECK-NEXT: kmovq %rax, %k2 1541; CHECK-NEXT: kandq %k2, %k1, %k1 1542; CHECK-NEXT: kshiftrd $20, %k0, %k2 1543; CHECK-NEXT: kshiftlq $63, %k2, %k2 1544; CHECK-NEXT: kshiftrq $42, %k2, %k2 1545; CHECK-NEXT: korq %k2, %k1, %k1 1546; CHECK-NEXT: movq $-4194305, %rax # imm = 0xFFBFFFFF 1547; CHECK-NEXT: kmovq %rax, %k2 1548; CHECK-NEXT: kandq %k2, %k1, %k1 1549; CHECK-NEXT: kshiftrd $23, %k0, %k2 1550; CHECK-NEXT: kshiftlq $63, %k2, %k2 1551; CHECK-NEXT: kshiftrq $41, %k2, %k2 1552; CHECK-NEXT: korq %k2, %k1, %k1 1553; CHECK-NEXT: movq $-8388609, %rax # imm = 0xFF7FFFFF 1554; CHECK-NEXT: kmovq %rax, %k2 1555; CHECK-NEXT: kandq %k2, %k1, %k1 1556; CHECK-NEXT: kshiftrd $22, %k0, %k2 1557; CHECK-NEXT: kshiftlq $63, %k2, %k2 1558; CHECK-NEXT: kshiftrq $40, %k2, %k2 1559; CHECK-NEXT: korq %k2, %k1, %k1 1560; CHECK-NEXT: movq $-16777217, %rax # imm = 0xFEFFFFFF 1561; CHECK-NEXT: kmovq %rax, %k2 1562; CHECK-NEXT: kandq %k2, %k1, %k1 1563; CHECK-NEXT: kshiftrd $25, %k0, %k2 1564; CHECK-NEXT: kshiftlq $63, %k2, %k2 1565; CHECK-NEXT: kshiftrq $39, %k2, %k2 1566; CHECK-NEXT: korq %k2, %k1, %k1 1567; CHECK-NEXT: movq $-33554433, %rax # imm = 0xFDFFFFFF 1568; CHECK-NEXT: kmovq %rax, %k2 1569; CHECK-NEXT: kandq %k2, %k1, %k1 1570; CHECK-NEXT: kshiftrd $24, %k0, %k2 1571; CHECK-NEXT: kshiftlq $63, %k2, %k2 1572; CHECK-NEXT: kshiftrq $38, %k2, %k2 1573; CHECK-NEXT: korq %k2, %k1, %k1 1574; CHECK-NEXT: movq $-67108865, %rax # imm = 0xFBFFFFFF 1575; CHECK-NEXT: kmovq %rax, %k2 1576; CHECK-NEXT: kandq %k2, %k1, %k1 1577; CHECK-NEXT: kshiftrd $27, %k0, %k2 1578; CHECK-NEXT: kshiftlq $63, %k2, %k2 1579; CHECK-NEXT: kshiftrq $37, %k2, %k2 1580; CHECK-NEXT: korq %k2, %k1, %k1 1581; CHECK-NEXT: movq $-134217729, %rax # imm = 0xF7FFFFFF 1582; CHECK-NEXT: kmovq 
%rax, %k2 1583; CHECK-NEXT: kandq %k2, %k1, %k1 1584; CHECK-NEXT: kshiftrd $26, %k0, %k2 1585; CHECK-NEXT: kshiftlq $63, %k2, %k2 1586; CHECK-NEXT: kshiftrq $36, %k2, %k2 1587; CHECK-NEXT: korq %k2, %k1, %k1 1588; CHECK-NEXT: movq $-268435457, %rax # imm = 0xEFFFFFFF 1589; CHECK-NEXT: kmovq %rax, %k2 1590; CHECK-NEXT: kandq %k2, %k1, %k1 1591; CHECK-NEXT: kshiftrd $29, %k0, %k2 1592; CHECK-NEXT: kshiftlq $63, %k2, %k2 1593; CHECK-NEXT: kshiftrq $35, %k2, %k2 1594; CHECK-NEXT: korq %k2, %k1, %k1 1595; CHECK-NEXT: movq $-536870913, %rax # imm = 0xDFFFFFFF 1596; CHECK-NEXT: kmovq %rax, %k2 1597; CHECK-NEXT: kandq %k2, %k1, %k1 1598; CHECK-NEXT: kshiftrd $28, %k0, %k2 1599; CHECK-NEXT: kshiftlq $63, %k2, %k2 1600; CHECK-NEXT: kshiftrq $34, %k2, %k2 1601; CHECK-NEXT: korq %k2, %k1, %k1 1602; CHECK-NEXT: movq $-1073741825, %rax # imm = 0xBFFFFFFF 1603; CHECK-NEXT: kmovq %rax, %k2 1604; CHECK-NEXT: kandq %k2, %k1, %k1 1605; CHECK-NEXT: kshiftrd $31, %k0, %k2 1606; CHECK-NEXT: kshiftlq $63, %k2, %k2 1607; CHECK-NEXT: kshiftrq $33, %k2, %k2 1608; CHECK-NEXT: korq %k2, %k1, %k1 1609; CHECK-NEXT: movabsq $-2147483649, %rax # imm = 0xFFFFFFFF7FFFFFFF 1610; CHECK-NEXT: kmovq %rax, %k2 1611; CHECK-NEXT: kandq %k2, %k1, %k2 1612; CHECK-NEXT: vptestnmb %ymm0, %ymm0, %k1 1613; CHECK-NEXT: kshiftrd $30, %k0, %k0 1614; CHECK-NEXT: kshiftlq $63, %k0, %k0 1615; CHECK-NEXT: kshiftrq $32, %k0, %k0 1616; CHECK-NEXT: korq %k0, %k2, %k0 1617; CHECK-NEXT: movabsq $-4294967297, %rax # imm = 0xFFFFFFFEFFFFFFFF 1618; CHECK-NEXT: kmovq %rax, %k2 1619; CHECK-NEXT: kandq %k2, %k0, %k0 1620; CHECK-NEXT: kshiftrd $1, %k1, %k2 1621; CHECK-NEXT: kshiftlq $63, %k2, %k2 1622; CHECK-NEXT: kshiftrq $31, %k2, %k2 1623; CHECK-NEXT: korq %k2, %k0, %k0 1624; CHECK-NEXT: movabsq $-8589934593, %rax # imm = 0xFFFFFFFDFFFFFFFF 1625; CHECK-NEXT: kmovq %rax, %k2 1626; CHECK-NEXT: kandq %k2, %k0, %k0 1627; CHECK-NEXT: kshiftlq $63, %k1, %k2 1628; CHECK-NEXT: kshiftrq $30, %k2, %k2 1629; CHECK-NEXT: korq %k2, %k0, %k0 1630; CHECK-NEXT: movabsq $-17179869185, %rax # imm = 0xFFFFFFFBFFFFFFFF 1631; CHECK-NEXT: kmovq %rax, %k2 1632; CHECK-NEXT: kandq %k2, %k0, %k0 1633; CHECK-NEXT: kshiftrd $3, %k1, %k2 1634; CHECK-NEXT: kshiftlq $63, %k2, %k2 1635; CHECK-NEXT: kshiftrq $29, %k2, %k2 1636; CHECK-NEXT: korq %k2, %k0, %k0 1637; CHECK-NEXT: movabsq $-34359738369, %rax # imm = 0xFFFFFFF7FFFFFFFF 1638; CHECK-NEXT: kmovq %rax, %k2 1639; CHECK-NEXT: kandq %k2, %k0, %k0 1640; CHECK-NEXT: kshiftrd $2, %k1, %k2 1641; CHECK-NEXT: kshiftlq $63, %k2, %k2 1642; CHECK-NEXT: kshiftrq $28, %k2, %k2 1643; CHECK-NEXT: korq %k2, %k0, %k0 1644; CHECK-NEXT: movabsq $-68719476737, %rax # imm = 0xFFFFFFEFFFFFFFFF 1645; CHECK-NEXT: kmovq %rax, %k2 1646; CHECK-NEXT: kandq %k2, %k0, %k0 1647; CHECK-NEXT: kshiftrd $5, %k1, %k2 1648; CHECK-NEXT: kshiftlq $63, %k2, %k2 1649; CHECK-NEXT: kshiftrq $27, %k2, %k2 1650; CHECK-NEXT: korq %k2, %k0, %k0 1651; CHECK-NEXT: movabsq $-137438953473, %rax # imm = 0xFFFFFFDFFFFFFFFF 1652; CHECK-NEXT: kmovq %rax, %k2 1653; CHECK-NEXT: kandq %k2, %k0, %k0 1654; CHECK-NEXT: kshiftrd $4, %k1, %k2 1655; CHECK-NEXT: kshiftlq $63, %k2, %k2 1656; CHECK-NEXT: kshiftrq $26, %k2, %k2 1657; CHECK-NEXT: korq %k2, %k0, %k0 1658; CHECK-NEXT: movabsq $-274877906945, %rax # imm = 0xFFFFFFBFFFFFFFFF 1659; CHECK-NEXT: kmovq %rax, %k2 1660; CHECK-NEXT: kandq %k2, %k0, %k0 1661; CHECK-NEXT: kshiftrd $7, %k1, %k2 1662; CHECK-NEXT: kshiftlq $63, %k2, %k2 1663; CHECK-NEXT: kshiftrq $25, %k2, %k2 1664; CHECK-NEXT: korq %k2, %k0, %k0 1665; CHECK-NEXT: movabsq 
$-549755813889, %rax # imm = 0xFFFFFF7FFFFFFFFF 1666; CHECK-NEXT: kmovq %rax, %k2 1667; CHECK-NEXT: kandq %k2, %k0, %k0 1668; CHECK-NEXT: kshiftrd $6, %k1, %k2 1669; CHECK-NEXT: kshiftlq $63, %k2, %k2 1670; CHECK-NEXT: kshiftrq $24, %k2, %k2 1671; CHECK-NEXT: korq %k2, %k0, %k0 1672; CHECK-NEXT: movabsq $-1099511627777, %rax # imm = 0xFFFFFEFFFFFFFFFF 1673; CHECK-NEXT: kmovq %rax, %k2 1674; CHECK-NEXT: kandq %k2, %k0, %k0 1675; CHECK-NEXT: kshiftrd $9, %k1, %k2 1676; CHECK-NEXT: kshiftlq $63, %k2, %k2 1677; CHECK-NEXT: kshiftrq $23, %k2, %k2 1678; CHECK-NEXT: korq %k2, %k0, %k0 1679; CHECK-NEXT: movabsq $-2199023255553, %rax # imm = 0xFFFFFDFFFFFFFFFF 1680; CHECK-NEXT: kmovq %rax, %k2 1681; CHECK-NEXT: kandq %k2, %k0, %k0 1682; CHECK-NEXT: kshiftrd $8, %k1, %k2 1683; CHECK-NEXT: kshiftlq $63, %k2, %k2 1684; CHECK-NEXT: kshiftrq $22, %k2, %k2 1685; CHECK-NEXT: korq %k2, %k0, %k0 1686; CHECK-NEXT: movabsq $-4398046511105, %rax # imm = 0xFFFFFBFFFFFFFFFF 1687; CHECK-NEXT: kmovq %rax, %k2 1688; CHECK-NEXT: kandq %k2, %k0, %k0 1689; CHECK-NEXT: kshiftrd $11, %k1, %k2 1690; CHECK-NEXT: kshiftlq $63, %k2, %k2 1691; CHECK-NEXT: kshiftrq $21, %k2, %k2 1692; CHECK-NEXT: korq %k2, %k0, %k0 1693; CHECK-NEXT: movabsq $-8796093022209, %rax # imm = 0xFFFFF7FFFFFFFFFF 1694; CHECK-NEXT: kmovq %rax, %k2 1695; CHECK-NEXT: kandq %k2, %k0, %k0 1696; CHECK-NEXT: kshiftrd $10, %k1, %k2 1697; CHECK-NEXT: kshiftlq $63, %k2, %k2 1698; CHECK-NEXT: kshiftrq $20, %k2, %k2 1699; CHECK-NEXT: korq %k2, %k0, %k0 1700; CHECK-NEXT: movabsq $-17592186044417, %rax # imm = 0xFFFFEFFFFFFFFFFF 1701; CHECK-NEXT: kmovq %rax, %k2 1702; CHECK-NEXT: kandq %k2, %k0, %k0 1703; CHECK-NEXT: kshiftrd $13, %k1, %k2 1704; CHECK-NEXT: kshiftlq $63, %k2, %k2 1705; CHECK-NEXT: kshiftrq $19, %k2, %k2 1706; CHECK-NEXT: korq %k2, %k0, %k0 1707; CHECK-NEXT: movabsq $-35184372088833, %rax # imm = 0xFFFFDFFFFFFFFFFF 1708; CHECK-NEXT: kmovq %rax, %k2 1709; CHECK-NEXT: kandq %k2, %k0, %k0 1710; CHECK-NEXT: kshiftrd $12, %k1, %k2 1711; CHECK-NEXT: kshiftlq $63, %k2, %k2 1712; CHECK-NEXT: kshiftrq $18, %k2, %k2 1713; CHECK-NEXT: korq %k2, %k0, %k0 1714; CHECK-NEXT: movabsq $-70368744177665, %rax # imm = 0xFFFFBFFFFFFFFFFF 1715; CHECK-NEXT: kmovq %rax, %k2 1716; CHECK-NEXT: kandq %k2, %k0, %k0 1717; CHECK-NEXT: kshiftrd $15, %k1, %k2 1718; CHECK-NEXT: kshiftlq $63, %k2, %k2 1719; CHECK-NEXT: kshiftrq $17, %k2, %k2 1720; CHECK-NEXT: korq %k2, %k0, %k0 1721; CHECK-NEXT: movabsq $-140737488355329, %rax # imm = 0xFFFF7FFFFFFFFFFF 1722; CHECK-NEXT: kmovq %rax, %k2 1723; CHECK-NEXT: kandq %k2, %k0, %k0 1724; CHECK-NEXT: kshiftrd $14, %k1, %k2 1725; CHECK-NEXT: kshiftlq $63, %k2, %k2 1726; CHECK-NEXT: kshiftrq $16, %k2, %k2 1727; CHECK-NEXT: korq %k2, %k0, %k0 1728; CHECK-NEXT: movabsq $-281474976710657, %rax # imm = 0xFFFEFFFFFFFFFFFF 1729; CHECK-NEXT: kmovq %rax, %k2 1730; CHECK-NEXT: kandq %k2, %k0, %k0 1731; CHECK-NEXT: kshiftrd $17, %k1, %k2 1732; CHECK-NEXT: kshiftlq $63, %k2, %k2 1733; CHECK-NEXT: kshiftrq $15, %k2, %k2 1734; CHECK-NEXT: korq %k2, %k0, %k0 1735; CHECK-NEXT: movabsq $-562949953421313, %rax # imm = 0xFFFDFFFFFFFFFFFF 1736; CHECK-NEXT: kmovq %rax, %k2 1737; CHECK-NEXT: kandq %k2, %k0, %k0 1738; CHECK-NEXT: kshiftrd $16, %k1, %k2 1739; CHECK-NEXT: kshiftlq $63, %k2, %k2 1740; CHECK-NEXT: kshiftrq $14, %k2, %k2 1741; CHECK-NEXT: korq %k2, %k0, %k0 1742; CHECK-NEXT: movabsq $-1125899906842625, %rax # imm = 0xFFFBFFFFFFFFFFFF 1743; CHECK-NEXT: kmovq %rax, %k2 1744; CHECK-NEXT: kandq %k2, %k0, %k0 1745; CHECK-NEXT: kshiftrd $19, %k1, %k2 1746; 
CHECK-NEXT: kshiftlq $63, %k2, %k2 1747; CHECK-NEXT: kshiftrq $13, %k2, %k2 1748; CHECK-NEXT: korq %k2, %k0, %k0 1749; CHECK-NEXT: movabsq $-2251799813685249, %rax # imm = 0xFFF7FFFFFFFFFFFF 1750; CHECK-NEXT: kmovq %rax, %k2 1751; CHECK-NEXT: kandq %k2, %k0, %k0 1752; CHECK-NEXT: kshiftrd $18, %k1, %k2 1753; CHECK-NEXT: kshiftlq $63, %k2, %k2 1754; CHECK-NEXT: kshiftrq $12, %k2, %k2 1755; CHECK-NEXT: korq %k2, %k0, %k0 1756; CHECK-NEXT: movabsq $-4503599627370497, %rax # imm = 0xFFEFFFFFFFFFFFFF 1757; CHECK-NEXT: kmovq %rax, %k2 1758; CHECK-NEXT: kandq %k2, %k0, %k0 1759; CHECK-NEXT: kshiftrd $21, %k1, %k2 1760; CHECK-NEXT: kshiftlq $63, %k2, %k2 1761; CHECK-NEXT: kshiftrq $11, %k2, %k2 1762; CHECK-NEXT: korq %k2, %k0, %k0 1763; CHECK-NEXT: movabsq $-9007199254740993, %rax # imm = 0xFFDFFFFFFFFFFFFF 1764; CHECK-NEXT: kmovq %rax, %k2 1765; CHECK-NEXT: kandq %k2, %k0, %k0 1766; CHECK-NEXT: kshiftrd $20, %k1, %k2 1767; CHECK-NEXT: kshiftlq $63, %k2, %k2 1768; CHECK-NEXT: kshiftrq $10, %k2, %k2 1769; CHECK-NEXT: korq %k2, %k0, %k0 1770; CHECK-NEXT: movabsq $-18014398509481985, %rax # imm = 0xFFBFFFFFFFFFFFFF 1771; CHECK-NEXT: kmovq %rax, %k2 1772; CHECK-NEXT: kandq %k2, %k0, %k0 1773; CHECK-NEXT: kshiftrd $23, %k1, %k2 1774; CHECK-NEXT: kshiftlq $63, %k2, %k2 1775; CHECK-NEXT: kshiftrq $9, %k2, %k2 1776; CHECK-NEXT: korq %k2, %k0, %k0 1777; CHECK-NEXT: movabsq $-36028797018963969, %rax # imm = 0xFF7FFFFFFFFFFFFF 1778; CHECK-NEXT: kmovq %rax, %k2 1779; CHECK-NEXT: kandq %k2, %k0, %k0 1780; CHECK-NEXT: kshiftrd $22, %k1, %k2 1781; CHECK-NEXT: kshiftlq $63, %k2, %k2 1782; CHECK-NEXT: kshiftrq $8, %k2, %k2 1783; CHECK-NEXT: korq %k2, %k0, %k0 1784; CHECK-NEXT: movabsq $-72057594037927937, %rax # imm = 0xFEFFFFFFFFFFFFFF 1785; CHECK-NEXT: kmovq %rax, %k2 1786; CHECK-NEXT: kandq %k2, %k0, %k0 1787; CHECK-NEXT: kshiftrd $25, %k1, %k2 1788; CHECK-NEXT: kshiftlq $63, %k2, %k2 1789; CHECK-NEXT: kshiftrq $7, %k2, %k2 1790; CHECK-NEXT: korq %k2, %k0, %k0 1791; CHECK-NEXT: movabsq $-144115188075855873, %rax # imm = 0xFDFFFFFFFFFFFFFF 1792; CHECK-NEXT: kmovq %rax, %k2 1793; CHECK-NEXT: kandq %k2, %k0, %k0 1794; CHECK-NEXT: kshiftrd $24, %k1, %k2 1795; CHECK-NEXT: kshiftlq $63, %k2, %k2 1796; CHECK-NEXT: kshiftrq $6, %k2, %k2 1797; CHECK-NEXT: korq %k2, %k0, %k0 1798; CHECK-NEXT: movabsq $-288230376151711745, %rax # imm = 0xFBFFFFFFFFFFFFFF 1799; CHECK-NEXT: kmovq %rax, %k2 1800; CHECK-NEXT: kandq %k2, %k0, %k0 1801; CHECK-NEXT: kshiftrd $27, %k1, %k2 1802; CHECK-NEXT: kshiftlq $63, %k2, %k2 1803; CHECK-NEXT: kshiftrq $5, %k2, %k2 1804; CHECK-NEXT: korq %k2, %k0, %k0 1805; CHECK-NEXT: movabsq $-576460752303423489, %rax # imm = 0xF7FFFFFFFFFFFFFF 1806; CHECK-NEXT: kmovq %rax, %k2 1807; CHECK-NEXT: kandq %k2, %k0, %k0 1808; CHECK-NEXT: kshiftrd $26, %k1, %k2 1809; CHECK-NEXT: kshiftlq $63, %k2, %k2 1810; CHECK-NEXT: kshiftrq $4, %k2, %k2 1811; CHECK-NEXT: korq %k2, %k0, %k0 1812; CHECK-NEXT: movabsq $-1152921504606846977, %rax # imm = 0xEFFFFFFFFFFFFFFF 1813; CHECK-NEXT: kmovq %rax, %k2 1814; CHECK-NEXT: kandq %k2, %k0, %k0 1815; CHECK-NEXT: kshiftrd $29, %k1, %k2 1816; CHECK-NEXT: kshiftlq $63, %k2, %k2 1817; CHECK-NEXT: kshiftrq $3, %k2, %k2 1818; CHECK-NEXT: korq %k2, %k0, %k0 1819; CHECK-NEXT: movabsq $-2305843009213693953, %rax # imm = 0xDFFFFFFFFFFFFFFF 1820; CHECK-NEXT: kmovq %rax, %k2 1821; CHECK-NEXT: kandq %k2, %k0, %k0 1822; CHECK-NEXT: kshiftrd $28, %k1, %k2 1823; CHECK-NEXT: kshiftlq $63, %k2, %k2 1824; CHECK-NEXT: kshiftrq $2, %k2, %k2 1825; CHECK-NEXT: korq %k2, %k0, %k0 1826; CHECK-NEXT: 
movabsq $-4611686018427387905, %rax # imm = 0xBFFFFFFFFFFFFFFF 1827; CHECK-NEXT: kmovq %rax, %k2 1828; CHECK-NEXT: kandq %k2, %k0, %k0 1829; CHECK-NEXT: kshiftrd $31, %k1, %k2 1830; CHECK-NEXT: kshiftlq $62, %k2, %k2 1831; CHECK-NEXT: korq %k2, %k0, %k0 1832; CHECK-NEXT: kshiftrd $30, %k1, %k1 1833; CHECK-NEXT: kshiftlq $1, %k0, %k0 1834; CHECK-NEXT: kshiftrq $1, %k0, %k0 1835; CHECK-NEXT: kshiftlq $63, %k1, %k1 1836; CHECK-NEXT: korq %k1, %k0, %k1 1837; CHECK-NEXT: vmovdqu8 %ymm1, (%rsi) {%k1} 1838; CHECK-NEXT: kshiftrq $32, %k1, %k1 1839; CHECK-NEXT: vmovdqu8 %ymm0, 32(%rsi) {%k1} 1840; CHECK-NEXT: vzeroupper 1841; CHECK-NEXT: retq 1842entry: 1843 %a = load <64 x i8>, ptr %x 1844 %b = icmp eq <64 x i8> %a, zeroinitializer 1845 %shuf = shufflevector <64 x i1> %b, <64 x i1> undef, <64 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14, i32 17, i32 16, i32 19, i32 18, i32 21, i32 20, i32 23, i32 22, i32 25, i32 24, i32 27, i32 26, i32 29, i32 28, i32 31, i32 30, i32 33, i32 32, i32 35, i32 34, i32 37, i32 36, i32 39, i32 38, i32 41, i32 40, i32 43, i32 42, i32 45, i32 44, i32 47, i32 46, i32 49, i32 48, i32 51, i32 50, i32 53, i32 52, i32 55, i32 54, i32 57, i32 56, i32 59, i32 58, i32 61, i32 60, i32 63, i32 62> 1846 call void @llvm.masked.store.v64i8.p0(<64 x i8> %a, ptr %y, i32 1, <64 x i1> %shuf) 1847 ret void 1848} 1849declare void @llvm.masked.store.v64i8.p0(<64 x i8>, ptr, i32, <64 x i1>) 1850 1851@mem64_dst = dso_local global i64 0, align 8 1852@mem64_src = dso_local global i64 0, align 8 1853define dso_local i32 @v64i1_inline_asm() "min-legal-vector-width"="256" { 1854; CHECK-LABEL: v64i1_inline_asm: 1855; CHECK: # %bb.0: 1856; CHECK-NEXT: kmovq mem64_src(%rip), %k0 1857; CHECK-NEXT: #APP 1858; CHECK-NEXT: #NO_APP 1859; CHECK-NEXT: kmovq %k0, mem64_dst(%rip) 1860; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax 1861; CHECK-NEXT: retq 1862 %1 = alloca i32, align 4 1863 %2 = load i64, ptr @mem64_src, align 8 1864 %3 = call i64 asm "", "=k,k,~{dirflag},~{fpsr},~{flags}"(i64 %2) 1865 store i64 %3, ptr @mem64_dst, align 8 1866 %4 = load i32, ptr %1, align 4 1867 ret i32 %4 1868} 1869 1870define dso_local void @cmp_v8i64_sext(ptr %xptr, ptr %yptr, ptr %zptr) "min-legal-vector-width"="256" { 1871; CHECK-LABEL: cmp_v8i64_sext: 1872; CHECK: # %bb.0: 1873; CHECK-NEXT: vmovdqa (%rsi), %ymm0 1874; CHECK-NEXT: vmovdqa 32(%rsi), %ymm1 1875; CHECK-NEXT: vpcmpgtq 32(%rdi), %ymm1, %ymm1 1876; CHECK-NEXT: vpcmpgtq (%rdi), %ymm0, %ymm0 1877; CHECK-NEXT: vmovdqa %ymm0, (%rdx) 1878; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx) 1879; CHECK-NEXT: vzeroupper 1880; CHECK-NEXT: retq 1881 %x = load <8 x i64>, ptr %xptr 1882 %y = load <8 x i64>, ptr %yptr 1883 %cmp = icmp slt <8 x i64> %x, %y 1884 %ext = sext <8 x i1> %cmp to <8 x i64> 1885 store <8 x i64> %ext, ptr %zptr 1886 ret void 1887} 1888 1889define dso_local void @cmp_v8i64_zext(ptr %xptr, ptr %yptr, ptr %zptr) "min-legal-vector-width"="256" { 1890; CHECK-LABEL: cmp_v8i64_zext: 1891; CHECK: # %bb.0: 1892; CHECK-NEXT: vmovdqa (%rsi), %ymm0 1893; CHECK-NEXT: vmovdqa 32(%rsi), %ymm1 1894; CHECK-NEXT: vpcmpgtq 32(%rdi), %ymm1, %ymm1 1895; CHECK-NEXT: vpcmpgtq (%rdi), %ymm0, %ymm0 1896; CHECK-NEXT: vpsrlq $63, %ymm1, %ymm1 1897; CHECK-NEXT: vpsrlq $63, %ymm0, %ymm0 1898; CHECK-NEXT: vmovdqa %ymm0, (%rdx) 1899; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx) 1900; CHECK-NEXT: vzeroupper 1901; CHECK-NEXT: retq 1902 %x = load <8 x i64>, ptr %xptr 1903 %y = load <8 x i64>, ptr %yptr 1904 %cmp = icmp slt <8 x i64> 
%x, %y 1905 %ext = zext <8 x i1> %cmp to <8 x i64> 1906 store <8 x i64> %ext, ptr %zptr 1907 ret void 1908} 1909 1910define <16 x i8> @var_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind "min-legal-vector-width"="256" { 1911; CHECK-LABEL: var_rotate_v16i8: 1912; CHECK: # %bb.0: 1913; CHECK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 1914; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 1915; CHECK-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] 1916; CHECK-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1917; CHECK-NEXT: vpsllvw %xmm2, %xmm3, %xmm2 1918; CHECK-NEXT: vpsrlw $8, %xmm2, %xmm2 1919; CHECK-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1920; CHECK-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 1921; CHECK-NEXT: vpsllvw %xmm1, %xmm0, %xmm0 1922; CHECK-NEXT: vpsrlw $8, %xmm0, %xmm0 1923; CHECK-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 1924; CHECK-NEXT: retq 1925 %b8 = sub <16 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %b 1926 %shl = shl <16 x i8> %a, %b 1927 %lshr = lshr <16 x i8> %a, %b8 1928 %or = or <16 x i8> %shl, %lshr 1929 ret <16 x i8> %or 1930} 1931 1932define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind "min-legal-vector-width"="256" { 1933; CHECK-LABEL: var_rotate_v32i8: 1934; CHECK: # %bb.0: 1935; CHECK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 1936; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 1937; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31] 1938; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] 1939; CHECK-NEXT: vpsllvw %ymm3, %ymm4, %ymm3 1940; CHECK-NEXT: vpsrlw $8, %ymm3, %ymm3 1941; CHECK-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23] 1942; CHECK-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] 1943; CHECK-NEXT: vpsllvw %ymm1, %ymm0, %ymm0 1944; CHECK-NEXT: vpsrlw $8, %ymm0, %ymm0 1945; CHECK-NEXT: vpackuswb %ymm3, %ymm0, %ymm0 1946; CHECK-NEXT: retq 1947 %b8 = sub <32 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %b 1948 %shl = shl <32 x i8> %a, %b 1949 %lshr = lshr <32 x i8> %a, %b8 1950 %or = or <32 x i8> %shl, %lshr 1951 ret <32 x i8> %or 1952} 1953 1954define <32 x i8> @splatvar_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind "min-legal-vector-width"="256" { 1955; CHECK-LABEL: splatvar_rotate_v32i8: 1956; CHECK: # %bb.0: 1957; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm2 = 
ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] 1958; CHECK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1959; CHECK-NEXT: vpsllw %xmm1, %ymm2, %ymm2 1960; CHECK-NEXT: vpsrlw $8, %ymm2, %ymm2 1961; CHECK-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] 1962; CHECK-NEXT: vpsllw %xmm1, %ymm0, %ymm0 1963; CHECK-NEXT: vpsrlw $8, %ymm0, %ymm0 1964; CHECK-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 1965; CHECK-NEXT: retq 1966 %splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer 1967 %splat8 = sub <32 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %splat 1968 %shl = shl <32 x i8> %a, %splat 1969 %lshr = lshr <32 x i8> %a, %splat8 1970 %or = or <32 x i8> %shl, %lshr 1971 ret <32 x i8> %or 1972} 1973 1974define <32 x i8> @constant_rotate_v32i8(<32 x i8> %a) nounwind "min-legal-vector-width"="256" { 1975; CHECK-LABEL: constant_rotate_v32i8: 1976; CHECK: # %bb.0: 1977; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] 1978; CHECK-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 1979; CHECK-NEXT: vpsrlw $8, %ymm1, %ymm1 1980; CHECK-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] 1981; CHECK-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1982; CHECK-NEXT: vpsrlw $8, %ymm0, %ymm0 1983; CHECK-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 1984; CHECK-NEXT: retq 1985 %shl = shl <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1> 1986 %lshr = lshr <32 x i8> %a, <i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7> 1987 %or = or <32 x i8> %shl, %lshr 1988 ret <32 x i8> %or 1989} 1990 1991define <32 x i8> @splatconstant_rotate_v32i8(<32 x i8> %a) nounwind "min-legal-vector-width"="256" { 1992; CHECK-SKX-LABEL: splatconstant_rotate_v32i8: 1993; CHECK-SKX: # %bb.0: 1994; CHECK-SKX-NEXT: vpsllw $4, %ymm0, %ymm1 1995; CHECK-SKX-NEXT: vpsrlw $4, %ymm0, %ymm0 1996; CHECK-SKX-NEXT: vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1)) 1997; CHECK-SKX-NEXT: retq 1998; 1999; CHECK-AVX512-LABEL: splatconstant_rotate_v32i8: 2000; CHECK-AVX512: # %bb.0: 2001; CHECK-AVX512-NEXT: vpsllw $4, %ymm0, %ymm1 2002; CHECK-AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0 2003; CHECK-AVX512-NEXT: vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1)) 2004; CHECK-AVX512-NEXT: retq 2005; 2006; CHECK-VBMI1-LABEL: splatconstant_rotate_v32i8: 2007; CHECK-VBMI1: # %bb.0: 2008; CHECK-VBMI1-NEXT: vpsllw $4, %ymm0, %ymm1 2009; CHECK-VBMI1-NEXT: vpsrlw $4, %ymm0, %ymm0 2010; CHECK-VBMI1-NEXT: vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1)) 2011; CHECK-VBMI1-NEXT: retq 2012; 2013; CHECK-GFNI-LABEL: splatconstant_rotate_v32i8: 2014; CHECK-GFNI: # %bb.0: 2015; CHECK-GFNI-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 2016; CHECK-GFNI-NEXT: retq 2017 %shl = shl <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, 
i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4> 2018 %lshr = lshr <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4> 2019 %or = or <32 x i8> %shl, %lshr 2020 ret <32 x i8> %or 2021} 2022 2023define <32 x i8> @splatconstant_rotate_mask_v32i8(<32 x i8> %a) nounwind "min-legal-vector-width"="256" { 2024; CHECK-SKX-LABEL: splatconstant_rotate_mask_v32i8: 2025; CHECK-SKX: # %bb.0: 2026; CHECK-SKX-NEXT: vpsllw $4, %ymm0, %ymm1 2027; CHECK-SKX-NEXT: vpsrlw $4, %ymm0, %ymm0 2028; CHECK-SKX-NEXT: vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1)) 2029; CHECK-SKX-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 2030; CHECK-SKX-NEXT: retq 2031; 2032; CHECK-AVX512-LABEL: splatconstant_rotate_mask_v32i8: 2033; CHECK-AVX512: # %bb.0: 2034; CHECK-AVX512-NEXT: vpsllw $4, %ymm0, %ymm1 2035; CHECK-AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0 2036; CHECK-AVX512-NEXT: vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1)) 2037; CHECK-AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 2038; CHECK-AVX512-NEXT: retq 2039; 2040; CHECK-VBMI1-LABEL: splatconstant_rotate_mask_v32i8: 2041; CHECK-VBMI1: # %bb.0: 2042; CHECK-VBMI1-NEXT: vpsllw $4, %ymm0, %ymm1 2043; CHECK-VBMI1-NEXT: vpsrlw $4, %ymm0, %ymm0 2044; CHECK-VBMI1-NEXT: vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1)) 2045; CHECK-VBMI1-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 2046; CHECK-VBMI1-NEXT: retq 2047; 2048; CHECK-GFNI-LABEL: splatconstant_rotate_mask_v32i8: 2049; CHECK-GFNI: # %bb.0: 2050; CHECK-GFNI-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 2051; CHECK-GFNI-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 2052; CHECK-GFNI-NEXT: retq 2053 %shl = shl <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4> 2054 %lshr = lshr <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4> 2055 %rmask = and <32 x i8> %lshr, <i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55> 2056 %lmask = and <32 x i8> %shl, <i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33> 2057 %or = or <32 x i8> %lmask, %rmask 2058 ret <32 x i8> %or 2059} 2060