; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,X86-SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,X64-SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2

; Only bottom 16 bits are set - upper 48 bits are zero.
define <2 x i64> @combine_psadbw_shift(<16 x i8> %0, <16 x i8> %1) nounwind {
; SSE-LABEL: combine_psadbw_shift:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX2-LABEL: combine_psadbw_shift:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    retq
  %3 = tail call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %0, <16 x i8> %1)
  %4 = lshr <2 x i64> %3, <i64 48, i64 48>
  ret <2 x i64> %4
}

; Propagate the demanded result elements to the 8 aliasing source elements.
define i64 @combine_psadbw_demandedelt(<16 x i8> %0, <16 x i8> %1) nounwind {
; X86-SSE-LABEL: combine_psadbw_demandedelt:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    psadbw %xmm1, %xmm0
; X86-SSE-NEXT:    movd %xmm0, %eax
; X86-SSE-NEXT:    xorl %edx, %edx
; X86-SSE-NEXT:    retl
;
; X64-SSE-LABEL: combine_psadbw_demandedelt:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    psadbw %xmm1, %xmm0
; X64-SSE-NEXT:    movq %xmm0, %rax
; X64-SSE-NEXT:    retq
;
; AVX2-LABEL: combine_psadbw_demandedelt:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovq %xmm0, %rax
; AVX2-NEXT:    retq
  %3 = shufflevector <16 x i8> %0, <16 x i8> %0, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11>
  %4 = shufflevector <16 x i8> %1, <16 x i8> %1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11>
  %5 = tail call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %3, <16 x i8> %4)
  %6 = extractelement <2 x i64> %5, i32 0
  ret i64 %6
}

; TODO: Each PSADBW source element has a maximum value of 3 - so max sum-of-diffs for each <8 x i8> should be 24.
define <2 x i64> @combine_psadbw_cmp_knownbits(<16 x i8> %a0) nounwind {
; X86-SSE-LABEL: combine_psadbw_cmp_knownbits:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    xorps %xmm0, %xmm0
; X86-SSE-NEXT:    retl
;
; X64-SSE-LABEL: combine_psadbw_cmp_knownbits:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT:    pxor %xmm1, %xmm1
; X64-SSE-NEXT:    psadbw %xmm1, %xmm0
; X64-SSE-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT:    retq
;
; AVX2-LABEL: combine_psadbw_cmp_knownbits:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    retq
  %mask = and <16 x i8> %a0, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
  %sad = tail call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %mask, <16 x i8> zeroinitializer)
  %cmp = icmp sgt <2 x i64> %sad, <i64 32, i64 32>
  %ext = sext <2 x i1> %cmp to <2 x i64>
  ret <2 x i64> %ext
}

; No need to scalarize the sitofp as the PSADBW results are smaller than i32.
define <2 x double> @combine_psadbw_sitofp_knownbits(<16 x i8> %a0) nounwind {
; X86-SSE-LABEL: combine_psadbw_sitofp_knownbits:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT:    pxor %xmm1, %xmm1
; X86-SSE-NEXT:    psadbw %xmm0, %xmm1
; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; X86-SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
; X86-SSE-NEXT:    retl
;
; X64-SSE-LABEL: combine_psadbw_sitofp_knownbits:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT:    pxor %xmm1, %xmm1
; X64-SSE-NEXT:    psadbw %xmm0, %xmm1
; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; X64-SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
; X64-SSE-NEXT:    retq
;
; AVX2-LABEL: combine_psadbw_sitofp_knownbits:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX2-NEXT:    vcvtdq2pd %xmm0, %xmm0
; AVX2-NEXT:    retq
  %mask = and <16 x i8> %a0, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %sad = tail call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %mask, <16 x i8> zeroinitializer)
  %cvt = sitofp <2 x i64> %sad to <2 x double>
  ret <2 x double> %cvt
}

; Convert from uitofp to sitofp as the PSADBW results are zero-extended.
define <2 x double> @combine_psadbw_uitofp_knownbits(<16 x i8> %a0) nounwind {
; X86-SSE-LABEL: combine_psadbw_uitofp_knownbits:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT:    pxor %xmm1, %xmm1
; X86-SSE-NEXT:    psadbw %xmm0, %xmm1
; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; X86-SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
; X86-SSE-NEXT:    retl
;
; X64-SSE-LABEL: combine_psadbw_uitofp_knownbits:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT:    pxor %xmm1, %xmm1
; X64-SSE-NEXT:    psadbw %xmm0, %xmm1
; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; X64-SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
; X64-SSE-NEXT:    retq
;
; AVX2-LABEL: combine_psadbw_uitofp_knownbits:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX2-NEXT:    vcvtdq2pd %xmm0, %xmm0
; AVX2-NEXT:    retq
  %mask = and <16 x i8> %a0, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %sad = tail call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %mask, <16 x i8> zeroinitializer)
  %cvt = uitofp <2 x i64> %sad to <2 x double>
  ret <2 x double> %cvt
}

declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>)