; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,X86-SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,X64-SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2

; Only the bottom 16 bits of each PSADBW result can be set - the upper 48 bits are always zero.
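; Worked bound: PSADBW sums the absolute differences of eight unsigned bytes
; per i64 lane, so each lane is at most 8 * 255 = 2040 < 2^16. A logical
; shift right by 48 therefore always yields zero, as the checks below expect.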
define <2 x i64> @combine_psadbw_shift(<16 x i8> %0, <16 x i8> %1) nounwind {
; SSE-LABEL: combine_psadbw_shift:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX2-LABEL: combine_psadbw_shift:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    retq
  %3 = tail call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %0, <16 x i8> %1)
  %4 = lshr <2 x i64> %3, <i64 48, i64 48>
  ret <2 x i64> %4
}

; Propagate the demanded result elements to the 8 aliasing source elements.
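; Rationale: only element 0 of the PSADBW result is extracted, and that lane
; depends solely on bytes 0-7 of each source; the shuffles permute only bytes
; 8-15, so they are dead for the demanded lane and psadbw can run on the
; original operands.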
define i64 @combine_psadbw_demandedelt(<16 x i8> %0, <16 x i8> %1) nounwind {
; X86-SSE-LABEL: combine_psadbw_demandedelt:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    psadbw %xmm1, %xmm0
; X86-SSE-NEXT:    movd %xmm0, %eax
; X86-SSE-NEXT:    xorl %edx, %edx
; X86-SSE-NEXT:    retl
;
; X64-SSE-LABEL: combine_psadbw_demandedelt:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    psadbw %xmm1, %xmm0
; X64-SSE-NEXT:    movq %xmm0, %rax
; X64-SSE-NEXT:    retq
;
; AVX2-LABEL: combine_psadbw_demandedelt:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovq %xmm0, %rax
; AVX2-NEXT:    retq
  %3 = shufflevector <16 x i8> %0, <16 x i8> %0, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11>
  %4 = shufflevector <16 x i8> %1, <16 x i8> %1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11>
  %5 = tail call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %3, <16 x i8> %4)
  %6 = extractelement <2 x i64> %5, i32 0
  ret i64 %6
}

; TODO: Each PSADBW source element has a maximum value of 3 - so max sum-of-diffs for each <8 x i8> should be 24.
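; With that bound, the icmp sgt 32 is always false and the whole function
; should fold to zero; the X86-SSE run already folds it, but the X64-SSE and
; AVX2 runs still materialize the mask, psadbw and compare.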
define <2 x i64> @combine_psadbw_cmp_knownbits(<16 x i8> %a0) nounwind {
; X86-SSE-LABEL: combine_psadbw_cmp_knownbits:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    xorps %xmm0, %xmm0
; X86-SSE-NEXT:    retl
;
; X64-SSE-LABEL: combine_psadbw_cmp_knownbits:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT:    pxor %xmm1, %xmm1
; X64-SSE-NEXT:    psadbw %xmm1, %xmm0
; X64-SSE-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT:    retq
;
; AVX2-LABEL: combine_psadbw_cmp_knownbits:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    retq
  %mask = and <16 x i8> %a0, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
  %sad = tail call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %mask, <16 x i8> zeroinitializer)
  %cmp = icmp sgt <2 x i64> %sad, <i64 32, i64 32>
  %ext = sext <2 x i1> %cmp to <2 x i64>
  ret <2 x i64> %ext
}

; No need to scalarize the sitofp as the PSADBW results fit in an i32.
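; With the bytes masked to 1, each i64 lane is at most 8 * 1 = 8, so the
; values fit in the low dword and cvtdq2pd on the shuffled low dwords can
; replace two scalar i64 -> double conversions.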
define <2 x double> @combine_psadbw_sitofp_knownbits(<16 x i8> %a0) nounwind {
; X86-SSE-LABEL: combine_psadbw_sitofp_knownbits:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT:    pxor %xmm1, %xmm1
; X86-SSE-NEXT:    psadbw %xmm0, %xmm1
; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; X86-SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
; X86-SSE-NEXT:    retl
;
; X64-SSE-LABEL: combine_psadbw_sitofp_knownbits:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT:    pxor %xmm1, %xmm1
; X64-SSE-NEXT:    psadbw %xmm0, %xmm1
; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; X64-SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
; X64-SSE-NEXT:    retq
;
; AVX2-LABEL: combine_psadbw_sitofp_knownbits:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX2-NEXT:    vcvtdq2pd %xmm0, %xmm0
; AVX2-NEXT:    retq
  %mask = and <16 x i8> %a0, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %sad = tail call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %mask, <16 x i8> zeroinitializer)
  %cvt = sitofp <2 x i64> %sad to <2 x double>
  ret <2 x double> %cvt
}

; Convert from uitofp to sitofp as the PSADBW results are zero-extended.
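; PSADBW zero-extends its 16-bit sums into each i64 lane, so the operands are
; always non-negative and the unsigned conversion can reuse the same
; cvtdq2pd lowering as the sitofp case above.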
define <2 x double> @combine_psadbw_uitofp_knownbits(<16 x i8> %a0) nounwind {
; X86-SSE-LABEL: combine_psadbw_uitofp_knownbits:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT:    pxor %xmm1, %xmm1
; X86-SSE-NEXT:    psadbw %xmm0, %xmm1
; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; X86-SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
; X86-SSE-NEXT:    retl
;
; X64-SSE-LABEL: combine_psadbw_uitofp_knownbits:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT:    pxor %xmm1, %xmm1
; X64-SSE-NEXT:    psadbw %xmm0, %xmm1
; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; X64-SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
; X64-SSE-NEXT:    retq
;
; AVX2-LABEL: combine_psadbw_uitofp_knownbits:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX2-NEXT:    vcvtdq2pd %xmm0, %xmm0
; AVX2-NEXT:    retq
  %mask = and <16 x i8> %a0, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %sad = tail call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %mask, <16 x i8> zeroinitializer)
  %cvt = uitofp <2 x i64> %sad to <2 x double>
  ret <2 x double> %cvt
}

declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>)