; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX

; Sum of absolute differences over eight bytes, computed in i32 lanes.
; The absolute value is written as "diff > -1 ? diff : 0 - diff", i.e. the
; signed ">= 0" form of the compare. The assertions expect the whole
; compare/select/shuffle-add tree to collapse into one psadbw.
define i32 @sad8_32bit_icmp_sge(ptr nocapture readonly %cur, ptr nocapture readonly %ref, i32 %stride) local_unnamed_addr #0 {
; SSE2-LABEL: sad8_32bit_icmp_sge:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT:    psadbw %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    retq
;
; AVX-LABEL: sad8_32bit_icmp_sge:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    retq
entry:
  ; No other instruction in this function reads %idx.ext.
  %idx.ext = zext i32 %stride to i64
  br label %for.body

for.body:                                         ; preds = %entry
  ; Widen both 8-byte inputs to i32 lanes and take the signed difference.
  %cur.v = load <8 x i8>, ptr %cur, align 1
  %cur.w = zext <8 x i8> %cur.v to <8 x i32>
  %ref.v = load <8 x i8>, ptr %ref, align 1
  %ref.w = zext <8 x i8> %ref.v to <8 x i32>
  %diff = sub nsw <8 x i32> %cur.w, %ref.w
  ; abs(diff): keep diff where it is > -1, otherwise use 0 - diff.
  %is.nonneg = icmp sgt <8 x i32> %diff, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
  %negated = sub nsw <8 x i32> zeroinitializer, %diff
  %absdiff = select <8 x i1> %is.nonneg, <8 x i32> %diff, <8 x i32> %negated
  ; Horizontal add: fold lanes 4-7 onto 0-3, then 2-3 onto 0-1, then 1 onto 0.
  %hi4 = shufflevector <8 x i32> %absdiff, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %sum4 = add <8 x i32> %absdiff, %hi4
  %hi2 = shufflevector <8 x i32> %sum4, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %sum2 = add <8 x i32> %sum4, %hi2
  %hi1 = shufflevector <8 x i32> %sum2, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %sum1 = add <8 x i32> %sum2, %hi1
  ; Lane 0 now holds the full sum.
  %sad = extractelement <8 x i32> %sum1, i32 0
  ret i32 %sad
}

; Same reduction as above, but the absolute value is selected with a strict
; "diff > 0" compare (zeroinitializer on the right-hand side).
define i32 @sad8_32bit_icmp_sgt(ptr nocapture readonly %cur, ptr nocapture readonly %ref, i32 %stride) local_unnamed_addr #1 {
; SSE2-LABEL: sad8_32bit_icmp_sgt:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT:    psadbw %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    retq
;
; AVX-LABEL: sad8_32bit_icmp_sgt:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    retq
entry:
  ; No other instruction in this function reads %idx.ext.
  %idx.ext = zext i32 %stride to i64
  br label %for.body

for.body:                                         ; preds = %entry
  ; Widen both 8-byte inputs to i32 lanes and take the signed difference.
  %cur.v = load <8 x i8>, ptr %cur, align 1
  %cur.w = zext <8 x i8> %cur.v to <8 x i32>
  %ref.v = load <8 x i8>, ptr %ref, align 1
  %ref.w = zext <8 x i8> %ref.v to <8 x i32>
  %diff = sub nsw <8 x i32> %cur.w, %ref.w
  ; abs(diff): keep diff where it is > 0, otherwise use 0 - diff.
  %is.pos = icmp sgt <8 x i32> %diff, zeroinitializer
  %negated = sub nsw <8 x i32> zeroinitializer, %diff
  %absdiff = select <8 x i1> %is.pos, <8 x i32> %diff, <8 x i32> %negated
  ; Horizontal add: fold lanes 4-7 onto 0-3, then 2-3 onto 0-1, then 1 onto 0.
  %hi4 = shufflevector <8 x i32> %absdiff, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %sum4 = add <8 x i32> %absdiff, %hi4
  %hi2 = shufflevector <8 x i32> %sum4, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %sum2 = add <8 x i32> %sum4, %hi2
  %hi1 = shufflevector <8 x i32> %sum2, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %sum1 = add <8 x i32> %sum2, %hi1
  ; Lane 0 now holds the full sum.
  %sad = extractelement <8 x i32> %sum1, i32 0
  ret i32 %sad
}

; Variant with the select arms swapped: the negated difference is chosen
; where "diff < 1" (the signed "<= 0" form), the difference itself otherwise.
define i32 @sad8_32bit_icmp_sle(ptr nocapture readonly %cur, ptr nocapture readonly %ref, i32 %stride) local_unnamed_addr #2 {
; SSE2-LABEL: sad8_32bit_icmp_sle:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT:    psadbw %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    retq
;
; AVX-LABEL: sad8_32bit_icmp_sle:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    retq
entry:
  ; No other instruction in this function reads %idx.ext.
  %idx.ext = zext i32 %stride to i64
  br label %for.body

for.body:                                         ; preds = %entry
  ; Widen both 8-byte inputs to i32 lanes and take the signed difference.
  %cur.v = load <8 x i8>, ptr %cur, align 1
  %cur.w = zext <8 x i8> %cur.v to <8 x i32>
  %ref.v = load <8 x i8>, ptr %ref, align 1
  %ref.w = zext <8 x i8> %ref.v to <8 x i32>
  %diff = sub nsw <8 x i32> %cur.w, %ref.w
  ; abs(diff): use 0 - diff where diff is < 1, otherwise keep diff.
  %is.nonpos = icmp slt <8 x i32> %diff, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %negated = sub nsw <8 x i32> zeroinitializer, %diff
  %absdiff = select <8 x i1> %is.nonpos, <8 x i32> %negated, <8 x i32> %diff
  ; Horizontal add: fold lanes 4-7 onto 0-3, then 2-3 onto 0-1, then 1 onto 0.
  %hi4 = shufflevector <8 x i32> %absdiff, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %sum4 = add <8 x i32> %absdiff, %hi4
  %hi2 = shufflevector <8 x i32> %sum4, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %sum2 = add <8 x i32> %sum4, %hi2
  %hi1 = shufflevector <8 x i32> %sum2, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %sum1 = add <8 x i32> %sum2, %hi1
  ; Lane 0 now holds the full sum.
  %sad = extractelement <8 x i32> %sum1, i32 0
  ret i32 %sad
}
; Sum of absolute differences over eight bytes, computed in i32 lanes.
; The absolute value is written as "diff < 0 ? 0 - diff : diff". The
; assertions expect the whole compare/select/shuffle-add tree to collapse
; into one psadbw.
define i32 @sad8_32bit_icmp_slt(ptr nocapture readonly %cur, ptr nocapture readonly %ref, i32 %stride) local_unnamed_addr #3 {
; SSE2-LABEL: sad8_32bit_icmp_slt:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT:    psadbw %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    retq
;
; AVX-LABEL: sad8_32bit_icmp_slt:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    retq
entry:
  ; No other instruction in this function reads %idx.ext.
  %idx.ext = zext i32 %stride to i64
  br label %for.body

for.body:                                         ; preds = %entry
  ; Widen both 8-byte inputs to i32 lanes and take the signed difference.
  %cur.v = load <8 x i8>, ptr %cur, align 1
  %cur.w = zext <8 x i8> %cur.v to <8 x i32>
  %ref.v = load <8 x i8>, ptr %ref, align 1
  %ref.w = zext <8 x i8> %ref.v to <8 x i32>
  %diff = sub nsw <8 x i32> %cur.w, %ref.w
  ; abs(diff): use 0 - diff where diff is < 0, otherwise keep diff.
  %is.neg = icmp slt <8 x i32> %diff, zeroinitializer
  %negated = sub nsw <8 x i32> zeroinitializer, %diff
  %absdiff = select <8 x i1> %is.neg, <8 x i32> %negated, <8 x i32> %diff
  ; Horizontal add: fold lanes 4-7 onto 0-3, then 2-3 onto 0-1, then 1 onto 0.
  %hi4 = shufflevector <8 x i32> %absdiff, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %sum4 = add <8 x i32> %absdiff, %hi4
  %hi2 = shufflevector <8 x i32> %sum4, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %sum2 = add <8 x i32> %sum4, %hi2
  %hi1 = shufflevector <8 x i32> %sum2, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %sum1 = add <8 x i32> %sum2, %hi1
  ; Lane 0 now holds the full sum.
  %sad = extractelement <8 x i32> %sum1, i32 0
  ret i32 %sad
}

; The absolute differences are formed in i32 lanes, then sign-extended to
; i64 before the shuffle/add reduction; the result is returned as i64.
define i64 @sad8_64bit_icmp_sext_slt(ptr nocapture readonly %cur, ptr nocapture readonly %ref, i64 %stride) local_unnamed_addr #4 {
; SSE2-LABEL: sad8_64bit_icmp_sext_slt:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT:    psadbw %xmm0, %xmm1
; SSE2-NEXT:    movq %xmm1, %rax
; SSE2-NEXT:    retq
;
; AVX-LABEL: sad8_64bit_icmp_sext_slt:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    retq
entry:
  br label %for.body

for.body:                                         ; preds = %entry
  ; Widen both 8-byte inputs to i32 lanes and take the signed difference.
  %cur.v = load <8 x i8>, ptr %cur, align 1
  %cur.w = zext <8 x i8> %cur.v to <8 x i32>
  %ref.v = load <8 x i8>, ptr %ref, align 1
  %ref.w = zext <8 x i8> %ref.v to <8 x i32>
  %diff = sub nsw <8 x i32> %cur.w, %ref.w
  ; abs(diff): use 0 - diff where diff is < 0, otherwise keep diff.
  %is.neg = icmp slt <8 x i32> %diff, zeroinitializer
  %negated = sub nsw <8 x i32> zeroinitializer, %diff
  %absdiff = select <8 x i1> %is.neg, <8 x i32> %negated, <8 x i32> %diff
  ; Sign-extend the i32 absolute differences to i64 lanes before reducing.
  %abs.wide = sext <8 x i32> %absdiff to <8 x i64>
  ; Horizontal add: fold lanes 4-7 onto 0-3, then 2-3 onto 0-1, then 1 onto 0.
  %hi4 = shufflevector <8 x i64> %abs.wide, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %sum4 = add <8 x i64> %hi4, %abs.wide
  %hi2 = shufflevector <8 x i64> %sum4, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %sum2 = add <8 x i64> %sum4, %hi2
  %hi1 = shufflevector <8 x i64> %sum2, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %sum1 = add <8 x i64> %sum2, %hi1
  ; Lane 0 now holds the full sum.
  %sad = extractelement <8 x i64> %sum1, i32 0
  ret i64 %sad
}

; Same as the sext variant, except the i32 absolute differences are
; zero-extended to i64 before the reduction.
define i64 @sad8_64bit_icmp_zext_slt(ptr nocapture readonly %cur, ptr nocapture readonly %ref, i64 %stride) local_unnamed_addr #4 {
; SSE2-LABEL: sad8_64bit_icmp_zext_slt:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT:    psadbw %xmm0, %xmm1
; SSE2-NEXT:    movq %xmm1, %rax
; SSE2-NEXT:    retq
;
; AVX-LABEL: sad8_64bit_icmp_zext_slt:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    retq
entry:
  br label %for.body

for.body:                                         ; preds = %entry
  ; Widen both 8-byte inputs to i32 lanes and take the signed difference.
  %cur.v = load <8 x i8>, ptr %cur, align 1
  %cur.w = zext <8 x i8> %cur.v to <8 x i32>
  %ref.v = load <8 x i8>, ptr %ref, align 1
  %ref.w = zext <8 x i8> %ref.v to <8 x i32>
  %diff = sub nsw <8 x i32> %cur.w, %ref.w
  ; abs(diff): use 0 - diff where diff is < 0, otherwise keep diff.
  %is.neg = icmp slt <8 x i32> %diff, zeroinitializer
  %negated = sub nsw <8 x i32> zeroinitializer, %diff
  %absdiff = select <8 x i1> %is.neg, <8 x i32> %negated, <8 x i32> %diff
  ; Zero-extend the i32 absolute differences to i64 lanes before reducing.
  %abs.wide = zext <8 x i32> %absdiff to <8 x i64>
  ; Horizontal add: fold lanes 4-7 onto 0-3, then 2-3 onto 0-1, then 1 onto 0.
  %hi4 = shufflevector <8 x i64> %abs.wide, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %sum4 = add <8 x i64> %hi4, %abs.wide
  %hi2 = shufflevector <8 x i64> %sum4, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %sum2 = add <8 x i64> %sum4, %hi2
  %hi1 = shufflevector <8 x i64> %sum2, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %sum1 = add <8 x i64> %sum2, %hi1
  ; Lane 0 now holds the full sum.
  %sad = extractelement <8 x i64> %sum1, i32 0
  ret i64 %sad
}

; "Early" widening: the bytes are zero-extended straight to i64, so the
; subtract, compare, select and reduction all operate on <8 x i64>.
define i64 @sad8_early_64bit_icmp_zext_slt(ptr nocapture readonly %cur, ptr nocapture readonly %ref, i64 %stride) local_unnamed_addr #4 {
; SSE2-LABEL: sad8_early_64bit_icmp_zext_slt:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT:    psadbw %xmm0, %xmm1
; SSE2-NEXT:    movq %xmm1, %rax
; SSE2-NEXT:    retq
;
; AVX-LABEL: sad8_early_64bit_icmp_zext_slt:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    retq
entry:
  br label %for.body

for.body:                                         ; preds = %entry
  ; Widen both 8-byte inputs directly to i64 lanes and take the difference.
  %cur.v = load <8 x i8>, ptr %cur, align 1
  %cur.w = zext <8 x i8> %cur.v to <8 x i64>
  %ref.v = load <8 x i8>, ptr %ref, align 1
  %ref.w = zext <8 x i8> %ref.v to <8 x i64>
  %diff = sub nsw <8 x i64> %cur.w, %ref.w
  ; abs(diff): use 0 - diff where diff is < 0, otherwise keep diff.
  %is.neg = icmp slt <8 x i64> %diff, zeroinitializer
  %negated = sub nsw <8 x i64> zeroinitializer, %diff
  %absdiff = select <8 x i1> %is.neg, <8 x i64> %negated, <8 x i64> %diff
  ; Horizontal add: fold lanes 4-7 onto 0-3, then 2-3 onto 0-1, then 1 onto 0.
  %hi4 = shufflevector <8 x i64> %absdiff, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %sum4 = add <8 x i64> %hi4, %absdiff
  %hi2 = shufflevector <8 x i64> %sum4, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %sum2 = add <8 x i64> %sum4, %hi2
  %hi1 = shufflevector <8 x i64> %sum2, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %sum1 = add <8 x i64> %sum2, %hi1
  ; Lane 0 now holds the full sum.
  %sad = extractelement <8 x i64> %sum1, i32 0
  ret i64 %sad
}