; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW

@a = dso_local global [1024 x i8] zeroinitializer, align 16
@b = dso_local global [1024 x i8] zeroinitializer, align 16

define dso_local i32 @sad_16i8() nounwind {
; SSE2-LABEL: sad_16i8:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: .p2align 4
; SSE2-NEXT: .LBB0_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
; SSE2-NEXT: movdqu a+1024(%rax), %xmm2
; SSE2-NEXT: movdqu b+1024(%rax), %xmm3
; SSE2-NEXT: psadbw %xmm2, %xmm3
; SSE2-NEXT: paddd %xmm3, %xmm1
; SSE2-NEXT: addq $16, %rax
; SSE2-NEXT: jne .LBB0_1
; SSE2-NEXT: # %bb.2: # %middle.block
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: paddd %xmm0, %xmm0
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: retq
;
; AVX1-LABEL: sad_16i8:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX1-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: .p2align 4
; AVX1-NEXT: .LBB0_1: # %vector.body
; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
; AVX1-NEXT: vmovdqu a+1024(%rax), %xmm2
; AVX1-NEXT: vpsadbw b+1024(%rax), %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT: addq $16, %rax
; AVX1-NEXT: jne .LBB0_1
; AVX1-NEXT: # %bb.2: # %middle.block
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: sad_16i8:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX2-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: .p2align 4
; AVX2-NEXT: .LBB0_1: # %vector.body
; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
; AVX2-NEXT: vmovdqu a+1024(%rax), %xmm2
; AVX2-NEXT: vpsadbw b+1024(%rax), %xmm2, %xmm2
; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1
; AVX2-NEXT: addq $16, %rax
; AVX2-NEXT: jne .LBB0_1
; AVX2-NEXT: # %bb.2: # %middle.block
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: sad_16i8:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX512-NEXT: .p2align 4
; AVX512-NEXT: .LBB0_1: # %vector.body
; AVX512-NEXT: # =>This Inner Loop Header: Depth=1
; AVX512-NEXT: vmovdqu a+1024(%rax), %xmm1
; AVX512-NEXT: vpsadbw b+1024(%rax), %xmm1, %xmm1
; AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; AVX512-NEXT: addq $16, %rax
; AVX512-NEXT: jne .LBB0_1
; AVX512-NEXT: # %bb.2: # %middle.block
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
  br label %vector.body

vector.body:
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.phi = phi <16 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ]
  %0 = getelementptr inbounds [1024 x i8], ptr @a, i64 0, i64 %index
  %1 = bitcast ptr %0 to ptr
  %wide.load = load <16 x i8>, ptr %1, align 4
  %2 = zext <16 x i8> %wide.load to <16 x i32>
  %3 = getelementptr inbounds [1024 x i8], ptr @b, i64 0, i64 %index
  %4 = bitcast ptr %3 to ptr
  %wide.load1 = load <16 x i8>, ptr %4, align 4
  %5 = zext <16 x i8> %wide.load1 to <16 x i32>
  %6 = sub nsw <16 x i32> %2, %5
  %7 = icmp sgt <16 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
  %8 = sub nsw <16 x i32> zeroinitializer, %6
  %9 = select <16 x i1> %7, <16 x i32> %6, <16 x i32> %8
  %10 = add nsw <16 x i32> %9, %vec.phi
  %index.next = add i64 %index, 16
  %11 = icmp eq i64 %index.next, 1024
  br i1 %11, label %middle.block, label %vector.body

middle.block:
  %rdx.shuf = shufflevector <16 x i32> %10, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <16 x i32> %10, %rdx.shuf
  %rdx.shuf2 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx2 = add <16 x i32> %bin.rdx, %rdx.shuf2
  %rdx.shuf3 = shufflevector <16 x i32> %bin.rdx2, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx3 = add <16 x i32> %bin.rdx2, %rdx.shuf3
  %rdx.shuf4 = shufflevector <16 x i32> %bin.rdx3, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx4 = add <16 x i32> %bin.rdx3, %rdx.shuf4
  %12 = extractelement <16 x i32> %bin.rdx4, i32 0
  ret i32 %12
}

define dso_local i32 @sad_32i8() nounwind {
; SSE2-LABEL: sad_32i8:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: .p2align 4
; SSE2-NEXT: .LBB1_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
; SSE2-NEXT: movdqa a+1024(%rax), %xmm3
; SSE2-NEXT: psadbw b+1024(%rax), %xmm3
; SSE2-NEXT: paddd %xmm3, %xmm1
; SSE2-NEXT: movdqa a+1040(%rax), %xmm3
; SSE2-NEXT: psadbw b+1040(%rax), %xmm3
; SSE2-NEXT: paddd %xmm3, %xmm2
; SSE2-NEXT: addq $32, %rax
; SSE2-NEXT: jne .LBB1_1
; SSE2-NEXT: # %bb.2: # %middle.block
; SSE2-NEXT: paddd %xmm0, %xmm2
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: paddd %xmm0, %xmm0
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: retq
;
; AVX1-LABEL: sad_32i8:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX1-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: .p2align 4
; AVX1-NEXT: .LBB1_1: # %vector.body
; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
; AVX1-NEXT: vmovdqa a+1024(%rax), %xmm2
; AVX1-NEXT: vpsadbw b+1024(%rax), %xmm2, %xmm2
; AVX1-NEXT: vmovdqa a+1040(%rax), %xmm3
; AVX1-NEXT: vpsadbw b+1040(%rax), %xmm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT: addq $32, %rax
; AVX1-NEXT: jne .LBB1_1
; AVX1-NEXT: # %bb.2: # %middle.block
; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpaddd %xmm4, %xmm4, %xmm5
; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: sad_32i8:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX2-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: .p2align 4
; AVX2-NEXT: .LBB1_1: # %vector.body
; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
; AVX2-NEXT: vmovdqa a+1024(%rax), %ymm2
; AVX2-NEXT: vpsadbw b+1024(%rax), %ymm2, %ymm2
; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1
; AVX2-NEXT: addq $32, %rax
; AVX2-NEXT: jne .LBB1_1
; AVX2-NEXT: # %bb.2: # %middle.block
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm1
; AVX2-NEXT: vpaddd %ymm0, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: sad_32i8:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: .p2align 4
; AVX512-NEXT: .LBB1_1: # %vector.body
; AVX512-NEXT: # =>This Inner Loop Header: Depth=1
; AVX512-NEXT: vmovdqa a+1024(%rax), %ymm2
; AVX512-NEXT: vpsadbw b+1024(%rax), %ymm2, %ymm2
; AVX512-NEXT: vpaddd %zmm1, %zmm2, %zmm1
; AVX512-NEXT: addq $32, %rax
; AVX512-NEXT: jne .LBB1_1
; AVX512-NEXT: # %bb.2: # %middle.block
; AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
  br label %vector.body

vector.body:
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.phi = phi <32 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ]
  %0 = getelementptr inbounds [1024 x i8], ptr @a, i64 0, i64 %index
  %1 = bitcast ptr %0 to ptr
  %wide.load = load <32 x i8>, ptr %1, align 32
  %2 = zext <32 x i8> %wide.load to <32 x i32>
  %3 = getelementptr inbounds [1024 x i8], ptr @b, i64 0, i64 %index
  %4 = bitcast ptr %3 to ptr
  %wide.load1 = load <32 x i8>, ptr %4, align 32
  %5 = zext <32 x i8> %wide.load1 to <32 x i32>
  %6 = sub nsw <32 x i32> %2, %5
  %7 = icmp sgt <32 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
  %8 = sub nsw <32 x i32> zeroinitializer, %6
  %9 = select <32 x i1> %7, <32 x i32> %6, <32 x i32> %8
  %10 = add nsw <32 x i32> %9, %vec.phi
  %index.next = add i64 %index, 32
  %11 = icmp eq i64 %index.next, 1024
  br i1 %11, label %middle.block, label %vector.body

middle.block:
  %rdx.shuf = shufflevector <32 x i32> %10, <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <32 x i32> %10, %rdx.shuf
  %rdx.shuf2 = shufflevector <32 x i32> %bin.rdx, <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx2 = add <32 x i32> %bin.rdx, %rdx.shuf2
  %rdx.shuf3 = shufflevector <32 x i32> %bin.rdx2, <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx3 = add <32 x i32> %bin.rdx2, %rdx.shuf3
  %rdx.shuf4 = shufflevector <32 x i32> %bin.rdx3, <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx4 = add <32 x i32> %bin.rdx3, %rdx.shuf4
  %rdx.shuf5 = shufflevector <32 x i32> %bin.rdx4, <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx5 = add <32 x i32> %bin.rdx4, %rdx.shuf5
  %12 = extractelement <32 x i32> %bin.rdx5, i32 0
  ret i32 %12
}

define dso_local i32 @sad_avx64i8() nounwind {
; SSE2-LABEL: sad_avx64i8:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: .p2align 4
; SSE2-NEXT: .LBB2_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
; SSE2-NEXT: movdqa a+1024(%rax), %xmm5
; SSE2-NEXT: psadbw b+1024(%rax), %xmm5
; SSE2-NEXT: paddd %xmm5, %xmm0
; SSE2-NEXT: movdqa a+1040(%rax), %xmm5
; SSE2-NEXT: psadbw b+1040(%rax), %xmm5
; SSE2-NEXT: paddd %xmm5, %xmm3
; SSE2-NEXT: movdqa a+1056(%rax), %xmm5
; SSE2-NEXT: psadbw b+1056(%rax), %xmm5
; SSE2-NEXT: paddd %xmm5, %xmm2
; SSE2-NEXT: movdqa a+1072(%rax), %xmm5
; SSE2-NEXT: psadbw b+1072(%rax), %xmm5
; SSE2-NEXT: paddd %xmm5, %xmm1
; SSE2-NEXT: addq $64, %rax
; SSE2-NEXT: jne .LBB2_1
; SSE2-NEXT: # %bb.2: # %middle.block
; SSE2-NEXT: paddd %xmm4, %xmm2
; SSE2-NEXT: pxor %xmm5, %xmm5
; SSE2-NEXT: paddd %xmm5, %xmm5
; SSE2-NEXT: paddd %xmm4, %xmm0
; SSE2-NEXT: paddd %xmm4, %xmm1
; SSE2-NEXT: paddd %xmm4, %xmm3
; SSE2-NEXT: paddd %xmm5, %xmm3
; SSE2-NEXT: paddd %xmm5, %xmm1
; SSE2-NEXT: paddd %xmm3, %xmm1
; SSE2-NEXT: paddd %xmm5, %xmm0
; SSE2-NEXT: paddd %xmm2, %xmm5
; SSE2-NEXT: paddd %xmm0, %xmm5
; SSE2-NEXT: paddd %xmm1, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3]
; SSE2-NEXT: paddd %xmm5, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
; AVX1-LABEL: sad_avx64i8:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX1-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: .p2align 4
; AVX1-NEXT: .LBB2_1: # %vector.body
; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
; AVX1-NEXT: vmovdqa a+1024(%rax), %xmm3
; AVX1-NEXT: vpsadbw b+1024(%rax), %xmm3, %xmm3
; AVX1-NEXT: vmovdqa a+1040(%rax), %xmm4
; AVX1-NEXT: vpsadbw b+1040(%rax), %xmm4, %xmm4
; AVX1-NEXT: vmovdqa a+1056(%rax), %xmm5
; AVX1-NEXT: vpsadbw b+1056(%rax), %xmm5, %xmm5
; AVX1-NEXT: vmovdqa a+1072(%rax), %xmm6
; AVX1-NEXT: vpsadbw b+1072(%rax), %xmm6, %xmm6
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6
; AVX1-NEXT: vpaddd %xmm1, %xmm5, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
; AVX1-NEXT: addq $64, %rax
; AVX1-NEXT: jne .LBB2_1
; AVX1-NEXT: # %bb.2: # %middle.block
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpaddd %xmm4, %xmm4, %xmm5
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm7
; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm8
; AVX1-NEXT: vpaddd %xmm0, %xmm8, %xmm8
; AVX1-NEXT: vpaddd %xmm2, %xmm8, %xmm2
; AVX1-NEXT: vpaddd %xmm7, %xmm0, %xmm0
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm1
; AVX1-NEXT: vpaddd %xmm1, %xmm6, %xmm2
; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: sad_avx64i8:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX2-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: .p2align 4
; AVX2-NEXT: .LBB2_1: # %vector.body
; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
; AVX2-NEXT: vmovdqa a+1024(%rax), %ymm3
; AVX2-NEXT: vpsadbw b+1024(%rax), %ymm3, %ymm3
; AVX2-NEXT: vpaddd %ymm1, %ymm3, %ymm1
; AVX2-NEXT: vmovdqa a+1056(%rax), %ymm3
; AVX2-NEXT: vpsadbw b+1056(%rax), %ymm3, %ymm3
; AVX2-NEXT: vpaddd %ymm2, %ymm3, %ymm2
; AVX2-NEXT: addq $64, %rax
; AVX2-NEXT: jne .LBB2_1
; AVX2-NEXT: # %bb.2: # %middle.block
; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm2
; AVX2-NEXT: vpaddd %ymm0, %ymm0, %ymm3
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpaddd %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %ymm3, %ymm2, %ymm1
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: sad_avx64i8:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: .p2align 4
; AVX512F-NEXT: .LBB2_1: # %vector.body
; AVX512F-NEXT: # =>This Inner Loop Header: Depth=1
; AVX512F-NEXT: vmovdqa a+1024(%rax), %ymm2
; AVX512F-NEXT: vpsadbw b+1024(%rax), %ymm2, %ymm2
; AVX512F-NEXT: vmovdqa a+1056(%rax), %ymm3
; AVX512F-NEXT: vpsadbw b+1056(%rax), %ymm3, %ymm3
; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
; AVX512F-NEXT: vpaddd %zmm1, %zmm2, %zmm1
; AVX512F-NEXT: addq $64, %rax
; AVX512F-NEXT: jne .LBB2_1
; AVX512F-NEXT: # %bb.2: # %middle.block
; AVX512F-NEXT: vpaddd %zmm0, %zmm1, %zmm1
; AVX512F-NEXT: vpaddd %zmm0, %zmm0, %zmm0
; AVX512F-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vmovd %xmm0, %eax
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: sad_avx64i8:
; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: .p2align 4
; AVX512BW-NEXT: .LBB2_1: # %vector.body
; AVX512BW-NEXT: # =>This Inner Loop Header: Depth=1
; AVX512BW-NEXT: vmovdqa64 a+1024(%rax), %zmm2
; AVX512BW-NEXT: vpsadbw b+1024(%rax), %zmm2, %zmm2
; AVX512BW-NEXT: vpaddd %zmm1, %zmm2, %zmm1
; AVX512BW-NEXT: addq $64, %rax
; AVX512BW-NEXT: jne .LBB2_1
; AVX512BW-NEXT: # %bb.2: # %middle.block
; AVX512BW-NEXT: vpaddd %zmm0, %zmm1, %zmm1
; AVX512BW-NEXT: vpaddd %zmm0, %zmm0, %zmm0
; AVX512BW-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vmovd %xmm0, %eax
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
entry:
  br label %vector.body

vector.body:
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.phi = phi <64 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ]
  %0 = getelementptr inbounds [1024 x i8], ptr @a, i64 0, i64 %index
  %1 = bitcast ptr %0 to ptr
  %wide.load = load <64 x i8>, ptr %1, align 64
  %2 = zext <64 x i8> %wide.load to <64 x i32>
  %3 = getelementptr inbounds [1024 x i8], ptr @b, i64 0, i64 %index
  %4 = bitcast ptr %3 to ptr
  %wide.load1 = load <64 x i8>, ptr %4, align 64
  %5 = zext <64 x i8> %wide.load1 to <64 x i32>
  %6 = sub nsw <64 x i32> %2, %5
  %7 = icmp sgt <64 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
  %8 = sub nsw <64 x i32> zeroinitializer, %6
  %9 = select <64 x i1> %7, <64 x i32> %6, <64 x i32> %8
  %10 = add nsw <64 x i32> %9, %vec.phi
  %index.next = add i64 %index, 64
  %11 = icmp eq i64 %index.next, 1024
  br i1 %11, label %middle.block, label %vector.body

middle.block:
  %rdx.shuf = shufflevector <64 x i32> %10, <64 x i32> undef, <64 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <64 x i32> %10, %rdx.shuf
  %rdx.shuf2 = shufflevector <64 x i32> %bin.rdx, <64 x i32> undef, <64 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx2 = add <64 x i32> %bin.rdx, %rdx.shuf2
  %rdx.shuf3 = shufflevector <64 x i32> %bin.rdx2, <64 x i32> undef, <64 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx3 = add <64 x i32> %bin.rdx2, %rdx.shuf3
  %rdx.shuf4 = shufflevector <64 x i32> %bin.rdx3, <64 x i32> undef, <64 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx4 = add <64 x i32> %bin.rdx3, %rdx.shuf4
  %rdx.shuf5 = shufflevector <64 x i32> %bin.rdx4, <64 x i32> undef, <64 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx5 = add <64 x i32> %bin.rdx4, %rdx.shuf5
  %rdx.shuf6 = shufflevector <64 x i32> %bin.rdx5, <64 x i32> undef, <64 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx6 = add <64 x i32> %bin.rdx5, %rdx.shuf6
  %12 = extractelement <64 x i32> %bin.rdx6, i32 0
  ret i32 %12
}

define dso_local i32 @sad_2i8() nounwind {
; SSE2-LABEL: sad_2i8:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00
; SSE2-NEXT: movd {{.*#+}} xmm1 = [65535,0,0,0]
; SSE2-NEXT: .p2align 4
; SSE2-NEXT: .LBB3_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm3
; SSE2-NEXT: psadbw %xmm2, %xmm3
; SSE2-NEXT: paddd %xmm3, %xmm0
; SSE2-NEXT: addq $2, %rax
; SSE2-NEXT: jne .LBB3_1
; SSE2-NEXT: # %bb.2: # %middle.block
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
; AVX-LABEL: sad_2i8:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: .p2align 4
; AVX-NEXT: .LBB3_1: # %vector.body
; AVX-NEXT: # =>This Inner Loop Header: Depth=1
; AVX-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7]
; AVX-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX-NEXT: addq $2, %rax
; AVX-NEXT: jne .LBB3_1
; AVX-NEXT: # %bb.2: # %middle.block
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
entry:
  br label %vector.body

vector.body:
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.phi = phi <2 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ]
  %0 = getelementptr inbounds [1024 x i8], ptr @a, i64 0, i64 %index
  %1 = bitcast ptr %0 to ptr
  %wide.load = load <2 x i8>, ptr %1, align 4
  %2 = zext <2 x i8> %wide.load to <2 x i32>
  %3 = getelementptr inbounds [1024 x i8], ptr @b, i64 0, i64 %index
  %4 = bitcast ptr %3 to ptr
  %wide.load1 = load <2 x i8>, ptr %4, align 4
  %5 = zext <2 x i8> %wide.load1 to <2 x i32>
  %6 = sub nsw <2 x i32> %2, %5
  %7 = icmp sgt <2 x i32> %6, <i32 -1, i32 -1>
  %8 = sub nsw <2 x i32> zeroinitializer, %6
  %9 = select <2 x i1> %7, <2 x i32> %6, <2 x i32> %8
  %10 = add nsw <2 x i32> %9, %vec.phi
  %index.next = add i64 %index, 2
  %11 = icmp eq i64 %index.next, 1024
  br i1 %11, label %middle.block, label %vector.body

middle.block:
  %rdx.shuf = shufflevector <2 x i32> %10, <2 x i32> undef, <2 x i32> <i32 1, i32 undef>
  %bin.rdx = add <2 x i32> %10, %rdx.shuf
  %12 = extractelement <2 x i32> %bin.rdx, i32 0
  ret i32 %12
}

define dso_local i32 @sad_4i8() nounwind {
; SSE2-LABEL: sad_4i8:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00
; SSE2-NEXT: .p2align 4
; SSE2-NEXT: .LBB4_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT: psadbw %xmm1, %xmm2
; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: addq $4, %rax
; SSE2-NEXT: jne .LBB4_1
; SSE2-NEXT: # %bb.2: # %middle.block
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: retq
;
; AVX-LABEL: sad_4i8:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX-NEXT: .p2align 4
; AVX-NEXT: .LBB4_1: # %vector.body
; AVX-NEXT: # =>This Inner Loop Header: Depth=1
; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX-NEXT: vpsadbw %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX-NEXT: addq $4, %rax
; AVX-NEXT: jne .LBB4_1
; AVX-NEXT: # %bb.2: # %middle.block
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
entry:
  br label %vector.body

vector.body:
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ]
  %0 = getelementptr inbounds [1024 x i8], ptr @a, i64 0, i64 %index
  %1 = bitcast ptr %0 to ptr
  %wide.load = load <4 x i8>, ptr %1, align 4
  %2 = zext <4 x i8> %wide.load to <4 x i32>
  %3 = getelementptr inbounds [1024 x i8], ptr @b, i64 0, i64 %index
  %4 = bitcast ptr %3 to ptr
  %wide.load1 = load <4 x i8>, ptr %4, align 4
  %5 = zext <4 x i8> %wide.load1 to <4 x i32>
  %6 = sub nsw <4 x i32> %2, %5
  %7 = icmp sgt <4 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1>
  %8 = sub nsw <4 x i32> zeroinitializer, %6
  %9 = select <4 x i1> %7, <4 x i32> %6, <4 x i32> %8
  %10 = add nsw <4 x i32> %9, %vec.phi
  %index.next = add i64 %index, 4
  %11 = icmp eq i64 %index.next, 1024
  br i1 %11, label %middle.block, label %vector.body

middle.block:
  %h2 = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %sum2 = add <4 x i32> %10, %h2
  %h3 = shufflevector <4 x i32> %sum2, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %sum3 = add <4 x i32> %sum2, %h3
  %sum = extractelement <4 x i32> %sum3, i32 0
  ret i32 %sum
}


define dso_local i32 @sad_nonloop_4i8(ptr nocapture readonly %p, i64, ptr nocapture readonly %q) local_unnamed_addr #0 {
; SSE2-LABEL: sad_nonloop_4i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: psadbw %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
; AVX-LABEL: sad_nonloop_4i8:
; AVX: # %bb.0:
; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
  %v1 = load <4 x i8>, ptr %p, align 1
  %z1 = zext <4 x i8> %v1 to <4 x i32>
  %v2 = load <4 x i8>, ptr %q, align 1
  %z2 = zext <4 x i8> %v2 to <4 x i32>
  %sub = sub nsw <4 x i32> %z1, %z2
  %isneg = icmp sgt <4 x i32> %sub, <i32 -1, i32 -1, i32 -1, i32 -1>
  %neg = sub nsw <4 x i32> zeroinitializer, %sub
  %abs = select <4 x i1> %isneg, <4 x i32> %sub, <4 x i32> %neg
  %h2 = shufflevector <4 x i32> %abs, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %sum2 = add <4 x i32> %abs, %h2
  %h3 = shufflevector <4 x i32> %sum2, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %sum3 = add <4 x i32> %sum2, %h3
  %sum = extractelement <4 x i32> %sum3, i32 0
  ret i32 %sum
}

define dso_local i32 @sad_nonloop_8i8(ptr nocapture readonly %p, i64, ptr nocapture readonly %q) local_unnamed_addr #0 {
; SSE2-LABEL: sad_nonloop_8i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT: psadbw %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
; AVX-LABEL: sad_nonloop_8i8:
; AVX: # %bb.0:
; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
  %v1 = load <8 x i8>, ptr %p, align 1
  %z1 = zext <8 x i8> %v1 to <8 x i32>
  %v2 = load <8 x i8>, ptr %q, align 1
  %z2 = zext <8 x i8> %v2 to <8 x i32>
  %sub = sub nsw <8 x i32> %z1, %z2
  %isneg = icmp sgt <8 x i32> %sub, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
  %neg = sub nsw <8 x i32> zeroinitializer, %sub
  %abs = select <8 x i1> %isneg, <8 x i32> %sub, <8 x i32> %neg
  %h1 = shufflevector <8 x i32> %abs, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %sum1 = add <8 x i32> %abs, %h1
  %h2 = shufflevector <8 x i32> %sum1, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %sum2 = add <8 x i32> %sum1, %h2
  %h3 = shufflevector <8 x i32> %sum2, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %sum3 = add <8 x i32> %sum2, %h3
  %sum = extractelement <8 x i32> %sum3, i32 0
  ret i32 %sum
}

define dso_local i32 @sad_nonloop_16i8(ptr nocapture readonly %p, i64, ptr nocapture readonly %q) local_unnamed_addr #0 {
; SSE2-LABEL: sad_nonloop_16i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqu (%rdi), %xmm0
; SSE2-NEXT: movdqu (%rdx), %xmm1
; SSE2-NEXT: psadbw %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: paddq %xmm1, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: retq
;
; AVX-LABEL: sad_nonloop_16i8:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqu (%rdi), %xmm0
; AVX-NEXT: vpsadbw (%rdx), %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
  %v1 = load <16 x i8>, ptr %p, align 1
  %z1 = zext <16 x i8> %v1 to <16 x i32>
  %v2 = load <16 x i8>, ptr %q, align 1
  %z2 = zext <16 x i8> %v2 to <16 x i32>
  %sub = sub nsw <16 x i32> %z1, %z2
  %isneg = icmp sgt <16 x i32> %sub, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
  %neg = sub nsw <16 x i32> zeroinitializer, %sub
  %abs = select <16 x i1> %isneg, <16 x i32> %sub, <16 x i32> %neg
  %h0 = shufflevector <16 x i32> %abs, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %sum0 = add <16 x i32> %abs, %h0
  %h1 = shufflevector <16 x i32> %sum0, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %sum1 = add <16 x i32> %sum0, %h1
  %h2 = shufflevector <16 x i32> %sum1, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %sum2 = add <16 x i32> %sum1, %h2
  %h3 = shufflevector <16 x i32> %sum2, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %sum3 = add <16 x i32> %sum2, %h3
  %sum = extractelement <16 x i32> %sum3, i32 0
  ret i32 %sum
}

define dso_local i32 @sad_nonloop_32i8(ptr nocapture readonly %p, i64, ptr nocapture readonly %q) local_unnamed_addr #0 {
; SSE2-LABEL: sad_nonloop_32i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqu (%rdx), %xmm0
; SSE2-NEXT: movdqu 16(%rdx), %xmm1
; SSE2-NEXT: movdqu (%rdi), %xmm2
; SSE2-NEXT: psadbw %xmm0, %xmm2
; SSE2-NEXT: movdqu 16(%rdi), %xmm0
; SSE2-NEXT: psadbw %xmm1, %xmm0
; SSE2-NEXT: paddq %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: paddq %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
; AVX1-LABEL: sad_nonloop_32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqu (%rdi), %xmm0
; AVX1-NEXT: vmovdqu 16(%rdi), %xmm1
; AVX1-NEXT: vpsadbw 16(%rdx), %xmm1, %xmm1
; AVX1-NEXT: vpsadbw (%rdx), %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: retq
;
; AVX2-LABEL: sad_nonloop_32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqu (%rdi), %ymm0
; AVX2-NEXT: vpsadbw (%rdx), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: sad_nonloop_32i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqu (%rdi), %ymm0
; AVX512-NEXT: vpsadbw (%rdx), %ymm0, %ymm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %v1 = load <32 x i8>, ptr %p, align 1
  %z1 = zext <32 x i8> %v1 to <32 x i32>
  %v2 = load <32 x i8>, ptr %q, align 1
  %z2 = zext <32 x i8> %v2 to <32 x i32>
  %sub = sub nsw <32 x i32> %z1, %z2
  %isneg = icmp sgt <32 x i32> %sub, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
  %neg = sub nsw <32 x i32> zeroinitializer, %sub
  %abs = select <32 x i1> %isneg, <32 x i32> %sub, <32 x i32> %neg
  %h32 = shufflevector <32 x i32> %abs, <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %sum32 = add <32 x i32> %abs, %h32
  %h0 = shufflevector <32 x i32> %sum32, <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %sum0 = add <32 x i32> %sum32, %h0
  %h1 = shufflevector <32 x i32> %sum0, <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %sum1 = add <32 x i32> %sum0, %h1
  %h2 = shufflevector <32 x i32> %sum1, <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %sum2 = add <32 x i32> %sum1, %h2
  %h3 = shufflevector <32 x i32> %sum2, <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %sum3 = add <32 x i32> %sum2, %h3
  %sum = extractelement <32 x i32> %sum3, i32 0
  ret i32 %sum
}

define dso_local i32 @sad_nonloop_64i8(ptr nocapture readonly %p, i64, ptr nocapture readonly %q) local_unnamed_addr #0 {
; SSE2-LABEL: sad_nonloop_64i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqu (%rdx), %xmm0
; SSE2-NEXT: movdqu 16(%rdx), %xmm1
; SSE2-NEXT: movdqu 32(%rdx), %xmm2
; SSE2-NEXT: movdqu 48(%rdx), %xmm3
; SSE2-NEXT: movdqu (%rdi), %xmm4
; SSE2-NEXT: psadbw %xmm0, %xmm4
; SSE2-NEXT: movdqu 16(%rdi), %xmm0
; SSE2-NEXT: psadbw %xmm1, %xmm0
; SSE2-NEXT: movdqu 32(%rdi), %xmm1
; SSE2-NEXT: psadbw %xmm2, %xmm1
; SSE2-NEXT: paddq %xmm4, %xmm1
; SSE2-NEXT: movdqu 48(%rdi), %xmm2
; SSE2-NEXT: psadbw %xmm3, %xmm2
; SSE2-NEXT: paddq %xmm0, %xmm2
; SSE2-NEXT: paddq %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE2-NEXT: paddq %xmm2, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: retq
;
; AVX1-LABEL: sad_nonloop_64i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqu (%rdi), %xmm0
; AVX1-NEXT: vmovdqu 16(%rdi), %xmm1
; AVX1-NEXT: vmovdqu 32(%rdi), %xmm2
; AVX1-NEXT: vmovdqu 48(%rdi), %xmm3
; AVX1-NEXT: vpsadbw 48(%rdx), %xmm3, %xmm3
; AVX1-NEXT: vpsadbw 16(%rdx), %xmm1, %xmm1
; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw 32(%rdx), %xmm2, %xmm2
; AVX1-NEXT: vpsadbw (%rdx), %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: retq
;
; AVX2-LABEL: sad_nonloop_64i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqu (%rdi), %ymm0
; AVX2-NEXT: vmovdqu 32(%rdi), %ymm1
; AVX2-NEXT: vpsadbw 32(%rdx), %ymm1, %ymm1
; AVX2-NEXT: vpsadbw (%rdx), %ymm0, %ymm0
; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: sad_nonloop_64i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqu (%rdi), %ymm0
; AVX512F-NEXT: vmovdqu 32(%rdi), %ymm1
; AVX512F-NEXT: vpsadbw 32(%rdx), %ymm1, %ymm1
; AVX512F-NEXT: vpsadbw (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512F-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vmovd %xmm0, %eax
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: sad_nonloop_64i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqu64 (%rdi), %zmm0
; AVX512BW-NEXT: vpsadbw (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BW-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vmovd %xmm0, %eax
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %v1 = load <64 x i8>, ptr %p, align 1
  %z1 = zext <64 x i8> %v1 to <64 x i32>
  %v2 = load <64 x i8>, ptr %q, align 1
  %z2 = zext <64 x i8> %v2 to <64 x i32>
  %sub = sub nsw <64 x i32> %z1, %z2
  %isneg = icmp sgt <64 x i32> %sub, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
  %neg = sub nsw <64 x i32> zeroinitializer, %sub
  %abs = select <64 x i1> %isneg, <64 x i32> %sub, <64 x i32> %neg
  %h64 = shufflevector <64 x i32> %abs, <64 x i32> undef, <64 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %sum64 = add <64 x i32> %abs, %h64
  %h32 = shufflevector <64 x i32> %sum64, <64 x i32> undef, <64 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %sum32 = add <64 x i32> %sum64, %h32
  %h0 = shufflevector <64 x i32> %sum32, <64 x i32> undef, <64 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %sum0 = add <64 x i32> %sum32, %h0
  %h1 = shufflevector <64 x i32> %sum0, <64 x i32> undef, <64 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %sum1 = add <64 x i32> %sum0, %h1
  %h2 = shufflevector <64 x i32> %sum1, <64 x i32> undef, <64 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %sum2 = add <64 x i32> %sum1, %h2
  %h3 = shufflevector <64 x i32> %sum2, <64 x i32> undef, <64 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %sum3 = add <64 x i32> %sum2, %h3
  %sum = extractelement <64 x i32> %sum3, i32 0
  ret i32 %sum
}

; This contains an unrolled sad loop with a non-zero initial value.
; DAGCombiner reassociation previously rewrote the adds to move the constant vector further down the tree.
; This resulted in the vector-reduction flag being lost.
define dso_local i32 @sad_unroll_nonzero_initial(ptr %arg, ptr %arg1, ptr %arg2, ptr %arg3) {
; SSE2-LABEL: sad_unroll_nonzero_initial:
; SSE2: # %bb.0: # %bb
; SSE2-NEXT: movdqu (%rdi), %xmm0
; SSE2-NEXT: movdqu (%rsi), %xmm1
; SSE2-NEXT: psadbw %xmm0, %xmm1
; SSE2-NEXT: movdqu (%rdx), %xmm0
; SSE2-NEXT: movdqu (%rcx), %xmm2
; SSE2-NEXT: psadbw %xmm0, %xmm2
; SSE2-NEXT: paddd %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: incl %eax
; SSE2-NEXT: retq
;
; AVX1-LABEL: sad_unroll_nonzero_initial:
; AVX1: # %bb.0: # %bb
; AVX1-NEXT: vmovdqu (%rdi), %xmm0
; AVX1-NEXT: vpsadbw (%rsi), %xmm0, %xmm0
; AVX1-NEXT: vmovdqu (%rdx), %xmm1
; AVX1-NEXT: vpsadbw (%rcx), %xmm1, %xmm1
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: retq
;
; AVX2-LABEL: sad_unroll_nonzero_initial:
; AVX2: # %bb.0: # %bb
; AVX2-NEXT: vmovdqu (%rdi), %xmm0
; AVX2-NEXT: vpsadbw (%rsi), %xmm0, %xmm0
; AVX2-NEXT: vmovdqu (%rdx), %xmm1
; AVX2-NEXT: vpsadbw (%rcx), %xmm1, %xmm1
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: retq
;
; AVX512-LABEL: sad_unroll_nonzero_initial:
; AVX512: # %bb.0: # %bb
; AVX512-NEXT: vmovdqu (%rdi), %xmm0
; AVX512-NEXT: vpsadbw (%rsi), %xmm0, %xmm0
; AVX512-NEXT: vmovdqu (%rdx), %xmm1
; AVX512-NEXT: vpsadbw (%rcx), %xmm1, %xmm1
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: retq
bb:
  %tmp = load <16 x i8>, ptr %arg, align 1
  %tmp4 = load <16 x i8>, ptr %arg1, align 1
  %tmp5 = zext <16 x i8> %tmp to <16 x i32>
  %tmp6 = zext <16 x i8> %tmp4 to <16 x i32>
  %tmp7 = sub nsw <16 x i32> %tmp5, %tmp6
  %tmp8 = icmp slt <16 x i32> %tmp7, zeroinitializer
  %tmp9 = sub nsw <16 x i32> zeroinitializer, %tmp7
  %tmp10 = select <16 x i1> %tmp8, <16 x i32> %tmp9, <16 x i32> %tmp7
  %tmp11 = add nuw nsw <16 x i32> %tmp10, <i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  %tmp12 = load <16 x i8>, ptr %arg2, align 1
  %tmp13 = load <16 x i8>, ptr %arg3, align 1
  %tmp14 = zext <16 x i8> %tmp12 to <16 x i32>
  %tmp15 = zext <16 x i8> %tmp13 to <16 x i32>
  %tmp16 = sub nsw <16 x i32> %tmp14, %tmp15
  %tmp17 = icmp slt <16 x i32> %tmp16, zeroinitializer
  %tmp18 = sub nsw <16 x i32> zeroinitializer, %tmp16
  %tmp19 = select <16 x i1> %tmp17, <16 x i32> %tmp18, <16 x i32> %tmp16
  %tmp20 = add nuw nsw <16 x i32> %tmp19, %tmp11
  %tmp21 = shufflevector <16 x i32> %tmp20, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %tmp22 = add <16 x i32> %tmp20, %tmp21
  %tmp23 = shufflevector <16 x i32> %tmp22, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %tmp24 = add <16 x i32> %tmp22, %tmp23
  %tmp25 = shufflevector <16 x i32> %tmp24, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %tmp26 = add <16 x i32> %tmp24, %tmp25
  %tmp27 = shufflevector <16 x i32> %tmp26, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %tmp28 = add <16 x i32> %tmp26, %tmp27
  %tmp29 = extractelement <16 x i32> %tmp28, i64 0
  ret i32 %tmp29
}

; This test contains two absolute difference patterns joined by an add.
; The result of that add is then reduced to a single element.
; SelectionDAGBuilder should tag the joining add as a vector reduction.
; We need to recognize that both sides can use psadbw.
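; For reference, a rough scalar equivalent of the pattern below (an illustrative
; sketch only, not part of the test input; the C function and parameter names
; are hypothetical). The result is the sum over all 16 lanes of
; abs(p[i] - q[i]) + abs(r[i] - s[i]), so each side can become a PSADBW and the
; two results are added before the horizontal reduction:
;   int sad_double_reduction(const uint8_t *p, const uint8_t *q,
;                            const uint8_t *r, const uint8_t *s) {
;     int sum = 0;
;     for (int i = 0; i < 16; ++i)
;       sum += abs(p[i] - q[i]) + abs(r[i] - s[i]);
;     return sum;
;   }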
define dso_local i32 @sad_double_reduction(ptr %arg, ptr %arg1, ptr %arg2, ptr %arg3) {
; SSE2-LABEL: sad_double_reduction:
; SSE2: # %bb.0: # %bb
; SSE2-NEXT: movdqu (%rdi), %xmm0
; SSE2-NEXT: movdqu (%rsi), %xmm1
; SSE2-NEXT: psadbw %xmm0, %xmm1
; SSE2-NEXT: movdqu (%rdx), %xmm0
; SSE2-NEXT: movdqu (%rcx), %xmm2
; SSE2-NEXT: psadbw %xmm0, %xmm2
; SSE2-NEXT: paddd %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: retq
;
; AVX-LABEL: sad_double_reduction:
; AVX: # %bb.0: # %bb
; AVX-NEXT: vmovdqu (%rdi), %xmm0
; AVX-NEXT: vpsadbw (%rsi), %xmm0, %xmm0
; AVX-NEXT: vmovdqu (%rdx), %xmm1
; AVX-NEXT: vpsadbw (%rcx), %xmm1, %xmm1
; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
bb:
  %tmp = load <16 x i8>, ptr %arg, align 1
  %tmp4 = load <16 x i8>, ptr %arg1, align 1
  %tmp5 = zext <16 x i8> %tmp to <16 x i32>
  %tmp6 = zext <16 x i8> %tmp4 to <16 x i32>
  %tmp7 = sub nsw <16 x i32> %tmp5, %tmp6
  %tmp8 = icmp slt <16 x i32> %tmp7, zeroinitializer
  %tmp9 = sub nsw <16 x i32> zeroinitializer, %tmp7
  %tmp10 = select <16 x i1> %tmp8, <16 x i32> %tmp9, <16 x i32> %tmp7
  %tmp11 = load <16 x i8>, ptr %arg2, align 1
  %tmp12 = load <16 x i8>, ptr %arg3, align 1
  %tmp13 = zext <16 x i8> %tmp11 to <16 x i32>
  %tmp14 = zext <16 x i8> %tmp12 to <16 x i32>
  %tmp15 = sub nsw <16 x i32> %tmp13, %tmp14
  %tmp16 = icmp slt <16 x i32> %tmp15, zeroinitializer
  %tmp17 = sub nsw <16 x i32> zeroinitializer, %tmp15
  %tmp18 = select <16 x i1> %tmp16, <16 x i32> %tmp17, <16 x i32> %tmp15
  %tmp19 = add nuw nsw <16 x i32> %tmp18, %tmp10
  %tmp20 = shufflevector <16 x i32> %tmp19, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %tmp21 = add <16 x i32> %tmp19, %tmp20
  %tmp22 = shufflevector <16 x i32> %tmp21, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %tmp23 = add <16 x i32> %tmp21, %tmp22
  %tmp24 = shufflevector <16 x i32> %tmp23, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %tmp25 = add <16 x i32> %tmp23, %tmp24
  %tmp26 = shufflevector <16 x i32> %tmp25, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %tmp27 = add <16 x i32> %tmp25, %tmp26
  %tmp28 = extractelement <16 x i32> %tmp27, i64 0
  ret i32 %tmp28
}

; This test contains two absolute difference patterns joined by an add. The result of that add is then reduced to a single element.
; SelectionDAGBuilder should tag the joining add as a vector reduction. We need to recognize that both sides can use psadbw.
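; Same computation as sad_double_reduction above, but each absolute value is
; expressed with the llvm.abs intrinsic rather than the sub/icmp/select idiom,
; i.e. (with illustrative value names):
;   %d  = sub nsw <16 x i32> %x, %y
;   %ad = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %d, i1 false)
; Both forms should be matched to psadbw.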
define dso_local i32 @sad_double_reduction_abs(ptr %arg, ptr %arg1, ptr %arg2, ptr %arg3) {
; SSE2-LABEL: sad_double_reduction_abs:
; SSE2: # %bb.0: # %bb
; SSE2-NEXT: movdqu (%rdi), %xmm0
; SSE2-NEXT: movdqu (%rsi), %xmm1
; SSE2-NEXT: psadbw %xmm0, %xmm1
; SSE2-NEXT: movdqu (%rdx), %xmm0
; SSE2-NEXT: movdqu (%rcx), %xmm2
; SSE2-NEXT: psadbw %xmm0, %xmm2
; SSE2-NEXT: paddd %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: retq
;
; AVX-LABEL: sad_double_reduction_abs:
; AVX: # %bb.0: # %bb
; AVX-NEXT: vmovdqu (%rdi), %xmm0
; AVX-NEXT: vpsadbw (%rsi), %xmm0, %xmm0
; AVX-NEXT: vmovdqu (%rdx), %xmm1
; AVX-NEXT: vpsadbw (%rcx), %xmm1, %xmm1
; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
bb:
  %tmp = load <16 x i8>, ptr %arg, align 1
  %tmp4 = load <16 x i8>, ptr %arg1, align 1
  %tmp5 = zext <16 x i8> %tmp to <16 x i32>
  %tmp6 = zext <16 x i8> %tmp4 to <16 x i32>
  %tmp7 = sub nsw <16 x i32> %tmp5, %tmp6
  %tmp10 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %tmp7, i1 false)
  %tmp11 = load <16 x i8>, ptr %arg2, align 1
  %tmp12 = load <16 x i8>, ptr %arg3, align 1
  %tmp13 = zext <16 x i8> %tmp11 to <16 x i32>
  %tmp14 = zext <16 x i8> %tmp12 to <16 x i32>
  %tmp15 = sub nsw <16 x i32> %tmp13, %tmp14
  %tmp18 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %tmp15, i1 false)
  %tmp19 = add nuw nsw <16 x i32> %tmp18, %tmp10
  %tmp20 = shufflevector <16 x i32> %tmp19, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %tmp21 = add <16 x i32> %tmp19, %tmp20
  %tmp22 = shufflevector <16 x i32> %tmp21, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %tmp23 = add <16 x i32> %tmp21, %tmp22
  %tmp24 = shufflevector <16 x i32> %tmp23, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %tmp25 = add <16 x i32> %tmp23, %tmp24
  %tmp26 = shufflevector <16 x i32> %tmp25, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %tmp27 = add <16 x i32> %tmp25, %tmp26
  %tmp28 = extractelement <16 x i32> %tmp27, i64 0
  ret i32 %tmp28
}

; Function Attrs: nofree nosync nounwind readnone speculatable willreturn
declare <16 x i32> @llvm.abs.v16i32(<16 x i32>, i1 immarg) #0
attributes #0 = { nofree nosync nounwind readnone speculatable willreturn }