; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fast-hops | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BWVL

;
; vXi64
;

define i64 @test_v2i64_v2i32(<2 x i64> %a0) {
; SSE2-LABEL: test_v2i64_v2i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: paddq %xmm0, %xmm1
; SSE2-NEXT: movq %xmm1, %rax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2i64_v2i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE41-NEXT: paddq %xmm0, %xmm1
; SSE41-NEXT: movq %xmm1, %rax
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_v2i64_v2i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v2i64_v2i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test_v2i64_v2i32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BW-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, %rax
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: test_v2i64_v2i32:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BWVL-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT: vmovq %xmm0, %rax
; AVX512BWVL-NEXT: retq
  %1 = and <2 x i64> %a0, <i64 255, i64 255>
  %2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %1)
  ret i64 %2
}

define i64 @test_v4i64_v4i16(<4 x i64> %a0) {
; SSE2-LABEL: test_v4i64_v4i16:
; SSE2: # %bb.0:
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: paddq %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: paddq %xmm0, %xmm1
; SSE2-NEXT: movq %xmm1, %rax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4i64_v4i16:
; SSE41: # %bb.0:
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: paddq %xmm1, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE41-NEXT: paddq %xmm0, %xmm1
; SSE41-NEXT: movq %xmm1, %rax
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_v4i64_v4i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v4i64_v4i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test_v4i64_v4i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, %rax
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: test_v4i64_v4i16:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512BWVL-NEXT: vpmovqb %ymm0, %xmm0
; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT: vmovq %xmm0, %rax
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %1 = and <4 x i64> %a0, <i64 15, i64 31, i64 63, i64 127>
  %2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %1)
  ret i64 %2
}

define i64 @test_v8i64_v8i8(<8 x i64> %a0) {
; SSE2-LABEL: test_v8i64_v8i8:
; SSE2: # %bb.0:
; SSE2-NEXT: psrlq $60, %xmm2
; SSE2-NEXT: psrlq $60, %xmm0
; SSE2-NEXT: paddq %xmm2, %xmm0
; SSE2-NEXT: psrlq $60, %xmm3
; SSE2-NEXT: psrlq $60, %xmm1
; SSE2-NEXT: paddq %xmm3, %xmm1
; SSE2-NEXT: paddq %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: paddq %xmm1, %xmm0
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8i64_v8i8:
; SSE41: # %bb.0:
; SSE41-NEXT: psrlq $60, %xmm2
; SSE41-NEXT: psrlq $60, %xmm0
; SSE41-NEXT: paddq %xmm2, %xmm0
; SSE41-NEXT: psrlq $60, %xmm3
; SSE41-NEXT: psrlq $60, %xmm1
; SSE41-NEXT: paddq %xmm3, %xmm1
; SSE41-NEXT: paddq %xmm0, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE41-NEXT: paddq %xmm1, %xmm0
; SSE41-NEXT: movq %xmm0, %rax
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_v8i64_v8i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsrlq $60, %xmm1, %xmm2
; AVX1-NEXT: vpsrlq $60, %xmm0, %xmm3
; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpsrlq $60, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsrlq $60, %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v8i64_v8i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlq $60, %ymm1, %ymm1
; AVX2-NEXT: vpsrlq $60, %ymm0, %ymm0
; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v8i64_v8i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsrlq $60, %zmm0, %zmm0
; AVX512-NEXT: vpmovqb %zmm0, %xmm0
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = lshr <8 x i64> %a0, <i64 60, i64 60, i64 60, i64 60, i64 60, i64 60, i64 60, i64 60>
  %2 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %1)
  ret i64 %2
}

define i64 @test_v16i64_v16i8(<16 x i64> %a0) {
; SSE2-LABEL: test_v16i64_v16i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [1,1]
; SSE2-NEXT: pand %xmm8, %xmm5
; SSE2-NEXT: pand %xmm8, %xmm1
; SSE2-NEXT: paddq %xmm5, %xmm1
; SSE2-NEXT: pand %xmm8, %xmm7
; SSE2-NEXT: pand %xmm8, %xmm3
; SSE2-NEXT: paddq %xmm7, %xmm3
; SSE2-NEXT: paddq %xmm1, %xmm3
; SSE2-NEXT: pand %xmm8, %xmm4
; SSE2-NEXT: pand %xmm8, %xmm0
; SSE2-NEXT: paddq %xmm4, %xmm0
; SSE2-NEXT: pand %xmm8, %xmm6
; SSE2-NEXT: pand %xmm8, %xmm2
; SSE2-NEXT: paddq %xmm6, %xmm2
; SSE2-NEXT: paddq %xmm0, %xmm2
; SSE2-NEXT: paddq %xmm3, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE2-NEXT: paddq %xmm2, %xmm0
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v16i64_v16i8:
; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxbq {{.*#+}} xmm8 = [1,1]
; SSE41-NEXT: pand %xmm8, %xmm5
; SSE41-NEXT: pand %xmm8, %xmm1
; SSE41-NEXT: paddq %xmm5, %xmm1
; SSE41-NEXT: pand %xmm8, %xmm7
; SSE41-NEXT: pand %xmm8, %xmm3
; SSE41-NEXT: paddq %xmm7, %xmm3
; SSE41-NEXT: paddq %xmm1, %xmm3
; SSE41-NEXT: pand %xmm8, %xmm4
; SSE41-NEXT: pand %xmm8, %xmm0
; SSE41-NEXT: paddq %xmm4, %xmm0
; SSE41-NEXT: pand %xmm8, %xmm6
; SSE41-NEXT: pand %xmm8, %xmm2
; SSE41-NEXT: paddq %xmm6, %xmm2
; SSE41-NEXT: paddq %xmm0, %xmm2
; SSE41-NEXT: paddq %xmm3, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE41-NEXT: paddq %xmm2, %xmm0
; SSE41-NEXT: movq %xmm0, %rax
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_v16i64_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [1,1,1,1]
; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm4
; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm5
; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v16i64_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [1,1,1,1]
; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm2
; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test_v16i64_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovqb %zmm1, %xmm1
; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BW-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, %rax
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: test_v16i64_v16i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpmovqb %zmm1, %xmm1
; AVX512BWVL-NEXT: vpmovqb %zmm0, %xmm0
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BWVL-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT: vmovq %xmm0, %rax
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %1 = and <16 x i64> %a0, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
  %2 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %1)
  ret i64 %2
}

;
; vXi32
;

define i32 @test_v2i32_v2i16(<2 x i32> %a0) {
; SSE2-LABEL: test_v2i32_v2i16:
; SSE2: # %bb.0:
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2i32_v2i16:
; SSE41: # %bb.0:
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE41-NEXT: paddd %xmm0, %xmm1
; SSE41-NEXT: movd %xmm1, %eax
; SSE41-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v2i32_v2i16:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v2i32_v2i16:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vmovd %xmm0, %eax
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v2i32_v2i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v2i32_v2i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: retq
  %1 = and <2 x i32> %a0, <i32 255, i32 255>
  %2 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %1)
  ret i32 %2
}

define i32 @test_v4i32(<4 x i32> %a0) {
; SSE2-LABEL: test_v4i32:
; SSE2: # %bb.0:
; SSE2-NEXT: psrld $31, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4i32:
; SSE41: # %bb.0:
; SSE41-NEXT: psrld $31, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE41-NEXT: paddd %xmm0, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE41-NEXT: paddd %xmm1, %xmm0
; SSE41-NEXT: movd %xmm0, %eax
; SSE41-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v4i32:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vpsrld $31, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v4i32:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vpsrld $31, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vmovd %xmm0, %eax
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v4i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test_v4i32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsrld $31, %xmm0, %xmm0
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vmovd %xmm0, %eax
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: test_v4i32:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpsrld $31, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpmovdb %xmm0, %xmm0
; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT: vmovd %xmm0, %eax
; AVX512BWVL-NEXT: retq
  %1 = lshr <4 x i32> %a0, <i32 31, i32 31, i32 31, i32 31>
  %2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %1)
  ret i32 %2
}

define i32 @test_v8i32_v8i8(<8 x i32> %a0) {
; SSE2-LABEL: test_v8i32_v8i8:
; SSE2: # %bb.0:
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8i32_v8i8:
; SSE41: # %bb.0:
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE41-NEXT: paddd %xmm0, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE41-NEXT: paddd %xmm1, %xmm0
; SSE41-NEXT: movd %xmm0, %eax
; SSE41-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v8i32_v8i8:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT: vorps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v8i32_v8i8:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm1, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vmovd %xmm0, %eax
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v8i32_v8i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test_v8i32_v8i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vmovd %xmm0, %eax
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: test_v8i32_v8i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT: vmovd %xmm0, %eax
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %1 = and <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64>
  %2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %1)
  ret i32 %2
}

define i32 @test_v16i32_v16i8(<16 x i32> %a0) {
; SSE2-LABEL: test_v16i32_v16i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE2-NEXT: pand %xmm4, %xmm2
; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: pand %xmm4, %xmm3
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: paddd %xmm3, %xmm1
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v16i32_v16i8:
; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = [255,255,255,255]
; SSE41-NEXT: pand %xmm4, %xmm2
; SSE41-NEXT: pand %xmm4, %xmm0
; SSE41-NEXT: paddd %xmm2, %xmm0
; SSE41-NEXT: pand %xmm4, %xmm3
; SSE41-NEXT: pand %xmm4, %xmm1
; SSE41-NEXT: paddd %xmm3, %xmm1
; SSE41-NEXT: paddd %xmm0, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE41-NEXT: paddd %xmm1, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE41-NEXT: paddd %xmm0, %xmm1
; SSE41-NEXT: movd %xmm1, %eax
; SSE41-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v16i32_v16i8:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX1-SLOW-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-SLOW-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm3, %xmm2
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v16i32_v16i8:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX1-FAST-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-FAST-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm3, %xmm2
; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vmovd %xmm0, %eax
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v16i32_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v16i32_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = and <16 x i32> %a0, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
  %2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %1)
  ret i32 %2
}

define i32 @test_v32i32_v32i8(<32 x i32> %a0) {
; SSE2-LABEL: test_v32i32_v32i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE2-NEXT: pand %xmm8, %xmm5
; SSE2-NEXT: pand %xmm8, %xmm1
; SSE2-NEXT: paddd %xmm5, %xmm1
; SSE2-NEXT: pand %xmm8, %xmm7
; SSE2-NEXT: pand %xmm8, %xmm3
; SSE2-NEXT: paddd %xmm7, %xmm3
; SSE2-NEXT: paddd %xmm1, %xmm3
; SSE2-NEXT: pand %xmm8, %xmm4
; SSE2-NEXT: pand %xmm8, %xmm0
; SSE2-NEXT: paddd %xmm4, %xmm0
; SSE2-NEXT: pand %xmm8, %xmm6
; SSE2-NEXT: pand %xmm8, %xmm2
; SSE2-NEXT: paddd %xmm6, %xmm2
; SSE2-NEXT: paddd %xmm0, %xmm2
; SSE2-NEXT: paddd %xmm3, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v32i32_v32i8:
; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm8 = [255,255,255,255]
; SSE41-NEXT: pand %xmm8, %xmm5
; SSE41-NEXT: pand %xmm8, %xmm1
; SSE41-NEXT: paddd %xmm5, %xmm1
; SSE41-NEXT: pand %xmm8, %xmm7
; SSE41-NEXT: pand %xmm8, %xmm3
; SSE41-NEXT: paddd %xmm7, %xmm3
; SSE41-NEXT: paddd %xmm1, %xmm3
; SSE41-NEXT: pand %xmm8, %xmm4
; SSE41-NEXT: pand %xmm8, %xmm0
; SSE41-NEXT: paddd %xmm4, %xmm0
; SSE41-NEXT: pand %xmm8, %xmm6
; SSE41-NEXT: pand %xmm8, %xmm2
; SSE41-NEXT: paddd %xmm6, %xmm2
; SSE41-NEXT: paddd %xmm0, %xmm2
; SSE41-NEXT: paddd %xmm3, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE41-NEXT: paddd %xmm2, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE41-NEXT: paddd %xmm0, %xmm1
; SSE41-NEXT: movd %xmm1, %eax
; SSE41-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v32i32_v32i8:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vbroadcastss {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255]
; AVX1-SLOW-NEXT: vandps %ymm4, %ymm0, %ymm0
; AVX1-SLOW-NEXT: vandps %ymm4, %ymm2, %ymm2
; AVX1-SLOW-NEXT: vandps %ymm4, %ymm1, %ymm1
; AVX1-SLOW-NEXT: vandps %ymm4, %ymm3, %ymm3
; AVX1-SLOW-NEXT: vpaddd %xmm3, %xmm1, %xmm4
; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm0, %xmm5
; AVX1-SLOW-NEXT: vpaddd %xmm4, %xmm5, %xmm4
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-SLOW-NEXT: vpaddd %xmm3, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm4, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v32i32_v32i8:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vbroadcastss {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255]
; AVX1-FAST-NEXT: vandps %ymm4, %ymm0, %ymm0
; AVX1-FAST-NEXT: vandps %ymm4, %ymm2, %ymm2
; AVX1-FAST-NEXT: vandps %ymm4, %ymm1, %ymm1
; AVX1-FAST-NEXT: vandps %ymm4, %ymm3, %ymm3
; AVX1-FAST-NEXT: vpaddd %xmm3, %xmm1, %xmm4
; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm0, %xmm5
; AVX1-FAST-NEXT: vpaddd %xmm4, %xmm5, %xmm4
; AVX1-FAST-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-FAST-NEXT: vpaddd %xmm3, %xmm1, %xmm1
; AVX1-FAST-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpaddd %xmm0, %xmm4, %xmm0
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vmovd %xmm0, %eax
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v32i32_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm2
; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v32i32_v32i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vpmovdb %zmm1, %xmm1
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = and <32 x i32> %a0, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
  %2 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %1)
  ret i32 %2
}

;
; vXi16
;

define i16 @test_v2i16_v2i8(<2 x i16> %a0) {
; SSE2-LABEL: test_v2i16_v2i8:
; SSE2: # %bb.0:
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $16, %xmm1
; SSE2-NEXT: paddw %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2i16_v2i8:
; SSE41: # %bb.0:
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrld $16, %xmm1
; SSE41-NEXT: paddw %xmm0, %xmm1
; SSE41-NEXT: movd %xmm1, %eax
; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
; SSE41-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v2i16_v2i8:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v2i16_v2i8:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vmovd %xmm0, %eax
; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v2i16_v2i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v2i16_v2i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512-NEXT: retq
  %1 = and <2 x i16> %a0, <i16 255, i16 255>
  %2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %1)
  ret i16 %2
}

define i16 @test_v4i16_v4i8(<4 x i16> %a0) {
; SSE2-LABEL: test_v4i16_v4i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
; SSE2-NEXT: pandn %xmm0, %xmm1
; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE2-NEXT: paddw %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psrld $16, %xmm0
; SSE2-NEXT: paddw %xmm1, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4i16_v4i8:
; SSE41: # %bb.0:
; SSE41-NEXT: movq {{.*#+}} xmm1 = [0,32768,16384,8192,0,0,0,0]
; SSE41-NEXT: pmulhuw %xmm0, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE41-NEXT: paddw %xmm1, %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrld $16, %xmm1
; SSE41-NEXT: paddw %xmm0, %xmm1
; SSE41-NEXT: movd %xmm1, %eax
; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
; SSE41-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v4i16_v4i8:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [u,32768,16384,8192,u,u,u,u]
; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v4i16_v4i8:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [u,32768,16384,8192,u,u,u,u]
; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vmovd %xmm0, %eax
; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v4i16_v4i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [u,32768,16384,8192,u,u,u,u]
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test_v4i16_v4i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = [0,1,2,3,0,0,0,0]
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vmovd %xmm0, %eax
; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: test_v4i16_v4i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512BWVL-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512BWVL-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT: vmovd %xmm0, %eax
; AVX512BWVL-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512BWVL-NEXT: retq
  %1 = lshr <4 x i16> %a0, <i16 0, i16 1, i16 2, i16 3>
  %2 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %1)
  ret i16 %2
}

define i16 @test_v8i16_v8i8(<8 x i16> %a0) {
; SSE2-LABEL: test_v8i16_v8i8:
; SSE2: # %bb.0:
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: psadbw %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8i16_v8i8:
; SSE41: # %bb.0:
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: packuswb %xmm0, %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: psadbw %xmm0, %xmm1
; SSE41-NEXT: movd %xmm1, %eax
; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v8i16_v8i8:
; AVX: # %bb.0:
; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-NEXT: retq
  %1 = and <8 x i16> %a0, <i16 0, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64>
  %2 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %1)
  ret i16 %2
}

define i16 @test_v16i16_v16i8(<16 x i16> %a0) {
; SSE2-LABEL: test_v16i16_v16i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: psadbw %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: paddq %xmm1, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v16i16_v16i8:
; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT: pand %xmm2, %xmm1
; SSE41-NEXT: pand %xmm2, %xmm0
; SSE41-NEXT: packuswb %xmm1, %xmm0
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: psadbw %xmm0, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE41-NEXT: paddq %xmm1, %xmm0
; SSE41-NEXT: movd %xmm0, %eax
; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_v16i16_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v16i16_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test_v16i16_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BW-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vmovd %xmm0, %eax
; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: test_v16i16_v16i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
; AVX512BWVL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BWVL-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT: vmovd %xmm0, %eax
; AVX512BWVL-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %1 = and <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 0, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64>
  %2 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %1)
  ret i16 %2
}

define i16 @test_v32i16_v32i8(<32 x i16> %a0) {
; SSE2-LABEL: test_v32i16_v32i8:
; SSE2: # %bb.0:
; SSE2-NEXT: psrlw $8, %xmm1
; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSE2-NEXT: psrlw $8, %xmm3
; SSE2-NEXT: psrlw $8, %xmm2
; SSE2-NEXT: packuswb %xmm3, %xmm2
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: psadbw %xmm1, %xmm2
; SSE2-NEXT: psadbw %xmm1, %xmm0
; SSE2-NEXT: paddq %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: paddq %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v32i16_v32i8:
; SSE41: # %bb.0:
; SSE41-NEXT: psrlw $8, %xmm1
; SSE41-NEXT: psrlw $8, %xmm0
; SSE41-NEXT: packuswb %xmm1, %xmm0
; SSE41-NEXT: psrlw $8, %xmm3
; SSE41-NEXT: psrlw $8, %xmm2
; SSE41-NEXT: packuswb %xmm3, %xmm2
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: psadbw %xmm1, %xmm2
; SSE41-NEXT: psadbw %xmm1, %xmm0
; SSE41-NEXT: paddq %xmm2, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE41-NEXT: paddq %xmm0, %xmm1
; SSE41-NEXT: movd %xmm1, %eax
; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_v32i16_v32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpsadbw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v32i16_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v32i16_v32i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsrlw $8, %zmm0, %zmm0
; AVX512-NEXT: vpmovwb %zmm0, %ymm0
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = lshr <32 x i16> %a0, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %2 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %1)
  ret i16 %2
}

define i16 @test_v64i16_v64i8(<64 x i16> %a0) {
; SSE2-LABEL: test_v64i16_v64i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [127,127,127,127,127,127,127,127]
; SSE2-NEXT: pand %xmm8, %xmm1
; SSE2-NEXT: pand %xmm8, %xmm0
; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSE2-NEXT: pand %xmm8, %xmm5
; SSE2-NEXT: pand %xmm8, %xmm4
; SSE2-NEXT: packuswb %xmm5, %xmm4
; SSE2-NEXT: paddb %xmm0, %xmm4
; SSE2-NEXT: pand %xmm8, %xmm3
; SSE2-NEXT: pand %xmm8, %xmm2
; SSE2-NEXT: packuswb %xmm3, %xmm2
; SSE2-NEXT: pand %xmm8, %xmm7
; SSE2-NEXT: pand %xmm8, %xmm6
; SSE2-NEXT: packuswb %xmm7, %xmm6
; SSE2-NEXT: paddb %xmm2, %xmm6
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: psadbw %xmm0, %xmm6
; SSE2-NEXT: psadbw %xmm0, %xmm4
; SSE2-NEXT: paddq %xmm6, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
; SSE2-NEXT: paddq %xmm4, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v64i16_v64i8:
; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxbw {{.*#+}} xmm8 = [127,127,127,127,127,127,127,127]
; SSE41-NEXT: pand %xmm8, %xmm1
; SSE41-NEXT: pand %xmm8, %xmm0
; SSE41-NEXT: packuswb %xmm1, %xmm0
; SSE41-NEXT: pand %xmm8, %xmm5
; SSE41-NEXT: pand %xmm8, %xmm4
; SSE41-NEXT: packuswb %xmm5, %xmm4
; SSE41-NEXT: paddb %xmm0, %xmm4
; SSE41-NEXT: pand %xmm8, %xmm3
; SSE41-NEXT: pand %xmm8, %xmm2
; SSE41-NEXT: packuswb %xmm3, %xmm2
; SSE41-NEXT: pand %xmm8, %xmm7
; SSE41-NEXT: pand %xmm8, %xmm6
; SSE41-NEXT: packuswb %xmm7, %xmm6
; SSE41-NEXT: paddb %xmm2, %xmm6
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: psadbw %xmm0, %xmm6
; SSE41-NEXT: psadbw %xmm0, %xmm4
; SSE41-NEXT: paddq %xmm6, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
; SSE41-NEXT: paddq %xmm4, %xmm0
; SSE41-NEXT: movd %xmm0, %eax
; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_v64i16_v64i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vpackuswb %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpackuswb %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v64i16_v64i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm1
; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpackuswb %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v64i16_v64i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovwb %zmm0, %ymm0
; AVX512-NEXT: vpmovwb %zmm1, %ymm1
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = and <64 x i16> %a0, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
  %2 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> %1)
  ret i16 %2
}

declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)
declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>)

declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>)
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.add.v32i32(<32 x i32>)

declare i16 @llvm.vector.reduce.add.v2i16(<2 x i16>)
declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>)
declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.add.v32i16(<32 x i16>)
declare i16 @llvm.vector.reduce.add.v64i16(<64 x i16>)

declare i8 @llvm.vector.reduce.add.v2i8(<2 x i8>)
declare i8 @llvm.vector.reduce.add.v4i8(<4 x i8>)
declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>)
declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)
declare i8 @llvm.vector.reduce.add.v32i8(<32 x i8>)
declare i8 @llvm.vector.reduce.add.v64i8(<64 x i8>)
declare i8 @llvm.vector.reduce.add.v128i8(<128 x i8>)