; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fast-hops | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512
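
; These tests exercise llvm.vector.reduce.add on zero-extended narrow inputs.
; The CHECK lines are autogenerated; informally, i8 sources are expected to
; lower to the PSADBW-against-zero idiom, while wider sources are zero
; extended with pmovzx* and reduced with shuffle + add (or phadd* under
; +fast-hops).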

;
; vXi64
;

define i64 @test_v2i64_v2i32(<2 x i32> %a0) {
; SSE2-LABEL: test_v2i64_v2i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    xorps %xmm1, %xmm1
; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT:    psrlq $32, %xmm0
; SSE2-NEXT:    paddq %xmm1, %xmm0
; SSE2-NEXT:    movq %xmm0, %rax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v2i64_v2i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE41-NEXT:    paddq %xmm0, %xmm1
; SSE41-NEXT:    movq %xmm1, %rax
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v2i64_v2i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    retq
  %1 = zext <2 x i32> %a0 to <2 x i64>
  %2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %1)
  ret i64 %2
}

define i64 @test_v4i64_v4i16(<4 x i16> %a0) {
; SSE2-LABEL: test_v4i64_v4i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    paddq %xmm2, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT:    paddq %xmm0, %xmm1
; SSE2-NEXT:    movq %xmm1, %rax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4i64_v4i16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; SSE41-NEXT:    paddq %xmm1, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE41-NEXT:    paddq %xmm0, %xmm1
; SSE41-NEXT:    movq %xmm1, %rax
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v4i64_v4i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX1-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovq %xmm0, %rax
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v4i64_v4i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovq %xmm0, %rax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v4i64_v4i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovq %xmm0, %rax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = zext <4 x i16> %a0 to <4 x i64>
  %2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %1)
  ret i64 %2
}
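
; The i8 source cases below should fold the extend and the reduction into a
; PSADBW with a zero operand: the sum of absolute differences against zero is
; simply the horizontal sum of each 8-byte group into a 64-bit lane.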
define i64 @test_v8i64_v8i8(<8 x i8> %a0) {
; SSE-LABEL: test_v8i64_v8i8:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    psadbw %xmm0, %xmm1
; SSE-NEXT:    movq %xmm1, %rax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v8i64_v8i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    retq
  %1 = zext <8 x i8> %a0 to <8 x i64>
  %2 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %1)
  ret i64 %2
}

define i64 @test_v16i64_v16i8(<16 x i8> %a0) {
; SSE-LABEL: test_v16i64_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    psadbw %xmm0, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE-NEXT:    paddq %xmm1, %xmm0
; SSE-NEXT:    movq %xmm0, %rax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v16i64_v16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    retq
  %1 = zext <16 x i8> %a0 to <16 x i64>
  %2 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %1)
  ret i64 %2
}

;
; vXi32
;

define i32 @test_v2i32_v2i16(<2 x i16> %a0) {
; SSE2-LABEL: test_v2i32_v2i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE2-NEXT:    psrld $16, %xmm0
; SSE2-NEXT:    paddd %xmm2, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v2i32_v2i16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT:    psrld $16, %xmm0
; SSE41-NEXT:    paddd %xmm1, %xmm0
; SSE41-NEXT:    movd %xmm0, %eax
; SSE41-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v2i32_v2i16:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-SLOW-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v2i32_v2i16:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v2i32_v2i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX2-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v2i32_v2i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX512-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    retq
  %1 = zext <2 x i16> %a0 to <2 x i32>
  %2 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %1)
  ret i32 %2
}

define i32 @test_v4i32(<4 x i8> %a0) {
; SSE2-LABEL: test_v4i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    psadbw %xmm1, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pxor %xmm1, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; SSE41-NEXT:    psadbw %xmm1, %xmm0
; SSE41-NEXT:    movd %xmm0, %eax
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX1-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v4i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v4i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    retq
  %1 = zext <4 x i8> %a0 to <4 x i32>
  %2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %1)
  ret i32 %2
}

define i32 @test_v8i32_v8i8(<8 x i8> %a0) {
; SSE-LABEL: test_v8i32_v8i8:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    psadbw %xmm0, %xmm1
; SSE-NEXT:    movd %xmm1, %eax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v8i32_v8i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    retq
  %1 = zext <8 x i8> %a0 to <8 x i32>
  %2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %1)
  ret i32 %2
}

define i32 @test_v16i32_v16i8(<16 x i8> %a0) {
; SSE-LABEL: test_v16i32_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    psadbw %xmm0, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE-NEXT:    paddq %xmm1, %xmm0
; SSE-NEXT:    movd %xmm0, %eax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v16i32_v16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    retq
  %1 = zext <16 x i8> %a0 to <16 x i32>
  %2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %1)
  ret i32 %2
}

define i32 @test_v32i32_v32i8(<32 x i8> %a0) {
; SSE-LABEL: test_v32i32_v32i8:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm2, %xmm2
; SSE-NEXT:    psadbw %xmm2, %xmm1
; SSE-NEXT:    psadbw %xmm2, %xmm0
; SSE-NEXT:    paddq %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT:    paddq %xmm0, %xmm1
; SSE-NEXT:    movd %xmm1, %eax
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v32i32_v32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpsadbw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsadbw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v32i32_v32i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v32i32_v32i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = zext <32 x i8> %a0 to <32 x i32>
  %2 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %1)
  ret i32 %2
}

;
; vXi16
;

define i16 @test_v2i16_v2i8(<2 x i8> %a0) {
; SSE2-LABEL: test_v2i16_v2i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $16, %xmm1
; SSE2-NEXT:    paddw %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v2i16_v2i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrld $16, %xmm1
; SSE41-NEXT:    paddw %xmm0, %xmm1
; SSE41-NEXT:    movd %xmm1, %eax
; SSE41-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE41-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v2i16_v2i8:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-SLOW-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v2i16_v2i8:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
; AVX1-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v2i16_v2i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v2i16_v2i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512-NEXT:    retq
  %1 = zext <2 x i8> %a0 to <2 x i16>
  %2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %1)
  ret i16 %2
}

define i16 @test_v4i16_v4i8(<4 x i8> %a0) {
; SSE2-LABEL: test_v4i16_v4i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE2-NEXT:    paddw %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    psrld $16, %xmm0
; SSE2-NEXT:    paddw %xmm1, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4i16_v4i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE41-NEXT:    paddw %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    psrld $16, %xmm0
; SSE41-NEXT:    paddw %xmm1, %xmm0
; SSE41-NEXT:    movd %xmm0, %eax
; SSE41-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE41-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v4i16_v4i8:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v4i16_v4i8:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-FAST-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
; AVX1-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v4i16_v4i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v4i16_v4i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512-NEXT:    retq
  %1 = zext <4 x i8> %a0 to <4 x i16>
  %2 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %1)
  ret i16 %2
}

define i16 @test_v8i16_v8i8(<8 x i8> %a0) {
; SSE-LABEL: test_v8i16_v8i8:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    psadbw %xmm0, %xmm1
; SSE-NEXT:    movd %xmm1, %eax
; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v8i16_v8i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX-NEXT:    retq
  %1 = zext <8 x i8> %a0 to <8 x i16>
  %2 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %1)
  ret i16 %2
}

define i16 @test_v16i16_v16i8(<16 x i8> %a0) {
; SSE-LABEL: test_v16i16_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    psadbw %xmm0, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE-NEXT:    paddq %xmm1, %xmm0
; SSE-NEXT:    movd %xmm0, %eax
; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v16i16_v16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX-NEXT:    retq
  %1 = zext <16 x i8> %a0 to <16 x i16>
  %2 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %1)
  ret i16 %2
}

define i16 @test_v32i16_v32i8(<32 x i8> %a0) {
; SSE-LABEL: test_v32i16_v32i8:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm2, %xmm2
; SSE-NEXT:    psadbw %xmm2, %xmm1
; SSE-NEXT:    psadbw %xmm2, %xmm0
; SSE-NEXT:    paddq %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT:    paddq %xmm0, %xmm1
; SSE-NEXT:    movd %xmm1, %eax
; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v32i16_v32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpsadbw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsadbw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v32i16_v32i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v32i16_v32i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = zext <32 x i8> %a0 to <32 x i16>
  %2 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %1)
  ret i16 %2
}

define i16 @test_v64i16_v64i8(<64 x i8> %a0) {
; SSE-LABEL: test_v64i16_v64i8:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm4, %xmm4
; SSE-NEXT:    psadbw %xmm4, %xmm3
; SSE-NEXT:    psadbw %xmm4, %xmm1
; SSE-NEXT:    paddq %xmm3, %xmm1
; SSE-NEXT:    psadbw %xmm4, %xmm2
; SSE-NEXT:    psadbw %xmm4, %xmm0
; SSE-NEXT:    paddq %xmm2, %xmm0
; SSE-NEXT:    paddq %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT:    paddq %xmm0, %xmm1
; SSE-NEXT:    movd %xmm1, %eax
; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v64i16_v64i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpsadbw %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT:    vpsadbw %xmm3, %xmm4, %xmm4
; AVX1-NEXT:    vpaddq %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpsadbw %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpsadbw %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v64i16_v64i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpsadbw %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpsadbw %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v64i16_v64i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vpsadbw %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = zext <64 x i8> %a0 to <64 x i16>
  %2 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> %1)
  ret i16 %2
}

declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)
declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>)

declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>)
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.add.v32i32(<32 x i32>)

declare i16 @llvm.vector.reduce.add.v2i16(<2 x i16>)
declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>)
declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.add.v32i16(<32 x i16>)
declare i16 @llvm.vector.reduce.add.v64i16(<64 x i16>)

declare i8 @llvm.vector.reduce.add.v2i8(<2 x i8>)
declare i8 @llvm.vector.reduce.add.v4i8(<4 x i8>)
declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>)
declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)
declare i8 @llvm.vector.reduce.add.v32i8(<32 x i8>)
declare i8 @llvm.vector.reduce.add.v64i8(<64 x i8>)
declare i8 @llvm.vector.reduce.add.v128i8(<128 x i8>)