; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fast-hops | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512

;
; vXi64
;

define i64 @test_v2i64_v2i32(<2 x i32> %a0) {
; SSE2-LABEL: test_v2i64_v2i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: paddq %xmm0, %xmm1
; SSE2-NEXT: movq %xmm1, %rax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2i64_v2i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxdq %xmm0, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE41-NEXT: paddq %xmm0, %xmm1
; SSE41-NEXT: movq %xmm1, %rax
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v2i64_v2i32:
; AVX: # %bb.0:
; AVX-NEXT: vpmovsxdq %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovq %xmm0, %rax
; AVX-NEXT: retq
  %1 = sext <2 x i32> %a0 to <2 x i64>
  %2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %1)
  ret i64 %2
}

define i64 @test_v4i64_v4i16(<4 x i16> %a0) {
; SSE2-LABEL: test_v4i64_v4i16:
; SSE2: # %bb.0:
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT: psrad $16, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: paddq %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: paddq %xmm0, %xmm1
; SSE2-NEXT: movq %xmm1, %rax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4i64_v4i16:
; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxwq %xmm0, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE41-NEXT: pmovsxwq %xmm0, %xmm0
; SSE41-NEXT: paddq %xmm1, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE41-NEXT: paddq %xmm0, %xmm1
; SSE41-NEXT: movq %xmm1, %rax
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_v4i64_v4i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmovsxwq %xmm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX1-NEXT: vpmovsxwq %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v4i64_v4i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovsxwq %xmm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT:
vzeroupper 92; AVX2-NEXT: retq 93; 94; AVX512-LABEL: test_v4i64_v4i16: 95; AVX512: # %bb.0: 96; AVX512-NEXT: vpmovsxwq %xmm0, %ymm0 97; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 98; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 99; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 100; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 101; AVX512-NEXT: vmovq %xmm0, %rax 102; AVX512-NEXT: vzeroupper 103; AVX512-NEXT: retq 104 %1 = sext <4 x i16> %a0 to <4 x i64> 105 %2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %1) 106 ret i64 %2 107} 108 109define i64 @test_v8i64_v8i8(<8 x i8> %a0) { 110; SSE2-LABEL: test_v8i64_v8i8: 111; SSE2: # %bb.0: 112; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 113; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 114; SSE2-NEXT: psrad $24, %xmm1 115; SSE2-NEXT: pxor %xmm2, %xmm2 116; SSE2-NEXT: pxor %xmm3, %xmm3 117; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 118; SSE2-NEXT: movdqa %xmm1, %xmm4 119; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] 120; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 121; SSE2-NEXT: psrad $24, %xmm0 122; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 123; SSE2-NEXT: movdqa %xmm0, %xmm5 124; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] 125; SSE2-NEXT: paddq %xmm4, %xmm5 126; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] 127; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] 128; SSE2-NEXT: paddq %xmm1, %xmm0 129; SSE2-NEXT: paddq %xmm5, %xmm0 130; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 131; SSE2-NEXT: paddq %xmm0, %xmm1 132; SSE2-NEXT: movq %xmm1, %rax 133; SSE2-NEXT: retq 134; 135; SSE41-LABEL: test_v8i64_v8i8: 136; SSE41: # %bb.0: 137; SSE41-NEXT: pmovsxbq %xmm0, %xmm1 138; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] 139; SSE41-NEXT: pmovsxbq %xmm2, %xmm2 140; SSE41-NEXT: paddq %xmm1, %xmm2 141; SSE41-NEXT: movdqa %xmm0, %xmm1 142; SSE41-NEXT: psrlq $48, %xmm1 143; SSE41-NEXT: pmovsxbq %xmm1, %xmm1 144; SSE41-NEXT: psrld $16, %xmm0 145; SSE41-NEXT: pmovsxbq %xmm0, %xmm0 146; SSE41-NEXT: paddq %xmm1, %xmm0 147; SSE41-NEXT: paddq %xmm2, %xmm0 148; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 149; SSE41-NEXT: paddq %xmm0, %xmm1 150; SSE41-NEXT: movq %xmm1, %rax 151; SSE41-NEXT: retq 152; 153; AVX1-LABEL: test_v8i64_v8i8: 154; AVX1: # %bb.0: 155; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 156; AVX1-NEXT: vpmovsxbq %xmm1, %xmm1 157; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] 158; AVX1-NEXT: vpsrld $16, %xmm2, %xmm3 159; AVX1-NEXT: vpmovsxbq %xmm3, %xmm3 160; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1 161; AVX1-NEXT: vpmovsxbq %xmm0, %xmm0 162; AVX1-NEXT: vpmovsxbq %xmm2, %xmm2 163; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 164; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 165; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 166; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 167; AVX1-NEXT: vmovq %xmm0, %rax 168; AVX1-NEXT: retq 169; 170; AVX2-LABEL: test_v8i64_v8i8: 171; AVX2: # %bb.0: 172; AVX2-NEXT: vpmovsxbq %xmm0, %ymm1 173; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] 174; AVX2-NEXT: vpmovsxbq %xmm0, %ymm0 175; AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0 176; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 177; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 178; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 179; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 180; AVX2-NEXT: vmovq %xmm0, %rax 181; AVX2-NEXT: vzeroupper 182; AVX2-NEXT: retq 183; 184; AVX512-LABEL: test_v8i64_v8i8: 185; AVX512: # %bb.0: 186; 
AVX512-NEXT: vpmovsxbq %xmm0, %zmm0 187; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 188; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 189; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 190; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 191; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 192; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 193; AVX512-NEXT: vmovq %xmm0, %rax 194; AVX512-NEXT: vzeroupper 195; AVX512-NEXT: retq 196 %1 = sext <8 x i8> %a0 to <8 x i64> 197 %2 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %1) 198 ret i64 %2 199} 200 201define i64 @test_v16i64_v16i8(<16 x i8> %a0) { 202; SSE2-LABEL: test_v16i64_v16i8: 203; SSE2: # %bb.0: 204; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] 205; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] 206; SSE2-NEXT: psrad $24, %xmm2 207; SSE2-NEXT: pxor %xmm1, %xmm1 208; SSE2-NEXT: pxor %xmm3, %xmm3 209; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 210; SSE2-NEXT: movdqa %xmm2, %xmm5 211; SSE2-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm3[2],xmm5[3],xmm3[3] 212; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] 213; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] 214; SSE2-NEXT: psrad $24, %xmm0 215; SSE2-NEXT: pxor %xmm7, %xmm7 216; SSE2-NEXT: pcmpgtd %xmm0, %xmm7 217; SSE2-NEXT: movdqa %xmm0, %xmm8 218; SSE2-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] 219; SSE2-NEXT: paddq %xmm5, %xmm8 220; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] 221; SSE2-NEXT: psrad $24, %xmm4 222; SSE2-NEXT: pxor %xmm5, %xmm5 223; SSE2-NEXT: pcmpgtd %xmm4, %xmm5 224; SSE2-NEXT: movdqa %xmm4, %xmm9 225; SSE2-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm5[2],xmm9[3],xmm5[3] 226; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7] 227; SSE2-NEXT: psrad $24, %xmm6 228; SSE2-NEXT: pcmpgtd %xmm6, %xmm1 229; SSE2-NEXT: movdqa %xmm6, %xmm10 230; SSE2-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] 231; SSE2-NEXT: paddq %xmm9, %xmm10 232; SSE2-NEXT: paddq %xmm8, %xmm10 233; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] 234; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] 235; SSE2-NEXT: paddq %xmm2, %xmm0 236; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] 237; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] 238; SSE2-NEXT: paddq %xmm4, %xmm6 239; SSE2-NEXT: paddq %xmm0, %xmm6 240; SSE2-NEXT: paddq %xmm10, %xmm6 241; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] 242; SSE2-NEXT: paddq %xmm6, %xmm0 243; SSE2-NEXT: movq %xmm0, %rax 244; SSE2-NEXT: retq 245; 246; SSE41-LABEL: test_v16i64_v16i8: 247; SSE41: # %bb.0: 248; SSE41-NEXT: movdqa %xmm0, %xmm1 249; SSE41-NEXT: movdqa %xmm0, %xmm2 250; SSE41-NEXT: movdqa %xmm0, %xmm3 251; SSE41-NEXT: pmovsxbq %xmm0, %xmm4 252; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,2,3] 253; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[3,3,3,3] 254; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1] 255; SSE41-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 256; SSE41-NEXT: pmovsxbq %xmm0, %xmm0 257; SSE41-NEXT: psrld $16, %xmm1 258; SSE41-NEXT: pmovsxbq %xmm1, %xmm1 259; SSE41-NEXT: paddq %xmm0, %xmm1 260; SSE41-NEXT: psrldq {{.*#+}} 
xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 261; SSE41-NEXT: pmovsxbq %xmm2, %xmm0 262; SSE41-NEXT: psrlq $48, %xmm3 263; SSE41-NEXT: pmovsxbq %xmm3, %xmm2 264; SSE41-NEXT: paddq %xmm0, %xmm2 265; SSE41-NEXT: paddq %xmm1, %xmm2 266; SSE41-NEXT: pmovsxbq %xmm5, %xmm0 267; SSE41-NEXT: paddq %xmm4, %xmm0 268; SSE41-NEXT: pmovsxbq %xmm6, %xmm1 269; SSE41-NEXT: pmovsxbq %xmm7, %xmm3 270; SSE41-NEXT: paddq %xmm1, %xmm3 271; SSE41-NEXT: paddq %xmm0, %xmm3 272; SSE41-NEXT: paddq %xmm2, %xmm3 273; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] 274; SSE41-NEXT: paddq %xmm3, %xmm0 275; SSE41-NEXT: movq %xmm0, %rax 276; SSE41-NEXT: retq 277; 278; AVX1-LABEL: test_v16i64_v16i8: 279; AVX1: # %bb.0: 280; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 281; AVX1-NEXT: vpmovsxbq %xmm1, %xmm1 282; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] 283; AVX1-NEXT: vpmovsxbw %xmm2, %xmm3 284; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,1,1] 285; AVX1-NEXT: vpmovsxwq %xmm4, %xmm4 286; AVX1-NEXT: vpaddq %xmm4, %xmm1, %xmm1 287; AVX1-NEXT: vpmovsxbw %xmm0, %xmm4 288; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[3,3,3,3] 289; AVX1-NEXT: vpmovsxwq %xmm5, %xmm5 290; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[3,3,3,3] 291; AVX1-NEXT: vpmovsxwq %xmm6, %xmm6 292; AVX1-NEXT: vpaddq %xmm6, %xmm5, %xmm5 293; AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm1 294; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] 295; AVX1-NEXT: vpmovsxwq %xmm4, %xmm4 296; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] 297; AVX1-NEXT: vpmovsxwq %xmm3, %xmm3 298; AVX1-NEXT: vpaddq %xmm3, %xmm4, %xmm3 299; AVX1-NEXT: vpmovsxbq %xmm0, %xmm0 300; AVX1-NEXT: vpmovsxbq %xmm2, %xmm2 301; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 302; AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm0 303; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 304; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 305; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 306; AVX1-NEXT: vmovq %xmm0, %rax 307; AVX1-NEXT: retq 308; 309; AVX2-LABEL: test_v16i64_v16i8: 310; AVX2: # %bb.0: 311; AVX2-NEXT: vpmovsxbw %xmm0, %ymm1 312; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 313; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] 314; AVX2-NEXT: vpmovsxwq %xmm3, %ymm3 315; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] 316; AVX2-NEXT: vpmovsxwq %xmm1, %ymm1 317; AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1 318; AVX2-NEXT: vpmovsxbq %xmm0, %ymm0 319; AVX2-NEXT: vpmovsxwq %xmm2, %ymm2 320; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 321; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 322; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 323; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 324; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 325; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 326; AVX2-NEXT: vmovq %xmm0, %rax 327; AVX2-NEXT: vzeroupper 328; AVX2-NEXT: retq 329; 330; AVX512-LABEL: test_v16i64_v16i8: 331; AVX512: # %bb.0: 332; AVX512-NEXT: vpmovsxbw %xmm0, %ymm0 333; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 334; AVX512-NEXT: vpmovsxwq %xmm1, %zmm1 335; AVX512-NEXT: vpmovsxwq %xmm0, %zmm0 336; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 337; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 338; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 339; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 340; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 341; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 342; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 343; AVX512-NEXT: vmovq %xmm0, %rax 344; AVX512-NEXT: vzeroupper 345; AVX512-NEXT: retq 346 %1 = sext <16 x i8> %a0 to <16 x i64> 347 %2 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %1) 348 ret i64 %2 349} 350 351; 352; 
vXi32 353; 354 355define i32 @test_v2i32_v2i16(<2 x i16> %a0) { 356; SSE2-LABEL: test_v2i32_v2i16: 357; SSE2: # %bb.0: 358; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 359; SSE2-NEXT: psrad $16, %xmm0 360; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 361; SSE2-NEXT: paddd %xmm0, %xmm1 362; SSE2-NEXT: movd %xmm1, %eax 363; SSE2-NEXT: retq 364; 365; SSE41-LABEL: test_v2i32_v2i16: 366; SSE41: # %bb.0: 367; SSE41-NEXT: pmovsxwd %xmm0, %xmm0 368; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 369; SSE41-NEXT: paddd %xmm0, %xmm1 370; SSE41-NEXT: movd %xmm1, %eax 371; SSE41-NEXT: retq 372; 373; AVX1-SLOW-LABEL: test_v2i32_v2i16: 374; AVX1-SLOW: # %bb.0: 375; AVX1-SLOW-NEXT: vpmovsxwd %xmm0, %xmm0 376; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 377; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 378; AVX1-SLOW-NEXT: vmovd %xmm0, %eax 379; AVX1-SLOW-NEXT: retq 380; 381; AVX1-FAST-LABEL: test_v2i32_v2i16: 382; AVX1-FAST: # %bb.0: 383; AVX1-FAST-NEXT: vpmovsxwd %xmm0, %xmm0 384; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 385; AVX1-FAST-NEXT: vmovd %xmm0, %eax 386; AVX1-FAST-NEXT: retq 387; 388; AVX2-LABEL: test_v2i32_v2i16: 389; AVX2: # %bb.0: 390; AVX2-NEXT: vpmovsxwd %xmm0, %xmm0 391; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 392; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 393; AVX2-NEXT: vmovd %xmm0, %eax 394; AVX2-NEXT: retq 395; 396; AVX512-LABEL: test_v2i32_v2i16: 397; AVX512: # %bb.0: 398; AVX512-NEXT: vpmovsxwd %xmm0, %xmm0 399; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 400; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 401; AVX512-NEXT: vmovd %xmm0, %eax 402; AVX512-NEXT: retq 403 %1 = sext <2 x i16> %a0 to <2 x i32> 404 %2 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %1) 405 ret i32 %2 406} 407 408define i32 @test_v4i32(<4 x i8> %a0) { 409; SSE2-LABEL: test_v4i32: 410; SSE2: # %bb.0: 411; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 412; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 413; SSE2-NEXT: psrad $24, %xmm0 414; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 415; SSE2-NEXT: paddd %xmm0, %xmm1 416; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] 417; SSE2-NEXT: paddd %xmm1, %xmm0 418; SSE2-NEXT: movd %xmm0, %eax 419; SSE2-NEXT: retq 420; 421; SSE41-LABEL: test_v4i32: 422; SSE41: # %bb.0: 423; SSE41-NEXT: pmovsxbd %xmm0, %xmm0 424; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 425; SSE41-NEXT: paddd %xmm0, %xmm1 426; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] 427; SSE41-NEXT: paddd %xmm1, %xmm0 428; SSE41-NEXT: movd %xmm0, %eax 429; SSE41-NEXT: retq 430; 431; AVX1-SLOW-LABEL: test_v4i32: 432; AVX1-SLOW: # %bb.0: 433; AVX1-SLOW-NEXT: vpmovsxbd %xmm0, %xmm0 434; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 435; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 436; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 437; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 438; AVX1-SLOW-NEXT: vmovd %xmm0, %eax 439; AVX1-SLOW-NEXT: retq 440; 441; AVX1-FAST-LABEL: test_v4i32: 442; AVX1-FAST: # %bb.0: 443; AVX1-FAST-NEXT: vpmovsxbd %xmm0, %xmm0 444; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 445; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 446; AVX1-FAST-NEXT: vmovd %xmm0, %eax 447; AVX1-FAST-NEXT: retq 448; 449; AVX2-LABEL: test_v4i32: 450; AVX2: # %bb.0: 451; AVX2-NEXT: vpmovsxbd %xmm0, %xmm0 452; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 453; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 454; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 455; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 456; AVX2-NEXT: 
vmovd %xmm0, %eax 457; AVX2-NEXT: retq 458; 459; AVX512-LABEL: test_v4i32: 460; AVX512: # %bb.0: 461; AVX512-NEXT: vpmovsxbd %xmm0, %xmm0 462; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 463; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 464; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 465; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 466; AVX512-NEXT: vmovd %xmm0, %eax 467; AVX512-NEXT: retq 468 %1 = sext <4 x i8> %a0 to <4 x i32> 469 %2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %1) 470 ret i32 %2 471} 472 473define i32 @test_v8i32_v8i8(<8 x i8> %a0) { 474; SSE2-LABEL: test_v8i32_v8i8: 475; SSE2: # %bb.0: 476; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 477; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 478; SSE2-NEXT: psrad $24, %xmm1 479; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 480; SSE2-NEXT: psrad $24, %xmm0 481; SSE2-NEXT: paddd %xmm1, %xmm0 482; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 483; SSE2-NEXT: paddd %xmm0, %xmm1 484; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] 485; SSE2-NEXT: paddd %xmm1, %xmm0 486; SSE2-NEXT: movd %xmm0, %eax 487; SSE2-NEXT: retq 488; 489; SSE41-LABEL: test_v8i32_v8i8: 490; SSE41: # %bb.0: 491; SSE41-NEXT: pmovsxbd %xmm0, %xmm1 492; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] 493; SSE41-NEXT: pmovsxbd %xmm0, %xmm0 494; SSE41-NEXT: paddd %xmm1, %xmm0 495; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 496; SSE41-NEXT: paddd %xmm0, %xmm1 497; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] 498; SSE41-NEXT: paddd %xmm1, %xmm0 499; SSE41-NEXT: movd %xmm0, %eax 500; SSE41-NEXT: retq 501; 502; AVX1-SLOW-LABEL: test_v8i32_v8i8: 503; AVX1-SLOW: # %bb.0: 504; AVX1-SLOW-NEXT: vpmovsxbd %xmm0, %xmm1 505; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] 506; AVX1-SLOW-NEXT: vpmovsxbd %xmm0, %xmm0 507; AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0 508; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 509; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 510; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 511; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 512; AVX1-SLOW-NEXT: vmovd %xmm0, %eax 513; AVX1-SLOW-NEXT: retq 514; 515; AVX1-FAST-LABEL: test_v8i32_v8i8: 516; AVX1-FAST: # %bb.0: 517; AVX1-FAST-NEXT: vpmovsxbd %xmm0, %xmm1 518; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] 519; AVX1-FAST-NEXT: vpmovsxbd %xmm0, %xmm0 520; AVX1-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0 521; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 522; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 523; AVX1-FAST-NEXT: vmovd %xmm0, %eax 524; AVX1-FAST-NEXT: retq 525; 526; AVX2-LABEL: test_v8i32_v8i8: 527; AVX2: # %bb.0: 528; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0 529; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 530; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 531; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 532; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 533; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 534; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 535; AVX2-NEXT: vmovd %xmm0, %eax 536; AVX2-NEXT: vzeroupper 537; AVX2-NEXT: retq 538; 539; AVX512-LABEL: test_v8i32_v8i8: 540; AVX512: # %bb.0: 541; AVX512-NEXT: vpmovsxbd %xmm0, %ymm0 542; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 543; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 544; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 545; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 546; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 547; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 548; AVX512-NEXT: vmovd %xmm0, %eax 549; 
AVX512-NEXT: vzeroupper 550; AVX512-NEXT: retq 551 %1 = sext <8 x i8> %a0 to <8 x i32> 552 %2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %1) 553 ret i32 %2 554} 555 556define i32 @test_v16i32_v16i8(<16 x i8> %a0) { 557; SSE2-LABEL: test_v16i32_v16i8: 558; SSE2: # %bb.0: 559; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] 560; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 561; SSE2-NEXT: psrad $24, %xmm2 562; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 563; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] 564; SSE2-NEXT: psrad $24, %xmm3 565; SSE2-NEXT: paddd %xmm2, %xmm3 566; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] 567; SSE2-NEXT: psrad $24, %xmm1 568; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] 569; SSE2-NEXT: psrad $24, %xmm0 570; SSE2-NEXT: paddd %xmm1, %xmm0 571; SSE2-NEXT: paddd %xmm3, %xmm0 572; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 573; SSE2-NEXT: paddd %xmm0, %xmm1 574; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] 575; SSE2-NEXT: paddd %xmm1, %xmm0 576; SSE2-NEXT: movd %xmm0, %eax 577; SSE2-NEXT: retq 578; 579; SSE41-LABEL: test_v16i32_v16i8: 580; SSE41: # %bb.0: 581; SSE41-NEXT: pmovsxbd %xmm0, %xmm1 582; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] 583; SSE41-NEXT: pmovsxbd %xmm2, %xmm2 584; SSE41-NEXT: paddd %xmm1, %xmm2 585; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] 586; SSE41-NEXT: pmovsxbd %xmm1, %xmm1 587; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] 588; SSE41-NEXT: pmovsxbd %xmm0, %xmm0 589; SSE41-NEXT: paddd %xmm1, %xmm0 590; SSE41-NEXT: paddd %xmm2, %xmm0 591; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 592; SSE41-NEXT: paddd %xmm0, %xmm1 593; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] 594; SSE41-NEXT: paddd %xmm1, %xmm0 595; SSE41-NEXT: movd %xmm0, %eax 596; SSE41-NEXT: retq 597; 598; AVX1-SLOW-LABEL: test_v16i32_v16i8: 599; AVX1-SLOW: # %bb.0: 600; AVX1-SLOW-NEXT: vpmovsxbd %xmm0, %xmm1 601; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] 602; AVX1-SLOW-NEXT: vpmovsxbd %xmm2, %xmm2 603; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 604; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] 605; AVX1-SLOW-NEXT: vpmovsxbd %xmm2, %xmm2 606; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] 607; AVX1-SLOW-NEXT: vpmovsxbd %xmm0, %xmm0 608; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm0, %xmm0 609; AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0 610; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 611; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 612; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 613; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 614; AVX1-SLOW-NEXT: vmovd %xmm0, %eax 615; AVX1-SLOW-NEXT: retq 616; 617; AVX1-FAST-LABEL: test_v16i32_v16i8: 618; AVX1-FAST: # %bb.0: 619; AVX1-FAST-NEXT: vpmovsxbd %xmm0, %xmm1 620; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] 621; AVX1-FAST-NEXT: vpmovsxbd %xmm2, %xmm2 622; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm1, %xmm1 623; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] 624; AVX1-FAST-NEXT: vpmovsxbd %xmm2, %xmm2 625; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] 626; AVX1-FAST-NEXT: vpmovsxbd %xmm0, %xmm0 627; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm0, %xmm0 628; AVX1-FAST-NEXT: vpaddd %xmm0, %xmm1, %xmm0 629; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 630; 
AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 631; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 632; AVX1-FAST-NEXT: vmovd %xmm0, %eax 633; AVX1-FAST-NEXT: retq 634; 635; AVX2-LABEL: test_v16i32_v16i8: 636; AVX2: # %bb.0: 637; AVX2-NEXT: vpmovsxbd %xmm0, %ymm1 638; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] 639; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0 640; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 641; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 642; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 643; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 644; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 645; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 646; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 647; AVX2-NEXT: vmovd %xmm0, %eax 648; AVX2-NEXT: vzeroupper 649; AVX2-NEXT: retq 650; 651; AVX512-LABEL: test_v16i32_v16i8: 652; AVX512: # %bb.0: 653; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 654; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 655; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 656; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 657; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 658; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 659; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 660; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 661; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 662; AVX512-NEXT: vmovd %xmm0, %eax 663; AVX512-NEXT: vzeroupper 664; AVX512-NEXT: retq 665 %1 = sext <16 x i8> %a0 to <16 x i32> 666 %2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %1) 667 ret i32 %2 668} 669 670define i32 @test_v32i32_v32i8(<32 x i8> %a0) { 671; SSE2-LABEL: test_v32i32_v32i8: 672; SSE2: # %bb.0: 673; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 674; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] 675; SSE2-NEXT: psrad $24, %xmm3 676; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] 677; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] 678; SSE2-NEXT: psrad $24, %xmm5 679; SSE2-NEXT: paddd %xmm3, %xmm5 680; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 681; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] 682; SSE2-NEXT: psrad $24, %xmm3 683; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 684; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] 685; SSE2-NEXT: psrad $24, %xmm6 686; SSE2-NEXT: paddd %xmm3, %xmm6 687; SSE2-NEXT: paddd %xmm5, %xmm6 688; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] 689; SSE2-NEXT: psrad $24, %xmm2 690; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] 691; SSE2-NEXT: psrad $24, %xmm3 692; SSE2-NEXT: paddd %xmm2, %xmm3 693; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] 694; SSE2-NEXT: psrad $24, %xmm1 695; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 696; SSE2-NEXT: psrad $24, %xmm0 697; SSE2-NEXT: paddd %xmm1, %xmm0 698; SSE2-NEXT: paddd %xmm3, %xmm0 699; SSE2-NEXT: paddd %xmm6, %xmm0 700; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 701; SSE2-NEXT: paddd %xmm0, %xmm1 702; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] 703; SSE2-NEXT: paddd %xmm1, %xmm0 704; SSE2-NEXT: movd %xmm0, %eax 705; 
SSE2-NEXT: retq 706; 707; SSE41-LABEL: test_v32i32_v32i8: 708; SSE41: # %bb.0: 709; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] 710; SSE41-NEXT: pmovsxbd %xmm2, %xmm2 711; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] 712; SSE41-NEXT: pmovsxbd %xmm3, %xmm3 713; SSE41-NEXT: paddd %xmm2, %xmm3 714; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] 715; SSE41-NEXT: pmovsxbd %xmm2, %xmm2 716; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] 717; SSE41-NEXT: pmovsxbd %xmm4, %xmm4 718; SSE41-NEXT: paddd %xmm2, %xmm4 719; SSE41-NEXT: paddd %xmm3, %xmm4 720; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] 721; SSE41-NEXT: pmovsxbd %xmm2, %xmm2 722; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] 723; SSE41-NEXT: pmovsxbd %xmm3, %xmm3 724; SSE41-NEXT: paddd %xmm2, %xmm3 725; SSE41-NEXT: pmovsxbd %xmm1, %xmm1 726; SSE41-NEXT: pmovsxbd %xmm0, %xmm0 727; SSE41-NEXT: paddd %xmm1, %xmm0 728; SSE41-NEXT: paddd %xmm3, %xmm0 729; SSE41-NEXT: paddd %xmm4, %xmm0 730; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 731; SSE41-NEXT: paddd %xmm0, %xmm1 732; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] 733; SSE41-NEXT: paddd %xmm1, %xmm0 734; SSE41-NEXT: movd %xmm0, %eax 735; SSE41-NEXT: retq 736; 737; AVX1-SLOW-LABEL: test_v32i32_v32i8: 738; AVX1-SLOW: # %bb.0: 739; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 740; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] 741; AVX1-SLOW-NEXT: vpmovsxbd %xmm2, %xmm2 742; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] 743; AVX1-SLOW-NEXT: vpmovsxbd %xmm3, %xmm3 744; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm3, %xmm2 745; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[3,3,3,3] 746; AVX1-SLOW-NEXT: vpmovsxbd %xmm3, %xmm3 747; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] 748; AVX1-SLOW-NEXT: vpmovsxbd %xmm4, %xmm4 749; AVX1-SLOW-NEXT: vpaddd %xmm3, %xmm4, %xmm3 750; AVX1-SLOW-NEXT: vpaddd %xmm3, %xmm2, %xmm2 751; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] 752; AVX1-SLOW-NEXT: vpmovsxbd %xmm3, %xmm3 753; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] 754; AVX1-SLOW-NEXT: vpmovsxbd %xmm4, %xmm4 755; AVX1-SLOW-NEXT: vpaddd %xmm3, %xmm4, %xmm3 756; AVX1-SLOW-NEXT: vpmovsxbd %xmm1, %xmm1 757; AVX1-SLOW-NEXT: vpmovsxbd %xmm0, %xmm0 758; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 759; AVX1-SLOW-NEXT: vpaddd %xmm3, %xmm0, %xmm0 760; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm0, %xmm0 761; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 762; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 763; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 764; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 765; AVX1-SLOW-NEXT: vmovd %xmm0, %eax 766; AVX1-SLOW-NEXT: vzeroupper 767; AVX1-SLOW-NEXT: retq 768; 769; AVX1-FAST-LABEL: test_v32i32_v32i8: 770; AVX1-FAST: # %bb.0: 771; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 772; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] 773; AVX1-FAST-NEXT: vpmovsxbd %xmm2, %xmm2 774; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] 775; AVX1-FAST-NEXT: vpmovsxbd %xmm3, %xmm3 776; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm3, %xmm2 777; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[3,3,3,3] 778; AVX1-FAST-NEXT: vpmovsxbd %xmm3, %xmm3 779; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] 780; AVX1-FAST-NEXT: vpmovsxbd %xmm4, %xmm4 781; AVX1-FAST-NEXT: vpaddd %xmm3, %xmm4, %xmm3 782; AVX1-FAST-NEXT: vpaddd %xmm3, %xmm2, %xmm2 783; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] 784; AVX1-FAST-NEXT: vpmovsxbd %xmm3, %xmm3 785; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] 786; AVX1-FAST-NEXT: vpmovsxbd 
%xmm4, %xmm4 787; AVX1-FAST-NEXT: vpaddd %xmm3, %xmm4, %xmm3 788; AVX1-FAST-NEXT: vpmovsxbd %xmm1, %xmm1 789; AVX1-FAST-NEXT: vpmovsxbd %xmm0, %xmm0 790; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 791; AVX1-FAST-NEXT: vpaddd %xmm3, %xmm0, %xmm0 792; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm0, %xmm0 793; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 794; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 795; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 796; AVX1-FAST-NEXT: vmovd %xmm0, %eax 797; AVX1-FAST-NEXT: vzeroupper 798; AVX1-FAST-NEXT: retq 799; 800; AVX2-LABEL: test_v32i32_v32i8: 801; AVX2: # %bb.0: 802; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 803; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] 804; AVX2-NEXT: vpmovsxbd %xmm2, %ymm2 805; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] 806; AVX2-NEXT: vpmovsxbd %xmm3, %ymm3 807; AVX2-NEXT: vpaddd %ymm2, %ymm3, %ymm2 808; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1 809; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0 810; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 811; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 812; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 813; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 814; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 815; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 816; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 817; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 818; AVX2-NEXT: vmovd %xmm0, %eax 819; AVX2-NEXT: vzeroupper 820; AVX2-NEXT: retq 821; 822; AVX512-LABEL: test_v32i32_v32i8: 823; AVX512: # %bb.0: 824; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 825; AVX512-NEXT: vpmovsxbd %xmm1, %zmm1 826; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 827; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 828; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 829; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 830; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 831; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 832; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 833; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 834; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 835; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 836; AVX512-NEXT: vmovd %xmm0, %eax 837; AVX512-NEXT: vzeroupper 838; AVX512-NEXT: retq 839 %1 = sext <32 x i8> %a0 to <32 x i32> 840 %2 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %1) 841 ret i32 %2 842} 843 844; 845; vXi16 846; 847 848define i16 @test_v2i16_v2i8(<2 x i8> %a0) { 849; SSE2-LABEL: test_v2i16_v2i8: 850; SSE2: # %bb.0: 851; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 852; SSE2-NEXT: psraw $8, %xmm0 853; SSE2-NEXT: movdqa %xmm0, %xmm1 854; SSE2-NEXT: psrld $16, %xmm1 855; SSE2-NEXT: paddw %xmm0, %xmm1 856; SSE2-NEXT: movd %xmm1, %eax 857; SSE2-NEXT: # kill: def $ax killed $ax killed $eax 858; SSE2-NEXT: retq 859; 860; SSE41-LABEL: test_v2i16_v2i8: 861; SSE41: # %bb.0: 862; SSE41-NEXT: pmovsxbw %xmm0, %xmm0 863; SSE41-NEXT: movdqa %xmm0, %xmm1 864; SSE41-NEXT: psrld $16, %xmm1 865; SSE41-NEXT: paddw %xmm0, %xmm1 866; SSE41-NEXT: movd %xmm1, %eax 867; SSE41-NEXT: # kill: def $ax killed $ax killed $eax 868; SSE41-NEXT: retq 869; 870; AVX1-SLOW-LABEL: test_v2i16_v2i8: 871; AVX1-SLOW: # %bb.0: 872; AVX1-SLOW-NEXT: vpmovsxbw %xmm0, %xmm0 873; AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1 874; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 875; AVX1-SLOW-NEXT: vmovd %xmm0, %eax 876; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax 877; AVX1-SLOW-NEXT: retq 878; 879; AVX1-FAST-LABEL: test_v2i16_v2i8: 880; AVX1-FAST: # %bb.0: 881; AVX1-FAST-NEXT: vpmovsxbw %xmm0, %xmm0 882; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 883; AVX1-FAST-NEXT: vmovd 
%xmm0, %eax 884; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax 885; AVX1-FAST-NEXT: retq 886; 887; AVX2-LABEL: test_v2i16_v2i8: 888; AVX2: # %bb.0: 889; AVX2-NEXT: vpmovsxbw %xmm0, %xmm0 890; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 891; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 892; AVX2-NEXT: vmovd %xmm0, %eax 893; AVX2-NEXT: # kill: def $ax killed $ax killed $eax 894; AVX2-NEXT: retq 895; 896; AVX512-LABEL: test_v2i16_v2i8: 897; AVX512: # %bb.0: 898; AVX512-NEXT: vpmovsxbw %xmm0, %xmm0 899; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 900; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 901; AVX512-NEXT: vmovd %xmm0, %eax 902; AVX512-NEXT: # kill: def $ax killed $ax killed $eax 903; AVX512-NEXT: retq 904 %1 = sext <2 x i8> %a0 to <2 x i16> 905 %2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %1) 906 ret i16 %2 907} 908 909define i16 @test_v4i16_v4i8(<4 x i8> %a0) { 910; SSE2-LABEL: test_v4i16_v4i8: 911; SSE2: # %bb.0: 912; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 913; SSE2-NEXT: psraw $8, %xmm0 914; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 915; SSE2-NEXT: paddw %xmm0, %xmm1 916; SSE2-NEXT: movdqa %xmm1, %xmm0 917; SSE2-NEXT: psrld $16, %xmm0 918; SSE2-NEXT: paddw %xmm1, %xmm0 919; SSE2-NEXT: movd %xmm0, %eax 920; SSE2-NEXT: # kill: def $ax killed $ax killed $eax 921; SSE2-NEXT: retq 922; 923; SSE41-LABEL: test_v4i16_v4i8: 924; SSE41: # %bb.0: 925; SSE41-NEXT: pmovsxbw %xmm0, %xmm0 926; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 927; SSE41-NEXT: paddw %xmm0, %xmm1 928; SSE41-NEXT: movdqa %xmm1, %xmm0 929; SSE41-NEXT: psrld $16, %xmm0 930; SSE41-NEXT: paddw %xmm1, %xmm0 931; SSE41-NEXT: movd %xmm0, %eax 932; SSE41-NEXT: # kill: def $ax killed $ax killed $eax 933; SSE41-NEXT: retq 934; 935; AVX1-SLOW-LABEL: test_v4i16_v4i8: 936; AVX1-SLOW: # %bb.0: 937; AVX1-SLOW-NEXT: vpmovsxbw %xmm0, %xmm0 938; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 939; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 940; AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1 941; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 942; AVX1-SLOW-NEXT: vmovd %xmm0, %eax 943; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax 944; AVX1-SLOW-NEXT: retq 945; 946; AVX1-FAST-LABEL: test_v4i16_v4i8: 947; AVX1-FAST: # %bb.0: 948; AVX1-FAST-NEXT: vpmovsxbw %xmm0, %xmm0 949; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 950; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 951; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 952; AVX1-FAST-NEXT: vmovd %xmm0, %eax 953; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax 954; AVX1-FAST-NEXT: retq 955; 956; AVX2-LABEL: test_v4i16_v4i8: 957; AVX2: # %bb.0: 958; AVX2-NEXT: vpmovsxbw %xmm0, %xmm0 959; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 960; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 961; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 962; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 963; AVX2-NEXT: vmovd %xmm0, %eax 964; AVX2-NEXT: # kill: def $ax killed $ax killed $eax 965; AVX2-NEXT: retq 966; 967; AVX512-LABEL: test_v4i16_v4i8: 968; AVX512: # %bb.0: 969; AVX512-NEXT: vpmovsxbw %xmm0, %xmm0 970; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 971; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 972; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 973; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 974; AVX512-NEXT: vmovd %xmm0, %eax 975; AVX512-NEXT: # kill: def $ax killed $ax killed $eax 976; AVX512-NEXT: retq 977 %1 = sext <4 x i8> %a0 to <4 x i16> 978 %2 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %1) 979 ret i16 %2 980 981} 982 983define i16 @test_v8i16_v8i8(<8 x i8> %a0) { 
984; SSE2-LABEL: test_v8i16_v8i8: 985; SSE2: # %bb.0: 986; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 987; SSE2-NEXT: psraw $8, %xmm0 988; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 989; SSE2-NEXT: paddw %xmm0, %xmm1 990; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] 991; SSE2-NEXT: paddw %xmm1, %xmm0 992; SSE2-NEXT: movdqa %xmm0, %xmm1 993; SSE2-NEXT: psrld $16, %xmm1 994; SSE2-NEXT: paddw %xmm0, %xmm1 995; SSE2-NEXT: movd %xmm1, %eax 996; SSE2-NEXT: # kill: def $ax killed $ax killed $eax 997; SSE2-NEXT: retq 998; 999; SSE41-LABEL: test_v8i16_v8i8: 1000; SSE41: # %bb.0: 1001; SSE41-NEXT: pmovsxbw %xmm0, %xmm0 1002; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 1003; SSE41-NEXT: paddw %xmm0, %xmm1 1004; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] 1005; SSE41-NEXT: paddw %xmm1, %xmm0 1006; SSE41-NEXT: movdqa %xmm0, %xmm1 1007; SSE41-NEXT: psrld $16, %xmm1 1008; SSE41-NEXT: paddw %xmm0, %xmm1 1009; SSE41-NEXT: movd %xmm1, %eax 1010; SSE41-NEXT: # kill: def $ax killed $ax killed $eax 1011; SSE41-NEXT: retq 1012; 1013; AVX1-SLOW-LABEL: test_v8i16_v8i8: 1014; AVX1-SLOW: # %bb.0: 1015; AVX1-SLOW-NEXT: vpmovsxbw %xmm0, %xmm0 1016; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 1017; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 1018; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 1019; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 1020; AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1 1021; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 1022; AVX1-SLOW-NEXT: vmovd %xmm0, %eax 1023; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax 1024; AVX1-SLOW-NEXT: retq 1025; 1026; AVX1-FAST-LABEL: test_v8i16_v8i8: 1027; AVX1-FAST: # %bb.0: 1028; AVX1-FAST-NEXT: vpmovsxbw %xmm0, %xmm0 1029; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 1030; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 1031; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 1032; AVX1-FAST-NEXT: vmovd %xmm0, %eax 1033; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax 1034; AVX1-FAST-NEXT: retq 1035; 1036; AVX2-LABEL: test_v8i16_v8i8: 1037; AVX2: # %bb.0: 1038; AVX2-NEXT: vpmovsxbw %xmm0, %xmm0 1039; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 1040; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 1041; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 1042; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 1043; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 1044; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 1045; AVX2-NEXT: vmovd %xmm0, %eax 1046; AVX2-NEXT: # kill: def $ax killed $ax killed $eax 1047; AVX2-NEXT: retq 1048; 1049; AVX512-LABEL: test_v8i16_v8i8: 1050; AVX512: # %bb.0: 1051; AVX512-NEXT: vpmovsxbw %xmm0, %xmm0 1052; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 1053; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 1054; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 1055; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 1056; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 1057; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 1058; AVX512-NEXT: vmovd %xmm0, %eax 1059; AVX512-NEXT: # kill: def $ax killed $ax killed $eax 1060; AVX512-NEXT: retq 1061 %1 = sext <8 x i8> %a0 to <8 x i16> 1062 %2 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %1) 1063 ret i16 %2 1064} 1065 1066define i16 @test_v16i16_v16i8(<16 x i8> %a0) { 1067; SSE2-LABEL: test_v16i16_v16i8: 1068; SSE2: # %bb.0: 1069; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] 1070; SSE2-NEXT: psraw $8, %xmm1 1071; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1072; SSE2-NEXT: psraw $8, %xmm0 1073; SSE2-NEXT: paddw %xmm1, %xmm0 1074; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 1075; SSE2-NEXT: paddw %xmm0, %xmm1 1076; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] 1077; SSE2-NEXT: paddw %xmm1, %xmm0 1078; SSE2-NEXT: movdqa %xmm0, %xmm1 1079; SSE2-NEXT: psrld $16, %xmm1 1080; SSE2-NEXT: paddw %xmm0, %xmm1 1081; SSE2-NEXT: movd %xmm1, %eax 1082; SSE2-NEXT: # kill: def $ax killed $ax killed $eax 1083; SSE2-NEXT: retq 1084; 1085; SSE41-LABEL: test_v16i16_v16i8: 1086; SSE41: # %bb.0: 1087; SSE41-NEXT: pmovsxbw %xmm0, %xmm1 1088; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] 1089; SSE41-NEXT: pmovsxbw %xmm0, %xmm0 1090; SSE41-NEXT: paddw %xmm1, %xmm0 1091; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 1092; SSE41-NEXT: paddw %xmm0, %xmm1 1093; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] 1094; SSE41-NEXT: paddw %xmm1, %xmm0 1095; SSE41-NEXT: movdqa %xmm0, %xmm1 1096; SSE41-NEXT: psrld $16, %xmm1 1097; SSE41-NEXT: paddw %xmm0, %xmm1 1098; SSE41-NEXT: movd %xmm1, %eax 1099; SSE41-NEXT: # kill: def $ax killed $ax killed $eax 1100; SSE41-NEXT: retq 1101; 1102; AVX1-SLOW-LABEL: test_v16i16_v16i8: 1103; AVX1-SLOW: # %bb.0: 1104; AVX1-SLOW-NEXT: vpmovsxbw %xmm0, %xmm1 1105; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] 1106; AVX1-SLOW-NEXT: vpmovsxbw %xmm0, %xmm0 1107; AVX1-SLOW-NEXT: vpaddw %xmm0, %xmm1, %xmm0 1108; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 1109; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 1110; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 1111; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 1112; AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1 1113; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 1114; AVX1-SLOW-NEXT: vmovd %xmm0, %eax 1115; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax 1116; AVX1-SLOW-NEXT: retq 1117; 1118; AVX1-FAST-LABEL: test_v16i16_v16i8: 1119; AVX1-FAST: # %bb.0: 1120; AVX1-FAST-NEXT: vpmovsxbw %xmm0, %xmm1 1121; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] 1122; AVX1-FAST-NEXT: vpmovsxbw %xmm0, %xmm0 1123; AVX1-FAST-NEXT: vphaddw %xmm1, %xmm0, %xmm0 1124; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 1125; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 1126; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 1127; AVX1-FAST-NEXT: vmovd %xmm0, %eax 1128; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax 1129; AVX1-FAST-NEXT: retq 1130; 1131; AVX2-LABEL: test_v16i16_v16i8: 1132; AVX2: # %bb.0: 1133; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 1134; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1135; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 1136; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 1137; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 1138; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 1139; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 1140; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 1141; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 1142; AVX2-NEXT: vmovd %xmm0, %eax 1143; AVX2-NEXT: # kill: def $ax killed $ax killed $eax 1144; AVX2-NEXT: vzeroupper 1145; AVX2-NEXT: retq 1146; 1147; AVX512-LABEL: test_v16i16_v16i8: 1148; AVX512: # %bb.0: 1149; AVX512-NEXT: vpmovsxbw %xmm0, %ymm0 1150; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 1151; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 1152; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 1153; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 1154; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 1155; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 1156; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 1157; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 1158; 
AVX512-NEXT: vmovd %xmm0, %eax 1159; AVX512-NEXT: # kill: def $ax killed $ax killed $eax 1160; AVX512-NEXT: vzeroupper 1161; AVX512-NEXT: retq 1162 %1 = sext <16 x i8> %a0 to <16 x i16> 1163 %2 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %1) 1164 ret i16 %2 1165} 1166 1167define i16 @test_v32i16_v32i8(<32 x i8> %a0) { 1168; SSE2-LABEL: test_v32i16_v32i8: 1169; SSE2: # %bb.0: 1170; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 1171; SSE2-NEXT: psraw $8, %xmm2 1172; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] 1173; SSE2-NEXT: psraw $8, %xmm3 1174; SSE2-NEXT: paddw %xmm2, %xmm3 1175; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1176; SSE2-NEXT: psraw $8, %xmm1 1177; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1178; SSE2-NEXT: psraw $8, %xmm0 1179; SSE2-NEXT: paddw %xmm1, %xmm0 1180; SSE2-NEXT: paddw %xmm3, %xmm0 1181; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 1182; SSE2-NEXT: paddw %xmm0, %xmm1 1183; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] 1184; SSE2-NEXT: paddw %xmm1, %xmm0 1185; SSE2-NEXT: movdqa %xmm0, %xmm1 1186; SSE2-NEXT: psrld $16, %xmm1 1187; SSE2-NEXT: paddw %xmm0, %xmm1 1188; SSE2-NEXT: movd %xmm1, %eax 1189; SSE2-NEXT: # kill: def $ax killed $ax killed $eax 1190; SSE2-NEXT: retq 1191; 1192; SSE41-LABEL: test_v32i16_v32i8: 1193; SSE41: # %bb.0: 1194; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] 1195; SSE41-NEXT: pmovsxbw %xmm2, %xmm2 1196; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] 1197; SSE41-NEXT: pmovsxbw %xmm3, %xmm3 1198; SSE41-NEXT: paddw %xmm2, %xmm3 1199; SSE41-NEXT: pmovsxbw %xmm1, %xmm1 1200; SSE41-NEXT: pmovsxbw %xmm0, %xmm0 1201; SSE41-NEXT: paddw %xmm1, %xmm0 1202; SSE41-NEXT: paddw %xmm3, %xmm0 1203; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 1204; SSE41-NEXT: paddw %xmm0, %xmm1 1205; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] 1206; SSE41-NEXT: paddw %xmm1, %xmm0 1207; SSE41-NEXT: movdqa %xmm0, %xmm1 1208; SSE41-NEXT: psrld $16, %xmm1 1209; SSE41-NEXT: paddw %xmm0, %xmm1 1210; SSE41-NEXT: movd %xmm1, %eax 1211; SSE41-NEXT: # kill: def $ax killed $ax killed $eax 1212; SSE41-NEXT: retq 1213; 1214; AVX1-SLOW-LABEL: test_v32i16_v32i8: 1215; AVX1-SLOW: # %bb.0: 1216; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 1217; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] 1218; AVX1-SLOW-NEXT: vpmovsxbw %xmm2, %xmm2 1219; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] 1220; AVX1-SLOW-NEXT: vpmovsxbw %xmm3, %xmm3 1221; AVX1-SLOW-NEXT: vpaddw %xmm2, %xmm3, %xmm2 1222; AVX1-SLOW-NEXT: vpmovsxbw %xmm1, %xmm1 1223; AVX1-SLOW-NEXT: vpmovsxbw %xmm0, %xmm0 1224; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 1225; AVX1-SLOW-NEXT: vpaddw %xmm2, %xmm0, %xmm0 1226; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 1227; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 1228; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 1229; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 1230; AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1 1231; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 1232; AVX1-SLOW-NEXT: vmovd %xmm0, %eax 1233; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax 1234; AVX1-SLOW-NEXT: vzeroupper 1235; AVX1-SLOW-NEXT: retq 1236; 1237; AVX1-FAST-LABEL: test_v32i16_v32i8: 1238; AVX1-FAST: # %bb.0: 1239; AVX1-FAST-NEXT: 
vextractf128 $1, %ymm0, %xmm1 1240; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] 1241; AVX1-FAST-NEXT: vpmovsxbw %xmm2, %xmm2 1242; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] 1243; AVX1-FAST-NEXT: vpmovsxbw %xmm3, %xmm3 1244; AVX1-FAST-NEXT: vpaddw %xmm2, %xmm3, %xmm2 1245; AVX1-FAST-NEXT: vpmovsxbw %xmm1, %xmm1 1246; AVX1-FAST-NEXT: vpmovsxbw %xmm0, %xmm0 1247; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 1248; AVX1-FAST-NEXT: vpaddw %xmm2, %xmm0, %xmm0 1249; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 1250; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 1251; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 1252; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 1253; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 1254; AVX1-FAST-NEXT: vmovd %xmm0, %eax 1255; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax 1256; AVX1-FAST-NEXT: vzeroupper 1257; AVX1-FAST-NEXT: retq 1258; 1259; AVX2-LABEL: test_v32i16_v32i8: 1260; AVX2: # %bb.0: 1261; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1262; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1 1263; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 1264; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 1265; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1266; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 1267; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 1268; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 1269; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 1270; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 1271; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 1272; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 1273; AVX2-NEXT: vmovd %xmm0, %eax 1274; AVX2-NEXT: # kill: def $ax killed $ax killed $eax 1275; AVX2-NEXT: vzeroupper 1276; AVX2-NEXT: retq 1277; 1278; AVX512-LABEL: test_v32i16_v32i8: 1279; AVX512: # %bb.0: 1280; AVX512-NEXT: vpmovsxbw %ymm0, %zmm0 1281; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 1282; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 1283; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 1284; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 1285; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 1286; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 1287; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 1288; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 1289; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 1290; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 1291; AVX512-NEXT: vmovd %xmm0, %eax 1292; AVX512-NEXT: # kill: def $ax killed $ax killed $eax 1293; AVX512-NEXT: vzeroupper 1294; AVX512-NEXT: retq 1295 %1 = sext <32 x i8> %a0 to <32 x i16> 1296 %2 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %1) 1297 ret i16 %2 1298} 1299 1300define i16 @test_v64i16_v64i8(<64 x i8> %a0) { 1301; SSE2-LABEL: test_v64i16_v64i8: 1302; SSE2: # %bb.0: 1303; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] 1304; SSE2-NEXT: psraw $8, %xmm4 1305; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] 1306; SSE2-NEXT: psraw $8, %xmm5 1307; SSE2-NEXT: paddw %xmm4, %xmm5 1308; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] 1309; SSE2-NEXT: psraw $8, %xmm4 1310; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm1[8],xmm6[9],xmm1[9],xmm6[10],xmm1[10],xmm6[11],xmm1[11],xmm6[12],xmm1[12],xmm6[13],xmm1[13],xmm6[14],xmm1[14],xmm6[15],xmm1[15] 1311; SSE2-NEXT: psraw $8, %xmm6 1312; 
SSE2-NEXT: paddw %xmm4, %xmm6 1313; SSE2-NEXT: paddw %xmm5, %xmm6 1314; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1315; SSE2-NEXT: psraw $8, %xmm2 1316; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1317; SSE2-NEXT: psraw $8, %xmm0 1318; SSE2-NEXT: paddw %xmm2, %xmm0 1319; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] 1320; SSE2-NEXT: psraw $8, %xmm2 1321; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1322; SSE2-NEXT: psraw $8, %xmm1 1323; SSE2-NEXT: paddw %xmm2, %xmm1 1324; SSE2-NEXT: paddw %xmm0, %xmm1 1325; SSE2-NEXT: paddw %xmm6, %xmm1 1326; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] 1327; SSE2-NEXT: paddw %xmm1, %xmm0 1328; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 1329; SSE2-NEXT: paddw %xmm0, %xmm1 1330; SSE2-NEXT: movdqa %xmm1, %xmm0 1331; SSE2-NEXT: psrld $16, %xmm0 1332; SSE2-NEXT: paddw %xmm1, %xmm0 1333; SSE2-NEXT: movd %xmm0, %eax 1334; SSE2-NEXT: # kill: def $ax killed $ax killed $eax 1335; SSE2-NEXT: retq 1336; 1337; SSE41-LABEL: test_v64i16_v64i8: 1338; SSE41: # %bb.0: 1339; SSE41-NEXT: pmovsxbw %xmm2, %xmm4 1340; SSE41-NEXT: pmovsxbw %xmm0, %xmm5 1341; SSE41-NEXT: paddw %xmm4, %xmm5 1342; SSE41-NEXT: pmovsxbw %xmm3, %xmm4 1343; SSE41-NEXT: pmovsxbw %xmm1, %xmm6 1344; SSE41-NEXT: paddw %xmm4, %xmm6 1345; SSE41-NEXT: paddw %xmm5, %xmm6 1346; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] 1347; SSE41-NEXT: pmovsxbw %xmm2, %xmm2 1348; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] 1349; SSE41-NEXT: pmovsxbw %xmm0, %xmm0 1350; SSE41-NEXT: paddw %xmm2, %xmm0 1351; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] 1352; SSE41-NEXT: pmovsxbw %xmm2, %xmm2 1353; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] 1354; SSE41-NEXT: pmovsxbw %xmm1, %xmm1 1355; SSE41-NEXT: paddw %xmm2, %xmm1 1356; SSE41-NEXT: paddw %xmm0, %xmm1 1357; SSE41-NEXT: paddw %xmm6, %xmm1 1358; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] 1359; SSE41-NEXT: paddw %xmm1, %xmm0 1360; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 1361; SSE41-NEXT: paddw %xmm0, %xmm1 1362; SSE41-NEXT: movdqa %xmm1, %xmm0 1363; SSE41-NEXT: psrld $16, %xmm0 1364; SSE41-NEXT: paddw %xmm1, %xmm0 1365; SSE41-NEXT: movd %xmm0, %eax 1366; SSE41-NEXT: # kill: def $ax killed $ax killed $eax 1367; SSE41-NEXT: retq 1368; 1369; AVX1-SLOW-LABEL: test_v64i16_v64i8: 1370; AVX1-SLOW: # %bb.0: 1371; AVX1-SLOW-NEXT: vpmovsxbw %xmm1, %xmm2 1372; AVX1-SLOW-NEXT: vpmovsxbw %xmm0, %xmm3 1373; AVX1-SLOW-NEXT: vpaddw %xmm2, %xmm3, %xmm2 1374; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm3 1375; AVX1-SLOW-NEXT: vpmovsxbw %xmm3, %xmm4 1376; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5 1377; AVX1-SLOW-NEXT: vpmovsxbw %xmm5, %xmm6 1378; AVX1-SLOW-NEXT: vpaddw %xmm4, %xmm6, %xmm4 1379; AVX1-SLOW-NEXT: vpaddw %xmm4, %xmm2, %xmm2 1380; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] 1381; AVX1-SLOW-NEXT: vpmovsxbw %xmm1, %xmm1 1382; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] 1383; AVX1-SLOW-NEXT: vpmovsxbw %xmm0, %xmm0 1384; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 1385; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] 1386; AVX1-SLOW-NEXT: vpmovsxbw %xmm1, %xmm1 1387; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[2,3,2,3] 1388; AVX1-SLOW-NEXT: vpmovsxbw %xmm3, %xmm3 1389; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm3, %xmm1 1390; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 1391; AVX1-SLOW-NEXT: vpaddw %xmm0, %xmm2, %xmm0 
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v64i16_v64i8:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vpmovsxbw %xmm1, %xmm2
; AVX1-FAST-NEXT: vpmovsxbw %xmm0, %xmm3
; AVX1-FAST-NEXT: vpaddw %xmm2, %xmm3, %xmm2
; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-FAST-NEXT: vpmovsxbw %xmm3, %xmm4
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-FAST-NEXT: vpmovsxbw %xmm5, %xmm6
; AVX1-FAST-NEXT: vpaddw %xmm4, %xmm6, %xmm4
; AVX1-FAST-NEXT: vpaddw %xmm4, %xmm2, %xmm2
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-FAST-NEXT: vpmovsxbw %xmm1, %xmm1
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-FAST-NEXT: vpmovsxbw %xmm0, %xmm0
; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
; AVX1-FAST-NEXT: vpmovsxbw %xmm1, %xmm1
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[2,3,2,3]
; AVX1-FAST-NEXT: vpmovsxbw %xmm3, %xmm3
; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm3, %xmm1
; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpaddw %xmm0, %xmm2, %xmm0
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vmovd %xmm0, %eax
; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v64i16_v64i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovsxbw %xmm1, %ymm2
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm3
; AVX2-NEXT: vpaddw %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddw %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v64i16_v64i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpmovsxbw %ymm1, %zmm1
; AVX512-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = sext <64 x i8> %a0 to <64 x i16>
  %2 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> %1)
  ret i16 %2
}

;
; vXi1 - sum of extended bool vectors
;

define i64 @test_v2i64_v2i1(<2 x i64> %a0) {
; SSE2-LABEL: test_v2i64_v2i1:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: paddq %xmm1, %xmm0
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2i64_v2i1:
; SSE41: # %bb.0:
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pcmpgtd %xmm0, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE41-NEXT: paddq %xmm1, %xmm0
; SSE41-NEXT: movq %xmm0, %rax
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v2i64_v2i1:
; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovq %xmm0, %rax
; AVX-NEXT: retq
  %1 = icmp slt <2 x i64> %a0, zeroinitializer
  %2 = sext <2 x i1> %1 to <2 x i64>
  %3 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %2)
  ret i64 %3
}

define i32 @test_v4i32_v4i1(<4 x i32> %a0) {
; SSE2-LABEL: test_v4i32_v4i1:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4i32_v4i1:
; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pcmpgtd %xmm0, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE41-NEXT: paddd %xmm1, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE41-NEXT: paddd %xmm0, %xmm1
; SSE41-NEXT: movd %xmm1, %eax
; SSE41-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v4i32_v4i1:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v4i32_v4i1:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-FAST-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vmovd %xmm0, %eax
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v4i32_v4i1:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v4i32_v4i1:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: retq
  %1 = icmp slt <4 x i32> %a0, zeroinitializer
  %2 = sext <4 x i1> %1 to <4 x i32>
  %3 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %2)
  ret i32 %3
}

define i16 @test_v8i16_v8i1(<8 x i16> %a0) {
; SSE2-LABEL: test_v8i16_v8i1:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pcmpgtw %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: paddw %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE2-NEXT: paddw %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psrld $16, %xmm0
; SSE2-NEXT: paddw %xmm1, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8i16_v8i1:
; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pcmpgtw %xmm0, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE41-NEXT: paddw %xmm1, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE41-NEXT: paddw %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: psrld $16, %xmm0
; SSE41-NEXT: paddw %xmm1, %xmm0
; SSE41-NEXT: movd %xmm0, %eax
; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
; SSE41-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v8i16_v8i1:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v8i16_v8i1:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-FAST-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vmovd %xmm0, %eax
; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v8i16_v8i1:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v8i16_v8i1:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512-NEXT: retq
  %1 = icmp slt <8 x i16> %a0, zeroinitializer
  %2 = sext <8 x i1> %1 to <8 x i16>
  %3 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %2)
  ret i16 %3
}

define i8 @test_v16i8_v16i1(<16 x i8> %a0) {
; SSE2-LABEL: test_v16i8_v16i1:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pcmpgtb %xmm0, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: psadbw %xmm1, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v16i8_v16i1:
; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: pcmpgtb %xmm0, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE41-NEXT: paddb %xmm2, %xmm0
; SSE41-NEXT: psadbw %xmm1, %xmm0
; SSE41-NEXT: movd %xmm0, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v16i8_v16i1:
; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
; AVX-NEXT: retq
  %1 = icmp slt <16 x i8> %a0, zeroinitializer
  %2 = sext <16 x i1> %1 to <16 x i8>
  %3 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %2)
  ret i8 %3
}

define i8 @test_v32i8_v32i1(<32 x i8> %a0) {
; SSE2-LABEL: test_v32i8_v32i1:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pcmpgtb %xmm0, %xmm1
; SSE2-NEXT: paddb %xmm3, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: paddb %xmm1, %xmm0
; SSE2-NEXT: psadbw %xmm2, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v32i8_v32i1:
; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: pxor %xmm3, %xmm3
; SSE41-NEXT: pcmpgtb %xmm1, %xmm3
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pcmpgtb %xmm0, %xmm1
; SSE41-NEXT: paddb %xmm3, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE41-NEXT: paddb %xmm1, %xmm0
; SSE41-NEXT: psadbw %xmm2, %xmm0
; SSE41-NEXT: movd %xmm0, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_v32i8_v32i1:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: # kill: def $al killed $al killed $eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v32i8_v32i1:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: # kill: def $al killed $al killed $eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v32i8_v32i1:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = icmp slt <32 x i8> %a0, zeroinitializer
  %2 = sext <32 x i1> %1 to <32 x i8>
  %3 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %2)
  ret i8 %3
}

define i8 @test_v64i8_v64i1(<64 x i8> %a0) {
; SSE2-LABEL: test_v64i8_v64i1:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: pxor %xmm5, %xmm5
; SSE2-NEXT: pcmpgtb %xmm2, %xmm5
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pcmpgtb %xmm0, %xmm2
; SSE2-NEXT: paddb %xmm5, %xmm2
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: pcmpgtb %xmm3, %xmm0
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
; SSE2-NEXT: paddb %xmm0, %xmm3
; SSE2-NEXT: paddb %xmm2, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
; SSE2-NEXT: paddb %xmm3, %xmm0
; SSE2-NEXT: psadbw %xmm4, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v64i8_v64i1:
; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm4, %xmm4
; SSE41-NEXT: pxor %xmm5, %xmm5
; SSE41-NEXT: pcmpgtb %xmm2, %xmm5
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: pcmpgtb %xmm0, %xmm2
; SSE41-NEXT: paddb %xmm5, %xmm2
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pcmpgtb %xmm3, %xmm0
; SSE41-NEXT: pxor %xmm3, %xmm3
; SSE41-NEXT: pcmpgtb %xmm1, %xmm3
; SSE41-NEXT: paddb %xmm0, %xmm3
; SSE41-NEXT: paddb %xmm2, %xmm3
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
; SSE41-NEXT: paddb %xmm3, %xmm0
; SSE41-NEXT: psadbw %xmm4, %xmm0
; SSE41-NEXT: movd %xmm0, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_v64i8_v64i1:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm4
; AVX1-NEXT: vpaddb %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpaddb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: # kill: def $al killed $al killed $eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v64i8_v64i1:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpcmpgtb %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: # kill: def $al killed $al killed $eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v64i8_v64i1:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovb2m %zmm0, %k0
; AVX512-NEXT: vpmovm2b %k0, %zmm0
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = icmp slt <64 x i8> %a0, zeroinitializer
  %2 = sext <64 x i1> %1 to <64 x i8>
  %3 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> %2)
  ret i8 %3
}

define i8 @test_v128i8_v128i1(<128 x i8> %a0) {
; SSE2-LABEL: test_v128i8_v128i1:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm8, %xmm8
; SSE2-NEXT: pxor %xmm9, %xmm9
; SSE2-NEXT: pcmpgtb %xmm4, %xmm9
; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: pcmpgtb %xmm0, %xmm4
; SSE2-NEXT: paddb %xmm9, %xmm4
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: pcmpgtb %xmm6, %xmm0
; SSE2-NEXT: pxor %xmm6, %xmm6
; SSE2-NEXT: pcmpgtb %xmm2, %xmm6
; SSE2-NEXT: paddb %xmm0, %xmm6
; SSE2-NEXT: paddb %xmm4, %xmm6
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: pcmpgtb %xmm5, %xmm0
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
; SSE2-NEXT: paddb %xmm0, %xmm2
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: pcmpgtb %xmm7, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pcmpgtb %xmm3, %xmm1
; SSE2-NEXT: paddb %xmm0, %xmm1
; SSE2-NEXT: paddb %xmm2, %xmm1
; SSE2-NEXT: paddb %xmm6, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: paddb %xmm1, %xmm0
; SSE2-NEXT: psadbw %xmm8, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v128i8_v128i1:
; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm8, %xmm8
; SSE41-NEXT: pxor %xmm9, %xmm9
; SSE41-NEXT: pcmpgtb %xmm4, %xmm9
; SSE41-NEXT: pxor %xmm4, %xmm4
; SSE41-NEXT: pcmpgtb %xmm0, %xmm4
; SSE41-NEXT: paddb %xmm9, %xmm4
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pcmpgtb %xmm6, %xmm0
; SSE41-NEXT: pxor %xmm6, %xmm6
; SSE41-NEXT: pcmpgtb %xmm2, %xmm6
; SSE41-NEXT: paddb %xmm0, %xmm6
; SSE41-NEXT: paddb %xmm4, %xmm6
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pcmpgtb %xmm5, %xmm0
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: pcmpgtb %xmm1, %xmm2
; SSE41-NEXT: paddb %xmm0, %xmm2
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pcmpgtb %xmm7, %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pcmpgtb %xmm3, %xmm1
; SSE41-NEXT: paddb %xmm0, %xmm1
; SSE41-NEXT: paddb %xmm2, %xmm1
; SSE41-NEXT: paddb %xmm6, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE41-NEXT: paddb %xmm1, %xmm0
; SSE41-NEXT: psadbw %xmm8, %xmm0
; SSE41-NEXT: movd %xmm0, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_v128i8_v128i1:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vpcmpgtb %xmm2, %xmm4, %xmm5
; AVX1-NEXT: vpcmpgtb %xmm0, %xmm4, %xmm6
; AVX1-NEXT: vpaddb %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpcmpgtb %xmm3, %xmm4, %xmm6
; AVX1-NEXT: vpcmpgtb %xmm1, %xmm4, %xmm7
; AVX1-NEXT: vpaddb %xmm6, %xmm7, %xmm6
; AVX1-NEXT: vpaddb %xmm6, %xmm5, %xmm5
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vpcmpgtb %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpcmpgtb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
; AVX1-NEXT: vpcmpgtb %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpcmpgtb %xmm1, %xmm4, %xmm1
; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpaddb %xmm0, %xmm5, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: # kill: def $al killed $al killed $eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v128i8_v128i1:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX2-NEXT: vpcmpgtb %ymm2, %ymm4, %ymm2
; AVX2-NEXT: vpcmpgtb %ymm0, %ymm4, %ymm0
; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpcmpgtb %ymm3, %ymm4, %ymm2
; AVX2-NEXT: vpcmpgtb %ymm1, %ymm4, %ymm1
; AVX2-NEXT: vpaddb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: # kill: def $al killed $al killed $eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v128i8_v128i1:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovb2m %zmm0, %k0
; AVX512-NEXT: vpmovb2m %zmm1, %k1
; AVX512-NEXT: vpmovm2b %k1, %zmm0
; AVX512-NEXT: vpmovm2b %k0, %zmm1
; AVX512-NEXT: vpaddb %zmm0, %zmm1, %zmm0
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = icmp slt <128 x i8> %a0, zeroinitializer
  %2 = sext <128 x i1> %1 to <128 x i8>
  %3 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> %2)
  ret i8 %3
}

declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)
declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>)

declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>)
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.add.v32i32(<32 x i32>)

declare i16 @llvm.vector.reduce.add.v2i16(<2 x i16>)
declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>)
declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.add.v32i16(<32 x i16>)
declare i16 @llvm.vector.reduce.add.v64i16(<64 x i16>)

declare i8 @llvm.vector.reduce.add.v2i8(<2 x i8>)
declare i8 @llvm.vector.reduce.add.v4i8(<4 x i8>)
declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>)
declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)
declare i8 @llvm.vector.reduce.add.v32i8(<32 x i8>)
declare i8 @llvm.vector.reduce.add.v64i8(<64 x i8>)
declare i8 @llvm.vector.reduce.add.v128i8(<128 x i8>)