; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX256,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX256,AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVX256,AVX512,AVX512BW

define i32 @_Z10test_shortPsS_i_128(ptr nocapture readonly, ptr nocapture readonly, i32) local_unnamed_addr #0 {
; SSE2-LABEL: _Z10test_shortPsS_i_128:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movl %edx, %eax
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: xorl %ecx, %ecx
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: .p2align 4
; SSE2-NEXT: .LBB0_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
; SSE2-NEXT: movq {{.*#+}} xmm3 = mem[0],zero
; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE2-NEXT: pmaddwd %xmm3, %xmm2
; SSE2-NEXT: paddd %xmm2, %xmm1
; SSE2-NEXT: addq $8, %rcx
; SSE2-NEXT: cmpq %rcx, %rax
; SSE2-NEXT: jne .LBB0_1
; SSE2-NEXT: # %bb.2: # %middle.block
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
; AVX-LABEL: _Z10test_shortPsS_i_128:
; AVX: # %bb.0: # %entry
; AVX-NEXT: movl %edx, %eax
; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX-NEXT: xorl %ecx, %ecx
; AVX-NEXT: .p2align 4
; AVX-NEXT: .LBB0_1: # %vector.body
; AVX-NEXT: # =>This Inner Loop Header: Depth=1
; AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX-NEXT: vpmaddwd %xmm1, %xmm2, %xmm1
; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX-NEXT: addq $8, %rcx
; AVX-NEXT: cmpq %rcx, %rax
; AVX-NEXT: jne .LBB0_1
; AVX-NEXT: # %bb.2: # %middle.block
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
entry:
  %3 = zext i32 %2 to i64
  br label %vector.body

vector.body:
  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
  %vec.phi = phi <4 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
  %4 = getelementptr inbounds i16, ptr %0, i64 %index
  %5 = bitcast ptr %4 to ptr
  %wide.load = load <4 x i16>, ptr %5, align 2
  %6 = sext <4 x i16> %wide.load to <4 x i32>
  %7 = getelementptr inbounds i16, ptr %1, i64 %index
  %8 = bitcast ptr %7 to ptr
  %wide.load14 = load <4 x i16>, ptr %8, align 2
  %9 = sext <4 x i16> %wide.load14 to <4 x i32>
  %10 = mul nsw <4 x i32> %9, %6
  %11 = add nsw <4 x i32> %10, %vec.phi
  %index.next = add i64 %index, 8
  %12 = icmp eq i64 %index.next, %3
  br i1 %12, label %middle.block, label %vector.body

middle.block:
  %rdx.shuf15 = shufflevector <4 x i32> %11, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %bin.rdx16 = add <4 x i32> %11, %rdx.shuf15
  %rdx.shuf17 = shufflevector <4 x i32> %bin.rdx16, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx18 = add <4 x i32> %bin.rdx16, %rdx.shuf17
  %13 = extractelement <4 x i32> %bin.rdx18, i32 0
  ret i32 %13
}

define i32 @_Z10test_shortPsS_i_256(ptr nocapture readonly, ptr nocapture readonly, i32) local_unnamed_addr #0 {
; SSE2-LABEL: _Z10test_shortPsS_i_256:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movl %edx, %eax
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: xorl %ecx, %ecx
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: .p2align 4
; SSE2-NEXT: .LBB1_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
; SSE2-NEXT: movdqu (%rdi,%rcx,2), %xmm2
; SSE2-NEXT: movdqu (%rsi,%rcx,2), %xmm3
; SSE2-NEXT: pmaddwd %xmm2, %xmm3
; SSE2-NEXT: paddd %xmm3, %xmm1
; SSE2-NEXT: addq $8, %rcx
; SSE2-NEXT: cmpq %rcx, %rax
; SSE2-NEXT: jne .LBB1_1
; SSE2-NEXT: # %bb.2: # %middle.block
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
; AVX1-LABEL: _Z10test_shortPsS_i_256:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: movl %edx, %eax
; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX1-NEXT: xorl %ecx, %ecx
; AVX1-NEXT: .p2align 4
; AVX1-NEXT: .LBB1_1: # %vector.body
; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
; AVX1-NEXT: vmovdqu (%rsi,%rcx,2), %xmm1
; AVX1-NEXT: vpmaddwd (%rdi,%rcx,2), %xmm1, %xmm1
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm1
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-NEXT: addq $8, %rcx
; AVX1-NEXT: cmpq %rcx, %rax
; AVX1-NEXT: jne .LBB1_1
; AVX1-NEXT: # %bb.2: # %middle.block
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX256-LABEL: _Z10test_shortPsS_i_256:
; AVX256: # %bb.0: # %entry
; AVX256-NEXT: movl %edx, %eax
; AVX256-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX256-NEXT: xorl %ecx, %ecx
; AVX256-NEXT: .p2align 4
; AVX256-NEXT: .LBB1_1: # %vector.body
; AVX256-NEXT: # =>This Inner Loop Header: Depth=1
; AVX256-NEXT: vmovdqu (%rsi,%rcx,2), %xmm1
; AVX256-NEXT: vpmaddwd (%rdi,%rcx,2), %xmm1, %xmm1
; AVX256-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX256-NEXT: addq $8, %rcx
; AVX256-NEXT: cmpq %rcx, %rax
; AVX256-NEXT: jne .LBB1_1
; AVX256-NEXT: # %bb.2: # %middle.block
; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX256-NEXT: vmovd %xmm0, %eax
; AVX256-NEXT: vzeroupper
; AVX256-NEXT: retq
entry:
  %3 = zext i32 %2 to i64
  br label %vector.body

vector.body:
  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
  %vec.phi = phi <8 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
  %4 = getelementptr inbounds i16, ptr %0, i64 %index
  %5 = bitcast ptr %4 to ptr
  %wide.load = load <8 x i16>, ptr %5, align 2
  %6 = sext <8 x i16> %wide.load to <8 x i32>
  %7 = getelementptr inbounds i16, ptr %1, i64 %index
  %8 = bitcast ptr %7 to ptr
  %wide.load14 = load <8 x i16>, ptr %8, align 2
  %9 = sext <8 x i16> %wide.load14 to <8 x i32>
  %10 = mul nsw <8 x i32> %9, %6
  %11 = add nsw <8 x i32> %10, %vec.phi
  %index.next = add i64 %index, 8
  %12 = icmp eq i64 %index.next, %3
  br i1 %12, label %middle.block, label %vector.body

middle.block:
  %rdx.shuf = shufflevector <8 x i32> %11, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <8 x i32> %11, %rdx.shuf
  %rdx.shuf15 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx16 = add <8 x i32> %bin.rdx, %rdx.shuf15
  %rdx.shuf17 = shufflevector <8 x i32> %bin.rdx16, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx18 = add <8 x i32> %bin.rdx16, %rdx.shuf17
  %13 = extractelement <8 x i32> %bin.rdx18, i32 0
  ret i32 %13
}

define i32 @_Z10test_shortPsS_i_512(ptr nocapture readonly, ptr nocapture readonly, i32) local_unnamed_addr #0 {
; SSE2-LABEL: _Z10test_shortPsS_i_512:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movl %edx, %eax
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: xorl %ecx, %ecx
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: .p2align 4
; SSE2-NEXT: .LBB2_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
; SSE2-NEXT: movdqu (%rdi,%rcx,2), %xmm3
; SSE2-NEXT: movdqu 16(%rdi,%rcx,2), %xmm4
; SSE2-NEXT: movdqu (%rsi,%rcx,2), %xmm5
; SSE2-NEXT: pmaddwd %xmm3, %xmm5
; SSE2-NEXT: paddd %xmm5, %xmm2
; SSE2-NEXT: movdqu 16(%rsi,%rcx,2), %xmm3
; SSE2-NEXT: pmaddwd %xmm4, %xmm3
; SSE2-NEXT: paddd %xmm3, %xmm1
; SSE2-NEXT: addq $16, %rcx
; SSE2-NEXT: cmpq %rcx, %rax
; SSE2-NEXT: jne .LBB2_1
; SSE2-NEXT: # %bb.2: # %middle.block
; SSE2-NEXT: paddd %xmm0, %xmm2
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: paddd %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
; AVX1-LABEL: _Z10test_shortPsS_i_512:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: movl %edx, %eax
; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX1-NEXT: xorl %ecx, %ecx
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: .p2align 4
; AVX1-NEXT: .LBB2_1: # %vector.body
; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
; AVX1-NEXT: vmovdqu (%rsi,%rcx,2), %xmm2
; AVX1-NEXT: vmovdqu 16(%rsi,%rcx,2), %xmm3
; AVX1-NEXT: vpmaddwd (%rdi,%rcx,2), %xmm2, %xmm2
; AVX1-NEXT: vpmaddwd 16(%rdi,%rcx,2), %xmm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT: addq $16, %rcx
; AVX1-NEXT: cmpq %rcx, %rax
; AVX1-NEXT: jne .LBB2_1
; AVX1-NEXT: # %bb.2: # %middle.block
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: _Z10test_shortPsS_i_512:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: movl %edx, %eax
; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX2-NEXT: xorl %ecx, %ecx
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: .p2align 4
; AVX2-NEXT: .LBB2_1: # %vector.body
; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
; AVX2-NEXT: vmovdqu (%rsi,%rcx,2), %ymm2
; AVX2-NEXT: vpmaddwd (%rdi,%rcx,2), %ymm2, %ymm2
; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1
; AVX2-NEXT: addq $16, %rcx
; AVX2-NEXT: cmpq %rcx, %rax
; AVX2-NEXT: jne .LBB2_1
; AVX2-NEXT: # %bb.2: # %middle.block
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: _Z10test_shortPsS_i_512:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: movl %edx, %eax
; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512-NEXT: xorl %ecx, %ecx
; AVX512-NEXT: .p2align 4
; AVX512-NEXT: .LBB2_1: # %vector.body
; AVX512-NEXT: # =>This Inner Loop Header: Depth=1
; AVX512-NEXT: vmovdqu (%rsi,%rcx,2), %ymm1
; AVX512-NEXT: vpmaddwd (%rdi,%rcx,2), %ymm1, %ymm1
; AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; AVX512-NEXT: addq $16, %rcx
; AVX512-NEXT: cmpq %rcx, %rax
; AVX512-NEXT: jne .LBB2_1
; AVX512-NEXT: # %bb.2: # %middle.block
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
  %3 = zext i32 %2 to i64
  br label %vector.body

vector.body:
  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
  %vec.phi = phi <16 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
  %4 = getelementptr inbounds i16, ptr %0, i64 %index
  %5 = bitcast ptr %4 to ptr
  %wide.load = load <16 x i16>, ptr %5, align 2
  %6 = sext <16 x i16> %wide.load to <16 x i32>
  %7 = getelementptr inbounds i16, ptr %1, i64 %index
  %8 = bitcast ptr %7 to ptr
  %wide.load14 = load <16 x i16>, ptr %8, align 2
  %9 = sext <16 x i16> %wide.load14 to <16 x i32>
  %10 = mul nsw <16 x i32> %9, %6
  %11 = add nsw <16 x i32> %10, %vec.phi
  %index.next = add i64 %index, 16
  %12 = icmp eq i64 %index.next, %3
  br i1 %12, label %middle.block, label %vector.body

middle.block:
  %rdx.shuf1 = shufflevector <16 x i32> %11, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx1 = add <16 x i32> %11, %rdx.shuf1
  %rdx.shuf = shufflevector <16 x i32> %bin.rdx1, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <16 x i32> %bin.rdx1, %rdx.shuf
  %rdx.shuf15 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx16 = add <16 x i32> %bin.rdx, %rdx.shuf15
  %rdx.shuf17 = shufflevector <16 x i32> %bin.rdx16, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx18 = add <16 x i32> %bin.rdx16, %rdx.shuf17
  %13 = extractelement <16 x i32> %bin.rdx18, i32 0
  ret i32 %13
}

define i32 @_Z10test_shortPsS_i_1024(ptr nocapture readonly, ptr nocapture readonly, i32) local_unnamed_addr #0 {
; SSE2-LABEL: _Z10test_shortPsS_i_1024:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movl %edx, %eax
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: xorl %ecx, %ecx
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: .p2align 4
; SSE2-NEXT: .LBB3_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
; SSE2-NEXT: movdqu (%rdi,%rcx,2), %xmm5
; SSE2-NEXT: movdqu 16(%rdi,%rcx,2), %xmm6
; SSE2-NEXT: movdqu 32(%rdi,%rcx,2), %xmm7
; SSE2-NEXT: movdqu 48(%rdi,%rcx,2), %xmm8
; SSE2-NEXT: movdqu (%rsi,%rcx,2), %xmm9
; SSE2-NEXT: pmaddwd %xmm5, %xmm9
; SSE2-NEXT: paddd %xmm9, %xmm2
; SSE2-NEXT: movdqu 16(%rsi,%rcx,2), %xmm5
; SSE2-NEXT: pmaddwd %xmm6, %xmm5
; SSE2-NEXT: paddd %xmm5, %xmm4
; SSE2-NEXT: movdqu 32(%rsi,%rcx,2), %xmm5
; SSE2-NEXT: pmaddwd %xmm7, %xmm5
; SSE2-NEXT: paddd %xmm5, %xmm1
; SSE2-NEXT: movdqu 48(%rsi,%rcx,2), %xmm5
; SSE2-NEXT: pmaddwd %xmm8, %xmm5
; SSE2-NEXT: paddd %xmm5, %xmm3
; SSE2-NEXT: addq $16, %rcx
; SSE2-NEXT: cmpq %rcx, %rax
; SSE2-NEXT: jne .LBB3_1
; SSE2-NEXT: # %bb.2: # %middle.block
; SSE2-NEXT: paddd %xmm0, %xmm4
; SSE2-NEXT: paddd %xmm0, %xmm3
; SSE2-NEXT: paddd %xmm4, %xmm3
; SSE2-NEXT: paddd %xmm0, %xmm2
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: paddd %xmm2, %xmm1
; SSE2-NEXT: paddd %xmm3, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
; AVX1-LABEL: _Z10test_shortPsS_i_1024:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: movl %edx, %eax
; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX1-NEXT: xorl %ecx, %ecx
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: .p2align 4
; AVX1-NEXT: .LBB3_1: # %vector.body
; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
; AVX1-NEXT: vmovdqu (%rsi,%rcx,2), %xmm3
; AVX1-NEXT: vmovdqu 16(%rsi,%rcx,2), %xmm4
; AVX1-NEXT: vmovdqu 32(%rsi,%rcx,2), %xmm5
; AVX1-NEXT: vmovdqu 48(%rsi,%rcx,2), %xmm6
; AVX1-NEXT: vpmaddwd (%rdi,%rcx,2), %xmm3, %xmm3
; AVX1-NEXT: vpmaddwd 16(%rdi,%rcx,2), %xmm4, %xmm4
; AVX1-NEXT: vpmaddwd 32(%rdi,%rcx,2), %xmm5, %xmm5
; AVX1-NEXT: vpmaddwd 48(%rdi,%rcx,2), %xmm6, %xmm6
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6
; AVX1-NEXT: vpaddd %xmm1, %xmm5, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
; AVX1-NEXT: addq $16, %rcx
; AVX1-NEXT: cmpq %rcx, %rax
; AVX1-NEXT: jne .LBB3_1
; AVX1-NEXT: # %bb.2: # %middle.block
; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm3
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm4
; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpaddd %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: _Z10test_shortPsS_i_1024:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: movl %edx, %eax
; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX2-NEXT: xorl %ecx, %ecx
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: .p2align 4
; AVX2-NEXT: .LBB3_1: # %vector.body
; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
; AVX2-NEXT: vmovdqu (%rsi,%rcx,2), %ymm3
; AVX2-NEXT: vmovdqu 32(%rsi,%rcx,2), %ymm4
; AVX2-NEXT: vpmaddwd (%rdi,%rcx,2), %ymm3, %ymm3
; AVX2-NEXT: vpaddd %ymm1, %ymm3, %ymm1
; AVX2-NEXT: vpmaddwd 32(%rdi,%rcx,2), %ymm4, %ymm3
; AVX2-NEXT: vpaddd %ymm2, %ymm3, %ymm2
; AVX2-NEXT: addq $16, %rcx
; AVX2-NEXT: cmpq %rcx, %rax
; AVX2-NEXT: jne .LBB3_1
; AVX2-NEXT: # %bb.2: # %middle.block
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm1
; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: _Z10test_shortPsS_i_1024:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: movl %edx, %eax
; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: xorl %ecx, %ecx
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: .p2align 4
; AVX512F-NEXT: .LBB3_1: # %vector.body
; AVX512F-NEXT: # =>This Inner Loop Header: Depth=1
; AVX512F-NEXT: vmovdqu (%rsi,%rcx,2), %ymm2
; AVX512F-NEXT: vmovdqu 32(%rsi,%rcx,2), %ymm3
; AVX512F-NEXT: vpmaddwd 32(%rdi,%rcx,2), %ymm3, %ymm3
; AVX512F-NEXT: vpmaddwd (%rdi,%rcx,2), %ymm2, %ymm2
; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
; AVX512F-NEXT: vpaddd %zmm1, %zmm2, %zmm1
; AVX512F-NEXT: addq $16, %rcx
; AVX512F-NEXT: cmpq %rcx, %rax
; AVX512F-NEXT: jne .LBB3_1
; AVX512F-NEXT: # %bb.2: # %middle.block
; AVX512F-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vmovd %xmm0, %eax
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: _Z10test_shortPsS_i_1024:
; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: movl %edx, %eax
; AVX512BW-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT: xorl %ecx, %ecx
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: .p2align 4
; AVX512BW-NEXT: .LBB3_1: # %vector.body
; AVX512BW-NEXT: # =>This Inner Loop Header: Depth=1
; AVX512BW-NEXT: vmovdqu64 (%rsi,%rcx,2), %zmm2
; AVX512BW-NEXT: vpmaddwd (%rdi,%rcx,2), %zmm2, %zmm2
; AVX512BW-NEXT: vpaddd %zmm1, %zmm2, %zmm1
; AVX512BW-NEXT: addq $16, %rcx
; AVX512BW-NEXT: cmpq %rcx, %rax
; AVX512BW-NEXT: jne .LBB3_1
; AVX512BW-NEXT: # %bb.2: # %middle.block
; AVX512BW-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vmovd %xmm0, %eax
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
entry:
  %3 = zext i32 %2 to i64
  br label %vector.body

vector.body:
  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
  %vec.phi = phi <32 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
  %4 = getelementptr inbounds i16, ptr %0, i64 %index
  %5 = bitcast ptr %4 to ptr
  %wide.load = load <32 x i16>, ptr %5, align 2
  %6 = sext <32 x i16> %wide.load to <32 x i32>
  %7 = getelementptr inbounds i16, ptr %1, i64 %index
  %8 = bitcast ptr %7 to ptr
  %wide.load14 = load <32 x i16>, ptr %8, align 2
  %9 = sext <32 x i16> %wide.load14 to <32 x i32>
  %10 = mul nsw <32 x i32> %9, %6
  %11 = add nsw <32 x i32> %10, %vec.phi
  %index.next = add i64 %index, 16
  %12 = icmp eq i64 %index.next, %3
  br i1 %12, label %middle.block, label %vector.body

middle.block:
  %rdx.shuf2 = shufflevector <32 x i32> %11, <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx2 = add <32 x i32> %11, %rdx.shuf2
  %rdx.shuf1 = shufflevector <32 x i32> %bin.rdx2, <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx1 = add <32 x i32> %bin.rdx2, %rdx.shuf1
  %rdx.shuf = shufflevector <32 x i32> %bin.rdx1, <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <32 x i32> %bin.rdx1, %rdx.shuf
  %rdx.shuf15 = shufflevector <32 x i32> %bin.rdx, <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx16 = add <32 x i32> %bin.rdx, %rdx.shuf15
  %rdx.shuf17 = shufflevector <32 x i32> %bin.rdx16, <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx18 = add <32 x i32> %bin.rdx16, %rdx.shuf17
  %13 = extractelement <32 x i32> %bin.rdx18, i32 0
  ret i32 %13
}

define i32 @_Z9test_charPcS_i_128(ptr nocapture readonly, ptr nocapture readonly, i32) local_unnamed_addr #0 {
; SSE2-LABEL: _Z9test_charPcS_i_128:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movl %edx, %eax
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: xorl %ecx, %ecx
; SSE2-NEXT: .p2align 4
; SSE2-NEXT: .LBB4_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm1
; SSE2-NEXT: psraw $8, %xmm2
; SSE2-NEXT: pmullw %xmm1, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSE2-NEXT: psrad $16, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: addq $16, %rcx
; SSE2-NEXT: cmpq %rcx, %rax
; SSE2-NEXT: jne .LBB4_1
; SSE2-NEXT: # %bb.2: # %middle.block
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: retq
;
; AVX-LABEL: _Z9test_charPcS_i_128:
; AVX: # %bb.0: # %entry
; AVX-NEXT: movl %edx, %eax
; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX-NEXT: xorl %ecx, %ecx
; AVX-NEXT: .p2align 4
; AVX-NEXT: .LBB4_1: # %vector.body
; AVX-NEXT: # =>This Inner Loop Header: Depth=1
; AVX-NEXT: vpmovsxbd (%rdi,%rcx), %xmm1
; AVX-NEXT: vpmovsxbd (%rsi,%rcx), %xmm2
; AVX-NEXT: vpmulld %xmm1, %xmm2, %xmm1
; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX-NEXT: addq $16, %rcx
; AVX-NEXT: cmpq %rcx, %rax
; AVX-NEXT: jne .LBB4_1
; AVX-NEXT: # %bb.2: # %middle.block
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
entry:
  %3 = zext i32 %2 to i64
  br label %vector.body

vector.body:
  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
  %vec.phi = phi <4 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
  %4 = getelementptr inbounds i8, ptr %0, i64 %index
  %5 = bitcast ptr %4 to ptr
  %wide.load = load <4 x i8>, ptr %5, align 1
  %6 = sext <4 x i8> %wide.load to <4 x i32>
  %7 = getelementptr inbounds i8, ptr %1, i64 %index
  %8 = bitcast ptr %7 to ptr
  %wide.load14 = load <4 x i8>, ptr %8, align 1
  %9 = sext <4 x i8> %wide.load14 to <4 x i32>
  %10 = mul nsw <4 x i32> %9, %6
  %11 = add nsw <4 x i32> %10, %vec.phi
  %index.next = add i64 %index, 16
  %12 = icmp eq i64 %index.next, %3
  br i1 %12, label %middle.block, label %vector.body

middle.block:
  %rdx.shuf17 = shufflevector <4 x i32> %11, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %bin.rdx18 = add <4 x i32> %11, %rdx.shuf17
  %rdx.shuf19 = shufflevector <4 x i32> %bin.rdx18, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx20 = add <4 x i32> %bin.rdx18, %rdx.shuf19
  %13 = extractelement <4 x i32> %bin.rdx20, i32 0
  ret i32 %13
}

define i32 @_Z9test_charPcS_i_256(ptr nocapture readonly, ptr nocapture readonly, i32) local_unnamed_addr #0 {
; SSE2-LABEL: _Z9test_charPcS_i_256:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movl %edx, %eax
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: xorl %ecx, %ecx
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: .p2align 4
; SSE2-NEXT: .LBB5_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: movq {{.*#+}} xmm3 = mem[0],zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm2
; SSE2-NEXT: psraw $8, %xmm3
; SSE2-NEXT: pmaddwd %xmm2, %xmm3
; SSE2-NEXT: paddd %xmm3, %xmm1
; SSE2-NEXT: addq $16, %rcx
; SSE2-NEXT: cmpq %rcx, %rax
; SSE2-NEXT: jne .LBB5_1
; SSE2-NEXT: # %bb.2: # %middle.block
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
; AVX1-LABEL: _Z9test_charPcS_i_256:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: movl %edx, %eax
; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX1-NEXT: xorl %ecx, %ecx
; AVX1-NEXT: .p2align 4
; AVX1-NEXT: .LBB5_1: # %vector.body
; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
; AVX1-NEXT: vpmovsxbw (%rdi,%rcx), %xmm1
; AVX1-NEXT: vpmovsxbw (%rsi,%rcx), %xmm2
; AVX1-NEXT: vpmaddwd %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm1
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-NEXT: addq $16, %rcx
; AVX1-NEXT: cmpq %rcx, %rax
; AVX1-NEXT: jne .LBB5_1
; AVX1-NEXT: # %bb.2: # %middle.block
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX256-LABEL: _Z9test_charPcS_i_256:
; AVX256: # %bb.0: # %entry
; AVX256-NEXT: movl %edx, %eax
; AVX256-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX256-NEXT: xorl %ecx, %ecx
; AVX256-NEXT: .p2align 4
; AVX256-NEXT: .LBB5_1: # %vector.body
; AVX256-NEXT: # =>This Inner Loop Header: Depth=1
; AVX256-NEXT: vpmovsxbw (%rdi,%rcx), %xmm1
; AVX256-NEXT: vpmovsxbw (%rsi,%rcx), %xmm2
; AVX256-NEXT: vpmaddwd %xmm1, %xmm2, %xmm1
; AVX256-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX256-NEXT: addq $16, %rcx
; AVX256-NEXT: cmpq %rcx, %rax
; AVX256-NEXT: jne .LBB5_1
; AVX256-NEXT: # %bb.2: # %middle.block
; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX256-NEXT: vmovd %xmm0, %eax
; AVX256-NEXT: vzeroupper
; AVX256-NEXT: retq
entry:
  %3 = zext i32 %2 to i64
  br label %vector.body

vector.body:
  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
  %vec.phi = phi <8 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
  %4 = getelementptr inbounds i8, ptr %0, i64 %index
  %5 = bitcast ptr %4 to ptr
  %wide.load = load <8 x i8>, ptr %5, align 1
  %6 = sext <8 x i8> %wide.load to <8 x i32>
  %7 = getelementptr inbounds i8, ptr %1, i64 %index
  %8 = bitcast ptr %7 to ptr
  %wide.load14 = load <8 x i8>, ptr %8, align 1
  %9 = sext <8 x i8> %wide.load14 to <8 x i32>
  %10 = mul nsw <8 x i32> %9, %6
  %11 = add nsw <8 x i32> %10, %vec.phi
  %index.next = add i64 %index, 16
  %12 = icmp eq i64 %index.next, %3
  br i1 %12, label %middle.block, label %vector.body

middle.block:
  %rdx.shuf15 = shufflevector <8 x i32> %11, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx16 = add <8 x i32> %11, %rdx.shuf15
  %rdx.shuf17 = shufflevector <8 x i32> %bin.rdx16, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx18 = add <8 x i32> %bin.rdx16, %rdx.shuf17
  %rdx.shuf19 = shufflevector <8 x i32> %bin.rdx18, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx20 = add <8 x i32> %bin.rdx18, %rdx.shuf19
  %13 = extractelement <8 x i32> %bin.rdx20, i32 0
  ret i32 %13
}

define i32 @_Z9test_charPcS_i_512(ptr nocapture readonly, ptr nocapture readonly, i32) local_unnamed_addr #0 {
; SSE2-LABEL: _Z9test_charPcS_i_512:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movl %edx, %eax
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: xorl %ecx, %ecx
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: .p2align 4
; SSE2-NEXT: .LBB6_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
; SSE2-NEXT: movdqu (%rdi,%rcx), %xmm3
; SSE2-NEXT: movdqu (%rsi,%rcx), %xmm4
; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
; SSE2-NEXT: psraw $8, %xmm5
; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
; SSE2-NEXT: psraw $8, %xmm6
; SSE2-NEXT: pmaddwd %xmm5, %xmm6
; SSE2-NEXT: paddd %xmm6, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: psraw $8, %xmm3
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: psraw $8, %xmm4
; SSE2-NEXT: pmaddwd %xmm3, %xmm4
; SSE2-NEXT: paddd %xmm4, %xmm1
; SSE2-NEXT: addq $16, %rcx
; SSE2-NEXT: cmpq %rcx, %rax
; SSE2-NEXT: jne .LBB6_1
; SSE2-NEXT: # %bb.2: # %middle.block
; SSE2-NEXT: paddd %xmm0, %xmm2
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: paddd %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
; AVX1-LABEL: _Z9test_charPcS_i_512:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: movl %edx, %eax
; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX1-NEXT: xorl %ecx, %ecx
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: .p2align 4
; AVX1-NEXT: .LBB6_1: # %vector.body
; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
; AVX1-NEXT: vpmovsxbw 8(%rdi,%rcx), %xmm2
; AVX1-NEXT: vpmovsxbw (%rdi,%rcx), %xmm3
; AVX1-NEXT: vpmovsxbw 8(%rsi,%rcx), %xmm4
; AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpmovsxbw (%rsi,%rcx), %xmm4
; AVX1-NEXT: vpmaddwd %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: addq $16, %rcx
; AVX1-NEXT: cmpq %rcx, %rax
; AVX1-NEXT: jne .LBB6_1
; AVX1-NEXT: # %bb.2: # %middle.block
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: _Z9test_charPcS_i_512:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: movl %edx, %eax
; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX2-NEXT: xorl %ecx, %ecx
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: .p2align 4
; AVX2-NEXT: .LBB6_1: # %vector.body
; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
; AVX2-NEXT: vpmovsxbw (%rdi,%rcx), %ymm2
; AVX2-NEXT: vpmovsxbw (%rsi,%rcx), %ymm3
; AVX2-NEXT: vpmaddwd %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1
; AVX2-NEXT: addq $16, %rcx
; AVX2-NEXT: cmpq %rcx, %rax
; AVX2-NEXT: jne .LBB6_1
; AVX2-NEXT: # %bb.2: # %middle.block
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: _Z9test_charPcS_i_512:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: movl %edx, %eax
; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512-NEXT: xorl %ecx, %ecx
; AVX512-NEXT: .p2align 4
; AVX512-NEXT: .LBB6_1: # %vector.body
; AVX512-NEXT: # =>This Inner Loop Header: Depth=1
; AVX512-NEXT: vpmovsxbw (%rdi,%rcx), %ymm1
; AVX512-NEXT: vpmovsxbw (%rsi,%rcx), %ymm2
; AVX512-NEXT: vpmaddwd %ymm1, %ymm2, %ymm1
; AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; AVX512-NEXT: addq $16, %rcx
; AVX512-NEXT: cmpq %rcx, %rax
; AVX512-NEXT: jne .LBB6_1
; AVX512-NEXT: # %bb.2: # %middle.block
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
  %3 = zext i32 %2 to i64
  br label %vector.body

vector.body:
  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
  %vec.phi = phi <16 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
  %4 = getelementptr inbounds i8, ptr %0, i64 %index
  %5 = bitcast ptr %4 to ptr
  %wide.load = load <16 x i8>, ptr %5, align 1
  %6 = sext <16 x i8> %wide.load to <16 x i32>
  %7 = getelementptr inbounds i8, ptr %1, i64 %index
  %8 = bitcast ptr %7 to ptr
  %wide.load14 = load <16 x i8>, ptr %8, align 1
  %9 = sext <16 x i8> %wide.load14 to <16 x i32>
  %10 = mul nsw <16 x i32> %9, %6
  %11 = add nsw <16 x i32> %10, %vec.phi
  %index.next = add i64 %index, 16
  %12 = icmp eq i64 %index.next, %3
  br i1 %12, label %middle.block, label %vector.body

middle.block:
  %rdx.shuf = shufflevector <16 x i32> %11, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <16 x i32> %11, %rdx.shuf
  %rdx.shuf15 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx16 = add <16 x i32> %bin.rdx, %rdx.shuf15
  %rdx.shuf17 = shufflevector <16 x i32> %bin.rdx16, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx18 = add <16 x i32> %bin.rdx16, %rdx.shuf17
  %rdx.shuf19 = shufflevector <16 x i32> %bin.rdx18, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx20 = add <16 x i32> %bin.rdx18, %rdx.shuf19
  %13 = extractelement <16 x i32> %bin.rdx20, i32 0
  ret i32 %13
}

define i32 @_Z9test_charPcS_i_1024(ptr nocapture readonly, ptr nocapture readonly, i32) local_unnamed_addr #0 {
; SSE2-LABEL: _Z9test_charPcS_i_1024:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movl %edx, %eax
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: xorl %ecx, %ecx
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: .p2align 4
; SSE2-NEXT: .LBB7_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
; SSE2-NEXT: movdqu (%rdi,%rcx), %xmm7
; SSE2-NEXT: movdqu 16(%rdi,%rcx), %xmm6
; SSE2-NEXT: movdqu (%rsi,%rcx), %xmm8
; SSE2-NEXT: movdqu 16(%rsi,%rcx), %xmm5
; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7]
; SSE2-NEXT: psraw $8, %xmm9
; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7]
; SSE2-NEXT: psraw $8, %xmm10
; SSE2-NEXT: pmaddwd %xmm9, %xmm10
; SSE2-NEXT: paddd %xmm10, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: psraw $8, %xmm7
; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: psraw $8, %xmm8
; SSE2-NEXT: pmaddwd %xmm7, %xmm8
; SSE2-NEXT: paddd %xmm8, %xmm4
; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
; SSE2-NEXT: psraw $8, %xmm7
; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3],xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7]
; SSE2-NEXT: psraw $8, %xmm8
; SSE2-NEXT: pmaddwd %xmm7, %xmm8
; SSE2-NEXT: paddd %xmm8, %xmm1
; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: psraw $8, %xmm6
; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: psraw $8, %xmm5
; SSE2-NEXT: pmaddwd %xmm6, %xmm5
; SSE2-NEXT: paddd %xmm5, %xmm3
; SSE2-NEXT: addq $32, %rcx
; SSE2-NEXT: cmpq %rcx, %rax
; SSE2-NEXT: jne .LBB7_1
; SSE2-NEXT: # %bb.2: # %middle.block
; SSE2-NEXT: paddd %xmm0, %xmm4
; SSE2-NEXT: paddd %xmm0, %xmm3
; SSE2-NEXT: paddd %xmm4, %xmm3
; SSE2-NEXT: paddd %xmm0, %xmm2
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: paddd %xmm2, %xmm1
; SSE2-NEXT: paddd %xmm3, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
; AVX1-LABEL: _Z9test_charPcS_i_1024:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: movl %edx, %eax
; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX1-NEXT: xorl %ecx, %ecx
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: .p2align 4
; AVX1-NEXT: .LBB7_1: # %vector.body
; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
; AVX1-NEXT: vpmovsxbw 24(%rdi,%rcx), %xmm3
; AVX1-NEXT: vpmovsxbw 16(%rdi,%rcx), %xmm4
; AVX1-NEXT: vpmovsxbw 8(%rdi,%rcx), %xmm5
; AVX1-NEXT: vpmovsxbw (%rdi,%rcx), %xmm6
; AVX1-NEXT: vpmovsxbw 24(%rsi,%rcx), %xmm7
; AVX1-NEXT: vpmaddwd %xmm3, %xmm7, %xmm3
; AVX1-NEXT: vpmovsxbw 16(%rsi,%rcx), %xmm7
; AVX1-NEXT: vpmaddwd %xmm4, %xmm7, %xmm4
; AVX1-NEXT: vpmovsxbw 8(%rsi,%rcx), %xmm7
; AVX1-NEXT: vpmaddwd %xmm5, %xmm7, %xmm5
; AVX1-NEXT: vpmovsxbw (%rsi,%rcx), %xmm7
; AVX1-NEXT: vpmaddwd %xmm6, %xmm7, %xmm6
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
; AVX1-NEXT: vpaddd %xmm7, %xmm3, %xmm3
; AVX1-NEXT: vpaddd %xmm1, %xmm4, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT: vpaddd %xmm3, %xmm5, %xmm3
; AVX1-NEXT: vpaddd %xmm2, %xmm6, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX1-NEXT: addq $32, %rcx
; AVX1-NEXT: cmpq %rcx, %rax
; AVX1-NEXT: jne .LBB7_1
; AVX1-NEXT: # %bb.2: # %middle.block
; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm3
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm4
; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpaddd %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: _Z9test_charPcS_i_1024:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: movl %edx, %eax
; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX2-NEXT: xorl %ecx, %ecx
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: .p2align 4
; AVX2-NEXT: .LBB7_1: # %vector.body
; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
; AVX2-NEXT: vpmovsxbw 16(%rdi,%rcx), %ymm3
; AVX2-NEXT: vpmovsxbw (%rdi,%rcx), %ymm4
; AVX2-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm5
; AVX2-NEXT: vpmaddwd %ymm3, %ymm5, %ymm3
; AVX2-NEXT: vpaddd %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpmovsxbw (%rsi,%rcx), %ymm3
; AVX2-NEXT: vpmaddwd %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpaddd %ymm1, %ymm3, %ymm1
; AVX2-NEXT: addq $32, %rcx
; AVX2-NEXT: cmpq %rcx, %rax
; AVX2-NEXT: jne .LBB7_1
; AVX2-NEXT: # %bb.2: # %middle.block
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm1
; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: _Z9test_charPcS_i_1024:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: movl %edx, %eax
; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: xorl %ecx, %ecx
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: .p2align 4
; AVX512F-NEXT: .LBB7_1: # %vector.body
; AVX512F-NEXT: # =>This Inner Loop Header: Depth=1
; AVX512F-NEXT: vpmovsxbw (%rdi,%rcx), %ymm2
; AVX512F-NEXT: vpmovsxbw 16(%rdi,%rcx), %ymm3
; AVX512F-NEXT: vpmovsxbw (%rsi,%rcx), %ymm4
; AVX512F-NEXT: vpmaddwd %ymm2, %ymm4, %ymm2
; AVX512F-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm4
; AVX512F-NEXT: vpmaddwd %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
; AVX512F-NEXT: vpaddd %zmm1, %zmm2, %zmm1
; AVX512F-NEXT: addq $32, %rcx
; AVX512F-NEXT: cmpq %rcx, %rax
; AVX512F-NEXT: jne .LBB7_1
; AVX512F-NEXT: # %bb.2: # %middle.block
; AVX512F-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vmovd %xmm0, %eax
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: _Z9test_charPcS_i_1024:
; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: movl %edx, %eax
; AVX512BW-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT: xorl %ecx, %ecx
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: .p2align 4
; AVX512BW-NEXT: .LBB7_1: # %vector.body
; AVX512BW-NEXT: # =>This Inner Loop Header: Depth=1
; AVX512BW-NEXT: vpmovsxbw (%rdi,%rcx), %zmm2
; AVX512BW-NEXT: vpmovsxbw (%rsi,%rcx), %zmm3
; AVX512BW-NEXT: vpmaddwd %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpaddd %zmm1, %zmm2, %zmm1
; AVX512BW-NEXT: addq $32, %rcx
; AVX512BW-NEXT: cmpq %rcx, %rax
; AVX512BW-NEXT: jne .LBB7_1
; AVX512BW-NEXT: # %bb.2: # %middle.block
; AVX512BW-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vmovd %xmm0, %eax
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
entry:
  %3 = zext i32 %2 to i64
  br label %vector.body

vector.body:
  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
  %vec.phi = phi <32 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
  %4 = getelementptr inbounds i8, ptr %0, i64 %index
  %5 = bitcast ptr %4 to ptr
  %wide.load = load <32 x i8>, ptr %5, align 1
  %6 = sext <32 x i8> %wide.load to <32 x i32>
  %7 = getelementptr inbounds i8, ptr %1, i64 %index
  %8 = bitcast ptr %7 to ptr
  %wide.load14 = load <32 x i8>, ptr %8, align 1
  %9 = sext <32 x i8> %wide.load14 to <32 x i32>
  %10 = mul nsw <32 x i32> %9, %6
  %11 = add nsw <32 x i32> %10, %vec.phi
  %index.next = add i64 %index, 32
  %12 = icmp eq i64 %index.next, %3
  br i1 %12, label %middle.block, label %vector.body

middle.block:
  %rdx.shuf1 = shufflevector <32 x i32> %11, <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx1 = add <32 x i32> %11, %rdx.shuf1
  %rdx.shuf = shufflevector <32 x i32> %bin.rdx1, <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <32 x i32> %bin.rdx1, %rdx.shuf
  %rdx.shuf15 = shufflevector <32 x i32> %bin.rdx, <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx32 = add <32 x i32> %bin.rdx, %rdx.shuf15
  %rdx.shuf17 = shufflevector <32 x i32> %bin.rdx32, <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx18 = add <32 x i32> %bin.rdx32, %rdx.shuf17
  %rdx.shuf19 = shufflevector <32 x i32> %bin.rdx18, <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx20 = add <32 x i32> %bin.rdx18, %rdx.shuf19
  %13 = extractelement <32 x i32> %bin.rdx20, i32 0
  ret i32 %13
}

define i32 @test_unsigned_short_128(ptr nocapture readonly, ptr nocapture readonly, i32) local_unnamed_addr #0 {
; SSE2-LABEL: test_unsigned_short_128:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movl %edx, %eax
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: xorl %ecx, %ecx
; SSE2-NEXT: .p2align 4
; SSE2-NEXT: .LBB8_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: pmulhuw %xmm1, %xmm3
; SSE2-NEXT: pmullw %xmm1, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: addq $16, %rcx
; SSE2-NEXT: cmpq %rcx, %rax
; SSE2-NEXT: jne .LBB8_1
; SSE2-NEXT: # %bb.2: # %middle.block
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: retq
;
; AVX-LABEL: test_unsigned_short_128:
; AVX: # %bb.0: # %entry
; AVX-NEXT: movl %edx, %eax
; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX-NEXT: xorl %ecx, %ecx
; AVX-NEXT: .p2align 4
; AVX-NEXT: .LBB8_1: # %vector.body
; AVX-NEXT: # =>This Inner Loop Header: Depth=1
; AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX-NEXT: vpmulld %xmm1, %xmm2, %xmm1
; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX-NEXT: addq $16, %rcx
; AVX-NEXT: cmpq %rcx, %rax
; AVX-NEXT: jne .LBB8_1
; AVX-NEXT: # %bb.2: # %middle.block
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
entry:
  %3 = zext i32 %2 to i64
  br label %vector.body

vector.body:
  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
  %vec.phi = phi <4 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
  %4 = getelementptr inbounds i16, ptr %0, i64 %index
  %5 = bitcast ptr %4 to ptr
  %wide.load = load <4 x i16>, ptr %5, align 2
  %6 = zext <4 x i16> %wide.load to <4 x i32>
  %7 = getelementptr inbounds i16, ptr %1, i64 %index
  %8 = bitcast ptr %7 to ptr
  %wide.load14 = load <4 x i16>, ptr %8, align 2
  %9 = zext <4 x i16> %wide.load14 to <4 x i32>
  %10 = mul nsw <4 x i32> %9, %6
  %11 = add nsw <4 x i32> %10, %vec.phi
  %index.next = add i64 %index, 16
  %12 = icmp eq i64 %index.next, %3
  br i1 %12, label %middle.block, label %vector.body

middle.block:
  %rdx.shuf15 = shufflevector <4 x i32> %11, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %bin.rdx16 = add <4 x i32> %11, %rdx.shuf15
  %rdx.shuf17 = shufflevector <4 x i32> %bin.rdx16, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx18 = add <4 x i32> %bin.rdx16, %rdx.shuf17
  %13 = extractelement <4 x i32> %bin.rdx18, i32 0
  ret i32 %13
}

define i32 @test_unsigned_short_256(ptr nocapture readonly, ptr nocapture readonly, i32) local_unnamed_addr #0 {
; SSE2-LABEL: test_unsigned_short_256:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movl %edx, %eax
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: xorl %ecx, %ecx
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: .p2align 4
; SSE2-NEXT: .LBB9_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
; SSE2-NEXT: movdqu (%rdi,%rcx,2), %xmm2
; SSE2-NEXT: movdqu (%rsi,%rcx,2), %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pmulhuw %xmm2, %xmm4
; SSE2-NEXT: pmullw %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SSE2-NEXT: paddd %xmm3, %xmm1
; SSE2-NEXT: addq $16, %rcx
; SSE2-NEXT: cmpq %rcx, %rax
; SSE2-NEXT: jne .LBB9_1
; SSE2-NEXT: # %bb.2: # %middle.block
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: retq
;
; AVX1-LABEL: test_unsigned_short_256:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: movl %edx, %eax
; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX1-NEXT: xorl %ecx, %ecx
; AVX1-NEXT: .p2align 4
; AVX1-NEXT: .LBB9_1: # %vector.body
; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX1-NEXT: vpmulld %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX1-NEXT: vpmulld %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: addq $16, %rcx
; AVX1-NEXT: cmpq %rcx, %rax
; AVX1-NEXT: jne .LBB9_1
; AVX1-NEXT: # %bb.2: # %middle.block
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 1314; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1315; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 1316; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1317; AVX1-NEXT: vmovd %xmm0, %eax 1318; AVX1-NEXT: vzeroupper 1319; AVX1-NEXT: retq 1320; 1321; AVX256-LABEL: test_unsigned_short_256: 1322; AVX256: # %bb.0: # %entry 1323; AVX256-NEXT: movl %edx, %eax 1324; AVX256-NEXT: vpxor %xmm0, %xmm0, %xmm0 1325; AVX256-NEXT: xorl %ecx, %ecx 1326; AVX256-NEXT: .p2align 4 1327; AVX256-NEXT: .LBB9_1: # %vector.body 1328; AVX256-NEXT: # =>This Inner Loop Header: Depth=1 1329; AVX256-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 1330; AVX256-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 1331; AVX256-NEXT: vpmulld %ymm1, %ymm2, %ymm1 1332; AVX256-NEXT: vpaddd %ymm0, %ymm1, %ymm0 1333; AVX256-NEXT: addq $16, %rcx 1334; AVX256-NEXT: cmpq %rcx, %rax 1335; AVX256-NEXT: jne .LBB9_1 1336; AVX256-NEXT: # %bb.2: # %middle.block 1337; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1 1338; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1339; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 1340; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1341; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 1342; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1343; AVX256-NEXT: vmovd %xmm0, %eax 1344; AVX256-NEXT: vzeroupper 1345; AVX256-NEXT: retq 1346entry: 1347 %3 = zext i32 %2 to i64 1348 br label %vector.body 1349 1350vector.body: 1351 %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ] 1352 %vec.phi = phi <8 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ] 1353 %4 = getelementptr inbounds i16, ptr %0, i64 %index 1354 %5 = bitcast ptr %4 to ptr 1355 %wide.load = load <8 x i16>, ptr %5, align 2 1356 %6 = zext <8 x i16> %wide.load to <8 x i32> 1357 %7 = getelementptr inbounds i16, ptr %1, i64 %index 1358 %8 = bitcast ptr %7 to ptr 1359 %wide.load14 = load <8 x i16>, ptr %8, align 2 1360 %9 = zext <8 x i16> %wide.load14 to <8 x i32> 1361 %10 = mul nsw <8 x i32> %9, %6 1362 %11 = add nsw <8 x i32> %10, %vec.phi 1363 %index.next = add i64 %index, 16 1364 %12 = icmp eq i64 %index.next, %3 1365 br i1 %12, label %middle.block, label %vector.body 1366 1367middle.block: 1368 %rdx.shuf = shufflevector <8 x i32> %11, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> 1369 %bin.rdx = add <8 x i32> %11, %rdx.shuf 1370 %rdx.shuf15 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1371 %bin.rdx16 = add <8 x i32> %bin.rdx, %rdx.shuf15 1372 %rdx.shuf17 = shufflevector <8 x i32> %bin.rdx16, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1373 %bin.rdx18 = add <8 x i32> %bin.rdx16, %rdx.shuf17 1374 %13 = extractelement <8 x i32> %bin.rdx18, i32 0 1375 ret i32 %13 1376} 1377 1378define i32 @test_unsigned_short_512(ptr nocapture readonly, ptr nocapture readonly, i32) local_unnamed_addr #0 { 1379; SSE2-LABEL: test_unsigned_short_512: 1380; SSE2: # %bb.0: # %entry 1381; SSE2-NEXT: movl %edx, %eax 1382; SSE2-NEXT: pxor %xmm0, %xmm0 1383; SSE2-NEXT: xorl %ecx, %ecx 1384; SSE2-NEXT: pxor %xmm1, %xmm1 1385; SSE2-NEXT: pxor %xmm3, %xmm3 1386; SSE2-NEXT: pxor %xmm2, %xmm2 1387; SSE2-NEXT: .p2align 4 1388; SSE2-NEXT: .LBB10_1: # %vector.body 1389; 
SSE2-NEXT: # =>This Inner Loop Header: Depth=1 1390; SSE2-NEXT: movdqu (%rdi,%rcx,2), %xmm4 1391; SSE2-NEXT: movdqu 16(%rdi,%rcx,2), %xmm5 1392; SSE2-NEXT: movdqu (%rsi,%rcx,2), %xmm6 1393; SSE2-NEXT: movdqu 16(%rsi,%rcx,2), %xmm7 1394; SSE2-NEXT: movdqa %xmm6, %xmm8 1395; SSE2-NEXT: pmulhuw %xmm4, %xmm8 1396; SSE2-NEXT: pmullw %xmm4, %xmm6 1397; SSE2-NEXT: movdqa %xmm6, %xmm4 1398; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3] 1399; SSE2-NEXT: paddd %xmm4, %xmm0 1400; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] 1401; SSE2-NEXT: paddd %xmm6, %xmm1 1402; SSE2-NEXT: movdqa %xmm7, %xmm4 1403; SSE2-NEXT: pmulhuw %xmm5, %xmm4 1404; SSE2-NEXT: pmullw %xmm5, %xmm7 1405; SSE2-NEXT: movdqa %xmm7, %xmm5 1406; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] 1407; SSE2-NEXT: paddd %xmm5, %xmm3 1408; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] 1409; SSE2-NEXT: paddd %xmm7, %xmm2 1410; SSE2-NEXT: addq $16, %rcx 1411; SSE2-NEXT: cmpq %rcx, %rax 1412; SSE2-NEXT: jne .LBB10_1 1413; SSE2-NEXT: # %bb.2: # %middle.block 1414; SSE2-NEXT: paddd %xmm3, %xmm0 1415; SSE2-NEXT: paddd %xmm2, %xmm1 1416; SSE2-NEXT: paddd %xmm0, %xmm1 1417; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] 1418; SSE2-NEXT: paddd %xmm1, %xmm0 1419; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 1420; SSE2-NEXT: paddd %xmm0, %xmm1 1421; SSE2-NEXT: movd %xmm1, %eax 1422; SSE2-NEXT: retq 1423; 1424; AVX1-LABEL: test_unsigned_short_512: 1425; AVX1: # %bb.0: # %entry 1426; AVX1-NEXT: movl %edx, %eax 1427; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 1428; AVX1-NEXT: xorl %ecx, %ecx 1429; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 1430; AVX1-NEXT: .p2align 4 1431; AVX1-NEXT: .LBB10_1: # %vector.body 1432; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 1433; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1434; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1435; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1436; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1437; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1438; AVX1-NEXT: vpmulld %xmm2, %xmm6, %xmm2 1439; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1440; AVX1-NEXT: vpmulld %xmm3, %xmm6, %xmm3 1441; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1442; AVX1-NEXT: vpmulld %xmm4, %xmm6, %xmm4 1443; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1444; AVX1-NEXT: vpmulld %xmm5, %xmm6, %xmm5 1445; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 1446; AVX1-NEXT: vpaddd %xmm6, %xmm2, %xmm2 1447; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1 1448; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 1449; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1450; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2 1451; AVX1-NEXT: vpaddd %xmm0, %xmm5, %xmm0 1452; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1453; AVX1-NEXT: addq $16, %rcx 1454; AVX1-NEXT: cmpq %rcx, %rax 1455; AVX1-NEXT: jne .LBB10_1 1456; AVX1-NEXT: # %bb.2: # %middle.block 1457; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1458; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 1459; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 1460; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1461; 
AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 1462; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 1463; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1464; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 1465; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1466; AVX1-NEXT: vmovd %xmm0, %eax 1467; AVX1-NEXT: vzeroupper 1468; AVX1-NEXT: retq 1469; 1470; AVX2-LABEL: test_unsigned_short_512: 1471; AVX2: # %bb.0: # %entry 1472; AVX2-NEXT: movl %edx, %eax 1473; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 1474; AVX2-NEXT: xorl %ecx, %ecx 1475; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 1476; AVX2-NEXT: .p2align 4 1477; AVX2-NEXT: .LBB10_1: # %vector.body 1478; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 1479; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 1480; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 1481; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 1482; AVX2-NEXT: vpmulld %ymm2, %ymm4, %ymm2 1483; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1 1484; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 1485; AVX2-NEXT: vpmulld %ymm3, %ymm2, %ymm2 1486; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0 1487; AVX2-NEXT: addq $16, %rcx 1488; AVX2-NEXT: cmpq %rcx, %rax 1489; AVX2-NEXT: jne .LBB10_1 1490; AVX2-NEXT: # %bb.2: # %middle.block 1491; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 1492; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1493; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1494; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 1495; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1496; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 1497; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1498; AVX2-NEXT: vmovd %xmm0, %eax 1499; AVX2-NEXT: vzeroupper 1500; AVX2-NEXT: retq 1501; 1502; AVX512-LABEL: test_unsigned_short_512: 1503; AVX512: # %bb.0: # %entry 1504; AVX512-NEXT: movl %edx, %eax 1505; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 1506; AVX512-NEXT: xorl %ecx, %ecx 1507; AVX512-NEXT: .p2align 4 1508; AVX512-NEXT: .LBB10_1: # %vector.body 1509; AVX512-NEXT: # =>This Inner Loop Header: Depth=1 1510; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero 1511; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero 1512; AVX512-NEXT: vpmulld %zmm1, %zmm2, %zmm1 1513; AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0 1514; AVX512-NEXT: addq $16, %rcx 1515; AVX512-NEXT: cmpq %rcx, %rax 1516; AVX512-NEXT: jne .LBB10_1 1517; AVX512-NEXT: # %bb.2: # %middle.block 1518; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 1519; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 1520; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 1521; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1522; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 1523; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1524; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 1525; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1526; AVX512-NEXT: vmovd %xmm0, %eax 1527; AVX512-NEXT: vzeroupper 1528; AVX512-NEXT: retq 1529entry: 1530 %3 = zext i32 %2 to i64 1531 br 
label %vector.body 1532 1533vector.body: 1534 %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ] 1535 %vec.phi = phi <16 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ] 1536 %4 = getelementptr inbounds i16, ptr %0, i64 %index 1537 %5 = bitcast ptr %4 to ptr 1538 %wide.load = load <16 x i16>, ptr %5, align 2 1539 %6 = zext <16 x i16> %wide.load to <16 x i32> 1540 %7 = getelementptr inbounds i16, ptr %1, i64 %index 1541 %8 = bitcast ptr %7 to ptr 1542 %wide.load14 = load <16 x i16>, ptr %8, align 2 1543 %9 = zext <16 x i16> %wide.load14 to <16 x i32> 1544 %10 = mul nsw <16 x i32> %9, %6 1545 %11 = add nsw <16 x i32> %10, %vec.phi 1546 %index.next = add i64 %index, 16 1547 %12 = icmp eq i64 %index.next, %3 1548 br i1 %12, label %middle.block, label %vector.body 1549 1550middle.block: 1551 %rdx.shuf1 = shufflevector <16 x i32> %11, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1552 %bin.rdx1 = add <16 x i32> %11, %rdx.shuf1 1553 %rdx.shuf = shufflevector <16 x i32> %bin.rdx1, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1554 %bin.rdx = add <16 x i32> %bin.rdx1, %rdx.shuf 1555 %rdx.shuf15 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1556 %bin.rdx16 = add <16 x i32> %bin.rdx, %rdx.shuf15 1557 %rdx.shuf17 = shufflevector <16 x i32> %bin.rdx16, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1558 %bin.rdx18 = add <16 x i32> %bin.rdx16, %rdx.shuf17 1559 %13 = extractelement <16 x i32> %bin.rdx18, i32 0 1560 ret i32 %13 1561} 1562 1563define i32 @test_unsigned_short_1024(ptr nocapture readonly, ptr nocapture readonly, i32) local_unnamed_addr #0 { 1564; SSE2-LABEL: test_unsigned_short_1024: 1565; SSE2: # %bb.0: # %entry 1566; SSE2-NEXT: movl %edx, %eax 1567; SSE2-NEXT: pxor %xmm0, %xmm0 1568; SSE2-NEXT: xorl %ecx, %ecx 1569; SSE2-NEXT: pxor %xmm3, %xmm3 1570; SSE2-NEXT: pxor %xmm1, %xmm1 1571; SSE2-NEXT: pxor %xmm2, %xmm2 1572; SSE2-NEXT: pxor %xmm4, %xmm4 1573; SSE2-NEXT: pxor %xmm6, %xmm6 1574; SSE2-NEXT: pxor %xmm5, %xmm5 1575; SSE2-NEXT: pxor %xmm7, %xmm7 1576; SSE2-NEXT: .p2align 4 1577; SSE2-NEXT: .LBB11_1: # %vector.body 1578; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 1579; SSE2-NEXT: movdqu 48(%rdi,%rcx,2), %xmm8 1580; SSE2-NEXT: movdqu 48(%rsi,%rcx,2), %xmm9 1581; SSE2-NEXT: movdqa %xmm9, %xmm10 1582; SSE2-NEXT: pmulhuw %xmm8, %xmm10 1583; SSE2-NEXT: pmullw %xmm8, %xmm9 1584; SSE2-NEXT: movdqa %xmm9, %xmm8 1585; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] 1586; SSE2-NEXT: paddd %xmm8, %xmm7 1587; SSE2-NEXT: movdqu 32(%rdi,%rcx,2), %xmm8 1588; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] 1589; SSE2-NEXT: movdqu 32(%rsi,%rcx,2), %xmm10 1590; SSE2-NEXT: paddd %xmm9, %xmm5 1591; SSE2-NEXT: movdqa %xmm10, %xmm9 1592; SSE2-NEXT: pmulhuw %xmm8, %xmm9 1593; SSE2-NEXT: pmullw %xmm8, %xmm10 1594; SSE2-NEXT: movdqa %xmm10, %xmm8 1595; SSE2-NEXT: 
punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] 1596; SSE2-NEXT: paddd %xmm8, %xmm6 1597; SSE2-NEXT: movdqu (%rdi,%rcx,2), %xmm8 1598; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] 1599; SSE2-NEXT: movdqu (%rsi,%rcx,2), %xmm9 1600; SSE2-NEXT: paddd %xmm10, %xmm4 1601; SSE2-NEXT: movdqa %xmm9, %xmm10 1602; SSE2-NEXT: pmulhuw %xmm8, %xmm10 1603; SSE2-NEXT: pmullw %xmm8, %xmm9 1604; SSE2-NEXT: movdqa %xmm9, %xmm8 1605; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3] 1606; SSE2-NEXT: paddd %xmm8, %xmm0 1607; SSE2-NEXT: movdqu 16(%rdi,%rcx,2), %xmm8 1608; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] 1609; SSE2-NEXT: movdqu 16(%rsi,%rcx,2), %xmm10 1610; SSE2-NEXT: paddd %xmm9, %xmm3 1611; SSE2-NEXT: movdqa %xmm10, %xmm9 1612; SSE2-NEXT: pmulhuw %xmm8, %xmm9 1613; SSE2-NEXT: pmullw %xmm8, %xmm10 1614; SSE2-NEXT: movdqa %xmm10, %xmm8 1615; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] 1616; SSE2-NEXT: paddd %xmm8, %xmm1 1617; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] 1618; SSE2-NEXT: paddd %xmm10, %xmm2 1619; SSE2-NEXT: addq $16, %rcx 1620; SSE2-NEXT: cmpq %rcx, %rax 1621; SSE2-NEXT: jne .LBB11_1 1622; SSE2-NEXT: # %bb.2: # %middle.block 1623; SSE2-NEXT: paddd %xmm6, %xmm3 1624; SSE2-NEXT: paddd %xmm7, %xmm2 1625; SSE2-NEXT: paddd %xmm3, %xmm2 1626; SSE2-NEXT: paddd %xmm4, %xmm0 1627; SSE2-NEXT: paddd %xmm5, %xmm1 1628; SSE2-NEXT: paddd %xmm0, %xmm1 1629; SSE2-NEXT: paddd %xmm2, %xmm1 1630; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] 1631; SSE2-NEXT: paddd %xmm1, %xmm0 1632; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 1633; SSE2-NEXT: paddd %xmm0, %xmm1 1634; SSE2-NEXT: movd %xmm1, %eax 1635; SSE2-NEXT: retq 1636; 1637; AVX1-LABEL: test_unsigned_short_1024: 1638; AVX1: # %bb.0: # %entry 1639; AVX1-NEXT: movl %edx, %eax 1640; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 1641; AVX1-NEXT: xorl %ecx, %ecx 1642; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 1643; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 1644; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 1645; AVX1-NEXT: .p2align 4 1646; AVX1-NEXT: .LBB11_1: # %vector.body 1647; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 1648; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1649; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1650; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1651; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1652; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm8 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1653; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm9 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1654; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm10 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1655; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm11 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1656; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm12 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1657; AVX1-NEXT: vpmulld %xmm4, %xmm12, %xmm4 1658; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm12 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1659; AVX1-NEXT: vpmulld %xmm5, %xmm12, %xmm5 1660; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm12 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1661; AVX1-NEXT: vpmulld %xmm6, %xmm12, 
%xmm6 1662; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm12 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1663; AVX1-NEXT: vpmulld %xmm7, %xmm12, %xmm7 1664; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm12 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1665; AVX1-NEXT: vpmulld %xmm8, %xmm12, %xmm8 1666; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm12 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1667; AVX1-NEXT: vpmulld %xmm9, %xmm12, %xmm9 1668; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm12 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1669; AVX1-NEXT: vpmulld %xmm10, %xmm12, %xmm10 1670; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm12 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1671; AVX1-NEXT: vpmulld %xmm11, %xmm12, %xmm11 1672; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm12 1673; AVX1-NEXT: vpaddd %xmm4, %xmm12, %xmm4 1674; AVX1-NEXT: vpaddd %xmm1, %xmm5, %xmm1 1675; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 1676; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 1677; AVX1-NEXT: vpaddd %xmm4, %xmm6, %xmm4 1678; AVX1-NEXT: vpaddd %xmm0, %xmm7, %xmm0 1679; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 1680; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 1681; AVX1-NEXT: vpaddd %xmm4, %xmm8, %xmm4 1682; AVX1-NEXT: vpaddd %xmm3, %xmm9, %xmm3 1683; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 1684; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 1685; AVX1-NEXT: vpaddd %xmm4, %xmm10, %xmm4 1686; AVX1-NEXT: vpaddd %xmm2, %xmm11, %xmm2 1687; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 1688; AVX1-NEXT: addq $16, %rcx 1689; AVX1-NEXT: cmpq %rcx, %rax 1690; AVX1-NEXT: jne .LBB11_1 1691; AVX1-NEXT: # %bb.2: # %middle.block 1692; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm4 1693; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm5 1694; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4 1695; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1696; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 1697; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0 1698; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 1699; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 1700; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 1701; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1702; AVX1-NEXT: vpaddd %xmm0, %xmm4, %xmm0 1703; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 1704; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1705; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 1706; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1707; AVX1-NEXT: vmovd %xmm0, %eax 1708; AVX1-NEXT: vzeroupper 1709; AVX1-NEXT: retq 1710; 1711; AVX2-LABEL: test_unsigned_short_1024: 1712; AVX2: # %bb.0: # %entry 1713; AVX2-NEXT: movl %edx, %eax 1714; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 1715; AVX2-NEXT: xorl %ecx, %ecx 1716; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 1717; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 1718; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 1719; AVX2-NEXT: .p2align 4 1720; AVX2-NEXT: .LBB11_1: # %vector.body 1721; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 1722; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 1723; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 1724; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 1725; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 1726; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm8 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 1727; AVX2-NEXT: vpmulld %ymm4, %ymm8, %ymm4 
1728; AVX2-NEXT: vpaddd %ymm2, %ymm4, %ymm2 1729; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 1730; AVX2-NEXT: vpmulld %ymm5, %ymm4, %ymm4 1731; AVX2-NEXT: vpaddd %ymm1, %ymm4, %ymm1 1732; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 1733; AVX2-NEXT: vpmulld %ymm6, %ymm4, %ymm4 1734; AVX2-NEXT: vpaddd %ymm0, %ymm4, %ymm0 1735; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 1736; AVX2-NEXT: vpmulld %ymm7, %ymm4, %ymm4 1737; AVX2-NEXT: vpaddd %ymm3, %ymm4, %ymm3 1738; AVX2-NEXT: addq $16, %rcx 1739; AVX2-NEXT: cmpq %rcx, %rax 1740; AVX2-NEXT: jne .LBB11_1 1741; AVX2-NEXT: # %bb.2: # %middle.block 1742; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 1743; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1 1744; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 1745; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1746; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1747; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 1748; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1749; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 1750; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1751; AVX2-NEXT: vmovd %xmm0, %eax 1752; AVX2-NEXT: vzeroupper 1753; AVX2-NEXT: retq 1754; 1755; AVX512-LABEL: test_unsigned_short_1024: 1756; AVX512: # %bb.0: # %entry 1757; AVX512-NEXT: movl %edx, %eax 1758; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 1759; AVX512-NEXT: xorl %ecx, %ecx 1760; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 1761; AVX512-NEXT: .p2align 4 1762; AVX512-NEXT: .LBB11_1: # %vector.body 1763; AVX512-NEXT: # =>This Inner Loop Header: Depth=1 1764; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero 1765; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero 1766; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero 1767; AVX512-NEXT: vpmulld %zmm2, %zmm4, %zmm2 1768; AVX512-NEXT: vpaddd %zmm1, %zmm2, %zmm1 1769; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero 1770; AVX512-NEXT: vpmulld %zmm3, %zmm2, %zmm2 1771; AVX512-NEXT: vpaddd %zmm0, %zmm2, %zmm0 1772; AVX512-NEXT: addq $16, %rcx 1773; AVX512-NEXT: cmpq %rcx, %rax 1774; AVX512-NEXT: jne .LBB11_1 1775; AVX512-NEXT: # %bb.2: # %middle.block 1776; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 1777; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 1778; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 1779; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 1780; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1781; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 1782; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1783; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 1784; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1785; AVX512-NEXT: vmovd %xmm0, %eax 1786; AVX512-NEXT: 
vzeroupper 1787; AVX512-NEXT: retq 1788entry: 1789 %3 = zext i32 %2 to i64 1790 br label %vector.body 1791 1792vector.body: 1793 %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ] 1794 %vec.phi = phi <32 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ] 1795 %4 = getelementptr inbounds i16, ptr %0, i64 %index 1796 %5 = bitcast ptr %4 to ptr 1797 %wide.load = load <32 x i16>, ptr %5, align 2 1798 %6 = zext <32 x i16> %wide.load to <32 x i32> 1799 %7 = getelementptr inbounds i16, ptr %1, i64 %index 1800 %8 = bitcast ptr %7 to ptr 1801 %wide.load14 = load <32 x i16>, ptr %8, align 2 1802 %9 = zext <32 x i16> %wide.load14 to <32 x i32> 1803 %10 = mul nsw <32 x i32> %9, %6 1804 %11 = add nsw <32 x i32> %10, %vec.phi 1805 %index.next = add i64 %index, 16 1806 %12 = icmp eq i64 %index.next, %3 1807 br i1 %12, label %middle.block, label %vector.body 1808 1809middle.block: 1810 %rdx.shuf2 = shufflevector <32 x i32> %11, <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1811 %bin.rdx2 = add <32 x i32> %11, %rdx.shuf2 1812 %rdx.shuf1 = shufflevector <32 x i32> %bin.rdx2, <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1813 %bin.rdx1 = add <32 x i32> %bin.rdx2, %rdx.shuf1 1814 %rdx.shuf = shufflevector <32 x i32> %bin.rdx1, <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1815 %bin.rdx = add <32 x i32> %bin.rdx1, %rdx.shuf 1816 %rdx.shuf15 = shufflevector <32 x i32> %bin.rdx, <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1817 %bin.rdx16 = add <32 x i32> %bin.rdx, %rdx.shuf15 1818 %rdx.shuf17 = shufflevector <32 x i32> %bin.rdx16, <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1819 %bin.rdx18 = add <32 x i32> %bin.rdx16, %rdx.shuf17 1820 %13 = extractelement <32 x i32> %bin.rdx18, i32 0 1821 ret i32 %13 1822} 1823 1824define <4 x i32> @pmaddwd_8(<8 x i16> %A, <8 x i16> %B) { 1825; SSE2-LABEL: pmaddwd_8: 1826; SSE2: # %bb.0: 1827; SSE2-NEXT: pmaddwd %xmm1, %xmm0 1828; SSE2-NEXT: retq 1829; 1830; AVX-LABEL: pmaddwd_8: 1831; AVX: # %bb.0: 1832; 
AVX-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 1833; AVX-NEXT: retq 1834 %a = sext <8 x i16> %A to <8 x i32> 1835 %b = sext <8 x i16> %B to <8 x i32> 1836 %m = mul nsw <8 x i32> %a, %b 1837 %odd = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> 1838 %even = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7> 1839 %ret = add <4 x i32> %odd, %even 1840 ret <4 x i32> %ret 1841} 1842 1843define <4 x i32> @pmaddwd_8_swapped(<8 x i16> %A, <8 x i16> %B) { 1844; SSE2-LABEL: pmaddwd_8_swapped: 1845; SSE2: # %bb.0: 1846; SSE2-NEXT: pmaddwd %xmm1, %xmm0 1847; SSE2-NEXT: retq 1848; 1849; AVX-LABEL: pmaddwd_8_swapped: 1850; AVX: # %bb.0: 1851; AVX-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 1852; AVX-NEXT: retq 1853 %a = sext <8 x i16> %A to <8 x i32> 1854 %b = sext <8 x i16> %B to <8 x i32> 1855 %m = mul nsw <8 x i32> %a, %b 1856 %odd = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> 1857 %even = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7> 1858 %ret = add <4 x i32> %even, %odd 1859 ret <4 x i32> %ret 1860} 1861 1862; FIXME: SSE fails to match PMADDWD 1863define <4 x i32> @larger_mul(<16 x i16> %A, <16 x i16> %B) { 1864; SSE2-LABEL: larger_mul: 1865; SSE2: # %bb.0: 1866; SSE2-NEXT: movdqa %xmm0, %xmm1 1867; SSE2-NEXT: pmulhw %xmm2, %xmm1 1868; SSE2-NEXT: pmullw %xmm2, %xmm0 1869; SSE2-NEXT: movdqa %xmm0, %xmm2 1870; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 1871; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1872; SSE2-NEXT: movdqa %xmm0, %xmm1 1873; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] 1874; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3] 1875; SSE2-NEXT: paddd %xmm1, %xmm0 1876; SSE2-NEXT: retq 1877; 1878; AVX1-LABEL: larger_mul: 1879; AVX1: # %bb.0: 1880; AVX1-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 1881; AVX1-NEXT: vzeroupper 1882; AVX1-NEXT: retq 1883; 1884; AVX2-LABEL: larger_mul: 1885; AVX2: # %bb.0: 1886; AVX2-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 1887; AVX2-NEXT: vzeroupper 1888; AVX2-NEXT: retq 1889; 1890; AVX512-LABEL: larger_mul: 1891; AVX512: # %bb.0: 1892; AVX512-NEXT: vpmovsxwd %ymm0, %zmm0 1893; AVX512-NEXT: vpmovsxwd %ymm1, %zmm1 1894; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0 1895; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 1896; AVX512-NEXT: vphaddd %xmm1, %xmm0, %xmm0 1897; AVX512-NEXT: vzeroupper 1898; AVX512-NEXT: retq 1899 %a = sext <16 x i16> %A to <16 x i32> 1900 %b = sext <16 x i16> %B to <16 x i32> 1901 %m = mul nsw <16 x i32> %a, %b 1902 %odd = shufflevector <16 x i32> %m, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> 1903 %even = shufflevector <16 x i32> %m, <16 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7> 1904 %ret = add <4 x i32> %odd, %even 1905 ret <4 x i32> %ret 1906} 1907 1908define <8 x i32> @pmaddwd_16(<16 x i16> %A, <16 x i16> %B) { 1909; SSE2-LABEL: pmaddwd_16: 1910; SSE2: # %bb.0: 1911; SSE2-NEXT: pmaddwd %xmm2, %xmm0 1912; SSE2-NEXT: pmaddwd %xmm3, %xmm1 1913; SSE2-NEXT: retq 1914; 1915; AVX1-LABEL: pmaddwd_16: 1916; AVX1: # %bb.0: 1917; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1918; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 1919; AVX1-NEXT: vpmaddwd %xmm2, %xmm3, %xmm2 1920; AVX1-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 1921; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1922; AVX1-NEXT: retq 1923; 1924; AVX256-LABEL: pmaddwd_16: 1925; AVX256: # %bb.0: 1926; AVX256-NEXT: vpmaddwd %ymm1, %ymm0, 
%ymm0 1927; AVX256-NEXT: retq 1928 %a = sext <16 x i16> %A to <16 x i32> 1929 %b = sext <16 x i16> %B to <16 x i32> 1930 %m = mul nsw <16 x i32> %a, %b 1931 %odd = shufflevector <16 x i32> %m, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> 1932 %even = shufflevector <16 x i32> %m, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> 1933 %ret = add <8 x i32> %odd, %even 1934 ret <8 x i32> %ret 1935} 1936 1937define <16 x i32> @pmaddwd_32(<32 x i16> %A, <32 x i16> %B) { 1938; SSE2-LABEL: pmaddwd_32: 1939; SSE2: # %bb.0: 1940; SSE2-NEXT: pmaddwd %xmm4, %xmm0 1941; SSE2-NEXT: pmaddwd %xmm5, %xmm1 1942; SSE2-NEXT: pmaddwd %xmm6, %xmm2 1943; SSE2-NEXT: pmaddwd %xmm7, %xmm3 1944; SSE2-NEXT: retq 1945; 1946; AVX1-LABEL: pmaddwd_32: 1947; AVX1: # %bb.0: 1948; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 1949; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 1950; AVX1-NEXT: vpmaddwd %xmm4, %xmm5, %xmm4 1951; AVX1-NEXT: vpmaddwd %xmm2, %xmm0, %xmm0 1952; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 1953; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 1954; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 1955; AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2 1956; AVX1-NEXT: vpmaddwd %xmm3, %xmm1, %xmm1 1957; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 1958; AVX1-NEXT: retq 1959; 1960; AVX2-LABEL: pmaddwd_32: 1961; AVX2: # %bb.0: 1962; AVX2-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0 1963; AVX2-NEXT: vpmaddwd %ymm3, %ymm1, %ymm1 1964; AVX2-NEXT: retq 1965; 1966; AVX512F-LABEL: pmaddwd_32: 1967; AVX512F: # %bb.0: 1968; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 1969; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 1970; AVX512F-NEXT: vpmaddwd %ymm2, %ymm3, %ymm2 1971; AVX512F-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 1972; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 1973; AVX512F-NEXT: retq 1974; 1975; AVX512BW-LABEL: pmaddwd_32: 1976; AVX512BW: # %bb.0: 1977; AVX512BW-NEXT: vpmaddwd %zmm1, %zmm0, %zmm0 1978; AVX512BW-NEXT: retq 1979 %a = sext <32 x i16> %A to <32 x i32> 1980 %b = sext <32 x i16> %B to <32 x i32> 1981 %m = mul nsw <32 x i32> %a, %b 1982 %odd = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30> 1983 %even = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31> 1984 %ret = add <16 x i32> %odd, %even 1985 ret <16 x i32> %ret 1986} 1987 1988define <4 x i32> @pmaddwd_const(<8 x i16> %A) { 1989; SSE2-LABEL: pmaddwd_const: 1990; SSE2: # %bb.0: 1991; SSE2-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [32767,32768,0,0,1,7,42,32] 1992; SSE2-NEXT: retq 1993; 1994; AVX-LABEL: pmaddwd_const: 1995; AVX: # %bb.0: 1996; AVX-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32767,32768,0,0,1,7,42,32] 1997; AVX-NEXT: retq 1998 %a = sext <8 x i16> %A to <8 x i32> 1999 %m = mul nsw <8 x i32> %a, <i32 32767, i32 -32768, i32 0, i32 0, i32 1, i32 7, i32 42, i32 32> 2000 %odd = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> 2001 %even = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7> 2002 %ret = add <4 x i32> %odd, %even 2003 ret <4 x i32> %ret 2004} 2005 2006; Do not select unsigned i16 multiplication 2007define <4 x i32> @pmaddwd_negative1(<8 x i16> %A, <8 x i16> %B) { 2008; SSE2-LABEL: pmaddwd_negative1: 2009; SSE2: # 
%bb.0: 2010; SSE2-NEXT: movdqa %xmm0, %xmm2 2011; SSE2-NEXT: pmulhuw %xmm1, %xmm2 2012; SSE2-NEXT: pmullw %xmm1, %xmm0 2013; SSE2-NEXT: movdqa %xmm0, %xmm1 2014; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 2015; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 2016; SSE2-NEXT: movdqa %xmm0, %xmm2 2017; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2] 2018; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] 2019; SSE2-NEXT: paddd %xmm2, %xmm0 2020; SSE2-NEXT: retq 2021; 2022; AVX1-LABEL: pmaddwd_negative1: 2023; AVX1: # %bb.0: 2024; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 2025; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 2026; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 2027; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 2028; AVX1-NEXT: vpmulld %xmm2, %xmm3, %xmm2 2029; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 2030; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 2031; AVX1-NEXT: vphaddd %xmm2, %xmm0, %xmm0 2032; AVX1-NEXT: retq 2033; 2034; AVX256-LABEL: pmaddwd_negative1: 2035; AVX256: # %bb.0: 2036; AVX256-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 2037; AVX256-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 2038; AVX256-NEXT: vpmulld %ymm1, %ymm0, %ymm0 2039; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1 2040; AVX256-NEXT: vphaddd %xmm1, %xmm0, %xmm0 2041; AVX256-NEXT: vzeroupper 2042; AVX256-NEXT: retq 2043 %a = zext <8 x i16> %A to <8 x i32> 2044 %b = zext <8 x i16> %B to <8 x i32> 2045 %m = mul nuw <8 x i32> %a, %b 2046 %odd = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> 2047 %even = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7> 2048 %ret = add <4 x i32> %odd, %even 2049 ret <4 x i32> %ret 2050} 2051 2052; Do not select if constant is too large 2053; Lower half is too large, upper half is in range. 
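; Note on the constant below: PMADDWD multiplies signed i16 elements, and the
; multiplier here is <32768, -32768, 0, 0, 1, 7, 42, 32>. 32768 in the low half
; cannot be encoded as a signed i16 (-32768 can), so only the in-range upper
; half <1, 7, 42, 32> is eligible for PMADDWD.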
2054define <4 x i32> @pmaddwd_negative2(<8 x i16> %A) { 2055; SSE2-LABEL: pmaddwd_negative2: 2056; SSE2: # %bb.0: 2057; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] 2058; SSE2-NEXT: psrad $16, %xmm2 2059; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] 2060; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 2061; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] 2062; SSE2-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,0,7,0,42,0,32,0] 2063; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 2064; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[0,2] 2065; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3] 2066; SSE2-NEXT: paddd %xmm2, %xmm1 2067; SSE2-NEXT: movdqa %xmm1, %xmm0 2068; SSE2-NEXT: retq 2069; 2070; AVX1-LABEL: pmaddwd_negative2: 2071; AVX1: # %bb.0: 2072; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1 2073; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] 2074; AVX1-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2075; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 2076; AVX1-NEXT: vphaddd %xmm0, %xmm1, %xmm0 2077; AVX1-NEXT: retq 2078; 2079; AVX256-LABEL: pmaddwd_negative2: 2080; AVX256: # %bb.0: 2081; AVX256-NEXT: vpmovsxwd %xmm0, %ymm0 2082; AVX256-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2083; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1 2084; AVX256-NEXT: vphaddd %xmm1, %xmm0, %xmm0 2085; AVX256-NEXT: vzeroupper 2086; AVX256-NEXT: retq 2087 %a = sext <8 x i16> %A to <8 x i32> 2088 %m = mul nsw <8 x i32> %a, <i32 32768, i32 -32768, i32 0, i32 0, i32 1, i32 7, i32 42, i32 32> 2089 %odd = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> 2090 %even = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7> 2091 %ret = add <4 x i32> %odd, %even 2092 ret <4 x i32> %ret 2093} 2094 2095define <4 x i32> @jumbled_indices4(<8 x i16> %A, <8 x i16> %B) { 2096; SSE2-LABEL: jumbled_indices4: 2097; SSE2: # %bb.0: 2098; SSE2-NEXT: pmaddwd %xmm1, %xmm0 2099; SSE2-NEXT: retq 2100; 2101; AVX-LABEL: jumbled_indices4: 2102; AVX: # %bb.0: 2103; AVX-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 2104; AVX-NEXT: retq 2105 %exta = sext <8 x i16> %A to <8 x i32> 2106 %extb = sext <8 x i16> %B to <8 x i32> 2107 %m = mul <8 x i32> %exta, %extb 2108 %sa = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 3, i32 1, i32 5, i32 6> 2109 %sb = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 2, i32 0, i32 4, i32 7> 2110 %a = add <4 x i32> %sa, %sb 2111 ret <4 x i32> %a 2112} 2113 2114define <8 x i32> @jumbled_indices8(<16 x i16> %A, <16 x i16> %B) { 2115; SSE2-LABEL: jumbled_indices8: 2116; SSE2: # %bb.0: 2117; SSE2-NEXT: pmaddwd %xmm2, %xmm0 2118; SSE2-NEXT: pmaddwd %xmm3, %xmm1 2119; SSE2-NEXT: retq 2120; 2121; AVX1-LABEL: jumbled_indices8: 2122; AVX1: # %bb.0: 2123; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2124; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 2125; AVX1-NEXT: vpmaddwd %xmm2, %xmm3, %xmm2 2126; AVX1-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 2127; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2128; AVX1-NEXT: retq 2129; 2130; AVX256-LABEL: jumbled_indices8: 2131; AVX256: # %bb.0: 2132; AVX256-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 2133; AVX256-NEXT: retq 2134 %exta = sext <16 x i16> %A to <16 x i32> 2135 %extb = sext <16 x i16> %B to <16 x i32> 2136 %m = mul <16 x i32> %exta, %extb 2137 %sa = shufflevector <16 x i32> %m, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 7, i32 4, i32 11, i32 8, i32 15, i32 
12> 2138 %sb = shufflevector <16 x i32> %m, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 6, i32 5, i32 10, i32 9, i32 14, i32 13> 2139 %a = add <8 x i32> %sa, %sb 2140 ret <8 x i32> %a 2141} 2142 2143define <16 x i32> @jumbled_indices16(<32 x i16> %A, <32 x i16> %B) { 2144; SSE2-LABEL: jumbled_indices16: 2145; SSE2: # %bb.0: 2146; SSE2-NEXT: pmaddwd %xmm4, %xmm0 2147; SSE2-NEXT: pmaddwd %xmm5, %xmm1 2148; SSE2-NEXT: pmaddwd %xmm6, %xmm2 2149; SSE2-NEXT: pmaddwd %xmm7, %xmm3 2150; SSE2-NEXT: retq 2151; 2152; AVX1-LABEL: jumbled_indices16: 2153; AVX1: # %bb.0: 2154; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 2155; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 2156; AVX1-NEXT: vpmaddwd %xmm4, %xmm5, %xmm4 2157; AVX1-NEXT: vpmaddwd %xmm2, %xmm0, %xmm0 2158; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 2159; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 2160; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 2161; AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2 2162; AVX1-NEXT: vpmaddwd %xmm3, %xmm1, %xmm1 2163; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2164; AVX1-NEXT: retq 2165; 2166; AVX2-LABEL: jumbled_indices16: 2167; AVX2: # %bb.0: 2168; AVX2-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0 2169; AVX2-NEXT: vpmaddwd %ymm3, %ymm1, %ymm1 2170; AVX2-NEXT: retq 2171; 2172; AVX512F-LABEL: jumbled_indices16: 2173; AVX512F: # %bb.0: 2174; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 2175; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 2176; AVX512F-NEXT: vpmaddwd %ymm2, %ymm3, %ymm2 2177; AVX512F-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 2178; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 2179; AVX512F-NEXT: retq 2180; 2181; AVX512BW-LABEL: jumbled_indices16: 2182; AVX512BW: # %bb.0: 2183; AVX512BW-NEXT: vpmaddwd %zmm1, %zmm0, %zmm0 2184; AVX512BW-NEXT: retq 2185 %exta = sext <32 x i16> %A to <32 x i32> 2186 %extb = sext <32 x i16> %B to <32 x i32> 2187 %m = mul <32 x i32> %exta, %extb 2188 %sa = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 2, i32 0, i32 5, i32 6, i32 11, i32 9, i32 15, i32 12, i32 17, i32 18, i32 20, i32 23, i32 27, i32 24, i32 31, i32 29> 2189 %sb = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 3, i32 1, i32 4, i32 7, i32 10, i32 8, i32 14, i32 13, i32 16, i32 19, i32 21, i32 22, i32 26, i32 25, i32 30, i32 28> 2190 %a = add <16 x i32> %sa, %sb 2191 ret <16 x i32> %a 2192} 2193 2194define <32 x i32> @jumbled_indices32(<64 x i16> %A, <64 x i16> %B) { 2195; SSE2-LABEL: jumbled_indices32: 2196; SSE2: # %bb.0: 2197; SSE2-NEXT: movq %rdi, %rax 2198; SSE2-NEXT: pmaddwd {{[0-9]+}}(%rsp), %xmm0 2199; SSE2-NEXT: pmaddwd {{[0-9]+}}(%rsp), %xmm1 2200; SSE2-NEXT: pmaddwd {{[0-9]+}}(%rsp), %xmm2 2201; SSE2-NEXT: pmaddwd {{[0-9]+}}(%rsp), %xmm3 2202; SSE2-NEXT: pmaddwd {{[0-9]+}}(%rsp), %xmm4 2203; SSE2-NEXT: pmaddwd {{[0-9]+}}(%rsp), %xmm5 2204; SSE2-NEXT: pmaddwd {{[0-9]+}}(%rsp), %xmm6 2205; SSE2-NEXT: pmaddwd {{[0-9]+}}(%rsp), %xmm7 2206; SSE2-NEXT: movdqa %xmm7, 112(%rdi) 2207; SSE2-NEXT: movdqa %xmm6, 96(%rdi) 2208; SSE2-NEXT: movdqa %xmm5, 80(%rdi) 2209; SSE2-NEXT: movdqa %xmm4, 64(%rdi) 2210; SSE2-NEXT: movdqa %xmm3, 48(%rdi) 2211; SSE2-NEXT: movdqa %xmm2, 32(%rdi) 2212; SSE2-NEXT: movdqa %xmm1, 16(%rdi) 2213; SSE2-NEXT: movdqa %xmm0, (%rdi) 2214; SSE2-NEXT: retq 2215; 2216; AVX1-LABEL: jumbled_indices32: 2217; AVX1: # %bb.0: 2218; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm8 2219; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm9 2220; AVX1-NEXT: vpmaddwd %xmm8, %xmm9, %xmm8 2221; AVX1-NEXT: vpmaddwd %xmm4, %xmm0, %xmm0 2222; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0 2223; AVX1-NEXT: 
vextractf128 $1, %ymm5, %xmm4 2224; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm8 2225; AVX1-NEXT: vpmaddwd %xmm4, %xmm8, %xmm4 2226; AVX1-NEXT: vpmaddwd %xmm5, %xmm1, %xmm1 2227; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 2228; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm4 2229; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 2230; AVX1-NEXT: vpmaddwd %xmm4, %xmm5, %xmm4 2231; AVX1-NEXT: vpmaddwd %xmm6, %xmm2, %xmm2 2232; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 2233; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm4 2234; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 2235; AVX1-NEXT: vpmaddwd %xmm4, %xmm5, %xmm4 2236; AVX1-NEXT: vpmaddwd %xmm7, %xmm3, %xmm3 2237; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 2238; AVX1-NEXT: retq 2239; 2240; AVX2-LABEL: jumbled_indices32: 2241; AVX2: # %bb.0: 2242; AVX2-NEXT: vpmaddwd %ymm4, %ymm0, %ymm0 2243; AVX2-NEXT: vpmaddwd %ymm5, %ymm1, %ymm1 2244; AVX2-NEXT: vpmaddwd %ymm6, %ymm2, %ymm2 2245; AVX2-NEXT: vpmaddwd %ymm7, %ymm3, %ymm3 2246; AVX2-NEXT: retq 2247; 2248; AVX512F-LABEL: jumbled_indices32: 2249; AVX512F: # %bb.0: 2250; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm4 2251; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5 2252; AVX512F-NEXT: vpmaddwd %ymm4, %ymm5, %ymm4 2253; AVX512F-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0 2254; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 2255; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm2 2256; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4 2257; AVX512F-NEXT: vpmaddwd %ymm2, %ymm4, %ymm2 2258; AVX512F-NEXT: vpmaddwd %ymm3, %ymm1, %ymm1 2259; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 2260; AVX512F-NEXT: retq 2261; 2262; AVX512BW-LABEL: jumbled_indices32: 2263; AVX512BW: # %bb.0: 2264; AVX512BW-NEXT: vpmaddwd %zmm2, %zmm0, %zmm0 2265; AVX512BW-NEXT: vpmaddwd %zmm3, %zmm1, %zmm1 2266; AVX512BW-NEXT: retq 2267 %exta = sext <64 x i16> %A to <64 x i32> 2268 %extb = sext <64 x i16> %B to <64 x i32> 2269 %m = mul <64 x i32> %exta, %extb 2270 %sa = shufflevector <64 x i32> %m, <64 x i32> undef, <32 x i32> <i32 1, i32 2, i32 6, i32 5, i32 10, i32 8, i32 14, i32 12, i32 19, i32 17, i32 22, i32 20, i32 25, i32 27, i32 30, i32 28, i32 32, i32 34, i32 37, i32 38, i32 41, i32 43, i32 45, i32 47, i32 50, i32 48, i32 52, i32 54, i32 59, i32 56, i32 61, i32 63> 2271 %sb = shufflevector <64 x i32> %m, <64 x i32> undef, <32 x i32> <i32 0, i32 3, i32 7, i32 4, i32 11, i32 9, i32 15, i32 13, i32 18, i32 16, i32 23, i32 21, i32 24, i32 26, i32 31, i32 29, i32 33, i32 35, i32 36, i32 39, i32 40, i32 42, i32 44, i32 46, i32 51, i32 49, i32 53, i32 55, i32 58, i32 57, i32 60, i32 62> 2272 %a = add <32 x i32> %sa, %sb 2273 ret <32 x i32> %a 2274} 2275 2276; NOTE: We're testing with loads because ABI lowering creates a concat_vectors that extract_vector_elt creation can see through. 2277; This would require the combine to recreate the concat_vectors. 
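; The pmaddwd_* tests below load two <N x i16> vectors, split each into even and
; odd lanes, sign extend the lanes to i32, multiply the matching lanes, and add the
; even and odd products; this is the pairwise multiply-add of signed words that
; PMADDWD computes.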
2278define <4 x i32> @pmaddwd_128(ptr %Aptr, ptr %Bptr) { 2279; SSE2-LABEL: pmaddwd_128: 2280; SSE2: # %bb.0: 2281; SSE2-NEXT: movdqa (%rdi), %xmm0 2282; SSE2-NEXT: pmaddwd (%rsi), %xmm0 2283; SSE2-NEXT: retq 2284; 2285; AVX-LABEL: pmaddwd_128: 2286; AVX: # %bb.0: 2287; AVX-NEXT: vmovdqa (%rdi), %xmm0 2288; AVX-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0 2289; AVX-NEXT: retq 2290 %A = load <8 x i16>, ptr %Aptr 2291 %B = load <8 x i16>, ptr %Bptr 2292 %A_even = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> 2293 %A_odd = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7> 2294 %B_even = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> 2295 %B_odd = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7> 2296 %A_even_ext = sext <4 x i16> %A_even to <4 x i32> 2297 %B_even_ext = sext <4 x i16> %B_even to <4 x i32> 2298 %A_odd_ext = sext <4 x i16> %A_odd to <4 x i32> 2299 %B_odd_ext = sext <4 x i16> %B_odd to <4 x i32> 2300 %even_mul = mul <4 x i32> %A_even_ext, %B_even_ext 2301 %odd_mul = mul <4 x i32> %A_odd_ext, %B_odd_ext 2302 %add = add <4 x i32> %even_mul, %odd_mul 2303 ret <4 x i32> %add 2304} 2305 2306define <8 x i32> @pmaddwd_256(ptr %Aptr, ptr %Bptr) { 2307; SSE2-LABEL: pmaddwd_256: 2308; SSE2: # %bb.0: 2309; SSE2-NEXT: movdqa (%rdi), %xmm0 2310; SSE2-NEXT: movdqa 16(%rdi), %xmm1 2311; SSE2-NEXT: pmaddwd (%rsi), %xmm0 2312; SSE2-NEXT: pmaddwd 16(%rsi), %xmm1 2313; SSE2-NEXT: retq 2314; 2315; AVX1-LABEL: pmaddwd_256: 2316; AVX1: # %bb.0: 2317; AVX1-NEXT: vmovdqa (%rdi), %xmm0 2318; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 2319; AVX1-NEXT: vpmaddwd 16(%rsi), %xmm1, %xmm1 2320; AVX1-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0 2321; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 2322; AVX1-NEXT: retq 2323; 2324; AVX256-LABEL: pmaddwd_256: 2325; AVX256: # %bb.0: 2326; AVX256-NEXT: vmovdqa (%rdi), %ymm0 2327; AVX256-NEXT: vpmaddwd (%rsi), %ymm0, %ymm0 2328; AVX256-NEXT: retq 2329 %A = load <16 x i16>, ptr %Aptr 2330 %B = load <16 x i16>, ptr %Bptr 2331 %A_even = shufflevector <16 x i16> %A, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> 2332 %A_odd = shufflevector <16 x i16> %A, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> 2333 %B_even = shufflevector <16 x i16> %B, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> 2334 %B_odd = shufflevector <16 x i16> %B, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> 2335 %A_even_ext = sext <8 x i16> %A_even to <8 x i32> 2336 %B_even_ext = sext <8 x i16> %B_even to <8 x i32> 2337 %A_odd_ext = sext <8 x i16> %A_odd to <8 x i32> 2338 %B_odd_ext = sext <8 x i16> %B_odd to <8 x i32> 2339 %even_mul = mul <8 x i32> %A_even_ext, %B_even_ext 2340 %odd_mul = mul <8 x i32> %A_odd_ext, %B_odd_ext 2341 %add = add <8 x i32> %even_mul, %odd_mul 2342 ret <8 x i32> %add 2343} 2344 2345define <16 x i32> @pmaddwd_512(ptr %Aptr, ptr %Bptr) { 2346; SSE2-LABEL: pmaddwd_512: 2347; SSE2: # %bb.0: 2348; SSE2-NEXT: movdqa (%rdi), %xmm0 2349; SSE2-NEXT: movdqa 16(%rdi), %xmm1 2350; SSE2-NEXT: movdqa 32(%rdi), %xmm2 2351; SSE2-NEXT: movdqa 48(%rdi), %xmm3 2352; SSE2-NEXT: pmaddwd (%rsi), %xmm0 2353; SSE2-NEXT: pmaddwd 16(%rsi), %xmm1 2354; SSE2-NEXT: pmaddwd 32(%rsi), %xmm2 2355; SSE2-NEXT: pmaddwd 48(%rsi), %xmm3 2356; SSE2-NEXT: retq 2357; 2358; AVX1-LABEL: pmaddwd_512: 2359; AVX1: # %bb.0: 2360; AVX1-NEXT: 
vmovdqa (%rdi), %xmm0 2361; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 2362; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 2363; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 2364; AVX1-NEXT: vpmaddwd 16(%rsi), %xmm1, %xmm1 2365; AVX1-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0 2366; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 2367; AVX1-NEXT: vpmaddwd 48(%rsi), %xmm3, %xmm1 2368; AVX1-NEXT: vpmaddwd 32(%rsi), %xmm2, %xmm2 2369; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 2370; AVX1-NEXT: retq 2371; 2372; AVX2-LABEL: pmaddwd_512: 2373; AVX2: # %bb.0: 2374; AVX2-NEXT: vmovdqa (%rdi), %ymm0 2375; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 2376; AVX2-NEXT: vpmaddwd (%rsi), %ymm0, %ymm0 2377; AVX2-NEXT: vpmaddwd 32(%rsi), %ymm1, %ymm1 2378; AVX2-NEXT: retq 2379; 2380; AVX512F-LABEL: pmaddwd_512: 2381; AVX512F: # %bb.0: 2382; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 2383; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 2384; AVX512F-NEXT: vpmaddwd 32(%rsi), %ymm1, %ymm1 2385; AVX512F-NEXT: vpmaddwd (%rsi), %ymm0, %ymm0 2386; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 2387; AVX512F-NEXT: retq 2388; 2389; AVX512BW-LABEL: pmaddwd_512: 2390; AVX512BW: # %bb.0: 2391; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 2392; AVX512BW-NEXT: vpmaddwd (%rsi), %zmm0, %zmm0 2393; AVX512BW-NEXT: retq 2394 %A = load <32 x i16>, ptr %Aptr 2395 %B = load <32 x i16>, ptr %Bptr 2396 %A_even = shufflevector <32 x i16> %A, <32 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30> 2397 %A_odd = shufflevector <32 x i16> %A, <32 x i16> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31> 2398 %B_even = shufflevector <32 x i16> %B, <32 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30> 2399 %B_odd = shufflevector <32 x i16> %B, <32 x i16> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31> 2400 %A_even_ext = sext <16 x i16> %A_even to <16 x i32> 2401 %B_even_ext = sext <16 x i16> %B_even to <16 x i32> 2402 %A_odd_ext = sext <16 x i16> %A_odd to <16 x i32> 2403 %B_odd_ext = sext <16 x i16> %B_odd to <16 x i32> 2404 %even_mul = mul <16 x i32> %A_even_ext, %B_even_ext 2405 %odd_mul = mul <16 x i32> %A_odd_ext, %B_odd_ext 2406 %add = add <16 x i32> %even_mul, %odd_mul 2407 ret <16 x i32> %add 2408} 2409 2410define <32 x i32> @pmaddwd_1024(ptr %Aptr, ptr %Bptr) { 2411; SSE2-LABEL: pmaddwd_1024: 2412; SSE2: # %bb.0: 2413; SSE2-NEXT: movq %rdi, %rax 2414; SSE2-NEXT: movdqa (%rsi), %xmm0 2415; SSE2-NEXT: movdqa 16(%rsi), %xmm1 2416; SSE2-NEXT: movdqa 32(%rsi), %xmm2 2417; SSE2-NEXT: movdqa 48(%rsi), %xmm3 2418; SSE2-NEXT: pmaddwd (%rdx), %xmm0 2419; SSE2-NEXT: pmaddwd 16(%rdx), %xmm1 2420; SSE2-NEXT: pmaddwd 32(%rdx), %xmm2 2421; SSE2-NEXT: pmaddwd 48(%rdx), %xmm3 2422; SSE2-NEXT: movdqa 64(%rsi), %xmm4 2423; SSE2-NEXT: pmaddwd 64(%rdx), %xmm4 2424; SSE2-NEXT: movdqa 80(%rsi), %xmm5 2425; SSE2-NEXT: pmaddwd 80(%rdx), %xmm5 2426; SSE2-NEXT: movdqa 96(%rsi), %xmm6 2427; SSE2-NEXT: pmaddwd 96(%rdx), %xmm6 2428; SSE2-NEXT: movdqa 112(%rsi), %xmm7 2429; SSE2-NEXT: pmaddwd 112(%rdx), %xmm7 2430; SSE2-NEXT: movdqa %xmm7, 112(%rdi) 2431; SSE2-NEXT: movdqa %xmm6, 96(%rdi) 2432; SSE2-NEXT: movdqa %xmm5, 80(%rdi) 2433; SSE2-NEXT: movdqa %xmm4, 64(%rdi) 2434; SSE2-NEXT: movdqa %xmm3, 48(%rdi) 2435; SSE2-NEXT: movdqa %xmm2, 
32(%rdi) 2436; SSE2-NEXT: movdqa %xmm1, 16(%rdi) 2437; SSE2-NEXT: movdqa %xmm0, (%rdi) 2438; SSE2-NEXT: retq 2439; 2440; AVX1-LABEL: pmaddwd_1024: 2441; AVX1: # %bb.0: 2442; AVX1-NEXT: vmovdqa (%rdi), %xmm0 2443; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 2444; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 2445; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 2446; AVX1-NEXT: vpmaddwd 16(%rsi), %xmm1, %xmm1 2447; AVX1-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0 2448; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 2449; AVX1-NEXT: vpmaddwd 48(%rsi), %xmm3, %xmm1 2450; AVX1-NEXT: vpmaddwd 32(%rsi), %xmm2, %xmm2 2451; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 2452; AVX1-NEXT: vmovdqa 80(%rdi), %xmm2 2453; AVX1-NEXT: vpmaddwd 80(%rsi), %xmm2, %xmm2 2454; AVX1-NEXT: vmovdqa 64(%rdi), %xmm3 2455; AVX1-NEXT: vpmaddwd 64(%rsi), %xmm3, %xmm3 2456; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 2457; AVX1-NEXT: vmovdqa 112(%rdi), %xmm3 2458; AVX1-NEXT: vpmaddwd 112(%rsi), %xmm3, %xmm3 2459; AVX1-NEXT: vmovdqa 96(%rdi), %xmm4 2460; AVX1-NEXT: vpmaddwd 96(%rsi), %xmm4, %xmm4 2461; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 2462; AVX1-NEXT: retq 2463; 2464; AVX2-LABEL: pmaddwd_1024: 2465; AVX2: # %bb.0: 2466; AVX2-NEXT: vmovdqa (%rdi), %ymm0 2467; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 2468; AVX2-NEXT: vmovdqa 64(%rdi), %ymm2 2469; AVX2-NEXT: vmovdqa 96(%rdi), %ymm3 2470; AVX2-NEXT: vpmaddwd (%rsi), %ymm0, %ymm0 2471; AVX2-NEXT: vpmaddwd 32(%rsi), %ymm1, %ymm1 2472; AVX2-NEXT: vpmaddwd 64(%rsi), %ymm2, %ymm2 2473; AVX2-NEXT: vpmaddwd 96(%rsi), %ymm3, %ymm3 2474; AVX2-NEXT: retq 2475; 2476; AVX512F-LABEL: pmaddwd_1024: 2477; AVX512F: # %bb.0: 2478; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 2479; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 2480; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm2 2481; AVX512F-NEXT: vmovdqa 96(%rdi), %ymm3 2482; AVX512F-NEXT: vpmaddwd 32(%rsi), %ymm1, %ymm1 2483; AVX512F-NEXT: vpmaddwd (%rsi), %ymm0, %ymm0 2484; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 2485; AVX512F-NEXT: vpmaddwd 96(%rsi), %ymm3, %ymm1 2486; AVX512F-NEXT: vpmaddwd 64(%rsi), %ymm2, %ymm2 2487; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 2488; AVX512F-NEXT: retq 2489; 2490; AVX512BW-LABEL: pmaddwd_1024: 2491; AVX512BW: # %bb.0: 2492; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 2493; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 2494; AVX512BW-NEXT: vpmaddwd (%rsi), %zmm0, %zmm0 2495; AVX512BW-NEXT: vpmaddwd 64(%rsi), %zmm1, %zmm1 2496; AVX512BW-NEXT: retq 2497 %A = load <64 x i16>, ptr %Aptr 2498 %B = load <64 x i16>, ptr %Bptr 2499 %A_even = shufflevector <64 x i16> %A, <64 x i16> undef, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62> 2500 %A_odd = shufflevector <64 x i16> %A, <64 x i16> undef, <32 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63> 2501 %B_even = shufflevector <64 x i16> %B, <64 x i16> undef, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62> 2502 %B_odd = shufflevector <64 x i16> %B, <64 x i16> undef, <32 x i32> <i32 
1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63> 2503 %A_even_ext = sext <32 x i16> %A_even to <32 x i32> 2504 %B_even_ext = sext <32 x i16> %B_even to <32 x i32> 2505 %A_odd_ext = sext <32 x i16> %A_odd to <32 x i32> 2506 %B_odd_ext = sext <32 x i16> %B_odd to <32 x i32> 2507 %even_mul = mul <32 x i32> %A_even_ext, %B_even_ext 2508 %odd_mul = mul <32 x i32> %A_odd_ext, %B_odd_ext 2509 %add = add <32 x i32> %even_mul, %odd_mul 2510 ret <32 x i32> %add 2511} 2512 2513define <4 x i32> @pmaddwd_commuted_mul(ptr %Aptr, ptr %Bptr) { 2514; SSE2-LABEL: pmaddwd_commuted_mul: 2515; SSE2: # %bb.0: 2516; SSE2-NEXT: movdqa (%rdi), %xmm0 2517; SSE2-NEXT: pmaddwd (%rsi), %xmm0 2518; SSE2-NEXT: retq 2519; 2520; AVX-LABEL: pmaddwd_commuted_mul: 2521; AVX: # %bb.0: 2522; AVX-NEXT: vmovdqa (%rdi), %xmm0 2523; AVX-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0 2524; AVX-NEXT: retq 2525 %A = load <8 x i16>, ptr %Aptr 2526 %B = load <8 x i16>, ptr %Bptr 2527 %A_even = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> 2528 %A_odd = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7> 2529 %B_even = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> 2530 %B_odd = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7> 2531 %A_even_ext = sext <4 x i16> %A_even to <4 x i32> 2532 %B_even_ext = sext <4 x i16> %B_even to <4 x i32> 2533 %A_odd_ext = sext <4 x i16> %A_odd to <4 x i32> 2534 %B_odd_ext = sext <4 x i16> %B_odd to <4 x i32> 2535 %even_mul = mul <4 x i32> %A_even_ext, %B_even_ext 2536 %odd_mul = mul <4 x i32> %B_odd_ext, %A_odd_ext ; Different order than previous mul 2537 %add = add <4 x i32> %even_mul, %odd_mul 2538 ret <4 x i32> %add 2539} 2540 2541define <4 x i32> @pmaddwd_swapped_indices(ptr %Aptr, ptr %Bptr) { 2542; SSE2-LABEL: pmaddwd_swapped_indices: 2543; SSE2: # %bb.0: 2544; SSE2-NEXT: movdqa (%rdi), %xmm0 2545; SSE2-NEXT: pmaddwd (%rsi), %xmm0 2546; SSE2-NEXT: retq 2547; 2548; AVX-LABEL: pmaddwd_swapped_indices: 2549; AVX: # %bb.0: 2550; AVX-NEXT: vmovdqa (%rdi), %xmm0 2551; AVX-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0 2552; AVX-NEXT: retq 2553 %A = load <8 x i16>, ptr %Aptr 2554 %B = load <8 x i16>, ptr %Bptr 2555 %A_even = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 1, i32 2, i32 5, i32 6> ; indices aren't all even 2556 %A_odd = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 0, i32 3, i32 4, i32 7> ; indices aren't all odd 2557 %B_even = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 1, i32 2, i32 5, i32 6> ; same indices as A 2558 %B_odd = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 0, i32 3, i32 4, i32 7> ; same indices as A 2559 %A_even_ext = sext <4 x i16> %A_even to <4 x i32> 2560 %B_even_ext = sext <4 x i16> %B_even to <4 x i32> 2561 %A_odd_ext = sext <4 x i16> %A_odd to <4 x i32> 2562 %B_odd_ext = sext <4 x i16> %B_odd to <4 x i32> 2563 %even_mul = mul <4 x i32> %A_even_ext, %B_even_ext 2564 %odd_mul = mul <4 x i32> %A_odd_ext, %B_odd_ext 2565 %add = add <4 x i32> %even_mul, %odd_mul 2566 ret <4 x i32> %add 2567} 2568 2569; Negative test where indices aren't paired properly 2570define <4 x i32> @pmaddwd_bad_indices(ptr %Aptr, ptr %Bptr) { 2571; SSE2-LABEL: pmaddwd_bad_indices: 2572; SSE2: # %bb.0: 2573; SSE2-NEXT: pshuflw 
{{.*#+}} xmm0 = mem[1,0,2,3,4,5,6,7] 2574; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7] 2575; SSE2-NEXT: pmaddwd (%rsi), %xmm0 2576; SSE2-NEXT: retq 2577; 2578; AVX-LABEL: pmaddwd_bad_indices: 2579; AVX: # %bb.0: 2580; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = mem[1,0,2,3,4,5,6,7] 2581; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7] 2582; AVX-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0 2583; AVX-NEXT: retq 2584 %A = load <8 x i16>, ptr %Aptr 2585 %B = load <8 x i16>, ptr %Bptr 2586 %A_even = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 1, i32 2, i32 5, i32 6> 2587 %A_odd = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 0, i32 3, i32 4, i32 7> 2588 %B_even = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> ; different indices than A 2589 %B_odd = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7> ; different indices than A 2590 %A_even_ext = sext <4 x i16> %A_even to <4 x i32> 2591 %B_even_ext = sext <4 x i16> %B_even to <4 x i32> 2592 %A_odd_ext = sext <4 x i16> %A_odd to <4 x i32> 2593 %B_odd_ext = sext <4 x i16> %B_odd to <4 x i32> 2594 %even_mul = mul <4 x i32> %A_even_ext, %B_even_ext 2595 %odd_mul = mul <4 x i32> %A_odd_ext, %B_odd_ext 2596 %add = add <4 x i32> %even_mul, %odd_mul 2597 ret <4 x i32> %add 2598} 2599 2600; This test contains two multiplies joined by an add. The result of that add is then reduced to a single element. 2601; SelectionDAGBuilder should tag the joining add as a vector reduction. We need to recognize that both sides can use pmaddwd 2602define i32 @madd_double_reduction(ptr %arg, ptr %arg1, ptr %arg2, ptr %arg3) { 2603; SSE2-LABEL: madd_double_reduction: 2604; SSE2: # %bb.0: 2605; SSE2-NEXT: movdqu (%rdi), %xmm0 2606; SSE2-NEXT: movdqu (%rsi), %xmm1 2607; SSE2-NEXT: pmaddwd %xmm0, %xmm1 2608; SSE2-NEXT: movdqu (%rdx), %xmm0 2609; SSE2-NEXT: movdqu (%rcx), %xmm2 2610; SSE2-NEXT: pmaddwd %xmm0, %xmm2 2611; SSE2-NEXT: paddd %xmm1, %xmm2 2612; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] 2613; SSE2-NEXT: paddd %xmm2, %xmm0 2614; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 2615; SSE2-NEXT: paddd %xmm0, %xmm1 2616; SSE2-NEXT: movd %xmm1, %eax 2617; SSE2-NEXT: retq 2618; 2619; AVX-LABEL: madd_double_reduction: 2620; AVX: # %bb.0: 2621; AVX-NEXT: vmovdqu (%rdi), %xmm0 2622; AVX-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0 2623; AVX-NEXT: vmovdqu (%rdx), %xmm1 2624; AVX-NEXT: vpmaddwd (%rcx), %xmm1, %xmm1 2625; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 2626; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 2627; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 2628; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 2629; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 2630; AVX-NEXT: vmovd %xmm0, %eax 2631; AVX-NEXT: retq 2632 %tmp = load <8 x i16>, ptr %arg, align 1 2633 %tmp6 = load <8 x i16>, ptr %arg1, align 1 2634 %tmp7 = sext <8 x i16> %tmp to <8 x i32> 2635 %tmp17 = sext <8 x i16> %tmp6 to <8 x i32> 2636 %tmp19 = mul nsw <8 x i32> %tmp7, %tmp17 2637 %tmp20 = load <8 x i16>, ptr %arg2, align 1 2638 %tmp21 = load <8 x i16>, ptr %arg3, align 1 2639 %tmp22 = sext <8 x i16> %tmp20 to <8 x i32> 2640 %tmp23 = sext <8 x i16> %tmp21 to <8 x i32> 2641 %tmp25 = mul nsw <8 x i32> %tmp22, %tmp23 2642 %tmp26 = add nuw nsw <8 x i32> %tmp25, %tmp19 2643 %tmp29 = shufflevector <8 x i32> %tmp26, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> 2644 %tmp30 = add <8 x i32> %tmp26, %tmp29 2645 %tmp31 = shufflevector <8 x i32> %tmp30, <8 x i32> undef, <8 x i32> 
<i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2646 %tmp32 = add <8 x i32> %tmp30, %tmp31 2647 %tmp33 = shufflevector <8 x i32> %tmp32, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2648 %tmp34 = add <8 x i32> %tmp32, %tmp33 2649 %tmp35 = extractelement <8 x i32> %tmp34, i64 0 2650 ret i32 %tmp35 2651} 2652 2653define i32 @madd_quad_reduction(ptr %arg, ptr %arg1, ptr %arg2, ptr %arg3, ptr %arg4, ptr %arg5, ptr %arg6, ptr %arg7) { 2654; SSE2-LABEL: madd_quad_reduction: 2655; SSE2: # %bb.0: 2656; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax 2657; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10 2658; SSE2-NEXT: movdqu (%rdi), %xmm0 2659; SSE2-NEXT: movdqu (%rsi), %xmm1 2660; SSE2-NEXT: pmaddwd %xmm0, %xmm1 2661; SSE2-NEXT: movdqu (%rdx), %xmm0 2662; SSE2-NEXT: movdqu (%rcx), %xmm2 2663; SSE2-NEXT: pmaddwd %xmm0, %xmm2 2664; SSE2-NEXT: paddd %xmm1, %xmm2 2665; SSE2-NEXT: movdqu (%r8), %xmm0 2666; SSE2-NEXT: movdqu (%r9), %xmm1 2667; SSE2-NEXT: pmaddwd %xmm0, %xmm1 2668; SSE2-NEXT: paddd %xmm2, %xmm1 2669; SSE2-NEXT: movdqu (%r10), %xmm0 2670; SSE2-NEXT: movdqu (%rax), %xmm2 2671; SSE2-NEXT: pmaddwd %xmm0, %xmm2 2672; SSE2-NEXT: paddd %xmm1, %xmm2 2673; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] 2674; SSE2-NEXT: paddd %xmm2, %xmm0 2675; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 2676; SSE2-NEXT: paddd %xmm0, %xmm1 2677; SSE2-NEXT: movd %xmm1, %eax 2678; SSE2-NEXT: retq 2679; 2680; AVX-LABEL: madd_quad_reduction: 2681; AVX: # %bb.0: 2682; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 2683; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10 2684; AVX-NEXT: vmovdqu (%rdi), %xmm0 2685; AVX-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0 2686; AVX-NEXT: vmovdqu (%rdx), %xmm1 2687; AVX-NEXT: vpmaddwd (%rcx), %xmm1, %xmm1 2688; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 2689; AVX-NEXT: vmovdqu (%r8), %xmm1 2690; AVX-NEXT: vpmaddwd (%r9), %xmm1, %xmm1 2691; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 2692; AVX-NEXT: vmovdqu (%r10), %xmm1 2693; AVX-NEXT: vpmaddwd (%rax), %xmm1, %xmm1 2694; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 2695; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 2696; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 2697; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 2698; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 2699; AVX-NEXT: vmovd %xmm0, %eax 2700; AVX-NEXT: retq 2701 %tmp = load <8 x i16>, ptr %arg, align 1 2702 %tmp6 = load <8 x i16>, ptr %arg1, align 1 2703 %tmp7 = sext <8 x i16> %tmp to <8 x i32> 2704 %tmp17 = sext <8 x i16> %tmp6 to <8 x i32> 2705 %tmp19 = mul nsw <8 x i32> %tmp7, %tmp17 2706 %tmp20 = load <8 x i16>, ptr %arg2, align 1 2707 %tmp21 = load <8 x i16>, ptr %arg3, align 1 2708 %tmp22 = sext <8 x i16> %tmp20 to <8 x i32> 2709 %tmp23 = sext <8 x i16> %tmp21 to <8 x i32> 2710 %tmp25 = mul nsw <8 x i32> %tmp22, %tmp23 2711 %tmp26 = add nuw nsw <8 x i32> %tmp25, %tmp19 2712 2713 %tmp40 = load <8 x i16>, ptr %arg4, align 1 2714 %tmp41 = load <8 x i16>, ptr %arg5, align 1 2715 %tmp42 = sext <8 x i16> %tmp40 to <8 x i32> 2716 %tmp43 = sext <8 x i16> %tmp41 to <8 x i32> 2717 %tmp45 = mul nsw <8 x i32> %tmp42, %tmp43 2718 %tmp56 = add nuw nsw <8 x i32> %tmp26, %tmp45 2719 2720 %tmp50 = load <8 x i16>, ptr %arg6, align 1 2721 %tmp51 = load <8 x i16>, ptr %arg7, align 1 2722 %tmp52 = sext <8 x i16> %tmp50 to <8 x i32> 2723 %tmp53 = sext <8 x i16> %tmp51 to <8 x i32> 2724 %tmp55 = mul nsw <8 x i32> %tmp52, %tmp53 2725 %tmp57 = add nuw nsw <8 x i32> %tmp55, %tmp56 2726 2727 %tmp29 = shufflevector <8 x i32> %tmp57, <8 x i32> undef, <8 x i32> <i32 
4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> 2728 %tmp30 = add <8 x i32> %tmp57, %tmp29 2729 %tmp31 = shufflevector <8 x i32> %tmp30, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2730 %tmp32 = add <8 x i32> %tmp30, %tmp31 2731 %tmp33 = shufflevector <8 x i32> %tmp32, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2732 %tmp34 = add <8 x i32> %tmp32, %tmp33 2733 %tmp35 = extractelement <8 x i32> %tmp34, i64 0 2734 ret i32 %tmp35 2735} 2736 2737define i64 @sum_and_sum_of_squares(ptr %a, i32 %n) { 2738; SSE2-LABEL: sum_and_sum_of_squares: 2739; SSE2: # %bb.0: # %entry 2740; SSE2-NEXT: movl %esi, %eax 2741; SSE2-NEXT: pxor %xmm0, %xmm0 2742; SSE2-NEXT: pxor %xmm1, %xmm1 2743; SSE2-NEXT: pxor %xmm2, %xmm2 2744; SSE2-NEXT: pxor %xmm3, %xmm3 2745; SSE2-NEXT: .p2align 4 2746; SSE2-NEXT: .LBB33_1: # %vector.body 2747; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 2748; SSE2-NEXT: movq {{.*#+}} xmm4 = mem[0],zero 2749; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] 2750; SSE2-NEXT: movdqa %xmm4, %xmm5 2751; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] 2752; SSE2-NEXT: paddd %xmm5, %xmm2 2753; SSE2-NEXT: movdqa %xmm4, %xmm5 2754; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] 2755; SSE2-NEXT: paddd %xmm5, %xmm3 2756; SSE2-NEXT: pmaddwd %xmm4, %xmm4 2757; SSE2-NEXT: paddd %xmm4, %xmm1 2758; SSE2-NEXT: addq $8, %rdi 2759; SSE2-NEXT: addq $-8, %rax 2760; SSE2-NEXT: jne .LBB33_1 2761; SSE2-NEXT: # %bb.2: # %middle.block 2762; SSE2-NEXT: paddd %xmm3, %xmm2 2763; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] 2764; SSE2-NEXT: paddd %xmm2, %xmm3 2765; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,1,1] 2766; SSE2-NEXT: paddd %xmm3, %xmm2 2767; SSE2-NEXT: movd %xmm2, %ecx 2768; SSE2-NEXT: paddd %xmm0, %xmm1 2769; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] 2770; SSE2-NEXT: paddd %xmm1, %xmm0 2771; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 2772; SSE2-NEXT: paddd %xmm0, %xmm1 2773; SSE2-NEXT: movd %xmm1, %eax 2774; SSE2-NEXT: shlq $32, %rcx 2775; SSE2-NEXT: orq %rcx, %rax 2776; SSE2-NEXT: retq 2777; 2778; AVX1-LABEL: sum_and_sum_of_squares: 2779; AVX1: # %bb.0: # %entry 2780; AVX1-NEXT: movl %esi, %eax 2781; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 2782; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 2783; AVX1-NEXT: .p2align 4 2784; AVX1-NEXT: .LBB33_1: # %vector.body 2785; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 2786; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 2787; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 2788; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 2789; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm4 2790; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 2791; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 2792; AVX1-NEXT: vpmaddwd %xmm2, %xmm2, %xmm2 2793; AVX1-NEXT: vpmaddwd %xmm3, %xmm3, %xmm3 2794; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 2795; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3 2796; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0 2797; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 2798; AVX1-NEXT: addq $8, %rdi 2799; AVX1-NEXT: addq $-8, %rax 2800; AVX1-NEXT: jne .LBB33_1 2801; AVX1-NEXT: 
# %bb.2: # %middle.block 2802; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2803; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 2804; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] 2805; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 2806; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] 2807; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 2808; AVX1-NEXT: vmovd %xmm1, %ecx 2809; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 2810; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 2811; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 2812; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 2813; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 2814; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 2815; AVX1-NEXT: vmovd %xmm0, %eax 2816; AVX1-NEXT: shlq $32, %rcx 2817; AVX1-NEXT: orq %rcx, %rax 2818; AVX1-NEXT: vzeroupper 2819; AVX1-NEXT: retq 2820; 2821; AVX256-LABEL: sum_and_sum_of_squares: 2822; AVX256: # %bb.0: # %entry 2823; AVX256-NEXT: movl %esi, %eax 2824; AVX256-NEXT: vpxor %xmm0, %xmm0, %xmm0 2825; AVX256-NEXT: vpxor %xmm1, %xmm1, %xmm1 2826; AVX256-NEXT: .p2align 4 2827; AVX256-NEXT: .LBB33_1: # %vector.body 2828; AVX256-NEXT: # =>This Inner Loop Header: Depth=1 2829; AVX256-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero 2830; AVX256-NEXT: vpaddd %ymm1, %ymm2, %ymm1 2831; AVX256-NEXT: vpmaddwd %ymm2, %ymm2, %ymm2 2832; AVX256-NEXT: vpaddd %ymm0, %ymm2, %ymm0 2833; AVX256-NEXT: addq $8, %rdi 2834; AVX256-NEXT: addq $-8, %rax 2835; AVX256-NEXT: jne .LBB33_1 2836; AVX256-NEXT: # %bb.2: # %middle.block 2837; AVX256-NEXT: vextracti128 $1, %ymm1, %xmm2 2838; AVX256-NEXT: vpaddd %xmm2, %xmm1, %xmm1 2839; AVX256-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] 2840; AVX256-NEXT: vpaddd %xmm2, %xmm1, %xmm1 2841; AVX256-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] 2842; AVX256-NEXT: vpaddd %xmm2, %xmm1, %xmm1 2843; AVX256-NEXT: vmovd %xmm1, %ecx 2844; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1 2845; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 2846; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 2847; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 2848; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 2849; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 2850; AVX256-NEXT: vmovd %xmm0, %eax 2851; AVX256-NEXT: shlq $32, %rcx 2852; AVX256-NEXT: orq %rcx, %rax 2853; AVX256-NEXT: vzeroupper 2854; AVX256-NEXT: retq 2855entry: 2856 %0 = zext i32 %n to i64 2857 br label %vector.body 2858 2859vector.body: 2860 %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ] 2861 %vec.phi = phi <8 x i32> [ %6, %vector.body ], [ zeroinitializer, %entry ] 2862 %sum.phi = phi <8 x i32> [ %4, %vector.body ], [ zeroinitializer, %entry ] 2863 %1 = getelementptr inbounds i8, ptr %a, i64 %index 2864 %2 = bitcast ptr %1 to ptr 2865 %wide.load = load <8 x i8>, ptr %2, align 1 2866 %3 = zext <8 x i8> %wide.load to <8 x i32> 2867 %4 = add nsw <8 x i32> %3, %sum.phi 2868 %5 = mul nsw <8 x i32> %3, %3 2869 %6 = add nsw <8 x i32> %5, %vec.phi 2870 %index.next = add i64 %index, 8 2871 %7 = icmp eq i64 %index.next, %0 2872 br i1 %7, label %middle.block, label %vector.body 2873 2874middle.block: 2875 %rdx.shuf35 = shufflevector <8 x i32> %4, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> 2876 %bin.rdx36 = add <8 x i32> %4, %rdx.shuf35 2877 %rdx.shuf37 = shufflevector <8 x i32> %bin.rdx36, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 
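; The shufflevector/add pairs in this middle.block perform a tree reduction of
; each <8 x i32> accumulator: lanes 4-7 are folded onto lanes 0-3, then lanes
; 2-3 onto 0-1, then lane 1 onto lane 0, and element 0 of each result is
; extracted before the two i32 sums are packed into the returned i64.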
2878 %bin.rdx38 = add <8 x i32> %bin.rdx36, %rdx.shuf37 2879 %rdx.shuf39 = shufflevector <8 x i32> %bin.rdx38, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2880 %bin.rdx40 = add <8 x i32> %bin.rdx38, %rdx.shuf39 2881 %8 = extractelement <8 x i32> %bin.rdx40, i32 0 2882 %rdx.shuf = shufflevector <8 x i32> %6, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> 2883 %bin.rdx = add <8 x i32> %6, %rdx.shuf 2884 %rdx.shuf31 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2885 %bin.rdx32 = add <8 x i32> %bin.rdx, %rdx.shuf31 2886 %rdx.shuf33 = shufflevector <8 x i32> %bin.rdx32, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2887 %bin.rdx34 = add <8 x i32> %bin.rdx32, %rdx.shuf33 2888 %9 = extractelement <8 x i32> %bin.rdx34, i32 0 2889 %tmp = zext i32 %8 to i64 2890 %tmp28 = shl nuw i64 %tmp, 32 2891 %tmp29 = zext i32 %9 to i64 2892 %tmp30 = or i64 %tmp28, %tmp29 2893 ret i64 %tmp30 2894} 2895 2896define i32 @sum_of_square_differences(ptr %a, ptr %b, i32 %n) { 2897; SSE2-LABEL: sum_of_square_differences: 2898; SSE2: # %bb.0: # %entry 2899; SSE2-NEXT: movl %edx, %eax 2900; SSE2-NEXT: pxor %xmm0, %xmm0 2901; SSE2-NEXT: xorl %ecx, %ecx 2902; SSE2-NEXT: pxor %xmm1, %xmm1 2903; SSE2-NEXT: .p2align 4 2904; SSE2-NEXT: .LBB34_1: # %vector.body 2905; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 2906; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero 2907; SSE2-NEXT: movq {{.*#+}} xmm3 = mem[0],zero 2908; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] 2909; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] 2910; SSE2-NEXT: psubw %xmm2, %xmm3 2911; SSE2-NEXT: pmaddwd %xmm3, %xmm3 2912; SSE2-NEXT: paddd %xmm3, %xmm1 2913; SSE2-NEXT: addq $8, %rcx 2914; SSE2-NEXT: cmpq %rcx, %rax 2915; SSE2-NEXT: jne .LBB34_1 2916; SSE2-NEXT: # %bb.2: # %middle.block 2917; SSE2-NEXT: paddd %xmm0, %xmm1 2918; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] 2919; SSE2-NEXT: paddd %xmm1, %xmm0 2920; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 2921; SSE2-NEXT: paddd %xmm0, %xmm1 2922; SSE2-NEXT: movd %xmm1, %eax 2923; SSE2-NEXT: retq 2924; 2925; AVX1-LABEL: sum_of_square_differences: 2926; AVX1: # %bb.0: # %entry 2927; AVX1-NEXT: movl %edx, %eax 2928; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 2929; AVX1-NEXT: xorl %ecx, %ecx 2930; AVX1-NEXT: .p2align 4 2931; AVX1-NEXT: .LBB34_1: # %vector.body 2932; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 2933; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 2934; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 2935; AVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1 2936; AVX1-NEXT: vpmaddwd %xmm1, %xmm1, %xmm1 2937; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm1 2938; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 2939; AVX1-NEXT: addq $8, %rcx 2940; AVX1-NEXT: cmpq %rcx, %rax 2941; AVX1-NEXT: jne .LBB34_1 2942; AVX1-NEXT: # %bb.2: # %middle.block 2943; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 2944; AVX1-NEXT: vpaddd %xmm1, 
%xmm0, %xmm0 2945; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 2946; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 2947; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 2948; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 2949; AVX1-NEXT: vmovd %xmm0, %eax 2950; AVX1-NEXT: vzeroupper 2951; AVX1-NEXT: retq 2952; 2953; AVX256-LABEL: sum_of_square_differences: 2954; AVX256: # %bb.0: # %entry 2955; AVX256-NEXT: movl %edx, %eax 2956; AVX256-NEXT: vpxor %xmm0, %xmm0, %xmm0 2957; AVX256-NEXT: xorl %ecx, %ecx 2958; AVX256-NEXT: .p2align 4 2959; AVX256-NEXT: .LBB34_1: # %vector.body 2960; AVX256-NEXT: # =>This Inner Loop Header: Depth=1 2961; AVX256-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 2962; AVX256-NEXT: vpmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 2963; AVX256-NEXT: vpsubw %xmm1, %xmm2, %xmm1 2964; AVX256-NEXT: vpmaddwd %xmm1, %xmm1, %xmm1 2965; AVX256-NEXT: vpaddd %ymm0, %ymm1, %ymm0 2966; AVX256-NEXT: addq $8, %rcx 2967; AVX256-NEXT: cmpq %rcx, %rax 2968; AVX256-NEXT: jne .LBB34_1 2969; AVX256-NEXT: # %bb.2: # %middle.block 2970; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1 2971; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 2972; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 2973; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 2974; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 2975; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 2976; AVX256-NEXT: vmovd %xmm0, %eax 2977; AVX256-NEXT: vzeroupper 2978; AVX256-NEXT: retq 2979entry: 2980 %0 = zext i32 %n to i64 2981 br label %vector.body 2982 2983vector.body: 2984 %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ] 2985 %vec.phi = phi <8 x i32> [ %9, %vector.body ], [ zeroinitializer, %entry ] 2986 %1 = getelementptr inbounds i8, ptr %a, i64 %index 2987 %2 = bitcast ptr %1 to ptr 2988 %wide.load = load <8 x i8>, ptr %2, align 1 2989 %3 = zext <8 x i8> %wide.load to <8 x i32> 2990 %4 = getelementptr inbounds i8, ptr %b, i64 %index 2991 %5 = bitcast ptr %4 to ptr 2992 %wide.load2 = load <8 x i8>, ptr %5, align 1 2993 %6 = zext <8 x i8> %wide.load2 to <8 x i32> 2994 %7 = sub <8 x i32> %6, %3 2995 %8 = mul <8 x i32> %7, %7 2996 %9 = add nsw <8 x i32> %8, %vec.phi 2997 %index.next = add i64 %index, 8 2998 %10 = icmp eq i64 %index.next, %0 2999 br i1 %10, label %middle.block, label %vector.body 3000 3001middle.block: 3002 %rdx.shuf = shufflevector <8 x i32> %9, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> 3003 %bin.rdx = add <8 x i32> %9, %rdx.shuf 3004 %rdx.shuf31 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 3005 %bin.rdx32 = add <8 x i32> %bin.rdx, %rdx.shuf31 3006 %rdx.shuf33 = shufflevector <8 x i32> %bin.rdx32, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 3007 %bin.rdx34 = add <8 x i32> %bin.rdx32, %rdx.shuf33 3008 %11 = extractelement <8 x i32> %bin.rdx34, i32 0 3009 ret i32 %11 3010} 3011 3012; PR49716 - https://llvm.org/PR49716 3013 3014define <4 x i32> @input_size_mismatch(<16 x i16> %x, ptr %p) { 3015; SSE2-LABEL: input_size_mismatch: 3016; SSE2: # %bb.0: 3017; SSE2-NEXT: pmaddwd (%rdi), %xmm0 3018; SSE2-NEXT: retq 3019; 3020; AVX-LABEL: input_size_mismatch: 3021; AVX: # %bb.0: 3022; AVX-NEXT: vpmaddwd (%rdi), %xmm0, %xmm0 3023; AVX-NEXT: vzeroupper 3024; AVX-NEXT: retq 3025 
%y = load <16 x i16>, ptr %p, align 32 3026 %x0 = shufflevector <16 x i16> %x, <16 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> 3027 %x1 = shufflevector <16 x i16> %x, <16 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7> 3028 %y0 = shufflevector <16 x i16> %y, <16 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> 3029 %y1 = shufflevector <16 x i16> %y, <16 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7> 3030 %sx0 = sext <4 x i16> %x0 to <4 x i32> 3031 %sx1 = sext <4 x i16> %x1 to <4 x i32> 3032 %sy0 = sext <4 x i16> %y0 to <4 x i32> 3033 %sy1 = sext <4 x i16> %y1 to <4 x i32> 3034 %m0 = mul <4 x i32> %sx0, %sy0 3035 %m1 = mul <4 x i32> %sx1, %sy1 3036 %r = add <4 x i32> %m0, %m1 3037 ret <4 x i32> %r 3038} 3039 3040define <4 x i32> @output_size_mismatch(<16 x i16> %x, <16 x i16> %y) { 3041; SSE2-LABEL: output_size_mismatch: 3042; SSE2: # %bb.0: 3043; SSE2-NEXT: pmaddwd %xmm2, %xmm0 3044; SSE2-NEXT: retq 3045; 3046; AVX-LABEL: output_size_mismatch: 3047; AVX: # %bb.0: 3048; AVX-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 3049; AVX-NEXT: vzeroupper 3050; AVX-NEXT: retq 3051 %x0 = shufflevector <16 x i16> %x, <16 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> 3052 %x1 = shufflevector <16 x i16> %x, <16 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7> 3053 %y0 = shufflevector <16 x i16> %y, <16 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> 3054 %y1 = shufflevector <16 x i16> %y, <16 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7> 3055 %sx0 = sext <4 x i16> %x0 to <4 x i32> 3056 %sx1 = sext <4 x i16> %x1 to <4 x i32> 3057 %sy0 = sext <4 x i16> %y0 to <4 x i32> 3058 %sy1 = sext <4 x i16> %y1 to <4 x i32> 3059 %m0 = mul <4 x i32> %sx0, %sy0 3060 %m1 = mul <4 x i32> %sx1, %sy1 3061 %r = add <4 x i32> %m0, %m1 3062 ret <4 x i32> %r 3063} 3064 3065define <4 x i32> @output_size_mismatch_high_subvector(<16 x i16> %x, <16 x i16> %y) { 3066; SSE2-LABEL: output_size_mismatch_high_subvector: 3067; SSE2: # %bb.0: 3068; SSE2-NEXT: movdqa %xmm1, %xmm0 3069; SSE2-NEXT: pmaddwd %xmm2, %xmm0 3070; SSE2-NEXT: retq 3071; 3072; AVX1-LABEL: output_size_mismatch_high_subvector: 3073; AVX1: # %bb.0: 3074; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 3075; AVX1-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 3076; AVX1-NEXT: vzeroupper 3077; AVX1-NEXT: retq 3078; 3079; AVX256-LABEL: output_size_mismatch_high_subvector: 3080; AVX256: # %bb.0: 3081; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm0 3082; AVX256-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 3083; AVX256-NEXT: vzeroupper 3084; AVX256-NEXT: retq 3085 %x0 = shufflevector <16 x i16> %x, <16 x i16> undef, <4 x i32> <i32 8, i32 10, i32 12, i32 14> 3086 %x1 = shufflevector <16 x i16> %x, <16 x i16> undef, <4 x i32> <i32 9, i32 11, i32 13, i32 15> 3087 %y0 = shufflevector <16 x i16> %y, <16 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> 3088 %y1 = shufflevector <16 x i16> %y, <16 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7> 3089 %sx0 = sext <4 x i16> %x0 to <4 x i32> 3090 %sx1 = sext <4 x i16> %x1 to <4 x i32> 3091 %sy0 = sext <4 x i16> %y0 to <4 x i32> 3092 %sy1 = sext <4 x i16> %y1 to <4 x i32> 3093 %m0 = mul <4 x i32> %sx0, %sy0 3094 %m1 = mul <4 x i32> %sx1, %sy1 3095 %r = add <4 x i32> %m0, %m1 3096 ret <4 x i32> %r 3097} 3098 3099define i32 @add_used_by_loop_phi(ptr %a, ptr %b, i64 %offset_a, i64 %offset_b, i64 %k) { 3100; SSE2-LABEL: add_used_by_loop_phi: 3101; SSE2: # %bb.0: # %entry 3102; SSE2-NEXT: addq %rdx, %rdi 3103; SSE2-NEXT: addq %rcx, %rsi 3104; SSE2-NEXT: pxor %xmm0, %xmm0 3105; SSE2-NEXT: xorl %eax, %eax 3106; SSE2-NEXT: pxor 
%xmm2, %xmm2 3107; SSE2-NEXT: pxor %xmm1, %xmm1 3108; SSE2-NEXT: .p2align 4 3109; SSE2-NEXT: .LBB38_1: # %loop 3110; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 3111; SSE2-NEXT: movdqu (%rdi,%rax), %xmm3 3112; SSE2-NEXT: movdqu (%rsi,%rax), %xmm4 3113; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] 3114; SSE2-NEXT: psraw $8, %xmm5 3115; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] 3116; SSE2-NEXT: psraw $8, %xmm6 3117; SSE2-NEXT: pmaddwd %xmm5, %xmm6 3118; SSE2-NEXT: paddd %xmm6, %xmm2 3119; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 3120; SSE2-NEXT: psraw $8, %xmm4 3121; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 3122; SSE2-NEXT: psraw $8, %xmm3 3123; SSE2-NEXT: pmaddwd %xmm4, %xmm3 3124; SSE2-NEXT: paddd %xmm3, %xmm1 3125; SSE2-NEXT: addq $16, %rax 3126; SSE2-NEXT: cmpq %r8, %rax 3127; SSE2-NEXT: jb .LBB38_1 3128; SSE2-NEXT: # %bb.2: # %afterloop 3129; SSE2-NEXT: paddd %xmm0, %xmm2 3130; SSE2-NEXT: paddd %xmm0, %xmm1 3131; SSE2-NEXT: paddd %xmm2, %xmm1 3132; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] 3133; SSE2-NEXT: paddd %xmm1, %xmm0 3134; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 3135; SSE2-NEXT: paddd %xmm0, %xmm1 3136; SSE2-NEXT: movd %xmm1, %eax 3137; SSE2-NEXT: retq 3138; 3139; AVX1-LABEL: add_used_by_loop_phi: 3140; AVX1: # %bb.0: # %entry 3141; AVX1-NEXT: addq %rdx, %rdi 3142; AVX1-NEXT: addq %rcx, %rsi 3143; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 3144; AVX1-NEXT: xorl %eax, %eax 3145; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 3146; AVX1-NEXT: .p2align 4 3147; AVX1-NEXT: .LBB38_1: # %loop 3148; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 3149; AVX1-NEXT: vpmovsxbw 8(%rdi,%rax), %xmm2 3150; AVX1-NEXT: vpmovsxbw (%rdi,%rax), %xmm3 3151; AVX1-NEXT: vpmovsxbw 8(%rsi,%rax), %xmm4 3152; AVX1-NEXT: vpmaddwd %xmm4, %xmm2, %xmm2 3153; AVX1-NEXT: vpmovsxbw (%rsi,%rax), %xmm4 3154; AVX1-NEXT: vpmaddwd %xmm4, %xmm3, %xmm3 3155; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 3156; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2 3157; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1 3158; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 3159; AVX1-NEXT: addq $16, %rax 3160; AVX1-NEXT: cmpq %r8, %rax 3161; AVX1-NEXT: jb .LBB38_1 3162; AVX1-NEXT: # %bb.2: # %afterloop 3163; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 3164; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 3165; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 3166; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 3167; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 3168; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 3169; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 3170; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 3171; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 3172; AVX1-NEXT: vmovd %xmm0, %eax 3173; AVX1-NEXT: vzeroupper 3174; AVX1-NEXT: retq 3175; 3176; AVX2-LABEL: add_used_by_loop_phi: 3177; AVX2: # %bb.0: # %entry 3178; AVX2-NEXT: addq %rdx, %rdi 3179; AVX2-NEXT: addq %rcx, %rsi 3180; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 3181; AVX2-NEXT: xorl %eax, %eax 3182; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 3183; AVX2-NEXT: .p2align 4 3184; AVX2-NEXT: .LBB38_1: # %loop 3185; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 3186; AVX2-NEXT: vpmovsxbw (%rdi,%rax), %ymm2 3187; AVX2-NEXT: vpmovsxbw (%rsi,%rax), %ymm3 3188; AVX2-NEXT: vpmaddwd %ymm3, %ymm2, %ymm2 3189; AVX2-NEXT: 
vpaddd %ymm1, %ymm2, %ymm1 3190; AVX2-NEXT: addq $16, %rax 3191; AVX2-NEXT: cmpq %r8, %rax 3192; AVX2-NEXT: jb .LBB38_1 3193; AVX2-NEXT: # %bb.2: # %afterloop 3194; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 3195; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 3196; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 3197; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 3198; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 3199; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 3200; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 3201; AVX2-NEXT: vmovd %xmm0, %eax 3202; AVX2-NEXT: vzeroupper 3203; AVX2-NEXT: retq 3204; 3205; AVX512-LABEL: add_used_by_loop_phi: 3206; AVX512: # %bb.0: # %entry 3207; AVX512-NEXT: addq %rdx, %rdi 3208; AVX512-NEXT: addq %rcx, %rsi 3209; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 3210; AVX512-NEXT: xorl %eax, %eax 3211; AVX512-NEXT: .p2align 4 3212; AVX512-NEXT: .LBB38_1: # %loop 3213; AVX512-NEXT: # =>This Inner Loop Header: Depth=1 3214; AVX512-NEXT: vpmovsxbw (%rdi,%rax), %ymm1 3215; AVX512-NEXT: vpmovsxbw (%rsi,%rax), %ymm2 3216; AVX512-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1 3217; AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0 3218; AVX512-NEXT: addq $16, %rax 3219; AVX512-NEXT: cmpq %r8, %rax 3220; AVX512-NEXT: jb .LBB38_1 3221; AVX512-NEXT: # %bb.2: # %afterloop 3222; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 3223; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 3224; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 3225; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 3226; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 3227; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 3228; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 3229; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 3230; AVX512-NEXT: vmovd %xmm0, %eax 3231; AVX512-NEXT: vzeroupper 3232; AVX512-NEXT: retq 3233entry: 3234 %scevgep_a = getelementptr i8, ptr %a, i64 %offset_a 3235 %scevgep_b = getelementptr i8, ptr %b, i64 %offset_b 3236 br label %loop 3237 3238loop: 3239 %t0 = phi <16 x i32> [ %3, %loop ], [ zeroinitializer, %entry ] 3240 %ivloop = phi i64 [ %nextivloop, %loop ], [ 0, %entry ] 3241 %scevgep_a1 = getelementptr i8, ptr %scevgep_a, i64 %ivloop 3242 %scevgep_a2 = bitcast ptr %scevgep_a1 to ptr 3243 %gepload_a = load <16 x i8>, ptr %scevgep_a2, align 1 3244 %scevgep_b1 = getelementptr i8, ptr %scevgep_b, i64 %ivloop 3245 %scevgep_b2 = bitcast ptr %scevgep_b1 to ptr 3246 %gepload_b = load <16 x i8>, ptr %scevgep_b2, align 1 3247 %0 = sext <16 x i8> %gepload_a to <16 x i32> 3248 %1 = sext <16 x i8> %gepload_b to <16 x i32> 3249 %2 = mul nsw <16 x i32> %0, %1 3250 %3 = add <16 x i32> %2, %t0 3251 %nextivloop = add nuw nsw i64 %ivloop, 16 3252 %condloop = icmp ult i64 %nextivloop, %k 3253 br i1 %condloop, label %loop, label %afterloop 3254 3255afterloop: 3256 %.lcssa = phi <16 x i32> [ %3, %loop ] 3257 %rdx.shuf = shufflevector <16 x i32> %.lcssa, <16 x i32> poison, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 3258 %bin.rdx = add <16 x i32> %.lcssa, %rdx.shuf 3259 %rdx.shuf90 = shufflevector <16 x i32> %bin.rdx, <16 x i32> poison, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 3260 %bin.rdx91 = add <16 x i32> %bin.rdx, %rdx.shuf90 3261 %rdx.shuf92 = shufflevector <16 x i32> %bin.rdx91, <16 x i32> poison, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 
undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx93 = add <16 x i32> %bin.rdx91, %rdx.shuf92
  %rdx.shuf94 = shufflevector <16 x i32> %bin.rdx93, <16 x i32> poison, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx95 = add <16 x i32> %bin.rdx93, %rdx.shuf94
  %sum = extractelement <16 x i32> %bin.rdx95, i32 0
  ret i32 %sum
}
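
; Illustrative sketch (an addition, not part of the autogenerated checks above):
; a minimal form of the even/odd pattern exercised throughout this file. For
; <8 x i16> inputs a and b, pmaddwd computes, per 32-bit lane i in 0..3:
;   result[i] = sext(a[2*i]) * sext(b[2*i]) + sext(a[2*i+1]) * sext(b[2*i+1])
; The function name below is hypothetical and has no CHECK lines attached; with
; the RUN lines above, the backend is expected to select a single (v)pmaddwd
; for this body.
define <4 x i32> @pmaddwd_pattern_sketch(<8 x i16> %a, <8 x i16> %b) {
  %a_even = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %a_odd = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %b_even = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %b_odd = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %a_even_ext = sext <4 x i16> %a_even to <4 x i32>
  %b_even_ext = sext <4 x i16> %b_even to <4 x i32>
  %a_odd_ext = sext <4 x i16> %a_odd to <4 x i32>
  %b_odd_ext = sext <4 x i16> %b_odd to <4 x i32>
  %even_mul = mul <4 x i32> %a_even_ext, %b_even_ext
  %odd_mul = mul <4 x i32> %a_odd_ext, %b_odd_ext
  %madd = add <4 x i32> %even_mul, %odd_mul
  ret <4 x i32> %madd
}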