; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX256,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX256,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVX256,AVX512BW

; NOTE: We're testing with loads because ABI lowering creates a concat_vectors that extract_vector_elt creation can see through.
; This would require the combine to recreate the concat_vectors.
define <8 x i16> @pmaddubsw_128(ptr %Aptr, ptr %Bptr) {
; SSE-LABEL: pmaddubsw_128:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rsi), %xmm0
; SSE-NEXT:    pmaddubsw (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: pmaddubsw_128:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rsi), %xmm0
; AVX-NEXT:    vpmaddubsw (%rdi), %xmm0, %xmm0
; AVX-NEXT:    retq
  %A = load <16 x i8>, ptr %Aptr
  %B = load <16 x i8>, ptr %Bptr
  %A_even = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %A_odd = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %B_even = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %B_odd = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %A_even_ext = sext <8 x i8> %A_even to <8 x i32>
  %B_even_ext = zext <8 x i8> %B_even to <8 x i32>
  %A_odd_ext = sext <8 x i8> %A_odd to <8 x i32>
  %B_odd_ext = zext <8 x i8> %B_odd to <8 x i32>
  %even_mul = mul <8 x i32> %A_even_ext, %B_even_ext
  %odd_mul = mul <8 x i32> %A_odd_ext, %B_odd_ext
  %add = add <8 x i32> %even_mul, %odd_mul
  %cmp_max = icmp sgt <8 x i32> %add, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %max = select <8 x i1> %cmp_max, <8 x i32> %add, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %cmp_min = icmp slt <8 x i32> %max, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %min = select <8 x i1> %cmp_min, <8 x i32> %max, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %trunc = trunc <8 x i32> %min to <8 x i16>
  ret <8 x i16> %trunc
}

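; For reference, the sequence above computes, per output lane i (a sketch of the scalar
; semantics being matched, not part of the checked output):
;   %trunc[i] = sat_i16( sext(%A[2*i]) * zext(%B[2*i]) + sext(%A[2*i+1]) * zext(%B[2*i+1]) )
; where sat_i16 clamps to [-32768, 32767]; the icmp/select pair followed by the trunc is that
; signed saturation. This is the PMADDUBSW operation: one operand supplies signed bytes (%A
; here), the other unsigned bytes (%B here), and adjacent products are summed with signed
; saturation to i16.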

define <16 x i16> @pmaddubsw_256(ptr %Aptr, ptr %Bptr) {
; SSE-LABEL: pmaddubsw_256:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rsi), %xmm0
; SSE-NEXT:    movdqa 16(%rsi), %xmm1
; SSE-NEXT:    pmaddubsw (%rdi), %xmm0
; SSE-NEXT:    pmaddubsw 16(%rdi), %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: pmaddubsw_256:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rsi), %xmm0
; AVX1-NEXT:    vmovdqa 16(%rsi), %xmm1
; AVX1-NEXT:    vpmaddubsw 16(%rdi), %xmm1, %xmm1
; AVX1-NEXT:    vpmaddubsw (%rdi), %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX256-LABEL: pmaddubsw_256:
; AVX256:       # %bb.0:
; AVX256-NEXT:    vmovdqa (%rsi), %ymm0
; AVX256-NEXT:    vpmaddubsw (%rdi), %ymm0, %ymm0
; AVX256-NEXT:    retq
  %A = load <32 x i8>, ptr %Aptr
  %B = load <32 x i8>, ptr %Bptr
  %A_even = shufflevector <32 x i8> %A, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  %A_odd = shufflevector <32 x i8> %A, <32 x i8> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  %B_even = shufflevector <32 x i8> %B, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  %B_odd = shufflevector <32 x i8> %B, <32 x i8> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  %A_even_ext = sext <16 x i8> %A_even to <16 x i32>
  %B_even_ext = zext <16 x i8> %B_even to <16 x i32>
  %A_odd_ext = sext <16 x i8> %A_odd to <16 x i32>
  %B_odd_ext = zext <16 x i8> %B_odd to <16 x i32>
  %even_mul = mul <16 x i32> %A_even_ext, %B_even_ext
  %odd_mul = mul <16 x i32> %A_odd_ext, %B_odd_ext
  %add = add <16 x i32> %even_mul, %odd_mul
  %cmp_max = icmp sgt <16 x i32> %add, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %max = select <16 x i1> %cmp_max, <16 x i32> %add, <16 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %cmp_min = icmp slt <16 x i32> %max, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %min = select <16 x i1> %cmp_min, <16 x i32> %max, <16 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %trunc = trunc <16 x i32> %min to <16 x i16>
  ret <16 x i16> %trunc
}

define <64 x i16> @pmaddubsw_512(ptr %Aptr, ptr %Bptr) {
; SSE-LABEL: pmaddubsw_512:
; SSE:       # %bb.0:
; SSE-NEXT:    movq %rdi, %rax
; SSE-NEXT:    movdqa (%rdx), %xmm0
; SSE-NEXT:    movdqa 16(%rdx), %xmm1
; SSE-NEXT:    movdqa 32(%rdx), %xmm2
; SSE-NEXT:    movdqa 48(%rdx), %xmm3
; SSE-NEXT:    pmaddubsw (%rsi), %xmm0
; SSE-NEXT:    pmaddubsw 16(%rsi), %xmm1
; SSE-NEXT:    pmaddubsw 32(%rsi), %xmm2
; SSE-NEXT:    pmaddubsw 48(%rsi), %xmm3
; SSE-NEXT:    movdqa 64(%rdx), %xmm4
; SSE-NEXT:    pmaddubsw 64(%rsi), %xmm4
; SSE-NEXT:    movdqa 80(%rdx), %xmm5
; SSE-NEXT:    pmaddubsw 80(%rsi), %xmm5
; SSE-NEXT:    movdqa 96(%rdx), %xmm6
; SSE-NEXT:    pmaddubsw 96(%rsi), %xmm6
; SSE-NEXT:    movdqa 112(%rdx), %xmm7
; SSE-NEXT:    pmaddubsw 112(%rsi), %xmm7
; SSE-NEXT:    movdqa %xmm7, 112(%rdi)
; SSE-NEXT:    movdqa %xmm6, 96(%rdi)
; SSE-NEXT:    movdqa %xmm5, 80(%rdi)
; SSE-NEXT:    movdqa %xmm4, 64(%rdi)
; SSE-NEXT:    movdqa %xmm3, 48(%rdi)
; SSE-NEXT:    movdqa %xmm2, 32(%rdi)
; SSE-NEXT:    movdqa %xmm1, 16(%rdi)
; SSE-NEXT:    movdqa %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: pmaddubsw_512:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rsi), %xmm0
; AVX1-NEXT:    vmovdqa 16(%rsi), %xmm1
; AVX1-NEXT:    vmovdqa 32(%rsi), %xmm2
; AVX1-NEXT:    vmovdqa 48(%rsi), %xmm3
; AVX1-NEXT:    vpmaddubsw 16(%rdi), %xmm1, %xmm1
; AVX1-NEXT:    vpmaddubsw (%rdi), %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vpmaddubsw 48(%rdi), %xmm3, %xmm1
; AVX1-NEXT:    vpmaddubsw 32(%rdi), %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT:    vmovdqa 80(%rsi), %xmm2
; AVX1-NEXT:    vpmaddubsw 80(%rdi), %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa 64(%rsi), %xmm3
; AVX1-NEXT:    vpmaddubsw 64(%rdi), %xmm3, %xmm3
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT:    vmovdqa 112(%rsi), %xmm3
; AVX1-NEXT:    vpmaddubsw 112(%rdi), %xmm3, %xmm3
; AVX1-NEXT:    vmovdqa 96(%rsi), %xmm4
; AVX1-NEXT:    vpmaddubsw 96(%rdi), %xmm4, %xmm4
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
; AVX1-NEXT:    retq
;
; AVX2-LABEL: pmaddubsw_512:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rsi), %ymm0
; AVX2-NEXT:    vmovdqa 32(%rsi), %ymm1
; AVX2-NEXT:    vmovdqa 64(%rsi), %ymm2
; AVX2-NEXT:    vmovdqa 96(%rsi), %ymm3
; AVX2-NEXT:    vpmaddubsw (%rdi), %ymm0, %ymm0
; AVX2-NEXT:    vpmaddubsw 32(%rdi), %ymm1, %ymm1
; AVX2-NEXT:    vpmaddubsw 64(%rdi), %ymm2, %ymm2
; AVX2-NEXT:    vpmaddubsw 96(%rdi), %ymm3, %ymm3
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: pmaddubsw_512:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rsi), %ymm0
; AVX512F-NEXT:    vmovdqa 32(%rsi), %ymm1
; AVX512F-NEXT:    vmovdqa 64(%rsi), %ymm2
; AVX512F-NEXT:    vmovdqa 96(%rsi), %ymm3
; AVX512F-NEXT:    vpmaddubsw 32(%rdi), %ymm1, %ymm1
; AVX512F-NEXT:    vpmaddubsw (%rdi), %ymm0, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT:    vpmaddubsw 96(%rdi), %ymm3, %ymm1
; AVX512F-NEXT:    vpmaddubsw 64(%rdi), %ymm2, %ymm2
; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: pmaddubsw_512:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rsi), %zmm0
; AVX512BW-NEXT:    vmovdqa64 64(%rsi), %zmm1
; AVX512BW-NEXT:    vpmaddubsw (%rdi), %zmm0, %zmm0
; AVX512BW-NEXT:    vpmaddubsw 64(%rdi), %zmm1, %zmm1
; AVX512BW-NEXT:    retq
  %A = load <128 x i8>, ptr %Aptr
  %B = load <128 x i8>, ptr %Bptr
  %A_even = shufflevector <128 x i8> %A, <128 x i8> undef, <64 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62, i32 64, i32 66, i32 68, i32 70, i32 72, i32 74, i32 76, i32 78, i32 80, i32 82, i32 84, i32 86, i32 88, i32 90, i32 92, i32 94, i32 96, i32 98, i32 100, i32 102, i32 104, i32 106, i32 108, i32 110, i32 112, i32 114, i32 116, i32 118, i32 120, i32 122, i32 124, i32 126>
  %A_odd = shufflevector <128 x i8> %A, <128 x i8> undef, <64 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63, i32 65, i32 67, i32 69, i32 71, i32 73, i32 75, i32 77, i32 79, i32 81, i32 83, i32 85, i32 87, i32 89, i32 91, i32 93, i32 95, i32 97, i32 99, i32 101, i32 103, i32 105, i32 107, i32 109, i32 111, i32 113, i32 115, i32 117, i32 119, i32 121, i32 123, i32 125, i32 127>
  %B_even = shufflevector <128 x i8> %B, <128 x i8> undef, <64 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62, i32 64, i32 66, i32 68, i32 70, i32 72, i32 74, i32 76, i32 78, i32 80, i32 82, i32 84, i32 86, i32 88, i32 90, i32 92, i32 94, i32 96, i32 98, i32 100, i32 102, i32 104, i32 106, i32 108, i32 110, i32 112, i32 114, i32 116, i32 118, i32 120, i32 122, i32 124, i32 126>
  %B_odd = shufflevector <128 x i8> %B, <128 x i8> undef, <64 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63, i32 65, i32 67, i32 69, i32 71, i32 73, i32 75, i32 77, i32 79, i32 81, i32 83, i32 85, i32 87, i32 89, i32 91, i32 93, i32 95, i32 97, i32 99, i32 101, i32 103, i32 105, i32 107, i32 109, i32 111, i32 113, i32 115, i32 117, i32 119, i32 121, i32 123, i32 125, i32 127>
  %A_even_ext = sext <64 x i8> %A_even to <64 x i32>
  %B_even_ext = zext <64 x i8> %B_even to <64 x i32>
  %A_odd_ext = sext <64 x i8> %A_odd to <64 x i32>
  %B_odd_ext = zext <64 x i8> %B_odd to <64 x i32>
  %even_mul = mul <64 x i32> %A_even_ext, %B_even_ext
  %odd_mul = mul <64 x i32> %A_odd_ext, %B_odd_ext
  %add = add <64 x i32> %even_mul, %odd_mul
  %cmp_max = icmp sgt <64 x i32> %add, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %max = select <64 x i1> %cmp_max, <64 x i32> %add, <64 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %cmp_min = icmp slt <64 x i32> %max, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %min = select <64 x i1> %cmp_min, <64 x i32> %max, <64 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %trunc = trunc <64 x i32> %min to <64 x i16>
  ret <64 x i16> %trunc
}

define <8 x i16> @pmaddubsw_swapped_indices(ptr %Aptr, ptr %Bptr) {
; SSE-LABEL: pmaddubsw_swapped_indices:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rsi), %xmm0
; SSE-NEXT:    pmaddubsw (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: pmaddubsw_swapped_indices:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rsi), %xmm0
; AVX-NEXT:    vpmaddubsw (%rdi), %xmm0, %xmm0
; AVX-NEXT:    retq
  %A = load <16 x i8>, ptr %Aptr
  %B = load <16 x i8>, ptr %Bptr
  %A_even = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 1, i32 2, i32 5, i32 6, i32 9, i32 10, i32 13, i32 14> ;indices aren't all even
  %A_odd = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 3, i32 4, i32 7, i32 8, i32 11, i32 12, i32 15> ;indices aren't all odd
  %B_even = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 1, i32 2, i32 5, i32 6, i32 9, i32 10, i32 13, i32 14> ;same indices as A
  %B_odd = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 0, i32 3, i32 4, i32 7, i32 8, i32 11, i32 12, i32 15> ;same indices as A
  %A_even_ext = sext <8 x i8> %A_even to <8 x i32>
  %B_even_ext = zext <8 x i8> %B_even to <8 x i32>
  %A_odd_ext = sext <8 x i8> %A_odd to <8 x i32>
  %B_odd_ext = zext <8 x i8> %B_odd to <8 x i32>
  %even_mul = mul <8 x i32> %A_even_ext, %B_even_ext
  %odd_mul = mul <8 x i32> %A_odd_ext, %B_odd_ext
  %add = add <8 x i32> %even_mul, %odd_mul
  %cmp_max = icmp sgt <8 x i32> %add, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %max = select <8 x i1> %cmp_max, <8 x i32> %add, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %cmp_min = icmp slt <8 x i32> %max, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %min = select <8 x i1> %cmp_min, <8 x i32> %max, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %trunc = trunc <8 x i32> %min to <8 x i16>
  ret <8 x i16> %trunc
}

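; Note on @pmaddubsw_swapped_indices above: the shuffles use a permuted index set rather than
; the canonical even/odd split, but A and B use the same permutation, so each output lane i
; still covers byte pair {2*i, 2*i+1} of both inputs. For example, lane 0 above is
; sext(A[1])*zext(B[1]) + sext(A[0])*zext(B[0]), which is the pmaddubsw result for bytes {0,1};
; presumably that is why the combine still fires here.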

define <8 x i16> @pmaddubsw_swapped_extend(ptr %Aptr, ptr %Bptr) {
; SSE-LABEL: pmaddubsw_swapped_extend:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rdi), %xmm0
; SSE-NEXT:    pmaddubsw (%rsi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: pmaddubsw_swapped_extend:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpmaddubsw (%rsi), %xmm0, %xmm0
; AVX-NEXT:    retq
  %A = load <16 x i8>, ptr %Aptr
  %B = load <16 x i8>, ptr %Bptr
  %A_even = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %A_odd = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %B_even = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %B_odd = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %A_even_ext = zext <8 x i8> %A_even to <8 x i32>
  %B_even_ext = sext <8 x i8> %B_even to <8 x i32>
  %A_odd_ext = zext <8 x i8> %A_odd to <8 x i32>
  %B_odd_ext = sext <8 x i8> %B_odd to <8 x i32>
  %even_mul = mul <8 x i32> %A_even_ext, %B_even_ext
  %odd_mul = mul <8 x i32> %A_odd_ext, %B_odd_ext
  %add = add <8 x i32> %even_mul, %odd_mul
  %cmp_max = icmp sgt <8 x i32> %add, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %max = select <8 x i1> %cmp_max, <8 x i32> %add, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %cmp_min = icmp slt <8 x i32> %max, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %min = select <8 x i1> %cmp_min, <8 x i32> %max, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %trunc = trunc <8 x i32> %min to <8 x i16>
  ret <8 x i16> %trunc
}

define <8 x i16> @pmaddubsw_commuted_mul(ptr %Aptr, ptr %Bptr) {
; SSE-LABEL: pmaddubsw_commuted_mul:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rsi), %xmm0
; SSE-NEXT:    pmaddubsw (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: pmaddubsw_commuted_mul:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rsi), %xmm0
; AVX-NEXT:    vpmaddubsw (%rdi), %xmm0, %xmm0
; AVX-NEXT:    retq
  %A = load <16 x i8>, ptr %Aptr
  %B = load <16 x i8>, ptr %Bptr
  %A_even = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %A_odd = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %B_even = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %B_odd = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %A_even_ext = sext <8 x i8> %A_even to <8 x i32>
  %B_even_ext = zext <8 x i8> %B_even to <8 x i32>
  %A_odd_ext = sext <8 x i8> %A_odd to <8 x i32>
  %B_odd_ext = zext <8 x i8> %B_odd to <8 x i32>
  %even_mul = mul <8 x i32> %B_even_ext, %A_even_ext
  %odd_mul = mul <8 x i32> %A_odd_ext, %B_odd_ext
  %add = add <8 x i32> %even_mul, %odd_mul
  %cmp_max = icmp sgt <8 x i32> %add, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %max = select <8 x i1> %cmp_max, <8 x i32> %add, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %cmp_min = icmp slt <8 x i32> %max, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %min = select <8 x i1> %cmp_min, <8 x i32> %max, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %trunc = trunc <8 x i32> %min to <8 x i16>
  ret <8 x i16> %trunc
}

; If the extensions don't match, see if we can use PMADDWD instead.
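; For comparison, a sketch of the PMADDWD semantics used as the fallback (not part of the
; checked output):
;   res32[i] = sext(a16[2*i]) * sext(b16[2*i]) + sext(a16[2*i+1]) * sext(b16[2*i+1])
; i.e. a signed i16 x i16 -> i32 multiply of adjacent pairs with no saturation. After each byte
; is widened to i16 with whichever extension the IR requests, both factors fit in a signed i16,
; so the products are exact; the trailing packssdw then supplies the signed saturation to i16.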
define <8 x i16> @pmaddubsw_bad_extend(ptr %Aptr, ptr %Bptr) {
; SSE-LABEL: pmaddubsw_bad_extend:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rdi), %xmm1
; SSE-NEXT:    movdqa (%rsi), %xmm2
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    psllw $8, %xmm0
; SSE-NEXT:    psraw $8, %xmm0
; SSE-NEXT:    movdqa %xmm2, %xmm3
; SSE-NEXT:    psraw $8, %xmm3
; SSE-NEXT:    movdqa %xmm2, %xmm4
; SSE-NEXT:    pshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,8],zero,xmm4[10],zero,xmm4[12],zero,xmm4[14],zero
; SSE-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; SSE-NEXT:    psrlw $8, %xmm1
; SSE-NEXT:    movdqa %xmm0, %xmm5
; SSE-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
; SSE-NEXT:    pmaddwd %xmm4, %xmm5
; SSE-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[0],zero,xmm2[2],zero,xmm2[4],zero,xmm2[6],zero,xmm2[u,u,u,u,u,u,u,u]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    pmaddwd %xmm2, %xmm0
; SSE-NEXT:    packssdw %xmm5, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: pmaddubsw_bad_extend:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovdqa (%rsi), %xmm1
; AVX1-NEXT:    vmovq {{.*#+}} xmm2 = [0,2,4,6,8,10,12,14,0,0,0,0,0,0,0,0]
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm3
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm2
; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[1,1,1,1]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm1[1,1,1,1]
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX1-NEXT:    vpmovsxbw %xmm1, %xmm1
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX1-NEXT:    vpmovsxbw %xmm3, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm3
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX1-NEXT:    vpmaddwd %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
; AVX1-NEXT:    vpmovsxbw %xmm6, %xmm3
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX1-NEXT:    vpmovsxbw %xmm4, %xmm3
; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[9],zero,xmm0[11],zero,xmm0[13],zero,xmm0[15],zero,xmm0[u,u,u,u,u,u,u,u]
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
; AVX1-NEXT:    vpmaddwd %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpackssdw %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX256-LABEL: pmaddubsw_bad_extend:
; AVX256:       # %bb.0:
; AVX256-NEXT:    vmovdqa (%rdi), %xmm0
; AVX256-NEXT:    vmovdqa (%rsi), %xmm1
; AVX256-NEXT:    vmovq {{.*#+}} xmm2 = [0,2,4,6,8,10,12,14,0,0,0,0,0,0,0,0]
; AVX256-NEXT:    vpshufb %xmm2, %xmm0, %xmm3
; AVX256-NEXT:    vmovq {{.*#+}} xmm4 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; AVX256-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
; AVX256-NEXT:    vpshufb %xmm2, %xmm1, %xmm2
; AVX256-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
; AVX256-NEXT:    vpmovsxbd %xmm3, %ymm3
; AVX256-NEXT:    vpmovzxbd {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
; AVX256-NEXT:    vpmaddwd %ymm2, %ymm3, %ymm2
; AVX256-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX256-NEXT:    vpmovsxbd %xmm1, %ymm1
; AVX256-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0
; AVX256-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
; AVX256-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX256-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
; AVX256-NEXT:    vzeroupper
; AVX256-NEXT:    retq
  %A = load <16 x i8>, ptr %Aptr
  %B = load <16 x i8>, ptr %Bptr
  %A_even = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %A_odd = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %B_even = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %B_odd = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %A_even_ext = sext <8 x i8> %A_even to <8 x i32>
  %B_even_ext = zext <8 x i8> %B_even to <8 x i32>
  %A_odd_ext = zext <8 x i8> %A_odd to <8 x i32>
  %B_odd_ext = sext <8 x i8> %B_odd to <8 x i32>
  %even_mul = mul <8 x i32> %A_even_ext, %B_even_ext
  %odd_mul = mul <8 x i32> %A_odd_ext, %B_odd_ext
  %add = add <8 x i32> %even_mul, %odd_mul
  %cmp_max = icmp sgt <8 x i32> %add, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %max = select <8 x i1> %cmp_max, <8 x i32> %add, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %cmp_min = icmp slt <8 x i32> %max, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %min = select <8 x i1> %cmp_min, <8 x i32> %max, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %trunc = trunc <8 x i32> %min to <8 x i16>
  ret <8 x i16> %trunc
}

define <8 x i16> @pmaddubsw_bad_indices(ptr %Aptr, ptr %Bptr) {
; SSE-LABEL: pmaddubsw_bad_indices:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rdi), %xmm1
; SSE-NEXT:    movdqa (%rsi), %xmm2
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[u,1,u,2,u,5,u,6,u,9,u,10,u,13,u,14]
; SSE-NEXT:    psraw $8, %xmm0
; SSE-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[u,0,u,3,u,4,u,7,u,8,u,11,u,12,u,15]
; SSE-NEXT:    psraw $8, %xmm1
; SSE-NEXT:    pxor %xmm3, %xmm3
; SSE-NEXT:    movdqa %xmm2, %xmm4
; SSE-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
; SSE-NEXT:    movdqa %xmm0, %xmm5
; SSE-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
; SSE-NEXT:    pmaddwd %xmm4, %xmm5
; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    pmaddwd %xmm2, %xmm0
; SSE-NEXT:    packssdw %xmm5, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: pmaddubsw_bad_indices:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovdqa (%rsi), %xmm1
; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[1,2,5,6,9,10,13,14,u,u,u,u,u,u,u,u]
; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,3,4,7,8,11,12,15,u,u,u,u,u,u,u,u]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[1,1,1,1]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15]
; AVX1-NEXT:    vpmovsxbw %xmm4, %xmm4
; AVX1-NEXT:    vpmovsxbw %xmm3, %xmm3
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; AVX1-NEXT:    vpmaddwd %xmm5, %xmm3, %xmm3
; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm0
; AVX1-NEXT:    vpmovsxbw %xmm2, %xmm2
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpackssdw %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX256-LABEL: pmaddubsw_bad_indices:
; AVX256:       # %bb.0:
; AVX256-NEXT:    vmovdqa (%rdi), %xmm0
; AVX256-NEXT:    vmovdqa (%rsi), %xmm1
; AVX256-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[1,2,5,6,9,10,13,14,u,u,u,u,u,u,u,u]
; AVX256-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,3,4,7,8,11,12,15,u,u,u,u,u,u,u,u]
; AVX256-NEXT:    vpshufb {{.*#+}} xmm3 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX256-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX256-NEXT:    vpmovsxbd %xmm2, %ymm2
; AVX256-NEXT:    vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
; AVX256-NEXT:    vpmaddwd %ymm3, %ymm2, %ymm2
; AVX256-NEXT:    vpmovsxbd %xmm0, %ymm0
; AVX256-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
; AVX256-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0
; AVX256-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
; AVX256-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX256-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
; AVX256-NEXT:    vzeroupper
; AVX256-NEXT:    retq
  %A = load <16 x i8>, ptr %Aptr
  %B = load <16 x i8>, ptr %Bptr
  %A_even = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 1, i32 2, i32 5, i32 6, i32 9, i32 10, i32 13, i32 14> ;indices aren't all even
  %A_odd = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 3, i32 4, i32 7, i32 8, i32 11, i32 12, i32 15> ;indices aren't all odd
  %B_even = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> ;different than A
  %B_odd = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> ;different than A
  %A_even_ext = sext <8 x i8> %A_even to <8 x i32>
  %B_even_ext = zext <8 x i8> %B_even to <8 x i32>
  %A_odd_ext = sext <8 x i8> %A_odd to <8 x i32>
  %B_odd_ext = zext <8 x i8> %B_odd to <8 x i32>
  %even_mul = mul <8 x i32> %A_even_ext, %B_even_ext
  %odd_mul = mul <8 x i32> %A_odd_ext, %B_odd_ext
  %add = add <8 x i32> %even_mul, %odd_mul
  %cmp_max = icmp sgt <8 x i32> %add, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %max = select <8 x i1> %cmp_max, <8 x i32> %add, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %cmp_min = icmp slt <8 x i32> %max, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %min = select <8 x i1> %cmp_min, <8 x i32> %max, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %trunc = trunc <8 x i32> %min to <8 x i16>
  ret <8 x i16> %trunc
}

define <8 x i16> @pmaddubsw_large_vector(ptr %p1, ptr %p2) {
; SSE-LABEL: pmaddubsw_large_vector:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rdi), %xmm0
; SSE-NEXT:    pmaddubsw (%rsi), %xmm0
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: pmaddubsw_large_vector:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpmaddubsw (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6],xmm0[7]
; AVX-NEXT:    retq
  %1 = load <64 x i8>, ptr %p1, align 64
  %2 = shufflevector <64 x i8> %1, <64 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %3 = shufflevector <64 x i8> %1, <64 x i8> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %4 = load <32 x i8>, ptr %p2, align 64
  %5 = shufflevector <32 x i8> %4, <32 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %6 = shufflevector <32 x i8> %4, <32 x i8> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %7 = sext <8 x i8> %5 to <8 x i32>
  %8 = zext <8 x i8> %2 to <8 x i32>
  %9 = mul nsw <8 x i32> %7, %8
  %10 = sext <8 x i8> %6 to <8 x i32>
  %11 = zext <8 x i8> %3 to <8 x i32>
  %12 = mul nsw <8 x i32> %10, %11
  %13 = add nsw <8 x i32> %9, %12
  %14 = tail call <8 x i32> @llvm.smin.v8i32(<8 x i32> %13, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>)
  %15 = tail call <8 x i32> @llvm.smax.v8i32(<8 x i32> %14, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>)
  %16 = trunc <8 x i32> %15 to <8 x i16>
  %17 = shufflevector <8 x i16> zeroinitializer, <8 x i16> %16, <8 x i32> <i32 0, i32 1, i32 10, i32 3, i32 4, i32 13, i32 6, i32 15>
  ret <8 x i16> %17
}

declare <8 x i32> @llvm.smin.v8i32(<8 x i32>, <8 x i32>)
declare <8 x i32> @llvm.smax.v8i32(<8 x i32>, <8 x i32>)