1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=silvermont | FileCheck %s --check-prefixes=SSE-32,SLM,SLM-32 3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=silvermont | FileCheck %s --check-prefixes=SSE-64,SLM,SLM-64 4; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse4.2,+slow-pmulld | FileCheck %s --check-prefixes=SSE-32,SLOW,SLOW-32 5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2,+slow-pmulld | FileCheck %s --check-prefixes=SSE-64,SLOW,SLOW-64 6; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE-32,SSE4,SSE4-32 7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE-64,SSE4,SSE4-64 8; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+avx2,+slow-pmulld | FileCheck %s --check-prefixes=AVX2,AVX2-SLOW,AVX2-SLOW32 9; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+slow-pmulld | FileCheck %s --check-prefixes=AVX2,AVX2-SLOW,AVX2-SLOW64 10; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2,AVX-32,AVX2-32 11; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2,AVX-64,AVX2-64 12; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=AVX2,AVX-32,AVX512-32,AVX512DQ-32 13; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=AVX2,AVX-64,AVX512-64,AVX512DQ-64 14; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX2,AVX-32,AVX512-32,AVX512BW-32 15; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX2,AVX-64,AVX512-64,AVX512BW-64 16; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=knl | FileCheck %s --check-prefixes=AVX2,AVX-32,AVX512-32,KNL-32 17; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl | FileCheck %s --check-prefixes=AVX2,AVX-64,AVX512-64,KNL-64 18 19; Make sure that the slow-pmulld feature can be used without SSE4.1. 20; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=silvermont -mattr=-sse4.1 21 22define <4 x i32> @test_mul_v4i32_v4i8(<4 x i8> %A) { 23; SSE-32-LABEL: test_mul_v4i32_v4i8: 24; SSE-32: # %bb.0: 25; SSE-32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 26; SSE-32-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [18778,0,18778,0,18778,0,18778,0] 27; SSE-32-NEXT: retl 28; 29; SSE-64-LABEL: test_mul_v4i32_v4i8: 30; SSE-64: # %bb.0: 31; SSE-64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 32; SSE-64-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [18778,0,18778,0,18778,0,18778,0] 33; SSE-64-NEXT: retq 34; 35; AVX2-SLOW32-LABEL: test_mul_v4i32_v4i8: 36; AVX2-SLOW32: # %bb.0: 37; AVX2-SLOW32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 38; AVX2-SLOW32-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # [18778,0,18778,0,18778,0,18778,0] 39; AVX2-SLOW32-NEXT: retl 40; 41; AVX2-SLOW64-LABEL: test_mul_v4i32_v4i8: 42; AVX2-SLOW64: # %bb.0: 43; AVX2-SLOW64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 44; AVX2-SLOW64-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [18778,0,18778,0,18778,0,18778,0] 45; AVX2-SLOW64-NEXT: retq 46; 47; AVX2-32-LABEL: test_mul_v4i32_v4i8: 48; AVX2-32: # %bb.0: 49; AVX2-32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 50; AVX2-32-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # [18778,0,18778,0,18778,0,18778,0] 51; AVX2-32-NEXT: retl 52; 53; AVX2-64-LABEL: test_mul_v4i32_v4i8: 54; AVX2-64: # %bb.0: 55; AVX2-64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 56; AVX2-64-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [18778,0,18778,0,18778,0,18778,0] 57; AVX2-64-NEXT: retq 58; 59; AVX512DQ-32-LABEL: test_mul_v4i32_v4i8: 60; AVX512DQ-32: # %bb.0: 61; AVX512DQ-32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 62; AVX512DQ-32-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # [18778,0,18778,0,18778,0,18778,0] 63; AVX512DQ-32-NEXT: retl 64; 65; AVX512DQ-64-LABEL: test_mul_v4i32_v4i8: 66; AVX512DQ-64: # %bb.0: 67; AVX512DQ-64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 68; AVX512DQ-64-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [18778,0,18778,0,18778,0,18778,0] 69; AVX512DQ-64-NEXT: retq 70; 71; AVX512BW-32-LABEL: test_mul_v4i32_v4i8: 72; AVX512BW-32: # %bb.0: 73; AVX512BW-32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 74; AVX512BW-32-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # [18778,0,18778,0,18778,0,18778,0] 75; AVX512BW-32-NEXT: retl 76; 77; AVX512BW-64-LABEL: test_mul_v4i32_v4i8: 78; AVX512BW-64: # %bb.0: 79; AVX512BW-64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 80; AVX512BW-64-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [18778,0,18778,0,18778,0,18778,0] 81; AVX512BW-64-NEXT: retq 82; 83; KNL-32-LABEL: test_mul_v4i32_v4i8: 84; KNL-32: # %bb.0: 85; KNL-32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 86; KNL-32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778] 87; KNL-32-NEXT: vpmulld %xmm1, %xmm0, %xmm0 88; KNL-32-NEXT: retl 89; 90; KNL-64-LABEL: test_mul_v4i32_v4i8: 91; KNL-64: # %bb.0: 92; KNL-64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 93; KNL-64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778] 94; KNL-64-NEXT: vpmulld %xmm1, %xmm0, %xmm0 95; KNL-64-NEXT: retq 96 %z = zext <4 x i8> %A to <4 x i32> 97 %m = mul nuw nsw <4 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778> 98 ret <4 x i32> %m 99} 100 101define <8 x i32> @test_mul_v8i32_v8i8(<8 x i8> %A) { 102; SLM-LABEL: test_mul_v8i32_v8i8: 103; SLM: # %bb.0: 104; SLM-NEXT: pmovsxwd {{.*#+}} xmm2 = [18778,18778,18778,18778] 105; SLM-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 106; SLM-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 107; SLM-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero 108; SLM-NEXT: pmaddwd %xmm2, %xmm0 109; SLM-NEXT: pmaddwd %xmm2, %xmm1 110; SLM-NEXT: ret{{[l|q]}} 111; 112; SLOW-LABEL: test_mul_v8i32_v8i8: 113; SLOW: # %bb.0: 114; SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 115; SLOW-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero 116; SLOW-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 117; SLOW-NEXT: pmovsxwd {{.*#+}} xmm2 = [18778,18778,18778,18778] 118; SLOW-NEXT: pmaddwd %xmm2, %xmm0 119; SLOW-NEXT: pmaddwd %xmm2, %xmm1 120; SLOW-NEXT: ret{{[l|q]}} 121; 122; SSE4-LABEL: test_mul_v8i32_v8i8: 123; SSE4: # %bb.0: 124; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 125; SSE4-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero 126; SSE4-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 127; SSE4-NEXT: pmovsxwd {{.*#+}} xmm2 = [18778,18778,18778,18778] 128; SSE4-NEXT: pmaddwd %xmm2, %xmm0 129; SSE4-NEXT: pmaddwd %xmm2, %xmm1 130; SSE4-NEXT: ret{{[l|q]}} 131; 132; AVX2-SLOW32-LABEL: test_mul_v8i32_v8i8: 133; AVX2-SLOW32: # %bb.0: 134; AVX2-SLOW32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 135; AVX2-SLOW32-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0] 136; AVX2-SLOW32-NEXT: retl 137; 138; AVX2-SLOW64-LABEL: test_mul_v8i32_v8i8: 139; AVX2-SLOW64: # %bb.0: 140; AVX2-SLOW64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 141; AVX2-SLOW64-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0] 142; AVX2-SLOW64-NEXT: retq 143; 144; AVX2-32-LABEL: test_mul_v8i32_v8i8: 145; AVX2-32: # %bb.0: 146; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 147; AVX2-32-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0] 148; AVX2-32-NEXT: retl 149; 150; AVX2-64-LABEL: test_mul_v8i32_v8i8: 151; AVX2-64: # %bb.0: 152; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 153; AVX2-64-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0] 154; AVX2-64-NEXT: retq 155; 156; AVX512DQ-32-LABEL: test_mul_v8i32_v8i8: 157; AVX512DQ-32: # %bb.0: 158; AVX512DQ-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 159; AVX512DQ-32-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0] 160; AVX512DQ-32-NEXT: retl 161; 162; AVX512DQ-64-LABEL: test_mul_v8i32_v8i8: 163; AVX512DQ-64: # %bb.0: 164; AVX512DQ-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 165; AVX512DQ-64-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0] 166; AVX512DQ-64-NEXT: retq 167; 168; AVX512BW-32-LABEL: test_mul_v8i32_v8i8: 169; AVX512BW-32: # %bb.0: 170; AVX512BW-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 171; AVX512BW-32-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0] 172; AVX512BW-32-NEXT: retl 173; 174; AVX512BW-64-LABEL: test_mul_v8i32_v8i8: 175; AVX512BW-64: # %bb.0: 176; AVX512BW-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 177; AVX512BW-64-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0] 178; AVX512BW-64-NEXT: retq 179; 180; KNL-32-LABEL: test_mul_v8i32_v8i8: 181; KNL-32: # %bb.0: 182; KNL-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 183; KNL-32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778] 184; KNL-32-NEXT: vpmulld %ymm1, %ymm0, %ymm0 185; KNL-32-NEXT: retl 186; 187; KNL-64-LABEL: test_mul_v8i32_v8i8: 188; KNL-64: # %bb.0: 189; KNL-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 190; KNL-64-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778] 191; KNL-64-NEXT: vpmulld %ymm1, %ymm0, %ymm0 192; KNL-64-NEXT: retq 193 %z = zext <8 x i8> %A to <8 x i32> 194 %m = mul nuw nsw <8 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778> 195 ret <8 x i32> %m 196} 197 198define <16 x i32> @test_mul_v16i32_v16i8(<16 x i8> %A) { 199; SLM-LABEL: test_mul_v16i32_v16i8: 200; SLM: # %bb.0: 201; SLM-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] 202; SLM-NEXT: pmovsxwd {{.*#+}} xmm5 = [18778,18778,18778,18778] 203; SLM-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] 204; SLM-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero 205; SLM-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 206; SLM-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 207; SLM-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero 208; SLM-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero 209; SLM-NEXT: pmaddwd %xmm5, %xmm0 210; SLM-NEXT: pmaddwd %xmm5, %xmm1 211; SLM-NEXT: pmaddwd %xmm5, %xmm2 212; SLM-NEXT: pmaddwd %xmm5, %xmm3 213; SLM-NEXT: ret{{[l|q]}} 214; 215; SLOW-LABEL: test_mul_v16i32_v16i8: 216; SLOW: # %bb.0: 217; SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] 218; SLOW-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero 219; SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 220; SLOW-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero 221; SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 222; SLOW-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero 223; SLOW-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 224; SLOW-NEXT: pmovsxwd {{.*#+}} xmm4 = [18778,18778,18778,18778] 225; SLOW-NEXT: pmaddwd %xmm4, %xmm0 226; SLOW-NEXT: pmaddwd %xmm4, %xmm1 227; SLOW-NEXT: pmaddwd %xmm4, %xmm2 228; SLOW-NEXT: pmaddwd %xmm4, %xmm3 229; SLOW-NEXT: ret{{[l|q]}} 230; 231; SSE4-LABEL: test_mul_v16i32_v16i8: 232; SSE4: # %bb.0: 233; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] 234; SSE4-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero 235; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 236; SSE4-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero 237; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 238; SSE4-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero 239; SSE4-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 240; SSE4-NEXT: pmovsxwd {{.*#+}} xmm4 = [18778,18778,18778,18778] 241; SSE4-NEXT: pmaddwd %xmm4, %xmm0 242; SSE4-NEXT: pmaddwd %xmm4, %xmm1 243; SSE4-NEXT: pmaddwd %xmm4, %xmm2 244; SSE4-NEXT: pmaddwd %xmm4, %xmm3 245; SSE4-NEXT: ret{{[l|q]}} 246; 247; AVX2-SLOW-LABEL: test_mul_v16i32_v16i8: 248; AVX2-SLOW: # %bb.0: 249; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 250; AVX2-SLOW-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero 251; AVX2-SLOW-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 252; AVX2-SLOW-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0] 253; AVX2-SLOW-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0 254; AVX2-SLOW-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1 255; AVX2-SLOW-NEXT: ret{{[l|q]}} 256; 257; AVX2-32-LABEL: test_mul_v16i32_v16i8: 258; AVX2-32: # %bb.0: 259; AVX2-32-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 260; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero 261; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 262; AVX2-32-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0] 263; AVX2-32-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0 264; AVX2-32-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1 265; AVX2-32-NEXT: retl 266; 267; AVX2-64-LABEL: test_mul_v16i32_v16i8: 268; AVX2-64: # %bb.0: 269; AVX2-64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 270; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero 271; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 272; AVX2-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0] 273; AVX2-64-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0 274; AVX2-64-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1 275; AVX2-64-NEXT: retq 276; 277; AVX512DQ-32-LABEL: test_mul_v16i32_v16i8: 278; AVX512DQ-32: # %bb.0: 279; AVX512DQ-32-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 280; AVX512DQ-32-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm0, %zmm0 281; AVX512DQ-32-NEXT: retl 282; 283; AVX512DQ-64-LABEL: test_mul_v16i32_v16i8: 284; AVX512DQ-64: # %bb.0: 285; AVX512DQ-64-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 286; AVX512DQ-64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 287; AVX512DQ-64-NEXT: retq 288; 289; AVX512BW-32-LABEL: test_mul_v16i32_v16i8: 290; AVX512BW-32: # %bb.0: 291; AVX512BW-32-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 292; AVX512BW-32-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0 # [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0] 293; AVX512BW-32-NEXT: retl 294; 295; AVX512BW-64-LABEL: test_mul_v16i32_v16i8: 296; AVX512BW-64: # %bb.0: 297; AVX512BW-64-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 298; AVX512BW-64-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0] 299; AVX512BW-64-NEXT: retq 300; 301; KNL-32-LABEL: test_mul_v16i32_v16i8: 302; KNL-32: # %bb.0: 303; KNL-32-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 304; KNL-32-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm0, %zmm0 305; KNL-32-NEXT: retl 306; 307; KNL-64-LABEL: test_mul_v16i32_v16i8: 308; KNL-64: # %bb.0: 309; KNL-64-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 310; KNL-64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 311; KNL-64-NEXT: retq 312 %z = zext <16 x i8> %A to <16 x i32> 313 %m = mul nuw nsw <16 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778> 314 ret <16 x i32> %m 315} 316 317define <4 x i32> @test_mul_v4i32_v4i16(<4 x i16> %A) { 318; SLM-LABEL: test_mul_v4i32_v4i16: 319; SLM: # %bb.0: 320; SLM-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778,u,u,u,u] 321; SLM-NEXT: movdqa %xmm0, %xmm2 322; SLM-NEXT: pmulhuw %xmm1, %xmm2 323; SLM-NEXT: pmullw %xmm1, %xmm0 324; SLM-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 325; SLM-NEXT: ret{{[l|q]}} 326; 327; SLOW-LABEL: test_mul_v4i32_v4i16: 328; SLOW: # %bb.0: 329; SLOW-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778,u,u,u,u] 330; SLOW-NEXT: movdqa %xmm0, %xmm2 331; SLOW-NEXT: pmulhuw %xmm1, %xmm2 332; SLOW-NEXT: pmullw %xmm1, %xmm0 333; SLOW-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 334; SLOW-NEXT: ret{{[l|q]}} 335; 336; SSE4-32-LABEL: test_mul_v4i32_v4i16: 337; SSE4-32: # %bb.0: 338; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 339; SSE4-32-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 340; SSE4-32-NEXT: retl 341; 342; SSE4-64-LABEL: test_mul_v4i32_v4i16: 343; SSE4-64: # %bb.0: 344; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 345; SSE4-64-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 346; SSE4-64-NEXT: retq 347; 348; AVX2-SLOW-LABEL: test_mul_v4i32_v4i16: 349; AVX2-SLOW: # %bb.0: 350; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [18778,18778,18778,18778,18778,18778,18778,18778] 351; AVX2-SLOW-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2 352; AVX2-SLOW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 353; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 354; AVX2-SLOW-NEXT: ret{{[l|q]}} 355; 356; AVX-32-LABEL: test_mul_v4i32_v4i16: 357; AVX-32: # %bb.0: 358; AVX-32-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 359; AVX-32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778] 360; AVX-32-NEXT: vpmulld %xmm1, %xmm0, %xmm0 361; AVX-32-NEXT: retl 362; 363; AVX-64-LABEL: test_mul_v4i32_v4i16: 364; AVX-64: # %bb.0: 365; AVX-64-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 366; AVX-64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778] 367; AVX-64-NEXT: vpmulld %xmm1, %xmm0, %xmm0 368; AVX-64-NEXT: retq 369 %z = zext <4 x i16> %A to <4 x i32> 370 %m = mul nuw nsw <4 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778> 371 ret <4 x i32> %m 372} 373 374define <8 x i32> @test_mul_v8i32_v8i16(<8 x i16> %A) { 375; SLM-LABEL: test_mul_v8i32_v8i16: 376; SLM: # %bb.0: 377; SLM-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778,18778,18778,18778,18778] 378; SLM-NEXT: movdqa %xmm0, %xmm2 379; SLM-NEXT: pmulhuw %xmm1, %xmm2 380; SLM-NEXT: pmullw %xmm0, %xmm1 381; SLM-NEXT: movdqa %xmm1, %xmm0 382; SLM-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 383; SLM-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 384; SLM-NEXT: ret{{[l|q]}} 385; 386; SLOW-LABEL: test_mul_v8i32_v8i16: 387; SLOW: # %bb.0: 388; SLOW-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778,18778,18778,18778,18778] 389; SLOW-NEXT: movdqa %xmm0, %xmm2 390; SLOW-NEXT: pmulhuw %xmm1, %xmm2 391; SLOW-NEXT: pmullw %xmm0, %xmm1 392; SLOW-NEXT: movdqa %xmm1, %xmm0 393; SLOW-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 394; SLOW-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 395; SLOW-NEXT: ret{{[l|q]}} 396; 397; SSE4-LABEL: test_mul_v8i32_v8i16: 398; SSE4: # %bb.0: 399; SSE4-NEXT: pxor %xmm1, %xmm1 400; SSE4-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 401; SSE4-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 402; SSE4-NEXT: pmovsxwd {{.*#+}} xmm1 = [18778,18778,18778,18778] 403; SSE4-NEXT: pmulld %xmm1, %xmm2 404; SSE4-NEXT: pmulld %xmm0, %xmm1 405; SSE4-NEXT: movdqa %xmm2, %xmm0 406; SSE4-NEXT: ret{{[l|q]}} 407; 408; AVX2-SLOW-LABEL: test_mul_v8i32_v8i16: 409; AVX2-SLOW: # %bb.0: 410; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [18778,18778,18778,18778,18778,18778,18778,18778] 411; AVX2-SLOW-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2 412; AVX2-SLOW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 413; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 414; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 415; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 416; AVX2-SLOW-NEXT: ret{{[l|q]}} 417; 418; AVX-32-LABEL: test_mul_v8i32_v8i16: 419; AVX-32: # %bb.0: 420; AVX-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 421; AVX-32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778] 422; AVX-32-NEXT: vpmulld %ymm1, %ymm0, %ymm0 423; AVX-32-NEXT: retl 424; 425; AVX-64-LABEL: test_mul_v8i32_v8i16: 426; AVX-64: # %bb.0: 427; AVX-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 428; AVX-64-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778] 429; AVX-64-NEXT: vpmulld %ymm1, %ymm0, %ymm0 430; AVX-64-NEXT: retq 431 %z = zext <8 x i16> %A to <8 x i32> 432 %m = mul nuw nsw <8 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778> 433 ret <8 x i32> %m 434} 435 436define <16 x i32> @test_mul_v16i32_v16i16(<16 x i16> %A) { 437; SLM-LABEL: test_mul_v16i32_v16i16: 438; SLM: # %bb.0: 439; SLM-NEXT: movdqa {{.*#+}} xmm3 = [18778,18778,18778,18778,18778,18778,18778,18778] 440; SLM-NEXT: movdqa %xmm0, %xmm4 441; SLM-NEXT: movdqa %xmm0, %xmm2 442; SLM-NEXT: movdqa %xmm1, %xmm5 443; SLM-NEXT: pmullw %xmm3, %xmm4 444; SLM-NEXT: pmulhuw %xmm3, %xmm2 445; SLM-NEXT: pmulhuw %xmm3, %xmm5 446; SLM-NEXT: pmullw %xmm1, %xmm3 447; SLM-NEXT: movdqa %xmm4, %xmm0 448; SLM-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] 449; SLM-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 450; SLM-NEXT: movdqa %xmm3, %xmm2 451; SLM-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] 452; SLM-NEXT: movdqa %xmm4, %xmm1 453; SLM-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] 454; SLM-NEXT: ret{{[l|q]}} 455; 456; SLOW-LABEL: test_mul_v16i32_v16i16: 457; SLOW: # %bb.0: 458; SLOW-NEXT: movdqa %xmm0, %xmm4 459; SLOW-NEXT: movdqa {{.*#+}} xmm3 = [18778,18778,18778,18778,18778,18778,18778,18778] 460; SLOW-NEXT: movdqa %xmm0, %xmm2 461; SLOW-NEXT: pmulhuw %xmm3, %xmm2 462; SLOW-NEXT: pmullw %xmm3, %xmm4 463; SLOW-NEXT: movdqa %xmm4, %xmm0 464; SLOW-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 465; SLOW-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] 466; SLOW-NEXT: movdqa %xmm1, %xmm5 467; SLOW-NEXT: pmulhuw %xmm3, %xmm5 468; SLOW-NEXT: pmullw %xmm1, %xmm3 469; SLOW-NEXT: movdqa %xmm3, %xmm2 470; SLOW-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] 471; SLOW-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] 472; SLOW-NEXT: movdqa %xmm4, %xmm1 473; SLOW-NEXT: ret{{[l|q]}} 474; 475; SSE4-LABEL: test_mul_v16i32_v16i16: 476; SSE4: # %bb.0: 477; SSE4-NEXT: movdqa %xmm0, %xmm4 478; SSE4-NEXT: pxor %xmm3, %xmm3 479; SSE4-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 480; SSE4-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] 481; SSE4-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 482; SSE4-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] 483; SSE4-NEXT: pmovsxwd {{.*#+}} xmm3 = [18778,18778,18778,18778] 484; SSE4-NEXT: pmulld %xmm3, %xmm0 485; SSE4-NEXT: pmulld %xmm3, %xmm4 486; SSE4-NEXT: pmulld %xmm3, %xmm2 487; SSE4-NEXT: pmulld %xmm1, %xmm3 488; SSE4-NEXT: movdqa %xmm4, %xmm1 489; SSE4-NEXT: ret{{[l|q]}} 490; 491; AVX2-SLOW-LABEL: test_mul_v16i32_v16i16: 492; AVX2-SLOW: # %bb.0: 493; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778] 494; AVX2-SLOW-NEXT: vpmulhuw %ymm1, %ymm0, %ymm2 495; AVX2-SLOW-NEXT: vpmullw %ymm1, %ymm0, %ymm0 496; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15] 497; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11] 498; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[0,1],ymm1[0,1] 499; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] 500; AVX2-SLOW-NEXT: ret{{[l|q]}} 501; 502; AVX2-32-LABEL: test_mul_v16i32_v16i16: 503; AVX2-32: # %bb.0: 504; AVX2-32-NEXT: vextracti128 $1, %ymm0, %xmm1 505; AVX2-32-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 506; AVX2-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 507; AVX2-32-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778] 508; AVX2-32-NEXT: vpmulld %ymm2, %ymm0, %ymm0 509; AVX2-32-NEXT: vpmulld %ymm2, %ymm1, %ymm1 510; AVX2-32-NEXT: retl 511; 512; AVX2-64-LABEL: test_mul_v16i32_v16i16: 513; AVX2-64: # %bb.0: 514; AVX2-64-NEXT: vextracti128 $1, %ymm0, %xmm1 515; AVX2-64-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 516; AVX2-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 517; AVX2-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778] 518; AVX2-64-NEXT: vpmulld %ymm2, %ymm0, %ymm0 519; AVX2-64-NEXT: vpmulld %ymm2, %ymm1, %ymm1 520; AVX2-64-NEXT: retq 521; 522; AVX512-32-LABEL: test_mul_v16i32_v16i16: 523; AVX512-32: # %bb.0: 524; AVX512-32-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 525; AVX512-32-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm0, %zmm0 526; AVX512-32-NEXT: retl 527; 528; AVX512-64-LABEL: test_mul_v16i32_v16i16: 529; AVX512-64: # %bb.0: 530; AVX512-64-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 531; AVX512-64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 532; AVX512-64-NEXT: retq 533 %z = zext <16 x i16> %A to <16 x i32> 534 %m = mul nuw nsw <16 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778> 535 ret <16 x i32> %m 536} 537 538; 539; MinSize Tests 540; 541 542define <4 x i32> @test_mul_v4i32_v4i8_minsize(<4 x i8> %A) minsize { 543; SSE-32-LABEL: test_mul_v4i32_v4i8_minsize: 544; SSE-32: # %bb.0: 545; SSE-32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 546; SSE-32-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [18778,0,18778,0,18778,0,18778,0] 547; SSE-32-NEXT: retl 548; 549; SSE-64-LABEL: test_mul_v4i32_v4i8_minsize: 550; SSE-64: # %bb.0: 551; SSE-64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 552; SSE-64-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [18778,0,18778,0,18778,0,18778,0] 553; SSE-64-NEXT: retq 554; 555; AVX2-SLOW32-LABEL: test_mul_v4i32_v4i8_minsize: 556; AVX2-SLOW32: # %bb.0: 557; AVX2-SLOW32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 558; AVX2-SLOW32-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # [18778,0,18778,0,18778,0,18778,0] 559; AVX2-SLOW32-NEXT: retl 560; 561; AVX2-SLOW64-LABEL: test_mul_v4i32_v4i8_minsize: 562; AVX2-SLOW64: # %bb.0: 563; AVX2-SLOW64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 564; AVX2-SLOW64-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [18778,0,18778,0,18778,0,18778,0] 565; AVX2-SLOW64-NEXT: retq 566; 567; AVX2-32-LABEL: test_mul_v4i32_v4i8_minsize: 568; AVX2-32: # %bb.0: 569; AVX2-32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 570; AVX2-32-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # [18778,0,18778,0,18778,0,18778,0] 571; AVX2-32-NEXT: retl 572; 573; AVX2-64-LABEL: test_mul_v4i32_v4i8_minsize: 574; AVX2-64: # %bb.0: 575; AVX2-64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 576; AVX2-64-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [18778,0,18778,0,18778,0,18778,0] 577; AVX2-64-NEXT: retq 578; 579; AVX512DQ-32-LABEL: test_mul_v4i32_v4i8_minsize: 580; AVX512DQ-32: # %bb.0: 581; AVX512DQ-32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 582; AVX512DQ-32-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # [18778,0,18778,0,18778,0,18778,0] 583; AVX512DQ-32-NEXT: retl 584; 585; AVX512DQ-64-LABEL: test_mul_v4i32_v4i8_minsize: 586; AVX512DQ-64: # %bb.0: 587; AVX512DQ-64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 588; AVX512DQ-64-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [18778,0,18778,0,18778,0,18778,0] 589; AVX512DQ-64-NEXT: retq 590; 591; AVX512BW-32-LABEL: test_mul_v4i32_v4i8_minsize: 592; AVX512BW-32: # %bb.0: 593; AVX512BW-32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 594; AVX512BW-32-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # [18778,0,18778,0,18778,0,18778,0] 595; AVX512BW-32-NEXT: retl 596; 597; AVX512BW-64-LABEL: test_mul_v4i32_v4i8_minsize: 598; AVX512BW-64: # %bb.0: 599; AVX512BW-64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 600; AVX512BW-64-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [18778,0,18778,0,18778,0,18778,0] 601; AVX512BW-64-NEXT: retq 602; 603; KNL-32-LABEL: test_mul_v4i32_v4i8_minsize: 604; KNL-32: # %bb.0: 605; KNL-32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 606; KNL-32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778] 607; KNL-32-NEXT: vpmulld %xmm1, %xmm0, %xmm0 608; KNL-32-NEXT: retl 609; 610; KNL-64-LABEL: test_mul_v4i32_v4i8_minsize: 611; KNL-64: # %bb.0: 612; KNL-64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 613; KNL-64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778] 614; KNL-64-NEXT: vpmulld %xmm1, %xmm0, %xmm0 615; KNL-64-NEXT: retq 616 %z = zext <4 x i8> %A to <4 x i32> 617 %m = mul nuw nsw <4 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778> 618 ret <4 x i32> %m 619} 620 621define <8 x i32> @test_mul_v8i32_v8i8_minsize(<8 x i8> %A) minsize { 622; SLM-LABEL: test_mul_v8i32_v8i8_minsize: 623; SLM: # %bb.0: 624; SLM-NEXT: pmovsxwd {{.*#+}} xmm2 = [18778,18778,18778,18778] 625; SLM-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 626; SLM-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 627; SLM-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero 628; SLM-NEXT: pmaddwd %xmm2, %xmm0 629; SLM-NEXT: pmaddwd %xmm2, %xmm1 630; SLM-NEXT: ret{{[l|q]}} 631; 632; SLOW-LABEL: test_mul_v8i32_v8i8_minsize: 633; SLOW: # %bb.0: 634; SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 635; SLOW-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero 636; SLOW-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 637; SLOW-NEXT: pmovsxwd {{.*#+}} xmm2 = [18778,18778,18778,18778] 638; SLOW-NEXT: pmaddwd %xmm2, %xmm0 639; SLOW-NEXT: pmaddwd %xmm2, %xmm1 640; SLOW-NEXT: ret{{[l|q]}} 641; 642; SSE4-LABEL: test_mul_v8i32_v8i8_minsize: 643; SSE4: # %bb.0: 644; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 645; SSE4-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero 646; SSE4-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 647; SSE4-NEXT: pmovsxwd {{.*#+}} xmm2 = [18778,18778,18778,18778] 648; SSE4-NEXT: pmaddwd %xmm2, %xmm0 649; SSE4-NEXT: pmaddwd %xmm2, %xmm1 650; SSE4-NEXT: ret{{[l|q]}} 651; 652; AVX2-SLOW32-LABEL: test_mul_v8i32_v8i8_minsize: 653; AVX2-SLOW32: # %bb.0: 654; AVX2-SLOW32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 655; AVX2-SLOW32-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0] 656; AVX2-SLOW32-NEXT: retl 657; 658; AVX2-SLOW64-LABEL: test_mul_v8i32_v8i8_minsize: 659; AVX2-SLOW64: # %bb.0: 660; AVX2-SLOW64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 661; AVX2-SLOW64-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0] 662; AVX2-SLOW64-NEXT: retq 663; 664; AVX2-32-LABEL: test_mul_v8i32_v8i8_minsize: 665; AVX2-32: # %bb.0: 666; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 667; AVX2-32-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0] 668; AVX2-32-NEXT: retl 669; 670; AVX2-64-LABEL: test_mul_v8i32_v8i8_minsize: 671; AVX2-64: # %bb.0: 672; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 673; AVX2-64-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0] 674; AVX2-64-NEXT: retq 675; 676; AVX512DQ-32-LABEL: test_mul_v8i32_v8i8_minsize: 677; AVX512DQ-32: # %bb.0: 678; AVX512DQ-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 679; AVX512DQ-32-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0] 680; AVX512DQ-32-NEXT: retl 681; 682; AVX512DQ-64-LABEL: test_mul_v8i32_v8i8_minsize: 683; AVX512DQ-64: # %bb.0: 684; AVX512DQ-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 685; AVX512DQ-64-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0] 686; AVX512DQ-64-NEXT: retq 687; 688; AVX512BW-32-LABEL: test_mul_v8i32_v8i8_minsize: 689; AVX512BW-32: # %bb.0: 690; AVX512BW-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 691; AVX512BW-32-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0] 692; AVX512BW-32-NEXT: retl 693; 694; AVX512BW-64-LABEL: test_mul_v8i32_v8i8_minsize: 695; AVX512BW-64: # %bb.0: 696; AVX512BW-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 697; AVX512BW-64-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0] 698; AVX512BW-64-NEXT: retq 699; 700; KNL-32-LABEL: test_mul_v8i32_v8i8_minsize: 701; KNL-32: # %bb.0: 702; KNL-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 703; KNL-32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778] 704; KNL-32-NEXT: vpmulld %ymm1, %ymm0, %ymm0 705; KNL-32-NEXT: retl 706; 707; KNL-64-LABEL: test_mul_v8i32_v8i8_minsize: 708; KNL-64: # %bb.0: 709; KNL-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 710; KNL-64-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778] 711; KNL-64-NEXT: vpmulld %ymm1, %ymm0, %ymm0 712; KNL-64-NEXT: retq 713 %z = zext <8 x i8> %A to <8 x i32> 714 %m = mul nuw nsw <8 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778> 715 ret <8 x i32> %m 716} 717 718define <16 x i32> @test_mul_v16i32_v16i8_minsize(<16 x i8> %A) minsize { 719; SLM-LABEL: test_mul_v16i32_v16i8_minsize: 720; SLM: # %bb.0: 721; SLM-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] 722; SLM-NEXT: pmovsxwd {{.*#+}} xmm5 = [18778,18778,18778,18778] 723; SLM-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] 724; SLM-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero 725; SLM-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 726; SLM-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 727; SLM-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero 728; SLM-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero 729; SLM-NEXT: pmaddwd %xmm5, %xmm0 730; SLM-NEXT: pmaddwd %xmm5, %xmm1 731; SLM-NEXT: pmaddwd %xmm5, %xmm2 732; SLM-NEXT: pmaddwd %xmm5, %xmm3 733; SLM-NEXT: ret{{[l|q]}} 734; 735; SLOW-LABEL: test_mul_v16i32_v16i8_minsize: 736; SLOW: # %bb.0: 737; SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] 738; SLOW-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero 739; SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 740; SLOW-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero 741; SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 742; SLOW-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero 743; SLOW-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 744; SLOW-NEXT: pmovsxwd {{.*#+}} xmm4 = [18778,18778,18778,18778] 745; SLOW-NEXT: pmaddwd %xmm4, %xmm0 746; SLOW-NEXT: pmaddwd %xmm4, %xmm1 747; SLOW-NEXT: pmaddwd %xmm4, %xmm2 748; SLOW-NEXT: pmaddwd %xmm4, %xmm3 749; SLOW-NEXT: ret{{[l|q]}} 750; 751; SSE4-LABEL: test_mul_v16i32_v16i8_minsize: 752; SSE4: # %bb.0: 753; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] 754; SSE4-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero 755; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 756; SSE4-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero 757; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 758; SSE4-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero 759; SSE4-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 760; SSE4-NEXT: pmovsxwd {{.*#+}} xmm4 = [18778,18778,18778,18778] 761; SSE4-NEXT: pmaddwd %xmm4, %xmm0 762; SSE4-NEXT: pmaddwd %xmm4, %xmm1 763; SSE4-NEXT: pmaddwd %xmm4, %xmm2 764; SSE4-NEXT: pmaddwd %xmm4, %xmm3 765; SSE4-NEXT: ret{{[l|q]}} 766; 767; AVX2-SLOW-LABEL: test_mul_v16i32_v16i8_minsize: 768; AVX2-SLOW: # %bb.0: 769; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 770; AVX2-SLOW-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero 771; AVX2-SLOW-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 772; AVX2-SLOW-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0] 773; AVX2-SLOW-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0 774; AVX2-SLOW-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1 775; AVX2-SLOW-NEXT: ret{{[l|q]}} 776; 777; AVX2-32-LABEL: test_mul_v16i32_v16i8_minsize: 778; AVX2-32: # %bb.0: 779; AVX2-32-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 780; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero 781; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 782; AVX2-32-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0] 783; AVX2-32-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0 784; AVX2-32-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1 785; AVX2-32-NEXT: retl 786; 787; AVX2-64-LABEL: test_mul_v16i32_v16i8_minsize: 788; AVX2-64: # %bb.0: 789; AVX2-64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 790; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero 791; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 792; AVX2-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0] 793; AVX2-64-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0 794; AVX2-64-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1 795; AVX2-64-NEXT: retq 796; 797; AVX512DQ-32-LABEL: test_mul_v16i32_v16i8_minsize: 798; AVX512DQ-32: # %bb.0: 799; AVX512DQ-32-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 800; AVX512DQ-32-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm0, %zmm0 801; AVX512DQ-32-NEXT: retl 802; 803; AVX512DQ-64-LABEL: test_mul_v16i32_v16i8_minsize: 804; AVX512DQ-64: # %bb.0: 805; AVX512DQ-64-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 806; AVX512DQ-64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 807; AVX512DQ-64-NEXT: retq 808; 809; AVX512BW-32-LABEL: test_mul_v16i32_v16i8_minsize: 810; AVX512BW-32: # %bb.0: 811; AVX512BW-32-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 812; AVX512BW-32-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0 # [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0] 813; AVX512BW-32-NEXT: retl 814; 815; AVX512BW-64-LABEL: test_mul_v16i32_v16i8_minsize: 816; AVX512BW-64: # %bb.0: 817; AVX512BW-64-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 818; AVX512BW-64-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0] 819; AVX512BW-64-NEXT: retq 820; 821; KNL-32-LABEL: test_mul_v16i32_v16i8_minsize: 822; KNL-32: # %bb.0: 823; KNL-32-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 824; KNL-32-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm0, %zmm0 825; KNL-32-NEXT: retl 826; 827; KNL-64-LABEL: test_mul_v16i32_v16i8_minsize: 828; KNL-64: # %bb.0: 829; KNL-64-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 830; KNL-64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 831; KNL-64-NEXT: retq 832 %z = zext <16 x i8> %A to <16 x i32> 833 %m = mul nuw nsw <16 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778> 834 ret <16 x i32> %m 835} 836 837define <4 x i32> @test_mul_v4i32_v4i16_minsize(<4 x i16> %A) minsize { 838; SSE-32-LABEL: test_mul_v4i32_v4i16_minsize: 839; SSE-32: # %bb.0: 840; SSE-32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 841; SSE-32-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 842; SSE-32-NEXT: retl 843; 844; SSE-64-LABEL: test_mul_v4i32_v4i16_minsize: 845; SSE-64: # %bb.0: 846; SSE-64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 847; SSE-64-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 848; SSE-64-NEXT: retq 849; 850; AVX2-LABEL: test_mul_v4i32_v4i16_minsize: 851; AVX2: # %bb.0: 852; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 853; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778] 854; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 855; AVX2-NEXT: ret{{[l|q]}} 856 %z = zext <4 x i16> %A to <4 x i32> 857 %m = mul nuw nsw <4 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778> 858 ret <4 x i32> %m 859} 860 861define <8 x i32> @test_mul_v8i32_v8i16_minsize(<8 x i16> %A) minsize { 862; SLM-LABEL: test_mul_v8i32_v8i16_minsize: 863; SLM: # %bb.0: 864; SLM-NEXT: pmovsxwd {{.*#+}} xmm1 = [18778,18778,18778,18778] 865; SLM-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 866; SLM-NEXT: pxor %xmm3, %xmm3 867; SLM-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] 868; SLM-NEXT: pmulld %xmm1, %xmm2 869; SLM-NEXT: pmulld %xmm0, %xmm1 870; SLM-NEXT: movdqa %xmm2, %xmm0 871; SLM-NEXT: ret{{[l|q]}} 872; 873; SLOW-LABEL: test_mul_v8i32_v8i16_minsize: 874; SLOW: # %bb.0: 875; SLOW-NEXT: pxor %xmm1, %xmm1 876; SLOW-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 877; SLOW-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 878; SLOW-NEXT: pmovsxwd {{.*#+}} xmm1 = [18778,18778,18778,18778] 879; SLOW-NEXT: pmulld %xmm1, %xmm2 880; SLOW-NEXT: pmulld %xmm0, %xmm1 881; SLOW-NEXT: movdqa %xmm2, %xmm0 882; SLOW-NEXT: ret{{[l|q]}} 883; 884; SSE4-LABEL: test_mul_v8i32_v8i16_minsize: 885; SSE4: # %bb.0: 886; SSE4-NEXT: pxor %xmm1, %xmm1 887; SSE4-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 888; SSE4-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 889; SSE4-NEXT: pmovsxwd {{.*#+}} xmm1 = [18778,18778,18778,18778] 890; SSE4-NEXT: pmulld %xmm1, %xmm2 891; SSE4-NEXT: pmulld %xmm0, %xmm1 892; SSE4-NEXT: movdqa %xmm2, %xmm0 893; SSE4-NEXT: ret{{[l|q]}} 894; 895; AVX2-LABEL: test_mul_v8i32_v8i16_minsize: 896; AVX2: # %bb.0: 897; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 898; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778] 899; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 900; AVX2-NEXT: ret{{[l|q]}} 901 %z = zext <8 x i16> %A to <8 x i32> 902 %m = mul nuw nsw <8 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778> 903 ret <8 x i32> %m 904} 905 906define <16 x i32> @test_mul_v16i32_v16i16_minsize(<16 x i16> %A) minsize { 907; SLM-LABEL: test_mul_v16i32_v16i16_minsize: 908; SLM: # %bb.0: 909; SLM-NEXT: pmovsxwd {{.*#+}} xmm3 = [18778,18778,18778,18778] 910; SLM-NEXT: movdqa %xmm0, %xmm4 911; SLM-NEXT: pxor %xmm5, %xmm5 912; SLM-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 913; SLM-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 914; SLM-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] 915; SLM-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] 916; SLM-NEXT: pmulld %xmm3, %xmm4 917; SLM-NEXT: pmulld %xmm3, %xmm0 918; SLM-NEXT: pmulld %xmm3, %xmm2 919; SLM-NEXT: pmulld %xmm1, %xmm3 920; SLM-NEXT: movdqa %xmm4, %xmm1 921; SLM-NEXT: ret{{[l|q]}} 922; 923; SLOW-LABEL: test_mul_v16i32_v16i16_minsize: 924; SLOW: # %bb.0: 925; SLOW-NEXT: movdqa %xmm0, %xmm4 926; SLOW-NEXT: pxor %xmm3, %xmm3 927; SLOW-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 928; SLOW-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] 929; SLOW-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 930; SLOW-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] 931; SLOW-NEXT: pmovsxwd {{.*#+}} xmm3 = [18778,18778,18778,18778] 932; SLOW-NEXT: pmulld %xmm3, %xmm0 933; SLOW-NEXT: pmulld %xmm3, %xmm4 934; SLOW-NEXT: pmulld %xmm3, %xmm2 935; SLOW-NEXT: pmulld %xmm1, %xmm3 936; SLOW-NEXT: movdqa %xmm4, %xmm1 937; SLOW-NEXT: ret{{[l|q]}} 938; 939; SSE4-LABEL: test_mul_v16i32_v16i16_minsize: 940; SSE4: # %bb.0: 941; SSE4-NEXT: movdqa %xmm0, %xmm4 942; SSE4-NEXT: pxor %xmm3, %xmm3 943; SSE4-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 944; SSE4-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] 945; SSE4-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 946; SSE4-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] 947; SSE4-NEXT: pmovsxwd {{.*#+}} xmm3 = [18778,18778,18778,18778] 948; SSE4-NEXT: pmulld %xmm3, %xmm0 949; SSE4-NEXT: pmulld %xmm3, %xmm4 950; SSE4-NEXT: pmulld %xmm3, %xmm2 951; SSE4-NEXT: pmulld %xmm1, %xmm3 952; SSE4-NEXT: movdqa %xmm4, %xmm1 953; SSE4-NEXT: ret{{[l|q]}} 954; 955; AVX2-SLOW-LABEL: test_mul_v16i32_v16i16_minsize: 956; AVX2-SLOW: # %bb.0: 957; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 958; AVX2-SLOW-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 959; AVX2-SLOW-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 960; AVX2-SLOW-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778] 961; AVX2-SLOW-NEXT: vpmulld %ymm2, %ymm0, %ymm0 962; AVX2-SLOW-NEXT: vpmulld %ymm2, %ymm1, %ymm1 963; AVX2-SLOW-NEXT: ret{{[l|q]}} 964; 965; AVX2-32-LABEL: test_mul_v16i32_v16i16_minsize: 966; AVX2-32: # %bb.0: 967; AVX2-32-NEXT: vextracti128 $1, %ymm0, %xmm1 968; AVX2-32-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 969; AVX2-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 970; AVX2-32-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778] 971; AVX2-32-NEXT: vpmulld %ymm2, %ymm0, %ymm0 972; AVX2-32-NEXT: vpmulld %ymm2, %ymm1, %ymm1 973; AVX2-32-NEXT: retl 974; 975; AVX2-64-LABEL: test_mul_v16i32_v16i16_minsize: 976; AVX2-64: # %bb.0: 977; AVX2-64-NEXT: vextracti128 $1, %ymm0, %xmm1 978; AVX2-64-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 979; AVX2-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 980; AVX2-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778] 981; AVX2-64-NEXT: vpmulld %ymm2, %ymm0, %ymm0 982; AVX2-64-NEXT: vpmulld %ymm2, %ymm1, %ymm1 983; AVX2-64-NEXT: retq 984; 985; AVX512-32-LABEL: test_mul_v16i32_v16i16_minsize: 986; AVX512-32: # %bb.0: 987; AVX512-32-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 988; AVX512-32-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm0, %zmm0 989; AVX512-32-NEXT: retl 990; 991; AVX512-64-LABEL: test_mul_v16i32_v16i16_minsize: 992; AVX512-64: # %bb.0: 993; AVX512-64-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 994; AVX512-64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 995; AVX512-64-NEXT: retq 996 %z = zext <16 x i16> %A to <16 x i32> 997 %m = mul nuw nsw <16 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778> 998 ret <16 x i32> %m 999} 1000;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: 1001; SLM-32: {{.*}} 1002; SLM-64: {{.*}} 1003; SLOW-32: {{.*}} 1004; SLOW-64: {{.*}} 1005