; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86-SSE
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=X86-AVX,X86-AVX1
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X86-AVX,X86-AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64-SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=X64-AVX,X64-AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X64-AVX,X64-AVX2

@c = external dso_local global ptr, align 8

; %val1 = load <2 x i8>
; %op1 = zext<2 x i32> %val1
; %val2 = load <2 x i8>
; %op2 = zext<2 x i32> %val2
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 %index) nounwind {
; X86-SSE-LABEL: mul_2xi8:
; X86-SSE: # %bb.0: # %entry
; X86-SSE-NEXT: pushl %esi
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SSE-NEXT: movl c, %esi
; X86-SSE-NEXT: movzwl (%edx,%eax), %edx
; X86-SSE-NEXT: movd %edx, %xmm0
; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx
; X86-SSE-NEXT: movd %ecx, %xmm1
; X86-SSE-NEXT: pxor %xmm2, %xmm2
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; X86-SSE-NEXT: pmullw %xmm0, %xmm1
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-SSE-NEXT: movq %xmm1, (%esi,%eax,4)
; X86-SSE-NEXT: popl %esi
; X86-SSE-NEXT: retl
;
; X86-AVX-LABEL: mul_2xi8:
; X86-AVX: # %bb.0: # %entry
; X86-AVX-NEXT: pushl %esi
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT: movl c, %esi
; X86-AVX-NEXT: movzwl (%edx,%ecx), %edx
; X86-AVX-NEXT: vmovd %edx, %xmm0
; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; X86-AVX-NEXT: movzwl (%eax,%ecx), %eax
; X86-AVX-NEXT: vmovd %eax, %xmm1
; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; X86-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0
; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
; X86-AVX-NEXT: popl %esi
; X86-AVX-NEXT: retl
;
; X64-SSE-LABEL: mul_2xi8:
; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: movq c(%rip), %rax
; X64-SSE-NEXT: movzwl (%rdi,%rdx), %ecx
; X64-SSE-NEXT: movd %ecx, %xmm0
; X64-SSE-NEXT: movzwl (%rsi,%rdx), %ecx
; X64-SSE-NEXT: movd %ecx, %xmm1
; X64-SSE-NEXT: pxor %xmm2, %xmm2
; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; X64-SSE-NEXT: pmullw %xmm0, %xmm1
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_2xi8:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq c(%rip), %rax
; X64-AVX-NEXT: movzwl (%rdi,%rdx), %ecx
; X64-AVX-NEXT: vmovd %ecx, %xmm0
; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; X64-AVX-NEXT: movzwl (%rsi,%rdx), %ecx
; X64-AVX-NEXT: vmovd %ecx, %xmm1
; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; X64-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
; X64-AVX-NEXT: retq
entry:
  %pre = load ptr, ptr @c
  %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index
  %wide.load = load <2 x i8>, ptr %tmp6, align 1
  %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
  %tmp10 = getelementptr inbounds i8, ptr %b, i64 %index
  %wide.load17 = load <2 x i8>, ptr %tmp10, align 1
  %tmp12 = zext <2 x i8> %wide.load17 to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index
  store <2 x i32> %tmp13, ptr %tmp14, align 4
  ret void
}

; %val1 = load <4 x i8>
; %op1 = zext<4 x i32> %val1
; %val2 = load <4 x i8>
; %op2 = zext<4 x i32> %val2
; %rst = mul <4 x i32> %op1, %op2
;
define void @mul_4xi8(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 %index) nounwind {
; X86-SSE-LABEL: mul_4xi8:
; X86-SSE: # %bb.0: # %entry
; X86-SSE-NEXT: pushl %esi
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SSE-NEXT: movl c, %esi
; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-SSE-NEXT: pxor %xmm2, %xmm2
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; X86-SSE-NEXT: pmullw %xmm0, %xmm1
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-SSE-NEXT: movdqu %xmm1, (%esi,%ecx,4)
; X86-SSE-NEXT: popl %esi
; X86-SSE-NEXT: retl
;
; X86-AVX-LABEL: mul_4xi8:
; X86-AVX: # %bb.0: # %entry
; X86-AVX-NEXT: pushl %esi
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT: movl c, %esi
; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X86-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0
; X86-AVX-NEXT: vmovdqu %xmm0, (%esi,%ecx,4)
; X86-AVX-NEXT: popl %esi
; X86-AVX-NEXT: retl
;
; X64-SSE-LABEL: mul_4xi8:
; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: movq c(%rip), %rax
; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-SSE-NEXT: pxor %xmm2, %xmm2
; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; X64-SSE-NEXT: pmullw %xmm0, %xmm1
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X64-SSE-NEXT: movdqu %xmm1, (%rax,%rdx,4)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_4xi8:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq c(%rip), %rax
; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X64-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT: vmovdqu %xmm0, (%rax,%rdx,4)
; X64-AVX-NEXT: retq
entry:
  %pre = load ptr, ptr @c
  %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index
  %wide.load = load <4 x i8>, ptr %tmp6, align 1
  %tmp8 = zext <4 x i8> %wide.load to <4 x i32>
  %tmp10 = getelementptr inbounds i8, ptr %b, i64 %index
  %wide.load17 = load <4 x i8>, ptr %tmp10, align 1
  %tmp12 = zext <4 x i8> %wide.load17 to <4 x i32>
  %tmp13 = mul nuw nsw <4 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index
  store <4 x i32> %tmp13, ptr %tmp14, align 4
  ret void
}

; %val1 = load <8 x i8>
; %op1 = zext<8 x i32> %val1
; %val2 = load <8 x i8>
; %op2 = zext<8 x i32> %val2
; %rst = mul <8 x i32> %op1, %op2
;
define void @mul_8xi8(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 %index) nounwind {
; X86-SSE-LABEL: mul_8xi8:
; X86-SSE: # %bb.0: # %entry
; X86-SSE-NEXT: pushl %esi
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-SSE-NEXT: movl c, %ecx
; X86-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X86-SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; X86-SSE-NEXT: pxor %xmm2, %xmm2
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; X86-SSE-NEXT: pmullw %xmm0, %xmm1
; X86-SSE-NEXT: movdqa %xmm1, %xmm0
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; X86-SSE-NEXT: movdqu %xmm1, 16(%ecx,%eax,4)
; X86-SSE-NEXT: movdqu %xmm0, (%ecx,%eax,4)
; X86-SSE-NEXT: popl %esi
; X86-SSE-NEXT: retl
;
; X86-AVX1-LABEL: mul_8xi8:
; X86-AVX1: # %bb.0: # %entry
; X86-AVX1-NEXT: pushl %esi
; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX1-NEXT: movl c, %esi
; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X86-AVX1-NEXT: vpmaddwd %xmm0, %xmm2, %xmm0
; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X86-AVX1-NEXT: vpmaddwd %xmm1, %xmm2, %xmm1
; X86-AVX1-NEXT: vmovdqu %xmm0, 16(%esi,%ecx,4)
; X86-AVX1-NEXT: vmovdqu %xmm1, (%esi,%ecx,4)
; X86-AVX1-NEXT: popl %esi
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: mul_8xi8:
; X86-AVX2: # %bb.0: # %entry
; X86-AVX2-NEXT: pushl %esi
; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX2-NEXT: movl c, %esi
; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; X86-AVX2-NEXT: vpmaddwd %ymm0, %ymm1, %ymm0
; X86-AVX2-NEXT: vmovdqu %ymm0, (%esi,%ecx,4)
; X86-AVX2-NEXT: popl %esi
; X86-AVX2-NEXT: vzeroupper
; X86-AVX2-NEXT: retl
;
; X64-SSE-LABEL: mul_8xi8:
; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: movq c(%rip), %rax
; X64-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X64-SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; X64-SSE-NEXT: pxor %xmm2, %xmm2
; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; X64-SSE-NEXT: pmullw %xmm0, %xmm1
; X64-SSE-NEXT: movdqa %xmm1, %xmm0
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; X64-SSE-NEXT: movdqu %xmm1, 16(%rax,%rdx,4)
; X64-SSE-NEXT: movdqu %xmm0, (%rax,%rdx,4)
; X64-SSE-NEXT: retq
;
; X64-AVX1-LABEL: mul_8xi8:
; X64-AVX1: # %bb.0: # %entry
; X64-AVX1-NEXT: movq c(%rip), %rax
; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X64-AVX1-NEXT: vpmaddwd %xmm0, %xmm2, %xmm0
; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X64-AVX1-NEXT: vpmaddwd %xmm1, %xmm2, %xmm1
; X64-AVX1-NEXT: vmovdqu %xmm0, 16(%rax,%rdx,4)
; X64-AVX1-NEXT: vmovdqu %xmm1, (%rax,%rdx,4)
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: mul_8xi8:
; X64-AVX2: # %bb.0: # %entry
; X64-AVX2-NEXT: movq c(%rip), %rax
; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; X64-AVX2-NEXT: vpmaddwd %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: vmovdqu %ymm0, (%rax,%rdx,4)
; X64-AVX2-NEXT: vzeroupper
; X64-AVX2-NEXT: retq
entry:
  %pre = load ptr, ptr @c
  %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index
  %wide.load = load <8 x i8>, ptr %tmp6, align 1
  %tmp8 = zext <8 x i8> %wide.load to <8 x i32>
  %tmp10 = getelementptr inbounds i8, ptr %b, i64 %index
  %wide.load17 = load <8 x i8>, ptr %tmp10, align 1
  %tmp12 = zext <8 x i8> %wide.load17 to <8 x i32>
  %tmp13 = mul nuw nsw <8 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index
  store <8 x i32> %tmp13, ptr %tmp14, align 4
  ret void
}

; %val1 = load <16 x i8>
; %op1 = zext<16 x i32> %val1
; %val2 = load <16 x i8>
; %op2 = zext<16 x i32> %val2
; %rst = mul <16 x i32> %op1, %op2
;
define void @mul_16xi8(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 %index) nounwind {
; X86-SSE-LABEL: mul_16xi8:
; X86-SSE: # %bb.0: # %entry
; X86-SSE-NEXT: pushl %esi
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-SSE-NEXT: movl c, %ecx
; X86-SSE-NEXT: movdqu (%esi,%eax), %xmm3
; X86-SSE-NEXT: movdqu (%edx,%eax), %xmm0
; X86-SSE-NEXT: pxor %xmm1, %xmm1
; X86-SSE-NEXT: movdqa %xmm3, %xmm4
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
; X86-SSE-NEXT: movdqa %xmm0, %xmm2
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; X86-SSE-NEXT: pmullw %xmm4, %xmm2
; X86-SSE-NEXT: movdqa %xmm2, %xmm4
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; X86-SSE-NEXT: pmullw %xmm3, %xmm0
; X86-SSE-NEXT: movdqa %xmm0, %xmm3
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; X86-SSE-NEXT: movdqu %xmm0, 48(%ecx,%eax,4)
; X86-SSE-NEXT: movdqu %xmm3, 32(%ecx,%eax,4)
; X86-SSE-NEXT: movdqu %xmm2, 16(%ecx,%eax,4)
; X86-SSE-NEXT: movdqu %xmm4, (%ecx,%eax,4)
; X86-SSE-NEXT: popl %esi
; X86-SSE-NEXT: retl
;
; X86-AVX1-LABEL: mul_16xi8:
; X86-AVX1: # %bb.0: # %entry
; X86-AVX1-NEXT: pushl %esi
; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx
326; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax 327; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %esi 328; X86-AVX1-NEXT: movl c, %ecx 329; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 330; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 331; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 332; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 333; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 334; X86-AVX1-NEXT: vpmaddwd %xmm0, %xmm4, %xmm0 335; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 336; X86-AVX1-NEXT: vpmaddwd %xmm1, %xmm4, %xmm1 337; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 338; X86-AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2 339; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 340; X86-AVX1-NEXT: vpmaddwd %xmm3, %xmm4, %xmm3 341; X86-AVX1-NEXT: vmovdqu %xmm0, 48(%ecx,%eax,4) 342; X86-AVX1-NEXT: vmovdqu %xmm1, 32(%ecx,%eax,4) 343; X86-AVX1-NEXT: vmovdqu %xmm2, 16(%ecx,%eax,4) 344; X86-AVX1-NEXT: vmovdqu %xmm3, (%ecx,%eax,4) 345; X86-AVX1-NEXT: popl %esi 346; X86-AVX1-NEXT: retl 347; 348; X86-AVX2-LABEL: mul_16xi8: 349; X86-AVX2: # %bb.0: # %entry 350; X86-AVX2-NEXT: pushl %esi 351; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax 352; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx 353; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx 354; X86-AVX2-NEXT: movl c, %esi 355; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero 356; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero 357; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero 358; X86-AVX2-NEXT: vpmaddwd %ymm0, %ymm2, %ymm0 359; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero 360; X86-AVX2-NEXT: vpmaddwd %ymm1, %ymm2, %ymm1 361; X86-AVX2-NEXT: vmovdqu %ymm0, 32(%esi,%ecx,4) 362; X86-AVX2-NEXT: vmovdqu %ymm1, (%esi,%ecx,4) 363; X86-AVX2-NEXT: popl %esi 364; X86-AVX2-NEXT: vzeroupper 365; X86-AVX2-NEXT: retl 366; 367; X64-SSE-LABEL: mul_16xi8: 368; X64-SSE: # %bb.0: # %entry 369; X64-SSE-NEXT: movq c(%rip), %rax 370; X64-SSE-NEXT: movdqu (%rdi,%rdx), %xmm0 371; X64-SSE-NEXT: movdqu (%rsi,%rdx), %xmm1 372; X64-SSE-NEXT: pxor %xmm2, %xmm2 373; X64-SSE-NEXT: movdqa %xmm0, %xmm3 374; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] 375; 
X64-SSE-NEXT: movdqa %xmm1, %xmm4 376; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] 377; X64-SSE-NEXT: pmullw %xmm3, %xmm4 378; X64-SSE-NEXT: movdqa %xmm4, %xmm3 379; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] 380; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] 381; X64-SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] 382; X64-SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] 383; X64-SSE-NEXT: pmullw %xmm0, %xmm1 384; X64-SSE-NEXT: movdqa %xmm1, %xmm0 385; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 386; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 387; X64-SSE-NEXT: movdqu %xmm1, 48(%rax,%rdx,4) 388; X64-SSE-NEXT: movdqu %xmm0, 32(%rax,%rdx,4) 389; X64-SSE-NEXT: movdqu %xmm4, 16(%rax,%rdx,4) 390; X64-SSE-NEXT: movdqu %xmm3, (%rax,%rdx,4) 391; X64-SSE-NEXT: retq 392; 393; X64-AVX1-LABEL: mul_16xi8: 394; X64-AVX1: # %bb.0: # %entry 395; X64-AVX1-NEXT: movq c(%rip), %rax 396; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 397; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 398; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 399; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 400; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 401; X64-AVX1-NEXT: vpmaddwd %xmm0, %xmm4, %xmm0 402; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 403; X64-AVX1-NEXT: vpmaddwd %xmm1, %xmm4, %xmm1 404; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 405; X64-AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2 406; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 407; X64-AVX1-NEXT: vpmaddwd %xmm3, %xmm4, %xmm3 408; X64-AVX1-NEXT: vmovdqu %xmm0, 48(%rax,%rdx,4) 409; X64-AVX1-NEXT: vmovdqu %xmm1, 32(%rax,%rdx,4) 410; X64-AVX1-NEXT: vmovdqu %xmm2, 16(%rax,%rdx,4) 411; X64-AVX1-NEXT: vmovdqu %xmm3, (%rax,%rdx,4) 412; X64-AVX1-NEXT: retq 413; 414; X64-AVX2-LABEL: mul_16xi8: 415; X64-AVX2: # %bb.0: # %entry 416; X64-AVX2-NEXT: movq c(%rip), %rax 417; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero 418; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero 419; 
X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero 420; X64-AVX2-NEXT: vpmaddwd %ymm0, %ymm2, %ymm0 421; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero 422; X64-AVX2-NEXT: vpmaddwd %ymm1, %ymm2, %ymm1 423; X64-AVX2-NEXT: vmovdqu %ymm0, 32(%rax,%rdx,4) 424; X64-AVX2-NEXT: vmovdqu %ymm1, (%rax,%rdx,4) 425; X64-AVX2-NEXT: vzeroupper 426; X64-AVX2-NEXT: retq 427entry: 428 %pre = load ptr, ptr @c 429 %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index 430 %wide.load = load <16 x i8>, ptr %tmp6, align 1 431 %tmp8 = zext <16 x i8> %wide.load to <16 x i32> 432 %tmp10 = getelementptr inbounds i8, ptr %b, i64 %index 433 %wide.load17 = load <16 x i8>, ptr %tmp10, align 1 434 %tmp12 = zext <16 x i8> %wide.load17 to <16 x i32> 435 %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8 436 %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index 437 store <16 x i32> %tmp13, ptr %tmp14, align 4 438 ret void 439} 440 441; %val1 = load <2 x i16> 442; %op1 = zext<2 x i32> %val1 443; %val2 = load <2 x i16> 444; %op2 = zext<2 x i32> %val2 445; %rst = mul <2 x i32> %op1, %op2 446; 447define void @mul_2xi16(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 %index) nounwind { 448; X86-SSE-LABEL: mul_2xi16: 449; X86-SSE: # %bb.0: # %entry 450; X86-SSE-NEXT: pushl %esi 451; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax 452; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx 453; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx 454; X86-SSE-NEXT: movl c, %esi 455; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 456; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 457; X86-SSE-NEXT: movdqa %xmm1, %xmm2 458; X86-SSE-NEXT: pmulhuw %xmm0, %xmm2 459; X86-SSE-NEXT: pmullw %xmm0, %xmm1 460; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] 461; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4) 462; X86-SSE-NEXT: popl %esi 463; X86-SSE-NEXT: retl 464; 465; X86-AVX-LABEL: mul_2xi16: 466; X86-AVX: # %bb.0: # %entry 467; X86-AVX-NEXT: pushl %esi 468; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax 469; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx 470; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx 471; X86-AVX-NEXT: movl c, %esi 472; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 473; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 474; X86-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero 475; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 476; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 477; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) 478; X86-AVX-NEXT: popl %esi 479; X86-AVX-NEXT: retl 480; 481; X64-SSE-LABEL: mul_2xi16: 482; X64-SSE: # %bb.0: # %entry 483; X64-SSE-NEXT: movq c(%rip), %rax 484; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 485; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 486; X64-SSE-NEXT: movdqa %xmm1, %xmm2 487; X64-SSE-NEXT: pmulhuw %xmm0, %xmm2 488; X64-SSE-NEXT: pmullw %xmm0, %xmm1 489; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] 490; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4) 491; X64-SSE-NEXT: retq 492; 493; X64-AVX-LABEL: mul_2xi16: 494; X64-AVX: # %bb.0: # 
%entry 495; X64-AVX-NEXT: movq c(%rip), %rax 496; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 497; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 498; X64-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero 499; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 500; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 501; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) 502; X64-AVX-NEXT: retq 503entry: 504 %pre = load ptr, ptr @c 505 %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index 506 %wide.load = load <2 x i16>, ptr %tmp6, align 1 507 %tmp8 = zext <2 x i16> %wide.load to <2 x i32> 508 %tmp10 = getelementptr inbounds i8, ptr %b, i64 %index 509 %wide.load17 = load <2 x i16>, ptr %tmp10, align 1 510 %tmp12 = zext <2 x i16> %wide.load17 to <2 x i32> 511 %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8 512 %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index 513 store <2 x i32> %tmp13, ptr %tmp14, align 4 514 ret void 515} 516 517; %val1 = load <4 x i16> 518; %op1 = zext<4 x i32> %val1 519; %val2 = load <4 x i16> 520; %op2 = zext<4 x i32> %val2 521; %rst = mul <4 x i32> %op1, %op2 522; 523define void @mul_4xi16(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 %index) nounwind { 524; X86-SSE-LABEL: mul_4xi16: 525; X86-SSE: # %bb.0: # %entry 526; X86-SSE-NEXT: pushl %esi 527; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax 528; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx 529; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx 530; X86-SSE-NEXT: movl c, %esi 531; X86-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 532; X86-SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero 533; X86-SSE-NEXT: movdqa %xmm1, %xmm2 534; X86-SSE-NEXT: pmulhuw %xmm0, %xmm2 535; X86-SSE-NEXT: pmullw %xmm0, %xmm1 536; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] 537; X86-SSE-NEXT: movdqu %xmm1, (%esi,%ecx,4) 538; X86-SSE-NEXT: popl %esi 539; X86-SSE-NEXT: retl 540; 541; X86-AVX-LABEL: mul_4xi16: 542; X86-AVX: # %bb.0: # %entry 543; X86-AVX-NEXT: pushl %esi 544; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax 545; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx 546; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx 547; X86-AVX-NEXT: movl c, %esi 548; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 549; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 550; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 551; X86-AVX-NEXT: vmovdqu %xmm0, (%esi,%ecx,4) 552; X86-AVX-NEXT: popl %esi 553; X86-AVX-NEXT: retl 554; 555; X64-SSE-LABEL: mul_4xi16: 556; X64-SSE: # %bb.0: # %entry 557; X64-SSE-NEXT: movq c(%rip), %rax 558; X64-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 559; X64-SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero 560; X64-SSE-NEXT: movdqa %xmm1, %xmm2 561; X64-SSE-NEXT: pmulhuw %xmm0, %xmm2 562; X64-SSE-NEXT: pmullw %xmm0, %xmm1 563; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] 564; X64-SSE-NEXT: movdqu %xmm1, (%rax,%rdx,4) 565; X64-SSE-NEXT: retq 566; 567; X64-AVX-LABEL: mul_4xi16: 568; X64-AVX: # %bb.0: # %entry 569; X64-AVX-NEXT: movq c(%rip), %rax 570; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 571; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 572; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 573; X64-AVX-NEXT: vmovdqu %xmm0, (%rax,%rdx,4) 574; X64-AVX-NEXT: retq 575entry: 576 %pre = load ptr, ptr @c 
577 %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index 578 %wide.load = load <4 x i16>, ptr %tmp6, align 1 579 %tmp8 = zext <4 x i16> %wide.load to <4 x i32> 580 %tmp10 = getelementptr inbounds i8, ptr %b, i64 %index 581 %wide.load17 = load <4 x i16>, ptr %tmp10, align 1 582 %tmp12 = zext <4 x i16> %wide.load17 to <4 x i32> 583 %tmp13 = mul nuw nsw <4 x i32> %tmp12, %tmp8 584 %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index 585 store <4 x i32> %tmp13, ptr %tmp14, align 4 586 ret void 587} 588 589; %val1 = load <8 x i16> 590; %op1 = zext<8 x i32> %val1 591; %val2 = load <8 x i16> 592; %op2 = zext<8 x i32> %val2 593; %rst = mul <8 x i32> %op1, %op2 594; 595define void @mul_8xi16(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 %index) nounwind { 596; X86-SSE-LABEL: mul_8xi16: 597; X86-SSE: # %bb.0: # %entry 598; X86-SSE-NEXT: pushl %esi 599; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax 600; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx 601; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx 602; X86-SSE-NEXT: movl c, %esi 603; X86-SSE-NEXT: movdqu (%edx,%ecx), %xmm0 604; X86-SSE-NEXT: movdqu (%eax,%ecx), %xmm1 605; X86-SSE-NEXT: movdqa %xmm1, %xmm2 606; X86-SSE-NEXT: pmulhuw %xmm0, %xmm2 607; X86-SSE-NEXT: pmullw %xmm0, %xmm1 608; X86-SSE-NEXT: movdqa %xmm1, %xmm0 609; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 610; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 611; X86-SSE-NEXT: movdqu %xmm1, 16(%esi,%ecx,4) 612; X86-SSE-NEXT: movdqu %xmm0, (%esi,%ecx,4) 613; X86-SSE-NEXT: popl %esi 614; X86-SSE-NEXT: retl 615; 616; X86-AVX1-LABEL: mul_8xi16: 617; X86-AVX1: # %bb.0: # %entry 618; X86-AVX1-NEXT: pushl %esi 619; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax 620; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx 621; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx 622; X86-AVX1-NEXT: movl c, %esi 623; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 624; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 625; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 626; X86-AVX1-NEXT: vpmulld %xmm0, %xmm2, %xmm0 627; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 628; X86-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1 629; X86-AVX1-NEXT: vmovdqu %xmm0, 16(%esi,%ecx,4) 630; X86-AVX1-NEXT: vmovdqu %xmm1, (%esi,%ecx,4) 631; X86-AVX1-NEXT: popl %esi 632; X86-AVX1-NEXT: retl 633; 634; X86-AVX2-LABEL: mul_8xi16: 635; X86-AVX2: # %bb.0: # %entry 636; X86-AVX2-NEXT: pushl %esi 637; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax 638; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx 639; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx 640; X86-AVX2-NEXT: movl c, %esi 641; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 642; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 643; X86-AVX2-NEXT: vpmulld %ymm0, %ymm1, %ymm0 644; X86-AVX2-NEXT: vmovdqu %ymm0, (%esi,%ecx,4) 645; X86-AVX2-NEXT: popl %esi 646; X86-AVX2-NEXT: vzeroupper 647; X86-AVX2-NEXT: retl 648; 649; X64-SSE-LABEL: mul_8xi16: 650; X64-SSE: # %bb.0: # %entry 651; X64-SSE-NEXT: movq c(%rip), %rax 652; X64-SSE-NEXT: movdqu (%rdi,%rdx), %xmm0 653; X64-SSE-NEXT: movdqu (%rsi,%rdx), %xmm1 654; X64-SSE-NEXT: movdqa %xmm1, %xmm2 655; 
X64-SSE-NEXT: pmulhuw %xmm0, %xmm2 656; X64-SSE-NEXT: pmullw %xmm0, %xmm1 657; X64-SSE-NEXT: movdqa %xmm1, %xmm0 658; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 659; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 660; X64-SSE-NEXT: movdqu %xmm1, 16(%rax,%rdx,4) 661; X64-SSE-NEXT: movdqu %xmm0, (%rax,%rdx,4) 662; X64-SSE-NEXT: retq 663; 664; X64-AVX1-LABEL: mul_8xi16: 665; X64-AVX1: # %bb.0: # %entry 666; X64-AVX1-NEXT: movq c(%rip), %rax 667; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 668; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 669; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 670; X64-AVX1-NEXT: vpmulld %xmm0, %xmm2, %xmm0 671; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 672; X64-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1 673; X64-AVX1-NEXT: vmovdqu %xmm0, 16(%rax,%rdx,4) 674; X64-AVX1-NEXT: vmovdqu %xmm1, (%rax,%rdx,4) 675; X64-AVX1-NEXT: retq 676; 677; X64-AVX2-LABEL: mul_8xi16: 678; X64-AVX2: # %bb.0: # %entry 679; X64-AVX2-NEXT: movq c(%rip), %rax 680; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 681; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 682; X64-AVX2-NEXT: vpmulld %ymm0, %ymm1, %ymm0 683; X64-AVX2-NEXT: vmovdqu %ymm0, (%rax,%rdx,4) 684; X64-AVX2-NEXT: vzeroupper 685; X64-AVX2-NEXT: retq 686entry: 687 %pre = load ptr, ptr @c 688 %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index 689 %wide.load = load <8 x i16>, ptr %tmp6, align 1 690 %tmp8 = zext <8 x i16> %wide.load to <8 x i32> 691 %tmp10 = getelementptr inbounds i8, ptr %b, i64 %index 692 %wide.load17 = load <8 x i16>, ptr %tmp10, align 1 693 %tmp12 = zext <8 x i16> %wide.load17 to <8 x i32> 694 %tmp13 = mul nuw nsw <8 x i32> %tmp12, %tmp8 695 %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index 696 store <8 x i32> %tmp13, ptr %tmp14, align 4 697 ret void 698} 699 700; %val1 = load <16 x i16> 701; %op1 = zext<16 x i32> %val1 702; %val2 = load <16 x i16> 703; %op2 = zext<16 x i32> %val2 704; %rst = mul <16 x i32> %op1, %op2 705; 706define void @mul_16xi16(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 %index) nounwind { 707; X86-SSE-LABEL: mul_16xi16: 708; X86-SSE: # %bb.0: # %entry 709; X86-SSE-NEXT: pushl %esi 710; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx 711; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax 712; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi 713; X86-SSE-NEXT: movl c, %ecx 714; X86-SSE-NEXT: movdqu (%esi,%eax), %xmm2 715; X86-SSE-NEXT: movdqu 16(%esi,%eax), %xmm3 716; X86-SSE-NEXT: movdqu (%edx,%eax), %xmm0 717; X86-SSE-NEXT: movdqu 16(%edx,%eax), %xmm1 718; X86-SSE-NEXT: movdqa %xmm0, %xmm4 719; X86-SSE-NEXT: pmulhuw %xmm2, %xmm4 720; X86-SSE-NEXT: pmullw %xmm2, %xmm0 721; X86-SSE-NEXT: movdqa %xmm0, %xmm2 722; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] 723; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] 724; X86-SSE-NEXT: movdqa %xmm1, %xmm4 725; X86-SSE-NEXT: pmulhuw %xmm3, %xmm4 726; X86-SSE-NEXT: pmullw %xmm3, %xmm1 727; X86-SSE-NEXT: movdqa %xmm1, %xmm3 728; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = 
xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] 729; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] 730; X86-SSE-NEXT: movdqu %xmm1, 32(%ecx,%eax,4) 731; X86-SSE-NEXT: movdqu %xmm3, 48(%ecx,%eax,4) 732; X86-SSE-NEXT: movdqu %xmm0, (%ecx,%eax,4) 733; X86-SSE-NEXT: movdqu %xmm2, 16(%ecx,%eax,4) 734; X86-SSE-NEXT: popl %esi 735; X86-SSE-NEXT: retl 736; 737; X86-AVX1-LABEL: mul_16xi16: 738; X86-AVX1: # %bb.0: # %entry 739; X86-AVX1-NEXT: pushl %esi 740; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx 741; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax 742; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %esi 743; X86-AVX1-NEXT: movl c, %ecx 744; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 745; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 746; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 747; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 748; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 749; X86-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0 750; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 751; X86-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1 752; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 753; X86-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2 754; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 755; X86-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3 756; X86-AVX1-NEXT: vmovdqu %xmm0, 48(%ecx,%eax,4) 757; X86-AVX1-NEXT: vmovdqu %xmm1, 32(%ecx,%eax,4) 758; X86-AVX1-NEXT: vmovdqu %xmm2, 16(%ecx,%eax,4) 759; X86-AVX1-NEXT: vmovdqu %xmm3, (%ecx,%eax,4) 760; X86-AVX1-NEXT: popl %esi 761; X86-AVX1-NEXT: retl 762; 763; X86-AVX2-LABEL: mul_16xi16: 764; X86-AVX2: # %bb.0: # %entry 765; X86-AVX2-NEXT: pushl %esi 766; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax 767; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx 768; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx 769; X86-AVX2-NEXT: movl c, %esi 770; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 771; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 772; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 773; X86-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0 774; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 775; X86-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1 776; X86-AVX2-NEXT: vmovdqu %ymm0, 32(%esi,%ecx,4) 777; X86-AVX2-NEXT: vmovdqu %ymm1, (%esi,%ecx,4) 778; X86-AVX2-NEXT: popl %esi 779; X86-AVX2-NEXT: vzeroupper 780; X86-AVX2-NEXT: retl 781; 782; X64-SSE-LABEL: mul_16xi16: 783; X64-SSE: # %bb.0: # %entry 784; X64-SSE-NEXT: movq c(%rip), %rax 785; X64-SSE-NEXT: movdqu (%rdi,%rdx), %xmm0 786; X64-SSE-NEXT: movdqu 16(%rdi,%rdx), %xmm1 787; X64-SSE-NEXT: movdqu (%rsi,%rdx), %xmm2 788; X64-SSE-NEXT: movdqu 16(%rsi,%rdx), %xmm3 789; X64-SSE-NEXT: movdqa %xmm2, %xmm4 790; X64-SSE-NEXT: pmulhuw %xmm0, %xmm4 791; X64-SSE-NEXT: pmullw %xmm0, %xmm2 792; X64-SSE-NEXT: movdqa %xmm2, %xmm0 793; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = 
xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] 794; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] 795; X64-SSE-NEXT: movdqa %xmm3, %xmm4 796; X64-SSE-NEXT: pmulhuw %xmm1, %xmm4 797; X64-SSE-NEXT: pmullw %xmm1, %xmm3 798; X64-SSE-NEXT: movdqa %xmm3, %xmm1 799; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] 800; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] 801; X64-SSE-NEXT: movdqu %xmm3, 32(%rax,%rdx,4) 802; X64-SSE-NEXT: movdqu %xmm1, 48(%rax,%rdx,4) 803; X64-SSE-NEXT: movdqu %xmm2, (%rax,%rdx,4) 804; X64-SSE-NEXT: movdqu %xmm0, 16(%rax,%rdx,4) 805; X64-SSE-NEXT: retq 806; 807; X64-AVX1-LABEL: mul_16xi16: 808; X64-AVX1: # %bb.0: # %entry 809; X64-AVX1-NEXT: movq c(%rip), %rax 810; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 811; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 812; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 813; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 814; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 815; X64-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0 816; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 817; X64-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1 818; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 819; X64-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2 820; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 821; X64-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3 822; X64-AVX1-NEXT: vmovdqu %xmm0, 48(%rax,%rdx,4) 823; X64-AVX1-NEXT: vmovdqu %xmm1, 32(%rax,%rdx,4) 824; X64-AVX1-NEXT: vmovdqu %xmm2, 16(%rax,%rdx,4) 825; X64-AVX1-NEXT: vmovdqu %xmm3, (%rax,%rdx,4) 826; X64-AVX1-NEXT: retq 827; 828; X64-AVX2-LABEL: mul_16xi16: 829; X64-AVX2: # %bb.0: # %entry 830; X64-AVX2-NEXT: movq c(%rip), %rax 831; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 832; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 833; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 834; X64-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0 835; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 836; X64-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1 837; X64-AVX2-NEXT: vmovdqu %ymm0, 32(%rax,%rdx,4) 838; X64-AVX2-NEXT: vmovdqu %ymm1, (%rax,%rdx,4) 839; X64-AVX2-NEXT: vzeroupper 840; X64-AVX2-NEXT: retq 841entry: 842 %pre = load ptr, ptr @c 843 %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index 844 %wide.load = load <16 x i16>, ptr %tmp6, align 1 845 %tmp8 = zext <16 x i16> %wide.load to <16 x i32> 846 %tmp10 = getelementptr inbounds i8, ptr %b, i64 %index 847 %wide.load17 = load <16 x i16>, ptr %tmp10, align 1 848 %tmp12 = zext <16 x i16> %wide.load17 to <16 x i32> 849 %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8 850 %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index 851 store <16 x i32> %tmp13, ptr %tmp14, align 4 852 ret void 853} 854 855; %val1 = load <2 
x i8> 856; %op1 = sext<2 x i32> %val1 857; %val2 = load <2 x i8> 858; %op2 = sext<2 x i32> %val2 859; %rst = mul <2 x i32> %op1, %op2 860; 861define void @mul_2xi8_sext(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 %index) nounwind { 862; X86-SSE-LABEL: mul_2xi8_sext: 863; X86-SSE: # %bb.0: # %entry 864; X86-SSE-NEXT: pushl %esi 865; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx 866; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax 867; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi 868; X86-SSE-NEXT: movl c, %ecx 869; X86-SSE-NEXT: movzwl (%esi,%eax), %esi 870; X86-SSE-NEXT: movd %esi, %xmm0 871; X86-SSE-NEXT: movzwl (%edx,%eax), %edx 872; X86-SSE-NEXT: movd %edx, %xmm1 873; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 874; X86-SSE-NEXT: psraw $8, %xmm0 875; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 876; X86-SSE-NEXT: psraw $8, %xmm1 877; X86-SSE-NEXT: pmullw %xmm0, %xmm1 878; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7] 879; X86-SSE-NEXT: psrad $16, %xmm0 880; X86-SSE-NEXT: movq %xmm0, (%ecx,%eax,4) 881; X86-SSE-NEXT: popl %esi 882; X86-SSE-NEXT: retl 883; 884; X86-AVX-LABEL: mul_2xi8_sext: 885; X86-AVX: # %bb.0: # %entry 886; X86-AVX-NEXT: pushl %esi 887; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax 888; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx 889; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx 890; X86-AVX-NEXT: movl c, %esi 891; X86-AVX-NEXT: movzwl (%edx,%ecx), %edx 892; X86-AVX-NEXT: vmovd %edx, %xmm0 893; X86-AVX-NEXT: vpmovsxbd %xmm0, %xmm0 894; X86-AVX-NEXT: movzwl (%eax,%ecx), %eax 895; X86-AVX-NEXT: vmovd %eax, %xmm1 896; X86-AVX-NEXT: vpmovsxbd %xmm1, %xmm1 897; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 898; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) 899; X86-AVX-NEXT: popl %esi 900; X86-AVX-NEXT: retl 901; 902; X64-SSE-LABEL: mul_2xi8_sext: 903; X64-SSE: # %bb.0: # %entry 904; X64-SSE-NEXT: movq c(%rip), %rax 905; X64-SSE-NEXT: movzwl (%rdi,%rdx), %ecx 906; X64-SSE-NEXT: movd %ecx, %xmm0 907; X64-SSE-NEXT: movzwl (%rsi,%rdx), %ecx 908; X64-SSE-NEXT: movd %ecx, %xmm1 909; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 910; X64-SSE-NEXT: psraw $8, %xmm0 911; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 912; X64-SSE-NEXT: psraw $8, %xmm1 913; X64-SSE-NEXT: pmullw %xmm0, %xmm1 914; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7] 915; X64-SSE-NEXT: psrad $16, %xmm0 916; X64-SSE-NEXT: movq %xmm0, (%rax,%rdx,4) 917; X64-SSE-NEXT: retq 918; 919; X64-AVX-LABEL: mul_2xi8_sext: 920; X64-AVX: # %bb.0: # %entry 921; X64-AVX-NEXT: movq c(%rip), %rax 922; X64-AVX-NEXT: movzwl (%rdi,%rdx), %ecx 923; X64-AVX-NEXT: vmovd %ecx, %xmm0 924; X64-AVX-NEXT: vpmovsxbd %xmm0, %xmm0 925; X64-AVX-NEXT: movzwl (%rsi,%rdx), %ecx 926; X64-AVX-NEXT: vmovd %ecx, %xmm1 927; X64-AVX-NEXT: vpmovsxbd %xmm1, %xmm1 928; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 929; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) 930; X64-AVX-NEXT: retq 931entry: 932 %pre = load ptr, ptr @c 933 %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index 934 %wide.load = load <2 x i8>, ptr %tmp6, align 1 935 %tmp8 = sext <2 x i8> %wide.load to <2 x i32> 936 %tmp10 = getelementptr inbounds i8, ptr %b, i64 %index 937 %wide.load17 = load <2 x i8>, ptr %tmp10, align 1 938 %tmp12 = sext <2 x i8> %wide.load17 to <2 x i32> 939 %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8 940 %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index 941 store <2 x i32> %tmp13, ptr %tmp14, align 4 942 ret void 943} 944 945; 
%val1 = load <2 x i8> 946; %op1 = sext<2 x i32> %val1 947; %val2 = load <2 x i8> 948; %op2 = zext<2 x i32> %val2 949; %rst = mul <2 x i32> %op1, %op2 950; 951define void @mul_2xi8_sext_zext(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 %index) nounwind { 952; X86-SSE-LABEL: mul_2xi8_sext_zext: 953; X86-SSE: # %bb.0: # %entry 954; X86-SSE-NEXT: pushl %esi 955; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx 956; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax 957; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi 958; X86-SSE-NEXT: movl c, %ecx 959; X86-SSE-NEXT: movzwl (%esi,%eax), %esi 960; X86-SSE-NEXT: movd %esi, %xmm0 961; X86-SSE-NEXT: movzwl (%edx,%eax), %edx 962; X86-SSE-NEXT: movd %edx, %xmm1 963; X86-SSE-NEXT: pxor %xmm2, %xmm2 964; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 965; X86-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] 966; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 967; X86-SSE-NEXT: psraw $8, %xmm0 968; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 969; X86-SSE-NEXT: pmaddwd %xmm1, %xmm0 970; X86-SSE-NEXT: movq %xmm0, (%ecx,%eax,4) 971; X86-SSE-NEXT: popl %esi 972; X86-SSE-NEXT: retl 973; 974; X86-AVX-LABEL: mul_2xi8_sext_zext: 975; X86-AVX: # %bb.0: # %entry 976; X86-AVX-NEXT: pushl %esi 977; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax 978; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx 979; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx 980; X86-AVX-NEXT: movl c, %esi 981; X86-AVX-NEXT: movzwl (%edx,%ecx), %edx 982; X86-AVX-NEXT: vmovd %edx, %xmm0 983; X86-AVX-NEXT: vpmovsxbd %xmm0, %xmm0 984; X86-AVX-NEXT: movzwl (%eax,%ecx), %eax 985; X86-AVX-NEXT: vmovd %eax, %xmm1 986; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero 987; X86-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0 988; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) 989; X86-AVX-NEXT: popl %esi 990; X86-AVX-NEXT: retl 991; 992; X64-SSE-LABEL: mul_2xi8_sext_zext: 993; X64-SSE: # %bb.0: # %entry 994; X64-SSE-NEXT: movq c(%rip), %rax 995; X64-SSE-NEXT: movzwl (%rdi,%rdx), %ecx 996; X64-SSE-NEXT: movd %ecx, %xmm0 997; X64-SSE-NEXT: movzwl (%rsi,%rdx), %ecx 998; X64-SSE-NEXT: movd %ecx, %xmm1 999; X64-SSE-NEXT: pxor %xmm2, %xmm2 1000; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 1001; X64-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] 1002; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1003; X64-SSE-NEXT: psraw $8, %xmm0 1004; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 1005; X64-SSE-NEXT: pmaddwd %xmm1, %xmm0 1006; X64-SSE-NEXT: movq %xmm0, (%rax,%rdx,4) 1007; X64-SSE-NEXT: retq 1008; 1009; X64-AVX-LABEL: mul_2xi8_sext_zext: 1010; X64-AVX: # %bb.0: # %entry 1011; X64-AVX-NEXT: movq c(%rip), %rax 1012; X64-AVX-NEXT: movzwl (%rdi,%rdx), %ecx 1013; X64-AVX-NEXT: vmovd %ecx, %xmm0 1014; X64-AVX-NEXT: vpmovsxbd %xmm0, %xmm0 1015; X64-AVX-NEXT: movzwl (%rsi,%rdx), %ecx 1016; X64-AVX-NEXT: vmovd %ecx, %xmm1 1017; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero 1018; X64-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0 1019; X64-AVX-NEXT: vmovq %xmm0, 
(%rax,%rdx,4) 1020; X64-AVX-NEXT: retq 1021entry: 1022 %pre = load ptr, ptr @c 1023 %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index 1024 %wide.load = load <2 x i8>, ptr %tmp6, align 1 1025 %tmp8 = sext <2 x i8> %wide.load to <2 x i32> 1026 %tmp10 = getelementptr inbounds i8, ptr %b, i64 %index 1027 %wide.load17 = load <2 x i8>, ptr %tmp10, align 1 1028 %tmp12 = zext <2 x i8> %wide.load17 to <2 x i32> 1029 %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8 1030 %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index 1031 store <2 x i32> %tmp13, ptr %tmp14, align 4 1032 ret void 1033} 1034 1035; %val1 = load <2 x i16> 1036; %op1 = sext<2 x i32> %val1 1037; %val2 = load <2 x i16> 1038; %op2 = sext<2 x i32> %val2 1039; %rst = mul <2 x i32> %op1, %op2 1040; 1041define void @mul_2xi16_sext(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 %index) nounwind { 1042; X86-SSE-LABEL: mul_2xi16_sext: 1043; X86-SSE: # %bb.0: # %entry 1044; X86-SSE-NEXT: pushl %esi 1045; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax 1046; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx 1047; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx 1048; X86-SSE-NEXT: movl c, %esi 1049; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1050; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1051; X86-SSE-NEXT: pxor %xmm2, %xmm2 1052; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 1053; X86-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] 1054; X86-SSE-NEXT: pmaddwd %xmm0, %xmm1 1055; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4) 1056; X86-SSE-NEXT: popl %esi 1057; X86-SSE-NEXT: retl 1058; 1059; X86-AVX-LABEL: mul_2xi16_sext: 1060; X86-AVX: # %bb.0: # %entry 1061; X86-AVX-NEXT: pushl %esi 1062; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax 1063; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx 1064; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx 1065; X86-AVX-NEXT: movl c, %esi 1066; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1067; X86-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1068; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 1069; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 1070; X86-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0 1071; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) 1072; X86-AVX-NEXT: popl %esi 1073; X86-AVX-NEXT: retl 1074; 1075; X64-SSE-LABEL: mul_2xi16_sext: 1076; X64-SSE: # %bb.0: # %entry 1077; X64-SSE-NEXT: movq c(%rip), %rax 1078; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1079; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1080; X64-SSE-NEXT: pxor %xmm2, %xmm2 1081; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 1082; X64-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] 1083; X64-SSE-NEXT: pmaddwd %xmm0, %xmm1 1084; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4) 1085; X64-SSE-NEXT: retq 1086; 1087; X64-AVX-LABEL: mul_2xi16_sext: 1088; X64-AVX: # %bb.0: # %entry 1089; X64-AVX-NEXT: movq c(%rip), %rax 1090; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1091; X64-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1092; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 1093; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 1094; X64-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0 1095; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) 1096; X64-AVX-NEXT: retq 1097entry: 1098 %pre = load ptr, 
ptr @c 1099 %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index 1100 %wide.load = load <2 x i16>, ptr %tmp6, align 1 1101 %tmp8 = sext <2 x i16> %wide.load to <2 x i32> 1102 %tmp10 = getelementptr inbounds i8, ptr %b, i64 %index 1103 %wide.load17 = load <2 x i16>, ptr %tmp10, align 1 1104 %tmp12 = sext <2 x i16> %wide.load17 to <2 x i32> 1105 %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8 1106 %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index 1107 store <2 x i32> %tmp13, ptr %tmp14, align 4 1108 ret void 1109} 1110 1111; %val1 = load <2 x i16> 1112; %op1 = sext<2 x i32> %val1 1113; %val2 = load <2 x i16> 1114; %op2 = zext<2 x i32> %val2 1115; %rst = mul <2 x i32> %op1, %op2 1116; 1117define void @mul_2xi16_sext_zext(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 %index) nounwind { 1118; X86-SSE-LABEL: mul_2xi16_sext_zext: 1119; X86-SSE: # %bb.0: # %entry 1120; X86-SSE-NEXT: pushl %esi 1121; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx 1122; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax 1123; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi 1124; X86-SSE-NEXT: movl c, %ecx 1125; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1126; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] 1127; X86-SSE-NEXT: psrad $16, %xmm0 1128; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1129; X86-SSE-NEXT: pxor %xmm2, %xmm2 1130; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] 1131; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] 1132; X86-SSE-NEXT: pmuludq %xmm0, %xmm1 1133; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 1134; X86-SSE-NEXT: pmuludq %xmm2, %xmm0 1135; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 1136; X86-SSE-NEXT: movq %xmm1, (%ecx,%eax,4) 1137; X86-SSE-NEXT: popl %esi 1138; X86-SSE-NEXT: retl 1139; 1140; X86-AVX-LABEL: mul_2xi16_sext_zext: 1141; X86-AVX: # %bb.0: # %entry 1142; X86-AVX-NEXT: pushl %esi 1143; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax 1144; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx 1145; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx 1146; X86-AVX-NEXT: movl c, %esi 1147; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1148; X86-AVX-NEXT: vpmovsxwd %xmm0, %xmm0 1149; X86-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1150; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 1151; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 1152; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) 1153; X86-AVX-NEXT: popl %esi 1154; X86-AVX-NEXT: retl 1155; 1156; X64-SSE-LABEL: mul_2xi16_sext_zext: 1157; X64-SSE: # %bb.0: # %entry 1158; X64-SSE-NEXT: movq c(%rip), %rax 1159; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1160; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] 1161; X64-SSE-NEXT: psrad $16, %xmm0 1162; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1163; X64-SSE-NEXT: pxor %xmm2, %xmm2 1164; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] 1165; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] 1166; X64-SSE-NEXT: pmuludq %xmm0, %xmm1 1167; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 1168; X64-SSE-NEXT: pmuludq %xmm2, %xmm0 1169; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 1170; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4) 1171; X64-SSE-NEXT: retq 1172; 1173; X64-AVX-LABEL: mul_2xi16_sext_zext: 1174; X64-AVX: # %bb.0: # %entry 1175; X64-AVX-NEXT: movq c(%rip), %rax 1176; X64-AVX-NEXT: vmovd {{.*#+}} 
xmm0 = mem[0],zero,zero,zero 1177; X64-AVX-NEXT: vpmovsxwd %xmm0, %xmm0 1178; X64-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1179; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 1180; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 1181; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) 1182; X64-AVX-NEXT: retq 1183entry: 1184 %pre = load ptr, ptr @c 1185 %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index 1186 %wide.load = load <2 x i16>, ptr %tmp6, align 1 1187 %tmp8 = sext <2 x i16> %wide.load to <2 x i32> 1188 %tmp10 = getelementptr inbounds i8, ptr %b, i64 %index 1189 %wide.load17 = load <2 x i16>, ptr %tmp10, align 1 1190 %tmp12 = zext <2 x i16> %wide.load17 to <2 x i32> 1191 %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8 1192 %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index 1193 store <2 x i32> %tmp13, ptr %tmp14, align 4 1194 ret void 1195} 1196 1197; %val1 = load <16 x i16> 1198; %op1 = sext<16 x i32> %val1 1199; %val2 = load <16 x i16> 1200; %op2 = sext<16 x i32> %val2 1201; %rst = mul <16 x i32> %op1, %op2 1202; 1203define void @mul_16xi16_sext(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 %index) nounwind { 1204; X86-SSE-LABEL: mul_16xi16_sext: 1205; X86-SSE: # %bb.0: # %entry 1206; X86-SSE-NEXT: pushl %esi 1207; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx 1208; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax 1209; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi 1210; X86-SSE-NEXT: movl c, %ecx 1211; X86-SSE-NEXT: movdqu (%esi,%eax), %xmm2 1212; X86-SSE-NEXT: movdqu 16(%esi,%eax), %xmm3 1213; X86-SSE-NEXT: movdqu (%edx,%eax), %xmm0 1214; X86-SSE-NEXT: movdqu 16(%edx,%eax), %xmm1 1215; X86-SSE-NEXT: movdqa %xmm0, %xmm4 1216; X86-SSE-NEXT: pmulhw %xmm2, %xmm4 1217; X86-SSE-NEXT: pmullw %xmm2, %xmm0 1218; X86-SSE-NEXT: movdqa %xmm0, %xmm2 1219; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] 1220; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] 1221; X86-SSE-NEXT: movdqa %xmm1, %xmm4 1222; X86-SSE-NEXT: pmulhw %xmm3, %xmm4 1223; X86-SSE-NEXT: pmullw %xmm3, %xmm1 1224; X86-SSE-NEXT: movdqa %xmm1, %xmm3 1225; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] 1226; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] 1227; X86-SSE-NEXT: movdqu %xmm1, 32(%ecx,%eax,4) 1228; X86-SSE-NEXT: movdqu %xmm3, 48(%ecx,%eax,4) 1229; X86-SSE-NEXT: movdqu %xmm0, (%ecx,%eax,4) 1230; X86-SSE-NEXT: movdqu %xmm2, 16(%ecx,%eax,4) 1231; X86-SSE-NEXT: popl %esi 1232; X86-SSE-NEXT: retl 1233; 1234; X86-AVX1-LABEL: mul_16xi16_sext: 1235; X86-AVX1: # %bb.0: # %entry 1236; X86-AVX1-NEXT: pushl %esi 1237; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx 1238; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax 1239; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %esi 1240; X86-AVX1-NEXT: movl c, %ecx 1241; X86-AVX1-NEXT: vpmovsxwd 24(%esi,%eax), %xmm0 1242; X86-AVX1-NEXT: vpmovsxwd 16(%esi,%eax), %xmm1 1243; X86-AVX1-NEXT: vpmovsxwd 8(%esi,%eax), %xmm2 1244; X86-AVX1-NEXT: vpmovsxwd (%esi,%eax), %xmm3 1245; X86-AVX1-NEXT: vpmovsxwd 24(%edx,%eax), %xmm4 1246; X86-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0 1247; X86-AVX1-NEXT: vpmovsxwd 16(%edx,%eax), %xmm4 1248; X86-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1 1249; X86-AVX1-NEXT: vpmovsxwd 8(%edx,%eax), %xmm4 1250; X86-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2 1251; X86-AVX1-NEXT: vpmovsxwd (%edx,%eax), %xmm4 1252; X86-AVX1-NEXT: 
vpmulld %xmm3, %xmm4, %xmm3 1253; X86-AVX1-NEXT: vmovdqu %xmm0, 48(%ecx,%eax,4) 1254; X86-AVX1-NEXT: vmovdqu %xmm1, 32(%ecx,%eax,4) 1255; X86-AVX1-NEXT: vmovdqu %xmm2, 16(%ecx,%eax,4) 1256; X86-AVX1-NEXT: vmovdqu %xmm3, (%ecx,%eax,4) 1257; X86-AVX1-NEXT: popl %esi 1258; X86-AVX1-NEXT: retl 1259; 1260; X86-AVX2-LABEL: mul_16xi16_sext: 1261; X86-AVX2: # %bb.0: # %entry 1262; X86-AVX2-NEXT: pushl %esi 1263; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax 1264; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx 1265; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx 1266; X86-AVX2-NEXT: movl c, %esi 1267; X86-AVX2-NEXT: vpmovsxwd 16(%edx,%ecx), %ymm0 1268; X86-AVX2-NEXT: vpmovsxwd (%edx,%ecx), %ymm1 1269; X86-AVX2-NEXT: vpmovsxwd 16(%eax,%ecx), %ymm2 1270; X86-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0 1271; X86-AVX2-NEXT: vpmovsxwd (%eax,%ecx), %ymm2 1272; X86-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1 1273; X86-AVX2-NEXT: vmovdqu %ymm0, 32(%esi,%ecx,4) 1274; X86-AVX2-NEXT: vmovdqu %ymm1, (%esi,%ecx,4) 1275; X86-AVX2-NEXT: popl %esi 1276; X86-AVX2-NEXT: vzeroupper 1277; X86-AVX2-NEXT: retl 1278; 1279; X64-SSE-LABEL: mul_16xi16_sext: 1280; X64-SSE: # %bb.0: # %entry 1281; X64-SSE-NEXT: movq c(%rip), %rax 1282; X64-SSE-NEXT: movdqu (%rdi,%rdx), %xmm0 1283; X64-SSE-NEXT: movdqu 16(%rdi,%rdx), %xmm1 1284; X64-SSE-NEXT: movdqu (%rsi,%rdx), %xmm2 1285; X64-SSE-NEXT: movdqu 16(%rsi,%rdx), %xmm3 1286; X64-SSE-NEXT: movdqa %xmm2, %xmm4 1287; X64-SSE-NEXT: pmulhw %xmm0, %xmm4 1288; X64-SSE-NEXT: pmullw %xmm0, %xmm2 1289; X64-SSE-NEXT: movdqa %xmm2, %xmm0 1290; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] 1291; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] 1292; X64-SSE-NEXT: movdqa %xmm3, %xmm4 1293; X64-SSE-NEXT: pmulhw %xmm1, %xmm4 1294; X64-SSE-NEXT: pmullw %xmm1, %xmm3 1295; X64-SSE-NEXT: movdqa %xmm3, %xmm1 1296; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] 1297; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] 1298; X64-SSE-NEXT: movdqu %xmm3, 32(%rax,%rdx,4) 1299; X64-SSE-NEXT: movdqu %xmm1, 48(%rax,%rdx,4) 1300; X64-SSE-NEXT: movdqu %xmm2, (%rax,%rdx,4) 1301; X64-SSE-NEXT: movdqu %xmm0, 16(%rax,%rdx,4) 1302; X64-SSE-NEXT: retq 1303; 1304; X64-AVX1-LABEL: mul_16xi16_sext: 1305; X64-AVX1: # %bb.0: # %entry 1306; X64-AVX1-NEXT: movq c(%rip), %rax 1307; X64-AVX1-NEXT: vpmovsxwd 24(%rdi,%rdx), %xmm0 1308; X64-AVX1-NEXT: vpmovsxwd 16(%rdi,%rdx), %xmm1 1309; X64-AVX1-NEXT: vpmovsxwd 8(%rdi,%rdx), %xmm2 1310; X64-AVX1-NEXT: vpmovsxwd (%rdi,%rdx), %xmm3 1311; X64-AVX1-NEXT: vpmovsxwd 24(%rsi,%rdx), %xmm4 1312; X64-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0 1313; X64-AVX1-NEXT: vpmovsxwd 16(%rsi,%rdx), %xmm4 1314; X64-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1 1315; X64-AVX1-NEXT: vpmovsxwd 8(%rsi,%rdx), %xmm4 1316; X64-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2 1317; X64-AVX1-NEXT: vpmovsxwd (%rsi,%rdx), %xmm4 1318; X64-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3 1319; X64-AVX1-NEXT: vmovdqu %xmm0, 48(%rax,%rdx,4) 1320; X64-AVX1-NEXT: vmovdqu %xmm1, 32(%rax,%rdx,4) 1321; X64-AVX1-NEXT: vmovdqu %xmm2, 16(%rax,%rdx,4) 1322; X64-AVX1-NEXT: vmovdqu %xmm3, (%rax,%rdx,4) 1323; X64-AVX1-NEXT: retq 1324; 1325; X64-AVX2-LABEL: mul_16xi16_sext: 1326; X64-AVX2: # %bb.0: # %entry 1327; X64-AVX2-NEXT: movq c(%rip), %rax 1328; X64-AVX2-NEXT: vpmovsxwd 16(%rdi,%rdx), %ymm0 1329; X64-AVX2-NEXT: vpmovsxwd (%rdi,%rdx), %ymm1 1330; 
X64-AVX2-NEXT: vpmovsxwd 16(%rsi,%rdx), %ymm2 1331; X64-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0 1332; X64-AVX2-NEXT: vpmovsxwd (%rsi,%rdx), %ymm2 1333; X64-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1 1334; X64-AVX2-NEXT: vmovdqu %ymm0, 32(%rax,%rdx,4) 1335; X64-AVX2-NEXT: vmovdqu %ymm1, (%rax,%rdx,4) 1336; X64-AVX2-NEXT: vzeroupper 1337; X64-AVX2-NEXT: retq 1338entry: 1339 %pre = load ptr, ptr @c 1340 %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index 1341 %wide.load = load <16 x i16>, ptr %tmp6, align 1 1342 %tmp8 = sext <16 x i16> %wide.load to <16 x i32> 1343 %tmp10 = getelementptr inbounds i8, ptr %b, i64 %index 1344 %wide.load17 = load <16 x i16>, ptr %tmp10, align 1 1345 %tmp12 = sext <16 x i16> %wide.load17 to <16 x i32> 1346 %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8 1347 %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index 1348 store <16 x i32> %tmp13, ptr %tmp14, align 4 1349 ret void 1350} 1351 1352; %val = load <2 x i8> 1353; %op1 = zext<2 x i32> %val 1354; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 255) 1355; %rst = mul <2 x i32> %op1, %op2 1356; 1357define void @mul_2xi8_varconst1(ptr nocapture readonly %a, i64 %index) { 1358; X86-SSE-LABEL: mul_2xi8_varconst1: 1359; X86-SSE: # %bb.0: # %entry 1360; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax 1361; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx 1362; X86-SSE-NEXT: movl c, %edx 1363; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx 1364; X86-SSE-NEXT: movd %ecx, %xmm0 1365; X86-SSE-NEXT: pxor %xmm1, %xmm1 1366; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1367; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1368; X86-SSE-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [0,0,255,0,u,u,u,u] 1369; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) 1370; X86-SSE-NEXT: retl 1371; 1372; X86-AVX-LABEL: mul_2xi8_varconst1: 1373; X86-AVX: # %bb.0: # %entry 1374; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax 1375; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx 1376; X86-AVX-NEXT: movl c, %edx 1377; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx 1378; X86-AVX-NEXT: vmovd %ecx, %xmm0 1379; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 1380; X86-AVX-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # [0,0,255,0,u,u,u,u] 1381; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) 1382; X86-AVX-NEXT: retl 1383; 1384; X64-SSE-LABEL: mul_2xi8_varconst1: 1385; X64-SSE: # %bb.0: # %entry 1386; X64-SSE-NEXT: movq c(%rip), %rax 1387; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx 1388; X64-SSE-NEXT: movd %ecx, %xmm0 1389; X64-SSE-NEXT: pxor %xmm1, %xmm1 1390; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1391; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1392; X64-SSE-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,0,255,0,u,u,u,u] 1393; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) 1394; X64-SSE-NEXT: retq 1395; 1396; X64-AVX-LABEL: mul_2xi8_varconst1: 1397; X64-AVX: # %bb.0: # %entry 1398; X64-AVX-NEXT: movq c(%rip), %rax 1399; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx 1400; X64-AVX-NEXT: vmovd %ecx, %xmm0 1401; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 
1402; X64-AVX-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,0,255,0,u,u,u,u] 1403; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) 1404; X64-AVX-NEXT: retq 1405entry: 1406 %pre = load ptr, ptr @c 1407 %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index 1408 %wide.load = load <2 x i8>, ptr %tmp6, align 1 1409 %tmp8 = zext <2 x i8> %wide.load to <2 x i32> 1410 %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 255> 1411 %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index 1412 store <2 x i32> %tmp13, ptr %tmp14, align 4 1413 ret void 1414} 1415 1416; %val = load <2 x i8> 1417; %op1 = sext<2 x i32> %val 1418; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-128 ~ 127) 1419; %rst = mul <2 x i32> %op1, %op2 1420; 1421define void @mul_2xi8_varconst2(ptr nocapture readonly %a, i64 %index) { 1422; X86-SSE-LABEL: mul_2xi8_varconst2: 1423; X86-SSE: # %bb.0: # %entry 1424; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax 1425; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx 1426; X86-SSE-NEXT: movl c, %edx 1427; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx 1428; X86-SSE-NEXT: movd %ecx, %xmm0 1429; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1430; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 1431; X86-SSE-NEXT: psrad $24, %xmm0 1432; X86-SSE-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [65408,0,127,0,u,u,u,u] 1433; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) 1434; X86-SSE-NEXT: retl 1435; 1436; X86-AVX-LABEL: mul_2xi8_varconst2: 1437; X86-AVX: # %bb.0: # %entry 1438; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax 1439; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx 1440; X86-AVX-NEXT: movl c, %edx 1441; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx 1442; X86-AVX-NEXT: vmovd %ecx, %xmm0 1443; X86-AVX-NEXT: vpmovsxbd %xmm0, %xmm0 1444; X86-AVX-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # [65408,0,127,0,u,u,u,u] 1445; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) 1446; X86-AVX-NEXT: retl 1447; 1448; X64-SSE-LABEL: mul_2xi8_varconst2: 1449; X64-SSE: # %bb.0: # %entry 1450; X64-SSE-NEXT: movq c(%rip), %rax 1451; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx 1452; X64-SSE-NEXT: movd %ecx, %xmm0 1453; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1454; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 1455; X64-SSE-NEXT: psrad $24, %xmm0 1456; X64-SSE-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [65408,0,127,0,u,u,u,u] 1457; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) 1458; X64-SSE-NEXT: retq 1459; 1460; X64-AVX-LABEL: mul_2xi8_varconst2: 1461; X64-AVX: # %bb.0: # %entry 1462; X64-AVX-NEXT: movq c(%rip), %rax 1463; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx 1464; X64-AVX-NEXT: vmovd %ecx, %xmm0 1465; X64-AVX-NEXT: vpmovsxbd %xmm0, %xmm0 1466; X64-AVX-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [65408,0,127,0,u,u,u,u] 1467; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) 1468; X64-AVX-NEXT: retq 1469entry: 1470 %pre = load ptr, ptr @c 1471 %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index 1472 %wide.load = load <2 x i8>, ptr %tmp6, align 1 1473 %tmp8 = sext <2 x i8> %wide.load to <2 x i32> 1474 %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -128, i32 127> 1475 %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index 1476 store <2 x i32> %tmp13, ptr %tmp14, align 4 1477 ret void 1478} 1479 1480; %val = load <2 x i8> 1481; %op1 = zext<2 x i32> %val 1482; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 256) 1483; %rst = mul <2 x i32> %op1, %op2 1484; 1485define void @mul_2xi8_varconst3(ptr nocapture 
readonly %a, i64 %index) { 1486; X86-SSE-LABEL: mul_2xi8_varconst3: 1487; X86-SSE: # %bb.0: # %entry 1488; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax 1489; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx 1490; X86-SSE-NEXT: movl c, %edx 1491; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx 1492; X86-SSE-NEXT: movd %ecx, %xmm0 1493; X86-SSE-NEXT: pxor %xmm1, %xmm1 1494; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1495; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1496; X86-SSE-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [0,0,256,0,u,u,u,u] 1497; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) 1498; X86-SSE-NEXT: retl 1499; 1500; X86-AVX-LABEL: mul_2xi8_varconst3: 1501; X86-AVX: # %bb.0: # %entry 1502; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax 1503; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx 1504; X86-AVX-NEXT: movl c, %edx 1505; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx 1506; X86-AVX-NEXT: vmovd %ecx, %xmm0 1507; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 1508; X86-AVX-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # [0,0,256,0,u,u,u,u] 1509; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) 1510; X86-AVX-NEXT: retl 1511; 1512; X64-SSE-LABEL: mul_2xi8_varconst3: 1513; X64-SSE: # %bb.0: # %entry 1514; X64-SSE-NEXT: movq c(%rip), %rax 1515; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx 1516; X64-SSE-NEXT: movd %ecx, %xmm0 1517; X64-SSE-NEXT: pxor %xmm1, %xmm1 1518; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1519; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1520; X64-SSE-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,0,256,0,u,u,u,u] 1521; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) 1522; X64-SSE-NEXT: retq 1523; 1524; X64-AVX-LABEL: mul_2xi8_varconst3: 1525; X64-AVX: # %bb.0: # %entry 1526; X64-AVX-NEXT: movq c(%rip), %rax 1527; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx 1528; X64-AVX-NEXT: vmovd %ecx, %xmm0 1529; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 1530; X64-AVX-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,0,256,0,u,u,u,u] 1531; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) 1532; X64-AVX-NEXT: retq 1533entry: 1534 %pre = load ptr, ptr @c 1535 %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index 1536 %wide.load = load <2 x i8>, ptr %tmp6, align 1 1537 %tmp8 = zext <2 x i8> %wide.load to <2 x i32> 1538 %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 256> 1539 %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index 1540 store <2 x i32> %tmp13, ptr %tmp14, align 4 1541 ret void 1542} 1543 1544; %val = load <2 x i8> 1545; %op1 = zext<2 x i32> %val 1546; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-1 ~ 255) 1547; %rst = mul <2 x i32> %op1, %op2 1548; 1549define void @mul_2xi8_varconst4(ptr nocapture readonly %a, i64 %index) { 1550; X86-SSE-LABEL: mul_2xi8_varconst4: 1551; X86-SSE: # %bb.0: # %entry 1552; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax 1553; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx 1554; X86-SSE-NEXT: movl c, %edx 1555; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx 1556; X86-SSE-NEXT: movd %ecx, %xmm0 1557; X86-SSE-NEXT: pxor %xmm1, %xmm1 1558; 
X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1559; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1560; X86-SSE-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [65535,0,255,0,u,u,u,u] 1561; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) 1562; X86-SSE-NEXT: retl 1563; 1564; X86-AVX-LABEL: mul_2xi8_varconst4: 1565; X86-AVX: # %bb.0: # %entry 1566; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax 1567; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx 1568; X86-AVX-NEXT: movl c, %edx 1569; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx 1570; X86-AVX-NEXT: vmovd %ecx, %xmm0 1571; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 1572; X86-AVX-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # [65535,0,255,0,u,u,u,u] 1573; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) 1574; X86-AVX-NEXT: retl 1575; 1576; X64-SSE-LABEL: mul_2xi8_varconst4: 1577; X64-SSE: # %bb.0: # %entry 1578; X64-SSE-NEXT: movq c(%rip), %rax 1579; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx 1580; X64-SSE-NEXT: movd %ecx, %xmm0 1581; X64-SSE-NEXT: pxor %xmm1, %xmm1 1582; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1583; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1584; X64-SSE-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [65535,0,255,0,u,u,u,u] 1585; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) 1586; X64-SSE-NEXT: retq 1587; 1588; X64-AVX-LABEL: mul_2xi8_varconst4: 1589; X64-AVX: # %bb.0: # %entry 1590; X64-AVX-NEXT: movq c(%rip), %rax 1591; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx 1592; X64-AVX-NEXT: vmovd %ecx, %xmm0 1593; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 1594; X64-AVX-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [65535,0,255,0,u,u,u,u] 1595; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) 1596; X64-AVX-NEXT: retq 1597entry: 1598 %pre = load ptr, ptr @c 1599 %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index 1600 %wide.load = load <2 x i8>, ptr %tmp6, align 1 1601 %tmp8 = zext <2 x i8> %wide.load to <2 x i32> 1602 %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -1, i32 255> 1603 %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index 1604 store <2 x i32> %tmp13, ptr %tmp14, align 4 1605 ret void 1606} 1607 1608; %val = load <2 x i8> 1609; %op1 = sext<2 x i32> %val 1610; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-129 ~ 127) 1611; %rst = mul <2 x i32> %op1, %op2 1612; 1613define void @mul_2xi8_varconst5(ptr nocapture readonly %a, i64 %index) { 1614; X86-SSE-LABEL: mul_2xi8_varconst5: 1615; X86-SSE: # %bb.0: # %entry 1616; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax 1617; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx 1618; X86-SSE-NEXT: movl c, %edx 1619; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx 1620; X86-SSE-NEXT: movd %ecx, %xmm0 1621; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1622; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 1623; X86-SSE-NEXT: psrad $24, %xmm0 1624; X86-SSE-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [65407,0,127,0,u,u,u,u] 1625; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) 1626; X86-SSE-NEXT: retl 1627; 1628; X86-AVX-LABEL: 
mul_2xi8_varconst5: 1629; X86-AVX: # %bb.0: # %entry 1630; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax 1631; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx 1632; X86-AVX-NEXT: movl c, %edx 1633; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx 1634; X86-AVX-NEXT: vmovd %ecx, %xmm0 1635; X86-AVX-NEXT: vpmovsxbd %xmm0, %xmm0 1636; X86-AVX-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # [65407,0,127,0,u,u,u,u] 1637; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) 1638; X86-AVX-NEXT: retl 1639; 1640; X64-SSE-LABEL: mul_2xi8_varconst5: 1641; X64-SSE: # %bb.0: # %entry 1642; X64-SSE-NEXT: movq c(%rip), %rax 1643; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx 1644; X64-SSE-NEXT: movd %ecx, %xmm0 1645; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1646; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 1647; X64-SSE-NEXT: psrad $24, %xmm0 1648; X64-SSE-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [65407,0,127,0,u,u,u,u] 1649; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) 1650; X64-SSE-NEXT: retq 1651; 1652; X64-AVX-LABEL: mul_2xi8_varconst5: 1653; X64-AVX: # %bb.0: # %entry 1654; X64-AVX-NEXT: movq c(%rip), %rax 1655; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx 1656; X64-AVX-NEXT: vmovd %ecx, %xmm0 1657; X64-AVX-NEXT: vpmovsxbd %xmm0, %xmm0 1658; X64-AVX-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [65407,0,127,0,u,u,u,u] 1659; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) 1660; X64-AVX-NEXT: retq 1661entry: 1662 %pre = load ptr, ptr @c 1663 %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index 1664 %wide.load = load <2 x i8>, ptr %tmp6, align 1 1665 %tmp8 = sext <2 x i8> %wide.load to <2 x i32> 1666 %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -129, i32 127> 1667 %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index 1668 store <2 x i32> %tmp13, ptr %tmp14, align 4 1669 ret void 1670} 1671 1672; %val = load <2 x i8> 1673; %op1 = sext<2 x i32> %val 1674; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-128 ~ 128) 1675; %rst = mul <2 x i32> %op1, %op2 1676; 1677define void @mul_2xi8_varconst6(ptr nocapture readonly %a, i64 %index) { 1678; X86-SSE-LABEL: mul_2xi8_varconst6: 1679; X86-SSE: # %bb.0: # %entry 1680; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax 1681; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx 1682; X86-SSE-NEXT: movl c, %edx 1683; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx 1684; X86-SSE-NEXT: movd %ecx, %xmm0 1685; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1686; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 1687; X86-SSE-NEXT: psrad $24, %xmm0 1688; X86-SSE-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [65408,0,128,0,u,u,u,u] 1689; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) 1690; X86-SSE-NEXT: retl 1691; 1692; X86-AVX-LABEL: mul_2xi8_varconst6: 1693; X86-AVX: # %bb.0: # %entry 1694; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax 1695; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx 1696; X86-AVX-NEXT: movl c, %edx 1697; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx 1698; X86-AVX-NEXT: vmovd %ecx, %xmm0 1699; X86-AVX-NEXT: vpmovsxbd %xmm0, %xmm0 1700; X86-AVX-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # [65408,0,128,0,u,u,u,u] 1701; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) 1702; X86-AVX-NEXT: retl 1703; 1704; X64-SSE-LABEL: mul_2xi8_varconst6: 1705; X64-SSE: # %bb.0: # %entry 1706; X64-SSE-NEXT: movq c(%rip), %rax 1707; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx 1708; X64-SSE-NEXT: movd %ecx, %xmm0 1709; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1710; X64-SSE-NEXT: punpcklwd 
{{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 1711; X64-SSE-NEXT: psrad $24, %xmm0 1712; X64-SSE-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [65408,0,128,0,u,u,u,u] 1713; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) 1714; X64-SSE-NEXT: retq 1715; 1716; X64-AVX-LABEL: mul_2xi8_varconst6: 1717; X64-AVX: # %bb.0: # %entry 1718; X64-AVX-NEXT: movq c(%rip), %rax 1719; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx 1720; X64-AVX-NEXT: vmovd %ecx, %xmm0 1721; X64-AVX-NEXT: vpmovsxbd %xmm0, %xmm0 1722; X64-AVX-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [65408,0,128,0,u,u,u,u] 1723; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) 1724; X64-AVX-NEXT: retq 1725entry: 1726 %pre = load ptr, ptr @c 1727 %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index 1728 %wide.load = load <2 x i8>, ptr %tmp6, align 1 1729 %tmp8 = sext <2 x i8> %wide.load to <2 x i32> 1730 %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -128, i32 128> 1731 %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index 1732 store <2 x i32> %tmp13, ptr %tmp14, align 4 1733 ret void 1734} 1735 1736; %val = load <2 x i16> 1737; %op1 = zext<2 x i32> %val 1738; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 65535) 1739; %rst = mul <2 x i32> %op1, %op2 1740; 1741define void @mul_2xi16_varconst1(ptr nocapture readonly %a, i64 %index) { 1742; X86-SSE-LABEL: mul_2xi16_varconst1: 1743; X86-SSE: # %bb.0: # %entry 1744; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax 1745; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx 1746; X86-SSE-NEXT: movl c, %edx 1747; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1748; X86-SSE-NEXT: movd {{.*#+}} xmm1 = [0,65535,0,0,0,0,0,0] 1749; X86-SSE-NEXT: movdqa %xmm0, %xmm2 1750; X86-SSE-NEXT: pmulhuw %xmm1, %xmm2 1751; X86-SSE-NEXT: pmullw %xmm1, %xmm0 1752; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 1753; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) 1754; X86-SSE-NEXT: retl 1755; 1756; X86-AVX-LABEL: mul_2xi16_varconst1: 1757; X86-AVX: # %bb.0: # %entry 1758; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax 1759; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx 1760; X86-AVX-NEXT: movl c, %edx 1761; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1762; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 1763; X86-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 1764; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) 1765; X86-AVX-NEXT: retl 1766; 1767; X64-SSE-LABEL: mul_2xi16_varconst1: 1768; X64-SSE: # %bb.0: # %entry 1769; X64-SSE-NEXT: movq c(%rip), %rax 1770; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1771; X64-SSE-NEXT: movd {{.*#+}} xmm1 = [0,65535,0,0,0,0,0,0] 1772; X64-SSE-NEXT: movdqa %xmm0, %xmm2 1773; X64-SSE-NEXT: pmulhuw %xmm1, %xmm2 1774; X64-SSE-NEXT: pmullw %xmm1, %xmm0 1775; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 1776; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) 1777; X64-SSE-NEXT: retq 1778; 1779; X64-AVX-LABEL: mul_2xi16_varconst1: 1780; X64-AVX: # %bb.0: # %entry 1781; X64-AVX-NEXT: movq c(%rip), %rax 1782; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1783; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 1784; X64-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1785; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) 1786; X64-AVX-NEXT: retq 1787entry: 1788 %pre = load ptr, ptr @c 1789 %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index 1790 %wide.load = load 
<2 x i16>, ptr %tmp6, align 1 1791 %tmp8 = zext <2 x i16> %wide.load to <2 x i32> 1792 %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 65535> 1793 %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index 1794 store <2 x i32> %tmp13, ptr %tmp14, align 4 1795 ret void 1796} 1797 1798; %val = load <2 x i16> 1799; %op1 = sext<2 x i32> %val 1800; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-32768 ~ 32767) 1801; %rst = mul <2 x i32> %op1, %op2 1802; 1803define void @mul_2xi16_varconst2(ptr nocapture readonly %a, i64 %index) { 1804; X86-SSE-LABEL: mul_2xi16_varconst2: 1805; X86-SSE: # %bb.0: # %entry 1806; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax 1807; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx 1808; X86-SSE-NEXT: movl c, %edx 1809; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1810; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] 1811; X86-SSE-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [32768,0,32767,0,u,u,u,u] 1812; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) 1813; X86-SSE-NEXT: retl 1814; 1815; X86-AVX-LABEL: mul_2xi16_varconst2: 1816; X86-AVX: # %bb.0: # %entry 1817; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax 1818; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx 1819; X86-AVX-NEXT: movl c, %edx 1820; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1821; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 1822; X86-AVX-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # [32768,0,32767,0,u,u,u,u] 1823; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) 1824; X86-AVX-NEXT: retl 1825; 1826; X64-SSE-LABEL: mul_2xi16_varconst2: 1827; X64-SSE: # %bb.0: # %entry 1828; X64-SSE-NEXT: movq c(%rip), %rax 1829; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1830; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] 1831; X64-SSE-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [32768,0,32767,0,u,u,u,u] 1832; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) 1833; X64-SSE-NEXT: retq 1834; 1835; X64-AVX-LABEL: mul_2xi16_varconst2: 1836; X64-AVX: # %bb.0: # %entry 1837; X64-AVX-NEXT: movq c(%rip), %rax 1838; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1839; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 1840; X64-AVX-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32768,0,32767,0,u,u,u,u] 1841; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) 1842; X64-AVX-NEXT: retq 1843entry: 1844 %pre = load ptr, ptr @c 1845 %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index 1846 %wide.load = load <2 x i16>, ptr %tmp6, align 1 1847 %tmp8 = sext <2 x i16> %wide.load to <2 x i32> 1848 %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -32768, i32 32767> 1849 %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index 1850 store <2 x i32> %tmp13, ptr %tmp14, align 4 1851 ret void 1852} 1853 1854; %val = load <2 x i16> 1855; %op1 = zext<2 x i32> %val 1856; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 65536) 1857; %rst = mul <2 x i32> %op1, %op2 1858; 1859define void @mul_2xi16_varconst3(ptr nocapture readonly %a, i64 %index) { 1860; X86-SSE-LABEL: mul_2xi16_varconst3: 1861; X86-SSE: # %bb.0: # %entry 1862; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax 1863; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx 1864; X86-SSE-NEXT: movl c, %edx 1865; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1866; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] 1867; X86-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 1868; X86-SSE-NEXT: psllq $32, %xmm0 1869; 
X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) 1870; X86-SSE-NEXT: retl 1871; 1872; X86-AVX-LABEL: mul_2xi16_varconst3: 1873; X86-AVX: # %bb.0: # %entry 1874; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax 1875; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx 1876; X86-AVX-NEXT: movl c, %edx 1877; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1878; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 1879; X86-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 1880; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) 1881; X86-AVX-NEXT: retl 1882; 1883; X64-SSE-LABEL: mul_2xi16_varconst3: 1884; X64-SSE: # %bb.0: # %entry 1885; X64-SSE-NEXT: movq c(%rip), %rax 1886; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1887; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] 1888; X64-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 1889; X64-SSE-NEXT: psllq $32, %xmm0 1890; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) 1891; X64-SSE-NEXT: retq 1892; 1893; X64-AVX-LABEL: mul_2xi16_varconst3: 1894; X64-AVX: # %bb.0: # %entry 1895; X64-AVX-NEXT: movq c(%rip), %rax 1896; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1897; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 1898; X64-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1899; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) 1900; X64-AVX-NEXT: retq 1901entry: 1902 %pre = load ptr, ptr @c 1903 %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index 1904 %wide.load = load <2 x i16>, ptr %tmp6, align 1 1905 %tmp8 = zext <2 x i16> %wide.load to <2 x i32> 1906 %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 65536> 1907 %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index 1908 store <2 x i32> %tmp13, ptr %tmp14, align 4 1909 ret void 1910} 1911 1912; %val = load <2 x i16> 1913; %op1 = sext<2 x i32> %val 1914; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 32768) 1915; %rst = mul <2 x i32> %op1, %op2 1916; 1917define void @mul_2xi16_varconst4(ptr nocapture readonly %a, i64 %index) { 1918; X86-SSE-LABEL: mul_2xi16_varconst4: 1919; X86-SSE: # %bb.0: # %entry 1920; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax 1921; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx 1922; X86-SSE-NEXT: movl c, %edx 1923; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1924; X86-SSE-NEXT: psrad $16, %xmm0 1925; X86-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 1926; X86-SSE-NEXT: psllq $32, %xmm0 1927; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) 1928; X86-SSE-NEXT: retl 1929; 1930; X86-AVX-LABEL: mul_2xi16_varconst4: 1931; X86-AVX: # %bb.0: # %entry 1932; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax 1933; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx 1934; X86-AVX-NEXT: movl c, %edx 1935; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1936; X86-AVX-NEXT: vpmovsxwd %xmm0, %xmm0 1937; X86-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 1938; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) 1939; X86-AVX-NEXT: retl 1940; 1941; X64-SSE-LABEL: mul_2xi16_varconst4: 1942; X64-SSE: # %bb.0: # %entry 1943; X64-SSE-NEXT: movq c(%rip), %rax 1944; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1945; X64-SSE-NEXT: psrad $16, %xmm0 1946; X64-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 1947; X64-SSE-NEXT: psllq $32, %xmm0 1948; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) 1949; X64-SSE-NEXT: retq 1950; 1951; X64-AVX-LABEL: mul_2xi16_varconst4: 1952; X64-AVX: # %bb.0: # %entry 1953; X64-AVX-NEXT: movq c(%rip), %rax 1954; X64-AVX-NEXT: vmovd 
{{.*#+}} xmm0 = mem[0],zero,zero,zero 1955; X64-AVX-NEXT: vpmovsxwd %xmm0, %xmm0 1956; X64-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1957; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) 1958; X64-AVX-NEXT: retq 1959entry: 1960 %pre = load ptr, ptr @c 1961 %tmp6 = getelementptr inbounds i8, ptr %a, i64 %index 1962 %wide.load = load <2 x i16>, ptr %tmp6, align 1 1963 %tmp8 = sext <2 x i16> %wide.load to <2 x i32> 1964 %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 32768> 1965 %tmp14 = getelementptr inbounds i32, ptr %pre, i64 %index 1966 store <2 x i32> %tmp13, ptr %tmp14, align 4 1967 ret void 1968} 1969 1970; 1971; Illegal Types 1972; 1973 1974define void @PR34947(ptr %p0, ptr %p1) nounwind { 1975; X86-SSE-LABEL: PR34947: 1976; X86-SSE: # %bb.0: 1977; X86-SSE-NEXT: pushl %ebp 1978; X86-SSE-NEXT: pushl %ebx 1979; X86-SSE-NEXT: pushl %edi 1980; X86-SSE-NEXT: pushl %esi 1981; X86-SSE-NEXT: pushl %eax 1982; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx 1983; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax 1984; X86-SSE-NEXT: movzwl 16(%eax), %edx 1985; X86-SSE-NEXT: movl %edx, (%esp) # 4-byte Spill 1986; X86-SSE-NEXT: movdqa (%eax), %xmm2 1987; X86-SSE-NEXT: pxor %xmm1, %xmm1 1988; X86-SSE-NEXT: movdqa %xmm2, %xmm0 1989; X86-SSE-NEXT: pextrw $7, %xmm2, %eax 1990; X86-SSE-NEXT: pextrw $4, %xmm2, %esi 1991; X86-SSE-NEXT: pextrw $1, %xmm2, %edi 1992; X86-SSE-NEXT: pextrw $0, %xmm2, %ebx 1993; X86-SSE-NEXT: pextrw $3, %xmm2, %ebp 1994; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 1995; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1996; X86-SSE-NEXT: xorl %edx, %edx 1997; X86-SSE-NEXT: divl 28(%ecx) 1998; X86-SSE-NEXT: movd %edx, %xmm1 1999; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] 2000; X86-SSE-NEXT: movd %xmm3, %eax 2001; X86-SSE-NEXT: xorl %edx, %edx 2002; X86-SSE-NEXT: divl 24(%ecx) 2003; X86-SSE-NEXT: movd %edx, %xmm3 2004; X86-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] 2005; X86-SSE-NEXT: movl %esi, %eax 2006; X86-SSE-NEXT: xorl %edx, %edx 2007; X86-SSE-NEXT: divl 16(%ecx) 2008; X86-SSE-NEXT: movd %edx, %xmm1 2009; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] 2010; X86-SSE-NEXT: movd %xmm0, %eax 2011; X86-SSE-NEXT: xorl %edx, %edx 2012; X86-SSE-NEXT: divl 20(%ecx) 2013; X86-SSE-NEXT: movd %edx, %xmm0 2014; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 2015; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] 2016; X86-SSE-NEXT: movl %edi, %eax 2017; X86-SSE-NEXT: xorl %edx, %edx 2018; X86-SSE-NEXT: divl 4(%ecx) 2019; X86-SSE-NEXT: movd %edx, %xmm3 2020; X86-SSE-NEXT: movl %ebx, %eax 2021; X86-SSE-NEXT: xorl %edx, %edx 2022; X86-SSE-NEXT: divl (%ecx) 2023; X86-SSE-NEXT: movd %edx, %xmm0 2024; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] 2025; X86-SSE-NEXT: movl %ebp, %eax 2026; X86-SSE-NEXT: xorl %edx, %edx 2027; X86-SSE-NEXT: divl 12(%ecx) 2028; X86-SSE-NEXT: movd %edx, %xmm3 2029; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] 2030; X86-SSE-NEXT: movd %xmm2, %eax 2031; X86-SSE-NEXT: xorl %edx, %edx 2032; X86-SSE-NEXT: divl 8(%ecx) 2033; X86-SSE-NEXT: movd %edx, %xmm2 2034; X86-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] 2035; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] 2036; X86-SSE-NEXT: movl (%esp), %eax # 4-byte Reload 2037; X86-SSE-NEXT: xorl %edx, %edx 2038; X86-SSE-NEXT: divl 32(%ecx) 2039; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 
= [8199,8199,8199,8199] 2040; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] 2041; X86-SSE-NEXT: pmuludq %xmm2, %xmm0 2042; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2043; X86-SSE-NEXT: pmuludq %xmm2, %xmm3 2044; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] 2045; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] 2046; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] 2047; X86-SSE-NEXT: pmuludq %xmm2, %xmm1 2048; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 2049; X86-SSE-NEXT: pmuludq %xmm2, %xmm3 2050; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] 2051; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 2052; X86-SSE-NEXT: imull $8199, %edx, %eax # imm = 0x2007 2053; X86-SSE-NEXT: movl %eax, (%eax) 2054; X86-SSE-NEXT: movdqa %xmm1, (%eax) 2055; X86-SSE-NEXT: movdqa %xmm0, (%eax) 2056; X86-SSE-NEXT: addl $4, %esp 2057; X86-SSE-NEXT: popl %esi 2058; X86-SSE-NEXT: popl %edi 2059; X86-SSE-NEXT: popl %ebx 2060; X86-SSE-NEXT: popl %ebp 2061; X86-SSE-NEXT: retl 2062; 2063; X86-AVX1-LABEL: PR34947: 2064; X86-AVX1: # %bb.0: 2065; X86-AVX1-NEXT: pushl %ebp 2066; X86-AVX1-NEXT: pushl %ebx 2067; X86-AVX1-NEXT: pushl %edi 2068; X86-AVX1-NEXT: pushl %esi 2069; X86-AVX1-NEXT: subl $16, %esp 2070; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx 2071; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax 2072; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 2073; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 2074; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 2075; X86-AVX1-NEXT: vmovd %xmm2, %eax 2076; X86-AVX1-NEXT: xorl %edx, %edx 2077; X86-AVX1-NEXT: divl 32(%ecx) 2078; X86-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill 2079; X86-AVX1-NEXT: vpextrd $3, %xmm1, %eax 2080; X86-AVX1-NEXT: xorl %edx, %edx 2081; X86-AVX1-NEXT: divl 28(%ecx) 2082; X86-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill 2083; X86-AVX1-NEXT: vpextrd $2, %xmm1, %eax 2084; X86-AVX1-NEXT: xorl %edx, %edx 2085; X86-AVX1-NEXT: divl 24(%ecx) 2086; X86-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill 2087; X86-AVX1-NEXT: vpextrd $1, %xmm1, %eax 2088; X86-AVX1-NEXT: xorl %edx, %edx 2089; X86-AVX1-NEXT: divl 20(%ecx) 2090; X86-AVX1-NEXT: movl %edx, (%esp) # 4-byte Spill 2091; X86-AVX1-NEXT: vmovd %xmm1, %eax 2092; X86-AVX1-NEXT: xorl %edx, %edx 2093; X86-AVX1-NEXT: divl 16(%ecx) 2094; X86-AVX1-NEXT: movl %edx, %ebp 2095; X86-AVX1-NEXT: vpextrd $3, %xmm0, %eax 2096; X86-AVX1-NEXT: xorl %edx, %edx 2097; X86-AVX1-NEXT: divl 12(%ecx) 2098; X86-AVX1-NEXT: movl %edx, %ebx 2099; X86-AVX1-NEXT: vpextrd $2, %xmm0, %eax 2100; X86-AVX1-NEXT: xorl %edx, %edx 2101; X86-AVX1-NEXT: divl 8(%ecx) 2102; X86-AVX1-NEXT: movl %edx, %esi 2103; X86-AVX1-NEXT: vpextrd $1, %xmm0, %eax 2104; X86-AVX1-NEXT: xorl %edx, %edx 2105; X86-AVX1-NEXT: divl 4(%ecx) 2106; X86-AVX1-NEXT: movl %edx, %edi 2107; X86-AVX1-NEXT: vmovd %xmm0, %eax 2108; X86-AVX1-NEXT: xorl %edx, %edx 2109; X86-AVX1-NEXT: divl (%ecx) 2110; X86-AVX1-NEXT: vmovd %edx, %xmm0 2111; X86-AVX1-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 2112; X86-AVX1-NEXT: vpinsrd $2, %esi, %xmm0, %xmm0 2113; X86-AVX1-NEXT: vpinsrd $3, %ebx, %xmm0, %xmm0 2114; X86-AVX1-NEXT: vmovd %ebp, %xmm1 2115; X86-AVX1-NEXT: vpinsrd $1, (%esp), %xmm1, %xmm1 # 4-byte Folded Reload 2116; X86-AVX1-NEXT: vpinsrd $2, {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload 2117; X86-AVX1-NEXT: vpinsrd $3, 
{{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload 2118; X86-AVX1-NEXT: imull $8199, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload 2119; X86-AVX1-NEXT: # imm = 0x2007 2120; X86-AVX1-NEXT: movl %eax, (%eax) 2121; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [8199,8199,8199,8199] 2122; X86-AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0 2123; X86-AVX1-NEXT: vpmulld %xmm2, %xmm1, %xmm1 2124; X86-AVX1-NEXT: vmovdqa %xmm1, (%eax) 2125; X86-AVX1-NEXT: vmovdqa %xmm0, (%eax) 2126; X86-AVX1-NEXT: addl $16, %esp 2127; X86-AVX1-NEXT: popl %esi 2128; X86-AVX1-NEXT: popl %edi 2129; X86-AVX1-NEXT: popl %ebx 2130; X86-AVX1-NEXT: popl %ebp 2131; X86-AVX1-NEXT: retl 2132; 2133; X86-AVX2-LABEL: PR34947: 2134; X86-AVX2: # %bb.0: 2135; X86-AVX2-NEXT: pushl %esi 2136; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %esi 2137; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax 2138; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 2139; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 2140; X86-AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 2141; X86-AVX2-NEXT: vpextrd $1, %xmm2, %eax 2142; X86-AVX2-NEXT: xorl %edx, %edx 2143; X86-AVX2-NEXT: divl 20(%esi) 2144; X86-AVX2-NEXT: movl %edx, %ecx 2145; X86-AVX2-NEXT: vmovd %xmm2, %eax 2146; X86-AVX2-NEXT: xorl %edx, %edx 2147; X86-AVX2-NEXT: divl 16(%esi) 2148; X86-AVX2-NEXT: vmovd %edx, %xmm3 2149; X86-AVX2-NEXT: vpinsrd $1, %ecx, %xmm3, %xmm3 2150; X86-AVX2-NEXT: vpextrd $2, %xmm2, %eax 2151; X86-AVX2-NEXT: xorl %edx, %edx 2152; X86-AVX2-NEXT: divl 24(%esi) 2153; X86-AVX2-NEXT: vpinsrd $2, %edx, %xmm3, %xmm3 2154; X86-AVX2-NEXT: vpextrd $3, %xmm2, %eax 2155; X86-AVX2-NEXT: xorl %edx, %edx 2156; X86-AVX2-NEXT: divl 28(%esi) 2157; X86-AVX2-NEXT: vpinsrd $3, %edx, %xmm3, %xmm2 2158; X86-AVX2-NEXT: vpextrd $1, %xmm1, %eax 2159; X86-AVX2-NEXT: xorl %edx, %edx 2160; X86-AVX2-NEXT: divl 4(%esi) 2161; X86-AVX2-NEXT: movl %edx, %ecx 2162; X86-AVX2-NEXT: vmovd %xmm1, %eax 2163; X86-AVX2-NEXT: xorl %edx, %edx 2164; X86-AVX2-NEXT: divl (%esi) 2165; X86-AVX2-NEXT: vmovd %edx, %xmm3 2166; X86-AVX2-NEXT: vpinsrd $1, %ecx, %xmm3, %xmm3 2167; X86-AVX2-NEXT: vpextrd $2, %xmm1, %eax 2168; X86-AVX2-NEXT: xorl %edx, %edx 2169; X86-AVX2-NEXT: divl 8(%esi) 2170; X86-AVX2-NEXT: vpinsrd $2, %edx, %xmm3, %xmm3 2171; X86-AVX2-NEXT: vpextrd $3, %xmm1, %eax 2172; X86-AVX2-NEXT: xorl %edx, %edx 2173; X86-AVX2-NEXT: divl 12(%esi) 2174; X86-AVX2-NEXT: vpinsrd $3, %edx, %xmm3, %xmm1 2175; X86-AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 2176; X86-AVX2-NEXT: vmovd %xmm0, %eax 2177; X86-AVX2-NEXT: xorl %edx, %edx 2178; X86-AVX2-NEXT: divl 32(%esi) 2179; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm0 = [8199,8199,8199,8199,8199,8199,8199,8199] 2180; X86-AVX2-NEXT: vpmulld %ymm0, %ymm1, %ymm0 2181; X86-AVX2-NEXT: imull $8199, %edx, %eax # imm = 0x2007 2182; X86-AVX2-NEXT: movl %eax, (%eax) 2183; X86-AVX2-NEXT: vmovdqa %ymm0, (%eax) 2184; X86-AVX2-NEXT: popl %esi 2185; X86-AVX2-NEXT: vzeroupper 2186; X86-AVX2-NEXT: retl 2187; 2188; X64-SSE-LABEL: PR34947: 2189; X64-SSE: # %bb.0: 2190; X64-SSE-NEXT: movzwl 16(%rdi), %ecx 2191; X64-SSE-NEXT: movdqa (%rdi), %xmm2 2192; X64-SSE-NEXT: pxor %xmm1, %xmm1 2193; X64-SSE-NEXT: movdqa %xmm2, %xmm0 2194; X64-SSE-NEXT: pextrw $7, %xmm2, %eax 2195; X64-SSE-NEXT: pextrw $4, %xmm2, %edi 2196; X64-SSE-NEXT: pextrw $1, %xmm2, %r8d 2197; X64-SSE-NEXT: pextrw $0, %xmm2, %r9d 2198; X64-SSE-NEXT: pextrw $3, %xmm2, 
%r10d 2199; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 2200; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 2201; X64-SSE-NEXT: xorl %edx, %edx 2202; X64-SSE-NEXT: divl 28(%rsi) 2203; X64-SSE-NEXT: movd %edx, %xmm1 2204; X64-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] 2205; X64-SSE-NEXT: movd %xmm3, %eax 2206; X64-SSE-NEXT: xorl %edx, %edx 2207; X64-SSE-NEXT: divl 24(%rsi) 2208; X64-SSE-NEXT: movd %edx, %xmm3 2209; X64-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] 2210; X64-SSE-NEXT: movl %edi, %eax 2211; X64-SSE-NEXT: xorl %edx, %edx 2212; X64-SSE-NEXT: divl 16(%rsi) 2213; X64-SSE-NEXT: movd %edx, %xmm1 2214; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] 2215; X64-SSE-NEXT: movd %xmm0, %eax 2216; X64-SSE-NEXT: xorl %edx, %edx 2217; X64-SSE-NEXT: divl 20(%rsi) 2218; X64-SSE-NEXT: movd %edx, %xmm0 2219; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 2220; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] 2221; X64-SSE-NEXT: movl %r8d, %eax 2222; X64-SSE-NEXT: xorl %edx, %edx 2223; X64-SSE-NEXT: divl 4(%rsi) 2224; X64-SSE-NEXT: movd %edx, %xmm0 2225; X64-SSE-NEXT: movl %r9d, %eax 2226; X64-SSE-NEXT: xorl %edx, %edx 2227; X64-SSE-NEXT: divl (%rsi) 2228; X64-SSE-NEXT: movd %edx, %xmm3 2229; X64-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] 2230; X64-SSE-NEXT: movl %r10d, %eax 2231; X64-SSE-NEXT: xorl %edx, %edx 2232; X64-SSE-NEXT: divl 12(%rsi) 2233; X64-SSE-NEXT: movd %edx, %xmm0 2234; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] 2235; X64-SSE-NEXT: movd %xmm2, %eax 2236; X64-SSE-NEXT: xorl %edx, %edx 2237; X64-SSE-NEXT: divl 8(%rsi) 2238; X64-SSE-NEXT: movd %edx, %xmm2 2239; X64-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 2240; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] 2241; X64-SSE-NEXT: movl %ecx, %eax 2242; X64-SSE-NEXT: xorl %edx, %edx 2243; X64-SSE-NEXT: divl 32(%rsi) 2244; X64-SSE-NEXT: movdqa {{.*#+}} xmm0 = [8199,8199,8199,8199] 2245; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] 2246; X64-SSE-NEXT: pmuludq %xmm0, %xmm3 2247; X64-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] 2248; X64-SSE-NEXT: pmuludq %xmm0, %xmm2 2249; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 2250; X64-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 2251; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] 2252; X64-SSE-NEXT: pmuludq %xmm0, %xmm1 2253; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 2254; X64-SSE-NEXT: pmuludq %xmm0, %xmm2 2255; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 2256; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 2257; X64-SSE-NEXT: imull $8199, %edx, %eax # imm = 0x2007 2258; X64-SSE-NEXT: movl %eax, (%rax) 2259; X64-SSE-NEXT: movdqa %xmm1, (%rax) 2260; X64-SSE-NEXT: movdqa %xmm3, (%rax) 2261; X64-SSE-NEXT: retq 2262; 2263; X64-AVX1-LABEL: PR34947: 2264; X64-AVX1: # %bb.0: 2265; X64-AVX1-NEXT: pushq %rbp 2266; X64-AVX1-NEXT: pushq %rbx 2267; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 2268; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 2269; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 2270; X64-AVX1-NEXT: vmovd %xmm2, %eax 2271; X64-AVX1-NEXT: xorl %edx, %edx 2272; X64-AVX1-NEXT: divl 32(%rsi) 2273; X64-AVX1-NEXT: movl %edx, %ecx 2274; 
X64-AVX1-NEXT: vpextrd $3, %xmm1, %eax 2275; X64-AVX1-NEXT: xorl %edx, %edx 2276; X64-AVX1-NEXT: divl 28(%rsi) 2277; X64-AVX1-NEXT: movl %edx, %edi 2278; X64-AVX1-NEXT: vpextrd $2, %xmm1, %eax 2279; X64-AVX1-NEXT: xorl %edx, %edx 2280; X64-AVX1-NEXT: divl 24(%rsi) 2281; X64-AVX1-NEXT: movl %edx, %r8d 2282; X64-AVX1-NEXT: vpextrd $1, %xmm1, %eax 2283; X64-AVX1-NEXT: xorl %edx, %edx 2284; X64-AVX1-NEXT: divl 20(%rsi) 2285; X64-AVX1-NEXT: movl %edx, %r9d 2286; X64-AVX1-NEXT: vmovd %xmm1, %eax 2287; X64-AVX1-NEXT: xorl %edx, %edx 2288; X64-AVX1-NEXT: divl 16(%rsi) 2289; X64-AVX1-NEXT: movl %edx, %r10d 2290; X64-AVX1-NEXT: vpextrd $3, %xmm0, %eax 2291; X64-AVX1-NEXT: xorl %edx, %edx 2292; X64-AVX1-NEXT: divl 12(%rsi) 2293; X64-AVX1-NEXT: movl %edx, %r11d 2294; X64-AVX1-NEXT: vpextrd $2, %xmm0, %eax 2295; X64-AVX1-NEXT: xorl %edx, %edx 2296; X64-AVX1-NEXT: divl 8(%rsi) 2297; X64-AVX1-NEXT: movl %edx, %ebx 2298; X64-AVX1-NEXT: vpextrd $1, %xmm0, %eax 2299; X64-AVX1-NEXT: xorl %edx, %edx 2300; X64-AVX1-NEXT: divl 4(%rsi) 2301; X64-AVX1-NEXT: movl %edx, %ebp 2302; X64-AVX1-NEXT: vmovd %xmm0, %eax 2303; X64-AVX1-NEXT: xorl %edx, %edx 2304; X64-AVX1-NEXT: divl (%rsi) 2305; X64-AVX1-NEXT: vmovd %edx, %xmm0 2306; X64-AVX1-NEXT: vpinsrd $1, %ebp, %xmm0, %xmm0 2307; X64-AVX1-NEXT: vpinsrd $2, %ebx, %xmm0, %xmm0 2308; X64-AVX1-NEXT: vpinsrd $3, %r11d, %xmm0, %xmm0 2309; X64-AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [8199,8199,8199,8199] 2310; X64-AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 2311; X64-AVX1-NEXT: vmovd %r10d, %xmm2 2312; X64-AVX1-NEXT: vpinsrd $1, %r9d, %xmm2, %xmm2 2313; X64-AVX1-NEXT: vpinsrd $2, %r8d, %xmm2, %xmm2 2314; X64-AVX1-NEXT: vpinsrd $3, %edi, %xmm2, %xmm2 2315; X64-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1 2316; X64-AVX1-NEXT: imull $8199, %ecx, %eax # imm = 0x2007 2317; X64-AVX1-NEXT: movl %eax, (%rax) 2318; X64-AVX1-NEXT: vmovdqa %xmm1, (%rax) 2319; X64-AVX1-NEXT: vmovdqa %xmm0, (%rax) 2320; X64-AVX1-NEXT: popq %rbx 2321; X64-AVX1-NEXT: popq %rbp 2322; X64-AVX1-NEXT: retq 2323; 2324; X64-AVX2-LABEL: PR34947: 2325; X64-AVX2: # %bb.0: 2326; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 2327; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 2328; X64-AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 2329; X64-AVX2-NEXT: vpextrd $1, %xmm2, %eax 2330; X64-AVX2-NEXT: xorl %edx, %edx 2331; X64-AVX2-NEXT: divl 20(%rsi) 2332; X64-AVX2-NEXT: movl %edx, %ecx 2333; X64-AVX2-NEXT: vmovd %xmm2, %eax 2334; X64-AVX2-NEXT: xorl %edx, %edx 2335; X64-AVX2-NEXT: divl 16(%rsi) 2336; X64-AVX2-NEXT: vmovd %edx, %xmm3 2337; X64-AVX2-NEXT: vpinsrd $1, %ecx, %xmm3, %xmm3 2338; X64-AVX2-NEXT: vpextrd $2, %xmm2, %eax 2339; X64-AVX2-NEXT: xorl %edx, %edx 2340; X64-AVX2-NEXT: divl 24(%rsi) 2341; X64-AVX2-NEXT: vpinsrd $2, %edx, %xmm3, %xmm3 2342; X64-AVX2-NEXT: vpextrd $3, %xmm2, %eax 2343; X64-AVX2-NEXT: xorl %edx, %edx 2344; X64-AVX2-NEXT: divl 28(%rsi) 2345; X64-AVX2-NEXT: vpinsrd $3, %edx, %xmm3, %xmm2 2346; X64-AVX2-NEXT: vpextrd $1, %xmm1, %eax 2347; X64-AVX2-NEXT: xorl %edx, %edx 2348; X64-AVX2-NEXT: divl 4(%rsi) 2349; X64-AVX2-NEXT: movl %edx, %ecx 2350; X64-AVX2-NEXT: vmovd %xmm1, %eax 2351; X64-AVX2-NEXT: xorl %edx, %edx 2352; X64-AVX2-NEXT: divl (%rsi) 2353; X64-AVX2-NEXT: vmovd %edx, %xmm3 2354; X64-AVX2-NEXT: vpinsrd $1, %ecx, %xmm3, %xmm3 2355; X64-AVX2-NEXT: vpextrd $2, %xmm1, %eax 2356; X64-AVX2-NEXT: xorl %edx, %edx 2357; 
X64-AVX2-NEXT: divl 8(%rsi) 2358; X64-AVX2-NEXT: vpinsrd $2, %edx, %xmm3, %xmm3 2359; X64-AVX2-NEXT: vpextrd $3, %xmm1, %eax 2360; X64-AVX2-NEXT: xorl %edx, %edx 2361; X64-AVX2-NEXT: divl 12(%rsi) 2362; X64-AVX2-NEXT: vpinsrd $3, %edx, %xmm3, %xmm1 2363; X64-AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 2364; X64-AVX2-NEXT: vmovd %xmm0, %eax 2365; X64-AVX2-NEXT: xorl %edx, %edx 2366; X64-AVX2-NEXT: divl 32(%rsi) 2367; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm0 = [8199,8199,8199,8199,8199,8199,8199,8199] 2368; X64-AVX2-NEXT: vpmulld %ymm0, %ymm1, %ymm0 2369; X64-AVX2-NEXT: imull $8199, %edx, %eax # imm = 0x2007 2370; X64-AVX2-NEXT: movl %eax, (%rax) 2371; X64-AVX2-NEXT: vmovdqa %ymm0, (%rax) 2372; X64-AVX2-NEXT: vzeroupper 2373; X64-AVX2-NEXT: retq 2374 %a0 = load <9 x i16>, ptr %p0, align 64 2375 %a1 = load <9 x i32>, ptr %p1, align 64 2376 %ext0 = zext <9 x i16> %a0 to <9 x i32> 2377 %rem = urem <9 x i32> %ext0, %a1 2378 %mul = mul <9 x i32> <i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199>, %rem 2379 store <9 x i32> %mul, ptr undef, align 64 2380 ret void 2381} 2382