; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2

declare <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>) nounwind readnone
declare <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8>, <16 x i8>) nounwind readnone

; pmaddwd with an all-zeros operand folds to zero.
define <4 x i32> @combine_pmaddwd_zero(<8 x i16> %a0, <8 x i16> %a1) {
; SSE-LABEL: combine_pmaddwd_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pmaddwd_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a0, <8 x i16> zeroinitializer)
  ret <4 x i32> %1
}

; Same fold with the zero vector as the first operand.
define <4 x i32> @combine_pmaddwd_zero_commute(<8 x i16> %a0, <8 x i16> %a1) {
; SSE-LABEL: combine_pmaddwd_zero_commute:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pmaddwd_zero_commute:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> zeroinitializer, <8 x i16> %a0)
  ret <4 x i32> %1
}

; Two 128-bit pmaddwd whose results are concatenated can become one 256-bit pmaddwd (AVX2).
define <8 x i32> @combine_pmaddwd_concat(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2, <8 x i16> %a3) {
; SSE-LABEL: combine_pmaddwd_concat:
; SSE:       # %bb.0:
; SSE-NEXT:    pmaddwd %xmm1, %xmm0
; SSE-NEXT:    pmaddwd %xmm3, %xmm2
; SSE-NEXT:    movdqa %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_pmaddwd_concat:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpmaddwd %xmm3, %xmm2, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_pmaddwd_concat:
; AVX2:       # %bb.0:
; AVX2-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX2-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %1 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a0, <8 x i16> %a1)
  %2 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a2, <8 x i16> %a3)
  %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i32> %3
}

; The concat fold must look through freeze of the pmaddwd results.
define <8 x i32> @combine_pmaddwd_concat_freeze(<8 x i16> %a0, <8 x i16> %a1) {
; SSE-LABEL: combine_pmaddwd_concat_freeze:
; SSE:       # %bb.0:
; SSE-NEXT:    pmovsxbw {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1]
; SSE-NEXT:    pmaddwd %xmm2, %xmm0
; SSE-NEXT:    pmaddwd %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_pmaddwd_concat_freeze:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1]
; AVX1-NEXT:    vpmaddwd %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpmaddwd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_pmaddwd_concat_freeze:
; AVX2:       # %bb.0:
; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX2-NEXT:    retq
  %lo = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a0, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
  %hi = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a1, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
  %flo = freeze <4 x i32> %lo
  %fhi = freeze <4 x i32> %hi
  %res = shufflevector <4 x i32> %flo, <4 x i32> %fhi, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i32> %res
}

; Only lane 0 of the result is demanded, so the splatted upper input lanes are irrelevant.
define <4 x i32> @combine_pmaddwd_demandedelts(<8 x i16> %a0, <8 x i16> %a1) {
; SSE-LABEL: combine_pmaddwd_demandedelts:
; SSE:       # %bb.0:
; SSE-NEXT:    pmaddwd %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_pmaddwd_demandedelts:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_pmaddwd_demandedelts:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpbroadcastd %xmm0, %xmm0
; AVX2-NEXT:    retq
  %1 = shufflevector <8 x i16> %a0, <8 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
  %2 = shufflevector <8 x i16> %a1, <8 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 7, i32 7>
  %3 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %1, <8 x i16> %2)
  %4 = shufflevector <4 x i32> %3, <4 x i32> poison, <4 x i32> zeroinitializer
  ret <4 x i32> %4
}

; [2]: (-5*13)+(6*-15) = -155 = 4294967141
define <4 x i32> @combine_pmaddwd_constant() {
; SSE-LABEL: combine_pmaddwd_constant:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [19,17,4294967141,271]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pmaddwd_constant:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [19,17,4294967141,271]
; AVX-NEXT:    retq
  %1 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> <i16 -1, i16 2, i16 3, i16 -4, i16 -5, i16 6, i16 7, i16 -8>, <8 x i16> <i16 -5, i16 7, i16 -9, i16 -11, i16 13, i16 -15, i16 17, i16 -19>)
  ret <4 x i32> %1
}

; ensure we don't assume pmaddwd performs add nsw
; [0]: (-32768*-32768)+(-32768*-32768) = 0x80000000 = 2147483648
define <4 x i32> @combine_pmaddwd_constant_nsw() {
; SSE-LABEL: combine_pmaddwd_constant_nsw:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pmaddwd_constant_nsw:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastss {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648]
; AVX-NEXT:    retq
  %1 = insertelement <8 x i16> undef, i16 32768, i32 0
  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer
  %3 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %2, <8 x i16> %2)
  ret <4 x i32> %3
}

; pmaddubsw with an all-zeros operand folds to zero.
define <8 x i16> @combine_pmaddubsw_zero(<16 x i8> %a0, <16 x i8> %a1) {
; SSE-LABEL: combine_pmaddubsw_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pmaddubsw_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> zeroinitializer)
  ret <8 x i16> %1
}

; Same fold with the zero vector as the first operand.
define <8 x i16> @combine_pmaddubsw_zero_commute(<16 x i8> %a0, <16 x i8> %a1) {
; SSE-LABEL: combine_pmaddubsw_zero_commute:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pmaddubsw_zero_commute:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> zeroinitializer, <16 x i8> %a0)
  ret <8 x i16> %1
}

; Two 128-bit pmaddubsw whose results are concatenated can become one 256-bit pmaddubsw (AVX2).
define <16 x i16> @combine_pmaddubsw_concat(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2, <16 x i8> %a3) {
; SSE-LABEL: combine_pmaddubsw_concat:
; SSE:       # %bb.0:
; SSE-NEXT:    pmaddubsw %xmm1, %xmm0
; SSE-NEXT:    pmaddubsw %xmm3, %xmm2
; SSE-NEXT:    movdqa %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_pmaddubsw_concat:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpmaddubsw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpmaddubsw %xmm3, %xmm2, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_pmaddubsw_concat:
; AVX2:       # %bb.0:
; AVX2-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX2-NEXT:    vpmaddubsw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %1 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> %a1)
  %2 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a2, <16 x i8> %a3)
  %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i16> %3
}

; The concat fold must look through freeze of the pmaddubsw results.
define <16 x i16> @combine_pmaddubsw_concat_freeze(<16 x i8> %a0, <16 x i8> %a1) {
; SSE-LABEL: combine_pmaddubsw_concat_freeze:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; SSE-NEXT:    pmaddubsw %xmm2, %xmm0
; SSE-NEXT:    pmaddubsw %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_pmaddubsw_concat_freeze:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT:    vpmaddubsw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpmaddubsw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_pmaddubsw_concat_freeze:
; AVX2:       # %bb.0:
; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX2-NEXT:    retq
  %lo = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
  %hi = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a1, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
  %flo = freeze <8 x i16> %lo
  %fhi = freeze <8 x i16> %hi
  %res = shufflevector <8 x i16> %flo, <8 x i16> %fhi, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i16> %res
}

; Only lanes 0-1 of the result are demanded, so the splatted upper input lanes are irrelevant.
define <8 x i16> @combine_pmaddubsw_demandedelts(<16 x i8> %a0, <16 x i8> %a1) {
; SSE-LABEL: combine_pmaddubsw_demandedelts:
; SSE:       # %bb.0:
; SSE-NEXT:    pmaddubsw %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_pmaddubsw_demandedelts:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpmaddubsw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_pmaddubsw_demandedelts:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmaddubsw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpbroadcastd %xmm0, %xmm0
; AVX2-NEXT:    retq
  %1 = shufflevector <16 x i8> %a0, <16 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
  %2 = shufflevector <16 x i8> %a1, <16 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
  %3 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %1, <16 x i8> %2)
  %4 = shufflevector <8 x i16> %3, <8 x i16> poison, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
  ret <8 x i16> %4
}

; [3]: ((uint8_t)-6*7)+(7*-8) = (250*7)+(7*-8) = 1694
define i32 @combine_pmaddubsw_constant() {
; CHECK-LABEL: combine_pmaddubsw_constant:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl $1694, %eax # imm = 0x69E
; CHECK-NEXT:    retq
  %1 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 -6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, <16 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 -8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16>)
  %2 = extractelement <8 x i16> %1, i32 3
  %3 = sext i16 %2 to i32
  ret i32 %3
}

; [0]: add_sat_i16((uint8_t)-1*-128, (uint8_t)-1*-128) = add_sat_i16(255*-128, 255*-128) = sat_i16(-65280) = -32768
define i32 @combine_pmaddubsw_constant_sat() {
; CHECK-LABEL: combine_pmaddubsw_constant_sat:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl $-32768, %eax # imm = 0x8000
; CHECK-NEXT:    retq
  %1 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> <i8 -1, i8 -1, i8 2, i8 3, i8 4, i8 5, i8 -6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, <16 x i8> <i8 -128, i8 -128, i8 3, i8 4, i8 5, i8 6, i8 7, i8 -8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16>)
  %2 = extractelement <8 x i16> %1, i32 0
  %3 = sext i16 %2 to i32
  ret i32 %3
}

; Constant folding PMADDWD was causing an infinite loop in the PCMPGT commuting between 2 constant values.
define i1 @pmaddwd_pcmpgt_infinite_loop() {
; CHECK-LABEL: pmaddwd_pcmpgt_infinite_loop:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movb $1, %al
; CHECK-NEXT:    retq
  %1 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>, <8 x i16> <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>)
  %2 = icmp eq <4 x i32> %1, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
  %3 = select <4 x i1> %2, <4 x i32> <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>, <4 x i32> zeroinitializer
  %4 = add <4 x i32> %3, <i32 -8, i32 -9, i32 -10, i32 -11>
  %.not = trunc <4 x i32> %3 to <4 x i1>
  %5 = icmp sgt <4 x i32> %4, <i32 2147483640, i32 2147483639, i32 2147483638, i32 2147483637>
  %6 = select <4 x i1> %.not, <4 x i1> %5, <4 x i1> zeroinitializer
  %7 = bitcast <4 x i1> %6 to i4
  %8 = icmp eq i4 %7, 0
  ret i1 %8
}